merge internal changes

This commit is contained in:
Martin Wicke 2016-04-11 16:46:46 -07:00
commit f6a1d34447
145 changed files with 5347 additions and 1291 deletions

View File

@ -46,7 +46,7 @@ new_git_repository(
new_git_repository(
name = "font_roboto",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/font-roboto.git",
remote = "https://github.com/polymerelements/font-roboto.git",
tag = "v1.0.1",
)
@ -60,49 +60,49 @@ new_git_repository(
new_git_repository(
name = "iron_a11y_announcer",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/iron-a11y-announcer.git",
remote = "https://github.com/polymerelements/iron-a11y-announcer.git",
tag = "v1.0.4",
)
new_git_repository(
name = "iron_a11y_keys_behavior",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/iron-a11y-keys-behavior.git",
remote = "https://github.com/polymerelements/iron-a11y-keys-behavior.git",
tag = "v1.1.2",
)
new_git_repository(
name = "iron_ajax",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/iron-ajax.git",
remote = "https://github.com/polymerelements/iron-ajax.git",
tag = "v1.1.1",
)
new_git_repository(
name = "iron_autogrow_textarea",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/iron-autogrow-textarea.git",
remote = "https://github.com/polymerelements/iron-autogrow-textarea.git",
tag = "v1.0.12",
)
new_git_repository(
name = "iron_behaviors",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/iron-behaviors.git",
remote = "https://github.com/polymerelements/iron-behaviors.git",
tag = "v1.0.13",
)
new_git_repository(
name = "iron_checked_element_behavior",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/iron-checked-element-behavior.git",
remote = "https://github.com/polymerelements/iron-checked-element-behavior.git",
tag = "v1.0.4",
)
new_git_repository(
name = "iron_collapse",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/iron-collapse.git",
remote = "https://github.com/polymerelements/iron-collapse.git",
tag = "v1.0.6",
)
@ -116,7 +116,7 @@ new_git_repository(
new_git_repository(
name = "iron_fit_behavior",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/iron-fit-behavior.git",
remote = "https://github.com/polymerelements/iron-fit-behavior.git",
tag = "v1.0.6",
)
@ -130,7 +130,7 @@ new_git_repository(
new_git_repository(
name = "iron_form_element_behavior",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/iron-form-element-behavior.git",
remote = "https://github.com/polymerelements/iron-form-element-behavior.git",
tag = "v1.0.6",
)
@ -151,28 +151,28 @@ new_git_repository(
new_git_repository(
name = "iron_iconset_svg",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/iron-iconset-svg.git",
remote = "https://github.com/polymerelements/iron-iconset-svg.git",
tag = "v1.0.9",
)
new_git_repository(
name = "iron_input",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/iron-input.git",
remote = "https://github.com/polymerelements/iron-input.git",
tag = "v1.0.9",
)
new_git_repository(
name = "iron_list",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/iron-list.git",
remote = "https://github.com/polymerelements/iron-list.git",
tag = "v1.1.7",
)
new_git_repository(
name = "iron_menu_behavior",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/iron-menu-behavior.git",
remote = "https://github.com/polymerelements/iron-menu-behavior.git",
tag = "v1.1.5",
)
@ -187,13 +187,13 @@ new_git_repository(
name = "iron_overlay_behavior",
build_file = "bower.BUILD",
remote = "https://github.com/polymerelements/iron-overlay-behavior.git",
tag = "v1.6.1",
tag = "v1.6.2",
)
new_git_repository(
name = "iron_range_behavior",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/iron-range-behavior.git",
remote = "https://github.com/polymerelements/iron-range-behavior.git",
tag = "v1.0.4",
)
@ -207,14 +207,14 @@ new_git_repository(
new_git_repository(
name = "iron_selector",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/iron-selector.git",
remote = "https://github.com/polymerelements/iron-selector.git",
tag = "v1.2.4",
)
new_git_repository(
name = "iron_validatable_behavior",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/iron-validatable-behavior.git",
remote = "https://github.com/polymerelements/iron-validatable-behavior.git",
tag = "v1.0.5",
)
@ -235,56 +235,56 @@ new_git_repository(
new_git_repository(
name = "paper_behaviors",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/paper-behaviors.git",
remote = "https://github.com/polymerelements/paper-behaviors.git",
tag = "v1.0.11",
)
new_git_repository(
name = "paper_button",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/paper-button.git",
remote = "https://github.com/polymerelements/paper-button.git",
tag = "v1.0.11",
)
new_git_repository(
name = "paper_checkbox",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/paper-checkbox.git",
remote = "https://github.com/polymerelements/paper-checkbox.git",
tag = "v1.1.3",
)
new_git_repository(
name = "paper_dropdown_menu",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/paper-dropdown-menu.git",
remote = "https://github.com/polymerelements/paper-dropdown-menu.git",
tag = "v1.1.3",
)
new_git_repository(
name = "paper_header_panel",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/paper-header-panel.git",
remote = "https://github.com/polymerelements/paper-header-panel.git",
tag = "v1.1.4",
)
new_git_repository(
name = "paper_icon_button",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/paper-icon-button.git",
remote = "https://github.com/polymerelements/paper-icon-button.git",
tag = "v1.0.6",
)
new_git_repository(
name = "paper_input",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/paper-input.git",
remote = "https://github.com/polymerelements/paper-input.git",
tag = "v1.1.5",
)
new_git_repository(
name = "paper_item",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/paper-item.git",
remote = "https://github.com/polymerelements/paper-item.git",
tag = "v1.1.4",
)
@ -298,7 +298,7 @@ new_git_repository(
new_git_repository(
name = "paper_menu",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/paper-menu.git",
remote = "https://github.com/polymerelements/paper-menu.git",
tag = "v1.2.2",
)
@ -306,27 +306,27 @@ new_git_repository(
name = "paper_menu_button",
build_file = "bower.BUILD",
remote = "https://github.com/polymerelements/paper-menu-button.git",
tag = "v1.0.4",
tag = "v1.1.0",
)
new_git_repository(
name = "paper_progress",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/paper-progress.git",
tag = "v1.0.8",
remote = "https://github.com/polymerelements/paper-progress.git",
tag = "v1.0.9",
)
new_git_repository(
name = "paper_radio_button",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/paper-radio-button.git",
remote = "https://github.com/polymerelements/paper-radio-button.git",
tag = "v1.1.1",
)
new_git_repository(
name = "paper_radio_group",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/paper-radio-group.git",
remote = "https://github.com/polymerelements/paper-radio-group.git",
tag = "v1.0.9",
)
@ -340,35 +340,35 @@ new_git_repository(
new_git_repository(
name = "paper_slider",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/paper-slider.git",
remote = "https://github.com/polymerelements/paper-slider.git",
tag = "v1.0.8",
)
new_git_repository(
name = "paper_styles",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/paper-styles.git",
remote = "https://github.com/polymerelements/paper-styles.git",
tag = "v1.1.1",
)
new_git_repository(
name = "paper_tabs",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/paper-tabs.git",
remote = "https://github.com/polymerelements/paper-tabs.git",
tag = "v1.2.4",
)
new_git_repository(
name = "paper_toggle_button",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/paper-toggle-button.git",
remote = "https://github.com/polymerelements/paper-toggle-button.git",
tag = "v1.0.12",
)
new_git_repository(
name = "paper_toolbar",
build_file = "bower.BUILD",
remote = "https://github.com/PolymerElements/paper-toolbar.git",
remote = "https://github.com/polymerelements/paper-toolbar.git",
tag = "v1.1.2",
)
@ -382,7 +382,7 @@ new_git_repository(
new_git_repository(
name = "polymer",
build_file = "bower.BUILD",
remote = "https://github.com/Polymer/polymer.git",
remote = "https://github.com/polymer/polymer.git",
tag = "v1.4.0",
)
@ -403,6 +403,6 @@ new_git_repository(
new_git_repository(
name = "webcomponentsjs",
build_file = "bower.BUILD",
remote = "https://github.com/Polymer/webcomponentsjs.git",
remote = "https://github.com/polymer/webcomponentsjs.git",
tag = "v0.7.21",
)

View File

@ -1,6 +1,6 @@
package(default_visibility = ["//visibility:public"])
archive_dir = "eigen-eigen-6e521c802bf5"
archive_dir = "eigen-eigen-3f653ace7d28"
cc_library(
name = "eigen",

View File

@ -20,6 +20,15 @@ config_setting(
visibility = ["//visibility:public"],
)
config_setting(
name = "android_arm",
values = {
"crosstool_top": "//external:android/crosstool",
"android_cpu": "armeabi-v7a",
},
visibility = ["//visibility:public"],
)
config_setting(
name = "darwin",
values = {"cpu": "darwin"},

View File

@ -7,7 +7,7 @@
include (ExternalProject)
set(eigen_archive_hash "6e521c802bf5")
set(eigen_archive_hash "3f653ace7d28")
set(eigen_INCLUDE_DIRS
${CMAKE_CURRENT_BINARY_DIR}
@ -16,7 +16,7 @@ set(eigen_INCLUDE_DIRS
${tensorflow_source_dir}/third_party/eigen3
)
set(eigen_URL https://bitbucket.org/eigen/eigen/get/${eigen_archive_hash}.tar.gz)
set(eigen_HASH SHA256=f1b4b4401d08d0d44128ab80ebe76633363dab20c29b1bf2370aed8b4893cc5e)
set(eigen_HASH SHA256=b49502f423deda55cea33bc503f84409cca92157f3b536d17113b81138f86715)
set(eigen_BUILD ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen)
set(eigen_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/eigen/install)

View File

@ -752,7 +752,7 @@ class WithShapeTest(tf.test.TestCase):
for incompatible_shape in [[0], [1]]:
self.assertRaisesRegexp(
ValueError, "must have the same rank",
ValueError, r"Shapes \(\?, 2\) and \([01],\) are not compatible",
tf.contrib.framework.with_shape,
incompatible_shape, tensor_partial_shape)
for incompatible_shape in [[1, 2, 1]]:
@ -761,7 +761,7 @@ class WithShapeTest(tf.test.TestCase):
incompatible_shape, tensor_partial_shape)
for incompatible_shape in [[2, 1]]:
self.assertRaisesRegexp(
ValueError, "Dimensions.*are not compatible",
ValueError, r"Shapes \(\?, 2\) and \(2, 1\) are not compatible",
tf.contrib.framework.with_shape,
incompatible_shape, tensor_partial_shape)

View File

@ -164,7 +164,6 @@ TEST_F(DataByExampleTest, VisitUnavailable) {
signal(&updated_data);
});
wait(&completed_visit);
EXPECT_FALSE(thread_pool.HasPendingClosures());
EXPECT_TRUE(errors::IsUnavailable(status));
}

View File

@ -245,6 +245,7 @@ tf_cuda_library(
"framework/register_types.h",
"framework/resource_mgr.h",
"framework/selective_registration.h",
"framework/session_state.h",
"framework/tensor.h",
"framework/tensor_shape.h",
"framework/tensor_slice.h",
@ -267,6 +268,7 @@ tf_cuda_library(
"util/saved_tensor_slice_util.h",
"util/sparse/group_iterator.h",
"util/sparse/sparse_tensor.h",
"util/stat_summarizer.h",
"util/tensor_format.h",
"util/tensor_slice_reader.h",
"util/tensor_slice_reader_cache.h",
@ -856,6 +858,7 @@ filegroup(
"framework/partial_tensor_shape.h",
"framework/rendezvous.h",
"framework/selective_registration.h",
"framework/session_state.h",
"framework/tensor.h",
"framework/tensor_reference.h",
"framework/tensor_shape.h",
@ -1268,6 +1271,7 @@ tf_cc_test(
"//tensorflow/core/kernels:matmul_op",
"//tensorflow/core/kernels:ops_util",
"//tensorflow/core/kernels:queue_ops",
"//tensorflow/core/kernels:session_ops",
"//tensorflow/core/kernels:variable_ops",
"//third_party/eigen3",
],

View File

@ -50,6 +50,11 @@ bool IsConstantFoldable(const Node* n,
if (n->IsControlFlow() || n->IsSend() || n->IsRecv()) {
return false;
}
// TODO(yuanbyu): For now disable these session handle operations.
if (n->IsGetSessionHandle() || n->IsGetSessionTensor() ||
n->IsDeleteSessionTensor()) {
return false;
}
if (n->IsSource()) {
return false;
}

View File

@ -313,6 +313,8 @@ Status DirectSession::Run(const RunOptions& run_options,
args.rendezvous = run_state.rendez;
args.cancellation_manager = cancellation_manager_;
args.runner = [this](Executor::Args::Closure c) { SchedClosure(c); };
args.session_state = &session_state_;
args.tensor_store = &run_state.tensor_store;
if (LogMemory::IsEnabled()) {
LogMemory::RecordStep(args.step_id, run_state_args.handle);
}
@ -340,6 +342,11 @@ Status DirectSession::Run(const RunOptions& run_options,
// Receive outputs.
TF_RETURN_IF_ERROR(
RecvOutputs(output_names, executors_and_keys, &run_state, outputs));
// Save the output tensors of this run we choose to keep.
TF_RETURN_IF_ERROR(
run_state.tensor_store.SaveTensors(output_names, &session_state_));
return Status::OK();
}
@ -369,9 +376,8 @@ Status DirectSession::PRunSetup(const std::vector<string>& input_names,
{
mutex_lock l(executor_lock_);
if (!partial_runs_.insert({run_state_args.handle, run_state}).second) {
return errors::Internal("The handle ", run_state_args.handle,
" created for this partial"
" run is not unique.");
return errors::Internal("The handle '", run_state_args.handle,
"' created for this partial run is not unique.");
}
}
@ -390,13 +396,12 @@ Status DirectSession::PRunSetup(const std::vector<string>& input_names,
});
Executor::Args args;
{
mutex_lock l(mu_);
args.step_id = name_counter_++;
}
args.step_id = step_id_counter_.fetch_add(1);
args.rendezvous = run_state->rendez;
args.cancellation_manager = cancellation_manager_;
args.runner = [this](Executor::Args::Closure c) { SchedClosure(c); };
args.session_state = &session_state_;
args.tensor_store = &run_state->tensor_store;
if (LogMemory::IsEnabled()) {
LogMemory::RecordStep(args.step_id, run_state_args.handle);
}
@ -470,9 +475,14 @@ Status DirectSession::PRun(const string& handle, const NamedTensorList& inputs,
s = RecvOutputs(output_names, executors_and_keys, run_state, outputs);
}
// Delete the run state if there is an error or all fetches are done.
// Save the output tensors of this run we choose to keep.
if (s.ok()) {
s = run_state->tensor_store.SaveTensors(output_names, &session_state_);
}
{
mutex_lock l(executor_lock_);
// Delete the run state if there is an error or all fetches are done.
bool done = true;
if (s.ok()) {
{
@ -911,7 +921,7 @@ Status DirectSession::CreateGraphs(gtl::ArraySlice<string> feeds,
// allow.
device_opts.allow_internal_ops = true;
device_opts.expect_device_spec = true;
Status s = ConvertGraphDefToGraph(device_opts, *graph_def, device_graph);
s = ConvertGraphDefToGraph(device_opts, *graph_def, device_graph);
if (!s.ok()) {
delete device_graph;
break;

View File

@ -29,6 +29,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/rendezvous_mgr.h"
#include "tensorflow/core/framework/cancellation.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/framework/session_state.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
@ -78,6 +79,7 @@ class DirectSession : public Session {
::tensorflow::Status PRun(const string& handle, const NamedTensorList& inputs,
const std::vector<string>& output_names,
std::vector<Tensor>* outputs) override;
::tensorflow::Status Close() override;
// NOTE: This is a temporary api that is only meant to enable testing.
@ -135,6 +137,7 @@ class DirectSession : public Session {
Notification executors_done;
std::unordered_set<string> pending_inputs;
std::unordered_set<string> pending_outputs;
TensorStore tensor_store;
RunState(const std::vector<string>& input_names,
const std::vector<string>& output_names) {
@ -146,6 +149,7 @@ class DirectSession : public Session {
pending_outputs.emplace(name);
}
}
~RunState();
};
@ -228,6 +232,9 @@ class DirectSession : public Session {
std::unordered_map<string, RunState*> partial_runs_
GUARDED_BY(executor_lock_);
// This holds all the tensors that are currently alive in the session.
SessionState session_state_;
CancellationManager* cancellation_manager_;
// Saves and restores device placements for stateful nodes.

View File

@ -564,6 +564,77 @@ TEST(DirectSessionTest, PartialRunMultiOutputFeed) {
ASSERT_EQ(true, outputs[0].flat<bool>()(0));
}
TEST(DirectSessionTest, RunHandleTest) {
GraphDef def;
Graph g(OpRegistry::Global());
Tensor value0(DT_FLOAT, TensorShape({}));
value0.scalar<float>()() = 1.0;
Node* const0 = test::graph::Constant(&g, value0);
Node* identity0 = test::graph::Identity(&g, const0);
Tensor value1(DT_FLOAT, TensorShape({}));
value1.scalar<float>()() = 2.0;
Node* const1 = test::graph::Constant(&g, value1);
Node* node3 = test::graph::Add(&g, identity0, const1);
Node* node4 = test::graph::Unary(&g, "GetSessionHandle", node3);
Tensor value2(DT_STRING, TensorShape({}));
Node* const2 = test::graph::Constant(&g, value2);
Node* node5 = test::graph::GetSessionTensor(&g, const2);
Node* node6 = test::graph::Add(&g, node5, const1);
Node* node7 = test::graph::Unary(&g, "DeleteSessionTensor", const2);
test::graph::ToGraphDef(&g, &def);
std::unique_ptr<Session> session(CreateSession());
ASSERT_TRUE(session != nullptr);
TF_ASSERT_OK(session->Create(def));
// First run call: Create a handle.
std::vector<Tensor> outputs;
Status s = session->Run({}, {node4->name() + ":0"}, {}, &outputs);
ASSERT_TRUE(s.ok());
ASSERT_EQ(1, outputs.size());
// Second run call: Use a handle.
std::vector<Tensor> outputs1;
s = session->Run({{const2->name(), outputs[0]}}, {node6->name() + ":0"}, {},
&outputs1);
ASSERT_TRUE(s.ok());
ASSERT_EQ(1, outputs1.size());
ASSERT_EQ(5.0, outputs1[0].flat<float>()(0));
// Third run call: Delete a handle.
std::vector<Tensor> outputs2;
s = session->Run({{const2->name(), outputs[0]}}, {}, {node7->name()},
&outputs2);
ASSERT_TRUE(s.ok());
}
TEST(DirectSessionTest, CreateGraphFailsWhenAssigningAFedVar) {
Graph graph(OpRegistry::Global());
Node* a = test::graph::Var(&graph, DT_FLOAT, {});
Node* b = test::graph::Constant(&graph, {});
Tensor zero(DT_FLOAT, {});
test::FillValues<float>(&zero, {0});
// a = b
Node* assign = test::graph::Assign(&graph, a, b);
std::unique_ptr<Session> session(CreateSession());
ASSERT_TRUE(session != nullptr);
// The graph is invalid since a constant cannot be assigned to a constant.
// The return Status of session->Run should flag this as an invalid argument.
std::vector<Tensor> outputs;
Status s = session->Run({{a->name(), zero}}, {assign->name()}, {}, &outputs);
ASSERT_TRUE(errors::IsInvalidArgument(s));
}
TEST(DirectSessionTest, TimeoutSession) {
GraphDef graph;
// Creates a graph with one FIFOQueue and one dequeue op.

View File

@ -645,6 +645,8 @@ class ExecutorState {
int64 step_id_;
// Not owned.
Rendezvous* rendezvous_;
SessionState* session_state_;
TensorStore* tensor_store_;
StepStatsCollector* stats_collector_;
// QUESTION: Make it a checkpoint::TensorSliceReaderCacheWrapper
// instead of a pointer? (avoids having to delete).
@ -793,6 +795,8 @@ class ExecutorState {
ExecutorState::ExecutorState(const Executor::Args& args, ExecutorImpl* impl)
: step_id_(args.step_id),
rendezvous_(args.rendezvous),
session_state_(args.session_state),
tensor_store_(args.tensor_store),
stats_collector_(args.stats_collector),
slice_reader_cache_(new checkpoint::TensorSliceReaderCacheWrapper),
call_frame_(args.call_frame),
@ -938,6 +942,8 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) {
// track allocations if and only if we are collecting statistics
params.track_allocations = (stats_collector_ != nullptr);
params.rendezvous = rendezvous_;
params.session_state = session_state_;
params.tensor_store = tensor_store_;
params.cancellation_manager = cancellation_manager_;
params.call_frame = call_frame_;
params.function_library = impl_->params_.function_library;

View File

@ -18,6 +18,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/rendezvous.h"
#include "tensorflow/core/framework/session_state.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/lib/core/notification.h"
@ -85,6 +86,8 @@ class Executor {
StepStatsCollector* stats_collector = nullptr;
FunctionCallFrame* call_frame = nullptr;
CancellationManager* cancellation_manager = nullptr;
SessionState* session_state = nullptr;
TensorStore* tensor_store = nullptr;
typedef std::function<void()> Closure;
typedef std::function<void(Closure)> Runner;

View File

@ -21,6 +21,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/lib/random/simple_philox.h"
#include "tensorflow/core/platform/logging.h"

View File

@ -0,0 +1,83 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/framework/session_state.h"
#include "tensorflow/core/graph/tensor_id.h"
namespace tensorflow {
Status SessionState::GetTensor(const string& handle, Tensor* tensor) {
mutex_lock l(state_lock_);
auto it = tensors_.find(handle);
if (it == tensors_.end()) {
return errors::InvalidArgument("The tensor with handle '", handle,
"' is not in the session store.");
}
*tensor = it->second;
return Status::OK();
}
Status SessionState::AddTensor(const string& handle, const Tensor& tensor) {
mutex_lock l(state_lock_);
if (!tensors_.insert({handle, tensor}).second) {
return errors::InvalidArgument("Failed to add a tensor with handle '",
handle, "' to the session store.");
}
return Status::OK();
}
Status SessionState::DeleteTensor(const string& handle) {
mutex_lock l(state_lock_);
if (tensors_.erase(handle) == 0) {
return errors::InvalidArgument("Failed to delete a tensor with handle '",
handle, "' in the session store.");
}
return Status::OK();
}
int64 SessionState::GetNewId() {
mutex_lock l(state_lock_);
return tensor_id_++;
}
Status TensorStore::AddTensor(const string& name, const TensorAndKey& tk) {
mutex_lock l(lock_);
if (!tensors_.insert({name, tk}).second) {
return errors::InvalidArgument("Failed to add a tensor with name '", name,
"' to the tensor store.");
}
return Status::OK();
}
Status TensorStore::SaveTensors(const std::vector<string>& output_names,
SessionState* session_state) {
mutex_lock l(lock_);
if (tensors_.size() != 0) {
// Save only the tensors in output_names in the session.
for (const string& name : output_names) {
TensorId id(ParseTensorName(name));
const string& op_name = id.first.ToString();
auto it = tensors_.find(op_name);
if (it != tensors_.end()) {
// Save the tensor to the session state.
string key = it->second.GetHandle(op_name);
TF_RETURN_IF_ERROR(session_state->AddTensor(key, it->second.tensor));
}
}
}
return Status::OK();
}
} // namespace tensorflow

View File

@ -103,10 +103,14 @@ Status GrpcServer::Init() {
return errors::InvalidArgument("Task ", server_def_.task_index(),
" was not defined in job \"",
server_def_.job_name(), "\"");
} else if (!strings::safe_strto32(str_util::Split(iter->second, ':')[1],
&requested_port_)) {
return errors::Internal("Could not parse port for local server from \"",
iter->second, "\"");
}
const std::vector<string> hostname_port =
str_util::Split(iter->second, ':');
if (hostname_port.size() != 2 ||
!strings::safe_strto32(hostname_port[1], &requested_port_)) {
return errors::InvalidArgument(
"Could not parse port for local server from \"", iter->second,
"\"");
} else {
break;
}

View File

@ -89,12 +89,12 @@ class GrpcServer : public ServerInterface {
// Implementation of a TensorFlow master, and RPC polling thread.
MasterEnv master_env_;
AsyncServiceInterface* master_service_;
AsyncServiceInterface* master_service_ = nullptr;
std::unique_ptr<Thread> master_thread_ GUARDED_BY(mu_);
// Implementation of a TensorFlow worker, and RPC polling thread.
WorkerEnv worker_env_;
AsyncServiceInterface* worker_service_;
AsyncServiceInterface* worker_service_ = nullptr;
std::unique_ptr<Thread> worker_thread_ GUARDED_BY(mu_);
std::unique_ptr<::grpc::Server> server_ GUARDED_BY(mu_);

View File

@ -678,8 +678,8 @@ Status FunctionCallFrame::GetRetvals(std::vector<Tensor>* rets) const {
Status FunctionCallFrame::GetArg(int index, Tensor* val) const {
if (index < 0 || static_cast<size_t>(index) >= args_.size()) {
return errors::OutOfRange("GetArg ", index, " is not within [0, ",
args_.size(), ")");
return errors::InvalidArgument("GetArg ", index, " is not within [0, ",
args_.size(), ")");
}
*val = args_[index];
return Status::OK();
@ -687,8 +687,8 @@ Status FunctionCallFrame::GetArg(int index, Tensor* val) const {
Status FunctionCallFrame::SetRetval(int index, const Tensor& val) {
if (index < 0 || static_cast<size_t>(index) >= rets_.size()) {
return errors::OutOfRange("SetRetval ", index, " is not within [0, ",
rets_.size(), ")");
return errors::InvalidArgument("SetRetval ", index, " is not within [0, ",
rets_.size(), ")");
}
if (val.dtype() != ret_types_[index]) {
return errors::InvalidArgument(

View File

@ -563,8 +563,8 @@ TEST(FunctionCallFrame, Void_Void) {
auto a = test::AsTensor<float>({100});
HasError(frame.SetArgs({a}), "Invalid argument");
Tensor v;
HasError(frame.GetArg(0, &v), "Out of range");
HasError(frame.SetRetval(0, v), "Out of range");
HasError(frame.GetArg(0, &v), "Invalid argument");
HasError(frame.SetRetval(0, v), "Invalid argument");
std::vector<Tensor> rets;
TF_EXPECT_OK(frame.GetRetvals(&rets));
EXPECT_EQ(rets.size(), 0);
@ -581,16 +581,16 @@ TEST(FunctionCallFrame, Float_Float_Float) {
TF_EXPECT_OK(frame.SetArgs({a, b}));
Tensor v;
HasError(frame.GetArg(-1, &v), "Out of range");
HasError(frame.GetArg(2, &v), "Out of range");
HasError(frame.GetArg(-1, &v), "Invalid argument");
HasError(frame.GetArg(2, &v), "Invalid argument");
TF_EXPECT_OK(frame.GetArg(0, &v));
test::ExpectTensorEqual<float>(a, v);
TF_EXPECT_OK(frame.GetArg(1, &v));
test::ExpectTensorEqual<float>(b, v);
v = test::AsTensor<float>({-100});
HasError(frame.SetRetval(-1, v), "Out of range");
HasError(frame.SetRetval(1, v), "Out of range");
HasError(frame.SetRetval(-1, v), "Invalid argument");
HasError(frame.SetRetval(1, v), "Invalid argument");
HasError(frame.SetRetval(0, test::AsTensor<int64>({-100})),
"Invalid argument: Expects ret[0] to be float");

View File

@ -99,7 +99,7 @@ class BinaryElementWiseOp : public BinaryOp<T> {
#undef NDIM_CASE
default:
context->SetStatus(errors::OutOfRange(
context->SetStatus(errors::InvalidArgument(
"We only handle up to Tensor::dims() up to 8, not ", a.dims()));
break;
}

View File

@ -31,6 +31,7 @@ limitations under the License.
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/rendezvous.h"
#include "tensorflow/core/framework/selective_registration.h"
#include "tensorflow/core/framework/session_state.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
@ -502,6 +503,12 @@ class OpKernelContext {
// computations running on other devices.
Rendezvous* rendezvous = nullptr;
// The session state for this op.
SessionState* session_state = nullptr;
// The tensor store for this op.
TensorStore* tensor_store = nullptr;
// Mechanism used by this op kernel invocation to register a callback
// for its cancellation.
CancellationManager* cancellation_manager = nullptr;
@ -841,6 +848,12 @@ class OpKernelContext {
// Rendezvous Send() and Recv().
Rendezvous* rendezvous() const { return params_->rendezvous; }
// An op kernel can access the session state it belongs to.
SessionState* session_state() const { return params_->session_state; }
// An op kernel can access the tensor store of the run it belongs to.
TensorStore* tensor_store() const { return params_->tensor_store; }
// Function call support.
//
// If this kernel invocation is within a function execution,
@ -1031,15 +1044,16 @@ typedef ::tensorflow::KernelDefBuilder Name;
#define REGISTER_KERNEL_BUILDER_UNIQ_HELPER(ctr, kernel_builder, ...) \
REGISTER_KERNEL_BUILDER_UNIQ(ctr, kernel_builder, __VA_ARGS__)
#define REGISTER_KERNEL_BUILDER_UNIQ(ctr, kernel_builder, ...) \
static ::tensorflow::kernel_factory::OpKernelRegistrar \
registrar__body__##ctr##__object( \
SHOULD_REGISTER_OP_KERNEL(#__VA_ARGS__) \
? ::tensorflow::register_kernel::kernel_builder.Build() \
: nullptr, \
#__VA_ARGS__, \
[](::tensorflow::OpKernelConstruction* context) \
-> ::tensorflow::OpKernel* { return new __VA_ARGS__(context); })
#define REGISTER_KERNEL_BUILDER_UNIQ(ctr, kernel_builder, ...) \
static ::tensorflow::kernel_factory::OpKernelRegistrar \
registrar__body__##ctr##__object( \
SHOULD_REGISTER_OP_KERNEL(#__VA_ARGS__) \
? ::tensorflow::register_kernel::kernel_builder.Build() \
: nullptr, \
#__VA_ARGS__, [](::tensorflow::OpKernelConstruction* context) \
-> ::tensorflow::OpKernel* { \
return new __VA_ARGS__(context); \
});
void* GlobalKernelRegistry();

View File

@ -120,8 +120,8 @@ class OpKernelTest : public ::testing::Test {
void ExpectEqual(const string& what, const DataTypeVector& expected,
const DataTypeVector& observed) {
EXPECT_EQ(expected.size(), observed.size()) << what;
const int size = std::min(expected.size(), observed.size());
for (int i = 0; i < size; ++i) {
const size_t size = std::min(expected.size(), observed.size());
for (size_t i = 0; i < size; ++i) {
bool match = TypesCompatible(expected[i], observed[i]);
EXPECT_TRUE(match) << what << " i:" << i << ", expected: " << expected[i]
<< ", observed: " << observed[i];

View File

@ -47,65 +47,42 @@ limitations under the License.
// Call "m" for all number types that support the comparison operations "<" and
// ">".
#define TF_CALL_REAL_NUMBER_TYPES(m) \
m(float); \
m(double); \
m(int64); \
m(int32); \
m(uint8); \
m(int16); \
m(int8)
m(float) m(double) m(int64) m(int32) m(uint8) m(int16) m(int8)
#define TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m) \
m(float); \
m(double); \
m(int64); \
m(uint8); \
m(int16); \
m(int8)
m(float) m(double) m(int64) m(uint8) m(int16) m(int8)
// Call "m" for all number types, including complex64 and complex128.
#define TF_CALL_NUMBER_TYPES(m) \
TF_CALL_REAL_NUMBER_TYPES(m); \
m(complex64); \
m(complex128)
TF_CALL_REAL_NUMBER_TYPES(m) \
m(complex64) m(complex128)
#define TF_CALL_NUMBER_TYPES_NO_INT32(m) \
TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m); \
m(complex64); \
m(complex128)
TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m) \
m(complex64) m(complex128)
#define TF_CALL_POD_TYPES(m) \
TF_CALL_NUMBER_TYPES(m); \
TF_CALL_NUMBER_TYPES(m) \
m(bool)
// Call "m" on all types.
#define TF_CALL_ALL_TYPES(m) \
TF_CALL_POD_TYPES(m); \
TF_CALL_POD_TYPES(m) \
m(string)
// Call "m" on all types supported on GPU.
#define TF_CALL_GPU_NUMBER_TYPES(m) \
m(float); \
m(double)
#define TF_CALL_GPU_NUMBER_TYPES(m) m(float) m(double)
// Call "m" on all quantized types.
#define TF_CALL_QUANTIZED_TYPES(m) \
m(qint8); \
m(quint8); \
m(qint32)
#define TF_CALL_QUANTIZED_TYPES(m) m(qint8) m(quint8) m(qint32)
#elif defined(__ANDROID_TYPES_FULL__)
#define TF_CALL_REAL_NUMBER_TYPES(m) \
m(float); \
m(int32); \
m(int64)
#define TF_CALL_REAL_NUMBER_TYPES(m) m(float) m(int32) m(int64)
#define TF_CALL_NUMBER_TYPES(m) TF_CALL_REAL_NUMBER_TYPES(m)
#define TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m) \
m(float); \
m(int64)
#define TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m) m(float) m(int64)
#define TF_CALL_NUMBER_TYPES_NO_INT32(m) TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m)
@ -117,16 +94,11 @@ limitations under the License.
#define TF_CALL_GPU_NUMBER_TYPES(m) m(float)
// Call "m" on all quantized types.
#define TF_CALL_QUANTIZED_TYPES(m) \
m(qint8); \
m(quint8); \
m(qint32)
#define TF_CALL_QUANTIZED_TYPES(m) m(qint8) m(quint8) m(qint32)
#else // defined(__ANDROID__) && !defined(__ANDROID_TYPES_FULL__)
#define TF_CALL_REAL_NUMBER_TYPES(m) \
m(float); \
m(int32)
#define TF_CALL_REAL_NUMBER_TYPES(m) m(float) m(int32)
#define TF_CALL_NUMBER_TYPES(m) TF_CALL_REAL_NUMBER_TYPES(m)

View File

@ -0,0 +1,85 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_FRAMEWORK_SESSION_STATE_H_
#define TENSORFLOW_FRAMEWORK_SESSION_STATE_H_
#include <string>
#include <unordered_map>
#include <vector>
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/platform/mutex.h"
namespace tensorflow {
// The session state remembers the tensors we choose to keep across
// multiple run calls.
//
// Handles are opaque string keys (see TensorStore::TensorAndKey::GetHandle
// for the format used by the tensor store).
class SessionState {
 public:
  // Get a tensor from the session state.
  Status GetTensor(const string& handle, Tensor* tensor);

  // Store a tensor in the session state.
  Status AddTensor(const string& handle, const Tensor& tensor);

  // Delete a tensor from the session state.
  Status DeleteTensor(const string& handle);

  // Returns a fresh id for a tensor kept in this session (backed by
  // tensor_id_ below).
  int64 GetNewId();

 private:
  // NOTE(review): presumably guards tensor_id_ and tensors_; members are not
  // GUARDED_BY-annotated — confirm against the .cc.
  mutex state_lock_;

  // For generating unique ids for tensors stored in the session.
  int64 tensor_id_ = 0;

  // The live tensors in the session. A map from tensor handle to tensor.
  std::unordered_map<string, Tensor> tensors_;
};
// The tensor store remembers the tensors we choose to keep for the
// current run call. It is available to every op kernel.
class TensorStore {
 public:
  struct TensorAndKey {
    Tensor tensor;
    int64 id;
    string device_name;

    // Builds the string handle "<tensor_name>;<id>;<device_name>" used to
    // identify this tensor in the session state.
    string GetHandle(const string& tensor_name) {
      return strings::StrCat(tensor_name, ";", id, ";", device_name);
    }
  };

  // Add the named tensor to the tensor store for this run.
  Status AddTensor(const string& name, const TensorAndKey& tk);

  // Save the tensors in the tensor store of this run to the session.
  Status SaveTensors(const std::vector<string>& output_names,
                     SessionState* session_state);

 private:
  mutex lock_;

  // The tensors that will be saved to session state when this run completes.
  // A map from tensor string name to tensor.
  std::unordered_map<string, TensorAndKey> tensors_ GUARDED_BY(lock_);
};
} // namespace tensorflow
#endif // TENSORFLOW_FRAMEWORK_SESSION_STATE_H_

View File

@ -44,6 +44,7 @@ void TensorShape::CheckDimsAtLeast(int NDIMS) const {
bool TensorShape::IsValid(const TensorShapeProto& proto) {
int64 num_elements = 1;
if (proto.dim().size() > MaxDimensions()) return false;
for (const auto& d : proto.dim()) {
if (d.size() < 0) return false;
num_elements *= d.size();
@ -54,6 +55,10 @@ bool TensorShape::IsValid(const TensorShapeProto& proto) {
Status TensorShape::IsValidShape(const TensorShapeProto& proto) {
int64 num_elements = 1;
if (proto.dim().size() > MaxDimensions()) {
return errors::InvalidArgument("Shape ", DebugString(proto),
" has too many dimensions");
}
for (const auto& d : proto.dim()) {
if (d.size() < 0) {
return errors::InvalidArgument("Shape ", DebugString(proto),
@ -165,7 +170,7 @@ void TensorShape::RecomputeNumElements() {
void TensorShape::AddDim(int64 size) {
CHECK_GE(size, 0);
const int nd = ndims_byte();
CHECK_LT(nd, 255) << "Too many dimensions in tensor";
CHECK_LT(nd, MaxDimensions()) << "Too many dimensions in tensor";
if (tag() == REP16 && nd < 6 && size < kMaxRep16) {
as16()->dims_[nd] = static_cast<int16>(size);
} else if (tag() == REP32 && nd < 3 && size < kMaxRep32) {
@ -214,6 +219,7 @@ void TensorShape::InsertDim(int d, int64 size) {
CHECK_GE(d, 0);
CHECK_LE(d, dims());
CHECK_GE(size, 0);
CHECK_LT(dims(), MaxDimensions());
gtl::InlinedVector<int64, 8> vals;
AppendTo(*this, &vals);
vals.insert(vals.begin() + d, size);
@ -341,6 +347,9 @@ bool TensorShapeUtils::StartsWith(const TensorShape& shape,
template <typename T>
static inline Status MakeShapeHelper(const T* dims, int n, TensorShape* out) {
*out = TensorShape();
if (n > TensorShape::MaxDimensions()) {
return errors::InvalidArgument("Too many dimensions");
}
for (int i = 0; i < n; ++i) {
const T dim = internal::SubtleMustCopy(dims[i]);
if (dim >= 0) {

View File

@ -71,6 +71,9 @@ class TensorShape {
/// Appends all the dimensions from `shape`.
void AppendShape(const TensorShape& shape);
// Maximum number of dimensions in a tensor.
static constexpr int MaxDimensions() { return 255; }
/// \brief Insert a dimension somewhere in the `TensorShape`.
/// REQUIRES: `0 <= d <= dims()`
/// REQUIRES: `size >= 0`
@ -277,6 +280,7 @@ template <int NDIMS>
Eigen::DSizes<Eigen::DenseIndex, NDIMS> TensorShape::AsEigenDSizesWithPadding()
const {
CheckDimsAtLeast(NDIMS);
static_assert(NDIMS <= TensorShape::MaxDimensions(), "Too many dimensions");
Eigen::DSizes<Eigen::DenseIndex, NDIMS> dsizes;
for (int d = 0; d < dims(); d++) {
dsizes[d] = dim_size(d);

View File

@ -15,6 +15,7 @@ limitations under the License.
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/lib/random/simple_philox.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
@ -87,6 +88,21 @@ TEST(TensorShapeTest, InvalidShapeProto) {
EXPECT_FALSE(TensorShape::IsValid(proto));
}
TEST(TensorShapeTest, TooManyDimsProto) {
TensorShapeProto proto;
// Deliberate redundancy to ensure that both paths work.
EXPECT_TRUE(TensorShape::IsValid(proto));
TF_EXPECT_OK(TensorShape::IsValidShape(proto));
for (int i = 0; i < TensorShape::MaxDimensions(); i++) {
proto.add_dim()->set_size(1);
}
EXPECT_TRUE(TensorShape::IsValid(proto));
TF_EXPECT_OK(TensorShape::IsValidShape(proto));
proto.add_dim()->set_size(1);
EXPECT_FALSE(TensorShape::IsValid(proto));
EXPECT_FALSE(TensorShape::IsValidShape(proto).ok());
}
TEST(TensorShapeTest, SetDimForEmptyTensor) {
TensorShape s({10, 5, 20});
EXPECT_EQ(1000, s.num_elements());

View File

@ -95,6 +95,9 @@ void Node::Initialize(int id, int cost_id, Properties* props) {
SET_CLASS(NC_CONSTANT, ts, "Const", "HostConst");
SET_CLASS(NC_VARIABLE, ts, "Variable", "");
SET_CLASS(NC_IDENTITY, ts, "Identity", "RefIdentity");
SET_CLASS(NC_GET_SESSION_HANDLE, ts, "GetSessionHandle", "");
SET_CLASS(NC_GET_SESSION_TENSOR, ts, "GetSessionTensor", "");
SET_CLASS(NC_DELETE_SESSION_TENSOR, ts, "DeleteSessionTensor", "");
if (class_ == NC_UNINITIALIZED) {
class_ = NC_OTHER; // Catch all
}

View File

@ -118,6 +118,11 @@ class Node {
bool IsConstant() const { return (class_ == NC_CONSTANT); }
bool IsVariable() const { return (class_ == NC_VARIABLE); }
bool IsIdentity() const { return (class_ == NC_IDENTITY); }
bool IsGetSessionHandle() const { return (class_ == NC_GET_SESSION_HANDLE); }
bool IsGetSessionTensor() const { return (class_ == NC_GET_SESSION_TENSOR); }
bool IsDeleteSessionTensor() const {
return (class_ == NC_DELETE_SESSION_TENSOR);
}
bool IsControlFlow() const {
return (class_ != NC_OTHER) && // Fast path
(IsSwitch() || IsMerge() || IsEnter() || IsExit() ||
@ -172,6 +177,9 @@ class Node {
NC_CONSTANT,
NC_VARIABLE,
NC_IDENTITY,
NC_GET_SESSION_HANDLE,
NC_GET_SESSION_TENSOR,
NC_DELETE_SESSION_TENSOR,
NC_OTHER // Not a special kind of node
};

View File

@ -360,6 +360,15 @@ Node* Gather(Graph* g, Node* in0, Node* in1) {
return ret;
}
// Adds a "GetSessionTensor" node to `g` that reads a DT_FLOAT tensor from
// the session state, using output 0 of `in` as the handle. CHECK-fails if
// node construction fails.
Node* GetSessionTensor(Graph* g, Node* in) {
  Node* ret;
  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "GetSessionTensor")
                  .Input(in, 0)
                  .Attr("dtype", DT_FLOAT)
                  .Finalize(g, &ret));
  return ret;
}
void ToGraphDef(Graph* g, GraphDef* gdef) { g->ToGraphDef(gdef); }
} // end namespace graph

View File

@ -161,6 +161,9 @@ Node* Gather(Graph* g, Node* in0, Node* in1);
// Computes the args needed broadcast gradient function.
Node* BroadcastGradientArgs(Graph* g, Node* s0, Node* s1);
// Gets a tensor stored in the session state.
Node* GetSessionTensor(Graph* g, Node* in);
} // end namespace graph
} // end namespace test
} // end namespace tensorflow

View File

@ -173,6 +173,7 @@ cc_library(
srcs = ["save_restore_tensor.cc"],
hdrs = ["save_restore_tensor.h"],
deps = [
":bounds_check",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
],
@ -261,6 +262,7 @@ tf_kernel_libraries(
"concat_op",
"constant_op",
"diag_op",
"batch_matrix_diag_op",
"edit_distance_op",
"gather_nd_op",
"gather_op",
@ -337,6 +339,23 @@ tf_cc_test(
],
)
tf_cc_test(
name = "example_parsing_ops_test",
size = "small",
deps = [
":example_parsing_ops",
":ops_testutil",
":ops_util",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"//tensorflow/core:testlib",
],
)
tf_cuda_cc_test(
name = "gather_op_test",
size = "small",
@ -523,6 +542,7 @@ tf_kernel_libraries(
"padding_fifo_queue_op",
"queue_ops",
"random_shuffle_queue_op",
"session_ops",
"stack_ops",
"tensor_array_ops",
],
@ -593,14 +613,16 @@ cc_library(
],
)
cc_library(
tf_kernel_library(
name = "tensor_array",
srcs = ["tensor_array.cc"],
hdrs = ["tensor_array.h"],
visibility = ["//visibility:private"],
deps = [
":aggregate_ops",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//third_party/eigen3",
],
)
@ -1282,6 +1304,7 @@ tf_kernel_libraries(
name = "string",
prefixes = [
"string_to_hash_bucket_op",
"reduce_join_op",
],
deps = [
"//tensorflow/core:framework",
@ -1497,6 +1520,7 @@ filegroup(
"restore_op.cc",
"save_op.cc",
"save_restore_tensor.cc",
"session_ops.cc",
"softplus_op.cc",
"softsign_op.cc",
"sparse_to_dense_op.cc",

View File

@ -0,0 +1,232 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// See docs in ../ops/array_ops.cc.
#define EIGEN_USE_THREADS
#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#endif // GOOGLE_CUDA
#include "tensorflow/core/kernels/batch_matrix_diag_op.h"
#include <memory>
#include <vector>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
// Views `t` as a rank-3 tensor {batch, rows, cols}: rows/cols are the last
// two dimensions of `t` (both 1 when t has fewer than 2 dims) and batch is
// the product of all remaining leading dimensions.
template <typename T>
inline typename TTypes<T, 3>::ConstTensor flat_inner_dims_matrix(
    const Tensor& t) {
  int64 last_size = t.dims() > 1 ? t.dim_size(t.dims() - 1) : 1;
  int64 but_last_size = t.dims() > 1 ? t.dim_size(t.dims() - 2) : 1;
  if (last_size * but_last_size == 0) {
    DCHECK_EQ(t.NumElements(), 0);
    // Return something empty, avoiding divide by 0
    return t.shaped<T, 3>({0, 0, 0});
  } else {
    return t.shaped<T, 3>({t.NumElements() / (but_last_size * last_size),
                           but_last_size, last_size});
  }
}
// Mutable overload of flat_inner_dims_matrix above: views `*t` as a rank-3
// tensor {batch, rows, cols} with the same shape rules.
template <typename T>
inline typename TTypes<T, 3>::Tensor flat_inner_dims_matrix(Tensor* t) {
  int64 last_size = t->dims() > 1 ? t->dim_size(t->dims() - 1) : 1;
  int64 but_last_size = t->dims() > 1 ? t->dim_size(t->dims() - 2) : 1;
  if (last_size * but_last_size == 0) {
    DCHECK_EQ(t->NumElements(), 0);
    // Return something empty, avoiding divide by 0
    return t->shaped<T, 3>({0, 0, 0});
  } else {
    return t->shaped<T, 3>({t->NumElements() / (but_last_size * last_size),
                            but_last_size, last_size});
  }
}
// Op kernel: extracts the main diagonal of each innermost matrix of a
// [..., k, k] input, producing a [..., k] output. The actual copy is
// delegated to functor::BatchMatrixDiagPart for the given Device.
template <typename Device, typename T>
class BatchMatrixDiagPartOp : public OpKernel {
 public:
  explicit BatchMatrixDiagPartOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const TensorShape& input_shape = input.shape();
    const int rank = input_shape.dims();

    // Preliminary validation of sizes.
    OP_REQUIRES(context, TensorShapeUtils::IsMatrixOrHigher(input_shape),
                errors::InvalidArgument(
                    "input must be at least 2-dim, received shape: ",
                    input.shape().DebugString()));

    // Check to make sure the last two dimensions have the same value
    const int64 k = input_shape.dim_size(rank - 1);
    OP_REQUIRES(
        context, k == input_shape.dim_size(rank - 2),
        errors::InvalidArgument(
            "input's last two dimensions must be equal, received shape: ",
            input.shape().DebugString()));

    // Collapse leading dims: input viewed as {batch, k, k}.
    auto input_reshaped = flat_inner_dims_matrix<T>(input);

    // Output shape is the input shape with the last dimension dropped.
    TensorShape output_shape = input_shape;
    output_shape.RemoveDim(rank - 1);

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
    auto output_reshaped = output->flat_inner_dims<T>();

    functor::BatchMatrixDiagPart<Device, T>::Compute(
        context->eigen_device<Device>(), input_reshaped, output_reshaped);
  }

 private:
  TF_DISALLOW_COPY_AND_ASSIGN(BatchMatrixDiagPartOp);
};
// Op kernel: expands a [..., k] input into a [..., k, k] output whose
// innermost matrices have the input values on the diagonal. The fill is
// delegated to functor::BatchMatrixDiag for the given Device.
template <typename Device, typename T>
class BatchMatrixDiagOp : public OpKernel {
 public:
  explicit BatchMatrixDiagOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const TensorShape& input_shape = input.shape();
    const int rank = input_shape.dims();

    // Preliminary validation of sizes.
    OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(input_shape),
                errors::InvalidArgument(
                    "input must be at least 1-dim, received shape: ",
                    input.shape().DebugString()));

    // The last input dimension supplies the diagonal entries; each produces
    // a k x k output matrix.
    const int64 k = input_shape.dim_size(rank - 1);

    // Collapse leading dims: input viewed as {batch, k}.
    auto input_reshaped = input.flat_inner_dims<T>();

    // Output shape is the input shape with the last dimension repeated.
    TensorShape output_shape = input_shape;
    output_shape.AddDim(k);

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
    auto output_reshaped = flat_inner_dims_matrix<T>(output);

    functor::BatchMatrixDiag<Device, T>::Compute(
        context->eigen_device<Device>(), input_reshaped, output_reshaped);
  }

 private:
  TF_DISALLOW_COPY_AND_ASSIGN(BatchMatrixDiagOp);
};
#define REGISTER_BATCH_MATRIX_DIAG(type) \
REGISTER_KERNEL_BUILDER( \
Name("BatchMatrixDiag").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
BatchMatrixDiagOp<CPUDevice, type>); \
REGISTER_KERNEL_BUILDER(Name("BatchMatrixDiagPart") \
.Device(DEVICE_CPU) \
.TypeConstraint<type>("T"), \
BatchMatrixDiagPartOp<CPUDevice, type>);
TF_CALL_NUMBER_TYPES(REGISTER_BATCH_MATRIX_DIAG);
// Implementation of the functor specialization for CPU.
namespace functor {
template <typename T>
struct BatchMatrixDiag<CPUDevice, T> {
static void Compute(const CPUDevice& d,
typename TTypes<T, 2>::ConstTensor input,
typename TTypes<T, 3>::Tensor output) {
output.device(d) = output.constant(T());
for (int64 r = 0; r < output.dimension(0); ++r) {
for (int64 d = 0; d < output.dimension(1); ++d) {
output(r, d, d) = input(r, d);
}
}
}
};
template <typename T>
struct BatchMatrixDiagPart<CPUDevice, T> {
static void Compute(const CPUDevice& d,
typename TTypes<T, 3>::ConstTensor input,
typename TTypes<T, 2>::Tensor output) {
for (int64 r = 0; r < output.dimension(0); ++r) {
for (int64 d = 0; d < output.dimension(1); ++d) {
output(r, d) = input(r, d, d);
}
}
}
};
} // namespace functor
#if GOOGLE_CUDA
// Forward declarations of the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T) \
template <> \
void BatchMatrixDiag<GPUDevice, T>::Compute( \
const GPUDevice& d, typename TTypes<T, 2>::ConstTensor input, \
typename TTypes<T, 3>::Tensor output); \
extern template struct BatchMatrixDiag<GPUDevice, T>; \
template <> \
void BatchMatrixDiagPart<GPUDevice, T>::Compute( \
const GPUDevice& d, typename TTypes<T, 3>::ConstTensor input, \
typename TTypes<T, 2>::Tensor output); \
extern template struct BatchMatrixDiagPart<GPUDevice, T>;
TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
} // namespace functor
// Registration of the GPU implementations.
#define REGISTER_BATCH_MATRIX_DIAG_GPU(type) \
REGISTER_KERNEL_BUILDER( \
Name("BatchMatrixDiag").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
BatchMatrixDiagOp<GPUDevice, type>); \
REGISTER_KERNEL_BUILDER(Name("BatchMatrixDiagPart") \
.Device(DEVICE_GPU) \
.TypeConstraint<type>("T"), \
BatchMatrixDiagPartOp<GPUDevice, type>);
TF_CALL_GPU_NUMBER_TYPES(REGISTER_BATCH_MATRIX_DIAG_GPU);
#undef REGISTER_BATCH_MATRIX_DIAG_GPU
#endif // GOOGLE_CUDA
} // namespace tensorflow

View File

@ -0,0 +1,94 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_KERNELS_BATCH_MATRIX_DIAG_OP_H_
#define TENSORFLOW_KERNELS_BATCH_MATRIX_DIAG_OP_H_
// Generator definition for BatchMatrixDiagOp, must be compilable by nvcc.
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
namespace generator {
// Eigen coefficient generator mapping output coordinates {batch, i} to the
// diagonal element input(batch, i, i). Usable on device (nvcc-compilable).
template <typename T>
class BatchMatrixDiagPartGenerator {
 public:
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  BatchMatrixDiagPartGenerator(typename TTypes<T, 3>::ConstTensor input)
      : input_(input) {}

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T
  operator()(const Eigen::array<Eigen::DenseIndex, 2>& coords) const {
    // Duplicate the row index to address the diagonal of the input matrix.
    Eigen::array<Eigen::DenseIndex, 3> diag_from_coords(
        {coords[0], coords[1], coords[1]});
    return input_(diag_from_coords);
  }

 private:
  typename TTypes<T, 3>::ConstTensor input_;
};
// Eigen coefficient generator mapping output coordinates {batch, i, j} to
// input(batch, i) when i == j, and to a value-initialized T() elsewhere.
template <typename T>
class BatchMatrixDiagGenerator {
 public:
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  BatchMatrixDiagGenerator(typename TTypes<T, 2>::ConstTensor input)
      : input_(input) {}

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T
  operator()(const Eigen::array<Eigen::DenseIndex, 3>& coords) const {
    // Off-diagonal entries are default-constructed (zero for numeric T).
    if (coords[2] != coords[1]) return T();

    Eigen::array<Eigen::DenseIndex, 2> diag_coords({coords[0], coords[1]});
    return input_(diag_coords);
  }

 private:
  typename TTypes<T, 2>::ConstTensor input_;
};
} // namespace generator
namespace functor {
// Device-generic functor: fills `output` ({batch, k}) with the main
// diagonals of `input` ({batch, k, k}) via a generator expression evaluated
// on device `d`.
template <typename Device, typename T>
struct BatchMatrixDiagPart {
  EIGEN_ALWAYS_INLINE static void Compute(
      const Device& d, typename TTypes<T, 3>::ConstTensor input,
      typename TTypes<T, 2>::Tensor output) {
    generator::BatchMatrixDiagPartGenerator<T> generator(input);
    output.device(d) = output.generate(generator);
  }
};
// Device-generic functor: fills `output` ({batch, k, k}) with diagonal
// matrices built from the rows of `input` ({batch, k}) via a generator
// expression evaluated on device `d`.
template <typename Device, typename T>
struct BatchMatrixDiag {
  EIGEN_ALWAYS_INLINE static void Compute(
      const Device& d, typename TTypes<T, 2>::ConstTensor input,
      typename TTypes<T, 3>::Tensor output) {
    generator::BatchMatrixDiagGenerator<T> generator(input);
    output.device(d) = output.generate(generator);
  }
};
} // namespace functor
} // namespace tensorflow
#endif // TENSORFLOW_KERNELS_BATCH_MATRIX_DIAG_OP_H_

View File

@ -0,0 +1,37 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/batch_matrix_diag_op.h"
namespace tensorflow {

typedef Eigen::GpuDevice GPUDevice;

// Explicitly instantiates, for each GPU-supported numeric type, the
// generator and functor templates declared in batch_matrix_diag_op.h so
// their definitions are compiled by nvcc in this translation unit.
#define DEFINE_GPU_SPEC(T)                                     \
  template class generator::BatchMatrixDiagGenerator<T>;       \
  template struct functor::BatchMatrixDiag<GPUDevice, T>;      \
  template class generator::BatchMatrixDiagPartGenerator<T>;   \
  template struct functor::BatchMatrixDiagPart<GPUDevice, T>;

TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC);

}  // end namespace tensorflow
#endif // GOOGLE_CUDA

View File

@ -60,7 +60,7 @@ class CheckNumericsOp<CPUDevice, T> : public OpKernel {
auto in = context->input(0).flat<T>();
const T* data = in.data();
const int size = in.size();
const int64 size = in.size();
// Check to see if any element of the tensor is NaN or Inf.
int fp_props =
std::accumulate(data, data + size, 0, [](const int& x, const T& y) {

View File

@ -23,8 +23,10 @@ limitations under the License.
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor.pb.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/bounds_check.h"
#include "tensorflow/core/kernels/fill_functor.h"
#include "tensorflow/core/platform/macros.h"
@ -145,6 +147,10 @@ class FillOp : public OpKernel {
errors::InvalidArgument("value must be a scalar, got shape ",
Tvalue.shape().DebugString()));
auto dims = Tdims.flat<int32>();
OP_REQUIRES(context,
FastBoundsCheck(dims.size(), TensorShape::MaxDimensions()),
errors::InvalidArgument("dims must have size < ",
TensorShape::MaxDimensions()));
for (int i = 0; i < dims.size(); i++) {
OP_REQUIRES(context, dims(i) >= 0,
errors::InvalidArgument("dims[", i, "] = ", dims(i),
@ -153,7 +159,7 @@ class FillOp : public OpKernel {
TensorShape shape;
OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
reinterpret_cast<const int32*>(dims.data()),
dims.size(), &shape));
static_cast<int>(dims.size()), &shape));
Tensor* out = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(0, shape, &out));
functor::FillFunctor<Device, T> functor;

View File

@ -78,7 +78,7 @@ struct FillFunctor<GPUDevice, T> {
}
};
#define DEFINE_FILL_GPU(T) template struct FillFunctor<GPUDevice, T>
#define DEFINE_FILL_GPU(T) template struct FillFunctor<GPUDevice, T>;
TF_CALL_REAL_NUMBER_TYPES(DEFINE_FILL_GPU);
DEFINE_FILL_GPU(bool);
DEFINE_FILL_GPU(Eigen::half);

View File

@ -341,62 +341,6 @@ struct cos : base<T, Eigen::internal::scalar_cos_op<T> > {};
struct logical_not : base<bool, Eigen::internal::scalar_boolean_not_op<bool> > {
};
namespace impl {
#ifndef __CUDACC__
// Uses STL std cmath functions.
template <typename T>
bool isinf(T v) {
return std::isinf(v);
}
template <typename T>
bool isnan(T v) {
return std::isnan(v);
}
template <typename T>
bool isfinite(T v) {
return std::isfinite(v);
}
template <typename T>
T floor(T v) {
return std::floor(v);
}
template <typename T>
T ceil(T v) {
return std::ceil(v);
}
#else
// Uses CUDA's functions for float and double.
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isinf(T v) {
return ::isinf(v);
}
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isnan(T v) {
return ::isnan(v);
}
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isfinite(T v) {
return ::isfinite(v);
}
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T floor(T v) {
return ::floor(v);
}
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T ceil(T v) {
return ::ceil(v);
}
#endif
} // end namespace impl
// NOTE: std::isinf, std::isnan, std::isfinite are plain function.
// Therefore we need to wrap them in functors to be used with Eigen's
@ -406,7 +350,7 @@ template <typename T>
struct isinf_func {
typedef bool result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(T x) const {
return impl::isinf(x);
return Eigen::numext::isinf(x);
}
};
@ -417,7 +361,7 @@ template <typename T>
struct isnan_func {
typedef bool result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(T x) const {
return impl::isnan(x);
return Eigen::numext::isnan(x);
}
};
@ -428,7 +372,7 @@ template <typename T>
struct isfinite_func {
typedef bool result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(T x) const {
return impl::isfinite(x);
return Eigen::numext::isfinite(x);
}
};
@ -439,7 +383,7 @@ template <typename T>
struct floor_func {
typedef T result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(T x) const {
return impl::floor(x);
return Eigen::numext::floor(x);
}
};
@ -450,7 +394,7 @@ template <typename T>
struct ceil_func {
typedef T result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(T x) const {
return impl::ceil(x);
return Eigen::numext::ceil(x);
}
};

View File

@ -130,7 +130,7 @@ namespace functor {
void DenseUpdate<GPUDevice, T, OP>::operator()( \
const GPUDevice& d, typename TTypes<T>::Flat params, \
typename TTypes<T>::ConstFlat update); \
extern template struct DenseUpdate<GPUDevice, T, OP>
extern template struct DenseUpdate<GPUDevice, T, OP>;
#define DECLARE_GPU_SPEC(T) \
DECLARE_GPU_SPEC_FOR_OP(T, DenseUpdateType::ADD); \
DECLARE_GPU_SPEC_FOR_OP(T, DenseUpdateType::SUB)

View File

@ -0,0 +1,121 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <unordered_map>
#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
#include "tensorflow/core/example/example.pb.h"
#include "tensorflow/core/example/feature.pb.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/node_builder.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
typedef std::map<std::pair<int, int>, Tensor> ExampleTensorMap;
// Pre-builds batches of serialized Example protos, keyed by
// (batch_size, num_keys), so the parsing benchmark does not pay the
// serialization cost inside the timed loop.
struct DenseStringExampleStore {
  // Builds one DT_STRING vector of `batch_size` serialized Examples for
  // every (num_keys, batch_size) combination below; each Example carries
  // features "0".."num_keys-1", all with the bytes value "abc".
  static ExampleTensorMap GetSerializedExamples() {
    ExampleTensorMap examples;
    int keys[] = {10, 100, 1000, 10000};
    int batch_sizes[] = {128};
    Example example;
    for (int num_keys : keys) {
      for (int batch_size : batch_sizes) {
        Tensor record_string(DT_STRING, TensorShape({batch_size}));
        auto string_t = record_string.vec<string>();
        example.Clear();
        for (int b = 0; b < batch_size; ++b) {
          for (int k = 0; k < num_keys; ++k) {
            string k_str = strings::Printf("%d", k);
            Feature f;
            f.mutable_bytes_list()->add_value("abc");
            Features* features = example.mutable_features();
            (*features->mutable_feature())[k_str] = f;
          }
          CHECK(example.SerializeToString(&string_t(b)));
        }
        examples[std::make_pair(batch_size, num_keys)] = record_string;
      }
    }
    return examples;
  }

  // Populated once at static-initialization time (definition below).
  static ExampleTensorMap serialized_example;
};

ExampleTensorMap DenseStringExampleStore::serialized_example =
    DenseStringExampleStore::GetSerializedExamples();
// Builds a graph containing a single "ParseExample" node that parses the
// pre-serialized (batch_size, num_keys) batch from DenseStringExampleStore
// as all-dense scalar string features with no sparse features.
static Graph* ParseDenseStringExample(int batch_size, int num_keys) {
  Graph* g = new Graph(OpRegistry::Global());
  Tensor& serialized =
      DenseStringExampleStore::serialized_example[std::make_pair(batch_size,
                                                                 num_keys)];
  // Example names input: left empty (default-initialized strings).
  Tensor names(DT_STRING, TensorShape({batch_size}));

  std::vector<NodeBuilder::NodeOut> sparse_keys;
  std::vector<NodeBuilder::NodeOut> dense_keys;
  std::vector<NodeBuilder::NodeOut> dense_defaults;
  for (int i = 0; i < num_keys; ++i) {
    Tensor dense_key(DT_STRING, TensorShape());
    dense_key.scalar<string>()() = strings::Printf("%d", i);
    dense_keys.emplace_back(test::graph::Constant(g, dense_key));

    // Empty scalar default; the store puts every key in every example, so
    // the default should never actually be consumed.
    Tensor dense_default(DT_STRING, TensorShape());
    dense_defaults.emplace_back(test::graph::Constant(g, dense_default));
  }
  std::vector<DataType> sparse_types;
  std::vector<TensorShape> dense_shapes(num_keys, TensorShape());

  Node* ret;
  TF_EXPECT_OK(NodeBuilder(g->NewName("n"), "ParseExample")
                   .Input(test::graph::Constant(g, serialized))
                   .Input(test::graph::Constant(g, names))
                   .Input(sparse_keys)
                   .Input(dense_keys)
                   .Input(dense_defaults)
                   .Attr("sparse_types", sparse_types)
                   .Attr("dense_shapes", dense_shapes)
                   .Finalize(g, &ret));
  return g;
}
// B == batch_size, K == num_keys. K must be one of 10, 100, 1000, 10000
// (the key counts pre-built in DenseStringExampleStore). Reports B * K
// items processed per iteration.
#define BM_ParseDenseStringExample(B, K)                                  \
  static void BM_ParseDenseStringExample##_##B##_##K(int iters) {         \
    int64 items_per_iter = static_cast<int64>(B) * K;                     \
    testing::ItemsProcessed(static_cast<int64>(iters) * items_per_iter);  \
    test::Benchmark("cpu", ParseDenseStringExample(B, K)).Run(iters);     \
  }                                                                       \
  BENCHMARK(BM_ParseDenseStringExample##_##B##_##K);

BM_ParseDenseStringExample(128, 10);
BM_ParseDenseStringExample(128, 100);
BM_ParseDenseStringExample(128, 1000);
BM_ParseDenseStringExample(128, 10000);
} // end namespace tensorflow

View File

@ -200,7 +200,7 @@ namespace functor {
const GPUDevice& d, typename TTypes<T, NDIM>::ConstTensor Tparams, \
typename TTypes<Index>::ConstMatrix Tindices, \
typename TTypes<T>::Flat Tout); \
extern template struct GatherNd<GPUDevice, T, Index, NDIM>
extern template struct GatherNd<GPUDevice, T, Index, NDIM>;
#define DECLARE_GPU_SPECS_INDEX(T, Index) \
DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 1); \

View File

@ -187,7 +187,7 @@ namespace functor {
const GPUDevice& d, typename TTypes<T>::ConstMatrix Tparams, \
typename TTypes<Index>::ConstFlat Tindices, \
typename TTypes<T>::Matrix Tout); \
extern template struct Gather<GPUDevice, T, Index>
extern template struct Gather<GPUDevice, T, Index>;
#define DECLARE_GPU_SPECS(T) \
DECLARE_GPU_SPECS_INDEX(T, int32); \

View File

@ -73,7 +73,7 @@ class ListDiffOp : public OpKernel {
for (int i = 0, p = 0; i < x_size; ++i) {
if (y_set.count(Tx(i)) == 0) {
OP_REQUIRES(context, p < out_size,
errors::OutOfRange(
errors::InvalidArgument(
"Tried to set output index ", p,
" when output Tensor only had ", out_size,
" elements. Check that your "

View File

@ -0,0 +1,202 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// See docs in ../ops/string_ops.cc.
#include <string>
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/lib/strings/str_util.h"
namespace tensorflow {
namespace {
// Returns the row-major strides of `shape`: strides[i] is the number of
// elements spanned by a unit step along dimension i.
const gtl::InlinedVector<int64, 8> GetStrides(const TensorShape& shape) {
  gtl::InlinedVector<int64, 8> strides(shape.dims());
  int64 running = 1;
  for (int32 d = shape.dims() - 1; d >= 0; --d) {
    strides[d] = running;
    running *= shape.dim_size(d);
  }
  return strides;
}
// Given a linear index to a subset of dimensions, full shape,
// precomputed list of running products of the full shape, and list of
// dimensions in the subset, outputs the linear index to the full shape with
// nonspecified dimensions set to 0. Dimensions must be ordered from outer-most
// to inner-most with respect to the subset linear index.
// Given a linear index over the subset of dimensions in `dim_list` (ordered
// outer-most to inner-most), the full input shape, and precomputed strides,
// returns the corresponding linear index into the full shape with all
// dimensions not in `dim_list` held at 0.
inline int64 LinearSubIndexToFullIndex(
    int64 output_index, const gtl::InlinedVector<int32, 8>& dim_list,
    const TensorShape& input_shape,
    const gtl::InlinedVector<int64, 8>& strides) {
  int64 full_index = 0;
  int64 remaining = output_index;
  // Peel off coordinates from the inner-most listed dimension outwards.
  for (int32 i = dim_list.size() - 1; i >= 0; --i) {
    const int32 dim = dim_list[i];
    const int64 coord = remaining % input_shape.dim_size(dim);
    remaining /= input_shape.dim_size(dim);
    full_index += coord * strides[dim];
  }
  return full_index;
}
// Computes the number of input elements reduced per output element.
// Returns the number of input elements that are reduced into each output
// element: the product of the sizes of all reduced dimensions.
int64 GetReductionIterSize(const gtl::InlinedVector<int32, 8>& reduced_indices,
                           const TensorShape& input_shape) {
  int64 size = 1;
  for (int32 dim : reduced_indices) {
    size *= input_shape.dim_size(dim);
  }
  return size;
}
// Computes a list of all true reduced indices, accounting for negative
// indices and empty inputs.
// Returns the list of reduced dimensions, normalizing negative indices and
// reversing their order. An empty `reduction_indices` tensor means every
// dimension of the input is reduced.
gtl::InlinedVector<int32, 8> GetReducedIndices(const Tensor& reduction_indices,
                                               int32 input_dims) {
  const auto indices_flat = reduction_indices.flat<int32>();
  const int32 num_indices = indices_flat.size();
  gtl::InlinedVector<int32, 8> result(num_indices);
  if (num_indices == 0) {
    // Empty indices: reduce over all input dimensions, in order.
    for (int32 dim = 0; dim < input_dims; ++dim) {
      result.push_back(dim);
    }
  } else {
    // Reverse the given order and map negatives into [0, input_dims).
    for (int32 i = 0; i < num_indices; ++i) {
      int32 dim = indices_flat(num_indices - i - 1);
      if (dim < 0) dim += input_dims;
      result[i] = dim;
    }
  }
  return result;
}
// Appends all unreduced dimensions to the given vector.
// Appends every dimension NOT marked in `index_is_reduced` to
// `*unreduced_indices`, preserving ascending order.
void MakeUnreducedIndices(gtl::InlinedVector<bool, 8> index_is_reduced,
                          int32 input_dims,
                          gtl::InlinedVector<int32, 8>* unreduced_indices) {
  for (int32 dim = 0; dim < input_dims; ++dim) {
    if (index_is_reduced[dim]) continue;
    unreduced_indices->push_back(dim);
  }
}
// Builds the output shape: unreduced dimensions keep their input size;
// reduced dimensions either collapse to size 1 (keep_dims) or are dropped.
TensorShape GetOutputShape(gtl::InlinedVector<bool, 8> index_is_reduced,
                           const TensorShape& input_shape, bool keep_dims) {
  TensorShape output_shape;
  for (size_t dim = 0; dim < index_is_reduced.size(); ++dim) {
    if (!index_is_reduced[dim]) {
      output_shape.AddDim(input_shape.dim_size(dim));
    } else if (keep_dims) {
      output_shape.AddDim(1);
    }
  }
  return output_shape;
}
} // namespace
// CPU kernel for the ReduceJoin op: joins the string elements of input 0
// along the dimensions named by input 1 ("reduction_indices"), separated by
// the "separator" attribute — a string-valued analogue of a reduction op.
class ReduceJoinOp : public OpKernel {
 public:
  // NOTE(review): this inherited-constructor declaration appears redundant —
  // the explicit constructor below has the same signature; confirm no other
  // OpKernel constructor is relied upon before removing.
  using OpKernel::OpKernel;

  explicit ReduceJoinOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    // keep_dims: if true, reduced dimensions are retained with size 1.
    OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_dims", &keep_dims_));
    // separator: inserted between joined elements.
    OP_REQUIRES_OK(ctx, ctx->GetAttr("separator", &separator_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const auto input_flat = input.flat<string>();
    const TensorShape& input_shape = input.shape();
    const int32 input_dims = input_shape.dims();
    OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(input_shape),
                errors::InvalidArgument("Input cannot be a scalar."));
    const Tensor& reduction_indices = context->input(1);
    const auto reduction_indices_flat = reduction_indices.flat<int32>();
    const int32 reduction_dims = reduction_indices_flat.size();
    // Empty reduction_indices indicates that all indices are reduced.
    gtl::InlinedVector<bool, 8> index_is_reduced(input_dims,
                                                 reduction_dims == 0);
    // Validate each requested dimension: in range, non-empty, no duplicates
    // (negative indices count from the end).
    for (int32 i = 0; i < reduction_dims; i++) {
      int32 reduce_index = reduction_indices_flat(i);
      const int32 true_reduce_index =
          reduce_index < 0 ? reduce_index + input_dims : reduce_index;
      OP_REQUIRES(
          context, reduce_index >= -input_dims && reduce_index < input_dims,
          errors::OutOfRange("Invalid reduction dimension ", reduce_index,
                             " for input with ", input_dims, " dimension(s)"));
      OP_REQUIRES(context, input_shape.dim_size(true_reduce_index) > 0,
                  errors::InvalidArgument("Reduction dimension ", reduce_index,
                                          " has size 0"));
      OP_REQUIRES(context, !index_is_reduced[true_reduce_index],
                  errors::InvalidArgument("Duplicate reduction dimension ",
                                          reduce_index));
      index_is_reduced[true_reduce_index] = true;
    }
    gtl::InlinedVector<int32, 8> reduced_indices =
        GetReducedIndices(reduction_indices, input_dims);
    gtl::InlinedVector<int32, 8> unreduced_indices;
    if (reduction_indices.shape().num_elements() > 0) {
      MakeUnreducedIndices(index_is_reduced, input_dims, &unreduced_indices);
    }
    const auto strides = GetStrides(input_shape);
    Tensor* output_tensor = nullptr;
    TensorShape output_shape =
        GetOutputShape(index_is_reduced, input_shape, keep_dims_);
    OP_REQUIRES_OK(context, context->allocate_output("output", output_shape,
                                                     &output_tensor));
    auto output_flat = output_tensor->flat<string>();
    const int64 reduction_iter_size =
        GetReductionIterSize(reduced_indices, input_shape);
    // Scratch buffer of views into the input, reused for every output element.
    gtl::InlinedVector<StringPiece, 8> curr_strings(reduction_iter_size);
    // For each output element: gather the reduced input elements via the
    // stride/sub-index decomposition, then join them with the separator.
    for (int64 output_index = 0; output_index < output_shape.num_elements();
         ++output_index) {
      int64 output_full_index = LinearSubIndexToFullIndex(
          output_index, unreduced_indices, input_shape, strides);
      for (int64 reduction_index = 0; reduction_index < reduction_iter_size;
           ++reduction_index) {
        int64 reduction_full_index = LinearSubIndexToFullIndex(
            reduction_index, reduced_indices, input_shape, strides);
        curr_strings[reduction_index] =
            input_flat(output_full_index + reduction_full_index);
      }
      output_flat(output_index) =
          str_util::Join(curr_strings, separator_.c_str());
    }
  }

 private:
  bool keep_dims_;    // Attr "keep_dims".
  string separator_;  // Attr "separator".
};
REGISTER_KERNEL_BUILDER(Name("ReduceJoin").Device(DEVICE_CPU), ReduceJoinOp);
} // namespace tensorflow

View File

@ -63,9 +63,9 @@ Status ReductionHelper::Simplify(const Tensor& data, const Tensor& axis,
for (int64 i = 0; i < axis.NumElements(); ++i) {
const int32 index = axis_vec(i);
if (index < 0 || index >= data.dims()) {
return errors::OutOfRange("Invalid reduction dimension (", index,
" for input with ", data.dims(),
" dimension(s)");
return errors::InvalidArgument("Invalid reduction dimension (", index,
" for input with ", data.dims(),
" dimension(s)");
}
bitmap[index] = true;
}

View File

@ -35,7 +35,7 @@ typedef Eigen::GpuDevice GPUDevice;
template struct functor::Relu6<GPUDevice, T>; \
template struct functor::Relu6Grad<GPUDevice, T>; \
template struct functor::Elu<GPUDevice, T>; \
template struct functor::EluGrad<GPUDevice, T>
template struct functor::EluGrad<GPUDevice, T>;
TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);

View File

@ -21,6 +21,7 @@ limitations under the License.
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/bounds_check.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
@ -49,17 +50,7 @@ bool ParseShapeAndSlice(const string& shape_and_slice, TensorShape* shape,
shape_and_slice);
return false;
}
int num_dims = splits.size() - 1;
shape->Clear();
for (int i = 0; i < num_dims; ++i) {
int dim;
if (!strings::safe_strto32(splits[i], &dim)) {
*error = strings::StrCat("Non numerical dimension in shape_and_slice: ",
shape_and_slice);
return false;
}
shape->AddDim(dim);
}
// The last split is the slice specification.
slice->Clear();
auto status = slice->Parse(splits.back(), slice);
@ -67,6 +58,20 @@ bool ParseShapeAndSlice(const string& shape_and_slice, TensorShape* shape,
*error = status.error_message();
return false;
}
// The first n-1 are the shape specification.
splits.pop_back();
shape->Clear();
for (const auto& s : splits) {
int dim;
if (!strings::safe_strto32(s, &dim)) {
*error = strings::StrCat("Non numerical dimension in shape_and_slice: ",
shape_and_slice);
return false;
}
shape->AddDim(dim);
}
// The specified slice must be compatible with the specified shape.
status = slice->SliceTensorShape(*shape, shape_slice);
if (!status.ok()) {
@ -91,13 +96,20 @@ void SaveTensors(
size, "elements"));
}
// Path, names, and slices if save_slices is true.
const int kFixedInputs = save_slices ? 3 : 2;
const Tensor& tensor_names_t = context->input(1);
const int64 N = tensor_names_t.NumElements();
OP_REQUIRES(context,
FastBoundsCheck(tensor_names_t.NumElements() + kFixedInputs,
std::numeric_limits<int>::max()),
errors::InvalidArgument("Too many inputs to SaveTensors"));
const int N = static_cast<int>(tensor_names_t.NumElements());
const string* tensor_shapes_and_slices_ptr = nullptr;
if (save_slices) {
const Tensor& tensor_shapes_and_slices_t = context->input(2);
OP_REQUIRES(
context, tensor_shapes_and_slices_t.NumElements() == N,
context,
tensor_shapes_and_slices_t.NumElements() == static_cast<int64>(N),
errors::InvalidArgument("Expected ", N,
" elements for the tensor "
"shapes and slices but got ",
@ -105,8 +117,6 @@ void SaveTensors(
tensor_shapes_and_slices_ptr =
tensor_shapes_and_slices_t.flat<string>().data();
}
// Path, names, and slices if save_slices is true.
const int kFixedInputs = save_slices ? 3 : 2;
OP_REQUIRES(context, context->num_inputs() == N + kFixedInputs,
errors::InvalidArgument("Expected totally ", N + kFixedInputs,
" inputs as input #1 (which is a string "
@ -123,7 +133,7 @@ void SaveTensors(
auto tensor_names_flat = tensor_names_t.flat<string>();
string error;
for (int64 i = 0; i < N; ++i) {
for (int i = 0; i < N; ++i) {
const string& name = tensor_names_flat(i);
const Tensor& input = context->input(i + kFixedInputs);
TensorShape shape(input.shape());

View File

@ -130,7 +130,7 @@ class ScatterUpdateOp : public OpKernel {
"indices has too many elements for ",
DataTypeString(DataTypeToEnum<Index>::v()), " indexing: ",
N_big, " > ", std::numeric_limits<Index>::max()));
const Index N = indices.NumElements();
const Index N = static_cast<Index>(indices.NumElements());
OP_REQUIRES(
c, params.dim_size(0) <= std::numeric_limits<Index>::max(),
errors::InvalidArgument("params.shape[0] too large for ",
@ -166,8 +166,9 @@ struct ScatterFunctor<CPUDevice, T, Index, op> {
typename TTypes<T>::Matrix params,
typename TTypes<T>::ConstMatrix updates,
typename TTypes<Index>::ConstFlat indices) {
const Index N = indices.size();
const Index limit = params.dimension(0);
// indices and params sizes were validated in DoCompute().
const Index N = static_cast<Index>(indices.size());
const Index limit = static_cast<Index>(params.dimension(0));
for (Index i = 0; i < N; i++) {
// Grab the index and check its validity. An earlier version of the
// code checked it and then grabbed it from memory a second time, which

View File

@ -0,0 +1,120 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// See docs in ../ops/data_flow_ops.cc.
#include <limits.h>
#include <vector>
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/map_util.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
// Kernel for GetSessionHandle: stores input tensor 0 in the calling
// session's TensorStore under a freshly generated id and outputs a scalar
// string handle by which it can be retrieved later.
class GetSessionHandleOp : public OpKernel {
 public:
  explicit GetSessionHandleOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* ctx) override {
    Tensor val = ctx->input(0);
    // New per-session id distinguishes successive stores of the same node.
    int64 id = ctx->session_state()->GetNewId();
    TensorStore::TensorAndKey tk{val, id, def().device()};
    OP_REQUIRES_OK(ctx, ctx->tensor_store()->AddTensor(def().name(), tk));
    // Output is a scalar string: the handle derived from this op's name.
    Tensor* handle = nullptr;
    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
    handle->flat<string>().setConstant(tk.GetHandle(def().name()));
  }

  TF_DISALLOW_COPY_AND_ASSIGN(GetSessionHandleOp);
};
REGISTER_KERNEL_BUILDER(Name("GetSessionHandle").Device(DEVICE_CPU),
GetSessionHandleOp);
#define REGISTER_GPU_KERNEL(type) \
REGISTER_KERNEL_BUILDER(Name("GetSessionHandle") \
.Device(DEVICE_GPU) \
.HostMemory("handle") \
.TypeConstraint<type>("T"), \
GetSessionHandleOp)
TF_CALL_NUMBER_TYPES(REGISTER_GPU_KERNEL);
REGISTER_GPU_KERNEL(bool);
#undef REGISTER_GPU_KERNEL
// Kernel for GetSessionTensor: looks up a tensor previously stored in the
// session state by its scalar string handle (input 0) and emits it as
// output 0.
class GetSessionTensorOp : public OpKernel {
 public:
  explicit GetSessionTensorOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* ctx) override {
    const Tensor& handle = ctx->input(0);
    const string& name = handle.scalar<string>()();
    Tensor val;
    // Fails (via OP_REQUIRES_OK) if no tensor is stored under this handle.
    OP_REQUIRES_OK(ctx, ctx->session_state()->GetTensor(name, &val));
    ctx->set_output(0, val);
  }

  TF_DISALLOW_COPY_AND_ASSIGN(GetSessionTensorOp);
};
REGISTER_KERNEL_BUILDER(Name("GetSessionTensor").Device(DEVICE_CPU),
GetSessionTensorOp);
#define REGISTER_GPU_KERNEL(type) \
REGISTER_KERNEL_BUILDER(Name("GetSessionTensor") \
.Device(DEVICE_GPU) \
.HostMemory("handle") \
.TypeConstraint<type>("dtype"), \
GetSessionTensorOp)
TF_CALL_NUMBER_TYPES(REGISTER_GPU_KERNEL);
REGISTER_GPU_KERNEL(bool);
#undef REGISTER_GPU_KERNEL
// Kernel for DeleteSessionTensor: removes the tensor identified by the
// scalar string handle (input 0) from the session state. Produces no output.
class DeleteSessionTensorOp : public OpKernel {
 public:
  explicit DeleteSessionTensorOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* ctx) override {
    const Tensor& handle = ctx->input(0);
    const string& name = handle.scalar<string>()();
    // Fails (via OP_REQUIRES_OK) if deletion is rejected by the session state.
    OP_REQUIRES_OK(ctx, ctx->session_state()->DeleteTensor(name));
  }

  TF_DISALLOW_COPY_AND_ASSIGN(DeleteSessionTensorOp);
};
REGISTER_KERNEL_BUILDER(Name("DeleteSessionTensor").Device(DEVICE_CPU),
DeleteSessionTensorOp);
REGISTER_KERNEL_BUILDER(
Name("DeleteSessionTensor").Device(DEVICE_GPU).HostMemory("handle"),
DeleteSessionTensorOp);
} // namespace tensorflow

View File

@ -155,7 +155,7 @@ class SliceOp : public OpKernel {
// TODO(agarwal): Consider multi-threading this loop for cases where
// size[0] is very large.
for (int i = 0; i < size[0]; ++i) {
const int row = begin[0] + i;
const int64 row = begin[0] + i;
if (i + 1 < size[0]) {
port::prefetch<port::PREFETCH_HINT_T0>(&output(i + 1, 0));
port::prefetch<port::PREFETCH_HINT_T0>(&input(row + 1, begin[1]));

View File

@ -89,7 +89,7 @@ class SummaryHistoOp : public OpKernel {
T v = flat(i);
if (!std::isfinite(v)) {
c->SetStatus(
errors::OutOfRange("Nan in summary histogram for: ", name()));
errors::InvalidArgument("Nan in summary histogram for: ", name()));
break;
}
histo.Add(static_cast<double>(v));

View File

@ -13,49 +13,45 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#define EIGEN_USE_THREADS
#include "tensorflow/core/kernels/tensor_array.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/aggregate_ops_cpu.h"
namespace tensorflow {
Status TensorArray::LockedWrite(OpKernelContext* ctx, const int32 index,
PersistentTensor* value) {
TF_RETURN_IF_ERROR(LockedReturnIfClosed());
size_t index_size = static_cast<size_t>(index);
if (index < 0 ||
(!dynamic_size_ && index_size >= tensors_.size())) {
return errors::InvalidArgument(
"TensorArray ", handle_.vec<string>()(1), ": Tried to write to index ",
index, " but array is not resizeable and size is: ", tensors_.size());
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
namespace tensor_array {
#define TENSOR_ARRAY_WRITE_OR_ADD(Device, T) \
template <> \
Status AddToTensor<Device, T>(OpKernelContext * ctx, Tensor * sum, \
const Tensor* current, const Tensor* add) { \
functor::Add2Functor<Device, T> add_functor; \
add_functor(ctx->template eigen_device<Device>(), sum->flat<T>(), \
current->flat<T>(), add->flat<T>()); \
return Status::OK(); \
}
if (dynamic_size_) {
// We must grow the internal TensorArray
if (index_size >= tensors_.capacity()) {
tensors_.reserve(2 * (index_size + 1));
}
if (index_size >= tensors_.size()) {
tensors_.resize(index_size + 1);
}
}
TensorAndState& t = tensors_[index];
if (t.written) {
return errors::InvalidArgument("TensorArray ", handle_.vec<string>()(1),
": Could not write to TensorArray index ",
index,
" because it has already been written to.");
}
Tensor* value_t = value->AccessTensor(ctx);
if (value_t->dtype() != dtype_) {
return errors::InvalidArgument(
"TensorArray ", handle_.vec<string>()(1),
": Could not write to TensorArray index ", index,
" because the value dtype is ", DataTypeString(value_t->dtype()),
" but TensorArray dtype is ", DataTypeString(dtype_), ".");
}
t.tensor = *value;
t.shape = value_t->shape();
t.written = true;
return Status::OK();
}
#define TENSOR_ARRAY_WRITE_OR_ADD_CPU(T) TENSOR_ARRAY_WRITE_OR_ADD(CPUDevice, T)
TF_CALL_NUMBER_TYPES(TENSOR_ARRAY_WRITE_OR_ADD_CPU)
#undef TENSOR_ARRAY_WRITE_OR_ADD_CPU
#if GOOGLE_CUDA
#define TENSOR_ARRAY_WRITE_OR_ADD_GPU(T) TENSOR_ARRAY_WRITE_OR_ADD(GPUDevice, T)
TF_CALL_GPU_NUMBER_TYPES(TENSOR_ARRAY_WRITE_OR_ADD_GPU);
#undef TENSOR_ARRAY_WRITE_OR_ADD_GPU
#endif // GOOGLE_CUDA
#undef TENSOR_ARRAY_WRITE_OR_ADD
} // namespace tensor_array
Status TensorArray::LockedRead(const int32 index, PersistentTensor* value) {
TF_RETURN_IF_ERROR(LockedReturnIfClosed());
@ -64,20 +60,25 @@ Status TensorArray::LockedRead(const int32 index, PersistentTensor* value) {
" but array size is: ", tensors_.size());
}
TensorAndState& t = tensors_[index];
if (t.read) {
return errors::InvalidArgument(
"TensorArray ", handle_.vec<string>()(1), ": Could not read index ",
index, " twice because TensorArray a read-once object.");
}
if (!t.written) {
return errors::InvalidArgument("TensorArray ", handle_.vec<string>()(1),
": Could not read from TensorArray index ",
index,
" because it has not yet been written to.");
}
if (t.cleared) {
return errors::InvalidArgument("TensorArray ", handle_.vec<string>()(1),
": Could not read index ", index,
" twice because it was cleared after a "
"previous read (perhaps try setting "
"clear_after_read = false?).");
}
*value = t.tensor;
if (clear_after_read_) {
t.tensor = PersistentTensor();
t.cleared = true;
}
t.read = true;
t.tensor = PersistentTensor();
return Status::OK();
}

View File

@ -24,22 +24,60 @@ limitations under the License.
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/aggregate_ops.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
namespace tensor_array {
// Full implementations are in tensor_array.cc
template <typename Device, typename T>
Status AddToTensor(OpKernelContext* ctx, Tensor* sum, const Tensor* current,
const Tensor* add) {
return errors::InvalidArgument(
"tensor_array::AddToTensor type not supported: ",
DataTypeString(DataTypeToEnum<T>::value));
};
#define TENSOR_ARRAY_WRITE_OR_ADD(Device, T) \
template <> \
Status AddToTensor<Device, T>(OpKernelContext * ctx, Tensor * sum, \
const Tensor* current, const Tensor* add);
#define TENSOR_ARRAY_WRITE_OR_ADD_CPU(T) TENSOR_ARRAY_WRITE_OR_ADD(CPUDevice, T)
TF_CALL_NUMBER_TYPES(TENSOR_ARRAY_WRITE_OR_ADD_CPU)
#undef TENSOR_ARRAY_WRITE_OR_ADD_CPU
#if GOOGLE_CUDA
#define TENSOR_ARRAY_WRITE_OR_ADD_GPU(T) TENSOR_ARRAY_WRITE_OR_ADD(GPUDevice, T)
TF_CALL_GPU_NUMBER_TYPES(TENSOR_ARRAY_WRITE_OR_ADD_GPU);
#undef TENSOR_ARRAY_WRITE_OR_ADD_GPU
#endif // GOOGLE_CUDA
#undef TENSOR_ARRAY_WRITE_OR_ADD
} // namespace tensor_array
// The TensorArray object keeps an array of PersistentTensors. It
// allows reading from the array and writing to the array.
//
// Important properties:
// * Reading and writing to a particular index in the TensorArray
// is allowed at most once per index.
// * Upon reading an entry, that entry is cleared from the array and
// marked as read. This allows removal of Tensor from memory
// as soon as it is not needed. Its shape is saved.
// * No deep copies of any PersistentTensor are ever made.
// * Usually, writing to a particular index in the TensorArray is allowed at
// most once per index. In a special case, writes with the flag
// multiple_writes_aggregate allow multiple writes to the same
// index. In this case, the writes are summed.
// * Multiple reads are supported.
// * Deep copies of PersistentTensors are rarely made. The only
// time they are made is when WriteOrAggregate is called at least twice
// on the same index with the flag multiple_writes_aggregate = True.
// * Reading and Writing to the array is protected by a mutex.
// All operations on a TensorArray are thread-safe.
// * A TensorArray may be preemptively closed, which releases all
@ -51,8 +89,12 @@ namespace tensorflow {
// * Write-Once semantics mean the gradient of a TensorArray Read never has to
// worry which of multiple writes to that index the gradient value
// is meant for.
// * Read-Once semantics mean the TensorArray never sees
// multiple writes to the same index as part of gradient aggregation.
// * Read-Many semantics (when using clear_after_read=false) allow the
// TensorArray to be read, packed, or concatenated multiple times;
// and the gradient operations use the multiple_writes_aggregate
// flag to aggregate the backprop writes. Multiple backprop writes to
// the same index are partial gradients corresponding to the
// multiple reads of that index in the forward phase.
//
class TensorArray : public ResourceBase {
public:
@ -61,11 +103,15 @@ class TensorArray : public ResourceBase {
// can hold more than MAX_INT entries, in practice we do not expect
// users to construct this many Tensors for storage in a TensorArray.
TensorArray(const DataType& dtype, const Tensor& handle, int32 N,
bool dynamic_size)
bool dynamic_size, bool multiple_writes_aggregate,
bool clear_after_read)
: dtype_(dtype),
handle_(handle),
closed_(false),
dynamic_size_(dynamic_size),
multiple_writes_aggregate_(multiple_writes_aggregate),
gradients_disallowed_(false),
clear_after_read_(clear_after_read),
tensors_(N) {}
// Write PersistentTensor 'value' to index 'index'.
@ -77,25 +123,40 @@ class TensorArray : public ResourceBase {
// Otherwise:
// The index is in [0, N) where N == Size()
// * The dtype of the Tensor in 'value' matches the TensorArray's dtype.
// * The Tensor at 'index' has not yet been written to.
// * If multiple_writes_aggregate is false:
// The Tensor at 'index' has not yet been written to.
// * If multiple_writes_aggregate is true:
// The Tensor at 'index' has the same shape as value.
//
// Side effects:
// * The underlying Tensor in 'value' has a new reference to it.
// * Index 'index' is marked as written.
// * On the first write to 'index':
// - The underlying Tensor in 'value' has a new reference to it.
// - The index 'index' is marked as written.
// * If multiple_writes_aggregate is false, subsequent writes to 'index'
// raise an InvalidArgument error.
// * If multiple_writes_aggregate is true, subsequent writes to 'index':
// - The underlying Tensors in 'value' and from the first write
// are released and a local PersistentTensor is created.
// - Index 'index' is also marked as local_copy.
// - The gradient_disallowed flag is set true (GradientAllowed()
// will now return false).
//
// Note, value is passed as a pointer because we its underlying
// Tensor's shape is accessed. Otherwise it is not modified.
Status Write(OpKernelContext* ctx, const int32 index,
PersistentTensor* value) {
template <typename Device, typename T>
Status WriteOrAggregate(OpKernelContext* ctx, const int32 index,
PersistentTensor* value) {
mutex_lock l(mu_);
return LockedWrite(ctx, index, value);
return LockedWriteOrAggregate<Device, T>(ctx, index, value);
}
Status WriteMany(OpKernelContext* ctx,
std::vector<PersistentTensor>* values) {
template <typename Device, typename T>
Status WriteOrAggregateMany(OpKernelContext* ctx,
std::vector<PersistentTensor>* values) {
mutex_lock l(mu_);
for (int32 i = values->size() - 1; i >= 0; --i) {
TF_RETURN_IF_ERROR(LockedWrite(ctx, i, &(*values)[i]));
Status s = LockedWriteOrAggregate<Device, T>(ctx, i, &(*values)[i]);
TF_RETURN_IF_ERROR(s);
}
return Status::OK();
}
@ -106,13 +167,15 @@ class TensorArray : public ResourceBase {
// * The TensorArray is not closed
// * The index is in [0, N)
// * The Tensor at 'index' has been written to.
// * The Tensor at 'index' has not already been read.
// * The Tensor at 'index' has not been read from with flag
// clear_after_read = true.
//
// Side effects:
// * The PersistentTensor at 'index' is cleared from the given index.
// * The reference to the underlying Tensor at 'index' is shifted to
// * If clear_after_read is true, the reference to the underlying
// Tensor is deleted.
// * The reference to the underlying Tensor at 'index' is copied to
// the returned '*value'.
// * Index 'index' is marked as read.
// * The index is marked as read (it cannot be rewritten to).
Status Read(const int32 index, PersistentTensor* value) {
mutex_lock l(mu_);
return LockedRead(index, value);
@ -161,6 +224,11 @@ class TensorArray : public ResourceBase {
return dynamic_size_;
}
bool GradientsAllowed() {
mutex_lock l(mu_);
return !gradients_disallowed_;
}
// Clear the TensorArray, including any Tensor references, and mark as closed.
void ClearAndMarkClosed() {
mutex_lock l(mu_);
@ -175,6 +243,11 @@ class TensorArray : public ResourceBase {
Status LockedWrite(OpKernelContext* ctx, const int32 index,
PersistentTensor* value) EXCLUSIVE_LOCKS_REQUIRED(mu_);
template <typename Device, typename T>
Status LockedWriteOrAggregate(OpKernelContext* ctx, const int32 index,
PersistentTensor* value)
EXCLUSIVE_LOCKS_REQUIRED(mu_);
Status LockedRead(const int32 index, PersistentTensor* value)
EXCLUSIVE_LOCKS_REQUIRED(mu_);
@ -191,25 +264,134 @@ class TensorArray : public ResourceBase {
mutex mu_;
bool closed_
GUARDED_BY(mu_); // Marks that the tensor_array_ has been cleared.
// Marks that the tensor_array_ has been cleared.
bool closed_ GUARDED_BY(mu_);
bool dynamic_size_; // Determines if Writes are allowed to grow the array.
// Writes are allowed to grow the array.
bool dynamic_size_;
// Multiple writes to the same index will result in summation of the
// values (used by backprop)
bool multiple_writes_aggregate_;
// If multiple Writes were attempted (e.g. via attribute
// multiple_writes_aggregate), then gradients are disallowed.
bool gradients_disallowed_ GUARDED_BY(mu_);
// After a read at an index, clear away its PersistentTensor to
// release memory.
bool clear_after_read_;
// TensorAndState is used to keep track of the PersistentTensors
// stored in the TensorArray, along with their shapes, and a boolean
// that determines whether they have already been read or not.
struct TensorAndState {
TensorAndState() : written(false), read(false) {}
TensorAndState()
: written(false), read(false), cleared(false), local_copy(false) {}
PersistentTensor tensor;
TensorShape shape;
bool written; // True if a Tensor has been written to the index.
bool read; // True if a Tensor has been written to and read from the index.
bool cleared; // True if a tensor has been read with
// clear_after_read = true;
// Used by writes when multiple_writes_aggregate is true. In this
// case, the first time a value is written, it is a shallow copy.
// The second time a value is written, it is aggregated. However,
// in this case a new Tensor must be constructed to hold the
// aggregated value. This flag marks that such a Tensor is being
// used. All future writes will aggregate to the existing local Tensor.
bool local_copy;
};
// The list of underlying PersistentTensors and states.
std::vector<TensorAndState> tensors_ GUARDED_BY(mu_);
};
// Writes `value` at `index`, or — when multiple_writes_aggregate_ is set and
// the index was already written — sums it into the existing entry. Grows the
// array when dynamic_size_ is set. Caller must hold mu_ (see
// EXCLUSIVE_LOCKS_REQUIRED on the declaration).
template <typename Device, typename T>
Status TensorArray::LockedWriteOrAggregate(OpKernelContext* ctx,
                                           const int32 index,
                                           PersistentTensor* value) {
  TF_RETURN_IF_ERROR(LockedReturnIfClosed());
  size_t index_size = static_cast<size_t>(index);
  if (index < 0 || (!dynamic_size_ && index_size >= tensors_.size())) {
    return errors::InvalidArgument(
        "TensorArray ", handle_.vec<string>()(1), ": Tried to write to index ",
        index, " but array is not resizeable and size is: ", tensors_.size());
  }
  if (dynamic_size_) {
    // We must grow the internal TensorArray
    if (index_size >= tensors_.capacity()) {
      // Reserve double the needed size to amortize repeated growth.
      tensors_.reserve(2 * (index_size + 1));
    }
    if (index_size >= tensors_.size()) {
      tensors_.resize(index_size + 1);
    }
  }
  TensorAndState& t = tensors_[index];
  Tensor* value_t = value->AccessTensor(ctx);
  if (value_t->dtype() != dtype_) {
    return errors::InvalidArgument(
        "TensorArray ", handle_.vec<string>()(1),
        ": Could not write to TensorArray index ", index,
        " because the value dtype is ", DataTypeString(value_t->dtype()),
        " but TensorArray dtype is ", DataTypeString(dtype_), ".");
  }
  // Once an index has been read it can no longer be (re)written.
  if (t.read) {
    return errors::InvalidArgument("TensorArray ", handle_.vec<string>()(1),
                                   ": Could not write to TensorArray index ",
                                   index, " because it has already been read.");
  }
  if (!multiple_writes_aggregate_ && t.written) {
    return errors::InvalidArgument("TensorArray ", handle_.vec<string>()(1),
                                   ": Could not write to TensorArray index ",
                                   index,
                                   " because it has already been written to.");
  }
  if (t.written) {
    DCHECK(multiple_writes_aggregate_);
    // Check that value_t shape matches t.shape
    if (value_t->shape() != t.shape) {
      return errors::InvalidArgument(
          "TensorArray ", handle_.vec<string>()(1),
          ": Could not aggregate to TensorArray index ", index,
          " because the existing shape is ", t.shape.DebugString(),
          " but the new input shape is ", value_t->shape().DebugString(), ".");
    }
    Tensor* existing_t = t.tensor.AccessTensor(ctx);
    if (t.local_copy) {
      // We already own a private aggregation buffer: sum in place.
      Status s = tensor_array::AddToTensor<Device, T>(ctx, existing_t,
                                                      existing_t, value_t);
      TF_RETURN_IF_ERROR(s);
    } else {
      // Second write to this index: allocate a private buffer so the
      // first write's (shared) tensor is not mutated, then sum into it.
      PersistentTensor local_tensor;
      Tensor* local_tensor_t;
      TF_RETURN_IF_ERROR(ctx->allocate_persistent(
          dtype_, existing_t->shape(), &local_tensor, &local_tensor_t));
      Status s = tensor_array::AddToTensor<Device, T>(ctx, local_tensor_t,
                                                      existing_t, value_t);
      TF_RETURN_IF_ERROR(s);
      t.tensor = local_tensor;
      t.local_copy = true;
    }
    // We've aggregated the values, so disallow backprop on this
    // TensorArray.
    gradients_disallowed_ = true;
  } else {
    // First write: take a shallow reference and record shape/written state.
    t.tensor = *value;
    t.shape = value_t->shape();
    t.written = true;
  }
  return Status::OK();
}
} // namespace tensorflow
#endif // TENSORFLOW_KERNELS_TENSOR_ARRAY_H_

View File

@ -125,6 +125,8 @@ class TensorArrayOp : public TensorArrayCreationOp {
: TensorArrayCreationOp(context) {
OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
OP_REQUIRES_OK(context, context->GetAttr("dynamic_size", &dynamic_size_));
OP_REQUIRES_OK(context,
context->GetAttr("clear_after_read", &clear_after_read_));
OP_REQUIRES_OK(context,
context->GetAttr("tensor_array_name", &tensor_array_name_));
if (tensor_array_name_ == "") tensor_array_name_ = name();
@ -148,7 +150,8 @@ class TensorArrayOp : public TensorArrayCreationOp {
handle(1) = tensor_array_name_;
TensorArray* tensor_array = new TensorArray(
dtype_, *tensor_array_output_handle, size, dynamic_size_);
dtype_, *tensor_array_output_handle, size, dynamic_size_,
false /* multiple_writes_aggregate */, clear_after_read_);
TF_RETURN_IF_ERROR(rm->Create(handle(0), tensor_array_name_, tensor_array));
@ -160,6 +163,7 @@ class TensorArrayOp : public TensorArrayCreationOp {
private:
DataType dtype_;
bool dynamic_size_;
bool clear_after_read_;
string tensor_array_name_; // The name used to create the TensorArray.
TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayOp);
@ -220,11 +224,20 @@ class TensorArrayGradOp : public TensorArrayCreationOp {
tensor_array->DisableDynamicSize();
TF_RETURN_IF_ERROR(tensor_array->Size(&array_size));
if (!tensor_array->GradientsAllowed()) {
return errors::InvalidArgument(
"Unable to create a gradients TensorArray for ", tensor_array_name,
". Perhaps you used the multiple_writes_aggregate flag on a "
"previous write? Gradient calculation is impossible when multiple "
"writes are performed to the same index.");
}
auto creator = [this, tensor_array, array_size,
tensor_array_output_handle](TensorArray** ret) {
*ret =
new TensorArray(tensor_array->ElemType(), *tensor_array_output_handle,
array_size, false /* dynamic_size */);
*ret = new TensorArray(
tensor_array->ElemType(), *tensor_array_output_handle, array_size,
false /* dynamic_size */, true /* multiple_writes_aggregate */,
                            true /* clear_after_read */);
return Status::OK();
};
@ -285,10 +298,10 @@ class TensorArrayWriteOp : public OpKernel {
" but Op is trying to write dtype ",
DataTypeString(tensor_value->dtype()), "."));
PersistentTensor persistent_tensor(*tensor_value);
OP_REQUIRES_OK(ctx, tensor_array->Write(ctx, index, &persistent_tensor));
Status s = tensor_array->WriteOrAggregate<Device, T>(ctx, index,
&persistent_tensor);
OP_REQUIRES_OK(ctx, s);
}
bool IsExpensive() override { return false; }
};
#define REGISTER_WRITE(type) \
@ -737,7 +750,9 @@ class TensorArrayUnpackOp : public OpKernel {
write_values.push_back(persistent_tensor);
}
OP_REQUIRES_OK(ctx, tensor_array->WriteMany(ctx, &write_values));
Status s =
tensor_array->WriteOrAggregateMany<Device, T>(ctx, &write_values);
OP_REQUIRES_OK(ctx, s);
}
};
@ -871,7 +886,9 @@ class TensorArraySplitOp : public OpKernel {
write_values.push_back(persistent_tensor);
}
OP_REQUIRES_OK(ctx, tensor_array->WriteMany(ctx, &write_values));
Status s =
tensor_array->WriteOrAggregateMany<Device, T>(ctx, &write_values);
OP_REQUIRES_OK(ctx, s);
}
};

View File

@ -49,7 +49,12 @@ class InvertPermutationOp : public OpKernel {
context, TensorShapeUtils::IsVector(input.shape()),
errors::InvalidArgument("invert_permutation expects a 1D vector."));
auto Tin = input.vec<int32>();
const int N = Tin.size();
OP_REQUIRES(context,
FastBoundsCheck(Tin.size(), std::numeric_limits<int32>::max()),
errors::InvalidArgument("permutation of nonnegative int32s "
"must have <= int32 max elements"));
const int32 N =
static_cast<int32>(Tin.size()); // Safe: bounds-checked above.
Tensor* output = nullptr;
OP_REQUIRES_OK(context,
context->allocate_output(0, input.shape(), &output));

View File

@ -99,7 +99,7 @@ enum Code {
// ABORTED, and UNAVAILABLE.
ABORTED = 10;
// Operation was attempted past the valid range. E.g., seeking or
// Operation tried to iterate past the valid input range. E.g., seeking or
// reading past end of file.
//
// Unlike INVALID_ARGUMENT, this error indicates a problem that may

View File

@ -15,6 +15,16 @@ limitations under the License.
#include "tensorflow/core/lib/core/threadpool.h"
#ifdef TENSORFLOW_USE_EIGEN_THREADPOOL
#define EIGEN_USE_THREADS
#define EIGEN_USE_CUSTOM_THREAD_POOL
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#else
#include <deque>
#include <thread>
#include <vector>
#endif
#include "tensorflow/core/platform/denormal.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mutex.h"
@ -24,26 +34,97 @@ limitations under the License.
namespace tensorflow {
namespace thread {
struct ThreadPool::Waiter {
condition_variable cv;
bool ready;
#ifdef TENSORFLOW_USE_EIGEN_THREADPOOL
struct EigenEnvironment {
typedef Thread EnvThread;
struct Task {
std::function<void()> f;
uint64 trace_id;
};
Env* const env_;
const ThreadOptions thread_options_;
const string name_;
EigenEnvironment(Env* env, const ThreadOptions& thread_options,
const string& name)
: env_(env), thread_options_(thread_options), name_(name) {}
EnvThread* CreateThread(std::function<void()> f) {
return env_->StartThread(thread_options_, name_, [=]() {
// Set the processor flag to flush denormals to zero
port::ScopedFlushDenormal flush;
f();
});
}
Task CreateTask(std::function<void()> f) {
uint64 id = 0;
if (port::Tracing::IsActive()) {
id = port::Tracing::UniqueId();
port::Tracing::RecordEvent(port::Tracing::EventCategory::kScheduleClosure,
id);
}
return Task{std::move(f), id};
}
void ExecuteTask(const Task& t) {
if (t.trace_id != 0) {
port::Tracing::ScopedActivity region(
port::Tracing::EventCategory::kRunClosure, t.trace_id);
t.f();
} else {
t.f();
}
}
};
ThreadPool::ThreadPool(Env* env, const string& name, int num_threads)
: ThreadPool(env, ThreadOptions(), name, num_threads) {}
struct ThreadPool::Impl : Eigen::ThreadPoolTempl<EigenEnvironment> {
Impl(Env* env, const ThreadOptions& thread_options, const string& name,
int num_threads)
: Eigen::ThreadPoolTempl<EigenEnvironment>(
num_threads, EigenEnvironment(env, thread_options, name)) {}
};
ThreadPool::ThreadPool(Env* env, const ThreadOptions& thread_options,
#else
struct ThreadPool::Impl {
Impl(Env* env, const ThreadOptions& thread_options, const string& name,
int num_threads);
~Impl();
void Schedule(std::function<void()> fn);
private:
struct Waiter {
condition_variable cv;
bool ready;
};
struct Task {
std::function<void()> fn;
uint64 id;
};
void WorkerLoop();
const string name_;
mutex mu_;
std::vector<Thread*> threads_; // All threads
std::vector<Waiter*> waiters_; // Stack of waiting threads.
std::deque<Task> pending_; // Queue of pending work
};
ThreadPool::Impl::Impl(Env* env, const ThreadOptions& thread_options,
const string& name, int num_threads)
: name_(name) {
CHECK_GE(num_threads, 1);
string name_prefix = "tf_" + name_;
for (int i = 0; i < num_threads; i++) {
threads_.push_back(env->StartThread(thread_options, name_prefix,
[this]() { WorkerLoop(); }));
threads_.push_back(
env->StartThread(thread_options, name, [this]() { WorkerLoop(); }));
}
}
ThreadPool::~ThreadPool() {
ThreadPool::Impl::~Impl() {
{
// Wait for all work to get done.
mutex_lock l(mu_);
@ -66,13 +147,7 @@ ThreadPool::~ThreadPool() {
}
}
bool ThreadPool::HasPendingClosures() const {
mutex_lock l(mu_);
return pending_.size() != 0;
}
void ThreadPool::Schedule(std::function<void()> fn) {
CHECK(fn != nullptr);
void ThreadPool::Impl::Schedule(std::function<void()> fn) {
uint64 id = 0;
if (port::Tracing::IsActive()) {
id = port::Tracing::UniqueId();
@ -90,7 +165,7 @@ void ThreadPool::Schedule(std::function<void()> fn) {
}
}
void ThreadPool::WorkerLoop() {
void ThreadPool::Impl::WorkerLoop() {
// Set the processor flag to flush denormals to zero
port::ScopedFlushDenormal flush;
@ -107,22 +182,40 @@ void ThreadPool::WorkerLoop() {
}
}
// Pick up pending work
Item item = pending_.front();
Task t = pending_.front();
pending_.pop_front();
if (item.fn == nullptr) {
if (t.fn == nullptr) {
break;
}
mu_.unlock();
if (item.id != 0) {
if (t.id != 0) {
port::Tracing::ScopedActivity region(
port::Tracing::EventCategory::kRunClosure, item.id);
item.fn();
port::Tracing::EventCategory::kRunClosure, t.id);
t.fn();
} else {
item.fn();
t.fn();
}
mu_.lock();
}
}
#endif
ThreadPool::ThreadPool(Env* env, const string& name, int num_threads)
: ThreadPool(env, ThreadOptions(), name, num_threads) {}
ThreadPool::ThreadPool(Env* env, const ThreadOptions& thread_options,
const string& name, int num_threads) {
CHECK_GE(num_threads, 1);
impl_.reset(
new ThreadPool::Impl(env, thread_options, "tf_" + name, num_threads));
}
ThreadPool::~ThreadPool() {}
void ThreadPool::Schedule(std::function<void()> fn) {
CHECK(fn != nullptr);
impl_->Schedule(std::move(fn));
}
} // namespace thread
} // namespace tensorflow

View File

@ -16,13 +16,10 @@ limitations under the License.
#ifndef TENSORFLOW_LIB_CORE_THREADPOOL_H_
#define TENSORFLOW_LIB_CORE_THREADPOOL_H_
#include <deque>
#include <functional>
#include <thread>
#include <vector>
#include <memory>
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
@ -45,28 +42,15 @@ class ThreadPool {
// Wait until all scheduled work has finished and then destroy the
// set of threads.
virtual ~ThreadPool();
~ThreadPool();
// Schedule fn() for execution in the pool of threads.
virtual void Schedule(std::function<void()> fn);
void Schedule(std::function<void()> fn);
virtual bool HasPendingClosures() const;
struct Impl;
private:
struct Waiter;
struct Item {
std::function<void()> fn;
uint64 id;
};
void WorkerLoop();
const string name_;
mutable mutex mu_;
std::vector<Thread*> threads_; // All threads
std::vector<Waiter*> waiters_; // Stack of waiting threads.
std::deque<Item> pending_; // Queue of pending work
std::unique_ptr<Impl> impl_;
TF_DISALLOW_COPY_AND_ASSIGN(ThreadPool);
};

View File

@ -18,6 +18,7 @@ limitations under the License.
#include <atomic>
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"

View File

@ -581,9 +581,9 @@ TEST(TableTest, ApproximateOffsetOfCompressed) {
ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 10, 100));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6000));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 4000));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 4000));
ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 7000));
}
TEST(TableTest, SeekToFirstKeyDoesNotReadTooMuch) {

View File

@ -224,6 +224,87 @@ diagonal: The extracted diagonal.
)doc");
// --------------------------------------------------------------------------
// Registers BatchMatrixDiag: expands a batch of diagonal vectors into a
// batch of square diagonal matrices.  Fixes the doc string's broken
// markup: `[I, J, K, ..., N, N]` was missing its opening backtick
// (it previously rendered as "[I, J, K, ..., N, N]`").
REGISTER_OP("BatchMatrixDiag")
    .Input("diagonal: T")
    .Output("output: T")
    .Attr("T: type")
    .Doc(R"doc(
Returns a batched diagonal tensor with a given batched diagonal values.

Given a `diagonal`, this operation returns a tensor with the `diagonal` and
everything else padded with zeros. The diagonal is computed as follows:

Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a
tensor of rank `k+1` with dimensions `[I, J, K, ..., N, N]` where:

`output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.

For example:

```prettyprint
# 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]]

and diagonal.shape = (2, 4)

tf.batch_matrix_diag(diagonal) ==> [[[1, 0, 0, 0]
                                     [0, 2, 0, 0]
                                     [0, 0, 3, 0]
                                     [0, 0, 0, 4]],
                                    [[5, 0, 0, 0]
                                     [0, 6, 0, 0]
                                     [0, 0, 7, 0]
                                     [0, 0, 0, 8]]]

which has shape (2, 4, 4)
```

diagonal: Rank `k`, where `k >= 1`.
output: Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`.
)doc");
// --------------------------------------------------------------------------
// Registers BatchMatrixDiagPart: extracts the main diagonal of each
// innermost matrix of a batched input (the inverse of BatchMatrixDiag).
REGISTER_OP("BatchMatrixDiagPart")
    .Input("input: T")
    .Output("diagonal: T")
    .Attr("T: type")
    .Doc(R"doc(
Returns the batched diagonal part of a batched tensor.

This operation returns a tensor with the `diagonal` part
of the batched `input`. The `diagonal` part is computed as follows:

Assume `input` has `k` dimensions `[I, J, K, ..., N, N]`, then the output is a
tensor of rank `k - 1` with dimensions `[I, J, K, ..., N]` where:

`diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`.

The input must be at least a matrix.

For example:

```prettyprint
# 'input' is [[[1, 0, 0, 0]
               [0, 2, 0, 0]
               [0, 0, 3, 0]
               [0, 0, 0, 4]],
              [[5, 0, 0, 0]
               [0, 6, 0, 0]
               [0, 0, 7, 0]
               [0, 0, 0, 8]]]

and input.shape = (2, 4, 4)

tf.batch_matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]]

which has shape (2, 4)
```

input: Rank `k` tensor where `k >= 2` and the last two dimensions are equal.
diagonal: The extracted diagonal(s) having shape
  `diagonal.shape = input.shape[:-1]`.
)doc");
// --------------------------------------------------------------------------
REGISTER_OP("Reverse")
.Input("tensor: T")

View File

@ -3004,6 +3004,36 @@ op {
}
}
}
op {
name: "BatchMatrixDiag"
input_arg {
name: "diagonal"
type_attr: "T"
}
output_arg {
name: "output"
type_attr: "T"
}
attr {
name: "T"
type: "type"
}
}
op {
name: "BatchMatrixDiagPart"
input_arg {
name: "input"
type_attr: "T"
}
output_arg {
name: "diagonal"
type_attr: "T"
}
attr {
name: "T"
type: "type"
}
}
op {
name: "BatchMatrixInverse"
input_arg {
@ -3050,6 +3080,38 @@ op {
}
}
}
op {
name: "BatchMatrixSolve"
input_arg {
name: "matrix"
type_attr: "T"
}
input_arg {
name: "rhs"
type_attr: "T"
}
output_arg {
name: "output"
type_attr: "T"
}
attr {
name: "adjoint"
type: "bool"
default_value {
b: false
}
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
}
}
}
}
op {
name: "BatchMatrixSolveLs"
input_arg {
@ -3118,6 +3180,45 @@ op {
}
}
}
op {
name: "BatchMatrixTriangularSolve"
input_arg {
name: "matrix"
type_attr: "T"
}
input_arg {
name: "rhs"
type_attr: "T"
}
output_arg {
name: "output"
type_attr: "T"
}
attr {
name: "lower"
type: "bool"
default_value {
b: true
}
}
attr {
name: "adjoint"
type: "bool"
default_value {
b: false
}
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
}
}
}
}
op {
name: "BatchNormWithGlobalNormalization"
input_arg {
@ -5258,6 +5359,13 @@ op {
}
}
}
op {
name: "DeleteSessionTensor"
input_arg {
name: "handle"
type: DT_STRING
}
}
op {
name: "DepthToSpace"
input_arg {
@ -6509,6 +6617,36 @@ op {
}
}
}
op {
name: "GetSessionHandle"
input_arg {
name: "value"
type_attr: "T"
}
output_arg {
name: "handle"
type: DT_STRING
}
attr {
name: "T"
type: "type"
}
}
op {
name: "GetSessionTensor"
input_arg {
name: "handle"
type: DT_STRING
}
output_arg {
name: "value"
type_attr: "dtype"
}
attr {
name: "dtype"
type: "type"
}
}
op {
name: "Greater"
input_arg {
@ -8323,6 +8461,38 @@ op {
}
}
}
op {
name: "MatrixSolve"
input_arg {
name: "matrix"
type_attr: "T"
}
input_arg {
name: "rhs"
type_attr: "T"
}
output_arg {
name: "output"
type_attr: "T"
}
attr {
name: "adjoint"
type: "bool"
default_value {
b: false
}
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
}
}
}
}
op {
name: "MatrixSolveLs"
input_arg {
@ -8391,6 +8561,45 @@ op {
}
}
}
op {
name: "MatrixTriangularSolve"
input_arg {
name: "matrix"
type_attr: "T"
}
input_arg {
name: "rhs"
type_attr: "T"
}
output_arg {
name: "output"
type_attr: "T"
}
attr {
name: "lower"
type: "bool"
default_value {
b: true
}
}
attr {
name: "adjoint"
type: "bool"
default_value {
b: false
}
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
}
}
}
}
op {
name: "Max"
input_arg {
@ -11056,6 +11265,35 @@ op {
type: DT_FLOAT
}
}
op {
name: "ReduceJoin"
input_arg {
name: "inputs"
type: DT_STRING
}
input_arg {
name: "reduction_indices"
type: DT_INT32
}
output_arg {
name: "output"
type: DT_STRING
}
attr {
name: "keep_dims"
type: "bool"
default_value {
b: false
}
}
attr {
name: "separator"
type: "string"
default_value {
s: ""
}
}
}
op {
name: "RefEnter"
input_arg {
@ -16924,6 +17162,44 @@ op {
}
is_stateful: true
}
op {
name: "TensorArray"
input_arg {
name: "size"
type: DT_INT32
}
output_arg {
name: "handle"
type: DT_STRING
is_ref: true
}
attr {
name: "dtype"
type: "type"
}
attr {
name: "dynamic_size"
type: "bool"
default_value {
b: false
}
}
attr {
name: "clear_after_read"
type: "bool"
default_value {
b: true
}
}
attr {
name: "tensor_array_name"
type: "string"
default_value {
s: ""
}
}
is_stateful: true
}
op {
name: "TensorArrayClose"
input_arg {

View File

@ -389,6 +389,7 @@ REGISTER_OP("TensorArray")
.Input("size: int32")
.Attr("dtype: type")
.Attr("dynamic_size: bool = false")
.Attr("clear_after_read: bool = true")
.Attr("tensor_array_name: string = ''")
.Output("handle: Ref(string)")
.SetIsStateful()
@ -401,6 +402,9 @@ size: The size of the array.
dtype: The type of the elements on the tensor_array.
dynamic_size: A boolean that determines whether writes to the TensorArray
are allowed to grow the size. By default, this is not allowed.
clear_after_read: If true (default), Tensors in the TensorArray are cleared
after being read. This disables multiple read semantics but allows early
release of memory.
tensor_array_name: Overrides the name used for the temporary tensor_array
resource. Default value is the name of the 'TensorArray' op (which
is guaranteed unique).
@ -483,7 +487,7 @@ REGISTER_OP("TensorArrayRead")
.Output("value: dtype")
.Attr("dtype: type")
.Doc(R"doc(
Read an element from the TensorArray.
Read an element from the TensorArray into output `value`.
handle: The handle to a TensorArray.
dtype: The type of the elem that is returned.
@ -497,7 +501,7 @@ REGISTER_OP("TensorArrayPack")
.Output("value: dtype")
.Attr("dtype: type")
.Doc(R"doc(
Pack the elements from the TensorArray.
Pack the elements from the TensorArray into output `value`.
All elements must have the same shape.
@ -530,12 +534,17 @@ REGISTER_OP("TensorArrayConcat")
.Output("lengths: int64")
.Attr("dtype: type")
.Doc(R"doc(
Concat the elements from the TensorArray.
Concat the elements from the TensorArray into output `value`.
Takes `T` elements of shapes
```
(n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...)
```
Takes T elements of shapes (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...),
..., (n(T-1) x d0 x d1 x ...)
and concatenates them into a Tensor of shape:
(n0 + n1 + ... + n(T-1) x d0 x d1 x ...).
```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```
All elements must have the same shape (excepting the first dimension).
@ -546,7 +555,7 @@ value: All of the elements in the TensorArray, concatenated along the first
axis.
lengths: A vector of the row sizes of the original T elements in the
value output. In the example above, this would be the values:
(n1, n2, ..., n(T-1))
`(n1, n2, ..., n(T-1))`.
)doc");
REGISTER_OP("TensorArraySplit")
@ -560,15 +569,22 @@ REGISTER_OP("TensorArraySplit")
Split the data from the input value into TensorArray elements.
Assuming that `lengths` takes on values
(n0, n1, ..., n(T-1))
```(n0, n1, ..., n(T-1))```
and that `value` has shape
(n0 + n1 + ... + n(T-1) x d0 x d1 x ...),
```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```,
this splits values into a TensorArray with T tensors.
TensorArray index t will be the subtensor of values with starting position
(n0 + n1 + ... + n(t-1), 0, 0, ...)
```(n0 + n1 + ... + n(t-1), 0, 0, ...)```
and having size
nt x d0 x d1 x ...
```nt x d0 x d1 x ...```
handle: The handle to a TensorArray.
value: The concatenated tensor to write to the TensorArray.
@ -670,4 +686,35 @@ keys: Keys of type Tkey.
values: Values of type Tval. Same shape as `keys`.
)doc");
// Registers GetSessionHandle: stores a tensor in the current session's
// state and yields a string handle that later ops can use to retrieve it.
REGISTER_OP("GetSessionHandle")
    .Input("value: T")
    .Output("handle: string")
    .Attr("T: type")
    .Doc(R"doc(
Store the input tensor in the state of the current session.

value: The tensor to be stored.
handle: The handle for the tensor stored in the session state.
)doc");

// Registers GetSessionTensor: looks up a tensor previously stored in the
// session state by the string handle GetSessionHandle produced.
REGISTER_OP("GetSessionTensor")
    .Input("handle: string")
    .Output("value: dtype")
    .Attr("dtype: type")
    .Doc(R"doc(
Get the value of the tensor specified by its handle.

handle: The handle for a tensor stored in the session state.
value: The tensor for the given handle.
dtype: The type of the output value.
)doc");

// Registers DeleteSessionTensor: removes the tensor identified by
// `handle` from the session state.
REGISTER_OP("DeleteSessionTensor")
    .Input("handle: string")
    .Doc(R"doc(
Delete the tensor specified by its handle in the session.

handle: The handle for a tensor stored in the session state.
)doc");
} // namespace tensorflow

View File

@ -89,7 +89,7 @@ The generated
[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
has one summary value containing a histogram for `values`.
This op reports an `OutOfRange` error if any value is not finite.
This op reports an `InvalidArgument` error if any value is not finite.
tag: Scalar. Tag to use for the `Summary.Value`.
values: Any shape. Values to use to build the histogram.

View File

@ -1390,6 +1390,44 @@ op {
summary: "Calculates the determinants for a batch of square matrices."
description: "The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices. The output is a 1-D tensor containing the determinants\nfor all input submatrices `[..., :, :]`."
}
op {
name: "BatchMatrixDiag"
input_arg {
name: "diagonal"
description: "Rank `k`, where `k >= 1`."
type_attr: "T"
}
output_arg {
name: "output"
description: "Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`."
type_attr: "T"
}
attr {
name: "T"
type: "type"
}
summary: "Returns a batched diagonal tensor with a given batched diagonal values."
description: "Given a `diagonal`, this operation returns a tensor with the `diagonal` and\neverything else padded with zeros. The diagonal is computed as follows:\n\nAssume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a\ntensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:\n\n`output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.\n\nFor example:\n\n```prettyprint\n# \'diagonal\' is [[1, 2, 3, 4], [5, 6, 7, 8]]\n\nand diagonal.shape = (2, 4)\n\ntf.batch_matrix_diag(diagonal) ==> [[[1, 0, 0, 0]\n [0, 2, 0, 0]\n [0, 0, 3, 0]\n [0, 0, 0, 4]],\n [[5, 0, 0, 0]\n [0, 6, 0, 0]\n [0, 0, 7, 0]\n [0, 0, 0, 8]]]\n\nwhich has shape (2, 4, 4)\n```"
}
op {
name: "BatchMatrixDiagPart"
input_arg {
name: "input"
description: "Rank `k` tensor where `k >= 2` and the last two dimensions are equal."
type_attr: "T"
}
output_arg {
name: "diagonal"
description: "The extracted diagonal(s) having shape\n`diagonal.shape = input.shape[:-1]`."
type_attr: "T"
}
attr {
name: "T"
type: "type"
}
summary: "Returns the batched diagonal part of a batched tensor."
description: "This operation returns a tensor with the `diagonal` part\nof the batched `input`. The `diagonal` part is computed as follows:\n\nAssume `input` has `k` dimensions `[I, J, K, ..., N, N]`, then the output is a\ntensor of rank `k - 1` with dimensions `[I, J, K, ..., N]` where:\n\n`diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`.\n\nThe input must be at least a matrix.\n\nFor example:\n\n```prettyprint\n# \'input\' is [[[1, 0, 0, 0]\n [0, 2, 0, 0]\n [0, 0, 3, 0]\n [0, 0, 0, 4]],\n [[5, 0, 0, 0]\n [0, 6, 0, 0]\n [0, 0, 7, 0]\n [0, 0, 0, 8]]]\n\nand input.shape = (2, 4, 4)\n\ntf.batch_matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]]\n\nwhich has shape (2, 4)\n```"
}
op {
name: "BatchMatrixInverse"
input_arg {
@ -1432,6 +1470,14 @@ op {
description: "Shape is `[..., M, K]`."
type_attr: "T"
}
attr {
name: "adjoint"
type: "bool"
default_value {
b: false
}
description: "Boolean indicating whether to solve with `matrix` or its (block-wise)\nadjoint."
}
attr {
name: "T"
type: "type"
@ -1443,7 +1489,7 @@ op {
}
}
summary: "Solves systems of linear equations. Checks for invertibility."
description: "Matrix is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices. Rhs is a tensor of shape\n`[..., M, K]`. The output is a tensor shape `[..., M, K]` where each output\nmatrix satisfies matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]."
description: "Matrix is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices. Rhs is a tensor of shape\n`[..., M, K]`. The output is a tensor shape `[..., M, K]`. If `adjoint` is `False` then each output\nmatrix satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.\nIf `adjoint` is `True` then each output\nmatrix satisfies `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`."
}
op {
name: "BatchMatrixSolveLs"
@ -1509,7 +1555,15 @@ op {
default_value {
b: true
}
description: "Boolean indicating whether matrix is lower or upper triangular."
description: "Boolean indicating whether the innermost matrices in `matrix` are\nlower or upper triangular."
}
attr {
name: "adjoint"
type: "bool"
default_value {
b: false
}
description: "Boolean indicating whether to solve with `matrix` or its (block-wise)\nadjoint."
}
attr {
name: "T"
@ -1522,7 +1576,7 @@ op {
}
}
summary: "Solves systems of linear equations with upper or lower triangular matrices by"
description: "backsubstitution.\n\n`matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form\nsquare matrices. If `lower` is `True` then the strictly upper triangular part\nof each inner-most matrix is ignored. If `lower` is False then the strictly\nlower triangular part of each inner-most matrix is ignored. `rhs` is a tensor\nof shape [..., M, K]`.\n\nThe output is a tensor of shape `[..., M, K]`. If `lower` is `True` then the\noutput satisfies\n\\\\(\\sum_{k=0}^{i}\\\\) matrix[..., i, k] * output[..., k, j] = rhs[..., i, j].\nIf `lower` is false then the strictly then the output satisfies\n\\\\(sum_{k=i}^{K-1}\\\\) matrix[..., i, k] * output[..., k, j] = rhs[..., i, j]."
description: "backsubstitution.\n\n`matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form\nsquare matrices. If `lower` is `True` then the strictly upper triangular part\nof each inner-most matrix is assumed to be zero and not accessed.\nIf `lower` is False then the strictly lower triangular part of each inner-most\nmatrix is assumed to be zero and not accessed.\n`rhs` is a tensor of shape [..., M, K]`.\n\nThe output is a tensor of shape `[..., M, K]`. If `adjoint` is `True` then the\ninnermost matrices in output` satisfy matrix equations\n`matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.\nIf `adjoint` is `False` then the strictly then the innermost matrices in\n`output` satisfy matrix equations\n`adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`."
}
op {
name: "BatchNormWithGlobalNormalization"
@ -2835,6 +2889,15 @@ op {
}
summary: "Reinterpret the bytes of a string as a vector of numbers."
}
op {
name: "DeleteSessionTensor"
input_arg {
name: "handle"
description: "The handle for a tensor stored in the session state."
type: DT_STRING
}
summary: "Delete the tensor specified by its handle in the session."
}
op {
name: "DepthToSpace"
input_arg {
@ -4100,6 +4163,43 @@ op {
summary: "Gather values from `params` according to `indices`."
description: "`indices` must be integer tensor, containing indices into `params`.\nIt must be shape `[d_0, ..., d_N, R]` where `R` is the rank of `params`.\nThe innermost dimension of `indices` (with length `R`) corresponds to the\nindices of `params`.\n\nProduces an output tensor with shape `[d_0, ..., d_{n-1}]` where:\n\n output[i, j, k, ...] = params[indices[i, j, k, ..., :]]\n\ne.g. for `indices` a matrix:\n\n output[i] = params[indices[i, :]]"
}
op {
name: "GetSessionHandle"
input_arg {
name: "value"
description: "The tensor to be stored."
type_attr: "T"
}
output_arg {
name: "handle"
description: "The handle for the tensor stored in the session state."
type: DT_STRING
}
attr {
name: "T"
type: "type"
}
summary: "Store the input tensor in the state of the current session."
}
op {
name: "GetSessionTensor"
input_arg {
name: "handle"
description: "The handle for a tensor stored in the session state."
type: DT_STRING
}
output_arg {
name: "value"
description: "The tensor for the given handle."
type_attr: "dtype"
}
attr {
name: "dtype"
type: "type"
description: "The type of the output value."
}
summary: "Get the value of the tensor specified by its handle."
}
op {
name: "Greater"
input_arg {
@ -4257,7 +4357,7 @@ op {
}
}
summary: "Outputs a `Summary` protocol buffer with a histogram."
description: "The generated\n[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)\nhas one summary value containing a histogram for `values`.\n\nThis op reports an `OutOfRange` error if any value is not finite."
description: "The generated\n[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)\nhas one summary value containing a histogram for `values`.\n\nThis op reports an `InvalidArgument` error if any value is not finite."
}
op {
name: "IFFT"
@ -5403,9 +5503,17 @@ op {
}
output_arg {
name: "output"
description: "Shape is `[M, K]` containing the tensor that solves\nmatrix * output = rhs."
description: "Shape is `[M, K]`. If `adjoint` is `False` then `output` that solves\n`matrix` * `output` = `rhs`. If `adjoint` is `True` then `output` that solves\n`adjoint(matrix)` * `output` = `rhs`."
type_attr: "T"
}
attr {
name: "adjoint"
type: "bool"
default_value {
b: false
}
description: "Boolean indicating whether to solve with `matrix` or its adjoint."
}
attr {
name: "T"
type: "type"
@ -5482,7 +5590,15 @@ op {
default_value {
b: true
}
description: "Boolean indicating whether matrix is lower or upper triangular."
description: "Boolean indicating whether `matrix` is lower or upper triangular"
}
attr {
name: "adjoint"
type: "bool"
default_value {
b: false
}
description: "Boolean indicating whether to solve with `matrix` or its adjoint."
}
attr {
name: "T"
@ -5495,7 +5611,7 @@ op {
}
}
summary: "Solves a system of linear equations with an upper or lower triangular matrix by"
description: "backsubstitution.\n\n`matrix` is a matrix of shape `[M, M]`. If `lower` is `True` then the strictly\nupper triangular part of `matrix` is ignored. If `lower` is False then the\nstrictly lower triangular part of `matrix` is ignored. `rhs` is a matrix of\nshape [M, K]`.\n\nThe output is a matrix of shape `[M, K]`. If `lower` is `True` then the output\nsatisfies \\\\(\\sum_{k=0}^{i}\\\\) matrix[i, k] * output[k, j] = rhs[i, j].\nIf `lower` is false then output satisfies\n\\\\(\\sum_{k=i}^{K-1}\\\\) matrix[i, k] * output[k, j] = rhs[i, j]."
description: "backsubstitution.\n\n`matrix` is a matrix of shape `[M, M]`. If `lower` is `True` then the strictly\nupper triangular part of `matrix` is assumed to be zero and not accessed.\nIf `lower` is False then the strictly lower triangular part of `matrix` is\nassumed to be zero and not accessed.\n`rhs` is a matrix of shape [M, K]`.\n\nThe output is a matrix of shape `[M, K]`. If `adjoint` is `False` the output\nsatisfies the matrix equation `matrix` * `output` = `rhs`.\nIf `adjoint` is `False` then `output` satisfies the matrix equation\n`matrix` * `output` = `rhs`.\nIf `adjoint` is `True` then `output` satisfies the matrix equation\n`adjoint(matrix)` * `output` = `rhs`."
}
op {
name: "Max"
@ -7568,6 +7684,42 @@ op {
summary: "Returns the real part of a complex number."
description: "Given a tensor `in` of complex numbers, this operation returns a tensor of type\n`float` that is the real part of each element in `in`. All elements in `in`\nmust be complex numbers of the form \\\\(a + bj\\\\), where *a* is the real part\nreturned by this operation and *b* is the imaginary part.\n\nFor example:\n\n```\n# tensor \'in\' is [-2.25 + 4.75j, 3.25 + 5.75j]\ntf.real(in) ==> [-2.25, 3.25]\n```"
}
op {
name: "ReduceJoin"
input_arg {
name: "inputs"
description: "The input to be joined. All reduced indices must have non-zero size."
type: DT_STRING
}
input_arg {
name: "reduction_indices"
description: "The dimensions to reduce over. Dimensions are reduced in the\norder specified. If `reduction_indices` has higher rank than `1`, it is\nflattened. Omitting `reduction_indices` is equivalent to passing\n`[n-1, n-2, ..., 0]`. Negative indices from `-n` to `-1` are supported."
type: DT_INT32
}
output_arg {
name: "output"
description: "Has shape equal to that of the input with reduced dimensions removed or\nset to `1` depending on `keep_dims`."
type: DT_STRING
}
attr {
name: "keep_dims"
type: "bool"
default_value {
b: false
}
description: "If `True`, retain reduced dimensions with length `1`."
}
attr {
name: "separator"
type: "string"
default_value {
s: ""
}
description: "The separator to use when joining."
}
summary: "Joins a string Tensor across the given dimensions."
description: "Computes the string join across dimensions in the given string Tensor of shape\n`[d_0, d_1, ..., d_n-1]`. Returns a new Tensor created by joining the input\nstrings with the given separator (default: empty string). Negative indices are\ncounted backwards from the end, with `-1` being equivalent to `n - 1`. Passing\nan empty `reduction_indices` joins all strings in linear index order and outputs\na scalar string.\n\n\nFor example:\n```\n# tensor `a` is [[\"a\", \"b\"], [\"c\", \"d\"]]\ntf.reduce_join(a, 0) ==> [\"ac\", \"bd\"]\ntf.reduce_join(a, 1) ==> [\"ab\", \"cd\"]\ntf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> [\"ac\", \"bd\"]\ntf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> [\"ab\", \"cd\"]\ntf.reduce_join(a, 0, keep_dims=True) ==> [[\"ac\", \"bd\"]]\ntf.reduce_join(a, 1, keep_dims=True) ==> [[\"ab\"], [\"cd\"]]\ntf.reduce_join(a, 0, separator=\".\") ==> [\"a.c\", \"b.d\"]\ntf.reduce_join(a, [0, 1]) ==> [\"acbd\"]\ntf.reduce_join(a, [1, 0]) ==> [\"abcd\"]\ntf.reduce_join(a, []) ==> [\"abcd\"]\n```"
}
op {
name: "RefEnter"
input_arg {
@ -11107,6 +11259,14 @@ op {
}
description: "A boolean that determines whether writes to the TensorArray\nare allowed to grow the size. By default, this is not allowed."
}
attr {
name: "clear_after_read"
type: "bool"
default_value {
b: true
}
description: "If true (default), Tensors in the TensorArray are cleared\nafter being read. This disables multiple read semantics but allows early\nrelease of memory."
}
attr {
name: "tensor_array_name"
type: "string"
@ -11150,7 +11310,7 @@ op {
}
output_arg {
name: "lengths"
description: "A vector of the row sizes of the original T elements in the\nvalue output. In the example above, this would be the values:\n(n1, n2, ..., n(T-1))"
description: "A vector of the row sizes of the original T elements in the\nvalue output. In the example above, this would be the values:\n`(n1, n2, ..., n(T-1))`."
type: DT_INT64
}
attr {
@ -11158,8 +11318,8 @@ op {
type: "type"
description: "The type of the elem that is returned."
}
summary: "Concat the elements from the TensorArray."
description: "Takes T elements of shapes (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...),\n ..., (n(T-1) x d0 x d1 x ...)\nand concatenates them into a Tensor of shape:\n (n0 + n1 + ... + n(T-1) x d0 x d1 x ...).\n\nAll elements must have the same shape (excepting the first dimension)."
summary: "Concat the elements from the TensorArray into value `value`."
description: "Takes `T` elements of shapes\n\n ```\n (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...)\n ```\n\nand concatenates them into a Tensor of shape:\n\n ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```\n\nAll elements must have the same shape (excepting the first dimension)."
}
op {
name: "TensorArrayGrad"
@ -11208,7 +11368,7 @@ op {
type: "type"
description: "The type of the elem that is returned."
}
summary: "Pack the elements from the TensorArray."
summary: "Pack the elements from the TensorArray into output `value`."
description: "All elements must have the same shape."
}
op {
@ -11238,7 +11398,7 @@ op {
type: "type"
description: "The type of the elem that is returned."
}
summary: "Read an element from the TensorArray."
summary: "Read an element from the TensorArray into output `value`."
}
op {
name: "TensorArraySize"
@ -11293,7 +11453,7 @@ op {
type: "type"
}
summary: "Split the data from the input value into TensorArray elements."
description: "Assuming that `lengths` takes on values\n (n0, n1, ..., n(T-1))\nand that `value` has shape\n (n0 + n1 + ... + n(T-1) x d0 x d1 x ...),\nthis splits values into a TensorArray with T tensors.\n\nTensorArray index t will be the subtensor of values with starting position\n (n0 + n1 + ... + n(t-1), 0, 0, ...)\nand having size\n nt x d0 x d1 x ..."
description: "Assuming that `lengths` takes on values\n\n ```(n0, n1, ..., n(T-1))```\n\nand that `value` has shape\n\n ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```,\n\nthis splits values into a TensorArray with T tensors.\n\nTensorArray index t will be the subtensor of values with starting position\n\n ```(n0 + n1 + ... + n(t-1), 0, 0, ...)```\n\nand having size\n\n ```nt x d0 x d1 x ...```"
}
op {
name: "TensorArrayUnpack"

View File

@ -33,4 +33,48 @@ num_buckets: The number of buckets.
output: A Tensor of the same shape as the input `string_tensor`.
)doc");
REGISTER_OP("ReduceJoin")
.Input("inputs: string")
.Input("reduction_indices: int32")
.Attr("keep_dims: bool = false")
.Attr("separator: string = ''")
.Output("output: string")
.Doc(R"doc(
Joins a string Tensor across the given dimensions.
Computes the string join across dimensions in the given string Tensor of shape
`[d_0, d_1, ..., d_n-1]`. Returns a new Tensor created by joining the input
strings with the given separator (default: empty string). Negative indices are
counted backwards from the end, with `-1` being equivalent to `n - 1`. Passing
an empty `reduction_indices` joins all strings in linear index order and outputs
a scalar string.
For example:
```
# tensor `a` is [["a", "b"], ["c", "d"]]
tf.reduce_join(a, 0) ==> ["ac", "bd"]
tf.reduce_join(a, 1) ==> ["ab", "cd"]
tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
tf.reduce_join(a, [0, 1]) ==> ["acbd"]
tf.reduce_join(a, [1, 0]) ==> ["abcd"]
tf.reduce_join(a, []) ==> ["abcd"]
```
inputs: The input to be joined. All reduced indices must have non-zero size.
reduction_indices: The dimensions to reduce over. Dimensions are reduced in the
order specified. If `reduction_indices` has higher rank than `1`, it is
flattened. Omitting `reduction_indices` is equivalent to passing
`[n-1, n-2, ..., 0]`. Negative indices from `-n` to `-1` are supported.
keep_dims: If `True`, retain reduced dimensions with length `1`.
separator: The separator to use when joining.
output: Has shape equal to that of the input with reduced dimensions removed or
set to `1` depending on `keep_dims`.
)doc");
} // namespace tensorflow

View File

@ -157,11 +157,6 @@ limitations under the License.
// annotations will be ignored by the analysis.
#define TS_UNCHECKED(x) ""
// Disables warnings for a single read operation. This can be used to do racy
// reads of guarded data members, in cases where the race is benign.
#define TS_UNCHECKED_READ(x) \
::tensorflow::thread_safety_analysis::ts_unchecked_read(x)
namespace tensorflow {
namespace thread_safety_analysis {

View File

@ -29,30 +29,32 @@ class FileSystemRegistryImpl : public FileSystemRegistry {
private:
mutable mutex mu_;
mutable std::unordered_map<string, FileSystem*> registry_ GUARDED_BY(mu_);
mutable std::unordered_map<string, std::unique_ptr<FileSystem>> registry_
GUARDED_BY(mu_);
};
void FileSystemRegistryImpl::Register(const string& scheme,
FileSystemRegistry::Factory factory) {
mutex_lock lock(mu_);
QCHECK(!gtl::FindOrNull(registry_, scheme)) << "File factory for " << scheme
<< " already registered";
registry_[scheme] = factory();
QCHECK(
registry_.emplace(string(scheme), std::unique_ptr<FileSystem>(factory()))
.second)
<< "File factory for " << scheme << " already registered";
}
FileSystem* FileSystemRegistryImpl::Lookup(const string& scheme) {
mutex_lock lock(mu_);
auto fs_ptr = gtl::FindOrNull(registry_, scheme);
if (!fs_ptr) {
const auto found = registry_.find(scheme);
if (found == registry_.end()) {
return nullptr;
}
return *fs_ptr;
return found->second.get();
}
Status FileSystemRegistryImpl::GetRegisteredFileSystemSchemes(
std::vector<string>* schemes) {
mutex_lock lock(mu_);
for (auto const e : registry_) {
for (const auto& e : registry_) {
schemes->push_back(e.first);
}
return Status::OK();
@ -60,8 +62,6 @@ Status FileSystemRegistryImpl::GetRegisteredFileSystemSchemes(
Env::Env() : file_system_registry_(new FileSystemRegistryImpl) {}
Env::~Env() { delete file_system_registry_; }
Status Env::GetFileSystemForFile(const string& fname, FileSystem** result) {
string scheme = GetSchemeFromURI(fname);
FileSystem* file_system = file_system_registry_->Lookup(scheme);

View File

@ -17,6 +17,7 @@ limitations under the License.
#define TENSORFLOW_CORE_PLATFORM_ENV_H_
#include <stdint.h>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
@ -45,7 +46,7 @@ struct ThreadOptions;
class Env {
public:
Env();
virtual ~Env();
virtual ~Env() = default;
/// \brief Returns a default environment suitable for the current operating
/// system.
@ -59,6 +60,8 @@ class Env {
/// \brief Returns the FileSystem object to handle operations on the file
/// specified by 'fname'. The FileSystem object is used as the implementation
/// for the file system related (non-virtual) functions that follow.
/// Returned FileSystem object is still owned by the Env object and will
/// (might) be destroyed when the environment is destroyed.
virtual Status GetFileSystemForFile(const string& fname, FileSystem** result);
/// \brief Returns the file system schemes registered for this Env.
@ -77,6 +80,10 @@ class Env {
/// status.
///
/// The returned file may be concurrently accessed by multiple threads.
///
/// The ownership of the returned RandomAccessFile is passed to the caller
/// and the object should be deleted when it is no longer used. The file object
/// shouldn't live longer than the Env object.
Status NewRandomAccessFile(const string& fname, RandomAccessFile** result);
/// \brief Creates an object that writes to a new file with the specified
@ -88,6 +95,10 @@ class Env {
/// returns non-OK.
///
/// The returned file will only be accessed by one thread at a time.
///
/// The ownership of the returned WritableFile is passed to the caller
/// and the object should be deleted when it is no longer used. The file object
/// shouldn't live longer than the Env object.
Status NewWritableFile(const string& fname, WritableFile** result);
/// \brief Creates an object that either appends to an existing file, or
@ -98,6 +109,10 @@ class Env {
/// non-OK.
///
/// The returned file will only be accessed by one thread at a time.
///
/// The ownership of the returned WritableFile is passed to the caller
/// and the object should be deleted when it is no longer used. The file object
/// shouldn't live longer than the Env object.
Status NewAppendableFile(const string& fname, WritableFile** result);
/// \brief Creates a readonly region of memory with the file context.
@ -107,6 +122,10 @@ class Env {
/// the caller. On failure stores nullptr in *result and returns non-OK.
///
/// The returned memory region can be accessed from many threads in parallel.
///
/// The ownership of the returned ReadOnlyMemoryRegion is passed to the caller
/// and the object should be deleted when it is no longer used. The memory region
/// object shouldn't live longer than the Env object.
Status NewReadOnlyMemoryRegionFromFile(const string& fname,
ReadOnlyMemoryRegion** result);
@ -192,7 +211,7 @@ class Env {
Env(const Env&);
void operator=(const Env&);
FileSystemRegistry* file_system_registry_;
std::unique_ptr<FileSystemRegistry> file_system_registry_;
};
/// \brief An implementation of Env that forwards all calls to another Env.

View File

@ -25,7 +25,6 @@ limitations under the License.
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/platform/types.h"

View File

@ -1,29 +1,25 @@
# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A very simple MNIST classifier, modified to display data in TensorBoard.
"""A simple MNIST classifier which displays summaries in TensorBoard.
See extensive documentation for the original model at
http://tensorflow.org/tutorials/mnist/beginners/index.md
See documentation on the TensorBoard specific pieces at
http://tensorflow.org/how_tos/summaries_and_tensorboard/index.md
If you modify this file, please update the excerpt in
how_tos/summaries_and_tensorboard/index.md.
This is an unimpressive MNIST model, but it is a good example of using
tf.name_scope to make a graph legible in the TensorBoard graph explorer, and of
naming summary tags so that they are grouped meaningfully in TensorBoard.
It demonstrates the functionality of every TensorBoard dashboard.
"""
from __future__ import absolute_import
from __future__ import division
@ -39,72 +35,132 @@ FLAGS = flags.FLAGS
flags.DEFINE_boolean('fake_data', False, 'If true, uses fake data '
'for unit testing.')
flags.DEFINE_integer('max_steps', 1000, 'Number of steps to run trainer.')
flags.DEFINE_float('learning_rate', 0.5, 'Initial learning rate.')
flags.DEFINE_float('learning_rate', 0.001, 'Initial learning rate.')
flags.DEFINE_float('dropout', 0.9, 'Keep probability for training dropout.')
flags.DEFINE_string('data_dir', '/tmp/data', 'Directory for storing data')
flags.DEFINE_string('summaries_dir', '/tmp/mnist_logs', 'Summaries directory')
def main(_):
def train():
# Import data
mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True,
fake_data=FLAGS.fake_data)
sess = tf.InteractiveSession()
# Create the model
x = tf.placeholder(tf.float32, [None, 784], name='x-input')
W = tf.Variable(tf.zeros([784, 10]), name='weights')
b = tf.Variable(tf.zeros([10]), name='bias')
# Create a multilayer model.
# Use a name scope to organize nodes in the graph visualizer
with tf.name_scope('Wx_b'):
y = tf.nn.softmax(tf.matmul(x, W) + b)
# Input placehoolders
with tf.name_scope('input'):
x = tf.placeholder(tf.float32, [None, 784], name='x-input')
image_shaped_input = tf.reshape(x, [-1, 28, 28, 1])
tf.image_summary('input', image_shaped_input, 10)
y_ = tf.placeholder(tf.float32, [None, 10], name='y-input')
keep_prob = tf.placeholder(tf.float32)
tf.scalar_summary('dropout_keep_probability', keep_prob)
# Add summary ops to collect data
tf.histogram_summary('weights', W)
tf.histogram_summary('biases', b)
tf.histogram_summary('y', y)
# We can't initialize these variables to 0 - the network will get stuck.
def weight_variable(shape):
"""Create a weight variable with appropriate initialization."""
initial = tf.truncated_normal(shape, stddev=0.1)
return tf.Variable(initial)
# Define loss and optimizer
y_ = tf.placeholder(tf.float32, [None, 10], name='y-input')
# More name scopes will clean up the graph representation
with tf.name_scope('xent'):
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
def bias_variable(shape):
"""Create a bias variable with appropriate initialization."""
initial = tf.constant(0.1, shape=shape)
return tf.Variable(initial)
def variable_summaries(var, name):
"""Attach a lot of summaries to a Tensor."""
with tf.name_scope('summaries'):
mean = tf.reduce_mean(var)
tf.scalar_summary('mean/' + name, mean)
with tf.name_scope('stddev'):
stddev = tf.sqrt(tf.reduce_sum(tf.square(var - mean)))
tf.scalar_summary('sttdev/' + name, stddev)
tf.scalar_summary('max/' + name, tf.reduce_max(var))
tf.scalar_summary('min/' + name, tf.reduce_min(var))
tf.histogram_summary(name, var)
def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu):
"""Reusable code for making a simple neural net layer.
It does a matrix multiply, bias add, and then uses relu to nonlinearize.
It also sets up name scoping so that the resultant graph is easy to read, and
adds a number of summary ops.
"""
# Adding a name scope ensures logical grouping of the layers in the graph.
with tf.name_scope(layer_name):
# This Variable will hold the state of the weights for the layer
with tf.name_scope('weights'):
weights = weight_variable([input_dim, output_dim])
variable_summaries(weights, layer_name + '/weights')
with tf.name_scope('biases'):
biases = bias_variable([output_dim])
variable_summaries(biases, layer_name + '/biases')
with tf.name_scope('Wx_plus_b'):
preactivate = tf.matmul(input_tensor, weights) + biases
tf.histogram_summary(layer_name + '/pre_activations', preactivate)
activations = act(preactivate, 'activation')
tf.histogram_summary(layer_name + '/activations', activations)
return activations
hidden1 = nn_layer(x, 784, 500, 'layer1')
dropped = tf.nn.dropout(hidden1, keep_prob)
y = nn_layer(dropped, 500, 10, 'layer2', act=tf.nn.softmax)
with tf.name_scope('cross_entropy'):
diff = y_ * tf.log(y)
with tf.name_scope('total'):
cross_entropy = -tf.reduce_mean(diff)
tf.scalar_summary('cross entropy', cross_entropy)
with tf.name_scope('train'):
train_step = tf.train.GradientDescentOptimizer(
train_step = tf.train.AdamOptimizer(
FLAGS.learning_rate).minimize(cross_entropy)
with tf.name_scope('test'):
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
with tf.name_scope('accuracy'):
with tf.name_scope('correct_prediction'):
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
with tf.name_scope('accuracy'):
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
tf.scalar_summary('accuracy', accuracy)
# Merge all the summaries and write them out to /tmp/mnist_logs (by default)
merged = tf.merge_all_summaries()
writer = tf.train.SummaryWriter(FLAGS.summaries_dir, sess.graph)
train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/train', sess.graph)
test_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/test')
tf.initialize_all_variables().run()
# Train the model, and feed in test data and record summaries every 10 steps
# Train the model, and also write summaries.
# Every 10th step, measure test-set accuracy, and write test summaries
# All other steps, run train_step on training data, & add training summaries
def feed_dict(train):
"""Make a TensorFlow feed_dict: maps data onto Tensor placeholders."""
if train or FLAGS.fake_data:
xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data)
k = FLAGS.dropout
else:
xs, ys = mnist.test.images, mnist.test.labels
k = 1.0
return {x: xs, y_: ys, keep_prob: k}
for i in range(FLAGS.max_steps):
if i % 10 == 0: # Record summary data and the accuracy
if FLAGS.fake_data:
batch_xs, batch_ys = mnist.train.next_batch(
100, fake_data=FLAGS.fake_data)
feed = {x: batch_xs, y_: batch_ys}
else:
feed = {x: mnist.test.images, y_: mnist.test.labels}
result = sess.run([merged, accuracy], feed_dict=feed)
summary_str = result[0]
acc = result[1]
writer.add_summary(summary_str, i)
if i % 10 == 0: # Record summaries and test-set accuracy
summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False))
test_writer.add_summary(summary, i)
print('Accuracy at step %s: %s' % (i, acc))
else:
batch_xs, batch_ys = mnist.train.next_batch(
100, fake_data=FLAGS.fake_data)
feed = {x: batch_xs, y_: batch_ys}
sess.run(train_step, feed_dict=feed)
else: # Record train set summaries, and train
summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True))
train_writer.add_summary(summary, i)
def main(_):
if tf.gfile.Exists(FLAGS.summaries_dir):
tf.gfile.DeleteRecursively(FLAGS.summaries_dir)
tf.gfile.MakeDirs(FLAGS.summaries_dir)
train()
if __name__ == '__main__':
tf.app.run()

View File

@ -117,6 +117,9 @@ method. A graph element can be one of the following types:
the *i*th return value will be a
[`SparseTensorValue`](../../api_docs/python/sparse_ops.md#SparseTensorValue)
containing the value of that sparse tensor.
* If the *i*th element of `fetches` is produced by a `get_tensor_handle` op,
the *i*th return value will be a numpy ndarray containing the handle of
that tensor.
The optional `feed_dict` argument allows the caller to override
the value of tensors in the graph. Each key in `feed_dict` can be
@ -620,7 +623,7 @@ Creates an `AbortedError`.
### `class tf.errors.OutOfRangeError` {#OutOfRangeError}
Raised when an operation executed past the valid range.
Raised when an operation iterates past the valid input range.
This exception is raised in "end-of-file" conditions, such as when a
[`queue.dequeue()`](../../api_docs/python/io_ops.md#QueueBase.dequeue)

View File

@ -175,7 +175,7 @@ the same non-zero number and type of outputs.
y = tf.constant(5)
def f1(): return tf.mul(x, 17)
def f2(): return tf.add(y, 23)
r = cond(math_ops.less(x, y), f1, f2)
r = cond(tf.less(x, y), f1, f2)
# r is set to f1().
# Operations in f2 (e.g., tf.add) are not executed.
```
@ -259,6 +259,55 @@ Example 2:
callable.
- - -
### `tf.while_loop(cond, body, loop_vars, parallel_iterations=10, back_prop=True, swap_memory=False, name=None)` {#while_loop}
Repeat `body` while the condition `cond` is true.
`cond` is a callable taking a list of tensors and returning a boolean scalar
tensor. `body` is a callable taking a list of tensors and returning a list of
tensors of the same length and with the same types as the input. `loop_vars`
is a list of tensors that is passed to both `cond` and `body`.
In addition to regular Tensors or IndexedSlices, the body may accept and
return TensorArray objects. The flows of the TensorArray objects will
be appropriately forwarded between loops and during gradient calculations.
While `cond` evaluates to true, `body` is executed.
##### Args:
* <b>`cond`</b>: The termination condition of the loop.
* <b>`body`</b>: A callable that represents the loop body.
* <b>`loop_vars`</b>: The list of variable input tensors.
* <b>`parallel_iterations`</b>: The number of iterations allowed to run in parallel.
* <b>`back_prop`</b>: Whether backprop is enabled for this while loop.
* <b>`swap_memory`</b>: Whether GPU-CPU memory swap is enabled for this loop.
* <b>`name`</b>: Optional name prefix for the returned tensors.
##### Returns:
The output tensors for the loop variables after the loop.
##### Raises:
* <b>`TypeError`</b>: if `cond` or `body` is not callable.
* <b>`ValueError`</b>: if `loop_var` is empty.
* <b>`Example`</b>:
```python
i = tf.constant(0)
c = lambda i: tf.less(i, 10)
b = lambda i: tf.add(i, 1)
r = tf.while_loop(c, b, [i])
```
## Logical Operators

View File

@ -32,6 +32,7 @@ equal width and determined by the arguments `value_range` and `nbins`.
* <b>`Examples`</b>:
```python
# Bins will be: (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
nbins = 5

View File

@ -141,6 +141,8 @@
* [`batch_ifft3d`](../../api_docs/python/math_ops.md#batch_ifft3d)
* [`batch_matmul`](../../api_docs/python/math_ops.md#batch_matmul)
* [`batch_matrix_determinant`](../../api_docs/python/math_ops.md#batch_matrix_determinant)
* [`batch_matrix_diag`](../../api_docs/python/math_ops.md#batch_matrix_diag)
* [`batch_matrix_diag_part`](../../api_docs/python/math_ops.md#batch_matrix_diag_part)
* [`batch_matrix_inverse`](../../api_docs/python/math_ops.md#batch_matrix_inverse)
* [`batch_matrix_solve`](../../api_docs/python/math_ops.md#batch_matrix_solve)
* [`batch_matrix_solve_ls`](../../api_docs/python/math_ops.md#batch_matrix_solve_ls)
@ -224,6 +226,10 @@
* [`unsorted_segment_sum`](../../api_docs/python/math_ops.md#unsorted_segment_sum)
* [`where`](../../api_docs/python/math_ops.md#where)
* **[Strings](../../api_docs/python/string_ops.md)**:
* [`reduce_join`](../../api_docs/python/string_ops.md#reduce_join)
* [`string_to_hash_bucket`](../../api_docs/python/string_ops.md#string_to_hash_bucket)
* **[Histograms](../../api_docs/python/histogram_ops.md)**:
* [`histogram_fixed_width`](../../api_docs/python/histogram_ops.md#histogram_fixed_width)
@ -255,6 +261,7 @@
* [`tuple`](../../api_docs/python/control_flow_ops.md#tuple)
* [`verify_tensor_all_finite`](../../api_docs/python/control_flow_ops.md#verify_tensor_all_finite)
* [`where`](../../api_docs/python/control_flow_ops.md#where)
* [`while_loop`](../../api_docs/python/control_flow_ops.md#while_loop)
* **[Higher Order Functions](../../api_docs/python/functional_ops.md)**:
* [`foldl`](../../api_docs/python/functional_ops.md#foldl)
@ -262,6 +269,11 @@
* [`map_fn`](../../api_docs/python/functional_ops.md#map_fn)
* [`scan`](../../api_docs/python/functional_ops.md#scan)
* **[Tensor Handle Operations](../../api_docs/python/session_ops.md)**:
* [`delete_session_tensor`](../../api_docs/python/session_ops.md#delete_session_tensor)
* [`get_session_handle`](../../api_docs/python/session_ops.md#get_session_handle)
* [`get_session_tensor`](../../api_docs/python/session_ops.md#get_session_tensor)
* **[Images](../../api_docs/python/image.md)**:
* [`adjust_brightness`](../../api_docs/python/image.md#adjust_brightness)
* [`adjust_contrast`](../../api_docs/python/image.md#adjust_contrast)

View File

@ -741,6 +741,101 @@ Gamma function.
TensorFlow provides several operations that you can use to add basic
mathematical functions for matrices to your graph.
- - -
### `tf.batch_matrix_diag(diagonal, name=None)` {#batch_matrix_diag}
Returns a batched diagonal tensor with a given batched diagonal values.
Given a `diagonal`, this operation returns a tensor with the `diagonal` and
everything else padded with zeros. The diagonal is computed as follows:
Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a
tensor of rank `k+1` with dimensions `[I, J, K, ..., N, N]` where:
`output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.
For example:
```prettyprint
# 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]]
and diagonal.shape = (2, 4)
tf.batch_matrix_diag(diagonal) ==> [[[1, 0, 0, 0]
[0, 2, 0, 0]
[0, 0, 3, 0]
[0, 0, 0, 4]],
[[5, 0, 0, 0]
[0, 6, 0, 0]
[0, 0, 7, 0]
[0, 0, 0, 8]]]
which has shape (2, 4, 4)
```
##### Args:
* <b>`diagonal`</b>: A `Tensor`. Rank `k`, where `k >= 1`.
* <b>`name`</b>: A name for the operation (optional).
##### Returns:
A `Tensor`. Has the same type as `diagonal`.
Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`.
- - -
### `tf.batch_matrix_diag_part(input, name=None)` {#batch_matrix_diag_part}
Returns the batched diagonal part of a batched tensor.
This operation returns a tensor with the `diagonal` part
of the batched `input`. The `diagonal` part is computed as follows:
Assume `input` has `k` dimensions `[I, J, K, ..., N, N]`, then the output is a
tensor of rank `k - 1` with dimensions `[I, J, K, ..., N]` where:
`diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`.
The input must be at least a matrix.
For example:
```prettyprint
# 'input' is [[[1, 0, 0, 0]
[0, 2, 0, 0]
[0, 0, 3, 0]
[0, 0, 0, 4]],
[[5, 0, 0, 0]
[0, 6, 0, 0]
[0, 0, 7, 0]
[0, 0, 0, 8]]]
and input.shape = (2, 4, 4)
tf.batch_matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]]
which has shape (2, 4)
```
##### Args:
* <b>`input`</b>: A `Tensor`.
Rank `k` tensor where `k >= 2` and the last two dimensions are equal.
* <b>`name`</b>: A name for the operation (optional).
##### Returns:
A `Tensor`. Has the same type as `input`.
The extracted diagonal(s) having shape
`diagonal.shape = input.shape[:-1]`.
- - -
### `tf.diag(diagonal, name=None)` {#diag}
@ -1192,7 +1287,7 @@ eigenvalues, and subsequent [...,1:, :] containing the eigenvectors.
- - -
### `tf.matrix_solve(matrix, rhs, name=None)` {#matrix_solve}
### `tf.matrix_solve(matrix, rhs, adjoint=None, name=None)` {#matrix_solve}
Solves a system of linear equations. Checks for invertibility.
@ -1202,25 +1297,30 @@ Solves a system of linear equations. Checks for invertibility.
* <b>`matrix`</b>: A `Tensor`. Must be one of the following types: `float32`, `float64`.
Shape is `[M, M]`.
* <b>`rhs`</b>: A `Tensor`. Must have the same type as `matrix`. Shape is `[M, K]`.
* <b>`adjoint`</b>: An optional `bool`. Defaults to `False`.
Boolean indicating whether to solve with `matrix` or its adjoint.
* <b>`name`</b>: A name for the operation (optional).
##### Returns:
A `Tensor`. Has the same type as `matrix`.
Shape is `[M, K]` containing the tensor that solves
matrix * output = rhs.
Shape is `[M, K]`. If `adjoint` is `False` then `output` is the tensor that solves
`matrix` * `output` = `rhs`. If `adjoint` is `True` then `output` is the tensor that solves
`adjoint(matrix)` * `output` = `rhs`.
- - -
### `tf.batch_matrix_solve(matrix, rhs, name=None)` {#batch_matrix_solve}
### `tf.batch_matrix_solve(matrix, rhs, adjoint=None, name=None)` {#batch_matrix_solve}
Solves systems of linear equations. Checks for invertibility.
Matrix is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
form square matrices. Rhs is a tensor of shape
`[..., M, K]`. The output is a tensor shape `[..., M, K]` where each output
matrix satisfies matrix[..., :, :] * output[..., :, :] = rhs[..., :, :].
`[..., M, K]`. The output is a tensor shape `[..., M, K]`. If `adjoint` is `False` then each output
matrix satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
If `adjoint` is `True` then each output
matrix satisfies `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
##### Args:
@ -1229,6 +1329,9 @@ matrix satisfies matrix[..., :, :] * output[..., :, :] = rhs[..., :, :].
Shape is `[..., M, M]`.
* <b>`rhs`</b>: A `Tensor`. Must have the same type as `matrix`.
Shape is `[..., M, K]`.
* <b>`adjoint`</b>: An optional `bool`. Defaults to `False`.
Boolean indicating whether to solve with `matrix` or its (block-wise)
adjoint.
* <b>`name`</b>: A name for the operation (optional).
##### Returns:
@ -1239,21 +1342,24 @@ matrix satisfies matrix[..., :, :] * output[..., :, :] = rhs[..., :, :].
- - -
### `tf.matrix_triangular_solve(matrix, rhs, lower=None, name=None)` {#matrix_triangular_solve}
### `tf.matrix_triangular_solve(matrix, rhs, lower=None, adjoint=None, name=None)` {#matrix_triangular_solve}
Solves a system of linear equations with an upper or lower triangular matrix by
backsubstitution.
`matrix` is a matrix of shape `[M, M]`. If `lower` is `True` then the strictly
upper triangular part of `matrix` is ignored. If `lower` is False then the
strictly lower triangular part of `matrix` is ignored. `rhs` is a matrix of
shape [M, K]`.
upper triangular part of `matrix` is assumed to be zero and not accessed.
If `lower` is `False` then the strictly lower triangular part of `matrix` is
assumed to be zero and not accessed.
`rhs` is a matrix of shape `[M, K]`.
The output is a matrix of shape `[M, K]`. If `lower` is `True` then the output
satisfies \\(\sum_{k=0}^{i}\\) matrix[i, k] * output[k, j] = rhs[i, j].
If `lower` is false then output satisfies
\\(\sum_{k=i}^{K-1}\\) matrix[i, k] * output[k, j] = rhs[i, j].
The output is a matrix of shape `[M, K]`. If `adjoint` is `False` the output
satisfies the matrix equation `matrix` * `output` = `rhs`.
If `adjoint` is `False` then `output` satisfies the matrix equation
`matrix` * `output` = `rhs`.
If `adjoint` is `True` then `output` satisfies the matrix equation
`adjoint(matrix)` * `output` = `rhs`.
##### Args:
@ -1262,7 +1368,9 @@ If `lower` is false then output satisfies
Shape is `[M, M]`.
* <b>`rhs`</b>: A `Tensor`. Must have the same type as `matrix`. Shape is `[M, K]`.
* <b>`lower`</b>: An optional `bool`. Defaults to `True`.
Boolean indicating whether matrix is lower or upper triangular.
Boolean indicating whether `matrix` is lower or upper triangular
* <b>`adjoint`</b>: An optional `bool`. Defaults to `False`.
Boolean indicating whether to solve with `matrix` or its adjoint.
* <b>`name`</b>: A name for the operation (optional).
##### Returns:
@ -1272,7 +1380,7 @@ If `lower` is false then output satisfies
- - -
### `tf.batch_matrix_triangular_solve(matrix, rhs, lower=None, name=None)` {#batch_matrix_triangular_solve}
### `tf.batch_matrix_triangular_solve(matrix, rhs, lower=None, adjoint=None, name=None)` {#batch_matrix_triangular_solve}
Solves systems of linear equations with upper or lower triangular matrices by
@ -1280,15 +1388,17 @@ backsubstitution.
`matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
square matrices. If `lower` is `True` then the strictly upper triangular part
of each inner-most matrix is ignored. If `lower` is False then the strictly
lower triangular part of each inner-most matrix is ignored. `rhs` is a tensor
of shape [..., M, K]`.
of each inner-most matrix is assumed to be zero and not accessed.
If `lower` is `False` then the strictly lower triangular part of each inner-most
matrix is assumed to be zero and not accessed.
`rhs` is a tensor of shape `[..., M, K]`.
The output is a tensor of shape `[..., M, K]`. If `lower` is `True` then the
output satisfies
\\(\sum_{k=0}^{i}\\) matrix[..., i, k] * output[..., k, j] = rhs[..., i, j].
If `lower` is false then the strictly then the output satisfies
\\(sum_{k=i}^{K-1}\\) matrix[..., i, k] * output[..., k, j] = rhs[..., i, j].
The output is a tensor of shape `[..., M, K]`. If `adjoint` is `False` then the
innermost matrices in `output` satisfy matrix equations
`matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
If `adjoint` is `True` then the innermost matrices in
`output` satisfy matrix equations
`adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
##### Args:
@ -1298,7 +1408,11 @@ If `lower` is false then the strictly then the output satisfies
* <b>`rhs`</b>: A `Tensor`. Must have the same type as `matrix`.
Shape is `[..., M, K]`.
* <b>`lower`</b>: An optional `bool`. Defaults to `True`.
Boolean indicating whether matrix is lower or upper triangular.
Boolean indicating whether the innermost matrices in `matrix` are
lower or upper triangular.
* <b>`adjoint`</b>: An optional `bool`. Defaults to `False`.
Boolean indicating whether to solve with `matrix` or its (block-wise)
adjoint.
* <b>`name`</b>: A name for the operation (optional).
##### Returns:

View File

@ -0,0 +1,102 @@
<!-- This file is machine generated: DO NOT EDIT! -->
# Tensor Handle Operations
Note: Functions taking `Tensor` arguments can also take anything accepted by
[`tf.convert_to_tensor`](framework.md#convert_to_tensor).
[TOC]
## Tensor Handle Operations.
TensorFlow provides several operators that allow the user to keep tensors
"in-place" across run calls.
- - -
### `tf.get_session_handle(data, name=None)` {#get_session_handle}
Return the handle of `data`.
This is EXPERIMENTAL and subject to change.
Keep `data` "in-place" in the runtime and create a handle that can be
used to retrieve `data` in a subsequent run().
Combined with `get_session_tensor`, we can keep a tensor produced in
one run call in place, and use it as the input in a future run call.
Below is a simple example:
```python
c = tf.mul(a, b)
h = tf.get_session_handle(c)
h = sess.run(h)
p, a = tf.get_session_tensor(tf.float32)
b = tf.mul(a, 10)
c = sess.run(b, feed_dict={p: h.handle})
```
##### Args:
* <b>`data`</b>: A tensor to be stored in the session.
* <b>`name`</b>: Optional name prefix for the return tensor.
##### Returns:
A scalar string tensor representing a unique handle for `data`.
##### Raises:
* <b>`TypeError`</b>: if `data` is not a Tensor.
- - -
### `tf.get_session_tensor(dtype, name=None)` {#get_session_tensor}
Get the tensor of type `dtype` by feeding a tensor handle.
This is EXPERIMENTAL and subject to change.
Get the value of the tensor from a tensor handle. The tensor
is produced in a previous run() and stored in the state of the
session.
##### Args:
* <b>`dtype`</b>: The type of the output tensor.
* <b>`name`</b>: Optional name prefix for the return tensor.
##### Returns:
A pair of tensors. The first is a placeholder for feeding a
tensor handle and the second is the tensor in the session state
keyed by the tensor handle.
- - -
### `tf.delete_session_tensor(name=None)` {#delete_session_tensor}
Delete the tensor by feeding a tensor handle.
This is EXPERIMENTAL and subject to change.
Delete the tensor of a given tensor handle. The tensor is produced
in a previous run() and stored in the state of the session.
##### Args:
* <b>`name`</b>: Optional name prefix for the return tensor.
##### Returns:
A pair of graph elements. The first is a placeholder for feeding a
tensor handle and the second is a deletion operation.

View File

@ -781,7 +781,7 @@ checkpoints per device.
- - -
#### `tf.train.Saver.save(sess, save_path, global_step=None, latest_filename=None, meta_graph_suffix='meta')` {#Saver.save}
#### `tf.train.Saver.save(sess, save_path, global_step=None, latest_filename=None, meta_graph_suffix='meta', write_meta_graph=True)` {#Saver.save}
Saves variables.
@ -807,6 +807,8 @@ path can be passed directly to a call to `restore()`.
managed by the saver to keep track of recent checkpoints. Defaults to
'checkpoint'.
* <b>`meta_graph_suffix`</b>: Suffix for `MetaGraphDef` file. Defaults to 'meta'.
* <b>`write_meta_graph`</b>: `Boolean` indicating whether or not to write the meta
graph file.
##### Returns:

View File

@ -0,0 +1,96 @@
<!-- This file is machine generated: DO NOT EDIT! -->
# Strings
Note: Functions taking `Tensor` arguments can also take anything accepted by
[`tf.convert_to_tensor`](framework.md#convert_to_tensor).
[TOC]
## Hashing
String hashing ops take a string input tensor and map each element to an
integer.
- - -
### `tf.string_to_hash_bucket(string_tensor, num_buckets, name=None)` {#string_to_hash_bucket}
Converts each string in the input Tensor to its hash, modulo the number of buckets.
The hash function is deterministic on the content of the string within the
process.
Note that the hash function may change from time to time.
##### Args:
* <b>`string_tensor`</b>: A `Tensor` of type `string`.
* <b>`num_buckets`</b>: An `int` that is `>= 1`. The number of buckets.
* <b>`name`</b>: A name for the operation (optional).
##### Returns:
A `Tensor` of type `int64`.
A Tensor of the same shape as the input `string_tensor`.
## Joining
String joining ops concatenate elements of input string tensors to produce a new
string tensor.
- - -
### `tf.reduce_join(inputs, reduction_indices, keep_dims=None, separator=None, name=None)` {#reduce_join}
Joins a string Tensor across the given dimensions.
Computes the string join across dimensions in the given string Tensor of shape
`[d_0, d_1, ..., d_n-1]`. Returns a new Tensor created by joining the input
strings with the given separator (default: empty string). Negative indices are
counted backwards from the end, with `-1` being equivalent to `n - 1`. Passing
an empty `reduction_indices` joins all strings in linear index order and outputs
a scalar string.
For example:
```
# tensor `a` is [["a", "b"], ["c", "d"]]
tf.reduce_join(a, 0) ==> ["ac", "bd"]
tf.reduce_join(a, 1) ==> ["ab", "cd"]
tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
tf.reduce_join(a, [0, 1]) ==> ["acbd"]
tf.reduce_join(a, [1, 0]) ==> ["abcd"]
tf.reduce_join(a, []) ==> ["abcd"]
```
##### Args:
* <b>`inputs`</b>: A `Tensor` of type `string`.
The input to be joined. All reduced indices must have non-zero size.
* <b>`reduction_indices`</b>: A `Tensor` of type `int32`.
The dimensions to reduce over. Dimensions are reduced in the
order specified. If `reduction_indices` has higher rank than `1`, it is
flattened. Omitting `reduction_indices` is equivalent to passing
`[n-1, n-2, ..., 0]`. Negative indices from `-n` to `-1` are supported.
* <b>`keep_dims`</b>: An optional `bool`. Defaults to `False`.
If `True`, retain reduced dimensions with length `1`.
* <b>`separator`</b>: An optional `string`. Defaults to `""`.
The separator to use when joining.
* <b>`name`</b>: A name for the operation (optional).
##### Returns:
A `Tensor` of type `string`.
Has shape equal to that of the input with reduced dimensions removed or
set to `1` depending on `keep_dims`.

View File

@ -1558,7 +1558,7 @@ communicate with any other server in the same cluster.
Creates a new server with the given definition.
The `job_name`, `task_index`, and `protocol` arguments are optional, and
override any information also provided in `server_or_cluster_def`.
override any information provided in `server_or_cluster_def`.
##### Args:
@ -1567,13 +1567,15 @@ override any information also provided in `server_or_cluster_def`.
`tf.train.ClusterDef` protocol buffer, or a
`tf.train.ClusterSpec` object, describing the server to be
created and/or the cluster of which it is a member.
* <b>`job_name`</b>: (Optional.) If not specified in `server_or_cluster_def`,
specifies the name of the job of which this server is a member.
* <b>`task_index`</b>: (Optional.) If not specified in `server_or_cluster_def`,
specifies the task index of this server in its job.
* <b>`protocol`</b>: (Optional.) If not specified in `server_or_cluster_def`,
specifies the protocol to be used by this server. Acceptable
values include `"grpc"`.
* <b>`job_name`</b>: (Optional.) Specifies the name of the job of which the server
is a member. Defaults to the value in `server_or_cluster_def`, if
specified.
* <b>`task_index`</b>: (Optional.) Specifies the task index of the server in its
job. Defaults to the value in `server_or_cluster_def`, if specified.
Otherwise defaults to 0 if the server's job has only one task.
* <b>`protocol`</b>: (Optional.) Specifies the protocol to be used by the server.
Acceptable values include `"grpc"`. Defaults to the value in
`server_or_cluster_def`, if specified. Otherwise defaults to `"grpc"`.
* <b>`start`</b>: (Optional.) Boolean, indicating whether to start the server
after creating it. Defaults to `True`.
@ -2677,7 +2679,7 @@ Returns a list of tasks in the given job.
##### Returns:
A list of strings, corresponding to the network addresses of tasks in
the given job.
the given job, ordered by task index.
##### Raises:
@ -2852,7 +2854,7 @@ The generated
[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
has one summary value containing a histogram for `values`.
This op reports an `OutOfRange` error if any value is not finite.
This op reports an `InvalidArgument` error if any value is not finite.
##### Args:

View File

@ -8,7 +8,8 @@ your TensorFlow graph, plot quantitative metrics about the execution of your
graph, and show additional data like images that pass through it. When
TensorBoard is fully configured, it looks like this:
![MNIST TensorBoard](../../images/mnist_tensorboard.png "MNIST TensorBoard")
[![MNIST TensorBoard](../../images/mnist_tensorboard.png "MNIST TensorBoard")](http://tensorflow.org/tensorboard)
[*Click try a TensorBoard with data from this tutorial!*](http://tensorflow.org/tensorboard)
## Serializing the data
@ -75,56 +76,70 @@ statistics, such as how the weights or accuracy varied during training.
The code below is an excerpt; full source is [here](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py).
```python
# Create the model
x = tf.placeholder(tf.float32, [None, 784], name="x-input")
W = tf.Variable(tf.zeros([784,10]), name="weights")
b = tf.Variable(tf.zeros([10], name="bias"))
def variable_summaries(var, name):
with tf.name_scope("summaries"):
mean = tf.reduce_mean(var)
tf.scalar_summary('mean/' + name, mean)
with tf.name_scope('stddev'):
stddev = tf.sqrt(tf.reduce_sum(tf.square(var - mean)))
tf.scalar_summary('sttdev/' + name, stddev)
tf.scalar_summary('max/' + name, tf.reduce_max(var))
tf.scalar_summary('min/' + name, tf.reduce_min(var))
tf.histogram_summary(name, var)
# use a name scope to organize nodes in the graph visualizer
with tf.name_scope("Wx_b") as scope:
y = tf.nn.softmax(tf.matmul(x,W) + b)
def nn_layer(input_tensor, input_dim, output_dim, layer_name):
"""Reusable code for making a simple neural net layer.
# Add summary ops to collect data
tf.histogram_summary("weights", W)
tf.histogram_summary("biases", b)
tf.histogram_summary("y", y)
It does a matrix multiply, bias add, and then uses relu to nonlinearize.
It also sets up name scoping so that the resultant graph is easy to read, and
adds a number of summary ops.
"""
# Adding a name scope ensures logical grouping of the layers in the graph.
with tf.name_scope(layer_name):
# This Variable will hold the state of the weights for the layer
with tf.name_scope("weights"):
weights = weight_variable([input_dim, output_dim])
variable_summaries(weights, layer_name + '/weights')
with tf.name_scope("biases"):
biases = bias_variable([output_dim])
variable_summaries(biases, layer_name + '/biases')
with tf.name_scope('Wx_plus_b'):
activations = tf.matmul(input_tensor, weights) + biases
tf.histogram_summary(layer_name + '/activations', activations)
relu = tf.nn.relu(activations, 'relu')
tf.histogram_summary(layer_name + '/activations_relu', relu)
return tf.nn.dropout(relu, keep_prob)
# Define loss and optimizer
y_ = tf.placeholder(tf.float32, [None,10], name="y-input")
# More name scopes will clean up the graph representation
with tf.name_scope("xent") as scope:
cross_entropy = -tf.reduce_sum(y_*tf.log(y))
tf.scalar_summary("cross entropy", cross_entropy)
with tf.name_scope("train") as scope:
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
layer1 = nn_layer(x, 784, 50, 'layer1')
layer2 = nn_layer(layer1, 50, 10, 'layer2')
y = tf.nn.softmax(layer2, 'predictions')
with tf.name_scope("test") as scope:
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
tf.scalar_summary("accuracy", accuracy)
# Merge all the summaries and write them out to /tmp/mnist_logs
with tf.name_scope('cross_entropy'):
diff = y_ * tf.log(y)
with tf.name_scope('total'):
cross_entropy = -tf.reduce_sum(diff)
with tf.name_scope('normalized'):
normalized_cross_entropy = -tf.reduce_mean(diff)
tf.scalar_summary('cross entropy', normalized_cross_entropy)
with tf.name_scope('train'):
train_step = tf.train.AdamOptimizer(
FLAGS.learning_rate).minimize(cross_entropy)
with tf.name_scope('accuracy'):
with tf.name_scope('correct_prediction'):
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
with tf.name_scope('accuracy'):
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
tf.scalar_summary('accuracy', accuracy)
# Merge all the summaries and write them out to /tmp/mnist_logs (by default)
merged = tf.merge_all_summaries()
writer = tf.train.SummaryWriter("/tmp/mnist_logs", sess.graph)
train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/train', sess.graph)
test_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/test')
tf.initialize_all_variables().run()
# Train the model, and feed in test data and record summaries every 10 steps
for i in range(1000):
if i % 10 == 0: # Record summary data, and the accuracy
feed = {x: mnist.test.images, y_: mnist.test.labels}
result = sess.run([merged, accuracy], feed_dict=feed)
summary_str = result[0]
acc = result[1]
writer.add_summary(summary_str, i)
print("Accuracy at step %s: %s" % (i, acc))
else:
batch_xs, batch_ys = mnist.train.next_batch(100)
feed = {x: batch_xs, y_: batch_ys}
sess.run(train_step, feed_dict=feed)
print(accuracy.eval({x: mnist.test.images, y_: mnist.test.labels}))
```
You're now all set to visualize this data using TensorBoard.
@ -135,7 +150,7 @@ You're now all set to visualize this data using TensorBoard.
To run TensorBoard, use the command
```bash
python tensorflow/tensorboard/tensorboard.py --logdir=path/to/log-directory
tensorboard --logdir=path/to/log-directory
```
where `logdir` points to the directory where the `SummaryWriter` serialized its
@ -144,18 +159,8 @@ serialized data from separate runs, then TensorBoard will visualize the data
from all of those runs. Once TensorBoard is running, navigate your web browser
to `localhost:6006` to view the TensorBoard.
If you have pip installed TensorFlow, `tensorboard` is installed into
the system path, so you can use the simpler command
```bash
tensorboard --logdir=/path/to/log-directory
```
When looking at TensorBoard, you will see the navigation tabs in the top right
corner. Each tab represents a set of serialized data that can be visualized.
For any tab you are looking at, if the logs being looked at by TensorBoard do
not contain any data relevant to that tab, a message will be displayed
indicating how to serialize data that is applicable to that tab.
For in depth information on how to use the *graph* tab to visualize your graph,
see [TensorBoard: Graph Visualization](../../how_tos/graph_viz/index.md).

View File

@ -575,6 +575,9 @@ tf_gen_op_wrapper_py(
"TensorArraySplit",
"TensorArrayUnpack",
"TensorArrayWrite",
"GetSessionHandle",
"GetSessionTensor",
"DeleteSessionTensor",
],
require_shape_functions = True,
)
@ -810,6 +813,7 @@ py_library(
"ops/rnn_cell.py",
"ops/script_ops.py",
"ops/seq2seq.py",
"ops/session_ops.py",
"ops/sparse_grad.py",
"ops/sparse_ops.py",
"ops/standard_ops.py",

View File

@ -106,8 +106,10 @@ from tensorflow.python.ops import histogram_ops
from tensorflow.python.ops import io_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import script_ops
from tensorflow.python.ops import session_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import string_ops
# Don't export modules except for the few we really want
@ -120,7 +122,8 @@ _whitelist = set([app, compat, contrib, errors, flags, gfile, image,
__all__ = make_all(__name__,
[framework_lib, array_ops, client_lib, constant_op,
control_flow_ops, functional_ops, histogram_ops, io_ops,
math_ops, nn, script_ops, sparse_ops, state_ops, train])
math_ops, nn, script_ops, session_ops, sparse_ops,
state_ops, string_ops, train])
# Symbols whitelisted for export without documentation.
# TODO(cwhipkey): review these and move to contrib, expose through
@ -167,7 +170,6 @@ __all__.extend([
'sparse_matmul',
'sparse_segment_mean_grad',
'sparse_segment_sqrt_n_grad',
'string_to_hash_bucket',
'unique_with_counts',
'user_ops',
])

View File

@ -27,6 +27,7 @@ import numpy as np
from tensorflow.python import pywrap_tensorflow as tf_session
from tensorflow.python.framework import errors
from tensorflow.python.framework import ops
from tensorflow.python.ops import session_ops
from tensorflow.python.platform import logging
from tensorflow.python.util import compat
@ -99,6 +100,9 @@ class BaseSession(SessionInterface):
self._extend_lock = threading.Lock()
self._target = target
self._delete_lock = threading.Lock()
self._dead_handles = []
self._session = None
opts = tf_session.TF_NewSessionOptions(target=target, config=config)
@ -277,6 +281,9 @@ class BaseSession(SessionInterface):
the *i*th return value will be a
[`SparseTensorValue`](../../api_docs/python/sparse_ops.md#SparseTensorValue)
containing the value of that sparse tensor.
* If the *i*th element of `fetches` is produced by a `get_tensor_handle` op,
the *i*th return value will be a numpy ndarray containing the handle of
that tensor.
The optional `feed_dict` argument allows the caller to override
the value of tensors in the graph. Each key in `feed_dict` can be
@ -350,17 +357,22 @@ class BaseSession(SessionInterface):
list of feeds and fetches that will be used in the subsequent
`partial_run` calls.
The optional `feed_dict` argument allows the caller to override
the value of tensors in the graph. See run() for more information.
Below is a simple example:
a = array_ops.placeholder(dtypes.float32, shape=[])
b = array_ops.placeholder(dtypes.float32, shape=[])
c = array_ops.placeholder(dtypes.float32, shape=[])
r1 = math_ops.add(a, b)
r2 = math_ops.mul(r1, c)
```python
a = array_ops.placeholder(dtypes.float32, shape=[])
b = array_ops.placeholder(dtypes.float32, shape=[])
c = array_ops.placeholder(dtypes.float32, shape=[])
r1 = math_ops.add(a, b)
r2 = math_ops.mul(r1, c)
h = sess.partial_run_setup([r1, r2], [a, b, c])
res = sess.partial_run(h, r1, feed_dict={a: 1, b: 2})
res = sess.partial_run(h, r2, feed_dict={c: res})
h = sess.partial_run_setup([r1, r2], [a, b, c])
res = sess.partial_run(h, r1, feed_dict={a: 1, b: 2})
res = sess.partial_run(h, r2, feed_dict={c: res})
```
Args:
handle: A handle for a sequence of partial runs.
@ -410,7 +422,7 @@ class BaseSession(SessionInterface):
'graph before calling run().')
# Validate and process fetches.
unique_fetches, target_list, _ = self._process_fetches(fetches)
unique_fetches, target_list, _, _ = self._process_fetches(fetches)
# Create request.
feed_list = []
@ -455,6 +467,7 @@ class BaseSession(SessionInterface):
fetches = [fetches]
unique_fetch_targets = set()
unique_fetch_handles = {}
target_list = []
fetch_info = []
@ -465,10 +478,15 @@ class BaseSession(SessionInterface):
try:
fetch_t = self.graph.as_graph_element(subfetch, allow_tensor=True,
allow_operation=True)
fetch_name = compat.as_bytes(fetch_t.name)
if isinstance(fetch_t, ops.Operation):
target_list.append(compat.as_bytes(fetch_t.name))
target_list.append(fetch_name)
else:
subfetch_names.append(compat.as_bytes(fetch_t.name))
subfetch_names.append(fetch_name)
# Remember the fetch if it is for a tensor handle.
if (isinstance(fetch_t, ops.Tensor) and
fetch_t.op.type == 'GetSessionHandle'):
unique_fetch_handles[fetch_name] = fetch_t.op.inputs[0].dtype
except TypeError as e:
raise TypeError('Fetch argument %r of %r has invalid type %r, '
'must be a string or Tensor. (%s)'
@ -483,7 +501,7 @@ class BaseSession(SessionInterface):
fetch_info.append((subfetch_names, fetch_contraction_fn))
unique_fetch_targets = list(unique_fetch_targets)
return unique_fetch_targets, target_list, fetch_info
return unique_fetch_targets, target_list, fetch_info, unique_fetch_handles
def _run(self, handle, fetches, feed_dict, options, run_metadata):
"""Perform either run or partial_run, depending the exitence of `handle`."""
@ -502,10 +520,15 @@ class BaseSession(SessionInterface):
'graph before calling run().')
# Validate and process fetches.
unique_fetches, target_list, fetch_info = self._process_fetches(fetches)
processed_fetches = self._process_fetches(fetches)
unique_fetches = processed_fetches[0]
target_list = processed_fetches[1]
fetch_info = processed_fetches[2]
unique_handles = processed_fetches[3]
# Create request.
feed_dict_string = {}
feed_map = {}
# Validate and process feed_dict.
if feed_dict:
@ -522,7 +545,6 @@ class BaseSession(SessionInterface):
raise TypeError('The value of a feed cannot be a tf.Tensor object. '
'Acceptable feed values include Python scalars, '
'strings, lists, or numpy ndarrays.')
np_val = np.array(subfeed_val, dtype=subfeed_t.dtype.as_numpy_dtype)
if not subfeed_t.get_shape().is_compatible_with(np_val.shape):
raise ValueError(
@ -531,17 +553,31 @@ class BaseSession(SessionInterface):
% (np_val.shape, subfeed_t.name, str(subfeed_t.get_shape())))
if not self.graph.is_feedable(subfeed_t):
raise ValueError('Tensor %s may not be fed.' % subfeed_t)
feed_dict_string[compat.as_bytes(subfeed_t.name)] = np_val
subfeed_name = compat.as_bytes(subfeed_t.name)
feed_dict_string[subfeed_name] = np_val
feed_map[subfeed_name] = (subfeed_t, subfeed_val)
# Run request and get response.
results = self._do_run(handle, target_list, unique_fetches,
feed_dict_string, options, run_metadata)
movers = self._update_with_movers(feed_dict_string, feed_map)
try:
results = self._do_run(handle, target_list, unique_fetches,
feed_dict_string, options, run_metadata)
finally:
# The movers are no longer used. Delete them.
for handle in movers:
self._register_dead_handle(handle)
# User may have fetched the same tensor multiple times, but we
# only fetch them from the runtime once. Furthermore, they may
# be wrapped as a tuple of tensors. Here we map the results back
# to what the client asked for.
fetched_results = dict(zip(unique_fetches, results))
# TODO(yuanbyu): Use the contraction_fn in _REGISTERED_EXPANSIONS.
fetched_results = {}
for fetch, result in zip(unique_fetches, results):
dtype = unique_handles.get(fetch)
if dtype:
result = session_ops.TensorHandle(result, dtype, self)
fetched_results[fetch] = result
ret = []
for fetch_names, fetch_contraction_fn in fetch_info:
if fetch_names:
@ -642,6 +678,55 @@ class BaseSession(SessionInterface):
self._current_version = self._graph.version
# The threshold to run garbage collection to delete dead tensors.
_DEAD_HANDLES_THRESHOLD = 10
def _register_dead_handle(self, handle):
# Register a dead handle in the session. Delete the dead tensors when
# the number of dead tensors exceeds certain threshold.
tensors_to_delete = None
with self._delete_lock:
self._dead_handles.append(handle)
if len(self._dead_handles) == BaseSession._DEAD_HANDLES_THRESHOLD:
tensors_to_delete = self._dead_handles
self._dead_handles = []
# Delete the dead tensors.
# TODO(yuanbyu): For now we use a sequence of runs to minimize the graph
# size and the overhead of graph construction/partitioning.
if tensors_to_delete:
for tensor_handle in tensors_to_delete:
feeds = {}
fetches = []
holder, deleter = session_ops._get_handle_deleter(self.graph,
tensor_handle)
feeds[holder] = tensor_handle
fetches.append(deleter)
self.run(fetches, feed_dict=feeds)
def _update_with_movers(self, feed_dict, feed_map):
# If a tensor handle that is fed to a device incompatible placeholder,
# we move the tensor to the right device, generate a new tensor handle,
# and update `feed_dict` to use the new handle.
handle_movers = []
for feed_name, val in feed_map.items():
mover = session_ops._get_handle_mover(self.graph, *val)
if mover:
handle_movers.append((feed_name, val[1], mover))
# Transfer a tensor to the right device if needed.
if not handle_movers:
return []
else:
feeds = {}
fetches = []
for _, handle, mover in handle_movers:
feeds[mover[0]] = handle
fetches.append(mover[1])
handles = self.run(fetches, feed_dict=feeds)
for handle_mover, handle in zip(handle_movers, handles):
np_val = np.array(handle.handle, dtype=np.object)
feed_dict[handle_mover[0]] = np_val
return handles
class Session(BaseSession):
"""A class for running TensorFlow operations.

View File

@ -99,11 +99,12 @@ class Index(Document):
print("", file=f)
def collect_members(module_to_name):
def collect_members(module_to_name, exclude=()):
"""Collect all symbols from a list of modules.
Args:
module_to_name: Dictionary mapping modules to short names.
exclude: Set of fully qualified names to exclude.
Returns:
Dictionary mapping name to (fullname, member) pairs.
@ -116,6 +117,8 @@ def collect_members(module_to_name):
not _always_drop_symbol_re.match(name) and
(all_names is None or name in all_names)):
fullname = '%s.%s' % (module_name, name)
if fullname in exclude:
continue
if name in members:
other_fullname, other_member = members[name]
if member is not other_member:

View File

@ -328,7 +328,7 @@ class AbortedError(OpError):
class OutOfRangeError(OpError):
"""Raised when an operation executed past the valid range.
"""Raised when an operation iterates past the valid input range.
This exception is raised in "end-of-file" conditions, such as when a
[`queue.dequeue()`](../../api_docs/python/io_ops.md#QueueBase.dequeue)

View File

@ -81,9 +81,11 @@ def all_libraries(module_to_name, members, documented):
exclude_symbols=["sparse_matmul", "arg_min", "arg_max",
"lin_space", "sparse_segment_mean_grad"],
prefix=PREFIX_TEXT),
library("string_ops", "Strings", prefix=PREFIX_TEXT),
library("histogram_ops", "Histograms"),
library("control_flow_ops", "Control Flow", prefix=PREFIX_TEXT),
library("functional_ops", "Higher Order Functions", prefix=PREFIX_TEXT),
library("session_ops", "Tensor Handle Operations", prefix=PREFIX_TEXT),
library("image", "Images", tf.image, exclude_symbols=["ResizeMethod"],
prefix=PREFIX_TEXT),
library("sparse_ops", "Sparse Tensors",

View File

@ -1871,6 +1871,14 @@ class Graph(object):
self._colocation_stack = []
# Set of tensors that are dangerous to feed!
self._unfeedable_tensors = set()
# A map of tensor handle placeholder to tensor dtype.
self._handle_feeders = {}
# A map from tensor handle to its read op.
self._handle_readers = {}
# A map from tensor handle to its move op.
self._handle_movers = {}
# A map from tensor handle to its delete op.
self._handle_deleters = {}
def _check_not_finalized(self):
"""Check if the graph is finalized.

View File

@ -36,6 +36,10 @@ class Dimension(object):
def __repr__(self):
return "Dimension(%s)" % repr(self._value)
def __str__(self):
value = self._value
return "?" if value is None else str(value)
def __eq__(self, other):
"""Returns true if `other` has the same known value as this Dimension."""
other = as_dimension(other)
@ -429,17 +433,15 @@ class TensorShape(object):
self._dims = [as_dimension(d) for d in dims_iter]
def __repr__(self):
return "TensorShape(%s)" % self._dims
return "TensorShape(%r)" % self._dims
def __str__(self):
if self.ndims is None:
return "<unknown>"
elif self.ndims == 1:
length = self._dims[0].value
return "(%s,)" % (str(length) if length is not None else "?")
return "(%s,)" % self._dims[0]
else:
return "(%s)" % ", ".join(str(d.value) if d.value is not None else "?"
for d in self._dims)
return "(%s)" % ", ".join(str(d) for d in self._dims)
@property
def dims(self):
@ -541,11 +543,15 @@ class TensorShape(object):
if self._dims is None:
return other
else:
self.assert_same_rank(other)
new_dims = []
for i, dim in enumerate(self._dims):
new_dims.append(dim.merge_with(other[i]))
return TensorShape(new_dims)
try:
self.assert_same_rank(other)
new_dims = []
for i, dim in enumerate(self._dims):
new_dims.append(dim.merge_with(other[i]))
return TensorShape(new_dims)
except ValueError:
raise ValueError("Shapes %s and %s are not compatible" %
(self, other))
def concatenate(self, other):
"""Returns the concatenation of the dimension in `self` and `other`.

View File

@ -143,6 +143,14 @@ class DimensionTest(test_util.TensorFlowTestCase):
self.assertIs(None,
tensor_shape.Dimension(None) != tensor_shape.Dimension(None))
def testRepr(self):
self.assertEqual(repr(tensor_shape.Dimension(7)), "Dimension(7)")
self.assertEqual(repr(tensor_shape.Dimension(None)), "Dimension(None)")
def testStr(self):
self.assertEqual(str(tensor_shape.Dimension(7)), "7")
self.assertEqual(str(tensor_shape.Dimension(None)), "?")
class ShapeTest(test_util.TensorFlowTestCase):

View File

@ -103,6 +103,19 @@ class BenchmarkTest(tf.test.TestCase):
self.assertTrue(_ran_somebenchmark_2[0])
self.assertFalse(_ran_somebenchmark_but_shouldnt[0])
_ran_somebenchmark_1[0] = False
_ran_somebenchmark_2[0] = False
_ran_somebenchmark_but_shouldnt[0] = False
# Test running a specific method of SomeRandomBenchmark
if benchmark.TEST_REPORTER_TEST_ENV in os.environ:
del os.environ[benchmark.TEST_REPORTER_TEST_ENV]
benchmark._run_benchmarks("SomeRandom.*1$")
self.assertTrue(_ran_somebenchmark_1[0])
self.assertFalse(_ran_somebenchmark_2[0])
self.assertFalse(_ran_somebenchmark_but_shouldnt[0])
def testReportingBenchmark(self):
tempdir = tf.test.get_temp_dir()
try:

View File

@ -59,7 +59,7 @@ def isum(s):
i = tf.constant(0, name="i")
c = lambda i, s: tf.less(i, 10)
b = lambda i, s: [tf.add(i, 1), tf.add(i, s)]
_, r_s = control_flow_ops.While(c, b, [i, s])
_, r_s = tf.while_loop(c, b, [i, s])
return r_s
@ -467,7 +467,7 @@ class ControlFlowTest(tf.test.TestCase):
n = tf.constant(0)
c = lambda x: tf.less(x, 10000)
b = lambda x: tf.add(x, 1)
r = control_flow_ops.While(c, b, [n], parallel_iterations=20)
r = tf.while_loop(c, b, [n], parallel_iterations=20)
self.assertEqual(10000, r.eval())
def testWhileWithRefs_1(self):
@ -482,7 +482,7 @@ class ControlFlowTest(tf.test.TestCase):
self.assertEqual(x.dtype, tf.int32_ref)
return (i+1, gen_array_ops._ref_identity(x))
r = control_flow_ops.While(c, b, [i, x], parallel_iterations=5)
r = tf.while_loop(c, b, [i, x], parallel_iterations=5)
tf.initialize_all_variables().run()
@ -517,7 +517,7 @@ class ControlFlowTest(tf.test.TestCase):
c = tf.convert_to_tensor(0)
o = tf.convert_to_tensor(0)
d = tf.convert_to_tensor(100)
r = control_flow_ops.While(
r = tf.while_loop(
lambda i, m, c, o: tf.less(i, d), compute, [i, m, c, o])
result = r[3].eval()
self.assertTrue(check_op_order(i.graph))
@ -539,7 +539,7 @@ class ControlFlowTest(tf.test.TestCase):
o = tf.convert_to_tensor(0)
x = tf.convert_to_tensor([1, 2, 3, 4, 5, 6])
s = tf.size(x)
r = control_flow_ops.While(
r = tf.while_loop(
lambda i, m, c, o: tf.less(i, s), compute, [i, m, c, o])
result = r[3].eval()
self.assertTrue(check_op_order(i.graph))
@ -559,7 +559,7 @@ class ControlFlowTest(tf.test.TestCase):
o = tf.convert_to_tensor([0])
x = tf.convert_to_tensor([1, 2, 3, 4, 5, 6])
s = tf.size(x)
r = control_flow_ops.While(
r = tf.while_loop(
lambda i, c, o: tf.less(i, s), compute, [i, c, o])
result = r[2].eval()
self.assertTrue(check_op_order(i.graph))
@ -570,7 +570,7 @@ class ControlFlowTest(tf.test.TestCase):
n = tf.constant(1.0)
c = lambda x: tf.less(x, 10.0)
b = lambda x: tf.add(x, 1.0)
r = control_flow_ops.While(c, b, [n])
r = tf.while_loop(c, b, [n])
self.assertAllClose(10.0, r.eval())
def testWhile_Gpu_1(self):
@ -584,7 +584,7 @@ class ControlFlowTest(tf.test.TestCase):
def b(x):
with tf.device("/cpu:0"):
return tf.add(x, 1.0)
r = control_flow_ops.While(c, b, [n])
r = tf.while_loop(c, b, [n])
self.assertAllClose(10.0, r.eval())
def testWhile_Gpu_2(self):
@ -601,11 +601,11 @@ class ControlFlowTest(tf.test.TestCase):
with tf.device("/cpu:0"):
s1 = tf.add(i, s)
return i1, s1
_, r_s = control_flow_ops.While(c, b, [n, s])
_, r_s = tf.while_loop(c, b, [n, s])
return r_s
c = lambda x: tf.less(x, 200)
b = lambda x: tf.add(x, cpu_sum(n))
r = control_flow_ops.While(c, b, [n])
r = tf.while_loop(c, b, [n])
self.assertEqual(225, r.eval())
def testNestedWhile_1(self):
@ -624,10 +624,8 @@ class ControlFlowTest(tf.test.TestCase):
r_ = tf.constant(12)
return [n_, r_]
res = control_flow_ops.While(condition,
body,
[n, r],
parallel_iterations=1)
res = tf.while_loop(condition, body, [n, r],
parallel_iterations=1)
self.assertAllEqual(12, res[1].eval())
def testWhileWithControl_2(self):
@ -640,7 +638,7 @@ class ControlFlowTest(tf.test.TestCase):
r_ = tf.constant(12)
return [r_]
res = control_flow_ops.While(condition, body, [r], parallel_iterations=1)
res = tf.while_loop(condition, body, [r], parallel_iterations=1)
self.assertAllEqual(12, res.eval())
def testCondWhile_1(self):
@ -649,7 +647,7 @@ class ControlFlowTest(tf.test.TestCase):
c = lambda x: tf.less(x, 10)
b = lambda x: tf.add(x, 1)
r = tf.cond(tf.less(0, 1),
lambda: control_flow_ops.While(c, b, [n]),
lambda: tf.while_loop(c, b, [n]),
lambda: n)
self.assertAllEqual(10, r.eval())
@ -659,7 +657,7 @@ class ControlFlowTest(tf.test.TestCase):
c = lambda x: tf.less(x, 10)
b = lambda x: tf.add(x, 1)
r = tf.cond(tf.less(1, 0), lambda: tf.add(n, 1),
lambda: control_flow_ops.While(c, b, [n]))
lambda: tf.while_loop(c, b, [n]))
self.assertAllEqual(10, r.eval())
def testWhileCond_1(self):
@ -673,7 +671,7 @@ class ControlFlowTest(tf.test.TestCase):
b = lambda x: tf.cond(
tf.constant(True), lambda: tf.add(x, one), lambda: tf.sub(x, one))
# pylint: enable=undefined-variable
r = control_flow_ops.While(c, b, [i])
r = tf.while_loop(c, b, [i])
self.assertAllEqual(10, r.eval())
def testWhileCond_2(self):
@ -681,7 +679,7 @@ class ControlFlowTest(tf.test.TestCase):
n = tf.convert_to_tensor(0, name="n")
c = lambda x: tf.less(x, 10)
b = lambda x: tf.cond(tf.constant(True), lambda: tf.add(x, 1), lambda: n)
r = control_flow_ops.While(c, b, [n])
r = tf.while_loop(c, b, [n])
self.assertAllEqual(10, r.eval())
def testWhileCond_3(self):
@ -693,7 +691,7 @@ class ControlFlowTest(tf.test.TestCase):
b = lambda x: tf.cond(tf.less(0, 1), lambda: tf.add(x, 1),
lambda: tf.sub(x, 1))
# pylint: enable=undefined-variable
r = control_flow_ops.While(c, b, [n])
r = tf.while_loop(c, b, [n])
self.assertAllEqual(10, r.eval())
# NOTE: It is ok to have parallel_iterations > 1
@ -712,10 +710,8 @@ class ControlFlowTest(tf.test.TestCase):
nj = control_flow_ops.with_dependencies([op], nj)
return [nj]
r = control_flow_ops.While(loop_iterator,
loop_body,
[n],
parallel_iterations=1)
r = tf.while_loop(loop_iterator, loop_body, [n],
parallel_iterations=1)
self.assertTrue(check_op_order(n.graph))
tf.initialize_all_variables().run()
self.assertEqual(3, r.eval())
@ -739,10 +735,8 @@ class ControlFlowTest(tf.test.TestCase):
nj = control_flow_ops.with_dependencies([op], nj)
return [nj]
r = control_flow_ops.While(loop_iterator,
loop_body,
[n],
parallel_iterations=1)
r = tf.while_loop(loop_iterator, loop_body, [n],
parallel_iterations=1)
self.assertTrue(check_op_order(n.graph))
tf.initialize_all_variables().run()
self.assertEqual(3, r.eval())
@ -764,10 +758,9 @@ class ControlFlowTest(tf.test.TestCase):
nj = tf.add(j, 1)
return [nj, ns]
r = control_flow_ops.While(loop_iterator,
loop_body,
[n, tf.identity(select)],
parallel_iterations=1)
r = tf.while_loop(loop_iterator, loop_body,
[n, tf.identity(select)],
parallel_iterations=1)
tf.initialize_all_variables().run()
result = r[1].eval()
self.assertTrue(check_op_order(n.graph))
@ -792,8 +785,8 @@ class ControlFlowTest(tf.test.TestCase):
ni = tf.add(i, 1, name="i_add")
return ni
lpa = control_flow_ops.While(pred, loop_body, [c],
parallel_iterations=1)
lpa = tf.while_loop(pred, loop_body, [c],
parallel_iterations=1)
self.assertEqual(0, var_b.eval())
lpa.eval() # Run the loop
@ -819,7 +812,7 @@ class ControlFlowTest(tf.test.TestCase):
inc_b = tf.identity(var_b)
return inc_b
lpa = control_flow_ops.While(pred, loop_body, [var_b], 1, name="loop")
lpa = tf.while_loop(pred, loop_body, [var_b], 1, name="loop")
self.assertEqual(0, var_b.eval())
lpa.eval() # Run the loop
@ -848,7 +841,7 @@ class ControlFlowTest(tf.test.TestCase):
ni = tf.add(i, 1, name="i_add")
return ni
lpa = control_flow_ops.While(pred, loop_body, [c], 1, name="loop")
lpa = tf.while_loop(pred, loop_body, [c], 1, name="loop")
self.assertEqual(0, var_b.eval())
lpa.eval() # Run the loop
@ -868,7 +861,7 @@ class ControlFlowTest(tf.test.TestCase):
ni = control_flow_ops.with_dependencies([q.enqueue((i,))], ni)
return ni
r = control_flow_ops.While(c, b, [i], parallel_iterations=1)
r = tf.while_loop(c, b, [i], parallel_iterations=1)
self.assertEqual([10], r.eval())
for i in xrange(10):
self.assertEqual([i], q.dequeue().eval())
@ -885,7 +878,7 @@ class ControlFlowTest(tf.test.TestCase):
ni = control_flow_ops.with_dependencies(
[gen_data_flow_ops._stack_push(s, i)], ni)
return ni
r = control_flow_ops.While(c, b, [i], parallel_iterations=1)
r = tf.while_loop(c, b, [i], parallel_iterations=1)
x = tf.constant(0)
def c1(i, _):
@ -894,7 +887,7 @@ class ControlFlowTest(tf.test.TestCase):
ni = tf.sub(i, 1)
nx = x + gen_data_flow_ops._stack_pop(s, tf.int32)
return [ni, nx]
_, rx = control_flow_ops.While(c1, b1, [r, x], parallel_iterations=1)
_, rx = tf.while_loop(c1, b1, [r, x], parallel_iterations=1)
self.assertEqual(45, rx.eval())
def testWhileGrad_Square(self):
@ -902,7 +895,7 @@ class ControlFlowTest(tf.test.TestCase):
v = tf.constant(2.0, name="v")
c = lambda v: tf.less(v, 100.0)
b = tf.square
r = control_flow_ops.While(c, b, [v], parallel_iterations=1)
r = tf.while_loop(c, b, [v], parallel_iterations=1)
r = control_flow_ops.cond(tf.less(1, 2), lambda: r, lambda: v)
r = tf.gradients(r, v)[0]
@ -915,7 +908,7 @@ class ControlFlowTest(tf.test.TestCase):
n = tf.constant(0, name="n")
c = lambda i, v: tf.less(i, 5)
b = lambda i, v: [i + 1, tf.mul(x, v)]
r = control_flow_ops.While(c, b, [n, v], parallel_iterations=1)
r = tf.while_loop(c, b, [n, v], parallel_iterations=1)
r = tf.gradients(r[1], x)[0]
self.assertEqual(r.get_shape(), tensor_shape.unknown_shape())
@ -926,7 +919,7 @@ class ControlFlowTest(tf.test.TestCase):
v = tf.constant(2.0, name="v")
c = lambda v: tf.less(v, 100.0)
b = tf.square
r = control_flow_ops.While(c, b, [v], parallel_iterations=1)
r = tf.while_loop(c, b, [v], parallel_iterations=1)
r = tf.mul(r, r)
r = tf.gradients(r, v)[0]
@ -937,7 +930,7 @@ class ControlFlowTest(tf.test.TestCase):
v = tf.constant(2.0, name="v")
c = lambda v: tf.less(v, 100.0)
b = tf.square
r = control_flow_ops.While(c, b, [v], parallel_iterations=1)
r = tf.while_loop(c, b, [v], parallel_iterations=1)
r = tf.add(r, r)
r = tf.gradients(r, v)[0]
@ -949,8 +942,7 @@ class ControlFlowTest(tf.test.TestCase):
v = tf.constant(2.0, name="v")
c = lambda v: tf.less(v, 100.0)
b = lambda v: tf.mul(v, a)
r = control_flow_ops.While(c, b, [v],
parallel_iterations=p_iters)
r = tf.while_loop(c, b, [v], parallel_iterations=p_iters)
grad_a, grad_v = tf.gradients(r, [a, v])
grad_a_val, grad_v_val = sess.run([grad_a, grad_v])
@ -969,7 +961,7 @@ class ControlFlowTest(tf.test.TestCase):
v = tf.constant(2.0, name="v")
c = lambda v: tf.less(v, 100.0)
b = lambda v: tf.mul(v, a)
r = control_flow_ops.While(c, b, [v], parallel_iterations=1)
r = tf.while_loop(c, b, [v], parallel_iterations=1)
r = tf.gradients(r, a)
tf.initialize_all_variables().run()
@ -985,7 +977,7 @@ class ControlFlowTest(tf.test.TestCase):
y1 = tf.add(x, y)
x1 = tf.mul(x, y1)
return x1, y1
rx, ry = control_flow_ops.While(c, b, [x, y], parallel_iterations=1)
rx, ry = tf.while_loop(c, b, [x, y], parallel_iterations=1)
r = tf.gradients([rx, ry], x)
self.assertAllClose(304.0, r[0].eval())
@ -1006,7 +998,7 @@ class ControlFlowTest(tf.test.TestCase):
x = tf.mul(x, 2.0)
i = tf.add(i, 1)
return i, x
ri, rx = control_flow_ops.While(c, b, [i, x], parallel_iterations=1)
ri, rx = tf.while_loop(c, b, [i, x], parallel_iterations=1)
r = tf.gradients([ri, rx], x)
self.assertAllClose(1024.0, r[0].eval())
@ -1018,7 +1010,7 @@ class ControlFlowTest(tf.test.TestCase):
v = tf.constant(2.0, name="v")
c = lambda v: tf.less(v, 100.0)
b = tf.square
r = control_flow_ops.While(c, b, [v], back_prop=False)
r = tf.while_loop(c, b, [v], back_prop=False)
r = tf.add(r, v)
r = tf.gradients(r, v)
self.assertAllClose(1.0, r[0].eval())
@ -1033,8 +1025,8 @@ class ControlFlowTest(tf.test.TestCase):
x = tf.mul(x, 2.0)
i = tf.add(i, 1)
return i, x
_, rx = control_flow_ops.While(c, b, [i, x], parallel_iterations=1)
_, rx = control_flow_ops.While(c, b, [i, rx], parallel_iterations=1)
_, rx = tf.while_loop(c, b, [i, x], parallel_iterations=1)
_, rx = tf.while_loop(c, b, [i, rx], parallel_iterations=1)
r = tf.gradients([rx], x)
self.assertAllClose(1024.0, r[0].eval())
@ -1049,8 +1041,8 @@ class ControlFlowTest(tf.test.TestCase):
x = tf.mul(x, 2.0)
i = tf.add(i, 1)
return i, x
_, r1 = control_flow_ops.While(c, b, [i, x], parallel_iterations=1)
_, r2 = control_flow_ops.While(c, b, [i, x], parallel_iterations=1)
_, r1 = tf.while_loop(c, b, [i, x], parallel_iterations=1)
_, r2 = tf.while_loop(c, b, [i, x], parallel_iterations=1)
rx = tf.add(r1, r2)
r = tf.gradients([rx], x)
@ -1062,10 +1054,10 @@ class ControlFlowTest(tf.test.TestCase):
def inner_loop(s):
c = lambda x: tf.less(x, 4.0)
b = lambda x: tf.mul(x, 2.0)
return control_flow_ops.While(c, b, [s])
return tf.while_loop(c, b, [s])
c = lambda x: tf.less(x, 2.0)
b = lambda x: tf.mul(inner_loop(x), 2.0)
r = control_flow_ops.While(c, b, [v])
r = tf.while_loop(c, b, [v])
r = tf.gradients(r, v)[0]
self.assertAllClose(8.0, r.eval())
@ -1081,15 +1073,15 @@ class ControlFlowTest(tf.test.TestCase):
z = tf.constant(0)
c = lambda i, x: tf.less(i, 4)
b = lambda i, x: [tf.add(i, 1), tf.mul(x, 2.0)]
return control_flow_ops.While(c, b, [z, s])
return tf.while_loop(c, b, [z, s])
def inner_loop2(s):
z = tf.constant(0)
c = lambda i, x: tf.less(i, 4)
b = lambda i, x: [tf.add(i, 1), tf.mul(x, 2.0)]
return control_flow_ops.While(c, b, [z, s])
return tf.while_loop(c, b, [z, s])
c = lambda x: tf.less(x, 128.0)
b = lambda x: inner_loop2(inner_loop1(x)[1])[1]
r = control_flow_ops.While(c, b, [v])
r = tf.while_loop(c, b, [v])
r = tf.gradients(r, v)[0]
self.assertAllClose(256.0, r.eval())
@ -1101,15 +1093,15 @@ class ControlFlowTest(tf.test.TestCase):
z = tf.constant(0)
c = lambda i, x: tf.less(i, 4)
b = lambda i, x: [tf.add(i, 1), tf.mul(x, 2.0)]
return control_flow_ops.While(c, b, [z, s])
return tf.while_loop(c, b, [z, s])
def inner_loop2(s):
z = tf.constant(0)
c = lambda i, x: tf.less(i, 4)
b = lambda i, x: [tf.add(i, 1), tf.mul(x, 2.0)]
return control_flow_ops.While(c, b, [z, s])
return tf.while_loop(c, b, [z, s])
c = lambda x: tf.less(x, 128.0)
b = lambda x: tf.mul(inner_loop1(x)[1], inner_loop2(x)[1])
r = control_flow_ops.While(c, b, [v])
r = tf.while_loop(c, b, [v])
r = tf.gradients(r, v)[0]
self.assertAllClose(512.0, r.eval())
@ -1126,7 +1118,7 @@ class ControlFlowTest(tf.test.TestCase):
lambda: tf.square(x),
lambda: tf.sub(x, one))
# pylint: enable=undefined-variable
r = control_flow_ops.While(c, b, [v])
r = tf.while_loop(c, b, [v])
r = tf.gradients(r, v)[0]
self.assertAllClose(1024.0, r.eval())
@ -1146,7 +1138,7 @@ class ControlFlowTest(tf.test.TestCase):
lambda: tf.square(x),
lambda: tf.sub(x, one))
# pylint: enable=undefined-variable
r = control_flow_ops.While(c, b, [v])
r = tf.while_loop(c, b, [v])
r = tf.gradients(r, v)[0]
r = sess.run(r, feed_dict={v: 2.0})
self.assertAllClose(1024.0, r)
@ -1165,7 +1157,7 @@ class ControlFlowTest(tf.test.TestCase):
return (i+1, gen_array_ops._ref_identity(x))
# pylint: enable=protected-access
r = control_flow_ops.While(c, body, [i, x], parallel_iterations=5)
r = tf.while_loop(c, body, [i, x], parallel_iterations=5)
grad_ys = [tf.Variable(73).ref()]
grad = tf.gradients([r[1]], [x], grad_ys=grad_ys)

View File

@ -17,10 +17,118 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy
import numpy as np
import tensorflow as tf
class BatchMatrixDiagTest(tf.test.TestCase):
  """Checks tf.batch_matrix_diag against numpy-built diagonal matrices."""

  # Subclasses flip this to run the identical cases with GPU placement.
  _use_gpu = False

  def testVector(self):
    # A rank-1 input becomes a single diagonal matrix.
    with self.test_session(use_gpu=self._use_gpu):
      vec = np.array([1.0, 2.0, 3.0])
      expected = np.diag(vec)
      result = tf.batch_matrix_diag(vec)
      self.assertEqual((3, 3), result.get_shape())
      self.assertAllEqual(result.eval(), expected)

  def testBatchVector(self):
    # A rank-2 input yields one diagonal matrix per batch row.
    with self.test_session(use_gpu=self._use_gpu):
      batch = np.array([[1.0, 2.0, 3.0],
                        [4.0, 5.0, 6.0]])
      expected_batch = np.array(
          [[[1.0, 0.0, 0.0],
            [0.0, 2.0, 0.0],
            [0.0, 0.0, 3.0]],
           [[4.0, 0.0, 0.0],
            [0.0, 5.0, 0.0],
            [0.0, 0.0, 6.0]]])
      result = tf.batch_matrix_diag(batch)
      self.assertEqual((2, 3, 3), result.get_shape())
      self.assertAllEqual(result.eval(), expected_batch)

  def testInvalidShape(self):
    # Rank-0 input is rejected at graph-construction time.
    with self.assertRaisesRegexp(ValueError, "must have rank at least 1"):
      tf.batch_matrix_diag(0)

  def testInvalidShapeAtEval(self):
    # With an unknown static shape the rank check only fires at run time.
    with self.test_session(use_gpu=self._use_gpu):
      placeholder = tf.placeholder(dtype=tf.float32)
      with self.assertRaisesOpError("input must be at least 1-dim"):
        tf.batch_matrix_diag(placeholder).eval(feed_dict={placeholder: 0.0})

  def testGrad(self):
    # Numeric gradient check over a few representative input ranks.
    shapes = ((3,), (18, 4), (1, 9, 4, 8,))
    with self.test_session(use_gpu=self._use_gpu):
      for shape in shapes:
        operand = tf.constant(np.random.rand(*shape), np.float32)
        diag = tf.batch_matrix_diag(operand)
        err = tf.test.compute_gradient_error(
            operand, operand.get_shape().as_list(),
            diag, diag.get_shape().as_list())
        self.assertLess(err, 1e-4)
class BatchMatrixDiagGpuTest(BatchMatrixDiagTest):
  """Re-runs every BatchMatrixDiagTest case with GPU placement enabled."""

  _use_gpu = True
class BatchMatrixDiagPartTest(tf.test.TestCase):
  """Checks tf.batch_matrix_diag_part extracts matrix diagonals correctly."""

  # Subclasses flip this to run the identical cases with GPU placement.
  _use_gpu = False

  def testMatrix(self):
    # The diagonal of a single square matrix comes back as a vector.
    with self.test_session(use_gpu=self._use_gpu):
      diag_values = np.array([1.0, 2.0, 3.0])
      matrix = np.diag(diag_values)
      extracted = tf.batch_matrix_diag_part(matrix)
      self.assertEqual((3,), extracted.get_shape())
      self.assertAllEqual(extracted.eval(), diag_values)

  def testBatchMatrix(self):
    # A batch of matrices yields one diagonal vector per batch entry.
    with self.test_session(use_gpu=self._use_gpu):
      expected_diags = np.array([[1.0, 2.0, 3.0],
                                 [4.0, 5.0, 6.0]])
      matrices = np.array(
          [[[1.0, 0.0, 0.0],
            [0.0, 2.0, 0.0],
            [0.0, 0.0, 3.0]],
           [[4.0, 0.0, 0.0],
            [0.0, 5.0, 0.0],
            [0.0, 0.0, 6.0]]])
      self.assertEqual(matrices.shape, (2, 3, 3))
      extracted = tf.batch_matrix_diag_part(matrices)
      self.assertEqual((2, 3), extracted.get_shape())
      self.assertAllEqual(extracted.eval(), expected_diags)

  def testInvalidShape(self):
    # Both the rank check and the square-matrix check fire at graph time
    # when the static shape is known.
    with self.assertRaisesRegexp(ValueError, "must have rank at least 2"):
      tf.batch_matrix_diag_part(0)
    with self.assertRaisesRegexp(ValueError, r"Dimensions .* not compatible"):
      tf.batch_matrix_diag_part([[0, 1], [1, 0], [0, 0]])

  def testInvalidShapeAtEval(self):
    # With an unknown static shape the same checks only fire at run time.
    with self.test_session(use_gpu=self._use_gpu):
      placeholder = tf.placeholder(dtype=tf.float32)
      with self.assertRaisesOpError("input must be at least 2-dim"):
        tf.batch_matrix_diag_part(placeholder).eval(
            feed_dict={placeholder: 0.0})
      with self.assertRaisesOpError("last two dimensions must be equal"):
        tf.batch_matrix_diag_part(placeholder).eval(
            feed_dict={placeholder: [[0, 1], [1, 0], [0, 0]]})

  def testGrad(self):
    # Numeric gradient check over a few representative batch shapes.
    shapes = ((3, 3), (18, 3, 3), (1, 9, 4, 3, 5, 5))
    with self.test_session(use_gpu=self._use_gpu):
      for shape in shapes:
        operand = tf.constant(np.random.rand(*shape), dtype=np.float32)
        diag = tf.batch_matrix_diag_part(operand)
        err = tf.test.compute_gradient_error(
            operand, operand.get_shape().as_list(),
            diag, diag.get_shape().as_list())
        self.assertLess(err, 1e-4)
class BatchMatrixDiagPartGpuTest(BatchMatrixDiagPartTest):
  """Re-runs every BatchMatrixDiagPartTest case with GPU placement enabled."""

  _use_gpu = True
class DiagTest(tf.test.TestCase):
def diagOp(self, diag, dtype, expected_ans, use_gpu=False):
@ -35,56 +143,56 @@ class DiagTest(tf.test.TestCase):
self.assertShapeEqual(diag, tf_ans_inv)
def testEmptyTensor(self):
x = numpy.array([])
expected_ans = numpy.empty([0, 0])
self.diagOp(x, numpy.int32, expected_ans)
x = np.array([])
expected_ans = np.empty([0, 0])
self.diagOp(x, np.int32, expected_ans)
def testRankOneIntTensor(self):
x = numpy.array([1, 2, 3])
expected_ans = numpy.array(
x = np.array([1, 2, 3])
expected_ans = np.array(
[[1, 0, 0],
[0, 2, 0],
[0, 0, 3]])
self.diagOp(x, numpy.int32, expected_ans)
self.diagOp(x, numpy.int64, expected_ans)
self.diagOp(x, np.int32, expected_ans)
self.diagOp(x, np.int64, expected_ans)
def testRankOneFloatTensor(self):
x = numpy.array([1.1, 2.2, 3.3])
expected_ans = numpy.array(
x = np.array([1.1, 2.2, 3.3])
expected_ans = np.array(
[[1.1, 0, 0],
[0, 2.2, 0],
[0, 0, 3.3]])
self.diagOp(x, numpy.float32, expected_ans)
self.diagOp(x, numpy.float64, expected_ans)
self.diagOp(x, np.float32, expected_ans)
self.diagOp(x, np.float64, expected_ans)
def testRankTwoIntTensor(self):
x = numpy.array([[1, 2, 3], [4, 5, 6]])
expected_ans = numpy.array(
x = np.array([[1, 2, 3], [4, 5, 6]])
expected_ans = np.array(
[[[[1, 0, 0], [0, 0, 0]],
[[0, 2, 0], [0, 0, 0]],
[[0, 0, 3], [0, 0, 0]]],
[[[0, 0, 0], [4, 0, 0]],
[[0, 0, 0], [0, 5, 0]],
[[0, 0, 0], [0, 0, 6]]]])
self.diagOp(x, numpy.int32, expected_ans)
self.diagOp(x, numpy.int64, expected_ans)
self.diagOp(x, np.int32, expected_ans)
self.diagOp(x, np.int64, expected_ans)
def testRankTwoFloatTensor(self):
x = numpy.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]])
expected_ans = numpy.array(
x = np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]])
expected_ans = np.array(
[[[[1.1, 0, 0], [0, 0, 0]],
[[0, 2.2, 0], [0, 0, 0]],
[[0, 0, 3.3], [0, 0, 0]]],
[[[0, 0, 0], [4.4, 0, 0]],
[[0, 0, 0], [0, 5.5, 0]],
[[0, 0, 0], [0, 0, 6.6]]]])
self.diagOp(x, numpy.float32, expected_ans)
self.diagOp(x, numpy.float64, expected_ans)
self.diagOp(x, np.float32, expected_ans)
self.diagOp(x, np.float64, expected_ans)
def testRankThreeFloatTensor(self):
x = numpy.array([[[1.1, 2.2], [3.3, 4.4]],
[[5.5, 6.6], [7.7, 8.8]]])
expected_ans = numpy.array(
x = np.array([[[1.1, 2.2], [3.3, 4.4]],
[[5.5, 6.6], [7.7, 8.8]]])
expected_ans = np.array(
[[[[[[1.1, 0], [0, 0]], [[0, 0], [0, 0]]],
[[[0, 2.2], [0, 0]], [[0, 0], [0, 0]]]],
[[[[0, 0], [3.3, 0]], [[0, 0], [0, 0]]],
@ -93,14 +201,14 @@ class DiagTest(tf.test.TestCase):
[[[0, 0], [0, 0]], [[0, 6.6], [0, 0]]]],
[[[[0, 0], [0, 0]], [[0, 0], [7.7, 0]]],
[[[0, 0], [0, 0]], [[0, 0], [0, 8.8]]]]]])
self.diagOp(x, numpy.float32, expected_ans)
self.diagOp(x, numpy.float64, expected_ans)
self.diagOp(x, np.float32, expected_ans)
self.diagOp(x, np.float64, expected_ans)
class DiagPartOpTest(tf.test.TestCase):
def setUp(self):
numpy.random.seed(0)
np.random.seed(0)
def diagPartOp(self, tensor, dtpe, expected_ans, use_gpu=False):
with self.test_session(use_gpu=use_gpu):
@ -110,64 +218,64 @@ class DiagPartOpTest(tf.test.TestCase):
self.assertShapeEqual(expected_ans, tf_ans_inv)
def testRankTwoFloatTensor(self):
x = numpy.random.rand(3, 3)
i = numpy.arange(3)
x = np.random.rand(3, 3)
i = np.arange(3)
expected_ans = x[i, i]
self.diagPartOp(x, numpy.float32, expected_ans)
self.diagPartOp(x, numpy.float64, expected_ans)
self.diagPartOp(x, np.float32, expected_ans)
self.diagPartOp(x, np.float64, expected_ans)
def testRankFourFloatTensor(self):
x = numpy.random.rand(2, 3, 2, 3)
i = numpy.arange(2)[:, None]
j = numpy.arange(3)
x = np.random.rand(2, 3, 2, 3)
i = np.arange(2)[:, None]
j = np.arange(3)
expected_ans = x[i, j, i, j]
self.diagPartOp(x, numpy.float32, expected_ans)
self.diagPartOp(x, numpy.float64, expected_ans)
self.diagPartOp(x, np.float32, expected_ans)
self.diagPartOp(x, np.float64, expected_ans)
def testRankSixFloatTensor(self):
x = numpy.random.rand(2, 2, 2, 2, 2, 2)
i = numpy.arange(2)[:, None, None]
j = numpy.arange(2)[:, None]
k = numpy.arange(2)
x = np.random.rand(2, 2, 2, 2, 2, 2)
i = np.arange(2)[:, None, None]
j = np.arange(2)[:, None]
k = np.arange(2)
expected_ans = x[i, j, k, i, j, k]
self.diagPartOp(x, numpy.float32, expected_ans)
self.diagPartOp(x, numpy.float64, expected_ans)
self.diagPartOp(x, np.float32, expected_ans)
self.diagPartOp(x, np.float64, expected_ans)
def testOddRank(self):
w = numpy.random.rand(2)
x = numpy.random.rand(2, 2, 2)
y = numpy.random.rand(2, 2, 2, 2, 2)
z = numpy.random.rand(2, 2, 2, 2, 2, 2, 2)
self.assertRaises(ValueError, self.diagPartOp, w, numpy.float32, 0)
self.assertRaises(ValueError, self.diagPartOp, x, numpy.float32, 0)
self.assertRaises(ValueError, self.diagPartOp, y, numpy.float32, 0)
self.assertRaises(ValueError, self.diagPartOp, z, numpy.float32, 0)
w = np.random.rand(2)
x = np.random.rand(2, 2, 2)
y = np.random.rand(2, 2, 2, 2, 2)
z = np.random.rand(2, 2, 2, 2, 2, 2, 2)
self.assertRaises(ValueError, self.diagPartOp, w, np.float32, 0)
self.assertRaises(ValueError, self.diagPartOp, x, np.float32, 0)
self.assertRaises(ValueError, self.diagPartOp, y, np.float32, 0)
self.assertRaises(ValueError, self.diagPartOp, z, np.float32, 0)
def testUnevenDimensions(self):
w = numpy.random.rand(2, 5)
x = numpy.random.rand(2, 1, 2, 3)
y = numpy.random.rand(2, 1, 2, 1, 2, 5)
z = numpy.random.rand(2, 2, 2, 2, 2, 2, 2, 2)
self.assertRaises(ValueError, self.diagPartOp, w, numpy.float32, 0)
self.assertRaises(ValueError, self.diagPartOp, x, numpy.float32, 0)
self.assertRaises(ValueError, self.diagPartOp, y, numpy.float32, 0)
self.assertRaises(ValueError, self.diagPartOp, z, numpy.float32, 0)
w = np.random.rand(2, 5)
x = np.random.rand(2, 1, 2, 3)
y = np.random.rand(2, 1, 2, 1, 2, 5)
z = np.random.rand(2, 2, 2, 2, 2, 2, 2, 2)
self.assertRaises(ValueError, self.diagPartOp, w, np.float32, 0)
self.assertRaises(ValueError, self.diagPartOp, x, np.float32, 0)
self.assertRaises(ValueError, self.diagPartOp, y, np.float32, 0)
self.assertRaises(ValueError, self.diagPartOp, z, np.float32, 0)
class DiagGradOpTest(tf.test.TestCase):
def testDiagGrad(self):
numpy.random.seed(0)
np.random.seed(0)
shapes = ((3,), (3,3), (3,3,3))
dtypes = (tf.float32, tf.float64)
with self.test_session(use_gpu=False):
errors = []
for shape in shapes:
for dtype in dtypes:
x1 = tf.constant(numpy.random.rand(*shape), dtype=dtype)
x1 = tf.constant(np.random.rand(*shape), dtype=dtype)
y = tf.diag(x1)
error = tf.test.compute_gradient_error(x1, x1._shape_as_list(),
y, y._shape_as_list())
error = tf.test.compute_gradient_error(x1, x1.get_shape().as_list(),
y, y.get_shape().as_list())
tf.logging.info("error = %f", error)
self.assertLess(error, 1e-4)
@ -175,17 +283,17 @@ class DiagGradOpTest(tf.test.TestCase):
class DiagGradPartOpTest(tf.test.TestCase):
def testDiagPartGrad(self):
numpy.random.seed(0)
np.random.seed(0)
shapes = ((3,3), (3,3,3,3), (3,3,3,3,3,3))
dtypes = (tf.float32, tf.float64)
with self.test_session(use_gpu=False):
errors = []
for shape in shapes:
for dtype in dtypes:
x1 = tf.constant(numpy.random.rand(*shape), dtype=dtype)
x1 = tf.constant(np.random.rand(*shape), dtype=dtype)
y = tf.diag_part(x1)
error = tf.test.compute_gradient_error(x1, x1._shape_as_list(),
y, y._shape_as_list())
error = tf.test.compute_gradient_error(x1, x1.get_shape().as_list(),
y, y.get_shape().as_list())
tf.logging.info("error = %f", error)
self.assertLess(error, 1e-4)

View File

@ -153,8 +153,7 @@ class MatMulTest(tf.test.TestCase):
b = tf.placeholder(tf.float32, [36, 2])
c = tf.placeholder(tf.float32, [37])
with self.assertRaisesRegexp(
ValueError,
r"Dimensions Dimension\(37\) and Dimension\(36\) are not compatible"):
ValueError, "Dimensions 37 and 36 are not compatible"):
tf.matmul(a, b)
with self.assertRaisesRegexp(ValueError, "must have rank 2"):
tf.matmul(a, c)

View File

@ -0,0 +1,283 @@
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for ReduceJoin op from string_ops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import itertools
import numpy as np
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
def _input_array(num_dims):
  """Creates an ndarray where each element is the binary of its linear index.

  Args:
    num_dims: The number of dimensions to create.

  Returns:
    An ndarray of shape [2] * num_dims.
  """
  # Zero-padded binary rendering, one character per dimension.
  width_fmt = "{:0%db}" % num_dims
  labels = [width_fmt.format(idx) for idx in range(2 ** num_dims)]
  # Fixed-width byte strings, then fold the flat list into a [2]*num_dims cube.
  flat = np.array(labels, dtype="S%d" % num_dims)
  return flat.reshape([2] * num_dims)
def _joined_array(num_dims, reduce_dim):
  """Creates an ndarray with the result from reduce_join on input_array.

  Args:
    num_dims: The number of dimensions of the original input array.
    reduce_dim: The dimension to reduce.

  Returns:
    An ndarray of shape [2] * (num_dims - 1).
  """
  width_fmt = "{:0%db}" % (num_dims - 1)
  # Each joined element is two original elements, hence width 2 * num_dims.
  out = np.zeros(shape=[2] * (num_dims - 1), dtype="S%d" % (2 * num_dims))
  flat_view = out.ravel()
  for linear_idx in range(2 ** (num_dims - 1)):
    kept_bits = width_fmt.format(linear_idx)
    # Splice the reduced dimension's bit back in at position reduce_dim,
    # once with 0 and once with 1, and concatenate the two index strings.
    template = kept_bits[:reduce_dim] + "%d" + kept_bits[reduce_dim:]
    flat_view[linear_idx] = "".join(template % bit for bit in range(2))
  return out
class UnicodeTestCase(tf.test.TestCase):
  """Test case with Python3-compatible string comparator."""

  def assertAllEqualUnicode(self, truth, actual):
    # Cast both sides to unicode arrays so bytes-vs-str differences
    # between Python 2 and 3 cannot cause spurious mismatches.
    expected_u = np.array(truth).astype("U")
    actual_u = np.array(actual).astype("U")
    self.assertAllEqual(expected_u, actual_u)
class ReduceJoinTestHelperTest(UnicodeTestCase):
  """Tests for helper functions."""

  def testInputArray(self):
    # The flattened helper output must enumerate all 3-bit binary strings.
    num_dims = 3
    expected = ["{:03b}".format(i) for i in range(2 ** num_dims)]
    flattened = _input_array(num_dims).reshape([-1])
    self.assertAllEqualUnicode(expected, flattened)

  def testJoinedArray(self):
    # Hand-computed joins of the 3-D input, one per reduced axis.
    num_dims = 3
    expected_by_axis = (
        [["000100", "001101"], ["010110", "011111"]],  # reduce_dim=0
        [["000010", "001011"], ["100110", "101111"]],  # reduce_dim=1
        [["000001", "010011"], ["100101", "110111"]],  # reduce_dim=2
    )
    for axis, expected in enumerate(expected_by_axis):
      self.assertAllEqualUnicode(expected,
                                 _joined_array(num_dims, reduce_dim=axis))
class ReduceJoinTest(UnicodeTestCase):
def _testReduceJoin(self, input_array, truth, reduction_indices,
keep_dims=False, separator=""):
"""Compares the output of reduce_join to an expected result.
Args:
input_array: The string input to be joined.
truth: An array or np.array of the expected result.
reduction_indices: The indices to reduce over.
keep_dims: Whether or not to retain reduced dimensions.
separator: The separator to use for joining.
"""
with self.test_session():
output = tf.reduce_join(inputs=input_array,
reduction_indices=reduction_indices,
keep_dims=keep_dims,
separator=separator)
output_array = output.eval()
self.assertAllEqualUnicode(truth, output_array)
def _testMultipleReduceJoin(self, input_array, reduction_indices,
separator=" "):
"""Tests reduce_join for one input and multiple reduction_indices.
Does so by comparing the output to that from nested reduce_string_joins.
The correctness of single-dimension reduce_join is verified by other
tests below using _testReduceJoin.
Args:
input_array: The input to test.
reduction_indices: The indices to reduce.
separator: The separator to use when joining.
"""
num_dims = len(input_array.shape)
truth_red_indices = reduction_indices or list(reversed(xrange(num_dims)))
with self.test_session():
output = tf.reduce_join(
inputs=input_array, reduction_indices=reduction_indices,
keep_dims=False, separator=separator)
output_keep_dims = tf.reduce_join(
inputs=input_array, reduction_indices=reduction_indices,
keep_dims=True, separator=separator)
truth = input_array
for index in truth_red_indices:
truth = tf.reduce_join(
inputs=truth, reduction_indices=index, keep_dims=True,
separator=separator)
truth_squeezed = tf.squeeze(truth, squeeze_dims=truth_red_indices)
output_array = output.eval()
output_keep_dims_array = output_keep_dims.eval()
truth_array = truth.eval()
truth_squeezed_array = truth_squeezed.eval()
self.assertAllEqualUnicode(truth_array, output_keep_dims_array)
self.assertAllEqualUnicode(truth_squeezed_array, output_array)
def testRankOne(self):
input_array = ["this", "is", "a", "test"]
truth = "thisisatest"
self._testReduceJoin(input_array, truth, reduction_indices=0)
def testRankTwo(self):
input_array = [["this", "is", "a", "test"],
["please", "do", "not", "panic"]]
truth_dim_zero = ["thisplease", "isdo", "anot", "testpanic"]
truth_dim_one = ["thisisatest", "pleasedonotpanic"]
self._testReduceJoin(input_array, truth_dim_zero, reduction_indices=0)
self._testReduceJoin(input_array, truth_dim_one, reduction_indices=1)
def testRankFive(self):
input_array = _input_array(num_dims=5)
truths = [_joined_array(num_dims=5, reduce_dim=i) for i in xrange(5)]
for i in xrange(5):
self._testReduceJoin(input_array, truths[i], reduction_indices=i)
def testNegative(self):
input_array = _input_array(num_dims=5)
truths = [_joined_array(num_dims=5, reduce_dim=i) for i in xrange(5)]
for i in xrange(5):
self._testReduceJoin(input_array, truths[i], reduction_indices=i - 5)
def testSingletonDimension(self):
input_arrays = [_input_array(num_dims=5)
.reshape([2] * i + [1] + [2] * (5 - i))
for i in xrange(6)]
truth = _input_array(num_dims=5)
for i in xrange(6):
self._testReduceJoin(input_arrays[i], truth, reduction_indices=i)
def testSeparator(self):
input_array = [["this", "is", "a", "test"],
["please", "do", "not", "panic"]]
truth_dim_zero = ["this please", "is do", "a not", "test panic"]
truth_dim_one = ["this is a test", "please do not panic"]
self._testReduceJoin(input_array, truth_dim_zero, reduction_indices=0,
separator=" ")
self._testReduceJoin(input_array, truth_dim_one, reduction_indices=1,
separator=" ")
def testUnknownShape(self):
input_array = [["a"], ["b"]]
truth = ["ab"]
with self.test_session():
placeholder = tf.placeholder(tf.string, name="placeholder")
reduced = tf.reduce_join(placeholder, reduction_indices=0)
output_array = reduced.eval(feed_dict={placeholder.name: input_array})
self.assertAllEqualUnicode(truth, output_array)
def testUnknownIndices(self):
input_array = [["this", "is", "a", "test"],
["please", "do", "not", "panic"]]
truth_dim_zero = ["thisplease", "isdo", "anot", "testpanic"]
truth_dim_one = ["thisisatest", "pleasedonotpanic"]
with self.test_session():
placeholder = tf.placeholder(tf.int32, name="placeholder")
reduced = tf.reduce_join(input_array, reduction_indices=placeholder)
output_array_dim_zero = reduced.eval(feed_dict={placeholder.name: [0]})
output_array_dim_one = reduced.eval(feed_dict={placeholder.name: [1]})
self.assertAllEqualUnicode(truth_dim_zero, output_array_dim_zero)
self.assertAllEqualUnicode(truth_dim_one, output_array_dim_one)
def testKeepDims(self):
input_array = [["this", "is", "a", "test"],
["please", "do", "not", "panic"]]
truth_dim_zero = [["thisplease", "isdo", "anot", "testpanic"]]
truth_dim_one = [["thisisatest"], ["pleasedonotpanic"]]
self._testReduceJoin(input_array, truth_dim_zero, reduction_indices=0,
keep_dims=True)
self._testReduceJoin(input_array, truth_dim_one, reduction_indices=1,
keep_dims=True)
def testMultiIndex(self):
num_dims = 3
input_array = _input_array(num_dims=num_dims)
# Also tests [].
for i in xrange(num_dims + 1):
for permutation in itertools.permutations(xrange(num_dims), i):
self._testMultipleReduceJoin(input_array,
reduction_indices=permutation)
def testInvalidReductionIndices(self):
with self.test_session():
with self.assertRaisesRegexp(ValueError, "scalar"):
tf.reduce_join(inputs="", reduction_indices=0)
with self.assertRaisesRegexp(ValueError,
"Invalid reduction dimension -3"):
tf.reduce_join(inputs=[[""]], reduction_indices=-3)
with self.assertRaisesRegexp(ValueError, "Invalid reduction dimension 2"):
tf.reduce_join(inputs=[[""]], reduction_indices=2)
with self.assertRaisesRegexp(ValueError,
"Invalid reduction dimension -3"):
tf.reduce_join(inputs=[[""]], reduction_indices=[0, -3])
with self.assertRaisesRegexp(ValueError, "Invalid reduction dimension 2"):
tf.reduce_join(inputs=[[""]], reduction_indices=[0, 2])
with self.assertRaisesRegexp(ValueError, "Duplicate reduction index 0"):
tf.reduce_join(inputs=[[""]], reduction_indices=[0, 0])
def testZeroDims(self):
valid_truth_shape = [0]
with self.test_session():
inputs = np.zeros([0, 1], dtype=str)
with self.assertRaisesRegexp(ValueError, "dimension 0 with size 0"):
tf.reduce_join(inputs=inputs, reduction_indices=0)
valid = tf.reduce_join(inputs=inputs, reduction_indices=1)
valid_array_shape = valid.eval().shape
self.assertAllEqualUnicode(valid_truth_shape, valid_array_shape)
def testInvalidArgsUnknownShape(self):
with self.test_session():
placeholder = tf.placeholder(tf.string, name="placeholder")
index_too_high = tf.reduce_join(placeholder, reduction_indices=1)
duplicate_index = tf.reduce_join(placeholder, reduction_indices=[-1, 1])
with self.assertRaisesOpError("Invalid reduction dimension 1"):
index_too_high.eval(feed_dict={placeholder.name: [""]})
with self.assertRaisesOpError("Duplicate reduction dimension 1"):
duplicate_index.eval(feed_dict={placeholder.name: [[""]]})
def testInvalidArgsUnknownIndices(self):
with self.test_session():
placeholder = tf.placeholder(tf.int32, name="placeholder")
reduced = tf.reduce_join(["test", "test2"],
reduction_indices=placeholder)
with self.assertRaisesOpError("reduction dimension -2"):
reduced.eval(feed_dict={placeholder.name: -2})
with self.assertRaisesOpError("reduction dimension 2"):
reduced.eval(feed_dict={placeholder.name: 2})
# Run all tests in this file when executed directly as a script.
if __name__ == "__main__":
  tf.test.main()

View File

@ -937,13 +937,14 @@ def graph_creation_static_vs_dynamic_rnn_benchmark(max_time):
def _create_static_rnn():
with tf.Session(config=config, graph=tf.Graph()) as sess:
inputs_list_t = [tf.constant(x) for x in inputs_list]
inputs_list_t = [
tf.Variable(x, trainable=False).value() for x in inputs_list]
ops = _static_vs_dynamic_rnn_benchmark_static(
inputs_list_t, sequence_length)
def _create_dynamic_rnn():
with tf.Session(config=config, graph=tf.Graph()) as sess:
inputs_t = tf.constant(inputs)
inputs_t = tf.Variable(inputs, trainable=False).value()
ops = _static_vs_dynamic_rnn_benchmark_dynamic(
inputs_t, sequence_length)
@ -961,7 +962,7 @@ def _timer(sess, ops):
sess.run(ops)
# Timing run
runs = 10
runs = 20
start = time.time()
for _ in range(runs):
sess.run(ops)
@ -983,13 +984,9 @@ def static_vs_dynamic_rnn_benchmark(batch_size, max_time, num_units, use_gpu):
# Using rnn()
with tf.Session(config=config, graph=tf.Graph()) as sess:
if not use_gpu:
with tf.device("/cpu:0"):
inputs_list_t = [tf.constant(x) for x in inputs_list]
ops = _static_vs_dynamic_rnn_benchmark_static(
inputs_list_t, sequence_length)
else:
inputs_list_t = [tf.constant(x) for x in inputs_list]
with tf.device("/cpu:0" if not use_gpu else None):
inputs_list_t = [
tf.Variable(x, trainable=False).value() for x in inputs_list]
ops = _static_vs_dynamic_rnn_benchmark_static(
inputs_list_t, sequence_length)
tf.initialize_all_variables().run()
@ -997,13 +994,8 @@ def static_vs_dynamic_rnn_benchmark(batch_size, max_time, num_units, use_gpu):
# Using dynamic_rnn()
with tf.Session(config=config, graph=tf.Graph()) as sess:
if not use_gpu:
with tf.device("/cpu:0"):
inputs_t = tf.Variable(inputs)
ops = _static_vs_dynamic_rnn_benchmark_dynamic(
inputs_t, sequence_length)
else:
inputs_t = tf.Variable(inputs)
with tf.device("/cpu:0" if not use_gpu else None):
inputs_t = tf.Variable(inputs, trainable=False).value()
ops = _static_vs_dynamic_rnn_benchmark_dynamic(
inputs_t, sequence_length)
tf.initialize_all_variables().run()
@ -1016,6 +1008,59 @@ def static_vs_dynamic_rnn_benchmark(batch_size, max_time, num_units, use_gpu):
return delta_static, delta_dynamic
def _half_seq_len_vs_unroll_half_rnn_benchmark(inputs_list_t, sequence_length):
(_, input_size) = inputs_list_t[0].get_shape().as_list()
initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=127)
cell = tf.nn.rnn_cell.LSTMCell(
num_units=input_size, input_size=input_size, use_peepholes=True,
initializer=initializer)
outputs, final_state = tf.nn.rnn(
cell, inputs_list_t, sequence_length=sequence_length, dtype=tf.float32)
trainable_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
gradients = tf.gradients(outputs + [final_state], trainable_variables)
return tf.group(final_state, *(gradients + outputs))
def half_seq_len_vs_unroll_half_rnn_benchmark(
batch_size, max_time, num_units, use_gpu):
config = tf.ConfigProto()
config.allow_soft_placement = True
# Set up sequence lengths
np.random.seed([127])
sequence_length = max_time * np.ones((batch_size,))
inputs_list = [
np.random.randn(batch_size, num_units).astype(np.float32)
for _ in range(max_time)]
# Halve the sequence length, full static unroll
with tf.Session(config=config, graph=tf.Graph()) as sess:
with tf.device("/cpu:0" if not use_gpu else None):
inputs_list_t = [
tf.Variable(x, trainable=False).value() for x in inputs_list]
ops = _half_seq_len_vs_unroll_half_rnn_benchmark(
inputs_list_t, sequence_length / 2)
tf.initialize_all_variables().run()
delta_half_seq_len = _timer(sess, ops)
# Halve the unroll size, don't use sequence length
with tf.Session(config=config, graph=tf.Graph()) as sess:
with tf.device("/cpu:0" if not use_gpu else None):
inputs_list_t = [
tf.Variable(x, trainable=False).value() for x in inputs_list]
ops = _half_seq_len_vs_unroll_half_rnn_benchmark(
inputs_list_t[:(max_time // 2)], sequence_length / 2)
tf.initialize_all_variables().run()
delta_unroll_half = _timer(sess, ops)
print("%d \t %d \t\t %d \t %s \t %f \t\t %f \t\t %f" %
(batch_size, max_time, num_units, use_gpu, delta_half_seq_len,
delta_unroll_half, delta_half_seq_len/delta_unroll_half))
return delta_half_seq_len, delta_unroll_half
def _dynamic_rnn_swap_memory_benchmark(inputs_t, sequence_length,
swap_memory):
(unused_0, unused_1, input_size) = inputs_t.get_shape().as_list()
@ -1047,7 +1092,7 @@ def dynamic_rnn_swap_memory_benchmark(batch_size, max_time, num_units):
# No memory swap
with tf.Session(config=config, graph=tf.Graph()) as sess:
inputs_t = tf.Variable(inputs)
inputs_t = tf.Variable(inputs, trainable=False).value()
ops = _dynamic_rnn_swap_memory_benchmark(
inputs_t, sequence_length, swap_memory=False)
tf.initialize_all_variables().run()
@ -1055,7 +1100,7 @@ def dynamic_rnn_swap_memory_benchmark(batch_size, max_time, num_units):
# Memory swap
with tf.Session(config=config, graph=tf.Graph()) as sess:
inputs_t = tf.Variable(inputs)
inputs_t = tf.Variable(inputs, trainable=False).value()
ops = _dynamic_rnn_swap_memory_benchmark(
inputs_t, sequence_length, swap_memory=True)
tf.initialize_all_variables().run()
@ -1082,14 +1127,15 @@ def rnn_long_sequence_benchmark(batch_size, seqlen, num_units,
for _ in range(5):
if dynamic:
with tf.Session(config=config, graph=tf.Graph()) as sess:
inputs_t = tf.Variable(inputs)
inputs_t = tf.Variable(inputs, trainable=False).value()
ops = _dynamic_rnn_swap_memory_benchmark(
inputs_t, sequence_length, swap_memory=swap_memory)
tf.initialize_all_variables().run()
elapsed = _timer(sess, ops)
else:
with tf.Session(config=config, graph=tf.Graph()) as sess:
inputs_list_t = [tf.constant(x) for x in inputs_list]
inputs_list_t = [
tf.Variable(x, trainable=False).value() for x in inputs_list]
ops = _static_vs_dynamic_rnn_benchmark_static(
inputs_list_t, sequence_length)
tf.initialize_all_variables().run()
@ -1126,11 +1172,11 @@ class BenchmarkRNN(tf.test.Benchmark):
self.report_benchmark(
name="static_unroll_time_T%02d_B%03d_N%03d_gpu_%s"
% (max_time, batch_size, num_units, use_gpu),
iters=10, wall_time=s_dt)
iters=20, wall_time=s_dt)
self.report_benchmark(
name="dynamic_unroll_time_T%02d_B%03d_N%03d_gpu_%s"
% (max_time, batch_size, num_units, use_gpu),
iters=10, wall_time=d_dt)
iters=20, wall_time=d_dt)
def benchmarkDynamicLSTMNoMemorySwapVsMemorySwap(self):
print("Calculation: Dynamic LSTM No Memory Swap vs. Memory Swap")
@ -1143,11 +1189,31 @@ class BenchmarkRNN(tf.test.Benchmark):
self.report_benchmark(
name="dynamic_lstm_no_memory_swap_T%02d_B%03d_N%03d"
% (max_time, batch_size, num_units),
iters=10, wall_time=no_swap)
iters=20, wall_time=no_swap)
self.report_benchmark(
name="dynamic_lstm_with_memory_swap_T%02d_B%03d_N%03d"
% (max_time, batch_size, num_units),
iters=10, wall_time=swap)
iters=20, wall_time=swap)
def benchmarkStaticUnrollHalfSequenceLengthVsHalfUnroll(self):
print("Calculation: Static Unroll with Halved Sequence Length "
"vs. Half Static Unroll")
print("batch \t full_t \t units \t gpu \t dt(half_seq_len) "
"\t dt(unroll_half) \t dt(half_seq_len)/dt(unroll_half)")
for batch_size in (128,):
for max_time in (50,):
for num_units in (256,):
for use_gpu in (False, True):
s_dt, d_dt = half_seq_len_vs_unroll_half_rnn_benchmark(
batch_size, max_time, num_units, use_gpu)
self.report_benchmark(
name="half_seq_len_time_T%02d_B%03d_N%03d_gpu_%s"
% (max_time, batch_size, num_units, use_gpu),
iters=20, wall_time=s_dt)
self.report_benchmark(
name="unroll_half_time_T%02d_B%03d_N%03d_gpu_%s"
% (max_time, batch_size, num_units, use_gpu),
iters=20, wall_time=d_dt)
if __name__ == "__main__":

View File

@ -0,0 +1,157 @@
# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for tensorflow.ops.session_ops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class SessionOpsTest(tf.test.TestCase):
  """Tests for session handles: tensors that persist inside a session and
  are fetched/fed by handle (tf.get_session_handle / tf.get_session_tensor)
  rather than by value."""
  def testHandleBasic(self):
    """Fetch a handle, then feed it back through an int32 session tensor."""
    with self.test_session() as sess:
      # Return a handle.
      a = tf.constant(10)
      b = tf.constant(5)
      c = tf.mul(a, b)
      h = tf.get_session_handle(c)
      h = sess.run(h)
      # Feed a tensor handle.
      f, x = tf.get_session_tensor(tf.int32)
      y = tf.mul(x, 10)
      self.assertEqual(500, sess.run(y, feed_dict={f: h.handle}))
  def testHandleEval(self):
    """A fetched handle's value can be retrieved directly with .eval()."""
    with self.test_session() as sess:
      # Return a handle.
      a = tf.constant(10)
      b = tf.constant(5)
      c = tf.mul(a, b)
      h = tf.get_session_handle(c)
      h = sess.run(h)
      # Get the tensor from its handle.
      self.assertEqual(50, h.eval())
  def testHandleAndValue(self):
    """A handle and an ordinary value can be fetched in the same run call."""
    with self.test_session() as sess:
      # Return a handle and a value.
      a = tf.constant(10)
      b = tf.constant(5)
      c = tf.mul(a, b)
      h = tf.get_session_handle(c)
      v = tf.mul(a, c)
      h, v = sess.run([h, v])
      self.assertEqual(50, h.eval())
      self.assertEqual(500, v)
  def testHandleCond(self):
    """Uses a fetched predicate to choose the graph that consumes a handle."""
    with self.test_session() as sess:
      # Return a handle and a value
      a = tf.constant(10)
      b = tf.constant(5)
      p = tf.less(a, b)
      c = tf.mul(a, b)
      h = tf.get_session_handle(c)
      p, h = sess.run([p, h])
      # Run by feeding a tensor handle.
      f, x = tf.get_session_tensor(tf.int32)
      # NOTE: p was already fetched above, so this is plain Python control
      # flow, not an in-graph tf.cond.  Here p is False (10 < 5), so the
      # else-branch graph (x * 100) is built and run.
      if p:
        y = tf.mul(x, 10)
      else:
        y = tf.mul(x, 100)
      result = sess.run(y, feed_dict={f: h.handle})
      self.assertEqual(5000, result)
  def testHandleForLoop(self):
    """Threads a handle through a Python for-loop (exercises handle GC)."""
    with self.test_session() as sess:
      # Initialize a handle.
      a = tf.constant(0)
      h = tf.get_session_handle(a)
      h = sess.run(h)
      # Do some computation.
      f, x = tf.get_session_tensor(tf.int32)
      # Must define the loop body outside the loop.
      h_x = tf.get_session_handle(tf.add(x, 1))
      for _ in range(100):
        # This exercises garbage collection.
        h = sess.run(h_x, feed_dict={f: h.handle})
      self.assertEqual(100, h.eval())
  def testHandleWhileLoop(self):
    """Threads a handle through a Python while-loop until p becomes False."""
    with self.test_session() as sess:
      # Initialize a handle.
      a = tf.constant(0)
      h = tf.get_session_handle(a)
      h = sess.run(h)
      # Do some computation.
      f, x = tf.get_session_tensor(tf.int32)
      b = tf.constant(100)
      p = tf.less(x, b)
      # Must define the loop body outside the loop.
      h_x = tf.get_session_handle(tf.add(x, 1))
      while True:
        # The predicate and the incremented handle are fetched together, so
        # the loop runs one final increment after x reaches 100.
        rp, h = sess.run([p, h_x], feed_dict={f: h.handle})
        if not rp:
          break
      self.assertEqual(101, h.eval())
  def testHandleMover(self):
    """Feeds a handle whose tensor was produced on a different device."""
    with self.test_session() as sess:
      # Return a handle.
      a = tf.constant(10)
      b = tf.constant(5)
      c = tf.mul(a, b)
      h = tf.get_session_handle(c)
      h = sess.run(h)
      # Feed a tensor handle.
      f, x = tf.get_session_tensor(tf.int32)
      y = tf.mul(x, 10)
      self.assertEqual(500, sess.run(y, feed_dict={f: h.handle}))
      # Feed another tensor handle.
      with tf.device("/gpu:0"):
        a = tf.constant(10)
        h = tf.get_session_handle(a)
        h = sess.run(h)
        self.assertEqual(100, sess.run(y, feed_dict={f: h.handle}))
  def testHandleDeleter(self):
    """A raw handle can be deleted explicitly via delete_session_tensor."""
    with self.test_session() as sess:
      # Return a handle.
      a = tf.constant(10)
      b = tf.constant(5)
      c = tf.mul(a, b)
      h = tf.get_session_handle(c)
      h = sess.run(h)
      # Delete using a raw tensor handle.
      h = h.get_raw_handle()
      f, x = tf.delete_session_tensor()
      sess.run(x, feed_dict={f: h})
# Run all tests in this file when executed directly as a script.
if __name__ == "__main__":
  tf.test.main()

View File

@ -24,7 +24,6 @@ import time
import numpy as np
import tensorflow as tf
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import sparse_ops
# pylint: enable=g-bad-import-order,unused-import
@ -131,7 +130,7 @@ def _sparse_tensor_dense_vs_dense_matmul_benchmark_dense(
t0 = tf.constant(0)
v0 = tf.constant(0.0)
def _timeit(iterations, _):
(_, final) = control_flow_ops.While(
(_, final) = tf.while_loop(
lambda t, _: t < iterations, body, (t0, v0),
parallel_iterations=1, back_prop=False)
return [final]
@ -151,7 +150,7 @@ def _sparse_tensor_dense_vs_dense_matmul_benchmark_sparse(
t0 = tf.constant(0)
v0 = tf.constant(0.0)
def _timeit(iterations, _):
(_, final) = control_flow_ops.While(
(_, final) = tf.while_loop(
lambda t, _: t < iterations, body, (t0, v0),
parallel_iterations=1, back_prop=False)
return [final]

View File

@ -22,7 +22,6 @@ import numpy as np
import tensorflow as tf
from tensorflow.python.framework import errors
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import gen_data_flow_ops
@ -67,7 +66,7 @@ class StackOpTest(tf.test.TestCase):
v = gen_data_flow_ops._stack_push(h, a, swap_memory=True)
with tf.control_dependencies([v]):
return tf.add(x, 1)
r = control_flow_ops.While(c, b, [n])
r = tf.while_loop(c, b, [n])
v = tf.constant(np.zeros(2000), dtype=tf.float32)
def c1(x, y):
@ -76,7 +75,7 @@ class StackOpTest(tf.test.TestCase):
nx = tf.sub(x, 1)
ny = y + gen_data_flow_ops._stack_pop(h, tf.float32)
return [nx, ny]
rx, ry = control_flow_ops.While(c1, b1, [r, v])
rx, ry = tf.while_loop(c1, b1, [r, v])
self.assertAllClose(np.ones(2000) * 10.0, ry.eval())
def testStackWhileSwap(self):

Some files were not shown because too many files have changed in this diff Show More