Merge pull request #13264 from caisq/branch_169770126

Branch 169770126
Shanqing Cai, 2017-09-23 22:01:48 -04:00 (committed by GitHub)
commit d2d42ee8b3
199 changed files with 4847 additions and 2014 deletions


@ -356,7 +356,6 @@ filegroup(
"//tensorflow/contrib/data:all_files",
"//tensorflow/contrib/data/python/kernel_tests:all_files",
"//tensorflow/contrib/data/python/ops:all_files",
"//tensorflow/contrib/data/python/util:all_files",
"//tensorflow/contrib/decision_trees/proto:all_files",
"//tensorflow/contrib/distributions:all_files",
"//tensorflow/contrib/eager/python:all_files",
@ -475,6 +474,9 @@ filegroup(
"//tensorflow/java/src/main/java/org/tensorflow/examples:all_files",
"//tensorflow/java/src/main/native:all_files",
"//tensorflow/python:all_files",
"//tensorflow/python/data:all_files",
"//tensorflow/python/data/ops:all_files",
"//tensorflow/python/data/util:all_files",
"//tensorflow/python/debug:all_files",
"//tensorflow/python/eager:all_files",
"//tensorflow/python/estimator:all_files",


@ -107,13 +107,13 @@ class Scope {
static Scope NewRootScope();
/// Return a new scope. Ops created with this scope will have
/// <name>/<child_scope_name> as the prefix. The actual name will be unique
/// `name/child_scope_name` as the prefix. The actual name will be unique
/// in the current scope. All other properties are inherited from the current
/// scope. If child_scope_name is empty, the '/' is elided.
/// scope. If `child_scope_name` is empty, the `/` is elided.
Scope NewSubScope(const string& child_scope_name) const;
/// Return a new scope. All ops created within the returned scope will have
/// names of the form <name>/<op_name>[_<suffix].
/// names of the form `name/op_name[_suffix]`.
Scope WithOpName(const string& op_name) const;
/// Return a new scope. All ops created within the returned scope will have as
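A minimal sketch of the naming behavior these comments describe, using the C++ `Scope` API this header declares (the op and values are illustrative, not from the diff):

```cpp
#include "tensorflow/cc/framework/scope.h"
#include "tensorflow/cc/ops/standard_ops.h"

int main() {
  // Root scope: ops created here get no prefix.
  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
  // Sub-scope: ops created under it are named "layer1/...".
  tensorflow::Scope layer1 = root.NewSubScope("layer1");
  // This constant is named "layer1/weights"; asking for the same name again
  // in this scope would be uniquified with an "_<suffix>" as documented above.
  auto w = tensorflow::ops::Const(layer1.WithOpName("weights"), {1.0f, 2.0f});
  return 0;
}
```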


@ -50,7 +50,6 @@ REGISTER_GRADIENT_OP("Softmax", SoftmaxGrad);
Status LogSoftmaxGrad(const Scope& scope, const Operation& op,
const std::vector<Output>& grad_inputs,
std::vector<Output>* grad_outputs) {
auto softmax = Exp(scope, op.output(0));
auto sum = Sum(scope, grad_inputs[0], {1}, Sum::KeepDims(true));
auto mul = Mul(scope, sum, softmax);
@ -130,8 +129,7 @@ Status Conv2DGrad(const Scope& scope, const Operation& op,
TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "strides", &strides));
TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "use_cudnn_on_gpu",
&use_cudnn_on_gpu));
TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "use_cudnn_on_gpu", &use_cudnn_on_gpu));
Conv2DBackpropInput::Attrs input_attrs;
input_attrs.DataFormat(data_format);
input_attrs.UseCudnnOnGpu(use_cudnn_on_gpu);
@ -198,8 +196,6 @@ Status MaxPoolGradV2Helper(const Scope& scope, const Operation& op,
}
REGISTER_GRADIENT_OP("MaxPoolV2", MaxPoolGradV2Helper);
} // anonymous namespace
} // namespace ops
} // namespace tensorflow


@ -36,7 +36,7 @@ class NNGradTest : public ::testing::Test {
float max_error;
TF_ASSERT_OK((ComputeGradientError<float, float, float>(
scope_, {x}, {x_shape}, {y}, {y_shape}, &max_error)));
EXPECT_LT(max_error, 2.2e-4);
EXPECT_LT(max_error, 1e-3);
}
void RunTest(const Output& x, const Tensor& x_init_value, const Output& y,
@ -44,7 +44,7 @@ class NNGradTest : public ::testing::Test {
float max_error;
TF_ASSERT_OK((ComputeGradientError<float, float, float>(
scope_, x, x_init_value, y, y_shape, &max_error)));
EXPECT_LT(max_error, 2.2e-4);
EXPECT_LT(max_error, 1e-3);
}
void RunTest(const OutputList& xs, const std::vector<TensorShape>& x_shapes,
@ -53,7 +53,25 @@ class NNGradTest : public ::testing::Test {
float max_error;
TF_ASSERT_OK((ComputeGradientError<float, float, float>(
scope_, xs, x_shapes, ys, y_shapes, &max_error)));
EXPECT_LT(max_error, 2.2e-4);
EXPECT_LT(max_error, 1e-3);
}
// Sets tensor with random values, ensuring that the max value is largest by
// a reasonable amount.
// This is an issue for MaxPool and MaxPoolV2, in which perturbations by the
// numeric gradient computation in the gradient checker can change the max
// value if values are too close together.
template <typename T>
void SetRandomValuesWithBumpedMax(Tensor* tensor) {
auto tensor_flat = tensor->flat<T>();
tensor_flat.setRandom();
int32 max_index = 0;
for (size_t i = 1; i < tensor->NumElements(); i++) {
if (tensor_flat(i) > tensor_flat(max_index)) {
max_index = i;
}
}
tensor_flat(max_index) += 1e-2;
}
Scope scope_;
@ -148,22 +166,30 @@ TEST_F(NNGradTest, Conv2DGrad) {
}
TEST_F(NNGradTest, MaxPoolGradHelper) {
TensorShape shape({1, 2, 2, 1});
auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
TensorShape x_shape({1, 2, 2, 1});
TensorShape y_shape({1, 1, 1, 1});
auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
// Setup window and strides so that we only do one MaxPool.
const std::vector<int> ksize{1, 2, 2, 1};
const std::vector<int> strides{1, 1, 1, 1};
auto y = MaxPool(scope_, x, ksize, strides, "SAME");
RunTest(x, shape, y, shape);
const std::vector<int> strides{1, 2, 2, 1};
auto y = MaxPool(scope_, x, ksize, strides, "VALID");
Tensor x_init_value = Tensor(DT_FLOAT, x_shape);
SetRandomValuesWithBumpedMax<float>(&x_init_value);
RunTest(x, x_init_value, y, y_shape);
}
TEST_F(NNGradTest, MaxPoolGradV2Helper) {
TensorShape shape({1, 2, 2, 1});
auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
TensorShape x_shape({1, 2, 2, 1});
TensorShape y_shape({1, 1, 1, 1});
auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
// Setup window and strides so that we only do one MaxPool.
Tensor ksize = test::AsTensor<int>({1, 2, 2, 1}, {4});
Tensor strides = test::AsTensor<int>({1, 1, 1, 1}, {4});
auto y = MaxPoolV2(scope_, x, ksize, strides, "SAME");
RunTest(x, shape, y, shape);
Tensor strides = test::AsTensor<int>({1, 2, 2, 1}, {4});
auto y = MaxPoolV2(scope_, x, ksize, strides, "VALID");
Tensor x_init_value = Tensor(DT_FLOAT, x_shape);
SetRandomValuesWithBumpedMax<float>(&x_init_value);
RunTest(x, x_init_value, y, y_shape);
}
} // namespace
} // namespace tensorflow


@ -45,7 +45,7 @@ cc_library(
visibility = ["//visibility:public"],
deps = [
":jit_compilation_passes",
"//tensorflow/compiler/jit/kernels:xla_local_launch_op",
"//tensorflow/compiler/jit/kernels:xla_launch_op",
"//tensorflow/compiler/tf2xla/kernels:xla_ops",
"//tensorflow/compiler/xla/service:cpu_plugin",
],
@ -57,7 +57,7 @@ cc_library(
visibility = ["//visibility:public"],
deps = if_cuda([
":jit_compilation_passes",
"//tensorflow/compiler/jit/kernels:xla_local_launch_op",
"//tensorflow/compiler/jit/kernels:xla_launch_op",
"//tensorflow/compiler/tf2xla/kernels:xla_ops",
"//tensorflow/compiler/xla/service:gpu_plugin",
]),
@ -71,7 +71,7 @@ cc_library(
deps = [
":jit_compilation_passes",
":xla_device",
"//tensorflow/compiler/jit/kernels:xla_local_launch_op",
"//tensorflow/compiler/jit/kernels:xla_launch_op",
"//tensorflow/compiler/tf2xla:xla_compiler",
"//tensorflow/compiler/tf2xla/kernels:xla_ops",
"//tensorflow/compiler/xla/service:cpu_plugin", # buildcleaner: keep
@ -88,7 +88,7 @@ cc_library(
deps = [
":jit_compilation_passes",
":xla_device",
"//tensorflow/compiler/jit/kernels:xla_local_launch_op",
"//tensorflow/compiler/jit/kernels:xla_launch_op",
"//tensorflow/compiler/tf2xla:xla_compiler",
"//tensorflow/compiler/tf2xla/kernels:xla_ops",
"//tensorflow/compiler/xla/service:gpu_plugin", # buildcleaner: keep
@ -103,7 +103,7 @@ cc_library(
srcs = ["xla_interpreter_device.cc"],
deps = [
":xla_device",
"//tensorflow/compiler/jit/kernels:xla_local_launch_op",
"//tensorflow/compiler/jit/kernels:xla_launch_op",
"//tensorflow/compiler/tf2xla:xla_compiler",
],
alwayslink = True,
@ -213,7 +213,7 @@ cc_library(
deps = [
":common",
":compilation_passes",
"//tensorflow/compiler/jit/kernels:xla_local_launch_op",
"//tensorflow/compiler/jit/kernels:xla_launch_op",
"//tensorflow/compiler/tf2xla:const_analysis",
"//tensorflow/compiler/tf2xla:xla_compiler",
"//tensorflow/core:core_cpu_internal",
@ -297,7 +297,7 @@ tf_cc_test(
"//tensorflow/cc:cc_ops_internal",
"//tensorflow/cc:function_ops",
"//tensorflow/cc:ops",
"//tensorflow/compiler/jit/kernels:xla_local_launch_op",
"//tensorflow/compiler/jit/kernels:xla_launch_op",
"//tensorflow/compiler/tf2xla:xla_compiler",
"//tensorflow/compiler/tf2xla/kernels:xla_ops",
"//tensorflow/core:core_cpu",


@ -14,7 +14,7 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/jit/defs.h"
#include "tensorflow/compiler/jit/kernels/xla_local_launch_op.h"
#include "tensorflow/compiler/jit/kernels/xla_launch_op.h"
#include "tensorflow/compiler/jit/mark_for_compilation_pass.h"
#include "tensorflow/compiler/tf2xla/const_analysis.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"


@ -7,9 +7,9 @@ package(
)
cc_library(
name = "xla_local_launch_op",
srcs = ["xla_local_launch_op.cc"],
hdrs = ["xla_local_launch_op.h"],
name = "xla_launch_op",
srcs = ["xla_launch_op.cc"],
hdrs = ["xla_launch_op.h"],
deps = [
"//tensorflow/compiler/jit:common",
"//tensorflow/compiler/jit:xla_compilation_cache",


@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/jit/kernels/xla_local_launch_op.h"
#include "tensorflow/compiler/jit/kernels/xla_launch_op.h"
#include "tensorflow/compiler/jit/defs.h"
#include "tensorflow/compiler/jit/xla_device.h"
@ -194,7 +194,7 @@ std::vector<OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx,
Var* variable = nullptr;
ResourceHandle handle = HandleFromInput(ctx, first_variable + i);
if (LookupResource(ctx, handle, &variable).ok()) {
mutex_lock lock(*variable->mu());
tf_shared_lock lock(*variable->mu());
snapshot[i].name = handle.name();
snapshot[i].present = true;
snapshot[i].value = *variable->tensor();
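The change from `mutex_lock` to `tf_shared_lock` above lets concurrent kernel launches snapshot the same resource variable without serializing on its mutex. A minimal sketch of the reader/writer pattern, assuming the `mutex`, `mutex_lock`, and `tf_shared_lock` types from `tensorflow/core/platform/mutex.h`:

```cpp
#include "tensorflow/core/platform/mutex.h"

tensorflow::mutex mu;
int guarded_value = 0;

int ReadSnapshot() {
  // Shared (reader) lock: any number of readers may hold it concurrently.
  tensorflow::tf_shared_lock l(mu);
  return guarded_value;
}

void WriteUpdate(int v) {
  // Exclusive (writer) lock: excludes readers and other writers.
  tensorflow::mutex_lock l(mu);
  guarded_value = v;
}
```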


@ -16,7 +16,7 @@ limitations under the License.
// Registers the XLA_CPU device, which is an XlaDevice instantiation that runs
// operators using XLA via the XLA "Host" (CPU) backend.
#include "tensorflow/compiler/jit/kernels/xla_local_launch_op.h"
#include "tensorflow/compiler/jit/kernels/xla_launch_op.h"
#include "tensorflow/compiler/jit/xla_device.h"
#include "tensorflow/compiler/jit/xla_device_ops.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"


@ -16,7 +16,7 @@ limitations under the License.
// Registers the XLA_GPU device, which is an XlaDevice instantiation that runs
// operators using XLA via the XLA "CUDA" (GPU) backend.
#include "tensorflow/compiler/jit/kernels/xla_local_launch_op.h"
#include "tensorflow/compiler/jit/kernels/xla_launch_op.h"
#include "tensorflow/compiler/jit/xla_device.h"
#include "tensorflow/compiler/jit/xla_device_ops.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"


@ -15,7 +15,7 @@ limitations under the License.
// Registers the XLA_INTERPRETER device which exposes the XLA Interpreter.
#include "tensorflow/compiler/jit/kernels/xla_local_launch_op.h"
#include "tensorflow/compiler/jit/kernels/xla_launch_op.h"
#include "tensorflow/compiler/jit/xla_device.h"
#include "tensorflow/compiler/jit/xla_device_ops.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"


@ -165,6 +165,22 @@ class Array2D {
return tensorflow::str_util::Join(pieces, "");
}
bool operator==(const Array2D<T>& other) const {
if (n1() != other.n1() || n2() != other.n2()) {
return false;
}
for (int64 i0 = 0; i0 < n1(); ++i0) {
for (int64 i1 = 0; i1 < n2(); ++i1) {
if ((*this)(i0, i1) != other(i0, i1)) {
return false;
}
}
}
return true;
}
bool operator!=(const Array2D<T>& other) const { return !(*this == other); }
private:
int64 n1_;
int64 n2_;


@ -139,5 +139,46 @@ TEST(Array2dTest, Stringification) {
EXPECT_EQ(expected, arr->ToString());
}
TEST(Array2dTest, Equals) {
Array2D<int> arr0 = {{1, 2}, {3, 4}, {5, 6}};
Array2D<int> arr1 = {{1, 2}, {3, 4}, {5, 6}};
EXPECT_TRUE(arr0 == arr1);
EXPECT_FALSE(arr0 != arr1);
EXPECT_TRUE(arr1 == arr0);
EXPECT_FALSE(arr1 != arr0);
Array2D<int> arr2 = {{1, 2}, {3, 4}, {5, 6}, {7, 8}};
EXPECT_TRUE(arr0 != arr2);
EXPECT_FALSE(arr0 == arr2);
EXPECT_TRUE(arr2 != arr0);
EXPECT_FALSE(arr2 == arr0);
Array2D<int> arr3 = {{1, 2, 3}, {4, 5, 6}};
EXPECT_TRUE(arr0 != arr3);
EXPECT_FALSE(arr0 == arr3);
EXPECT_TRUE(arr3 != arr0);
EXPECT_FALSE(arr3 == arr0);
Array2D<int> arr4 = {{1, 2}, {3, 4}};
EXPECT_TRUE(arr0 != arr4);
EXPECT_FALSE(arr0 == arr4);
EXPECT_TRUE(arr4 != arr0);
EXPECT_FALSE(arr4 == arr0);
Array2D<int> arr5 = {{1, 2}, {13, 4}, {5, 6}};
EXPECT_TRUE(arr0 != arr5);
EXPECT_FALSE(arr0 == arr5);
EXPECT_TRUE(arr5 != arr0);
EXPECT_FALSE(arr5 == arr0);
Array2D<bool> bool_arr0 = {{false}, {true}};
Array2D<bool> bool_arr1 = {{false}, {true}};
EXPECT_TRUE(bool_arr0 == bool_arr1);
EXPECT_FALSE(bool_arr0 != bool_arr1);
Array2D<bool> bool_arr2 = {{false}, {false}};
EXPECT_FALSE(bool_arr0 == bool_arr2);
EXPECT_TRUE(bool_arr0 != bool_arr2);
}
} // namespace
} // namespace xla


@ -503,6 +503,7 @@ cc_library(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core:stream_executor_no_cuda",
],
)
@ -1966,6 +1967,20 @@ cc_library(
alwayslink = 1,
)
tf_cc_test(
name = "hlo_graph_dumper_test",
srcs = ["hlo_graph_dumper_test.cc"],
deps = [
":hlo",
":hlo_graph_dumper",
"//tensorflow/compiler/xla:test",
"//tensorflow/compiler/xla:xla_proto",
"//tensorflow/compiler/xla/tests:test_utils",
"//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep
"//tensorflow/core:lib",
],
)
cc_library(
name = "transpose_folding",
srcs = ["transpose_folding.cc"],


@ -28,7 +28,9 @@ limitations under the License.
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/lib/gtl/cleanup.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/host_info.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
@ -83,7 +85,10 @@ CompileOnlyService::CompileAheadOfTime(
"computation_", versioned_handle.handle.handle(), "__",
session_module->entry().name(), "__version_",
versioned_handle.version);
TF_RETURN_IF_ERROR(Executable::DumpToDirectory(directory_path, filename,
const string& per_host_path = tensorflow::io::JoinPath(
directory_path, tensorflow::port::Hostname());
TF_RETURN_IF_ERROR(Executable::DumpToDirectory(per_host_path, filename,
*session_module));
}
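The `JoinPath(directory_path, Hostname())` above gives each host its own dump subdirectory, so ahead-of-time compilation dumps from different machines no longer collide. A hypothetical layout (paths invented for illustration):

```cpp
// Before: every host wrote to the same file:
//   <directory_path>/computation_12__entry__version_3
// After: dumps are keyed by hostname:
//   <directory_path>/worker-0/computation_12__entry__version_3
//   <directory_path>/worker-1/computation_12__entry__version_3
```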


@ -811,7 +811,7 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
/*is_entry_computation=*/true,
&module_sequence.at(computation)));
entry_function->setName(llvm_ir::AsStringRef(entry_point_name));
CHECK(entry_function->getName() == llvm_ir::AsStringRef(entry_point_name));
ModuleHook pre_optimization_ir_dump_hook;
ModuleHook post_optimization_ir_dump_hook;


@ -340,11 +340,8 @@ class HloDotDumper {
string Header();
string Footer();
// Maps HloComputations we should dump to their parent instruction in the
// outer computation.
std::unordered_map<const HloComputation*, const HloInstruction*>
SubcomputationsToDump();
bool ShouldShowSubcomputation(const HloComputation* subcomp);
bool ShouldShowFusionSubcomputation(const HloInstruction* instr);
string DumpSubcomputation(const HloComputation* subcomp,
const HloInstruction* parent_instr);
string DumpComputation(const HloComputation* comp);
@ -401,11 +398,6 @@ class HloDotDumper {
string HloDotDumper::Dump() {
string body;
for (const auto& kv : SubcomputationsToDump()) {
const HloComputation* subcomp = kv.first;
const HloInstruction* parent = kv.second;
StrAppend(&body, DumpSubcomputation(subcomp, parent));
}
StrAppend(&body, DumpComputation(computation_));
StrAppend(&body, DumpRootTag());
@ -525,33 +517,36 @@ stylesheet="
string HloDotDumper::Footer() { return StrCat(Join(edges_, "\n"), "\n}"); }
std::unordered_map<const HloComputation*, const HloInstruction*>
HloDotDumper::SubcomputationsToDump() {
// Dump the subcomputations of each instruction that's shown and doesn't have
// its operands omitted. If an instruction has just one subcomputation and
// it's trivial, omit it: We'll display that subcomputation inlined into the
// instruction's node when we draw it.
std::unordered_map<const HloComputation*, const HloInstruction*> to_dump;
for (const auto& instr : computation_->instructions()) {
if (!filter_.Show(instr.get()) ||
filter_.SomeOrAllOperandsOmitted(instr.get())) {
continue;
}
if (instr->opcode() == HloOpcode::kFusion) {
to_dump[instr->fused_instructions_computation()] = instr.get();
}
bool HloDotDumper::ShouldShowFusionSubcomputation(const HloInstruction* instr) {
CHECK_EQ(instr->opcode(), HloOpcode::kFusion);
return ShouldShowSubcomputation(instr->fused_instructions_computation());
}
for (const HloComputation* comp : instr->called_computations()) {
if (!MatchTrivialComputation(comp)) {
to_dump[comp] = instr.get();
}
bool HloDotDumper::ShouldShowSubcomputation(const HloComputation* subcomp) {
if (subcomp->IsFusionComputation()) {
const HloInstruction* fusion = subcomp->FusionInstruction();
if (!filter_.Show(fusion) || filter_.SomeOrAllOperandsOmitted(fusion)) {
return false;
}
}
return to_dump;
// Don't show trivial subcomputations on non-fusion nodes -- these are inlined
// into the graph.
if (!subcomp->IsFusionComputation() && MatchTrivialComputation(subcomp)) {
return false;
}
// Show the subcomputation if we're showing any of its members.
return std::any_of(computation_->instructions().begin(),
computation_->instructions().end(),
[&](const std::unique_ptr<HloInstruction>& instr) {
return filter_.Show(instr.get());
});
}
string HloDotDumper::DumpSubcomputation(const HloComputation* subcomp,
const HloInstruction* parent_instr) {
VLOG(2) << "Dumping subcomputation " << subcomp->name();
const char* computation_fmt = R"(subgraph %s {
%s
label = <%s>;
@ -593,20 +588,10 @@ tooltip = " ";
string comp_body = DumpComputation(subcomp);
if (parent_instr->opcode() == HloOpcode::kFusion) {
// Dump any nested fusion nodes.
for (const auto& subcomp_instr : subcomp->instructions()) {
if (subcomp_instr->opcode() == HloOpcode::kFusion) {
StrAppend(
&comp_body,
DumpSubcomputation(subcomp_instr->fused_instructions_computation(),
subcomp_instr.get()));
}
}
} else {
// Add an edge from the subcomputation to its parent node. If subcomp
// belongs to a fusion node, it's drawn in place of the fusion instruction,
// so there's no need to link those.
// Add an edge from the subcomputation to its parent node. If subcomp
// belongs to a fusion node, it's drawn in place of the fusion instruction,
// so there's no need to link those.
if (parent_instr->opcode() != HloOpcode::kFusion) {
VLOG(2) << "Edge: from " << subcomp->root_instruction()->name() << " to "
<< parent_instr->name() << " as " << next_edge_id_;
edge_ids_.insert(
@ -631,6 +616,14 @@ string HloDotDumper::DumpComputation(const HloComputation* comp) {
if (!filter_.Show(instr.get())) {
continue;
}
// Dump subcomputations within instr.
for (const HloComputation* subcomp : instr->called_computations()) {
if (ShouldShowSubcomputation(subcomp)) {
StrAppend(&g, DumpSubcomputation(subcomp, instr.get()));
}
}
StrAppend(&g, DumpInstruction(instr.get()));
}
return g;
@ -638,6 +631,14 @@ string HloDotDumper::DumpComputation(const HloComputation* comp) {
string HloDotDumper::DumpRootTag() {
HloInstruction* from = computation_->root_instruction();
// Fusion nodes are expanded inline, so if root is an expanded fusion node,
// walk up the graph until we find a node that isn't.
while (from->opcode() == HloOpcode::kFusion &&
ShouldShowFusionSubcomputation(from)) {
from = from->fused_expression_root();
}
auto from_id = InstructionId(from);
if (!filter_.Show(from)) {
@ -678,7 +679,7 @@ string HloDotDumper::DumpInstruction(const HloInstruction* instr) {
// Omit the fusion node if its subcomputation is drawn, since the
// subcomputation will be drawn inline.
if (instr->opcode() == HloOpcode::kFusion &&
filter_.ShowFusionSubcomputation(instr)) {
ShouldShowFusionSubcomputation(instr)) {
return "";
}
@ -937,7 +938,7 @@ string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) {
// Show the shape and layout of the instruction, unless it's an inlined fusion
// node -- there the shape and layout is present in the output node.
if (instr->opcode() != HloOpcode::kFusion ||
!filter_.ShowFusionSubcomputation(instr)) {
!ShouldShowFusionSubcomputation(instr)) {
string instr_shape = ShapeUtil::HumanString(instr->shape());
// Show layout of non-tuple shapes with more than one dimension.
@ -982,7 +983,7 @@ void HloDotDumper::AddInstructionIncomingEdges(const HloInstruction* instr) {
// fusion node and the node's subcomputation is shown, we draw our edge
// starting at the fusion node's root instead of at the fusion node itself.
if (from->opcode() == HloOpcode::kFusion &&
filter_.ShowFusionSubcomputation(from)) {
ShouldShowFusionSubcomputation(from)) {
from = from->fused_expression_root();
}
if (!filter_.Show(from) || from->opcode() == HloOpcode::kConstant) {
@ -1147,6 +1148,11 @@ NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) {
}
}
// Traverse into instr's nested computations.
for (const HloComputation* computation : instr->called_computations()) {
worklist.push_back({computation->root_instruction(), depth + 1});
}
// Traverse into instr's users, unless:
//
// - there are a ton of them, in which case they're probably not


@ -0,0 +1,122 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/hlo_opcode.h"
#include "tensorflow/compiler/xla/test.h"
#include "tensorflow/compiler/xla/tests/test_utils.h"
#include "tensorflow/compiler/xla/xla.pb.h"
#include "tensorflow/core/lib/strings/strcat.h"
namespace xla {
namespace {
using ::tensorflow::strings::StrCat;
using ::testing::HasSubstr;
string TestName() {
return ::testing::UnitTest::GetInstance()->current_test_info()->name();
}
class DotRenderer : public hlo_graph_dumper::GraphRendererInterface {
public:
string RenderGraph(const string& graph, GraphKind graph_kind,
const DebugOptions& debug_options) override {
return graph;
}
private:
string last_graph_;
};
XLA_REGISTER_GRAPH_RENDERER(DotRenderer, std::numeric_limits<int>::max());
TEST(HloGraphDumperTest, NestedFusion) {
HloComputation::Builder b("b");
// Build param0 + param1 + param2 + param3 + param4.
auto shape = ShapeUtil::MakeShape(F32, {10, 100});
std::vector<HloInstruction*> params;
for (int i = 0; i <= 4; ++i) {
params.push_back(b.AddInstruction(
HloInstruction::CreateParameter(i, shape, StrCat("param", i))));
}
std::vector<HloInstruction*> sums;
sums.push_back(b.AddInstruction(HloInstruction::CreateBinary(
shape, HloOpcode::kAdd, params[0], params[1])));
for (int i = 0; i <= 2; ++i) {
sums.push_back(b.AddInstruction(HloInstruction::CreateBinary(
shape, HloOpcode::kAdd, sums[i], params[i + 2])));
}
HloModule m(TestName());
m.AddEntryComputation(b.Build());
HloComputation* root_computation = m.entry_computation();
// Fuse into fusion(param0 + param1 + param2 + param3 + param4).
auto* outer_fusion = root_computation->CreateFusionInstruction(
{sums[3], sums[2], sums[1], sums[0]}, HloInstruction::FusionKind::kLoop);
// Fusing invalidates the pointers in sums -- the instructions are cloned when
// they're moved to the new computation. Get the updated pointers to sums.
std::vector<HloInstruction*> fused_sums;
for (auto* instr : outer_fusion->fused_instructions_computation()
->MakeInstructionPostOrder()) {
if (instr->opcode() == HloOpcode::kAdd) {
fused_sums.push_back(instr);
}
}
// Fuse into fusion(fusion(param0 + param1 + param2) + param3 + param4).
auto* inner_fusion =
outer_fusion->fused_instructions_computation()->CreateFusionInstruction(
{fused_sums[1], fused_sums[0]}, HloInstruction::FusionKind::kLoop);
// Generate the graph; all nodes should be present.
string graph = hlo_graph_dumper::DumpGraph(*root_computation, /*label=*/"",
DebugOptions());
for (const HloComputation* computation :
{root_computation, //
inner_fusion->fused_instructions_computation(),
outer_fusion->fused_instructions_computation()}) {
for (const std::unique_ptr<HloInstruction>& instruction :
computation->instructions()) {
EXPECT_THAT(graph, HasSubstr(instruction->name()));
}
}
// Dump a neighborhood around one of the inner sum nodes. We don't really
// care that the outer nodes are omitted -- whether they are or not is based
// on fiddly heuristics -- but we do care that the node we asked for is printed.
const HloInstruction* inner_sum = nullptr;
for (const std::unique_ptr<HloInstruction>& instruction :
inner_fusion->fused_instructions_computation()->instructions()) {
if (instruction->opcode() == HloOpcode::kAdd) {
inner_sum = instruction.get();
break;
}
}
ASSERT_NE(inner_sum, nullptr);
EXPECT_THAT(
hlo_graph_dumper::DumpNeighborhoodAround(*inner_sum, /*radius=*/1),
HasSubstr(inner_sum->name()));
}
} // anonymous namespace
} // namespace xla


@ -930,9 +930,10 @@ tensorflow::Status Service::TransferToClient(const TransferToClientRequest* arg,
}
Literal literal;
auto status = LiteralFromAllocation(allocation, *literal_shape, &literal);
TF_RETURN_IF_ERROR(
LiteralFromAllocation(allocation, *literal_shape, &literal));
*result->mutable_literal() = literal.ToProto();
return status;
return tensorflow::Status::OK();
}
tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg,


@ -32,6 +32,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
#include "tensorflow/compiler/xla/tests/test_macros.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/types.h"
@ -320,6 +322,81 @@ TEST_F(ReduceWindowTest, R4UnitWindow) {
ErrorSpec(1e-3, 1e-3));
}
XLA_TEST_F(HloTestBase, R6AddMultipleStrides) {
auto b = HloComputation::Builder(TestName());
std::vector<int64> input_dims(6, 8);
auto shape = ShapeUtil::MakeShape(F32, input_dims);
std::unique_ptr<Literal> arg_literal = Literal::CreateFromShape(shape);
auto generator = [&](tensorflow::gtl::ArraySlice<int64> indexes) -> float {
return 1.0f;
};
TF_EXPECT_OK(arg_literal->Populate<float>(generator));
auto input =
b.AddInstruction(HloInstruction::CreateConstant(std::move(arg_literal)));
auto init_value = b.AddInstruction(
HloInstruction::CreateConstant(Literal::CreateR0<float>(0.f)));
HloComputation::Builder add_computation("add");
Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
auto param_lhs = add_computation.AddInstruction(
HloInstruction::CreateParameter(0, scalar_shape, "lhs"));
auto param_rhs = add_computation.AddInstruction(
HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
add_computation.AddInstruction(HloInstruction::CreateBinary(
scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
auto module = CreateNewModule();
auto add_func = module->AddEmbeddedComputation(add_computation.Build());
WindowDimension trivial_dim;
trivial_dim.set_size(1);
trivial_dim.set_stride(1);
trivial_dim.set_padding_low(0);
trivial_dim.set_padding_high(0);
trivial_dim.set_window_dilation(1);
trivial_dim.set_base_dilation(1);
WindowDimension active_dim;
active_dim.set_size(3);
active_dim.set_stride(1);
active_dim.set_padding_low(0);
active_dim.set_padding_high(0);
active_dim.set_window_dilation(1);
active_dim.set_base_dilation(1);
Window window;
*window.add_dimensions() = active_dim;
*window.add_dimensions() = trivial_dim;
*window.add_dimensions() = active_dim;
*window.add_dimensions() = active_dim;
*window.add_dimensions() = trivial_dim;
*window.add_dimensions() = trivial_dim;
// Non-monotonic output layout with minor dims trivial.
std::vector<int64> output_layout = {1, 5, 3, 2, 0, 4};
std::vector<int64> output_dims = {6, 8, 6, 6, 8, 8};
Shape result_shape =
ShapeUtil::MakeShapeWithLayout(F32, output_dims, output_layout);
b.AddInstruction(HloInstruction::CreateReduceWindow(
result_shape, input, init_value, window, add_func));
std::unique_ptr<Literal> expected = Literal::CreateFromShape(result_shape);
auto out_generator =
[&](tensorflow::gtl::ArraySlice<int64> indexes) -> float {
return 27.0f;
};
TF_EXPECT_OK(expected->Populate<float>(out_generator));
module->AddEntryComputation(b.Build());
auto actual = ExecuteAndTransfer(std::move(module), {});
LiteralTestUtil::ExpectNear(*actual, *expected, ErrorSpec(1e-3, 1e-3));
}
XLA_TEST_F(HloTestBase, R6Add) {
auto b = HloComputation::Builder(TestName());


@ -222,9 +222,6 @@ py_test(
size = "small",
srcs = ["python/kernel_tests/split_handler_ops_test.py"],
srcs_version = "PY2AND3",
tags = [
"nomac", # b/63258195
],
deps = [
":split_handler_ops_py",
"//tensorflow/contrib/boosted_trees/proto:learner_proto_py",


@ -90,9 +90,6 @@ py_test(
size = "small",
srcs = ["custom_export_strategy_test.py"],
srcs_version = "PY2AND3",
tags = [
"nomac", # b/63258195
],
deps = [
":custom_export_strategy",
"//tensorflow/contrib/decision_trees/proto:generic_tree_model_extensions_py",


@ -281,9 +281,6 @@ py_test(
name = "categorical_split_handler_test",
srcs = ["learner/batch/categorical_split_handler_test.py"],
srcs_version = "PY2AND3",
tags = [
"nomac", # b/63258195
],
deps = [
":categorical_split_handler",
"//tensorflow/contrib/boosted_trees/proto:learner_proto_py",
@ -309,9 +306,6 @@ py_test(
name = "ordinal_split_handler_test",
srcs = ["learner/batch/ordinal_split_handler_test.py"],
srcs_version = "PY2AND3",
tags = [
"nomac", # b/63258195
],
deps = [
":ordinal_split_handler",
"//tensorflow/contrib/boosted_trees/proto:learner_proto_py",


@ -204,6 +204,9 @@ add_python_module("tensorflow/examples/tutorials")
add_python_module("tensorflow/examples/tutorials/mnist")
add_python_module("tensorflow/python")
add_python_module("tensorflow/python/client")
add_python_module("tensorflow/python/data")
add_python_module("tensorflow/python/data/ops")
add_python_module("tensorflow/python/data/util")
add_python_module("tensorflow/python/debug")
add_python_module("tensorflow/python/debug/cli")
add_python_module("tensorflow/python/debug/examples")
@ -237,6 +240,7 @@ add_python_module("tensorflow/python/keras/datasets/cifar100")
add_python_module("tensorflow/python/keras/datasets/imdb")
add_python_module("tensorflow/python/keras/datasets/mnist")
add_python_module("tensorflow/python/keras/datasets/reuters")
add_python_module("tensorflow/python/keras/estimator")
add_python_module("tensorflow/python/keras/initializers")
add_python_module("tensorflow/python/keras/layers")
add_python_module("tensorflow/python/keras/losses")
@ -333,7 +337,6 @@ add_python_module("tensorflow/contrib/data")
add_python_module("tensorflow/contrib/data/python")
add_python_module("tensorflow/contrib/data/python/kernel_tests")
add_python_module("tensorflow/contrib/data/python/ops")
add_python_module("tensorflow/contrib/data/python/util")
add_python_module("tensorflow/contrib/decision_trees")
add_python_module("tensorflow/contrib/decision_trees/proto")
add_python_module("tensorflow/contrib/deprecated")


@ -12,6 +12,7 @@ py_library(
"//tensorflow/contrib/data/python/ops:dataset_ops",
"//tensorflow/contrib/data/python/ops:sloppy_ops",
"//tensorflow/python:util",
"//tensorflow/python/data/ops:dataset_ops",
],
)


@ -21,10 +21,15 @@
@@TextLineDataset
@@batch_and_drop_remainder
@@read_batch_features
@@rejection_resample
@@dense_to_sparse_batch
@@enumerate_dataset
@@group_by_window
@@ignore_errors
@@read_batch_features
@@unbatch
@@rejection_resample
@@sloppy_interleave
"""
from __future__ import absolute_import
@ -34,15 +39,19 @@ from __future__ import print_function
# pylint: disable=unused-import
from tensorflow.contrib.data.python.ops.dataset_ops import batch_and_drop_remainder
from tensorflow.contrib.data.python.ops.dataset_ops import Dataset
from tensorflow.contrib.data.python.ops.dataset_ops import dense_to_sparse_batch
from tensorflow.contrib.data.python.ops.dataset_ops import enumerate_dataset
from tensorflow.contrib.data.python.ops.dataset_ops import FixedLengthRecordDataset
from tensorflow.contrib.data.python.ops.dataset_ops import group_by_window
from tensorflow.contrib.data.python.ops.dataset_ops import Iterator
from tensorflow.contrib.data.python.ops.dataset_ops import ignore_errors
from tensorflow.contrib.data.python.ops.dataset_ops import read_batch_features
from tensorflow.contrib.data.python.ops.dataset_ops import rejection_resample
from tensorflow.contrib.data.python.ops.dataset_ops import SqlDataset
from tensorflow.contrib.data.python.ops.dataset_ops import TextLineDataset
from tensorflow.contrib.data.python.ops.dataset_ops import TFRecordDataset
from tensorflow.contrib.data.python.ops.dataset_ops import unbatch
from tensorflow.contrib.data.python.ops.sloppy_ops import sloppy_interleave
from tensorflow.python.data.ops.dataset_ops import Iterator
# pylint: enable=unused-import
from tensorflow.python.util.all_util import remove_undocumented
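The import changes above reflect the API shift running through this commit: transformations such as `dense_to_sparse_batch`, `unbatch`, `ignore_errors`, and `enumerate_dataset` are no longer `Dataset` methods and are instead passed to `Dataset.apply()`. A minimal sketch of the new call pattern, with names taken from this diff:

```python
import numpy as np
from tensorflow.contrib.data.python.ops import dataset_ops

components = np.arange(10, dtype=np.int32)
dataset = (dataset_ops.Dataset.from_tensor_slices(components)
           .batch(2)
           # Previously written as .unbatch(); now the transformation is a
           # callable handed to apply().
           .apply(dataset_ops.unbatch()))
iterator = dataset.make_one_shot_iterator()
```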


@ -89,6 +89,7 @@ py_test(
"//tensorflow/python:math_ops",
"//tensorflow/python:string_ops",
"//tensorflow/python:tensor_shape",
"//tensorflow/python/data/ops:dataset_ops",
"//third_party/py/numpy",
],
)
@ -104,13 +105,17 @@ py_test(
],
deps = [
"//tensorflow/contrib/data/python/ops:dataset_ops",
"//tensorflow/core:protos_all_py",
"//tensorflow/python:array_ops",
"//tensorflow/python:client_testlib",
"//tensorflow/python:dtypes",
"//tensorflow/python:errors",
"//tensorflow/python:framework_ops",
"//tensorflow/python:math_ops",
"//tensorflow/python:resource_variable_ops",
"//tensorflow/python:session",
"//tensorflow/python:sparse_tensor",
"//tensorflow/python:util",
"//tensorflow/python/data/util:nest",
"//third_party/py/numpy",
],
)
@ -195,6 +200,7 @@ py_test(
"//tensorflow/python:data_flow_ops",
"//tensorflow/python:dtypes",
"//tensorflow/python:errors",
"//tensorflow/python:functional_ops",
"//tensorflow/python:io_ops",
"//tensorflow/python:lookup_ops",
"//tensorflow/python:math_ops",
@ -217,9 +223,13 @@ py_test(
"//tensorflow/python:array_ops",
"//tensorflow/python:client_testlib",
"//tensorflow/python:constant_op",
"//tensorflow/python:dataset_ops_gen",
"//tensorflow/python:dtypes",
"//tensorflow/python:errors",
"//tensorflow/python:framework_ops",
"//tensorflow/python:platform",
"//tensorflow/python:tensor_shape",
"//tensorflow/python:variables",
],
)
@ -358,10 +368,10 @@ py_test(
srcs_version = "PY2AND3",
deps = [
"//tensorflow/contrib/data/python/ops:dataset_ops",
"//tensorflow/contrib/data/python/util:nest",
"//tensorflow/python:client_testlib",
"//tensorflow/python:errors",
"//tensorflow/python:tensor_shape",
"//tensorflow/python/data/util:nest",
"//third_party/py/numpy",
],
)


@ -229,8 +229,9 @@ class BatchDatasetTest(test.TestCase):
def testDenseToSparseBatchDataset(self):
components = np.random.randint(12, size=(100,)).astype(np.int32)
iterator = (dataset_ops.Dataset.from_tensor_slices(components)
.map(lambda x: array_ops.fill([x], x)).dense_to_sparse_batch(
4, [12]).make_initializable_iterator())
.map(lambda x: array_ops.fill([x], x)).apply(
dataset_ops.dense_to_sparse_batch(4, [12]))
.make_initializable_iterator())
init_op = iterator.initializer
get_next = sparse_tensor.SparseTensor(*iterator.get_next())
@ -253,8 +254,9 @@ class BatchDatasetTest(test.TestCase):
def testDenseToSparseBatchDatasetShapeErrors(self):
input_tensor = array_ops.placeholder(dtypes.int32)
iterator = (dataset_ops.Dataset.from_tensors(input_tensor)
.dense_to_sparse_batch(4, [12]).make_initializable_iterator())
iterator = (dataset_ops.Dataset.from_tensors(input_tensor).apply(
dataset_ops.dense_to_sparse_batch(4, [12]))
.make_initializable_iterator())
init_op = iterator.initializer
get_next = sparse_tensor.SparseTensor(*iterator.get_next())
@ -277,7 +279,7 @@ class BatchDatasetTest(test.TestCase):
expected_types = (dtypes.int32,) * 3
data = data.batch(2)
self.assertEqual(expected_types, data.output_types)
data = data.unbatch()
data = data.apply(dataset_ops.unbatch())
self.assertEqual(expected_types, data.output_types)
iterator = data.make_one_shot_iterator()
@ -296,7 +298,7 @@ class BatchDatasetTest(test.TestCase):
expected_types = ((dtypes.int32,),) * 3
data = data.batch(2)
self.assertEqual(expected_types, data.output_types)
data = data.unbatch()
data = data.apply(dataset_ops.unbatch())
self.assertEqual(expected_types, data.output_types)
iterator = data.make_one_shot_iterator()
@ -317,7 +319,7 @@ class BatchDatasetTest(test.TestCase):
expected_types = ((dtypes.int32, dtypes.string),) * 3
data = data.batch(2)
self.assertAllEqual(expected_types, data.output_types)
data = data.unbatch()
data = data.apply(dataset_ops.unbatch())
self.assertAllEqual(expected_types, data.output_types)
iterator = data.make_one_shot_iterator()


@ -20,7 +20,7 @@ from __future__ import print_function
import numpy as np
from tensorflow.contrib.data.python.ops import dataset_ops
from tensorflow.contrib.data.python.util import nest
from tensorflow.python.data.util import nest
from tensorflow.python.framework import errors
from tensorflow.python.framework import tensor_shape
from tensorflow.python.platform import test


@ -22,9 +22,9 @@ import threading
import numpy as np
from tensorflow.contrib.data.python.ops import dataset_ops
from tensorflow.contrib.data.python.util import nest
from tensorflow.core.protobuf import config_pb2
from tensorflow.python.client import session
from tensorflow.python.data.util import nest
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import errors
from tensorflow.python.framework import ops


@ -271,8 +271,8 @@ class MapDatasetTest(test.TestCase):
components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
dataset = (dataset_ops.Dataset.from_tensor_slices(components)
.map(lambda x: array_ops.check_numerics(x, "message"))
.ignore_errors())
.map(lambda x: array_ops.check_numerics(x, "message")).apply(
dataset_ops.ignore_errors()))
iterator = dataset.make_initializable_iterator()
init_op = iterator.initializer
get_next = iterator.get_next()
@ -287,10 +287,10 @@ class MapDatasetTest(test.TestCase):
def testParallelMapIgnoreError(self):
components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
dataset = (dataset_ops.Dataset.from_tensor_slices(components)
.map(lambda x: array_ops.check_numerics(x, "message"),
num_threads=2, output_buffer_size=2)
.ignore_errors())
dataset = (dataset_ops.Dataset.from_tensor_slices(components).map(
lambda x: array_ops.check_numerics(x, "message"),
num_threads=2,
output_buffer_size=2).apply(dataset_ops.ignore_errors()))
iterator = dataset.make_initializable_iterator()
init_op = iterator.initializer
get_next = iterator.get_next()
@ -311,9 +311,9 @@ class MapDatasetTest(test.TestCase):
for filename in filenames:
write_string_to_file(filename, filename)
dataset = (dataset_ops.Dataset.from_tensor_slices(filenames)
.map(io_ops.read_file, num_threads=2, output_buffer_size=2)
.ignore_errors())
dataset = (dataset_ops.Dataset.from_tensor_slices(filenames).map(
io_ops.read_file, num_threads=2, output_buffer_size=2).apply(
dataset_ops.ignore_errors()))
iterator = dataset.make_initializable_iterator()
init_op = iterator.initializer
get_next = iterator.get_next()


@ -169,8 +169,8 @@ class RangeDatasetTest(test.TestCase):
components = (["a", "b"], [1, 2], [37.0, 38])
start = constant_op.constant(20, dtype=dtypes.int64)
iterator = (dataset_ops.Dataset.from_tensor_slices(components).enumerate(
start=start).make_initializable_iterator())
iterator = (dataset_ops.Dataset.from_tensor_slices(components).apply(
dataset_ops.enumerate_dataset(start)).make_initializable_iterator())
init_op = iterator.initializer
get_next = iterator.get_next()


@ -9,9 +9,7 @@ py_library(
srcs = ["dataset_ops.py"],
srcs_version = "PY2AND3",
deps = [
"//tensorflow/contrib/data/python/util:nest",
"//tensorflow/python:array_ops",
"//tensorflow/python:constant_op",
"//tensorflow/python:control_flow_ops",
"//tensorflow/python:dataset_ops_gen",
"//tensorflow/python:dtypes",
@ -22,12 +20,13 @@ py_library(
"//tensorflow/python:parsing_ops",
"//tensorflow/python:platform",
"//tensorflow/python:random_ops",
"//tensorflow/python:random_seed",
"//tensorflow/python:resource_variable_ops",
"//tensorflow/python:script_ops",
"//tensorflow/python:sparse_tensor",
"//tensorflow/python:tensor_shape",
"//tensorflow/python:tensor_util",
"//tensorflow/python/data/ops:dataset_ops",
"//tensorflow/python/data/util:nest",
"//third_party/py/numpy",
],
)
@ -37,13 +36,12 @@ py_library(
srcs = ["sloppy_ops.py"],
srcs_version = "PY2AND3",
deps = [
":dataset_ops",
"//tensorflow/contrib/data/python/util:nest",
"//tensorflow/python:dataset_ops_gen",
"//tensorflow/python:dtypes",
"//tensorflow/python:framework_ops",
"//tensorflow/python:function",
"//tensorflow/python:platform",
"//tensorflow/python/data/ops:dataset_ops",
"//tensorflow/python/data/util:nest",
],
)

(File diff suppressed because it is too large.)


@ -17,8 +17,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.contrib.data.python.ops import dataset_ops
from tensorflow.contrib.data.python.util import nest
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.data.util import nest
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import function
from tensorflow.python.framework import ops


@ -41,11 +41,10 @@ py_library(
srcs_version = "PY2AND3",
visibility = ["//tensorflow:internal"],
deps = [
"//tensorflow/contrib/data/python/ops:dataset_ops",
"//tensorflow/contrib/data/python/util:nest",
"//tensorflow/python:dataset_ops_gen",
"//tensorflow/python:errors",
"//tensorflow/python:resource_variable_ops",
"//tensorflow/python/data/util:nest",
"//tensorflow/python/eager:context",
],
)


@ -20,7 +20,7 @@ from __future__ import print_function
import threading
from tensorflow.contrib.data.python.util import nest
from tensorflow.python.data.util import nest
from tensorflow.python.eager import context
from tensorflow.python.framework import errors
from tensorflow.python.ops import gen_dataset_ops


@ -63,6 +63,7 @@ tf_kernel_library(
"kernels/fused_conv_ops_gpu.h",
],
prefix = "fused_conv2d_bias_activation_op",
visibility = ["//visibility:public"],
deps = [
"//tensorflow/core:framework",
"//tensorflow/core:lib",


@ -41,7 +41,6 @@ py_library(
"//tensorflow/contrib/training:training_py",
"//tensorflow/python:array_ops",
"//tensorflow/python:check_ops",
"//tensorflow/python:dtypes",
"//tensorflow/python:framework_ops",
"//tensorflow/python:init_ops",
"//tensorflow/python:training",


@ -26,7 +26,6 @@ from tensorflow.contrib.gan.python import losses as tfgan_losses
from tensorflow.contrib.gan.python import namedtuples
from tensorflow.contrib.slim.python.slim import learning as slim_learning
from tensorflow.contrib.training.python.training import training
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import check_ops
@ -549,7 +548,7 @@ def gan_train_ops(
generator_global_step = variable_scope.get_variable(
'dummy_global_step_generator',
shape=[],
dtype=dtypes.int64,
dtype=global_step.dtype.base_dtype,
initializer=init_ops.zeros_initializer(),
trainable=False,
collections=[ops.GraphKeys.GLOBAL_VARIABLES])
@ -570,7 +569,7 @@ def gan_train_ops(
discriminator_global_step = variable_scope.get_variable(
'dummy_global_step_discriminator',
shape=[],
dtype=dtypes.int64,
dtype=global_step.dtype.base_dtype,
initializer=init_ops.zeros_initializer(),
trainable=False,
collections=[ops.GraphKeys.GLOBAL_VARIABLES])


@ -542,11 +542,17 @@ class GANTrainOpsTest(test.TestCase):
def test_unused_update_ops_callable_acgan_provideupdates(self):
self._test_unused_update_ops(create_callable_acgan_model, True)
def _test_sync_replicas_helper(self, create_gan_model_fn):
def _test_sync_replicas_helper(
self, create_gan_model_fn, create_global_step=False):
model = create_gan_model_fn()
loss = train.gan_loss(model)
num_trainable_vars = len(variables_lib.get_trainable_variables())
if create_global_step:
gstep = variable_scope.get_variable(
'custom_gstep', dtype=dtypes.int32, initializer=0, trainable=False)
ops.add_to_collection(ops.GraphKeys.GLOBAL_STEP, gstep)
g_opt = get_sync_optimizer()
d_opt = get_sync_optimizer()
train_ops = train.gan_train_ops(
@ -610,6 +616,9 @@ class GANTrainOpsTest(test.TestCase):
def test_sync_replicas_callable_acgan(self):
self._test_sync_replicas_helper(create_callable_acgan_model)
def test_global_step_can_be_int32(self):
self._test_sync_replicas_helper(create_gan_model, create_global_step=True)
class GANTrainTest(test.TestCase):
"""Tests for `gan_train`."""


@ -38,7 +38,7 @@ models are available to you. This can be done in three simple commands:
```sh
git clone https://github.com/tensorflow/models
cd models/slim
cd models/research/slim
sudo python setup.py install_lib
```


@ -121,7 +121,7 @@ class TestBeamStep(test.TestCase):
log_probs=nn_ops.log_softmax(
array_ops.ones([self.batch_size, self.beam_width])),
lengths=constant_op.constant(
2, shape=[self.batch_size, self.beam_width], dtype=dtypes.int32),
2, shape=[self.batch_size, self.beam_width], dtype=dtypes.int64),
finished=array_ops.zeros(
[self.batch_size, self.beam_width], dtype=dtypes.bool))
@ -176,7 +176,7 @@ class TestBeamStep(test.TestCase):
log_probs=nn_ops.log_softmax(
array_ops.ones([self.batch_size, self.beam_width])),
lengths=ops.convert_to_tensor(
[[2, 1, 2], [2, 2, 1]], dtype=dtypes.int32),
[[2, 1, 2], [2, 2, 1]], dtype=dtypes.int64),
finished=ops.convert_to_tensor(
[[False, True, False], [False, False, True]], dtype=dtypes.bool))


@ -22,6 +22,7 @@ import collections
from tensorflow.contrib.seq2seq.python.ops import beam_search_ops
from tensorflow.contrib.seq2seq.python.ops import decoder
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
@ -256,7 +257,7 @@ class BeamSearchDecoder(decoder.Decoder):
dtype=nest.flatten(self._initial_cell_state)[0].dtype),
finished=finished,
lengths=array_ops.zeros(
[self._batch_size, self._beam_width], dtype=dtypes.int32))
[self._batch_size, self._beam_width], dtype=dtypes.int64))
return (finished, start_inputs, initial_state)
@ -267,7 +268,7 @@ class BeamSearchDecoder(decoder.Decoder):
outputs: An instance of BeamSearchDecoderOutput.
final_state: An instance of BeamSearchDecoderState. Passed through to the
output.
sequence_lengths: An `int32` tensor shaped `[batch_size, beam_width]`.
sequence_lengths: An `int64` tensor shaped `[batch_size, beam_width]`.
The sequence lengths determined for each beam during decode.
Returns:
@ -491,9 +492,10 @@ def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
indices=array_ops.tile(
array_ops.reshape(end_token, [1, 1]), [batch_size, beam_width]),
depth=vocab_size,
on_value=0,
off_value=1)
add_mask = (1 - math_ops.to_int32(previously_finished))
on_value=constant_op.constant(0, dtype=dtypes.int64),
off_value=constant_op.constant(1, dtype=dtypes.int64),
dtype=dtypes.int64)
add_mask = (1 - math_ops.to_int64(previously_finished))
lengths_to_add = array_ops.expand_dims(add_mask, 2) * lengths_to_add
new_prediction_lengths = (
lengths_to_add + array_ops.expand_dims(prediction_lengths, 2))
@ -547,9 +549,9 @@ def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
# 1. Finished beams remain unchanged
# 2. Beams that are now finished (EOS predicted) remain unchanged
# 3. Beams that are not yet finished have their length increased by 1
lengths_to_add = math_ops.to_int32(
lengths_to_add = math_ops.to_int64(
math_ops.not_equal(next_word_ids, end_token))
lengths_to_add = (1 - math_ops.to_int32(next_finished)) * lengths_to_add
lengths_to_add = (1 - math_ops.to_int64(next_finished)) * lengths_to_add
next_prediction_len = _tensor_gather_helper(
gather_indices=next_beam_ids,
gather_from=beam_state.lengths,


@ -33,7 +33,7 @@ def hertz_to_mel(frequencies_hertz):
"""Convert frequencies to mel scale using HTK formula.
Copied from
https://github.com/tensorflow/models/blob/master/audioset/mel_features.py.
https://github.com/tensorflow/models/blob/master/research/audioset/mel_features.py.
Args:
frequencies_hertz: Scalar or np.array of frequencies in hertz.
@ -54,7 +54,7 @@ def spectrogram_to_mel_matrix(num_mel_bins=20,
"""Return a matrix that can post-multiply spectrogram rows to make mel.
Copied from
https://github.com/tensorflow/models/blob/master/audioset/mel_features.py.
https://github.com/tensorflow/models/blob/master/research/audioset/mel_features.py.
Returns a np.array matrix A that can be used to post-multiply a matrix S of
spectrogram values (STFT magnitudes) arranged as frames x bins to generate a
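For reference, the HTK mel formula these docstrings point to maps a frequency f in hertz to mel(f) = 1127 * ln(1 + f / 700). A one-line sketch (the constants are the standard HTK values, not taken from this diff):

```python
import numpy as np

def hertz_to_mel(frequencies_hertz):
  # HTK formula: mel = 1127 * ln(1 + f / 700).
  return 1127.0 * np.log(1.0 + np.asarray(frequencies_hertz) / 700.0)
```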


@ -13,6 +13,7 @@ load("//tensorflow:tensorflow.bzl", "tf_py_test")
package(
default_visibility = [
"//cloud/vmm/testing/tests/tpu:__subpackages__",
"//learning/brain:__subpackages__",
"//tensorflow:__subpackages__",
],


@ -50,11 +50,8 @@ cc_library(
srcs = ["verbs_util.cc"],
hdrs = ["verbs_util.h"],
deps = [
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:framework",
"//tensorflow/core:gpu_runtime",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
],
)


@ -15,9 +15,13 @@ limitations under the License.
#include "tensorflow/contrib/verbs/verbs_util.h"
#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
#include "tensorflow/core/lib/core/notification.h"
#include <vector>
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
namespace tensorflow {
// static


@ -18,14 +18,10 @@ limitations under the License.
#include <string>
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/framework/types.h"
namespace tensorflow {
class TensorProto;
class VerbsUtil {
public:
static string AppendStepidToKey(const string& key, int64 step_id);


@ -55,12 +55,14 @@ tf_cuda_library(
name = "devices",
srcs = ["devices.cc"],
hdrs = ["devices.h"],
cuda_deps = [
"//tensorflow/core:gpu_init",
"//tensorflow/core:stream_executor",
],
visibility = ["//visibility:public"],
deps = [
"//tensorflow/core:gpu_init",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core:stream_executor",
],
)


@ -2300,7 +2300,12 @@ tf_kernel_library(
tf_kernel_library(
name = "self_adjoint_eig_v2_op",
prefix = "self_adjoint_eig_v2_op",
deps = LINALG_DEPS,
deps = LINALG_DEPS + if_cuda([
":cast_op",
":cwise_op",
":cuda_solvers",
":transpose_functor",
]),
)
tf_kernel_library(
@ -4011,7 +4016,6 @@ tf_kernel_library(
name = "word2vec_kernels",
prefix = "word2vec_kernels",
deps = [
"//tensorflow/core",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
@ -4803,8 +4807,8 @@ tf_kernel_library(
":image_resizer_state",
":ops_util",
":pooling_ops",
"//tensorflow/core",
"//tensorflow/core:array_ops_op_lib",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:math_ops_op_lib",


@ -194,15 +194,14 @@ Status CudaSolver::CopyLapackInfoToHostAsync(
#define TF_CALL_LAPACK_TYPES_NO_COMPLEX(m) m(float, S) m(double, D)
// Macros to construct cusolverDn method names.
#define DN_SOLVER_FN(method, lapack_prefix) cusolverDn##lapack_prefix##method
#define DN_SOLVER_NAME(method, lapack_prefix) \
"cusolverDn" #lapack_prefix #method
#define DN_BUFSIZE_FN(method, lapack_prefix) \
cusolverDn##lapack_prefix##method##_bufferSize
#define DN_SOLVER_FN(method, type_prefix) cusolverDn##type_prefix##method
#define DN_SOLVER_NAME(method, type_prefix) "cusolverDn" #type_prefix #method
#define DN_BUFSIZE_FN(method, type_prefix) \
cusolverDn##type_prefix##method##_bufferSize
// Macros to construct cublas method names.
#define BLAS_SOLVER_FN(method, lapack_prefix) cublas##lapack_prefix##method
#define BLAS_SOLVER_NAME(method, lapack_prefix) "cublas" #lapack_prefix #method
#define BLAS_SOLVER_FN(method, type_prefix) cublas##type_prefix##method
#define BLAS_SOLVER_NAME(method, type_prefix) "cublas" #type_prefix #method
//=============================================================================
// Wrappers of cuSolverDN computational methods begin here.
@ -229,17 +228,16 @@ static inline Status GeamImpl(SolverFnT solver, cublasHandle_t cublas_handle,
return Status::OK();
}
#define GEAM_INSTANCE(Scalar, lapack_prefix) \
template <> \
Status CudaSolver::Geam<Scalar>( \
cublasOperation_t transa, cublasOperation_t transb, int m, int n, \
const Scalar* alpha, /* host or device pointer */ \
const Scalar* A, int lda, \
const Scalar* beta, /* host or device pointer */ \
const Scalar* B, int ldb, Scalar* C, int ldc) const { \
return GeamImpl(BLAS_SOLVER_FN(geam, lapack_prefix), cublas_handle_, \
transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, \
ldc); \
#define GEAM_INSTANCE(Scalar, type_prefix) \
template <> \
Status CudaSolver::Geam<Scalar>( \
cublasOperation_t transa, cublasOperation_t transb, int m, int n, \
const Scalar* alpha, /* host or device pointer */ \
const Scalar* A, int lda, \
const Scalar* beta, /* host or device pointer */ \
const Scalar* B, int ldb, Scalar* C, int ldc) const { \
return GeamImpl(BLAS_SOLVER_FN(geam, type_prefix), cublas_handle_, transa, \
transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); \
}
TF_CALL_LAPACK_TYPES(GEAM_INSTANCE);
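The rename from `lapack_prefix` to `type_prefix` in these macros is purely cosmetic; the token pasting is unchanged. What they expand to, inferred from the definitions above (shown for Scalar = float, i.e. type_prefix = S):

```cpp
// DN_SOLVER_FN(potrf, S)    -> cusolverDnSpotrf
// DN_BUFSIZE_FN(potrf, S)   -> cusolverDnSpotrf_bufferSize
// BLAS_SOLVER_FN(geam, S)   -> cublasSgeam
// BLAS_SOLVER_NAME(geam, S) -> "cublasSgeam"
```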
@ -263,12 +261,12 @@ static inline Status PotrfImpl(BufSizeFnT bufsize, SolverFnT solver,
return Status::OK();
}
#define POTRF_INSTANCE(Scalar, lapack_prefix) \
#define POTRF_INSTANCE(Scalar, type_prefix) \
template <> \
Status CudaSolver::Potrf<Scalar>(cublasFillMode_t uplo, int n, Scalar* A, \
int lda, int* dev_lapack_info) const { \
return PotrfImpl(DN_BUFSIZE_FN(potrf, lapack_prefix), \
DN_SOLVER_FN(potrf, lapack_prefix), context_, \
return PotrfImpl(DN_BUFSIZE_FN(potrf, type_prefix), \
DN_SOLVER_FN(potrf, type_prefix), context_, \
cusolver_dn_handle_, uplo, n, A, lda, dev_lapack_info); \
}
@ -293,13 +291,13 @@ static inline Status GetrfImpl(BufSizeFnT bufsize, SolverFnT solver,
return Status::OK();
}
#define GETRF_INSTANCE(Scalar, lapack_prefix) \
#define GETRF_INSTANCE(Scalar, type_prefix) \
template <> \
Status CudaSolver::Getrf<Scalar>(int m, int n, Scalar* A, int lda, \
int* dev_pivots, int* dev_lapack_info) \
const { \
return GetrfImpl(DN_BUFSIZE_FN(getrf, lapack_prefix), \
DN_SOLVER_FN(getrf, lapack_prefix), context_, \
return GetrfImpl(DN_BUFSIZE_FN(getrf, type_prefix), \
DN_SOLVER_FN(getrf, type_prefix), context_, \
cusolver_dn_handle_, m, n, A, lda, dev_pivots, \
dev_lapack_info); \
}
@ -319,53 +317,18 @@ static inline Status GetrsImpl(SolverFnT solver, OpKernelContext* context,
return Status::OK();
}
#define GETRS_INSTANCE(Scalar, lapack_prefix) \
#define GETRS_INSTANCE(Scalar, type_prefix) \
template <> \
Status CudaSolver::Getrs<Scalar>( \
cublasOperation_t trans, int n, int nrhs, const Scalar* A, int lda, \
const int* pivots, Scalar* B, int ldb, int* dev_lapack_info) const { \
return GetrsImpl(DN_SOLVER_FN(getrs, lapack_prefix), context_, \
return GetrsImpl(DN_SOLVER_FN(getrs, type_prefix), context_, \
cusolver_dn_handle_, trans, n, nrhs, A, lda, pivots, B, \
ldb, dev_lapack_info); \
}
TF_CALL_LAPACK_TYPES(GETRS_INSTANCE);
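A hedged summary of the pair of routines just instantiated (standard LAPACK semantics assumed): Getrf computes the pivoted LU factorization and Getrs reuses it to solve linear systems,

$$P\,A = L\,U \ \ (\text{Getrf}), \qquad \mathrm{op}(A)\,X = B \ \text{ solved from } (L, U, P) \ \ (\text{Getrs}),$$

with the pivot indices carried between the two calls in `dev_pivots`.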
template <typename Scalar, typename BufSizeFnT, typename SolverFnT>
static inline Status GesvdImpl(BufSizeFnT bufsize, SolverFnT solver,
OpKernelContext* context,
cusolverDnHandle_t cusolver_dn_handle,
signed char jobu, signed char jobvt, int m,
int n, Scalar* A, int lda, Scalar* S, Scalar* U,
int ldu, Scalar* VT, int ldvt,
int* dev_lapack_info) {
/* Get amount of workspace memory required. */
int lwork;
TF_RETURN_IF_CUSOLVER_ERROR(bufsize(cusolver_dn_handle, m, n, &lwork));
/* Allocate device memory for workspace. */
ScratchSpace<Scalar> dev_workspace(context, lwork, /* on_host */ false);
/* Launch the solver kernel. */
TF_RETURN_IF_CUSOLVER_ERROR(solver(
cusolver_dn_handle, jobu, jobvt, m, n, CUDAComplex(A), lda, S,
CUDAComplex(U), ldu, CUDAComplex(VT), ldvt,
CUDAComplex(dev_workspace.mutable_data()), lwork, NULL, dev_lapack_info));
return Status::OK();
}
#define GESVD_INSTANCE(Scalar, lapack_prefix) \
template <> \
Status CudaSolver::Gesvd<Scalar>( \
signed char jobu, signed char jobvt, int m, int n, Scalar* dev_A, \
int lda, Scalar* dev_S, Scalar* dev_U, int ldu, Scalar* dev_VT, \
int ldvt, int* dev_lapack_info) const { \
return GesvdImpl(DN_BUFSIZE_FN(gesvd, lapack_prefix), \
DN_SOLVER_FN(gesvd, lapack_prefix), context_, \
cusolver_dn_handle_, jobu, jobvt, m, n, dev_A, lda, \
dev_S, dev_U, ldu, dev_VT, ldvt, dev_lapack_info); \
}
TF_CALL_LAPACK_TYPES_NO_COMPLEX(GESVD_INSTANCE);
template <typename Scalar, typename BufSizeFnT, typename SolverFnT>
static inline Status GeqrfImpl(BufSizeFnT bufsize, SolverFnT solver,
OpKernelContext* context,
@ -385,19 +348,19 @@ static inline Status GeqrfImpl(BufSizeFnT bufsize, SolverFnT solver,
return Status::OK();
}
#define GEQRF_INSTANCE(Scalar, lapack_prefix) \
#define GEQRF_INSTANCE(Scalar, type_prefix) \
template <> \
Status CudaSolver::Geqrf<Scalar>(int m, int n, Scalar* A, int lda, \
Scalar* tau, int* dev_lapack_info) const { \
return GeqrfImpl(DN_BUFSIZE_FN(geqrf, lapack_prefix), \
DN_SOLVER_FN(geqrf, lapack_prefix), context_, \
return GeqrfImpl(DN_BUFSIZE_FN(geqrf, type_prefix), \
DN_SOLVER_FN(geqrf, type_prefix), context_, \
cusolver_dn_handle_, m, n, A, lda, tau, dev_lapack_info); \
}
TF_CALL_LAPACK_TYPES(GEQRF_INSTANCE);
template <typename Scalar, typename BufSizeFnT, typename SolverFnT>
static inline Status OrmqrImpl(BufSizeFnT bufsize, SolverFnT solver,
static inline Status UnmqrImpl(BufSizeFnT bufsize, SolverFnT solver,
OpKernelContext* context,
cusolverDnHandle_t cusolver_dn_handle,
cublasSideMode_t side, cublasOperation_t trans,
@ -422,47 +385,25 @@ static inline Status OrmqrImpl(BufSizeFnT bufsize, SolverFnT solver,
// Unfortunately the LAPACK function name differs for the real and complex case
// (complex ones are prefixed with "UN" for "unitary"), so we instantiate each
// one separately.
template <>
Status CudaSolver::Ormqr(cublasSideMode_t side, cublasOperation_t trans, int m,
int n, int k, const float* dev_a, int lda,
const float* dev_tau, float* dev_c, int ldc,
int* dev_lapack_info) const {
return OrmqrImpl(DN_BUFSIZE_FN(ormqr, S), DN_SOLVER_FN(ormqr, S), context_,
cusolver_dn_handle_, side, trans, m, n, k, dev_a, lda,
dev_tau, dev_c, ldc, dev_lapack_info);
}
template <>
Status CudaSolver::Ormqr(cublasSideMode_t side, cublasOperation_t trans, int m,
int n, int k, const double* dev_a, int lda,
const double* dev_tau, double* dev_c, int ldc,
int* dev_lapack_info) const {
return OrmqrImpl(DN_BUFSIZE_FN(ormqr, D), DN_SOLVER_FN(ormqr, D), context_,
cusolver_dn_handle_, side, trans, m, n, k, dev_a, lda,
dev_tau, dev_c, ldc, dev_lapack_info);
}
template <>
Status CudaSolver::Ormqr(cublasSideMode_t side, cublasOperation_t trans, int m,
int n, int k, const std::complex<float>* dev_a,
int lda, const std::complex<float>* dev_tau,
std::complex<float>* dev_c, int ldc,
int* dev_lapack_info) const {
return OrmqrImpl(DN_BUFSIZE_FN(unmqr, C), DN_SOLVER_FN(unmqr, C), context_,
cusolver_dn_handle_, side, trans, m, n, k, dev_a, lda,
dev_tau, dev_c, ldc, dev_lapack_info);
}
template <>
Status CudaSolver::Ormqr(cublasSideMode_t side, cublasOperation_t trans, int m,
int n, int k, const std::complex<double>* dev_a,
int lda, const std::complex<double>* dev_tau,
std::complex<double>* dev_c, int ldc,
int* dev_lapack_info) const {
return OrmqrImpl(DN_BUFSIZE_FN(unmqr, Z), DN_SOLVER_FN(unmqr, Z), context_,
cusolver_dn_handle_, side, trans, m, n, k, dev_a, lda,
dev_tau, dev_c, ldc, dev_lapack_info);
}
#define UNMQR_INSTANCE(Scalar, function_prefix, type_prefix) \
template <> \
Status CudaSolver::Unmqr(cublasSideMode_t side, cublasOperation_t trans, \
int m, int n, int k, const Scalar* dev_a, int lda, \
const Scalar* dev_tau, Scalar* dev_c, int ldc, \
int* dev_lapack_info) const { \
return UnmqrImpl(DN_BUFSIZE_FN(function_prefix##mqr, type_prefix), \
DN_SOLVER_FN(function_prefix##mqr, type_prefix), \
context_, cusolver_dn_handle_, side, trans, m, n, k, \
dev_a, lda, dev_tau, dev_c, ldc, dev_lapack_info); \
}
UNMQR_INSTANCE(float, or, S);
UNMQR_INSTANCE(double, or, D);
UNMQR_INSTANCE(complex64, un, C);
UNMQR_INSTANCE(complex128, un, Z);
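For readers tracing the macro expansion, a small illustration (dtype keys are hypothetical labels, the routine names are the cuSOLVER entry points each instantiation above resolves to) of the OR/UN split the comment describes:

```python
# Illustration only: cuSOLVER dense routines behind the four UNMQR_INSTANCE
# lines above; real types use "or" (orthogonal), complex types "un" (unitary).
UNMQR_ROUTINE = {
    "float32":    "cusolverDnSormqr",
    "float64":    "cusolverDnDormqr",
    "complex64":  "cusolverDnCunmqr",
    "complex128": "cusolverDnZunmqr",
}
```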
template <typename Scalar, typename BufSizeFnT, typename SolverFnT>
static inline Status OrgqrImpl(BufSizeFnT bufsize, SolverFnT solver,
static inline Status UngqrImpl(BufSizeFnT bufsize, SolverFnT solver,
OpKernelContext* context,
cusolverDnHandle_t cusolver_dn_handle, int m,
int n, int k, Scalar* dev_a, int lda,
@ -482,40 +423,97 @@ static inline Status OrgqrImpl(BufSizeFnT bufsize, SolverFnT solver,
return Status::OK();
}
// Unfortunately the LAPACK function name differs for the real and complex case
// (complex ones are prefixed with "UN" for "unitary"), so we instantiate each
// one separately.
template <>
Status CudaSolver::Orgqr(int m, int n, int k, float* dev_a, int lda,
const float* dev_tau, int* dev_lapack_info) const {
return OrgqrImpl(DN_BUFSIZE_FN(orgqr, S), DN_SOLVER_FN(orgqr, S), context_,
cusolver_dn_handle_, m, n, k, dev_a, lda, dev_tau,
dev_lapack_info);
#define UNGQR_INSTANCE(Scalar, function_prefix, type_prefix) \
template <> \
Status CudaSolver::Ungqr(int m, int n, int k, Scalar* dev_a, int lda, \
const Scalar* dev_tau, int* dev_lapack_info) \
const { \
return UngqrImpl(DN_BUFSIZE_FN(function_prefix##gqr, type_prefix), \
DN_SOLVER_FN(function_prefix##gqr, type_prefix), \
context_, cusolver_dn_handle_, m, n, k, dev_a, lda, \
dev_tau, dev_lapack_info); \
}
UNGQR_INSTANCE(float, or, S);
UNGQR_INSTANCE(double, or, D);
UNGQR_INSTANCE(complex64, un, C);
UNGQR_INSTANCE(complex128, un, Z);
template <typename Scalar, typename BufSizeFnT, typename SolverFnT>
static inline Status HeevdImpl(BufSizeFnT bufsize, SolverFnT solver,
OpKernelContext* context,
cusolverDnHandle_t cusolver_dn_handle,
cusolverEigMode_t jobz, cublasFillMode_t uplo,
int n, Scalar* dev_A, int lda,
typename Eigen::NumTraits<Scalar>::Real* dev_W,
int* dev_lapack_info) {
/* Get amount of workspace memory required. */
int lwork;
TF_RETURN_IF_CUSOLVER_ERROR(bufsize(cusolver_dn_handle, jobz, uplo, n,
CUDAComplex(dev_A), lda,
CUDAComplex(dev_W), &lwork));
/* Allocate device memory for workspace. */
ScratchSpace<Scalar> dev_workspace(context, lwork, /* on_host */ false);
/* Launch the solver kernel. */
TF_RETURN_IF_CUSOLVER_ERROR(
solver(cusolver_dn_handle, jobz, uplo, n, CUDAComplex(dev_A), lda,
CUDAComplex(dev_W), CUDAComplex(dev_workspace.mutable_data()),
lwork, dev_lapack_info));
return Status::OK();
}
template <>
Status CudaSolver::Orgqr(int m, int n, int k, double* dev_a, int lda,
const double* dev_tau, int* dev_lapack_info) const {
return OrgqrImpl(DN_BUFSIZE_FN(orgqr, D), DN_SOLVER_FN(orgqr, D), context_,
cusolver_dn_handle_, m, n, k, dev_a, lda, dev_tau,
dev_lapack_info);
}
template <>
Status CudaSolver::Orgqr(int m, int n, int k, std::complex<float>* dev_a,
int lda, const std::complex<float>* dev_tau,
int* dev_lapack_info) const {
return OrgqrImpl(DN_BUFSIZE_FN(ungqr, C), DN_SOLVER_FN(ungqr, C), context_,
cusolver_dn_handle_, m, n, k, dev_a, lda, dev_tau,
dev_lapack_info);
}
template <>
Status CudaSolver::Orgqr(int m, int n, int k, std::complex<double>* dev_a,
int lda, const std::complex<double>* dev_tau,
int* dev_lapack_info) const {
return OrgqrImpl(DN_BUFSIZE_FN(ungqr, Z), DN_SOLVER_FN(ungqr, Z), context_,
cusolver_dn_handle_, m, n, k, dev_a, lda, dev_tau,
dev_lapack_info);
#define HEEVD_INSTANCE(Scalar, function_prefix, type_prefix) \
template <> \
Status CudaSolver::Heevd(cusolverEigMode_t jobz, cublasFillMode_t uplo, \
int n, Scalar* dev_A, int lda, \
typename Eigen::NumTraits<Scalar>::Real* dev_W, \
int* dev_lapack_info) const { \
return HeevdImpl(DN_BUFSIZE_FN(function_prefix##evd, type_prefix), \
DN_SOLVER_FN(function_prefix##evd, type_prefix), \
context_, cusolver_dn_handle_, jobz, uplo, n, dev_A, lda, \
dev_W, dev_lapack_info); \
}
HEEVD_INSTANCE(float, sy, S);
HEEVD_INSTANCE(double, sy, D);
HEEVD_INSTANCE(complex64, he, C);
HEEVD_INSTANCE(complex128, he, Z);
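A hedged statement of the decomposition Heevd computes (standard syevd/heevd semantics assumed): for Hermitian, or real symmetric, $A$,

$$A = V\,\Lambda\,V^{*}, \qquad \Lambda = \operatorname{diag}(\lambda_1,\ldots,\lambda_n),\ \lambda_i \in \mathbb{R},$$

where `jobz` selects whether the eigenvector matrix $V$ is formed and `uplo` selects which triangle of `dev_A` is read.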
template <typename Scalar, typename BufSizeFnT, typename SolverFnT>
static inline Status GesvdImpl(BufSizeFnT bufsize, SolverFnT solver,
OpKernelContext* context,
cusolverDnHandle_t cusolver_dn_handle,
signed char jobu, signed char jobvt, int m,
int n, Scalar* A, int lda, Scalar* S, Scalar* U,
int ldu, Scalar* VT, int ldvt,
int* dev_lapack_info) {
/* Get amount of workspace memory required. */
int lwork;
TF_RETURN_IF_CUSOLVER_ERROR(bufsize(cusolver_dn_handle, m, n, &lwork));
/* Allocate device memory for workspace. */
ScratchSpace<Scalar> dev_workspace(context, lwork, /* on_host */ false);
/* Launch the solver kernel. */
TF_RETURN_IF_CUSOLVER_ERROR(solver(
cusolver_dn_handle, jobu, jobvt, m, n, CUDAComplex(A), lda, S,
CUDAComplex(U), ldu, CUDAComplex(VT), ldvt,
CUDAComplex(dev_workspace.mutable_data()), lwork, NULL, dev_lapack_info));
return Status::OK();
}
#define GESVD_INSTANCE(Scalar, type_prefix) \
template <> \
Status CudaSolver::Gesvd<Scalar>( \
signed char jobu, signed char jobvt, int m, int n, Scalar* dev_A, \
int lda, Scalar* dev_S, Scalar* dev_U, int ldu, Scalar* dev_VT, \
int ldvt, int* dev_lapack_info) const { \
return GesvdImpl(DN_BUFSIZE_FN(gesvd, type_prefix), \
DN_SOLVER_FN(gesvd, type_prefix), context_, \
cusolver_dn_handle_, jobu, jobvt, m, n, dev_A, lda, \
dev_S, dev_U, ldu, dev_VT, ldvt, dev_lapack_info); \
}
TF_CALL_LAPACK_TYPES_NO_COMPLEX(GESVD_INSTANCE);
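A hedged statement of the factorization Gesvd computes (standard gesvd semantics assumed; only real types are instantiated here):

$$A = U\,\Sigma\,V^{\mathsf T}, \qquad \Sigma = \operatorname{diag}(\sigma_1 \ge \cdots \ge \sigma_{\min(m,n)} \ge 0),$$

with `jobu`/`jobvt` controlling how much of $U$ and $V^{\mathsf T}$ is materialized.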
//=============================================================================
// Wrappers of cuBlas computational methods begin here.
//
@ -542,12 +540,12 @@ static inline Status GetrfBatchedImpl(
return Status::OK();
}
#define GETRF_BATCHED_INSTANCE(Scalar, lapack_prefix) \
#define GETRF_BATCHED_INSTANCE(Scalar, type_prefix) \
template <> \
Status CudaSolver::GetrfBatched( \
int n, const Scalar* host_a_dev_ptrs[], int lda, int* dev_pivots, \
DeviceLapackInfo* dev_lapack_info, int batch_size) const { \
return GetrfBatchedImpl(BLAS_SOLVER_FN(getrfBatched, lapack_prefix), \
return GetrfBatchedImpl(BLAS_SOLVER_FN(getrfBatched, type_prefix), \
context_, cublas_handle_, n, host_a_dev_ptrs, lda, \
dev_pivots, dev_lapack_info, batch_size); \
}
@ -580,14 +578,14 @@ static inline Status GetrsBatchedImpl(
return Status::OK();
}
#define GETRS_BATCHED_INSTANCE(Scalar, lapack_prefix) \
#define GETRS_BATCHED_INSTANCE(Scalar, type_prefix) \
template <> \
Status CudaSolver::GetrsBatched( \
cublasOperation_t trans, int n, int nrhs, \
const Scalar* host_a_dev_ptrs[], int lda, const int* dev_pivots, \
const Scalar* host_b_dev_ptrs[], int ldb, \
DeviceLapackInfo* dev_lapack_info, int batch_size) const { \
return GetrsBatchedImpl(BLAS_SOLVER_FN(getrsBatched, lapack_prefix), \
return GetrsBatchedImpl(BLAS_SOLVER_FN(getrsBatched, type_prefix), \
context_, cublas_handle_, trans, n, nrhs, \
host_a_dev_ptrs, lda, dev_pivots, host_b_dev_ptrs, \
ldb, dev_lapack_info, batch_size); \
@ -619,13 +617,13 @@ static inline Status GetriBatchedImpl(
return Status::OK();
}
#define GETRI_BATCHED_INSTANCE(Scalar, lapack_prefix) \
#define GETRI_BATCHED_INSTANCE(Scalar, type_prefix) \
template <> \
Status CudaSolver::GetriBatched( \
int n, const Scalar* host_a_dev_ptrs[], int lda, const int* dev_pivots, \
const Scalar* host_a_inv_dev_ptrs[], int ldainv, \
DeviceLapackInfo* dev_lapack_info, int batch_size) const { \
return GetriBatchedImpl(BLAS_SOLVER_FN(getriBatched, lapack_prefix), \
return GetriBatchedImpl(BLAS_SOLVER_FN(getriBatched, type_prefix), \
context_, cublas_handle_, n, host_a_dev_ptrs, lda, \
dev_pivots, host_a_inv_dev_ptrs, ldainv, \
dev_lapack_info, batch_size); \
@ -657,13 +655,13 @@ static inline Status MatInvBatchedImpl(
return Status::OK();
}
#define MATINV_BATCHED_INSTANCE(Scalar, lapack_prefix) \
#define MATINV_BATCHED_INSTANCE(Scalar, type_prefix) \
template <> \
Status CudaSolver::MatInvBatched( \
int n, const Scalar* host_a_dev_ptrs[], int lda, \
const Scalar* host_a_inv_dev_ptrs[], int ldainv, \
DeviceLapackInfo* dev_lapack_info, int batch_size) const { \
return MatInvBatchedImpl(BLAS_SOLVER_FN(matinvBatched, lapack_prefix), \
return MatInvBatchedImpl(BLAS_SOLVER_FN(matinvBatched, type_prefix), \
context_, cublas_handle_, n, host_a_dev_ptrs, \
lda, host_a_inv_dev_ptrs, ldainv, \
dev_lapack_info, batch_size); \

View File

@ -242,32 +242,39 @@ class CudaSolver {
Status Geqrf(int m, int n, Scalar* dev_A, int lda, Scalar* dev_tau,
int* dev_lapack_info) const TF_MUST_USE_RESULT;
// Overwrite matrix C by product of C and Householder matrix Q. The
// Householder matrix Q is represented by the output from Geqrf in dev_a and
// dev_tau.
// Overwrite matrix C by product of C and the unitary Householder matrix Q.
// The Householder matrix Q is represented by the output from Geqrf in dev_a
// and dev_tau.
// Notice: If Scalar is real, only trans=CUBLAS_OP_N or trans=CUBLAS_OP_T is
// supported. If Scalar is complex, trans=CUBLAS_OP_N or trans=CUBLAS_OP_C is
// supported.
// Returns Status::OK() if the kernel was launched successfully.
// See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-ormqr
template <typename Scalar>
Status Ormqr(cublasSideMode_t side, cublasOperation_t trans, int m, int n,
Status Unmqr(cublasSideMode_t side, cublasOperation_t trans, int m, int n,
int k, const Scalar* dev_a, int lda, const Scalar* dev_tau,
Scalar* dev_c, int ldc,
int* dev_lapack_info) const TF_MUST_USE_RESULT;
// Overwrites QR factorization produced by Geqrf by Householder matrix Q.
// On input, the Householder matrix Q is represented by the output from Geqrf
// in dev_a and dev_tau. On output, dev_a is overwritten with the first n
// columns of Q.
// Requires m >= n >= 0.
// Overwrites QR factorization produced by Geqrf by the unitary Householder
// matrix Q. On input, the Householder matrix Q is represented by the output
// from Geqrf in dev_a and dev_tau. On output, dev_a is overwritten with the
// first n columns of Q. Requires m >= n >= 0.
// Returns Status::OK() if the kernel was launched successfully.
// See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-orgqr
template <typename Scalar>
Status Orgqr(int m, int n, int k, Scalar* dev_a, int lda,
Status Ungqr(int m, int n, int k, Scalar* dev_a, int lda,
const Scalar* dev_tau,
int* dev_lapack_info) const TF_MUST_USE_RESULT;
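A hedged reminder of the representation the Geqrf/Unmqr/Ungqr comments refer to (standard LAPACK Householder convention assumed): Geqrf returns $Q$ implicitly as a product of elementary reflectors,

$$Q = H_1 H_2 \cdots H_k, \qquad H_i = I - \tau_i\, v_i v_i^{*},$$

with the vectors $v_i$ stored below the diagonal of `dev_a` and the scalars $\tau_i$ in `dev_tau`; Unmqr applies this $Q$ to another matrix, while Ungqr overwrites `dev_a` with its leading columns.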
// Hermitian (Symmetric) Eigen decomposition.
// See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-syevd
template <typename Scalar>
Status Heevd(cusolverEigMode_t jobz, cublasFillMode_t uplo, int n,
Scalar* dev_A, int lda,
typename Eigen::NumTraits<Scalar>::Real* dev_W,
int* dev_lapack_info) const TF_MUST_USE_RESULT;
// Singular value decomposition.
// Returns Status::OK() if the kernel was launched successfully.
// TODO(rmlarsen, volunteers): Add support for complex types.
@ -277,16 +284,6 @@ class CudaSolver {
int lda, Scalar* dev_S, Scalar* dev_U, int ldu, Scalar* dev_VT,
int ldvt, int* dev_lapack_info) const TF_MUST_USE_RESULT;
/*
TODO(rmlarsen, volunteers): Implement the kernels below.
// Symmetric/Hermitian Eigen decomposition.
// See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-syevd
template <typename Scalar>
Status Syevd(cusolverEigMode_t jobz, cublasFillMode_t uplo, int n, Scalar*
dev_A, int lda, Scalar* dev_W, int* dev_lapack_info) const TF_MUST_USE_RESULT;
*/
private:
OpKernelContext* context_; // not owned.
cudaStream_t cuda_stream_;

View File

@ -159,6 +159,8 @@ namespace functor {
DECLARE_GPU_SPEC_INDEX(T, int64);
TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
TF_CALL_int32(DECLARE_GPU_SPEC);
TF_CALL_int64(DECLARE_GPU_SPEC);
#undef DECLARE_GPU_SPEC_INDEX
#undef DECLARE_GPU_SPEC
@ -180,6 +182,8 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
REGISTER_ONE_HOT_GPU_INDEX(type, int64);
TF_CALL_GPU_NUMBER_TYPES(REGISTER_ONE_HOT_GPU);
TF_CALL_int32(REGISTER_ONE_HOT_GPU);
TF_CALL_int64(REGISTER_ONE_HOT_GPU);
#undef REGISTER_ONE_HOT_GPU_INDEX
#undef REGISTER_ONE_HOT_GPU

View File

@ -37,6 +37,8 @@ typedef Eigen::GpuDevice GPUDevice;
DEFINE_GPU_SPEC_INDEX(T, int64)
TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC);
TF_CALL_int32(DEFINE_GPU_SPEC);
TF_CALL_int64(DEFINE_GPU_SPEC);
#undef DEFINE_GPU_SPEC_INDEX
#undef DEFINE_GPU_SPEC

View File

@ -248,12 +248,12 @@ class QrOpGpu : public AsyncOpKernel {
auto q_reshaped = q->flat_inner_dims<Scalar, 3>();
eye(device, q_reshaped);
for (int batch = 0; batch < batch_size; ++batch) {
// Notice: It appears that Ormqr does not write a zero into *info upon
// Notice: It appears that Unmqr does not write a zero into *info upon
// success (probably a bug), so we simply re-use the info array already
// zeroed by Geqrf above.
OP_REQUIRES_OK_ASYNC(
context,
solver.Ormqr(CUBLAS_SIDE_LEFT, CublasAdjointOp<Scalar>(), m, m,
solver.Unmqr(CUBLAS_SIDE_LEFT, CublasAdjointOp<Scalar>(), m, m,
min_size, &input_transposed_reshaped(batch, 0, 0), m,
&tau_matrix(batch, 0), &q_reshaped(batch, 0, 0), m,
dev_info.back().mutable_data() + batch),
@ -266,12 +266,12 @@ class QrOpGpu : public AsyncOpKernel {
}
} else {
// Generate m x n matrix Q. In this case we can use the more efficient
// algorithm in Orgqr to generate Q in place.
// algorithm in Ungqr to generate Q in place.
dev_info.emplace_back(context, batch_size, "orgqr");
for (int batch = 0; batch < batch_size; ++batch) {
OP_REQUIRES_OK_ASYNC(
context,
solver.Orgqr(
solver.Ungqr(
m, n, min_size, &input_transposed_reshaped(batch, 0, 0), m,
&tau_matrix(batch, 0), dev_info.back().mutable_data() + batch),
done);

View File

@ -0,0 +1,194 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// See docs in ../ops/linalg_ops.cc.
#if GOOGLE_CUDA
#include <numeric>
#include <type_traits>
#define EIGEN_USE_GPU
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/kernels/cast_op.h"
#include "tensorflow/core/kernels/cuda_solvers.h"
#include "tensorflow/core/kernels/cwise_ops.h"
#include "tensorflow/core/kernels/transpose_functor.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
typedef Eigen::GpuDevice GPUDevice;
template <class Scalar>
class SelfAdjointEigV2OpGpu : public AsyncOpKernel {
public:
explicit SelfAdjointEigV2OpGpu(OpKernelConstruction* context)
: AsyncOpKernel(context) {
OP_REQUIRES_OK(context, context->GetAttr("compute_v", &compute_v_));
}
void ComputeAsync(OpKernelContext* context, DoneCallback done) final {
const Tensor& input = context->input(0);
const int ndims = input.dims();
OP_REQUIRES_ASYNC(
context, ndims >= 2,
errors::InvalidArgument("Input must have rank >= 2, got ", ndims),
done);
const int64 n = input.dim_size(ndims - 1);
OP_REQUIRES_ASYNC(
context, input.dim_size(ndims - 2) == n,
errors::InvalidArgument("Input matrices must be squares, got",
input.dim_size(ndims - 2), " != ", n),
done);
const int64 batch_size =
input.template flat_inner_dims<Scalar, 3>().dimension(0);
// Allocate outputs.
Tensor* eigenvalues;
TensorShape eigenvalues_shape = input.shape();
eigenvalues_shape.RemoveLastDims(1);
OP_REQUIRES_OK_ASYNC(
context, context->allocate_output(0, eigenvalues_shape, &eigenvalues),
done);
Tensor* eigenvectors;
TensorShape eigenvectors_shape =
compute_v_ ? input.shape() : TensorShape({});
OP_REQUIRES_OK_ASYNC(
context, context->allocate_output(1, eigenvectors_shape, &eigenvectors),
done);
if (input.NumElements() == 0) {
done();
return;
}
// Allocate workspace.
Tensor eigenvalues_real;
using RealScalar = typename Eigen::NumTraits<Scalar>::Real;
if (std::is_same<Scalar, RealScalar>::value) {
eigenvalues_real = *eigenvalues;
} else {
OP_REQUIRES_OK_ASYNC(
context,
context->allocate_temp(DataTypeToEnum<RealScalar>::value,
eigenvalues_shape, &eigenvalues_real),
done);
}
Tensor input_copy;
OP_REQUIRES_OK_ASYNC(
context,
context->forward_input_or_allocate_temp(
{0}, DataTypeToEnum<Scalar>::value, input.shape(), &input_copy),
done);
// For real symmetric matrices, row-major and column-major are the same. For
// complex Hermitian, row-major and column-major differ by a conjugation,
// which is still cheaper than a transpose.
const GPUDevice& device = context->eigen_device<GPUDevice>();
if (!input.SharesBufferWith(input_copy)) {
if (Eigen::NumTraits<Scalar>::IsComplex) {
functor::UnaryFunctor<GPUDevice, functor::conj<Scalar>> conj;
conj(device, input_copy.flat<Scalar>() /*out*/,
input.flat<Scalar>() /*in*/);
} else {
device.memcpy(input_copy.flat<Scalar>().data(),
input.flat<Scalar>().data(),
input.NumElements() * sizeof(Scalar));
}
} else if (Eigen::NumTraits<Scalar>::IsComplex) {
functor::UnaryFunctor<GPUDevice, functor::conj<Scalar>> conj;
conj(device, const_cast<Tensor*>(&input)->flat<Scalar>() /*out*/,
input.flat<Scalar>() /*in*/);
}
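A hedged one-liner for why conjugation suffices above: cuSOLVER reads column-major data, a row-major buffer holding $A$ is the column-major buffer of $A^{\mathsf T}$, and

$$A = A^{*} \ \implies\ A^{\mathsf T} = \overline{A},$$

so an elementwise conjugate stands in for the required transpose.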
// Compute eigen decomposition in-place in input_copy.
CudaSolver solver(context);
std::vector<DeviceLapackInfo> dev_info;
dev_info.emplace_back(context, batch_size, "heevd");
auto input_copy_reshaped = input_copy.flat_inner_dims<Scalar, 3>();
auto eigenvalues_real_reshaped =
eigenvalues_real.flat_inner_dims<RealScalar, 2>();
for (int batch = 0; batch < batch_size; ++batch) {
OP_REQUIRES_OK_ASYNC(context,
solver.Heevd(compute_v_ ? CUSOLVER_EIG_MODE_VECTOR
: CUSOLVER_EIG_MODE_NOVECTOR,
CUBLAS_FILL_MODE_UPPER, n,
&input_copy_reshaped(batch, 0, 0), n,
&eigenvalues_real_reshaped(batch, 0),
dev_info.back().mutable_data() + batch),
done);
}
if (!std::is_same<Scalar, RealScalar>::value) {
functor::CastFunctor<GPUDevice, Scalar, RealScalar> cast;
cast(device, eigenvalues->flat<Scalar>(),
const_cast<const Tensor*>(&eigenvalues_real)->flat<RealScalar>());
}
if (compute_v_) {
// Transpose eigenvectors now stored in input_copy in column-major form to
// output in row-major form.
std::vector<int> perm(ndims);
std::iota(perm.begin(), perm.end(), 0);
std::swap(perm[ndims - 2], perm[ndims - 1]);
OP_REQUIRES_OK_ASYNC(
context, DoTranspose(device, input_copy, perm, eigenvectors), done);
}
// Asynchronously check return status from cuSolver kernels.
TensorReference input_copy_ref(input_copy);
TensorReference eigenvalues_real_ref(eigenvalues_real);
auto info_checker = [context, dev_info, input_copy_ref,
eigenvalues_real_ref,
done](const Status& status,
const std::vector<HostLapackInfo>& host_infos) {
input_copy_ref.Unref();
eigenvalues_real_ref.Unref();
OP_REQUIRES_OK_ASYNC(context, status, done);
done();
};
OP_REQUIRES_OK_ASYNC(
context,
solver.CopyLapackInfoToHostAsync(dev_info, std::move(info_checker)),
done);
}
private:
bool compute_v_;
TF_DISALLOW_COPY_AND_ASSIGN(SelfAdjointEigV2OpGpu);
};
#define REGISTER(Scalar) \
REGISTER_KERNEL_BUILDER( \
Name("SelfAdjointEigV2").Device(DEVICE_GPU).TypeConstraint<Scalar>("T"), \
(SelfAdjointEigV2OpGpu<Scalar>))
REGISTER(float);
REGISTER(double);
REGISTER(complex64);
REGISTER(complex128);
#undef REGISTER
} // namespace tensorflow
#endif // GOOGLE_CUDA

View File

@ -146,8 +146,8 @@ class FlatMap {
friend class FlatMap;
Bucket* b_;
Bucket* end_;
char space_ alignas(value_type)[sizeof(value_type)];
uint32 i_;
char space_[sizeof(value_type)];
pointer val() { return reinterpret_cast<pointer>(space_); }
void FillValue() { new (space_) value_type(b_->key(i_), b_->val(i_)); }

View File

@ -56,6 +56,9 @@ void ForEach(int first, int last, const std::function<void(int)>& f) {
FileSystem::~FileSystem() {}
string FileSystem::TranslateName(const string& name) const {
// If the name is empty, CleanPath returns "." which is incorrect and
// we should return the empty path instead.
if (name.empty()) return name;
return io::CleanPath(name);
}

View File

@ -62,7 +62,6 @@ cc_library(
],
deps = [
":ctc_loss_util_lib",
"//tensorflow/core:gpu_runtime",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//third_party/eigen3",

View File

@ -198,7 +198,7 @@ You're now all set to visualize this data using TensorBoard.
## Launching TensorBoard
To run TensorBoard, use the following command (alternatively `python -m
tensorflow.tensorboard`)
tensorboard.main`)
```bash
tensorboard --logdir=path/to/log-directory

View File

@ -50,7 +50,6 @@ py_library(
"//tensorflow/tools/quantization:__pkg__", # TODO(b/34059704): remove when fixed
],
deps = [
":tf_optimizer",
":array_ops",
":bitwise_ops",
":check_ops",
@ -63,15 +62,20 @@ py_library(
":framework_for_generated_wrappers",
":functional_ops",
":gradient_checker",
":graph_util",
":histogram_ops",
":image_ops",
":initializers_ns",
":io_ops",
":layers",
":lib",
":linalg_ns",
":math_ops",
":metrics",
":nn",
":ops",
":platform",
":pywrap_tensorflow",
":script_ops",
":session_ops",
":sets",
@ -81,24 +85,24 @@ py_library(
":state_ops",
":string_ops",
":summary",
":metrics",
":layers",
":tensor_array_ops",
":training",
":ops",
":saver_test_utils",
":subscribe",
":test_ops", # TODO: Break testing code out into separate rule.
":tf_optimizer",
":util",
":weights_broadcast_ops",
"//third_party/py/numpy",
"//tensorflow/core:protos_all_py",
"//tensorflow/python/data",
"//tensorflow/python/estimator:estimator_py",
"//tensorflow/python/feature_column:feature_column_py",
"//tensorflow/python/keras",
"//tensorflow/python/ops/losses",
"//tensorflow/python/ops/distributions",
"//tensorflow/python/profiler",
"//tensorflow/python/saved_model",
"//tensorflow/python/keras",
] + if_not_windows([
"//tensorflow/contrib:contrib_py",
]),

View File

@ -0,0 +1,27 @@
package(default_visibility = ["//tensorflow:internal"])
licenses(["notice"]) # Apache 2.0
exports_files(["LICENSE"])
py_library(
name = "data",
srcs = ["__init__.py"],
srcs_version = "PY2AND3",
deps = [
"//tensorflow/python:util",
"//tensorflow/python/data/ops:dataset_ops",
],
)
filegroup(
name = "all_files",
srcs = glob(
["**/*"],
exclude = [
"**/METADATA",
"**/OWNERS",
],
),
visibility = ["//tensorflow:__subpackages__"],
)

View File

@ -0,0 +1,37 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""`tf.data.Dataset` API for input pipelines.
@@Dataset
@@Iterator
@@TFRecordDataset
@@FixedLengthRecordDataset
@@TextLineDataset
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# pylint: disable=unused-import
from tensorflow.python.data.ops.dataset_ops import Dataset
from tensorflow.python.data.ops.dataset_ops import FixedLengthRecordDataset
from tensorflow.python.data.ops.dataset_ops import Iterator
from tensorflow.python.data.ops.dataset_ops import TextLineDataset
from tensorflow.python.data.ops.dataset_ops import TFRecordDataset
# pylint: enable=unused-import
from tensorflow.python.util.all_util import remove_undocumented
remove_undocumented(__name__)
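A hedged usage sketch of the classes re-exported above (TF 1.x graph mode; `numbers.txt` is a hypothetical file, and the package is assumed to be reachable as `tf.data`):

```python
import tensorflow as tf

# Build a pipeline from a hypothetical text file of one number per line.
dataset = tf.data.TextLineDataset("numbers.txt")
dataset = dataset.map(lambda line: tf.string_to_number(line, tf.float32))
dataset = dataset.batch(32)

iterator = dataset.make_one_shot_iterator()  # an Iterator over the Dataset
next_batch = iterator.get_next()
with tf.Session() as sess:
    print(sess.run(next_batch))
```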

View File

@ -0,0 +1,38 @@
package(default_visibility = ["//tensorflow:internal"])
licenses(["notice"]) # Apache 2.0
exports_files(["LICENSE"])
py_library(
name = "dataset_ops",
srcs = ["dataset_ops.py"],
srcs_version = "PY2AND3",
deps = [
"//tensorflow/python:constant_op",
"//tensorflow/python:dataset_ops_gen",
"//tensorflow/python:dtypes",
"//tensorflow/python:framework_ops",
"//tensorflow/python:function",
"//tensorflow/python:math_ops",
"//tensorflow/python:random_seed",
"//tensorflow/python:script_ops",
"//tensorflow/python:sparse_tensor",
"//tensorflow/python:tensor_shape",
"//tensorflow/python:tensor_util",
"//tensorflow/python/data/util:nest",
"//third_party/py/numpy",
],
)
filegroup(
name = "all_files",
srcs = glob(
["**/*"],
exclude = [
"**/METADATA",
"**/OWNERS",
],
),
visibility = ["//tensorflow:__subpackages__"],
)

File diff suppressed because it is too large

View File

@ -13,6 +13,7 @@
# limitations under the License.
# ==============================================================================
# TODO(shivaniagrawal): Merge with core nest
"""## Functions for working with arbitrarily nested sequences of elements.
NOTE(mrry): This fork of the `tensorflow.python.util.nest` module

View File

@ -22,7 +22,7 @@ import collections
import numpy as np
from tensorflow.contrib.data.python.util import nest
from tensorflow.python.data.util import nest
from tensorflow.python.framework import constant_op
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops

View File

@ -203,6 +203,22 @@ class EvalSpec(
throttle_secs=throttle_secs)
class _StopAtSecsHook(session_run_hook.SessionRunHook):
"""Stops given secs after begin is called."""
def __init__(self, stop_after_secs):
self._stop_after_secs = stop_after_secs
self._start_time = None
def begin(self):
self._start_time = time.time()
def after_run(self, run_context, run_values):
del run_values
if time.time() - self._start_time >= self._stop_after_secs:
run_context.request_stop()
class UnimplementedError(Exception):
pass
@ -254,7 +270,38 @@ class _TrainingExecutor(object):
def run_local(self):
"""Runs training and evaluation locally (non-distributed)."""
raise UnimplementedError('Method run_local has not been implemented.')
def _should_stop_local_train(global_step):
if self._train_spec.max_steps is None:
return False
if global_step >= self._train_spec.max_steps:
return True
return False
if self._eval_spec.throttle_secs <= 0:
raise ValueError('eval_spec.throttle_secs should be positive, given: {}. '
'It is used to determine how long each training '
'iteration should go when training and evaluating '
'locally.'.format(self._eval_spec.throttle_secs))
stop_hook = _StopAtSecsHook(self._eval_spec.throttle_secs)
train_hooks = list(self._train_spec.hooks) + [stop_hook]
logging.info('Starting train and evaluate loop. Evaluation will happen '
'after {} secs (eval_spec.throttle_secs) or when training '
'is finished.'.format(self._eval_spec.throttle_secs))
while True:
self._estimator.train(
input_fn=self._train_spec.input_fn,
max_steps=self._train_spec.max_steps,
hooks=train_hooks)
metrics = self._estimator.evaluate(
input_fn=self._eval_spec.input_fn,
steps=self._eval_spec.steps,
hooks=self._eval_spec.hooks,
name=self._eval_spec.name)
if _should_stop_local_train(metrics[ops.GraphKeys.GLOBAL_STEP]):
break
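A hedged sketch of the cadence `run_local` implements, mirroring the tests further below (the `estimator` instance and input functions are hypothetical):

```python
from tensorflow.python.estimator import training

# run_local trains in throttle_secs-bounded bursts, evaluating after each,
# until global_step reaches max_steps.
train_spec = training.TrainSpec(input_fn=train_input_fn, max_steps=1000)
eval_spec = training.EvalSpec(input_fn=eval_input_fn, throttle_secs=60)
executor = training._TrainingExecutor(estimator, train_spec, eval_spec)
executor.run_local()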
def _start_std_server(self, config):
"""Creates, starts, and returns a server_lib.Server."""

View File

@ -27,8 +27,10 @@ from tensorflow.python.estimator import estimator as estimator_lib
from tensorflow.python.estimator import run_config as run_config_lib
from tensorflow.python.estimator import training
from tensorflow.python.framework import ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.platform import test
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training import monitored_session
from tensorflow.python.training import saver
from tensorflow.python.training import server_lib
from tensorflow.python.training import session_run_hook
@ -614,5 +616,102 @@ class TrainingExecutorRunPsTest(test.TestCase):
mock_eval_spec).run_ps()
class StopAtSecsHookTest(test.TestCase):
"""Tests StopAtSecsHook."""
@test.mock.patch.object(time, 'time')
def test_stops_after_time(self, mock_time):
mock_time.return_value = 1484695987.209386
hook = training._StopAtSecsHook(1000)
with ops.Graph().as_default():
no_op = control_flow_ops.no_op()
# some time passed before training starts
mock_time.return_value += 250
with monitored_session.MonitoredSession(hooks=[hook]) as sess:
self.assertFalse(sess.should_stop())
sess.run(no_op)
self.assertFalse(sess.should_stop())
mock_time.return_value += 500
sess.run(no_op)
self.assertFalse(sess.should_stop())
mock_time.return_value += 400
sess.run(no_op)
self.assertFalse(sess.should_stop())
mock_time.return_value += 200
sess.run(no_op)
self.assertTrue(sess.should_stop())
class TrainingExecutorRunLocalTest(test.TestCase):
"""Tests run_local of _TrainingExecutor."""
def test_send_stop_at_secs_to_train(self):
mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
train_spec = training.TrainSpec(
input_fn=lambda: 1, max_steps=2, hooks=[_FakeHook()])
eval_spec = training.EvalSpec(
input_fn=lambda: 1, hooks=[_FakeHook()], throttle_secs=100)
mock_est.evaluate.return_value = {_GLOBAL_STEP_KEY: train_spec.max_steps}
executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
executor.run_local()
stop_hook = mock_est.train.call_args[1]['hooks'][-1]
self.assertIsInstance(stop_hook, training._StopAtSecsHook)
self.assertEqual(eval_spec.throttle_secs, stop_hook._stop_after_secs)
def test_runs_in_a_loop_until_max_steps(self):
mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
train_spec = training.TrainSpec(
input_fn=lambda: 1, max_steps=300, hooks=[_FakeHook()])
eval_spec = training.EvalSpec(
input_fn=lambda: 1, hooks=[_FakeHook()], throttle_secs=100)
# should be called 3 times.
mock_est.evaluate.side_effect = [{
_GLOBAL_STEP_KEY: train_spec.max_steps - 100
}, {
_GLOBAL_STEP_KEY: train_spec.max_steps - 50
}, {
_GLOBAL_STEP_KEY: train_spec.max_steps
}]
executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
executor.run_local()
self.assertEqual(3, mock_est.train.call_count)
self.assertEqual(3, mock_est.evaluate.call_count)
def test_train_and_evaluate_args(self):
mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
train_spec = training.TrainSpec(
input_fn=lambda: 1, max_steps=300, hooks=[_FakeHook()])
eval_spec = training.EvalSpec(
input_fn=lambda: 1, steps=2, hooks=[_FakeHook()], name='local_eval')
mock_est.evaluate.return_value = {_GLOBAL_STEP_KEY: train_spec.max_steps}
executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
executor.run_local()
mock_est.evaluate.assert_called_with(
name=eval_spec.name,
input_fn=eval_spec.input_fn,
steps=eval_spec.steps,
hooks=eval_spec.hooks)
train_args = mock_est.train.call_args[1]
self.assertEqual(list(train_spec.hooks), list(train_args['hooks'][:-1]))
self.assertEqual(train_spec.input_fn, train_args['input_fn'])
self.assertEqual(train_spec.max_steps, train_args['max_steps'])
def test_errors_out_if_throttle_secs_is_zero(self):
mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
train_spec = training.TrainSpec(input_fn=lambda: 1)
eval_spec = training.EvalSpec(input_fn=lambda: 1, throttle_secs=0)
executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
with self.assertRaisesRegexp(ValueError, 'throttle_secs'):
executor.run_local()
if __name__ == '__main__':
test.main()

View File

@ -35,6 +35,7 @@ py_library(
"_impl/keras/engine/__init__.py",
"_impl/keras/engine/topology.py",
"_impl/keras/engine/training.py",
"_impl/keras/estimator.py",
"_impl/keras/initializers.py",
"_impl/keras/layers/__init__.py",
"_impl/keras/layers/advanced_activations.py",
@ -88,6 +89,7 @@ py_library(
"datasets/imdb/__init__.py",
"datasets/mnist/__init__.py",
"datasets/reuters/__init__.py",
"estimator/__init__.py",
"initializers/__init__.py",
"layers/__init__.py",
"losses/__init__.py",
@ -125,9 +127,11 @@ py_library(
"//tensorflow/python:layers_base",
"//tensorflow/python:logging_ops",
"//tensorflow/python:math_ops",
"//tensorflow/python:metrics",
"//tensorflow/python:nn",
"//tensorflow/python:platform",
"//tensorflow/python:random_ops",
"//tensorflow/python:session",
"//tensorflow/python:sparse_ops",
"//tensorflow/python:sparse_tensor",
"//tensorflow/python:state_ops",
@ -139,6 +143,8 @@ py_library(
"//tensorflow/python:util",
"//tensorflow/python:variable_scope",
"//tensorflow/python:variables",
"//tensorflow/python/estimator",
"//tensorflow/python/estimator:model_fn",
"@six_archive//:six",
],
)
@ -656,6 +662,22 @@ py_test(
],
)
py_test(
name = "estimator_test",
size = "medium",
srcs = ["_impl/keras/estimator_test.py"],
srcs_version = "PY2AND3",
deps = [
":keras",
"//tensorflow/python:client_testlib",
"//tensorflow/python:dtypes",
"//tensorflow/python:framework_ops",
"//tensorflow/python:platform",
"//tensorflow/python/estimator:numpy_io",
"//third_party/py/numpy",
],
)
py_test(
name = "backend_test",
size = "small",

View File

@ -29,6 +29,7 @@ from tensorflow.python.keras import backend
from tensorflow.python.keras import callbacks
from tensorflow.python.keras import constraints
from tensorflow.python.keras import datasets
from tensorflow.python.keras import estimator
from tensorflow.python.keras import initializers
from tensorflow.python.keras import layers
from tensorflow.python.keras import losses

View File

@ -25,6 +25,7 @@ from tensorflow.python.keras._impl.keras import callbacks
from tensorflow.python.keras._impl.keras import constraints
from tensorflow.python.keras._impl.keras import datasets
from tensorflow.python.keras._impl.keras import engine
from tensorflow.python.keras._impl.keras import estimator
from tensorflow.python.keras._impl.keras import initializers
from tensorflow.python.keras._impl.keras import layers
from tensorflow.python.keras._impl.keras import losses

View File

@ -57,7 +57,7 @@ the 100 % MobileNet on various input sizes:
The weights for all 16 models are obtained and translated
from Tensorflow checkpoints found at
https://github.com/tensorflow/models/blob/master/slim/nets/mobilenet_v1.md
https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md
# Reference
- [MobileNets: Efficient Convolutional Neural Networks for

View File

@ -373,22 +373,7 @@ def get_session():
session = _SESSION
if not _MANUAL_VAR_INIT:
with session.graph.as_default():
variables = variables_module.global_variables()
candidate_vars = []
for v in variables:
if not getattr(v, '_keras_initialized', False):
candidate_vars.append(v)
# This step is expensive, so we only run it on variables not already
# marked as initialized.
is_initialized = session.run(
[variables_module.is_variable_initialized(v) for v in candidate_vars])
uninitialized_vars = []
for flag, v in zip(is_initialized, candidate_vars):
if not flag:
uninitialized_vars.append(v)
v._keras_initialized = True
if uninitialized_vars:
session.run(variables_module.variables_initializer(uninitialized_vars))
_initialize_variables(session)
return session
@ -556,6 +541,26 @@ def variable(value, dtype=None, name=None, constraint=None):
return v
def _initialize_variables(session):
"""Utility to initialize uninitialized variables on the fly."""
variables = variables_module.global_variables()
candidate_vars = []
for v in variables:
if not getattr(v, '_keras_initialized', False):
candidate_vars.append(v)
# This step is expensive, so we only run it on variables not already
# marked as initialized.
is_initialized = session.run(
[variables_module.is_variable_initialized(v) for v in candidate_vars])
uninitialized_vars = []
for flag, v in zip(is_initialized, candidate_vars):
if not flag:
uninitialized_vars.append(v)
v._keras_initialized = True
if uninitialized_vars:
session.run(variables_module.variables_initializer(uninitialized_vars))
def constant(value, dtype=None, shape=None, name=None):
"""Creates a constant tensor.

View File

@ -0,0 +1,281 @@
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# pylint: disable=protected-access
"""Home of estimator related functions.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.client import session
from tensorflow.python.estimator import estimator as estimator_lib
from tensorflow.python.estimator import model_fn as model_fn_lib
from tensorflow.python.framework import ops
from tensorflow.python.framework import random_seed
from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
from tensorflow.python.keras._impl.keras import backend as K
from tensorflow.python.keras._impl.keras import models
from tensorflow.python.keras._impl.keras.utils.generic_utils import CustomObjectScope
from tensorflow.python.ops import metrics as metrics_module
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training import saver as saver_lib
from tensorflow.python.training import training_util
def _create_ordered_io(keras_model, estimator_io_dict, is_input=True):
"""Create a list of tensors from IO dictionary based on Keras IO order.
Args:
keras_model: an instance of compiled keras model.
estimator_io_dict: features or labels dictionary from model_fn.
is_input: True if dictionary is for inputs.
Returns:
A list of tensors ordered to match the Keras model's IO order.
Raises:
ValueError: if dictionary keys cannot be found in Keras model input_names
or output_names.
"""
if is_input:
keras_io_names = keras_model.input_names
else:
keras_io_names = keras_model.output_names
for key in estimator_io_dict:
if key not in keras_io_names:
raise ValueError(
'Cannot find %s with name "%s" in Keras Model. It needs to match '
'one of the following: %s' % ('input' if is_input else 'output', key,
', '.join(keras_io_names)))
tensors = []
for io_name in keras_io_names:
tensors.append(estimator_io_dict[io_name])
return tensors
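A minimal, pure-Python illustration of the ordering contract above (hypothetical names): the Keras model's IO order wins, not the dict's iteration order.

```python
def order_io(keras_io_names, io_dict):
    # Mirrors the core of _create_ordered_io: iterate Keras names, index dict.
    return [io_dict[name] for name in keras_io_names]

assert order_io(['input_a', 'input_b'],
                {'input_b': 'tb', 'input_a': 'ta'}) == ['ta', 'tb']
```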
def _clone_and_build_model(mode,
keras_model,
custom_objects,
features=None,
labels=None):
"""Clone and build the given keras_model.
Args:
mode: training mode.
keras_model: an instance of compiled keras model.
custom_objects: Dictionary for custom objects.
features: input tensors or a dict of input tensors, keyed by Keras input
names (optional).
labels: target tensors or a dict of target tensors, keyed by Keras output
names (optional).
Returns:
The newly built model.
"""
# Set to True during training, False for inference.
K.set_learning_phase(mode == model_fn_lib.ModeKeys.TRAIN)
# Clone keras model.
input_tensors = None if features is None else _create_ordered_io(
keras_model, features)
if custom_objects:
with CustomObjectScope(custom_objects):
model = models.clone_model(keras_model, input_tensors=input_tensors)
else:
model = models.clone_model(keras_model, input_tensors=input_tensors)
# Compile/Build model
if mode is model_fn_lib.ModeKeys.PREDICT and not model.built:
model.build()
else:
optimizer_config = keras_model.optimizer.get_config()
optimizer = keras_model.optimizer.__class__.from_config(optimizer_config)
optimizer.iterations = training_util.get_or_create_global_step()
# Get list of outputs.
if labels is None:
target_tensors = None
elif isinstance(labels, dict):
target_tensors = _create_ordered_io(keras_model, labels, is_input=False)
else:
target_tensors = [
sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(labels)
]
model.compile(
optimizer,
keras_model.loss,
metrics=keras_model.metrics,
loss_weights=keras_model.loss_weights,
sample_weight_mode=keras_model.sample_weight_mode,
weighted_metrics=keras_model.weighted_metrics,
target_tensors=target_tensors)
if isinstance(model, models.Sequential):
model = model.model
return model
def _create_keras_model_fn(keras_model, custom_objects=None):
"""Creates model_fn for keras Estimator.
Args:
keras_model: an instance of compiled keras model.
custom_objects: Dictionary for custom objects.
Returns:
The model_fn for a keras Estimator.
"""
def model_fn(features, labels, mode):
"""model_fn for keras Estimator."""
model = _clone_and_build_model(mode, keras_model, custom_objects, features,
labels)
# Get inputs to EstimatorSpec
predictions = dict(zip(model.output_names, model.outputs))
loss = None
train_op = None
eval_metric_ops = None
# Set loss and metric only during train and evaluate.
if mode is not model_fn_lib.ModeKeys.PREDICT:
model._make_train_function() # pylint: disable=protected-access
loss = model.total_loss
if model.metrics:
eval_metric_ops = {}
# When each metric maps to an output
if isinstance(model.metrics, dict):
for i, output_name in enumerate(model.metrics.keys()):
metric_name = model.metrics[output_name]
if callable(metric_name):
metric_name = metric_name.__name__
# When some outputs use the same metric
if list(model.metrics.values()).count(metric_name) > 1:
metric_name += '_' + output_name
eval_metric_ops[metric_name] = metrics_module.mean(
model.metrics_tensors[i - len(model.metrics)])
else:
for i, metric_name in enumerate(model.metrics):
if callable(metric_name):
metric_name = metric_name.__name__
eval_metric_ops[metric_name] = metrics_module.mean(
model.metrics_tensors[i])
# Set train_op only during train.
if mode is model_fn_lib.ModeKeys.TRAIN:
train_op = model.train_function.updates_op
return model_fn_lib.EstimatorSpec(
mode=mode,
predictions=predictions,
loss=loss,
train_op=train_op,
eval_metric_ops=eval_metric_ops)
return model_fn
def _save_first_checkpoint(keras_model, estimator, custom_objects,
keras_weights):
"""Save first checkpoint for the keras Estimator.
Args:
keras_model: an instance of compiled keras model.
estimator: keras estimator.
custom_objects: Dictionary for custom objects.
keras_weights: A flat list of Numpy arrays for weights of given keras_model.
"""
"""
with ops.Graph().as_default() as g, g.device(estimator._device_fn):
random_seed.set_random_seed(estimator.config.tf_random_seed)
training_util.create_global_step()
model = _clone_and_build_model(model_fn_lib.ModeKeys.TRAIN, keras_model,
custom_objects)
if isinstance(model, models.Sequential):
model = model.model
# Load weights and save to checkpoint if there is no checkpoint
latest_path = saver_lib.latest_checkpoint(estimator.model_dir)
if not latest_path:
with session.Session() as sess:
model.set_weights(keras_weights)
# Make update ops and initialize all variables.
if not model.train_function:
# pylint: disable=protected-access
model._make_train_function()
K._initialize_variables(sess)
# pylint: enable=protected-access
saver = saver_lib.Saver()
saver.save(sess, estimator.model_dir + '/')
def model_to_estimator(keras_model=None,
keras_model_path=None,
custom_objects=None,
model_dir=None,
config=None):
"""Constructs an `Estimator` instance from given keras model.
Args:
keras_model: Keras model in memory.
keras_model_path: Directory to a keras model on disk.
custom_objects: Dictionary for custom objects.
model_dir: Directory to save Estimator model parameters, graph, etc.
config: Configuration object.
Returns:
An Estimator from given keras model.
Raises:
ValueError: if neither keras_model nor keras_model_path was given.
ValueError: if both keras_model and keras_model_path were given.
ValueError: if the keras_model_path is a GCS URI.
ValueError: if keras_model has not been compiled.
"""
if (not keras_model) and (not keras_model_path):
raise ValueError(
'Either keras_model or keras_model_path needs to be provided.')
if keras_model and keras_model_path:
raise ValueError(
'Please specify either keras_model or keras_model_path but not both.')
if not keras_model:
if keras_model_path.startswith(
'gs://') or 'storage.googleapis.com' in keras_model_path:
raise ValueError(
'%s is not a local path. Please copy the model locally first.' %
keras_model_path)
logging.info('Loading models from %s', keras_model_path)
keras_model = models.load_model(keras_model_path)
else:
logging.info('Using the Keras model from memory.')
if not hasattr(keras_model, 'optimizer'):
raise ValueError(
'Given keras model has not been compiled yet. Please compile first '
'before creating the estimator.')
keras_weights = keras_model.get_weights()
keras_model_fn = _create_keras_model_fn(keras_model, custom_objects)
est = estimator_lib.Estimator(
keras_model_fn, model_dir=model_dir, config=config)
# TODO(yifeif): move checkpoint initialization to scaffold.init_fn
_save_first_checkpoint(keras_model, est, custom_objects, keras_weights)
return est
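A hedged usage sketch, mirroring the tests below (the model architecture and model_dir are hypothetical):

```python
import tempfile

from tensorflow.python.keras._impl import keras

# A compiled Keras model is required before conversion.
model = keras.models.Sequential()
model.add(keras.layers.Dense(1, input_shape=(4,)))
model.compile(optimizer='rmsprop', loss='mse')

est = keras.estimator.model_to_estimator(
    keras_model=model, model_dir=tempfile.mkdtemp())
```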

View File

@ -0,0 +1,392 @@
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for training routines."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from math import log10
import os
import tempfile
import numpy as np
from tensorflow.python.estimator.inputs import numpy_io
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.keras._impl import keras
from tensorflow.python.keras._impl.keras import testing_utils
from tensorflow.python.platform import gfile
from tensorflow.python.platform import test
try:
import h5py # pylint:disable=g-import-not-at-top
except ImportError:
h5py = None
def simple_sequential_model():
model = keras.models.Sequential()
model.add(
keras.layers.Conv2D(
32, kernel_size=(3, 3), activation='relu', input_shape=(14, 14, 3)))
model.add(keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(keras.layers.Dropout(0.25))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(16, activation='relu'))
model.add(keras.layers.Dropout(0.25))
model.add(keras.layers.Dense(3, activation='softmax'))
return model
def simple_functional_model():
a = keras.layers.Input(shape=(14, 14, 3))
b = keras.layers.Conv2D(32, kernel_size=(3, 3), activation='relu')(a)
b = keras.layers.MaxPooling2D(pool_size=(2, 2))(b)
b = keras.layers.Dropout(0.25)(b)
b = keras.layers.Flatten()(b)
b = keras.layers.Dense(16, activation='relu')(b)
b = keras.layers.Dropout(0.25)(b)
b = keras.layers.Dense(3, activation='softmax')(b)
model = keras.models.Model(inputs=[a], outputs=[b])
return model
def get_resource_for_simple_model(is_sequential, is_evaluate):
model = simple_sequential_model(
) if is_sequential else simple_functional_model()
if is_sequential:
model.build()
input_name = model.input_names[0]
np.random.seed(1337)
(x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
train_samples=200,
test_samples=100,
input_shape=(14, 14, 3),
num_classes=3)
y_train = keras.utils.to_categorical(y_train)
y_test = keras.utils.to_categorical(y_test)
train_input_fn = numpy_io.numpy_input_fn(
x={input_name: np.array(x_train, dtype=np.float32)},
y=np.array(y_train, dtype=np.float32),
shuffle=False,
num_epochs=None,
batch_size=16)
evaluate_input_fn = numpy_io.numpy_input_fn(
x={input_name: np.array(x_test, dtype=np.float32)},
y=np.array(y_test, dtype=np.float32),
num_epochs=1,
shuffle=False)
predict_input_fn = numpy_io.numpy_input_fn(
x={input_name: np.array(x_test, dtype=np.float32)},
num_epochs=1,
shuffle=False)
inference_input_fn = evaluate_input_fn if is_evaluate else predict_input_fn
return model, (x_train, y_train), (x_test,
y_test), train_input_fn, inference_input_fn
def multi_inputs_multi_outputs_model():
# test multi-input layer
a = keras.layers.Input(shape=(32,), name='input_a')
b = keras.layers.Input(shape=(32,), name='input_b')
dense = keras.layers.Dense(16, name='dense_1')
a_2 = dense(a)
b_2 = dense(b)
merged = keras.layers.concatenate([a_2, b_2], name='merge')
c = keras.layers.Dense(3, activation='softmax', name='dense_2')(merged)
d = keras.layers.Dense(2, activation='softmax', name='dense_3')(merged)
model = keras.models.Model(inputs=[a, b], outputs=[c, d])
model.compile(
loss='categorical_crossentropy',
optimizer='rmsprop',
metrics={'dense_2': 'accuracy',
'dense_3': 'accuracy'})
return model
class TestKerasEstimator(test.TestCase):
def setUp(self):
self._base_dir = os.path.join(self.get_temp_dir(), 'keras_estimator_test')
gfile.MakeDirs(self._base_dir)
def tearDown(self):
gfile.DeleteRecursively(self._base_dir)
def test_train(self):
for is_sequential in [True, False]:
keras_model, (_, _), (
_, _), train_input_fn, eval_input_fn = get_resource_for_simple_model(
is_sequential=is_sequential, is_evaluate=True)
keras_model.compile(
loss='categorical_crossentropy',
optimizer='rmsprop',
metrics=['accuracy', 'mse', keras.metrics.categorical_accuracy])
with self.test_session():
est_keras = keras.estimator.model_to_estimator(
keras_model=keras_model,
model_dir=tempfile.mkdtemp(dir=self._base_dir))
est_keras.train(input_fn=train_input_fn, steps=200 * 10 / 16)
eval_results = est_keras.evaluate(input_fn=eval_input_fn)
self.assertGreater(eval_results['accuracy'], 0.9)
self.assertGreater(eval_results['categorical_accuracy'], 0.9)
self.assertLess(eval_results['mse'], 0.1)
def test_evaluate(self):
keras_model, (x_train, y_train), (
x_test, y_test), _, eval_input_fn = get_resource_for_simple_model(
is_sequential=False, is_evaluate=True)
with self.test_session():
metrics = [
'binary_accuracy', 'binary_crossentropy', 'categorical_accuracy',
'categorical_crossentropy', 'cosine_proximity', 'hinge',
'kullback_leibler_divergence', 'mean_absolute_error',
'mean_absolute_percentage_error', 'mean_squared_error',
'mean_squared_logarithmic_error', 'poisson', 'squared_hinge',
'top_k_categorical_accuracy'
]
keras_model.compile(
loss='categorical_crossentropy', optimizer='adam', metrics=metrics)
keras_model.fit(x_train, y_train, epochs=1)
keras_eval = keras_model.evaluate(x_test, y_test, batch_size=32)
with self.test_session():
keras_est = keras.estimator.model_to_estimator(
keras_model=keras_model,
model_dir=tempfile.mkdtemp(dir=self._base_dir))
est_eval = keras_est.evaluate(input_fn=eval_input_fn)
metrics = ['loss'] + metrics
# Check loss and all metrics match between keras and estimator.
def shift(val):
return val / 10**int(log10(abs(val)))
for i, metric_name in enumerate(metrics):
self.assertAlmostEqual(
shift(est_eval[metric_name]),
shift(keras_eval[i]),
places=4,
msg='%s mismatch, keras model: %s, estimator: %s' %
(metric_name, est_eval[metric_name], keras_eval[i]))
def test_predict(self):
# Check that predict on a pretrained model yields the same result.
keras_model, (x_train, y_train), (
x_test, _), _, pred_input_fn = get_resource_for_simple_model(
is_sequential=True, is_evaluate=False)
with self.test_session():
keras_model.compile(
loss='categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
keras_model.fit(x_train, y_train, epochs=1)
keras_pred = [np.argmax(y) for y in keras_model.predict(x_test)]
with self.test_session():
keras_est = keras.estimator.model_to_estimator(
keras_model=keras_model,
model_dir=tempfile.mkdtemp(dir=self._base_dir))
est_pred = [
np.argmax(y[keras_model.output_names[0]])
for y in keras_est.predict(input_fn=pred_input_fn)
]
self.assertAllEqual(est_pred, keras_pred)
def test_multi_inputs_multi_outputs(self):
np.random.seed(1337)
(a_train, c_train), (a_test, c_test) = testing_utils.get_test_data(
train_samples=200, test_samples=100, input_shape=(32,), num_classes=3)
(b_train, d_train), (b_test, d_test) = testing_utils.get_test_data(
train_samples=200, test_samples=100, input_shape=(32,), num_classes=2)
c_train = keras.utils.to_categorical(c_train)
c_test = keras.utils.to_categorical(c_test)
d_train = keras.utils.to_categorical(d_train)
d_test = keras.utils.to_categorical(d_test)
def train_input_fn():
input_dict = {
'input_a':
ops.convert_to_tensor(
np.array(a_train, dtype=np.float32), dtype=dtypes.float32),
'input_b':
ops.convert_to_tensor(
np.array(b_train, dtype=np.float32), dtype=dtypes.float32)
}
output_dict = {
'dense_2':
ops.convert_to_tensor(
np.array(c_train, dtype=np.float32), dtype=dtypes.float32),
'dense_3':
ops.convert_to_tensor(
np.array(d_train, dtype=np.float32), dtype=dtypes.float32)
}
return input_dict, output_dict
def evaluate_input_fn():
input_dict = {
'input_a':
ops.convert_to_tensor(
np.array(a_test, dtype=np.float32), dtype=dtypes.float32),
'input_b':
ops.convert_to_tensor(
np.array(b_test, dtype=np.float32), dtype=dtypes.float32)
}
output_dict = {
'dense_2':
ops.convert_to_tensor(
np.array(c_test, dtype=np.float32), dtype=dtypes.float32),
'dense_3':
ops.convert_to_tensor(
np.array(d_test, dtype=np.float32), dtype=dtypes.float32)
}
return input_dict, output_dict
with self.test_session():
model = multi_inputs_multi_outputs_model()
est_keras = keras.estimator.model_to_estimator(
keras_model=model, model_dir=tempfile.mkdtemp(dir=self._base_dir))
est_keras.train(input_fn=train_input_fn, steps=200 * 10 // 16)
eval_results = est_keras.evaluate(input_fn=evaluate_input_fn, steps=1)
self.assertGreater(eval_results['accuracy_dense_2'], 0.5)
self.assertGreater(eval_results['accuracy_dense_3'], 0.5)
def test_init_from_file(self):
if h5py is None:
return # Skip test if models cannot be saved.
keras_model, (x_train, y_train), (
x_test, _), _, pred_input_fn = get_resource_for_simple_model(
is_sequential=False, is_evaluate=False)
with self.test_session():
keras_model.compile(
loss='categorical_crossentropy',
optimizer='rmsprop',
metrics=['accuracy'])
keras_model.fit(x_train, y_train, epochs=1)
keras_pred = [np.argmax(y) for y in keras_model.predict(x_test)]
fname = os.path.join(self._base_dir, 'keras_model.h5')
keras.models.save_model(keras_model, fname)
with self.test_session():
keras_est = keras.estimator.model_to_estimator(
keras_model_path=fname,
model_dir=tempfile.mkdtemp(dir=self._base_dir))
est_pred = [
np.argmax(y[keras_model.output_names[0]])
for y in keras_est.predict(input_fn=pred_input_fn)
]
self.assertAllEqual(est_pred, keras_pred)
def test_keras_model_init_error(self):
with self.assertRaisesRegexp(ValueError, 'Either'):
keras.estimator.model_to_estimator()
with self.test_session():
keras_model = simple_sequential_model()
with self.assertRaisesRegexp(ValueError, 'not both'):
keras.estimator.model_to_estimator(
keras_model=keras_model,
keras_model_path=tempfile.mkdtemp(dir=self._base_dir))
with self.test_session():
keras_model = simple_sequential_model()
with self.assertRaisesRegexp(ValueError, 'compiled'):
keras.estimator.model_to_estimator(keras_model=keras_model)
with self.test_session():
keras_model = simple_sequential_model()
with self.assertRaisesRegexp(ValueError, 'not a local path'):
keras.estimator.model_to_estimator(
keras_model_path='gs://bucket/object')
def test_invalid_ionames_error(self):
np.random.seed(1337)
(x_train, y_train), (_, _) = testing_utils.get_test_data(
train_samples=200, test_samples=100, input_shape=(10,), num_classes=2)
y_train = keras.utils.to_categorical(y_train)
def invalid_input_name_input_fn():
input_dict = {
'invalid_input_name':
ops.convert_to_tensor(
np.array(x_train, dtype=np.float32), dtype=dtypes.float32),
}
output = ops.convert_to_tensor(
np.array(y_train, dtype=np.float32), dtype=dtypes.float32)
return input_dict, output
def invalid_output_name_input_fn():
input_dict = {
'input_1':
ops.convert_to_tensor(
np.array(x_train, dtype=np.float32), dtype=dtypes.float32),
}
output_dict = {
'invalid_output_name':
ops.convert_to_tensor(
np.array(y_train, dtype=np.float32), dtype=dtypes.float32),
}
return input_dict, output_dict
model = simple_functional_model()
model.compile(
loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
est_keras = keras.estimator.model_to_estimator(
keras_model=model, model_dir=tempfile.mkdtemp(dir=self._base_dir))
with self.test_session():
with self.assertRaises(ValueError):
est_keras.train(input_fn=invalid_input_name_input_fn, steps=100)
with self.assertRaises(ValueError):
est_keras.train(input_fn=invalid_output_name_input_fn, steps=100)
def test_custom_objects(self):
keras_model, (_, _), (
_, _), train_input_fn, eval_input_fn = get_resource_for_simple_model(
is_sequential=True, is_evaluate=True)
class CustomOp(keras.optimizers.RMSprop):
pass
def custom_loss(y_true, y_pred):
return keras.losses.categorical_crossentropy(y_true, y_pred)
keras_model.compile(
loss=custom_loss, optimizer=CustomOp(), metrics=['accuracy'])
with self.test_session():
est_keras = keras.estimator.model_to_estimator(
keras_model=keras_model,
model_dir=tempfile.mkdtemp(dir=self._base_dir))
est_keras.train(input_fn=train_input_fn, steps=200 * 10 // 16)
eval_results = est_keras.evaluate(input_fn=eval_input_fn)
self.assertGreater(eval_results['accuracy'], 0.9)
if __name__ == '__main__':
test.main()

View File

@ -417,6 +417,9 @@ class Sequential(Model):
name = prefix + str(K.get_uid(prefix))
self.name = name
# Used by Layer base class.
self._dtype = None
# The following properties are not actually used by Keras;
# they exist for compatibility with TF's variable scoping mechanism.
self._updates = []

View File

@ -0,0 +1,25 @@
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras estimator API."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.keras._impl.keras.estimator import model_to_estimator
del absolute_import
del division
del print_function
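# The `del` statements above are presumably there to strip the __future__
# imports from the module's namespace, so that only `model_to_estimator` is
# visible in the public tensorflow.keras.estimator API.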

View File

@ -29,6 +29,13 @@ from tensorflow.python.ops import math_ops
from tensorflow.python.platform import test
def _AddTest(test_class, op_name, testcase_name, fn):
test_name = "_".join(["test", op_name, testcase_name])
if hasattr(test_class, test_name):
raise RuntimeError("Test %s defined more than once" % test_name)
setattr(test_class, test_name, fn)
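# `_AddTest` attaches a generated test method to `test_class`, failing fast if
# two parameter combinations collide on the same test name.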
class SelfAdjointEigTest(test.TestCase):
def testWrongDimensions(self):
@ -50,28 +57,30 @@ def SortEigenDecomposition(e, v):
return np.take(e, perm, -1), np.take(v, perm, -1)
def NormalizeEigenvectorsPhase(v):
"""Normalizes the phase of the Eigenvectors stored in the columns of `v`.
def EquilibrateEigenVectorPhases(x, y):
"""Equilibrate the phase of the Eigenvectors in the columns of `x` and `y`.
(complex) Eigenvectors are only unique up to an arbitrary phase.
We normalize the vectors such that the first component has phase 0.
Eigenvectors are only unique up to an arbitrary phase. This function rotates x
such that it matches y. Precondition: The columns of x and y differ by a
multiplicative complex phase factor only.
Args:
v: `np.ndarray` with Eigenvectors as returned from `np.linalg.eigh`.
x: `np.ndarray` with Eigenvectors
y: `np.ndarray` with Eigenvectors
Returns:
`np.ndarray` normalized Eigenvectors.
`np.ndarray` containing an equilibrated version of x.
"""
reference = v / np.linalg.norm(v[..., 0:1, :], axis=-1, keepdims=True)
return v * reference.conj()
phases = np.sum(np.conj(x) * y, -2, keepdims=True)
phases /= np.abs(phases)
return phases * x
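# Illustrative sanity check for EquilibrateEigenVectorPhases (a NumPy-only
# sketch, not part of this diff; the helper name is hypothetical): columns of
# x differ from y only by unit complex phases, and the conj(x) * y column
# inner products recover them exactly.
def _CheckEquilibrateEigenVectorPhases():
  rng = np.random.RandomState(0)
  y = np.linalg.qr(rng.randn(4, 4) + 1j * rng.randn(4, 4))[0]  # unitary cols
  x = y * np.exp(1j * rng.uniform(0, 2 * np.pi, size=(1, 4)))  # phase-shifted
  assert np.allclose(EquilibrateEigenVectorPhases(x, y), y)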
def _GetSelfAdjointEigTest(dtype_, shape_, compute_v_):
def CompareEigenVectors(self, x, y, tol):
x = NormalizeEigenvectorsPhase(x)
y = NormalizeEigenvectorsPhase(y)
self.assertAllClose(x, y, atol=tol, rtol=tol)
x = EquilibrateEigenVectorPhases(x, y)
self.assertAllClose(x, y, atol=tol)
def CompareEigenDecompositions(self, x_e, x_v, y_e, y_v, tol):
num_batches = int(np.prod(x_e.shape[:-1]))
@ -103,7 +112,7 @@ def _GetSelfAdjointEigTest(dtype_, shape_, compute_v_):
else:
atol = 1e-12
np_e, np_v = np.linalg.eigh(a)
with self.test_session():
with self.test_session(use_gpu=True):
if compute_v_:
tf_e, tf_v = linalg_ops.self_adjoint_eig(constant_op.constant(a))
@ -152,7 +161,7 @@ def _GetSelfAdjointEigGradTest(dtype_, shape_, compute_v_):
tol = 1e-2
else:
tol = 1e-7
with self.test_session():
with self.test_session(use_gpu=True):
tf_a = constant_op.constant(a)
if compute_v_:
tf_e, tf_v = linalg_ops.self_adjoint_eig(tf_a)
@ -185,17 +194,16 @@ def _GetSelfAdjointEigGradTest(dtype_, shape_, compute_v_):
return Test
if __name__ == '__main__':
for compute_v in [True, False]:
for dtype in (
dtypes_lib.float32, dtypes_lib.float64,
dtypes_lib.complex64, dtypes_lib.complex128):
if __name__ == "__main__":
for compute_v in True, False:
for dtype in (dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.complex64,
dtypes_lib.complex128):
for size in 1, 2, 5, 10:
for batch_dims in [(), (3,)] + [(3, 2)] * (size < 10):
shape = batch_dims + (size, size)
name = '%s_%s_%s' % (dtype, '_'.join(map(str, shape)), compute_v)
setattr(SelfAdjointEigTest, 'testSelfAdjointEig_' + name,
_GetSelfAdjointEigTest(dtype, shape, compute_v))
setattr(SelfAdjointEigGradTest, 'testSelfAdjointEigGrad_' + name,
_GetSelfAdjointEigGradTest(dtype, shape, compute_v))
name = "%s_%s_%s" % (dtype, "_".join(map(str, shape)), compute_v)
_AddTest(SelfAdjointEigTest, "SelfAdjointEig", name,
_GetSelfAdjointEigTest(dtype, shape, compute_v))
_AddTest(SelfAdjointEigGradTest, "SelfAdjointEigGrad", name,
_GetSelfAdjointEigGradTest(dtype, shape, compute_v))
test.main()

View File

@ -57,7 +57,8 @@ class Layer(object):
Properties:
trainable: Whether the layer should be trained (boolean).
name: The name of the layer (string).
dtype: Default dtype of the layer (dtypes.float32).
dtype: Default dtype of the layer (default of None means use the
type of the first input).
trainable_variables: List of trainable variables.
non_trainable_variables: List of non-trainable variables.
variables: List of all variables of this layer, trainable and non-trainable.
@ -68,7 +69,7 @@ class Layer(object):
"""
def __init__(self, trainable=True, name=None,
dtype=dtypes.float32, **kwargs):
dtype=None, **kwargs):
# We use a kwargs dict here because these kwargs only exist
# for compatibility reasons.
# The list of kwargs is subject to changes in the future.
@ -97,7 +98,7 @@ class Layer(object):
self._graph = ops.get_default_graph()
self._per_input_losses = {}
self._per_input_updates = {}
self.dtype = dtypes.as_dtype(dtype).name
self._dtype = None if dtype is None else dtypes.as_dtype(dtype).name
self.input_spec = None
self._compute_previous_mask = ('mask' in estimator_util.fn_args(self.call)
or hasattr(self, 'compute_mask'))
@ -131,6 +132,10 @@ class Layer(object):
batch_size = kwargs.get('batch_size')
self.batch_input_shape = (batch_size,) + tuple(kwargs['input_shape'])
@property
def dtype(self):
return self._dtype
@property
def scope_name(self):
if not self._scope:
@ -389,7 +394,7 @@ class Layer(object):
Arguments:
name: variable name.
shape: variable shape.
dtype: The type of the variable. Defaults to `self.dtype`.
dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
initializer: initializer instance (callable).
regularizer: regularizer instance (callable).
trainable: whether the variable should be part of the layer's
@ -414,7 +419,7 @@ class Layer(object):
raise RuntimeError('Variable regularization not supported in Eager '
'mode.')
if dtype is None:
dtype = self.dtype
dtype = self.dtype or dtypes.float32
self._set_scope(None)
vs_reuse = ((self.built or self._reuse)
@ -526,6 +531,11 @@ class Layer(object):
# Check input assumptions set before layer building, e.g. input rank.
self._assert_input_compatibility(inputs)
input_list = nest.flatten(inputs)
if input_list and self._dtype is None:
try:
self._dtype = input_list[0].dtype.name
except AttributeError:
pass
input_shapes = [x.get_shape() for x in input_list]
if len(input_shapes) == 1:
self.build(input_shapes[0])
@ -1406,8 +1416,8 @@ class Network(Layer):
self.trainable = True
# A Network does not create weights of its own, thus it is already built.
self.built = True
# A Network does not create weights of its own, thus dtype is not settable.
self.dtype = None
# A Network does not create weights of its own, thus has no dtype.
self._dtype = None
# The following are implemented as property functions:
# self.trainable_weights
# self.non_trainable_weights
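# Illustrative sketch of the deferred-dtype behaviour introduced above (uses
# the internal tensorflow.python.layers.core.Dense subclass; not part of this
# diff). A layer constructed with dtype=None adopts the dtype of its first
# input; variable creation falls back to float32 only if no input was seen.
#
#   import tensorflow as tf
#   from tensorflow.python.layers.core import Dense
#   layer = Dense(4)
#   assert layer.dtype is None            # new read-only `dtype` property
#   _ = layer(tf.placeholder(tf.float64, [None, 8]))
#   assert layer.dtype == 'float64'       # inferred from the first input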

View File

@ -35,6 +35,11 @@ class FileIoTest(test.TestCase):
def tearDown(self):
file_io.delete_recursively(self._base_dir)
def testEmptyFilename(self):
f = file_io.FileIO("", mode="r")
with self.assertRaises(errors.NotFoundError):
_ = f.read()
def testFileDoesntExist(self):
file_path = os.path.join(self._base_dir, "temp_file")
self.assertFalse(file_io.file_exists(file_path))

View File

@ -0,0 +1,7 @@
path: "tensorflow.keras.estimator"
tf_module {
member_method {
name: "model_to_estimator"
argspec: "args=[\'keras_model\', \'keras_model_path\', \'custom_objects\', \'model_dir\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
}
}
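A usage sketch for the endpoint recorded above (the model itself is
illustrative; only the `model_to_estimator` signature comes from this argspec,
and the constraints are those exercised by `test_keras_model_init_error`):

  import tensorflow as tf

  model = tf.keras.models.Sequential([
      tf.keras.layers.Dense(16, activation='relu', input_shape=(32,)),
      tf.keras.layers.Dense(3, activation='softmax'),
  ])
  # The model must be compiled before conversion; pass exactly one of
  # `keras_model` / `keras_model_path`, and the path must be local.
  model.compile(loss='categorical_crossentropy', optimizer='rmsprop',
                metrics=['accuracy'])
  est = tf.keras.estimator.model_to_estimator(keras_model=model,
                                              model_dir='/tmp/keras_est')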

View File

@ -4,6 +4,10 @@ tf_class {
is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
is_instance: "<type \'object\'>"
member {
name: "dtype"
mtype: "<type \'property\'>"
}
member {
name: "graph"
mtype: "<type \'property\'>"

View File

@ -4,6 +4,10 @@ tf_class {
is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
is_instance: "<type \'object\'>"
member {
name: "dtype"
mtype: "<type \'property\'>"
}
member {
name: "graph"
mtype: "<type \'property\'>"

View File

@ -5,6 +5,10 @@ tf_class {
is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
is_instance: "<type \'object\'>"
member {
name: "dtype"
mtype: "<type \'property\'>"
}
member {
name: "graph"
mtype: "<type \'property\'>"

View File

@ -4,6 +4,10 @@ tf_class {
is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
is_instance: "<type \'object\'>"
member {
name: "dtype"
mtype: "<type \'property\'>"
}
member {
name: "graph"
mtype: "<type \'property\'>"

View File

@ -6,6 +6,10 @@ tf_class {
is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
is_instance: "<type \'object\'>"
member {
name: "dtype"
mtype: "<type \'property\'>"
}
member {
name: "graph"
mtype: "<type \'property\'>"

View File

@ -6,6 +6,10 @@ tf_class {
is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
is_instance: "<type \'object\'>"
member {
name: "dtype"
mtype: "<type \'property\'>"
}
member {
name: "graph"
mtype: "<type \'property\'>"

View File

@ -6,6 +6,10 @@ tf_class {
is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
is_instance: "<type \'object\'>"
member {
name: "dtype"
mtype: "<type \'property\'>"
}
member {
name: "graph"
mtype: "<type \'property\'>"

View File

@ -5,6 +5,10 @@ tf_class {
is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
is_instance: "<type \'object\'>"
member {
name: "dtype"
mtype: "<type \'property\'>"
}
member {
name: "graph"
mtype: "<type \'property\'>"

View File

@ -6,6 +6,10 @@ tf_class {
is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
is_instance: "<type \'object\'>"
member {
name: "dtype"
mtype: "<type \'property\'>"
}
member {
name: "graph"
mtype: "<type \'property\'>"

View File

@ -6,6 +6,10 @@ tf_class {
is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
is_instance: "<type \'object\'>"
member {
name: "dtype"
mtype: "<type \'property\'>"
}
member {
name: "graph"
mtype: "<type \'property\'>"

View File

@ -6,6 +6,10 @@ tf_class {
is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
is_instance: "<type \'object\'>"
member {
name: "dtype"
mtype: "<type \'property\'>"
}
member {
name: "graph"
mtype: "<type \'property\'>"

View File

@ -5,6 +5,10 @@ tf_class {
is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
is_instance: "<type \'object\'>"
member {
name: "dtype"
mtype: "<type \'property\'>"
}
member {
name: "graph"
mtype: "<type \'property\'>"

View File

@ -13,6 +13,10 @@ tf_class {
name: "constraints"
mtype: "<type \'property\'>"
}
member {
name: "dtype"
mtype: "<type \'property\'>"
}
member {
name: "graph"
mtype: "<type \'property\'>"

View File

@ -5,6 +5,10 @@ tf_class {
is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
is_instance: "<type \'object\'>"
member {
name: "dtype"
mtype: "<type \'property\'>"
}
member {
name: "graph"
mtype: "<type \'property\'>"

View File

@ -6,6 +6,10 @@ tf_class {
is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
is_instance: "<type \'object\'>"
member {
name: "dtype"
mtype: "<type \'property\'>"
}
member {
name: "graph"
mtype: "<type \'property\'>"

Some files were not shown because too many files have changed in this diff.