Merge commit for internal changes

2017-05-08 19:31:28 -07:00 · 2017-05-08 19:31:28 -07:00 · 78fe1944d9
commit 78fe1944d9
parent 7fa0cf39f8 697f34ca82
129 changed files with 8280 additions and 1676 deletions
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@ -342,6 +342,7 @@ filegroup(
        "//tensorflow/tensorboard/components/tf_globals:all_files",
        "//tensorflow/tensorboard/components/tf_globals_d3v4:all_files",
        "//tensorflow/tensorboard/components/tf_graph_common:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_loader:all_files",
        "//tensorflow/tensorboard/components/tf_histogram_dashboard:all_files",
        "//tensorflow/tensorboard/components/tf_histogram_dashboard/demo:all_files",
        "//tensorflow/tensorboard/components/tf_image_dashboard:all_files",
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@ -73,7 +73,7 @@ cc_library(
        "//tensorflow/compiler/xla:util",
        "//tensorflow/compiler/xla:xla_data_proto",
        "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:compile_only_client",
        "//tensorflow/compiler/xla/service:compiler",
        "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
        "//tensorflow/core:core_cpu",
--- a/tensorflow/compiler/aot/compile.cc
+++ b/tensorflow/compiler/aot/compile.cc
@ -27,7 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/compile_only_client.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@ -274,7 +274,8 @@ Status CreateXlaArgs(const Graph& graph,

 // Converts the TensorFlow graph into an XLA computation, by executing the
 // graph symbolically, with each op building up the XLA HLO.
-Status ConvertGraphToXla(xla::LocalClient* client, std::unique_ptr<Graph> graph,
+Status ConvertGraphToXla(xla::CompileOnlyClient* client,
+                         std::unique_ptr<Graph> graph,
                         xla::Computation* computation, bool* has_context_arg) {
  // Create a device and context to convert the graph into an XLA computation.
  XlaOpRegistry::RegisterCompilationKernels();
@ -333,7 +334,8 @@ Status ConvertGraphToXla(xla::LocalClient* client, std::unique_ptr<Graph> graph,
 }

 // Compiles the XLA computation into executable code.
-Status CompileXla(xla::LocalClient* client, const xla::Computation& computation,
+Status CompileXla(xla::CompileOnlyClient* client,
+                  const xla::Computation& computation,
                  const xla::cpu::CpuAotCompilationOptions& aot_opts,
                  CompileResult* compile_result) {
  // Retrieves arg and result layouts from the computation.
@ -350,7 +352,7 @@ Status CompileXla(xla::LocalClient* client, const xla::Computation& computation,
  for (int i = 0; i < pshape->parameters_size(); ++i) {
    arg_layouts.push_back(pshape->mutable_parameters(i));
  }
-  xla::LocalClient::AheadOfTimeComputationInstance instance;
+  xla::CompileOnlyClient::AotComputationInstance instance;
  instance.computation = &computation;
  instance.argument_layouts = std::move(arg_layouts);
  instance.result_layout = &pshape->result();
@ -365,7 +367,7 @@ Status CompileXla(xla::LocalClient* client, const xla::Computation& computation,
          std::move(aot_or.ValueOrDie().back()));
  compile_result->entry_point = aot_opts.entry_point_name();
  compile_result->pointer_size =
-      xla::LocalClient::PointerSizeForTriple(aot_opts.triple());
+      xla::CompileOnlyClient::PointerSizeForTriple(aot_opts.triple());
  return Status::OK();
 }

@ -394,8 +396,9 @@ Status CompileGraph(std::unique_ptr<Graph> graph, const MainFlags& flags,
  namespace gpu = perftools::gputools;
  gpu::Platform* cpu_platform =
      gpu::MultiPlatformManager::PlatformWithName("Host").ValueOrDie();
-  xla::LocalClient* client =
-      xla::ClientLibrary::GetOrCreateLocalClient(cpu_platform).ValueOrDie();
+  xla::CompileOnlyClient* client =
+      xla::ClientLibrary::GetOrCreateCompileOnlyClient(cpu_platform)
+          .ValueOrDie();
  xla::Computation computation;
  TF_RETURN_IF_ERROR(ConvertGraphToXla(client, std::move(graph), &computation,
                                       &compile_result->has_context_arg));
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@ -99,6 +99,26 @@ cc_library(
    ],
 )

+cc_library(
+    name = "compile_only_client",
+    srcs = ["compile_only_client.cc"],
+    hdrs = ["compile_only_client.h"],
+    deps = [
+        ":client",
+        ":computation",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:compile_only_service",
+        "//tensorflow/compiler/xla/service:compiler",
+        "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+        "@llvm//:support",
+    ],
+)
+
 # This target is used to instantiate the XLA service in-process and create
 # a client for it.
 cc_library(
@ -106,12 +126,14 @@ cc_library(
    srcs = ["client_library.cc"],
    hdrs = ["client_library.h"],
    deps = [
+        ":compile_only_client",
        ":local_client",
        "//tensorflow/compiler/xla:status_macros",
        "//tensorflow/compiler/xla:statusor",
        "//tensorflow/compiler/xla:types",
        "//tensorflow/compiler/xla:util",
        "//tensorflow/compiler/xla/service:backend",
+        "//tensorflow/compiler/xla/service:compile_only_service",
        "//tensorflow/compiler/xla/service:device_memory_allocator",
        "//tensorflow/compiler/xla/service:local_service",
        "//tensorflow/compiler/xla/service:platform_util",
--- a/tensorflow/compiler/xla/client/client_library.cc
+++ b/tensorflow/compiler/xla/client/client_library.cc
@ -69,8 +69,8 @@ ClientLibrary::~ClientLibrary() = default;
    TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
  }

-  auto it = client_library.instances_.find(platform->id());
-  if (it != client_library.instances_.end()) {
+  auto it = client_library.local_instances_.find(platform->id());
+  if (it != client_library.local_instances_.end()) {
    return it->second->client.get();
  }

@ -78,13 +78,13 @@ ClientLibrary::~ClientLibrary() = default;
  service_options.set_platform(platform);
  service_options.set_number_of_replicas(replica_count);

-  std::unique_ptr<LocalInstance> instance = MakeUnique<LocalInstance>();
+  auto instance = MakeUnique<LocalInstance>();
  TF_ASSIGN_OR_RETURN(instance->service,
                      LocalService::NewService(service_options));
  instance->client = MakeUnique<LocalClient>(instance->service.get());
  LocalClient* cl = instance->client.get();

-  client_library.instances_.insert(
+  client_library.local_instances_.insert(
      std::make_pair(platform->id(), std::move(instance)));
  return cl;
 }
@ -99,9 +99,35 @@ ClientLibrary::~ClientLibrary() = default;
    perftools::gputools::Platform* platform) {
  ClientLibrary& client_library = Singleton();
  tensorflow::mutex_lock lock(client_library.service_mutex_);
-  auto it = client_library.instances_.find(platform->id());
-  CHECK(it != client_library.instances_.end());
+  auto it = client_library.local_instances_.find(platform->id());
+  CHECK(it != client_library.local_instances_.end());
  return it->second->service.get();
 }

+/* static */ StatusOr<CompileOnlyClient*>
+ClientLibrary::GetOrCreateCompileOnlyClient(
+    perftools::gputools::Platform* platform) {
+  ClientLibrary& client_library = Singleton();
+  tensorflow::mutex_lock lock(client_library.service_mutex_);
+
+  if (platform == nullptr) {
+    TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
+  }
+
+  auto it = client_library.compile_only_instances_.find(platform->id());
+  if (it != client_library.compile_only_instances_.end()) {
+    return it->second->client.get();
+  }
+
+  auto instance = MakeUnique<CompileOnlyInstance>();
+  TF_ASSIGN_OR_RETURN(instance->service,
+                      CompileOnlyService::NewService(platform));
+  instance->client = MakeUnique<CompileOnlyClient>(instance->service.get());
+  CompileOnlyClient* cl = instance->client.get();
+
+  client_library.compile_only_instances_.insert(
+      std::make_pair(platform->id(), std::move(instance)));
+  return cl;
+}
+
 }  // namespace xla
--- a/tensorflow/compiler/xla/client/client_library.h
+++ b/tensorflow/compiler/xla/client/client_library.h
@ -26,7 +26,9 @@ limitations under the License.
 #include <string>
 #include <vector>

+#include "tensorflow/compiler/xla/client/compile_only_client.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/service/compile_only_service.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/statusor.h"
@ -76,6 +78,13 @@ class ClientLibrary {
  // access user computations from client.
  static LocalService* GetXlaService(perftools::gputools::Platform* platform);

+  // Singleton constructor-or-accessor for compile-only clients. Arguments:
+  //
+  //   platform : The platform the underlying XLA service should target. If
+  //     null then default platform is used.
+  static StatusOr<CompileOnlyClient*> GetOrCreateCompileOnlyClient(
+      perftools::gputools::Platform* platform = nullptr);
+
 private:
  // Returns the singleton instance of ClientLibrary.
  static ClientLibrary& Singleton();
@ -90,10 +99,21 @@ class ClientLibrary {
    std::unique_ptr<LocalClient> client;
  };

+  struct CompileOnlyInstance {
+    // Service that is wrapped by the singleton client object.
+    std::unique_ptr<CompileOnlyService> service;
+    // Singleton client object.
+    std::unique_ptr<CompileOnlyClient> client;
+  };
+
  tensorflow::mutex service_mutex_;  // Guards the singleton creation state.
  std::unordered_map<perftools::gputools::Platform::Id,
                     std::unique_ptr<LocalInstance>>
-      instances_ GUARDED_BY(service_mutex_);
+      local_instances_ GUARDED_BY(service_mutex_);
+
+  std::unordered_map<perftools::gputools::Platform::Id,
+                     std::unique_ptr<CompileOnlyInstance>>
+      compile_only_instances_ GUARDED_BY(service_mutex_);

  TF_DISALLOW_COPY_AND_ASSIGN(ClientLibrary);
 };
--- a/tensorflow/compiler/xla/client/compile_only_client.cc
+++ b/tensorflow/compiler/xla/client/compile_only_client.cc
@ -0,0 +1,59 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/compile_only_client.h"
+
+#include "external/llvm/include/llvm/ADT/Triple.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+
+namespace se = ::perftools::gputools;
+
+namespace xla {
+
+StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+CompileOnlyClient::CompileAheadOfTime(
+    const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
+    const AotCompilationOptions& options) {
+  std::vector<CompileOnlyService::AotComputationInstance> service_instances;
+  service_instances.reserve(computations.size());
+  for (const AotComputationInstance& instance : computations) {
+    service_instances.push_back({});
+    CompileOnlyService::AotComputationInstance& service_instance =
+        service_instances.back();
+    TF_RET_CHECK(instance.computation != nullptr);
+    service_instance.computation = instance.computation->handle();
+    service_instance.argument_layouts = instance.argument_layouts;
+    service_instance.result_layout = instance.result_layout;
+  }
+  return compiler_service_->CompileAheadOfTime(service_instances, options);
+}
+
+int64 CompileOnlyClient::PointerSizeForTriple(
+    tensorflow::StringPiece target_triple) {
+  llvm::Triple triple(
+      llvm::Triple::normalize(llvm_ir::AsStringRef(target_triple)));
+  if (triple.isArch64Bit()) {
+    return 8;
+  } else if (triple.isArch32Bit()) {
+    return 4;
+  } else {
+    CHECK(triple.isArch16Bit());
+    return 2;
+  }
+}
+
+}  // namespace xla
--- a/tensorflow/compiler/xla/client/compile_only_client.h
+++ b/tensorflow/compiler/xla/client/compile_only_client.h
@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_
+
+#include "tensorflow/compiler/xla/client/client.h"
+#include "tensorflow/compiler/xla/client/computation.h"
+#include "tensorflow/compiler/xla/service/compile_only_service.h"
+#include "tensorflow/compiler/xla/service/compiler.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+
+// An XLA Client specialization for doing ahead-of-time compilation.  This does
+// not require (or attempt to instantiate) an execution-capable backend for the
+// relevant platform.
+class CompileOnlyClient : public Client {
+ public:
+  explicit CompileOnlyClient(CompileOnlyService* service)
+      : Client(service), compiler_service_(service) {}
+
+  CompileOnlyClient(const CompileOnlyClient&) = delete;
+  void operator=(const CompileOnlyClient&) = delete;
+
+  // A description of a computation to compile using CompileAheadOfTime.
+  struct AotComputationInstance {
+    const Computation* computation;
+    // Inform the compiler of the expected layout for arguments.
+    std::vector<const Shape*> argument_layouts;
+    // Specifies the expected result layout.
+    const Shape* result_layout;
+  };
+
+  // Compiles a list of computations for ahead-of-time execution.  This is
+  // intended for use in static compilation. The |options| parameter describes
+  // the target for which the compiler should emit code.
+  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+  CompileAheadOfTime(
+      const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
+      const AotCompilationOptions& options);
+
+  // Returns the size of a pointer in bytes for a given triple.
+  static int64 PointerSizeForTriple(tensorflow::StringPiece triple);
+
+ private:
+  CompileOnlyService* compiler_service_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_
--- a/tensorflow/compiler/xla/client/global_data.h
+++ b/tensorflow/compiler/xla/client/global_data.h
@ -23,13 +23,15 @@ limitations under the License.

 namespace xla {

-// Wraps a GlobalDataHandle with a lifetime.
+// A GlobalData object represents a globally-accessible allocation of
+// data in the associated XLA service.
 class GlobalData {
 public:
  // Gives ownership of the global data handle to this object.
  GlobalData(ServiceInterface* parent, GlobalDataHandle handle);

-  // Unregisters the wrapped handle.
+  // Unregisters the wrapped handle, which causes the service to
+  // deallocate the associated data.
  ~GlobalData();

  const GlobalDataHandle& handle() const { return handle_; }
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@ -176,10 +176,10 @@ StatusOr<std::unique_ptr<ShapedBuffer>> LocalExecutable::Run(
  TF_RETURN_IF_ERROR(ValidateExecutionOptions(arguments, options, *backend_));

  ExecutableRunOptions actual_options = options;
-  Backend::StreamPtr stream;
  if (options.stream() == nullptr) {
    TF_ASSIGN_OR_RETURN(
-        stream, BorrowStreamForDevice(options.device_ordinal(), backend_));
+        Backend::StreamPtr stream,
+        BorrowStreamForDevice(options.device_ordinal(), backend_));
    actual_options.set_stream(stream.get());
  }
  if (options.allocator() == nullptr) {
@ -261,38 +261,6 @@ tensorflow::Status LocalClient::ResolveArguments(
                                          argument_ptrs);
 }

-StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-LocalClient::CompileAheadOfTime(
-    const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
-        computations,
-    const AotCompilationOptions& options) {
-  std::vector<LocalService::AheadOfTimeComputationInstance> service_instances;
-  service_instances.reserve(computations.size());
-  for (const AheadOfTimeComputationInstance& instance : computations) {
-    service_instances.push_back({});
-    LocalService::AheadOfTimeComputationInstance& service_instance =
-        service_instances.back();
-    TF_RET_CHECK(instance.computation != nullptr);
-    service_instance.computation = instance.computation->handle();
-    service_instance.argument_layouts = instance.argument_layouts;
-    service_instance.result_layout = instance.result_layout;
-  }
-  return local_service_->CompileAheadOfTime(service_instances, options);
-}
-
-int64 LocalClient::PointerSizeForTriple(tensorflow::StringPiece target_triple) {
-  llvm::Triple triple(
-      llvm::Triple::normalize(llvm_ir::AsStringRef(target_triple)));
-  if (triple.isArch64Bit()) {
-    return 8;
-  } else if (triple.isArch32Bit()) {
-    return 4;
-  } else {
-    CHECK(triple.isArch16Bit());
-    return 2;
-  }
-}
-
 se::Platform* LocalClient::platform() const {
  return local_service_->backend().platform();
 }
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@ -148,7 +148,7 @@ class LocalExecutable {
  const ExecutableBuildOptions& build_options_;
 };

-// An XLA service client object for use when the client and service run in
+// An XLA Client specialization for use when the client and service run in
 // the same process.
 class LocalClient : public Client {
 public:
@ -182,30 +182,6 @@ class LocalClient : public Client {
      const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
      const ExecutableBuildOptions& options);

-  // A description of a computation to compile using CompileAheadOfTime.
-  struct AheadOfTimeComputationInstance {
-    const Computation* computation;
-    // Inform the compiler of the expected layout for arguments.
-    std::vector<const Shape*> argument_layouts;
-    // Specifies the expected result layout.
-    const Shape* result_layout;
-  };
-
-  // Compiles a list of computations for ahead-of-time execution.  This is
-  // intended for use in static compilation. The |options| parameter describes
-  // the target for which the compiler should emit code.
-  //
-  // TODO(b/31222190): This doesn't really belong in LocalClient. Move it to its
-  // own library.
-  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-  CompileAheadOfTime(
-      const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
-          computations,
-      const AotCompilationOptions& options);
-
-  // Returns the size of a pointer in bytes for a given triple.
-  static int64 PointerSizeForTriple(tensorflow::StringPiece triple);
-
  // Returns the platform that the underlying service targets.
  perftools::gputools::Platform* platform() const;

--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"

 #include <algorithm>
+#include <cstring>
 #include <functional>
 #include <limits>
 #include <numeric>
@ -308,37 +309,16 @@ template <typename T, typename WT>

 /* static */ std::unique_ptr<Literal> LiteralUtil::Relayout(
    const Literal& original, const Layout& layout) {
-  // Note: if this were a performance bottleneck, we avoid cloning and just make
-  // an uninitialized array instead, since all values are clobbered below.
  std::unique_ptr<Literal> result = CloneToUnique(original);
  *result->mutable_shape()->mutable_layout() = layout;
-  const PrimitiveType primitive_type = original.shape().element_type();
-  switch (primitive_type) {
-    case F32:
-      LiteralUtil::EachCell<float>(
-          original,
-          [&](tensorflow::gtl::ArraySlice<int64> indices, float value) {
-            LiteralUtil::Set<float>(result.get(), indices, value);
-          });
-      return result;
-    case S32:
-      LiteralUtil::EachCell<int32>(
-          original,
-          [&](tensorflow::gtl::ArraySlice<int64> indices, int32 value) {
-            LiteralUtil::Set<int32>(result.get(), indices, value);
-          });
-      return result;
-    case U32:
-      LiteralUtil::EachCell<uint32>(
-          original,
-          [&](tensorflow::gtl::ArraySlice<int64> indices, uint32 value) {
-            LiteralUtil::Set<uint32>(result.get(), indices, value);
-          });
-      return result;
-    default:
-      LOG(FATAL) << "not yet implemented: "
-                 << PrimitiveType_Name(primitive_type);
-  }
+
+  const Shape& shape = original.shape();
+  std::vector<int64> base(ShapeUtil::Rank(shape), 0);
+  std::vector<int64> copy_size(shape.dimensions().begin(),
+                               shape.dimensions().end());
+
+  TF_CHECK_OK(Copy(original, base, result.get(), base, copy_size));
+  return result;
 }

 /* static */ StatusOr<std::unique_ptr<Literal>> LiteralUtil::Reshape(
@ -346,25 +326,19 @@ template <typename T, typename WT>
  if (ShapeUtil::IsTuple(input.shape())) {
    return InvalidArgument("Reshape does not support tuples.");
  }
-
+  std::unique_ptr<Literal> output;
  if (!LayoutUtil::IsMonotonicWithDim0Major(input.shape().layout())) {
-    return Unimplemented(
-        "Input shape must have a monotonic layout where dimension 0 is major, "
-        "was: %s",
-        LayoutUtil::HumanString(input.shape().layout()).c_str());
+    std::vector<int64> minor_to_major(ShapeUtil::Rank(input.shape()));
+    std::iota(minor_to_major.rbegin(), minor_to_major.rend(),
+              static_cast<int64>(0));
+    output = Relayout(input, LayoutUtil::MakeLayout(minor_to_major));
+  } else {
+    output = CloneToUnique(input);
  }
-  std::vector<int64> layout(dimensions.size());
-  std::iota(layout.rbegin(), layout.rend(), 0);
-
  // Because the layout is monotonic, we can simply reuse the same sequence of
  // values without changing their order.
-  std::unique_ptr<Literal> output = CloneToUnique(input);
-  output->clear_shape();
-  output->mutable_shape()->set_element_type(input.shape().element_type());
-  for (int64 dimension : dimensions) {
-    output->mutable_shape()->add_dimensions(dimension);
-  }
-  *output->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout(layout);
+  *output->mutable_shape() =
+      ShapeUtil::MakeShape(input.shape().element_type(), dimensions);

  int64 elements_before = ShapeUtil::ElementsIn(input.shape());
  int64 elements_after = ShapeUtil::ElementsIn(output->shape());
@ -378,73 +352,42 @@ template <typename T, typename WT>
  return std::move(output);
 }

-namespace {
-
-template <class T>
-void TransposeLiteralInternal(const Literal& original,
-                              tensorflow::gtl::ArraySlice<int64> permutation,
-                              Literal* result) {
-  std::vector<int64> new_indices(ShapeUtil::Rank(original.shape()));
-  LiteralUtil::EachCell<T>(
-      original, [&](tensorflow::gtl::ArraySlice<int64> indices, T value) {
-        for (int64 i = 0; i < indices.size(); ++i) {
-          new_indices[i] = indices[permutation[i]];
-        }
-        LiteralUtil::Set<T>(result, new_indices, value);
-      });
-}
-}  // namespace
-
 /* static */ std::unique_ptr<Literal> LiteralUtil::Transpose(
    const Literal& original, tensorflow::gtl::ArraySlice<int64> permutation) {
  CHECK(!ShapeUtil::IsTuple(original.shape()))
-      << "tuple is not supported for transpose";
-  std::vector<int64> dimension_numbers(ShapeUtil::Rank(original.shape()));
-  std::iota(dimension_numbers.begin(), dimension_numbers.end(), 0);
-  CHECK(std::is_permutation(permutation.begin(), permutation.end(),
-                            dimension_numbers.begin()))
-      << "given permutation is not a permutation of dimension numbers";
-  std::vector<int64> new_dimension_sizes;
-  for (const int64 dim : permutation) {
-    new_dimension_sizes.push_back(original.shape().dimensions(dim));
-  }
-  const auto result_shape = ShapeUtil::MakeShape(
-      original.shape().element_type(), new_dimension_sizes);
-  std::unique_ptr<Literal> result = CloneToUnique(original);
-  *result->mutable_shape() = result_shape;
-  const PrimitiveType primitive_type = original.shape().element_type();
-  switch (primitive_type) {
-    case F32:
-      TransposeLiteralInternal<float>(original, permutation, result.get());
-      return result;
-    case F64:
-      TransposeLiteralInternal<double>(original, permutation, result.get());
-      return result;
-    case PRED:
-      TransposeLiteralInternal<bool>(original, permutation, result.get());
-      return result;
-    case S8:
-      TransposeLiteralInternal<int8>(original, permutation, result.get());
-      return result;
-    case U8:
-      TransposeLiteralInternal<uint8>(original, permutation, result.get());
-      return result;
-    case S32:
-      TransposeLiteralInternal<int32>(original, permutation, result.get());
-      return result;
-    case U32:
-      TransposeLiteralInternal<uint32>(original, permutation, result.get());
-      return result;
-    case S64:
-      TransposeLiteralInternal<int64>(original, permutation, result.get());
-      return result;
-    case U64:
-      TransposeLiteralInternal<uint64>(original, permutation, result.get());
-      return result;
-    default:
-      LOG(FATAL) << "not yet implemented: "
-                 << PrimitiveType_Name(primitive_type);
+      << "Tuple is not supported for transpose";
+  CHECK(IsPermutation(permutation, ShapeUtil::Rank(original.shape())))
+      << "Given permutation is not a permutation of dimension numbers";
+  // To transpose the array, we just permute the dimensions and layout, and
+  // do a straight memory copy of the raw data set.
+  // This is considerably faster than iterating over every array element using
+  // the EachCell<>() and Set<>() APIs.
+  std::vector<int64> inverse_permutation = InversePermutation(permutation);
+  Shape shape =
+      ShapeUtil::PermuteDimensions(inverse_permutation, original.shape());
+  // Replace the layout with one affine to the original shape, such that a
+  // transpose operation can be performed by leaving the flat values
+  // representation intact.
+  // For example, consider the shape F32[11,8]{1,0} under a {1,0} permutation.
+  // The shape with affine layout resulting from that operation will be
+  // F32[8,11]{0,1}, since it leave the original most minor (the 8 sized), the
+  // most minor.
+  // Essentially, given MinMaj(Di) the position of the Di dimension within the
+  // minor to major vector, and given T(Di) the index that the original Di
+  // dimension has within the transposed array, a layout is affine if
+  // MinMaj(Di) == TMinMaj(T(Di)), with TMinMaj() being the minor to major
+  // vector of the affine layout.
+  Layout* layout = shape.mutable_layout();
+  layout->clear_minor_to_major();
+  for (auto index : original.shape().layout().minor_to_major()) {
+    layout->add_minor_to_major(inverse_permutation[index]);
  }
+  std::unique_ptr<Literal> new_literal = CreateFromShape(shape);
+  DCHECK_GE(ShapeUtil::ByteSizeOf(new_literal->shape()),
+            ShapeUtil::ByteSizeOf(original.shape()));
+  std::memcpy(MutableInternalData(new_literal.get()), InternalData(original),
+              ShapeUtil::ByteSizeOf(original.shape()));
+  return new_literal;
 }

 /* static */ std::unique_ptr<Literal> LiteralUtil::Slice(
@ -793,47 +736,14 @@ void TransposeLiteralInternal(const Literal& original,
    const Literal& literal,
    const std::function<void(tensorflow::gtl::ArraySlice<int64> indices,
                             const string& value)>& per_cell) {
-  if (ShapeUtil::Rank(literal.shape()) == 1) {
-    for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) {
-      per_cell({i0}, GetAsString(literal, {i0}));
-    }
+  if (ShapeUtil::HasZeroElements(literal.shape())) {
    return;
  }
-
-  if (ShapeUtil::Rank(literal.shape()) == 2) {
-    for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) {
-      for (int64 i1 = 0; i1 < literal.shape().dimensions(1); ++i1) {
-        per_cell({i0, i1}, GetAsString(literal, {i0, i1}));
-      }
-    }
-    return;
-  }
-
-  if (ShapeUtil::Rank(literal.shape()) == 3) {
-    for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) {
-      for (int64 i1 = 0; i1 < literal.shape().dimensions(1); ++i1) {
-        for (int64 i2 = 0; i2 < literal.shape().dimensions(2); ++i2) {
-          per_cell({i0, i1, i2}, GetAsString(literal, {i0, i1, i2}));
-        }
-      }
-    }
-    return;
-  }
-
-  if (ShapeUtil::Rank(literal.shape()) == 4) {
-    for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) {
-      for (int64 i1 = 0; i1 < literal.shape().dimensions(1); ++i1) {
-        for (int64 i2 = 0; i2 < literal.shape().dimensions(2); ++i2) {
-          for (int64 i3 = 0; i3 < literal.shape().dimensions(3); ++i3) {
-            per_cell({i0, i1, i2, i3}, GetAsString(literal, {i0, i1, i2, i3}));
-          }
-        }
-      }
-    }
-    return;
-  }
-
-  LOG(FATAL) << "unhandled rank: " << ShapeUtil::Rank(literal.shape());
+  std::vector<int64> indices = IndexUtil::LinearIndexToMultidimensionalIndex(
+      literal.shape(), /*linear_index=*/0);
+  do {
+    per_cell(indices, GetAsString(literal, indices));
+  } while (IndexUtil::BumpIndices(literal.shape(), &indices));
 }

 namespace {
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@ -239,6 +239,11 @@ class LiteralUtil {
  // Clones literal into an owned unique_ptr version.
  static std::unique_ptr<Literal> CloneToUnique(const Literal& literal);

+  // Returns the linear index of the given index within the literal's
+  // element_type repeated field.
+  static int64 LinearIndex(const Literal& literal,
+                           tensorflow::gtl::ArraySlice<int64> multi_index);
+
  // Gets or sets an element in the literal at the given index. The index is
  // CHECKed against the dimension sizes.
  template <typename NativeT>
@ -427,11 +432,6 @@ class LiteralUtil {
                  "Cannot map native type to primitive type.");
  }

-  // Returns the linear index of the given index within the literal's
-  // element_type repeated field.
-  static int64 LinearIndex(const Literal& literal,
-                           tensorflow::gtl::ArraySlice<int64> multi_index);
-
  // Internal template helper for the Copy() API, matching its arguments one by
  // one.
  //
--- a/tensorflow/compiler/xla/literal_util_test.cc
+++ b/tensorflow/compiler/xla/literal_util_test.cc
@ -469,6 +469,26 @@ TEST_F(LiteralUtilTest, ReshapeR4) {
  EXPECT_TRUE(LiteralUtil::Equal(*expected, *reshape));
 }

+TEST_F(LiteralUtilTest, ReshapeR4Dim0Minor) {
+  // clang-format off
+  // F32[1x3x2x4]
+  auto original = LiteralUtil::CreateR4WithLayout<float>({{
+     {{10, 11, 12, 13}, {14, 15, 16, 17}},
+     {{18, 19, 20, 21}, {22, 23, 24, 25}},
+     {{26, 27, 28, 29}, {30, 31, 32, 33}},
+  }}, layout_r4_dim0minor_);
+  // F32[1x3x4x2]
+  auto expected = LiteralUtil::CreateR3WithLayout<float>({
+    {{10, 11}, {12, 13}, {14, 15}, {16, 17}},
+    {{18, 19}, {20, 21}, {22, 23}, {24, 25}},
+    {{26, 27}, {28, 29}, {30, 31}, {32, 33}},
+  }, layout_r3_dim0major_);
+  // clang-format on
+  auto reshape = LiteralUtil::Reshape(*original, {3, 4, 2}).ConsumeValueOrDie();
+
+  EXPECT_TRUE(LiteralUtil::Equal(*expected, *reshape));
+}
+
 TEST_F(LiteralUtilTest, TransposeR0) {
  auto original = LiteralUtil::CreateR0<float>(1.7f);
  auto reshape = LiteralUtil::Transpose(*original, /*permutation=*/{});
@ -659,15 +679,15 @@ TEST_F(LiteralUtilTest, Copy) {
        primitive_util::NativeToPrimitiveType<uint32>(), dimensions, layout);
    auto blank = LiteralUtil::CreateFromShape(shape);
    auto source = LiteralUtil::CreateFromShape(shape);
-    const int64 sbase[] = {0, 0, 0, 0};
-    const int64 incr[] = {1, 1, 1, 1};
+    const int64 zero_base[] = {0, 0, 0, 0};
+    const int64 step[] = {1, 1, 1, 1};
    uint32 seqnr = 0;
    auto init_proc = [&](const std::vector<int64>& indexes) {
      LiteralUtil::Set(source.get(), indexes, ++seqnr);
      return true;
    };

-    ShapeUtil::ForEachIndex(source->shape(), sbase, dimensions, incr,
+    ShapeUtil::ForEachIndex(source->shape(), zero_base, dimensions, step,
                            init_proc);

    const int64 src_base[] = {3, 1, 5, 7};
@ -691,7 +711,7 @@ TEST_F(LiteralUtilTest, Copy) {
                 bval == LiteralUtil::Get<uint32>(*source, source_indexes));
      return matched;
    };
-    ShapeUtil::ForEachIndex(source->shape(), sbase, copy_size, incr,
+    ShapeUtil::ForEachIndex(source->shape(), zero_base, copy_size, step,
                            check_proc);
    EXPECT_TRUE(matched);
  }
@ -710,5 +730,43 @@ TEST_F(LiteralUtilTest, CopyScalars) {
  EXPECT_EQ(LiteralUtil::Get<uint32>(*vect, {4}), 17);
 }

+TEST_F(LiteralUtilTest, Populate) {
+  struct PopulateData {
+    std::vector<int64> dimensions;
+    std::vector<int64> layout;
+  } populate_data[] = {
+      {{}, {}},
+      {{16}, {0}},
+      {{4, 16}, {1, 0}},
+      {{21, 12}, {0, 1}},
+      {{6, 11, 17}, {2, 0, 1}},
+      {{6, 11, 5, 17}, {3, 2, 0, 1}},
+  };
+  for (const auto& data : populate_data) {
+    Shape shape = ShapeUtil::MakeShapeWithLayout(
+        primitive_util::NativeToPrimitiveType<uint32>(), data.dimensions,
+        data.layout);
+    auto literal = LiteralUtil::CreateFromShape(shape);
+    auto generator = [&](tensorflow::gtl::ArraySlice<int64> indexes) -> uint32 {
+      // Offsets from linear index just to avoid R0 literals to be initialized
+      // with zero.
+      return LiteralUtil::LinearIndex(*literal, indexes) + 17;
+    };
+    TF_EXPECT_OK(LiteralUtil::Populate<uint32>(literal.get(), generator));
+
+    std::vector<int64> zero_base(data.dimensions.size(), 0);
+    std::vector<int64> step(data.dimensions.size(), 1);
+    bool matched = true;
+    auto check_function = [&](const std::vector<int64>& indexes) {
+      auto value = LiteralUtil::Get<uint32>(*literal, indexes);
+      matched = matched && (value == generator(indexes));
+      return matched;
+    };
+    ShapeUtil::ForEachIndex(literal->shape(), zero_base, data.dimensions, step,
+                            check_function);
+    EXPECT_TRUE(matched);
+  }
+}
+
 }  // namespace
 }  // namespace xla
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@ -406,6 +406,27 @@ cc_library(
    ],
 )

+cc_library(
+    name = "compile_only_service",
+    srcs = ["compile_only_service.cc"],
+    hdrs = ["compile_only_service.h"],
+    deps = [
+        ":backend",
+        ":compiler",
+        ":computation_layout",
+        ":computation_tracker",
+        ":platform_util",
+        ":service",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+    ],
+)
+
 cc_library(
    name = "cpu_plugin",
    deps = [
--- a/tensorflow/compiler/xla/service/compile_only_service.cc
+++ b/tensorflow/compiler/xla/service/compile_only_service.cc
@ -0,0 +1,131 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/compile_only_service.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/backend.h"
+#include "tensorflow/compiler/xla/service/computation_layout.h"
+#include "tensorflow/compiler/xla/service/computation_tracker.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace se = ::perftools::gputools;
+
+namespace xla {
+
+/* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
+CompileOnlyService::NewService(perftools::gputools::Platform* platform) {
+  ServiceOptions default_options;
+  default_options.set_platform(platform);
+  return NewService(default_options);
+}
+
+/* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
+CompileOnlyService::NewService(const ServiceOptions& options) {
+  perftools::gputools::Platform* platform = options.platform();
+  if (platform == nullptr) {
+    TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
+  }
+
+  TF_ASSIGN_OR_RETURN(auto compiler, Compiler::GetForPlatform(platform));
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Backend> compute_constant_backend,
+                      CreateComputeConstantBackend());
+  std::unique_ptr<CompileOnlyService> service(
+      new CompileOnlyService(compiler, std::move(compute_constant_backend)));
+  return std::move(service);
+}
+
+CompileOnlyService::CompileOnlyService(
+    Compiler* compiler, std::unique_ptr<Backend> compute_constant_backend)
+    : Service(/*backend=*/nullptr, std::move(compute_constant_backend)),
+      compiler_(compiler) {
+  runs_in_client_process_ = true;
+}
+
+StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+CompileOnlyService::CompileAheadOfTime(
+    const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
+    const AotCompilationOptions& options) {
+  std::vector<std::unique_ptr<HloModule>> hlo_modules;
+  std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
+  for (const AotComputationInstance& instance : computations) {
+    TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
+                        computation_tracker_.Resolve(instance.computation));
+    VersionedComputationHandle versioned_handle =
+        user_computation->GetVersionedHandle();
+
+    // Dump computation proto state if flag is set.
+    legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags();
+    const string& directory_path = flags->xla_dump_computations_to;
+    if (!directory_path.empty()) {
+      TF_ASSIGN_OR_RETURN(
+          std::unique_ptr<SessionModule> session_module,
+          computation_tracker_.SnapshotComputation(versioned_handle.handle));
+      string filename = tensorflow::strings::StrCat(
+          "computation_", versioned_handle.handle.handle(), "__",
+          session_module->entry().name(), "__version_",
+          versioned_handle.version);
+      TF_RETURN_IF_ERROR(Executable::DumpToDirectory(directory_path, filename,
+                                                     *session_module));
+    }
+
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> hlo_module,
+                        computation_tracker_.BuildHloModule(
+                            versioned_handle,
+                            /*include_unreachable_instructions=*/true));
+    hlo_modules.push_back(std::move(hlo_module));
+
+    TF_ASSIGN_OR_RETURN(
+        std::shared_ptr<const ProgramShape> program_shape,
+        user_computation->ComputeProgramShape(versioned_handle.version));
+
+    module_configs.push_back(MakeUnique<HloModuleConfig>(*program_shape));
+    HloModuleConfig* module_config = module_configs.back().get();
+    auto* computation_layout =
+        module_config->mutable_entry_computation_layout();
+    if (flags->xla_hlo_profile) {
+      module_config->enable_hlo_profiling(true);
+    }
+    for (int i = 0; i < instance.argument_layouts.size(); ++i) {
+      const Shape& argument_layout = *instance.argument_layouts[i];
+      if (ShapeUtil::IsTuple(argument_layout)) {
+        return Unimplemented("tuple arguments not supported yet");
+      }
+      TF_RETURN_IF_ERROR(
+          computation_layout->mutable_parameter_layout(i)->CopyLayoutFromShape(
+              argument_layout));
+    }
+    TF_RETURN_IF_ERROR(
+        computation_layout->mutable_result_layout()->CopyLayoutFromShape(
+            *instance.result_layout));
+  }
+
+  return compiler_->CompileAheadOfTime(std::move(hlo_modules),
+                                       std::move(module_configs),
+                                       MakeHloDumper(), options);
+}
+
+}  // namespace xla
--- a/tensorflow/compiler/xla/service/compile_only_service.h
+++ b/tensorflow/compiler/xla/service/compile_only_service.h
@ -0,0 +1,125 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_COMPILE_ONLY_SERVICE_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_COMPILE_ONLY_SERVICE_H_
+
+#include "tensorflow/compiler/xla/service/backend.h"
+#include "tensorflow/compiler/xla/service/compiler.h"
+#include "tensorflow/compiler/xla/service/service.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+
+// An XLA Service specialization for ahead-of-time compilation.  This only
+// instantiates a Compiler object for the relevant platform; it does not
+// instantiate or require an execution backend.
+class CompileOnlyService : public Service {
+ public:
+  // Factory for creating a CompileOnlyService. The parameter platform is the
+  // platform that the service should target. If platform is null then the
+  // default platform is used.
+  static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
+      perftools::gputools::Platform* platform);
+  static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
+      const ServiceOptions& options);
+
+  // A description of a computation to compile using CompileAheadOfTime.
+  struct AotComputationInstance {
+    ComputationHandle computation;
+    std::vector<const Shape*> argument_layouts;
+    const Shape* result_layout = nullptr;
+  };
+
+  // Compiles a list of computations for ahead-of-time execution.  This is
+  // intended for use in static compilation.  See
+  // |CompileOnlyClient::CompileAheadOfTime| for additional details.
+  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+  CompileAheadOfTime(
+      const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
+      const AotCompilationOptions& Options);
+
+  // Override Service methods that require or imply the existence of an
+  // execute backend.  Note that this does not include TransferToClient and
+  // TransferToClientInProcess, as computing contants produces global data
+  // that we may wish to transfer.
+  tensorflow::Status Execute(const ExecuteRequest* arg,
+                             ExecuteResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support execution.");
+  }
+  tensorflow::Status ExecuteParallel(const ExecuteParallelRequest* arg,
+                                     ExecuteParallelResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support execution.");
+  }
+  tensorflow::Status GetDeviceHandles(
+      const GetDeviceHandlesRequest* arg,
+      GetDeviceHandlesResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support devices.");
+  }
+  tensorflow::Status ExecuteAsync(const ExecuteAsyncRequest* arg,
+                                  ExecuteAsyncResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support execution.");
+  }
+  tensorflow::Status WaitForExecution(
+      const WaitForExecutionRequest* arg,
+      WaitForExecutionResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support execution.");
+  }
+  tensorflow::Status TransferToServer(
+      const TransferToServerRequest* arg,
+      TransferToServerResponse* result) override {
+    return Unimplemented(
+        "CompileOnlyService does not support device data transfers.");
+  }
+  tensorflow::Status TransferToInfeed(
+      const TransferToInfeedRequest* arg,
+      TransferToInfeedResponse* result) override {
+    return Unimplemented(
+        "CompileOnlyService does not support device data transfers.");
+  }
+  tensorflow::Status TransferFromOutfeed(
+      const TransferFromOutfeedRequest* arg,
+      TransferFromOutfeedResponse* result) override {
+    return Unimplemented(
+        "CompileOnlyService does not support device data transfers.");
+  }
+  tensorflow::Status TransferToServerInProcess(
+      const TransferToServerInProcessRequest* arg,
+      TransferToServerInProcessResponse* result) override {
+    return Unimplemented(
+        "CompileOnlyService does not support device data transfers.");
+  }
+  tensorflow::Status ResetDevice(const ResetDeviceRequest* arg,
+                                 ResetDeviceResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support devices.");
+  }
+
+ private:
+  explicit CompileOnlyService(
+      Compiler* compiler, std::unique_ptr<Backend> compute_constant_backend);
+  CompileOnlyService(const CompileOnlyService&) = delete;
+  void operator=(const CompileOnlyService&) = delete;
+
+  // The compiler for the target platform.  This is included in place of
+  // the Service::execute_backend_'s compiler, since execute_backend_ is a
+  // nullptr in CompileOnlyService.
+  Compiler* compiler_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_COMPILE_ONLY_SERVICE_H_
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@ -188,41 +188,52 @@ tensorflow::Status PrepareHloModuleForIrEmitting(
  return pipeline.Run(hlo_module).status();
 }

-// Invokes the ptxas tool on the given PTX string, and dumps its output.
-void DumpPtxasInfo(const string& ptx) {
+// Invokes the ptxas tool on the given PTX string, and stores the resulting
+// SASS in *cubin. If -v 2 or greater, runs ptxas with -v and dumps the
+// resulting stderr (which contains register allocation info, etc.)
+// to VLOG(2). If ptxas binary is not found *sass is set to "".
+Status CompilePTX(const string& ptx, int cc_major, int cc_minor,
+                  string* cubin) {
+  *cubin = "";
+
  const string ptxas_path =
      tensorflow::io::JoinPath(tensorflow::CudaRoot(), "bin/ptxas");
+
  // Do not log PTX stats if ptxas is not found at the given path.
-  if (!tensorflow::Env::Default()->FileExists(ptxas_path).ok()) {
-    LOG(WARNING)
-        << "Failed to dump PTX stats because ptxas is not found at path \""
-        << ptxas_path << "\".";
-    return;
-  }
+  LOG(INFO) << "Invoking ptxas at path \"" << ptxas_path << "\".";
+  TF_RETURN_IF_ERROR(tensorflow::Env::Default()->FileExists(ptxas_path));

  // Write `ptx` into a temporary file.
  char tempdir_template[] = "/tmp/ptxXXXXXX";
  char* tempdir_name = mkdtemp(tempdir_template);
  CHECK_NOTNULL(tempdir_name);
  string ptx_path = tensorflow::io::JoinPath(tempdir_name, "ptx");
+
  TF_CHECK_OK(
      tensorflow::WriteStringToFile(tensorflow::Env::Default(), ptx_path, ptx));
  LOG(INFO) << "ptx file written to: " << ptx_path;

  // Invoke ptxas and collect its output.
-  tensorflow::SubProcess ptxas_info_dumper;
-  ptxas_info_dumper.SetProgram(ptxas_path, {ptxas_path, ptx_path, "-o",
-                                            "/dev/null", "-v", "-arch=sm_35"});
-  ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
-                                     tensorflow::ACTION_PIPE);
-  CHECK(ptxas_info_dumper.Start());
-  string stderr_output;
-  int exit_status = ptxas_info_dumper.Communicate(
-      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
-  XLA_LOG_LINES(tensorflow::INFO, stderr_output);
-  if (exit_status != 0) {
-    LOG(FATAL) << "Invalid PTX. See the error message above for reasons.";
+  tensorflow::SubProcess ptxas_info;
+  string arch = tensorflow::strings::StrCat("sm_", cc_major, cc_minor);
+  string cubin_path = tensorflow::io::JoinPath(tempdir_name, "cubin");
+
+  if (VLOG_IS_ON(2)) {
+    ptxas_info.SetProgram(ptxas_path, {ptxas_path, "-v", "-o", cubin_path,
+                                       "-arch", arch, ptx_path});
+  } else {
+    ptxas_info.SetProgram(
+        ptxas_path, {ptxas_path, "-o", cubin_path, "-arch", arch, ptx_path});
  }
+  ptxas_info.SetChannelAction(tensorflow::CHAN_STDERR, tensorflow::ACTION_PIPE);
+  CHECK(ptxas_info.Start());
+  string stderr_output;
+  int ptxas_exit_status = ptxas_info.Communicate(
+      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
+
+  TF_RET_CHECK(ptxas_exit_status == 0);
+  return tensorflow::ReadFileToString(tensorflow::Env::Default(), cubin_path,
+                                      cubin);
 }

 }  // namespace
@ -298,10 +309,14 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(

  // Reserve space for the PTX to be generated for this module.
  string* ptx;
+  string* cubin;
  {
    tensorflow::mutex_lock lock(mutex_);
    generated_ptxes_.emplace_back(MakeUnique<string>());
    ptx = generated_ptxes_.back().get();
+
+    generated_cubins_.emplace_back(MakeUnique<string>());
+    cubin = generated_cubins_.back().get();
  }
  int cc_major, cc_minor;
  if (!stream_exec->GetDeviceDescription().cuda_compute_capability(&cc_major,
@ -318,9 +333,6 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
  XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(llvm_module));
  VLOG(2) << "PTX:";
  XLA_VLOG_LINES(2, *ptx);
-  if (VLOG_IS_ON(2)) {
-    DumpPtxasInfo(*ptx);
-  }

  auto thunk_schedule = MakeUnique<ThunkSchedule>(
      ir_emitter.ConsumeThunkSequence(), std::move(stream_assignment),
@ -328,9 +340,13 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
  VLOG(2) << "Printing the thunk schedule...";
  XLA_VLOG_LINES(2, thunk_schedule->ToString());

+  TF_RET_CHECK(CompilePTX(*ptx, cc_major, cc_minor, cubin).ok());
+
  auto* gpu_executable =
-      new GpuExecutable(*ptx, std::move(thunk_schedule), std::move(hlo_module),
+      new GpuExecutable(*cubin, *ptx, {cc_major, cc_minor},
+                        std::move(thunk_schedule), std::move(hlo_module),
                        std::move(module_config), std::move(buffer_assignment));
+
  if (flags->xla_gpu_embed_ir) {
    DCHECK_NE("", ir_module_string_before_opt);
    gpu_executable->set_ir_module_string(ir_module_string_before_opt);
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
@ -71,6 +71,7 @@ class GpuCompiler : public Compiler {
  // StreamExecutor (b/24776264).
  tensorflow::mutex mutex_;
  std::vector<std::unique_ptr<string>> generated_ptxes_ GUARDED_BY(mutex_);
+  std::vector<std::unique_ptr<string>> generated_cubins_ GUARDED_BY(mutex_);

  // The size in bytes of a pointer. Used for computing ShapeSizeBytes.
  int64 pointer_size_;
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@ -107,13 +107,17 @@ class HloExecutionProfiler {

 // Implementation note: HLO profiling is always enabled for GPU executables,
 // since we can use timers around thunks.
-GpuExecutable::GpuExecutable(tensorflow::StringPiece ptx,
+GpuExecutable::GpuExecutable(tensorflow::StringPiece cubin,
+                             tensorflow::StringPiece ptx,
+                             std::pair<int, int> compute_capability,
                             std::unique_ptr<ThunkSchedule> thunk_schedule,
                             std::unique_ptr<HloModule> hlo_module,
                             std::unique_ptr<HloModuleConfig> module_config,
                             std::unique_ptr<BufferAssignment> assignment)
    : Executable(std::move(hlo_module), std::move(module_config)),
+      cubin_(cubin),
      ptx_(ptx),
+      compute_capability_(compute_capability),
      thunk_schedule_(std::move(thunk_schedule)),
      assignment_(std::move(assignment)) {}

@ -186,6 +190,13 @@ StatusOr<se::DeviceMemoryBase> GpuExecutable::ExecuteOnStream(
  // false.
  TF_RET_CHECK(!module_config().has_hybrid_result());

+  // Ensure the compute capability of the cubin and the stream match.
+  std::pair<int, int> stream_compute_compatibility;
+  stream->parent()->GetDeviceDescription().cuda_compute_capability(
+      &stream_compute_compatibility.first,
+      &stream_compute_compatibility.second);
+  TF_RET_CHECK(stream_compute_compatibility == compute_capability_);
+
  BufferAllocations::Builder buffer_allocations_builder;
  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
       ++i) {
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@ -40,15 +40,17 @@ limitations under the License.

 namespace xla {
 namespace gpu {
-
 // GPU-targeting implementation of the XLA Executable interface.
 //
 // Launches the given CUDA kernel via the StreamExecutor.
-//
-// This is an immutable data type after initialization, and thus thread safe.
+
+// GPUExecutable should eventually be updated to associate a compute
+// capability with the PTX and store multiple cubins, each with their own
+// associated CC's, rather than including CC as a property of GpuExecutable.
 class GpuExecutable : public Executable {
 public:
-  GpuExecutable(tensorflow::StringPiece ptx,
+  GpuExecutable(tensorflow::StringPiece cubin, tensorflow::StringPiece ptx,
+                std::pair<int, int> compute_capability,
                std::unique_ptr<ThunkSchedule> thunk_schedule,
                std::unique_ptr<HloModule> hlo_module,
                std::unique_ptr<HloModuleConfig> module_config,
@ -62,7 +64,8 @@ class GpuExecutable : public Executable {
    ir_module_string_ = ir_module_string;
  }

-  // Returns the compiled PTX for the computation.
+  // Returns the compiled CUDA binary for the computation.
+  tensorflow::StringPiece cubin() const { return cubin_; }
  tensorflow::StringPiece ptx() const { return ptx_; }

  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
@ -104,8 +107,10 @@ class GpuExecutable : public Executable {
  // This string should be modified only before ExecuteOnStream.
  string ir_module_string_;

-  // The reference to the compiled PTX for the computation.
-  const tensorflow::StringPiece ptx_;
+  // The reference to the compiled PTX & CUDA binary for the computation.
+  tensorflow::StringPiece cubin_;
+  tensorflow::StringPiece ptx_;
+  std::pair<int, int> compute_capability_;

  // The thunks to be invoked by this GpuExecutable. They are generated by the
  // IrEmitter.
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
@ -41,13 +41,10 @@ tensorflow::Status KernelThunk::Initialize(const GpuExecutable& executable) {
    // Already initialized by another thread.
    return tensorflow::Status::OK();
  }
-
  loader_spec_.reset(new se::MultiKernelLoaderSpec(io_buffers_.size() + 1));
-  tensorflow::StringPiece ptx = executable.ptx();
-  // Convert tensorflow::StringPiece to se::port::StringPiece because
-  // StreamExecutor uses the latter.
-  loader_spec_->AddCudaPtxInMemory(
-      se::port::StringPiece(ptx.data(), ptx.size()), kernel_name_);
+
+  tensorflow::StringPiece cubin = executable.cubin();
+  loader_spec_->AddCudaCubinInMemory(cubin.data(), kernel_name_);
  return tensorflow::Status::OK();
 }

--- a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
@ -195,7 +195,7 @@ TEST_F(HloConstantFoldingTest, TransposeConstantFold) {

  HloInstruction* root = computation->root_instruction();
  EXPECT_THAT(root, op::Constant());
-  EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), shape));

  using NativeT = typename primitive_util::PrimitiveTypeToNative<F32>::type;
  bool matched = true;
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@ -128,70 +128,6 @@ StatusOr<GlobalDataHandle> LocalService::AllocateBufferOnDevice(
                                  allocation_size));
 }

-StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-LocalService::CompileAheadOfTime(
-    const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
-        computations,
-    const AotCompilationOptions& options) {
-  std::vector<std::unique_ptr<HloModule>> hlo_modules;
-  std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
-  for (const AheadOfTimeComputationInstance& instance : computations) {
-    TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
-                        computation_tracker_.Resolve(instance.computation));
-    VersionedComputationHandle versioned_handle =
-        user_computation->GetVersionedHandle();
-
-    // Dump computation proto state if flag is set.
-    legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags();
-    const string& directory_path = flags->xla_dump_computations_to;
-    if (!directory_path.empty()) {
-      TF_ASSIGN_OR_RETURN(
-          std::unique_ptr<SessionModule> session_module,
-          computation_tracker_.SnapshotComputation(versioned_handle.handle));
-      string filename = tensorflow::strings::StrCat(
-          "computation_", versioned_handle.handle.handle(), "__",
-          session_module->entry().name(), "__version_",
-          versioned_handle.version);
-      TF_RETURN_IF_ERROR(Executable::DumpToDirectory(directory_path, filename,
-                                                     *session_module));
-    }
-
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> hlo_module,
-                        computation_tracker_.BuildHloModule(
-                            versioned_handle,
-                            /*include_unreachable_instructions=*/true));
-    hlo_modules.push_back(std::move(hlo_module));
-
-    TF_ASSIGN_OR_RETURN(
-        std::shared_ptr<const ProgramShape> program_shape,
-        user_computation->ComputeProgramShape(versioned_handle.version));
-
-    module_configs.push_back(MakeUnique<HloModuleConfig>(*program_shape));
-    HloModuleConfig* module_config = module_configs.back().get();
-    auto* computation_layout =
-        module_config->mutable_entry_computation_layout();
-    if (flags->xla_hlo_profile) {
-      module_config->enable_hlo_profiling(true);
-    }
-    for (int i = 0; i < instance.argument_layouts.size(); ++i) {
-      const Shape& argument_layout = *instance.argument_layouts[i];
-      if (ShapeUtil::IsTuple(argument_layout)) {
-        return Unimplemented("tuple arguments not supported yet");
-      }
-      TF_RETURN_IF_ERROR(
-          computation_layout->mutable_parameter_layout(i)->CopyLayoutFromShape(
-              argument_layout));
-    }
-    TF_RETURN_IF_ERROR(
-        computation_layout->mutable_result_layout()->CopyLayoutFromShape(
-            *instance.result_layout));
-  }
-
-  return execute_backend_->compiler()->CompileAheadOfTime(
-      std::move(hlo_modules), std::move(module_configs), MakeHloDumper(),
-      options);
-}
-
 StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
    const ComputationHandle& computation,
    const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
--- a/tensorflow/compiler/xla/service/local_service.h
+++ b/tensorflow/compiler/xla/service/local_service.h
@ -59,22 +59,6 @@ class LocalService : public Service {
      const Shape& shape, int device_ordinal,
      bool allocate_space_for_deep_copy);

-  // A description of a computation to compile using CompileAheadOfTime.
-  struct AheadOfTimeComputationInstance {
-    ComputationHandle computation;
-    std::vector<const Shape*> argument_layouts;
-    const Shape* result_layout = nullptr;
-  };
-
-  // Compiles a list of computations for ahead-of-time execution.  This is
-  // intended for use in static compilation.  See
-  // |LocalClient::CompileAheadOfTime| for additional details.
-  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-  CompileAheadOfTime(
-      const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
-          computations,
-      const AotCompilationOptions& Options);
-
  // Builds an Executable with the given argument layouts and options. If
  // result_layout is non-null, then the executable is compiled to produce a
  // result of the given layout.
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@ -180,20 +180,24 @@ Service::Service(std::unique_ptr<Backend> execute_backend,
                 std::unique_ptr<Backend> compute_constant_backend)
    : execute_backend_(std::move(execute_backend)),
      compute_constant_backend_(std::move(compute_constant_backend)) {
-  LOG(INFO) << Printf(
-      "XLA service %p executing computations on platform %s. Devices:", this,
-      execute_backend_->platform()->Name().c_str());
-  for (int i = 0; i < execute_backend_->device_count(); ++i) {
-    if (execute_backend_->device_ordinal_supported(i)) {
-      se::StreamExecutor* executor =
-          execute_backend_->stream_executor(i).ValueOrDie();
-      const auto& description = executor->GetDeviceDescription();
-      LOG(INFO) << Printf("  StreamExecutor device (%d): %s, %s", i,
-                          description.name().c_str(),
-                          description.platform_version().c_str());
-    } else {
-      LOG(INFO) << Printf("  StreamExecutor device (%d) not supported", i);
+  if (execute_backend_) {
+    LOG(INFO) << Printf(
+        "XLA service %p executing computations on platform %s. Devices:", this,
+        execute_backend_->platform()->Name().c_str());
+    for (int i = 0; i < execute_backend_->device_count(); ++i) {
+      if (execute_backend_->device_ordinal_supported(i)) {
+        se::StreamExecutor* executor =
+            execute_backend_->stream_executor(i).ValueOrDie();
+        const auto& description = executor->GetDeviceDescription();
+        LOG(INFO) << Printf("  StreamExecutor device (%d): %s, %s", i,
+                            description.name().c_str(),
+                            description.platform_version().c_str());
+      } else {
+        LOG(INFO) << Printf("  StreamExecutor device (%d) not supported", i);
+      }
    }
+  } else {
+    VLOG(1) << "XLA compile-only service constructed";
  }
 }

@ -286,7 +290,7 @@ StatusOr<std::vector<const Allocation*>> Service::ResolveAndValidateArguments(
 StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
    const ProgramShape& program_shape,
    tensorflow::gtl::ArraySlice<const Allocation*> arguments,
-    const ExecutionOptions& execution_options) {
+    const ExecutionOptions& execution_options, Backend* backend) {
  auto module_config = MakeUnique<HloModuleConfig>(program_shape);
  auto* computation_layout = module_config->mutable_entry_computation_layout();

@ -326,7 +330,7 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
    module_config->enable_hlo_profiling(true);
  }

-  module_config->set_replica_count(execute_backend_->Replicas().size());
+  module_config->set_replica_count(backend->Replicas().size());
  module_config->set_fast_math_disabled(execution_options.disable_fast_math());
  module_config->set_seed(execution_options.seed());

@ -474,7 +478,7 @@ StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
      std::unique_ptr<Executable> executable_unique_ptr,
      BuildExecutable(versioned_handle, std::move(module_config),
                      /*executable_for_compute_constant=*/false, arguments,
-                      execute_backend_.get(), executor));
+                      backend, executor));

  if (profile != nullptr) {
    uint64 end_micros = tensorflow::Env::Default()->NowMicros();
@ -575,15 +579,14 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
  perftools::gputools::DeviceMemoryBase result;
  if (backend->Replicas().size() == 1) {
    TF_ASSIGN_OR_RETURN(
-        result,
-        ExecuteOnStreamWrapper<StatusOr<se::DeviceMemoryBase>>(
-            executable, &run_options[0], profile, execute_backend_.get(),
-            [&arguments](Executable* executable,
-                         const ServiceExecutableRunOptions* run_options,
-                         HloExecutionProfile* hlo_execution_profile) {
-              return executable->ExecuteOnStream(run_options, arguments,
-                                                 hlo_execution_profile);
-            }));
+        result, ExecuteOnStreamWrapper<StatusOr<se::DeviceMemoryBase>>(
+                    executable, &run_options[0], profile, backend,
+                    [&arguments](Executable* executable,
+                                 const ServiceExecutableRunOptions* run_options,
+                                 HloExecutionProfile* hlo_execution_profile) {
+                      return executable->ExecuteOnStream(run_options, arguments,
+                                                         hlo_execution_profile);
+                    }));
  } else {
    std::vector<
        tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>>
@ -666,7 +669,8 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
    // the program and the argument allocations.
    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
                        CreateModuleConfig(*program_shape, arg_allocations,
-                                           request.execution_options()));
+                                           request.execution_options(),
+                                           execute_backend_.get()));
    VLOG(3) << "ExecuteParallel created HloModuleConfig computation layout: "
            << module_config->entry_computation_layout().ToString();

@ -751,9 +755,10 @@ tensorflow::Status Service::Execute(const ExecuteRequest* arg,
      ResolveAndValidateArguments(arg->arguments(), execute_backend_.get(),
                                  execute_backend_->default_device_ordinal()));

-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(*program_shape, arg_allocations,
-                                         arg->execution_options()));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloModuleConfig> module_config,
+      CreateModuleConfig(*program_shape, arg_allocations,
+                         arg->execution_options(), execute_backend_.get()));

  VLOG(3) << "Execute created HloModuleConfig computation layout: "
          << module_config->entry_computation_layout().ToString();
@ -818,9 +823,10 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
      ResolveAndValidateArguments(arg->arguments(), execute_backend_.get(),
                                  execute_backend_->default_device_ordinal()));

-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(*program_shape, arg_allocations,
-                                         arg->execution_options()));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloModuleConfig> module_config,
+      CreateModuleConfig(*program_shape, arg_allocations,
+                         arg->execution_options(), execute_backend_.get()));

  VLOG(3) << "ExecuteAsync created HloModuleConfig computation layout: "
          << module_config->entry_computation_layout().ToString();
@ -1141,7 +1147,8 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
  }

  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(program_shape, {}, execution_options));
+                      CreateModuleConfig(program_shape, {}, execution_options,
+                                         compute_constant_backend_.get()));

  TF_ASSIGN_OR_RETURN(
      std::shared_ptr<Executable> executable,
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@ -265,11 +265,11 @@ class Service : public ServiceInterface {
      tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
      const Backend* backend, int device_ordinal);

-  // Create a Hlo module config foe the given program shape and arguments.
+  // Create a Hlo module config for the given program shape and arguments.
  StatusOr<std::unique_ptr<HloModuleConfig>> CreateModuleConfig(
      const ProgramShape& program_shape,
      tensorflow::gtl::ArraySlice<const Allocation*> arguments,
-      const ExecutionOptions& execution_options);
+      const ExecutionOptions& execution_options, Backend* backend);

  // Builds an Executable for the given parameters. If
  // executable_for_compute_constant is true, then the executable is intended to
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@ -728,9 +728,17 @@ Status ForEachMutableSubshapeHelper(
    new_shape.add_dimensions(dim);
  }
  if (shape.has_layout()) {
-    new_shape.mutable_layout()->clear_minor_to_major();
+    Layout* new_layout = new_shape.mutable_layout();
+    new_layout->clear_minor_to_major();
    for (auto index : Permute(permutation, shape.layout().minor_to_major())) {
-      new_shape.mutable_layout()->add_minor_to_major(index);
+      new_layout->add_minor_to_major(index);
+    }
+    if (shape.layout().padded_dimensions_size() > 0) {
+      new_layout->clear_padded_dimensions();
+      for (auto dim :
+           Permute(permutation, shape.layout().padded_dimensions())) {
+        new_layout->add_padded_dimensions(dim);
+      }
    }
  }
  return new_shape;
@ -1057,7 +1065,9 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
  DCHECK_EQ(count.size(), base.size());
  const Layout& layout = shape.layout();
  int64 rank = layout.minor_to_major_size();
-  int64 n = 0;
+  // Allows handling R0 arrays, such that the visitor function will be called
+  // once with the proper empty indexes.
+  int64 n = -1;
  std::vector<int64> indexes(base.begin(), base.end());
  while (n < rank && visitor_function(indexes)) {
    // Increments dimensions in minor to major order.
--- a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
+++ b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
@ -42,7 +42,7 @@ xla::Computation Doubler(xla::Client* client) {
 int main(int argc, char** argv) {
  tensorflow::port::InitMain(argv[0], &argc, &argv);

-  auto client = xla::ClientLibrary::LocalClientOrDie();
+  auto client = xla::ClientLibrary::GetOrCreateCompileOnlyClient().ValueOrDie();

  xla::ComputationBuilder builder(client, "aot_test_helper");
  auto opaque_shape = xla::ShapeUtil::MakeOpaqueShape();
@ -74,7 +74,7 @@ int main(int argc, char** argv) {
  llvm::Triple triple(xla::llvm_ir::AsStringRef(triple_string));

  xla::Computation computation = builder.Build().ConsumeValueOrDie();
-  xla::LocalClient::AheadOfTimeComputationInstance instance{
+  xla::CompileOnlyClient::AotComputationInstance instance{
      &computation, /*argument_layouts=*/{&opaque_shape}, &r0f32};

  xla::cpu::CpuAotCompilationOptions options(
--- a/tensorflow/compiler/xla/util.cc
+++ b/tensorflow/compiler/xla/util.cc
@ -153,16 +153,26 @@ string Reindent(tensorflow::StringPiece original,
      });
 }

+bool IsPermutation(tensorflow::gtl::ArraySlice<int64> permutation, int64 rank) {
+  if (rank != permutation.size()) {
+    return false;
+  }
+  std::vector<int64> output(permutation.size(), -1);
+  for (auto index : permutation) {
+    CHECK_GE(index, 0);
+    CHECK_LT(index, rank);
+    output[index] = 0;
+  }
+  return std::find(output.begin(), output.end(), -1) == output.end();
+}
+
 std::vector<int64> InversePermutation(
    tensorflow::gtl::ArraySlice<int64> input_permutation) {
+  DCHECK(IsPermutation(input_permutation, input_permutation.size()));
  std::vector<int64> output_permutation(input_permutation.size(), -1);
  for (size_t i = 0; i < input_permutation.size(); ++i) {
    output_permutation[input_permutation[i]] = i;
  }
-  DCHECK_EQ(
-      0, std::count(output_permutation.begin(), output_permutation.end(), -1));
-  DCHECK(std::is_permutation(input_permutation.begin(), input_permutation.end(),
-                             output_permutation.begin()));
  return output_permutation;
 }

--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@ -177,6 +177,9 @@ Status Unavailable(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2);
 string Reindent(tensorflow::StringPiece original,
                tensorflow::StringPiece indentation);

+// Checks whether permutation is a permutation of the [0, rank) integer range.
+bool IsPermutation(tensorflow::gtl::ArraySlice<int64> permutation, int64 rank);
+
 // Applies `permutation` on `input` and returns the permuted array.
 // For each i, output[permutation[i]] = input[i].
 //
@ -187,12 +190,11 @@ template <template <typename...> class C, typename T>
 std::vector<T> Permute(tensorflow::gtl::ArraySlice<int64> permutation,
                       C<T> input_) {
  tensorflow::gtl::ArraySlice<T> input(input_);
-  CHECK_EQ(permutation.size(), input.size());
+  CHECK(IsPermutation(permutation, input.size()));
  std::vector<T> output(input.size());
  for (size_t i = 0; i < permutation.size(); ++i) {
    output[permutation[i]] = input[i];
  }
-  DCHECK(std::is_permutation(input.begin(), input.end(), output.begin()));
  return output;
 }

--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@ -121,6 +121,7 @@ set(tf_proto_text_srcs
    "tensorflow/core/protobuf/cluster.proto"
    "tensorflow/core/protobuf/config.proto"
    "tensorflow/core/protobuf/debug.proto"
+    "tensorflow/core/protobuf/device_properties.proto"
    "tensorflow/core/protobuf/rewriter_config.proto"
    "tensorflow/core/protobuf/tensor_bundle.proto"
    "tensorflow/core/protobuf/saver.proto"
--- a/tensorflow/contrib/cudnn_rnn/BUILD
+++ b/tensorflow/contrib/cudnn_rnn/BUILD
@ -82,7 +82,7 @@ tf_custom_op_py_library(

 cuda_py_test(
    name = "cudnn_rnn_ops_test",
-    size = "small",
+    size = "medium",
    srcs = ["python/kernel_tests/cudnn_rnn_ops_test.py"],
    additional_deps = [
        ":cudnn_rnn_py",
--- a/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
@ -40,6 +40,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/env_var.h"

 #if GOOGLE_CUDA
 #include "tensorflow/core/platform/stream_executor.h"
@ -67,7 +68,7 @@ limitations under the License.
 * TensorFlow is responsible for making sure the memory is alive long enough
 * and recycles afterwards.
 *
-*/
+ */
 namespace tensorflow {

 using CPUDevice = Eigen::ThreadPoolDevice;
@ -106,6 +107,7 @@ using perftools::gputools::DeviceMemory;
 using perftools::gputools::DeviceMemoryBase;
 using perftools::gputools::ScratchAllocator;
 using perftools::gputools::port::StatusOr;
+using strings::Printf;

 Status ParseRNNMode(const string& str, RnnMode* rnn_mode) {
  if (str == "rnn_relu") {
@ -203,9 +205,10 @@ DeviceMemoryBase SliceDeviceMemory(const DeviceMemoryBase& device_memory,
 }

 inline Status FromExecutorStatus(const perftools::gputools::port::Status& s) {
-  return s.ok() ? Status::OK() : Status(static_cast<tensorflow::error::Code>(
-                                            static_cast<int>(s.code())),
-                                        s.error_message());
+  return s.ok() ? Status::OK()
+                : Status(static_cast<tensorflow::error::Code>(
+                             static_cast<int>(s.code())),
+                         s.error_message());
 }

 template <typename T>
@ -244,8 +247,7 @@ class CudnnRNNWorkspaceAllocator : public ScratchAllocator {
    // allocator.
    allocated_tensors_.push_back(temporary_memory);
    total_byte_size_ += byte_size;
-    return perftools::gputools::port::StatusOr<
-        perftools::gputools::DeviceMemory<uint8>>(
+    return StatusOr<DeviceMemory<uint8>>(
        AsDeviceMemory<uint8>(&temporary_memory));
  }
  int64 TotalByteSize() { return total_byte_size_; }
@ -296,6 +298,43 @@ class CudnnRNNReserveSpaceAllocator : public ScratchAllocator {
  int output_index_;
 };

+// A helper to allocate persistent memory for Cudnn RNN models, which is
+// expected to live between kernel invocations.
+// This class is not thread-safe.
+class CudnnRNNPersistentSpaceAllocator : public ScratchAllocator {
+ public:
+  CudnnRNNPersistentSpaceAllocator(OpKernelContext* context)
+      : context_(context) {}
+
+  virtual ~CudnnRNNPersistentSpaceAllocator() {}
+
+  int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override {
+    return std::numeric_limits<int64>::max();
+  }
+
+  StatusOr<DeviceMemory<uint8>> AllocateBytes(
+      perftools::gputools::Stream* stream, int64 byte_size) override {
+    if (total_byte_size_ != 0) {
+      return Status(error::FAILED_PRECONDITION,
+                    "Persistent space allocator can only be called once");
+    }
+
+    Status allocation_status = context_->allocate_persistent(
+        DT_UINT8, TensorShape({byte_size}), &handle_, nullptr);
+    if (!allocation_status.ok()) {
+      return ToExecutorStatus(allocation_status);
+    }
+    total_byte_size_ += byte_size;
+    return AsDeviceMemory<uint8>(handle_.AccessTensor(context_));
+  }
+  int64 TotalByteSize() { return total_byte_size_; }
+
+ private:
+  int64 total_byte_size_ = 0;
+  PersistentTensor handle_;
+  OpKernelContext* context_;  // not owned
+};
+
 struct CudnnModelTypes {
  RnnMode rnn_mode;
  TFRNNInputMode rnn_input_mode;
@ -317,6 +356,16 @@ struct CudnnModelShapes {
  TensorShape input_shape;
  TensorShape output_shape;
  TensorShape hidden_state_shape;
+  // At present only fields related to cached RnnDescriptor are concerned.
+  bool IsCompatibleWith(const CudnnModelShapes& rhs) const {
+    return num_layers == rhs.num_layers && input_size == rhs.input_size &&
+           num_units == rhs.num_units && dir_count == rhs.dir_count;
+  }
+  string RnnDescDebugString() {
+    return strings::Printf(
+        "[num_layers, input_size, num_units, dir_count]: [%d, %d, %d, %d]",
+        num_layers, input_size, num_units, dir_count);
+  }
 };

 // Extract and checks the forward input tensors, parameters, and shapes from the
@ -399,11 +448,23 @@ void RestoreParams(const OpInputList params_input,

 }  // namespace

+// Note: all following kernels depend on a RnnDescriptor instance, which
+// according to Cudnn official doc should be kept around and reused across all
+// Cudnn kernels in the same model.
+// In Tensorflow, we don't pass the reference across different OpKernels,
+// rather, recreate it separately in each OpKernel, which does no cause issue:
+// CudnnDropoutDescriptor keeps a reference to a memory for
+// random number generator state. During recreation, this state is lost.
+// However, only forward-pass Cudnn APIs make use of the state.
+
 // A common base class for RNN kernels. It extracts common attributes and
 // shape validations.
 class CudnnRNNKernelCommon : public OpKernel {
 protected:
  CudnnRNNKernelCommon(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("dropout", &dropout_));
+    OP_REQUIRES_OK(context, context->GetAttr("seed", &seed_));
+    OP_REQUIRES_OK(context, context->GetAttr("seed2", &seed2_));
    string str;
    OP_REQUIRES_OK(context, context->GetAttr("rnn_mode", &str));
    OP_REQUIRES_OK(context, ParseRNNMode(str, &model_types_.rnn_mode));
@ -413,6 +474,10 @@ class CudnnRNNKernelCommon : public OpKernel {
    OP_REQUIRES_OK(context, context->GetAttr("direction", &str));
    OP_REQUIRES_OK(
        context, ParseRNNDirectionMode(str, &model_types_.rnn_direction_mode));
+    // Reset CudnnRnnDescriptor and related random number generate states in
+    // every Compute() call.
+    OP_REQUIRES_OK(context, ReadBoolFromEnvVar("TF_CUDNN_RESET_RND_GEN_STATE",
+                                               false, &reset_rnd_gen_state_));
  }

  bool HasInputC() const { return model_types_.HasInputC(); }
@ -422,6 +487,9 @@ class CudnnRNNKernelCommon : public OpKernel {
    return model_types_.rnn_direction_mode;
  }
  CudnnModelTypes model_types() const { return model_types_; }
+  float dropout() const { return dropout_; }
+  uint64 seed() { return (static_cast<uint64>(seed_) << 32) | seed2_; }
+  bool ResetRndGenState() { return reset_rnd_gen_state_; }

  template <typename T>
  Status ExtractCudnnRNNParamsInfo(OpKernelContext* context,
@ -448,11 +516,14 @@ class CudnnRNNKernelCommon : public OpKernel {
    RnnInputMode input_mode;
    TF_RETURN_IF_ERROR(
        ToRNNInputMode(rnn_input_mode(), num_units, input_size, &input_mode));
+
    auto* stream = context->op_device_context()->stream();
+    // ExtracCudnnRNNParamsInfo is only called by op_kernels that do not require
+    // random number generator, therefore set state_allocator to nullptr.
    auto rnn_desc_s = stream->parent()->createRnnDescriptor(
        num_layers, num_units, input_size, input_mode, rnn_direction_mode(),
-        rnn_mode(), ToDataType<T>::value, 0.f /*dropout*/, 0 /*seed*/,
-        nullptr /*state_allocator*/);
+        rnn_mode(), ToDataType<T>::value, dropout(), seed(),
+        nullptr /* state_allocator */);
    if (!rnn_desc_s.ok()) {
      return FromExecutorStatus(rnn_desc_s);
    }
@ -461,6 +532,11 @@ class CudnnRNNKernelCommon : public OpKernel {
  }

 private:
+  int seed_;
+  int seed2_;
+  float dropout_;
+  bool reset_rnd_gen_state_;
+
  CudnnModelTypes model_types_;
 };

@ -560,9 +636,8 @@ class CudnnRNNParamsToCanonical<GPUDevice, T> : public CudnnRNNKernelCommon {
        context->set_output(i, input.Slice(start, end));
      } else {
        Tensor* output = nullptr;
-        OP_REQUIRES_OK(
-            context,
-            context->allocate_output(i, TensorShape({width, height}), &output));
+        OP_REQUIRES_OK(context, context->allocate_output(
+                                    i, TensorShape({width, height}), &output));
        DeviceMemoryBase data_src_ptr = SliceDeviceMemory(
            input_ptr, rnn_desc->ParamsWeightRegions()[i].offset,
            size_in_bytes);
@ -571,14 +646,17 @@ class CudnnRNNParamsToCanonical<GPUDevice, T> : public CudnnRNNKernelCommon {
      }
    }

-    CHECK(num_params_ == rnn_desc->ParamsBiasRegions().size())
-        << "Number of params mismatch. Expected " << num_params_ << ", got "
-        << rnn_desc->ParamsBiasRegions().size();
+    OP_REQUIRES(context, num_params_ == rnn_desc->ParamsBiasRegions().size(),
+                errors::InvalidArgument("Number of params mismatch. Expected ",
+                                        num_params_, ", got ",
+                                        rnn_desc->ParamsBiasRegions().size()));
    for (int i = 0; i < rnn_desc->ParamsBiasRegions().size(); i++) {
      int64 size_in_bytes = rnn_desc->ParamsBiasRegions()[i].size;
      int64 size = size_in_bytes / sizeof(T);
-      CHECK(size == num_units) << "Params size mismatch. Expected " << num_units
-                               << ", got " << size;
+      OP_REQUIRES(context, size == num_units,
+                  errors::InvalidArgument("Params size mismatch. Expected ",
+                                          num_units, ", got ", size));
+
      // If data is aligned, use slice view to avoid expensive memcpy.
      bool start_aligned =
          rnn_desc->ParamsBiasRegions()[i].offset % EIGEN_MAX_ALIGN_BYTES == 0;
@ -698,16 +776,32 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
    OP_REQUIRES_OK(context,
                   ToRNNInputMode(rnn_input_mode(), model_shapes.num_units,
                                  model_shapes.input_size, &input_mode));
-    // TODO(zhengxq): add dropout support.
    // TODO(zhengxq): cache the descriptor so we don't have to create them all
    // the time.
    auto data_type = ToDataType<T>::value;
-    auto rnn_desc_s = executor->createRnnDescriptor(
-        model_shapes.num_layers, model_shapes.num_units,
-        model_shapes.input_size, input_mode, rnn_direction_mode(), rnn_mode(),
-        data_type, 0.f /*dropout*/, 0 /*seed*/, nullptr /*state_allocator*/);
-    OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
-    auto rnn_desc = rnn_desc_s.ConsumeValueOrDie();
+    {
+      mutex_lock l(mu_);
+      if (model_shapes_ == nullptr) {
+        model_shapes_.reset(new CudnnModelShapes(model_shapes));
+      } else {
+        OP_REQUIRES(context, model_shapes_->IsCompatibleWith(model_shapes),
+                    errors::InvalidArgument(
+                        "Incompatible rnn model shapes inferred: expecting ",
+                        model_shapes_->RnnDescDebugString(), ", getting ",
+                        model_shapes.RnnDescDebugString(), "."));
+      }
+      if (rnn_desc_ == nullptr || ResetRndGenState()) {
+        dropout_state_allocator_.reset(
+            new CudnnRNNPersistentSpaceAllocator(context));
+        auto rnn_desc_s = executor->createRnnDescriptor(
+            model_shapes_->num_layers, model_shapes_->num_units,
+            model_shapes_->input_size, input_mode, rnn_direction_mode(),
+            rnn_mode(), data_type, dropout(), seed(),
+            dropout_state_allocator_.get());
+        OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
+        rnn_desc_ = std::move(rnn_desc_s.ConsumeValueOrDie());
+      }
+    }

    auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
        input_shape.dim_size(0), input_shape.dim_size(1),
@ -753,21 +847,30 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
    // Creates a memory callback for the workspace. The memory lives to the end
    // of this kernel calls.
    CudnnRNNWorkspaceAllocator workspace_allocator(context);
-    bool launch_status =
-        stream
-            ->ThenRnnForward(
-                *rnn_desc, *input_desc, input_data, *hidden_state_desc,
-                input_h_data, *hidden_state_desc, input_c_data, params_data,
-                *output_desc, &output_data, *hidden_state_desc, &output_h_data,
-                *hidden_state_desc, &output_c_data, is_training_,
-                &reserve_space_allocator, &workspace_allocator)
-            .ok();
+    bool launch_status = false;
+    {
+      mutex_lock l(mu_);
+      launch_status =
+          stream
+              ->ThenRnnForward(
+                  *rnn_desc_, *input_desc, input_data, *hidden_state_desc,
+                  input_h_data, *hidden_state_desc, input_c_data, params_data,
+                  *output_desc, &output_data, *hidden_state_desc,
+                  &output_h_data, *hidden_state_desc, &output_c_data,
+                  is_training_, &reserve_space_allocator, &workspace_allocator)
+              .ok();
+    }
    OP_REQUIRES(context, launch_status,
                errors::Internal("Failed to call ThenRnnForward"));
  }

 private:
+  mutex mu_;
  bool is_training_;
+  std::unique_ptr<CudnnModelShapes> model_shapes_ GUARDED_BY(mu_);
+  std::unique_ptr<RnnDescriptor> rnn_desc_ GUARDED_BY(mu_);
+  std::unique_ptr<CudnnRNNPersistentSpaceAllocator> dropout_state_allocator_
+      GUARDED_BY(mu_);
 };

 REGISTER_KERNEL_BUILDER(
@ -808,9 +911,9 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
    const Tensor* output_h = nullptr;
    OP_REQUIRES_OK(context, context->input("output_h", &output_h));
    OP_REQUIRES(context, output_h->shape() == hidden_state_shape,
-                errors::InvalidArgument("Invalid output_h shape: ",
-                                        output_h->shape().DebugString(), " ",
-                                        hidden_state_shape.DebugString()));
+                errors::InvalidArgument(
+                    "Invalid output_h shape: ", output_h->shape().DebugString(),
+                    " ", hidden_state_shape.DebugString()));
    const Tensor* output_c = nullptr;
    if (HasInputC()) {
      // Only LSTM uses input_c and output_c. So for all other models, we only
@ -881,15 +984,32 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
    OP_REQUIRES_OK(context,
                   ToRNNInputMode(rnn_input_mode(), model_shapes.num_units,
                                  model_shapes.input_size, &input_mode));
-    // TODO(zhengxq): add dropout support.
    // TODO(zhengxq): cache the descriptor so we don't have to create them all
    // the time.
-    auto rnn_desc_s = executor->createRnnDescriptor(
-        model_shapes.num_layers, model_shapes.num_units,
-        model_shapes.input_size, input_mode, rnn_direction_mode(), rnn_mode(),
-        data_type, 0.f /*dropout*/, 0 /*seed*/, nullptr /*state_allocator*/);
-    OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
-    auto rnn_desc = rnn_desc_s.ConsumeValueOrDie();
+    {
+      mutex_lock l(mu_);
+      if (model_shapes_ == nullptr) {
+        model_shapes_.reset(new CudnnModelShapes(model_shapes));
+      } else {
+        OP_REQUIRES(context, model_shapes_->IsCompatibleWith(model_shapes),
+                    errors::InvalidArgument(
+                        "Incompatible rnn model shapes inferred: expecting ",
+                        model_shapes_->RnnDescDebugString(), ", getting ",
+                        model_shapes.RnnDescDebugString(), "."));
+      }
+
+      if (rnn_desc_ == nullptr || ResetRndGenState()) {
+        dropout_state_allocator_.reset(
+            new CudnnRNNPersistentSpaceAllocator(context));
+        auto rnn_desc_s = executor->createRnnDescriptor(
+            model_shapes.num_layers, model_shapes.num_units,
+            model_shapes.input_size, input_mode, rnn_direction_mode(),
+            rnn_mode(), data_type, dropout(), seed(),
+            dropout_state_allocator_.get());
+        OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
+        rnn_desc_ = std::move(rnn_desc_s.ConsumeValueOrDie());
+      }
+    }

    auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
        input_shape.dim_size(0), input_shape.dim_size(1),
@ -939,21 +1059,32 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
    // Creates a memory callback for the workspace. The memory lives to the end
    // of this kernel calls.
    CudnnRNNWorkspaceAllocator workspace_allocator(context);
-    bool launch_status =
-        stream
-            ->ThenRnnBackward(
-                *rnn_desc, *input_desc, input_data, *hidden_state_desc,
-                input_h_data, *hidden_state_desc, input_c_data, params_data,
-                *output_desc, output_data, *hidden_state_desc, output_h_data,
-                *hidden_state_desc, output_c_data, output_backprop_data,
-                output_h_backprop_data, output_c_backprop_data,
-                &input_backprop_data, &input_h_backprop_data,
-                &input_c_backprop_data, &params_backprop_data,
-                &reserve_space_uint8, &workspace_allocator)
-            .ok();
+    bool launch_status = false;
+    {
+      mutex_lock l(mu_);
+      launch_status =
+          stream
+              ->ThenRnnBackward(
+                  *rnn_desc_, *input_desc, input_data, *hidden_state_desc,
+                  input_h_data, *hidden_state_desc, input_c_data, params_data,
+                  *output_desc, output_data, *hidden_state_desc, output_h_data,
+                  *hidden_state_desc, output_c_data, output_backprop_data,
+                  output_h_backprop_data, output_c_backprop_data,
+                  &input_backprop_data, &input_h_backprop_data,
+                  &input_c_backprop_data, &params_backprop_data,
+                  &reserve_space_uint8, &workspace_allocator)
+              .ok();
+    }
    OP_REQUIRES(context, launch_status,
                errors::Internal("Failed to call ThenRnnBackward"));
  }
+
+ private:
+  mutex mu_;
+  std::unique_ptr<CudnnModelShapes> model_shapes_ GUARDED_BY(mu_);
+  std::unique_ptr<RnnDescriptor> rnn_desc_ GUARDED_BY(mu_);
+  std::unique_ptr<CudnnRNNPersistentSpaceAllocator> dropout_state_allocator_
+      GUARDED_BY(mu_);
 };

 REGISTER_KERNEL_BUILDER(
--- a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
+++ b/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
@ -35,6 +35,9 @@ input_mode: Indicate whether there is a linear projection between the input and
    input_size == num_units; otherwise, it implies 'linear_input'.
 direction: Indicates whether a bidirectional model will be used.
    dir = (direction == bidirectional) ? 2 : 1
+dropout: dropout probability. When set to 0., dropout is disabled.
+seed: the 1st part of a seed to initialize dropout.
+seed2: the 2nd part of a seed to initialize dropout.
 )doc";

 constexpr auto kCudnnRNNParamsBuffer = R"doc(
@ -77,6 +80,9 @@ REGISTER_OP("CudnnRNNParamsSize")
    .Attr(kRNNModeAttrs)
    .Attr(kRNNInputModeAttrs)
    .Attr(kRNNDirectionAttrs)
+    .Attr("dropout: float = 0.0")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
    .Output("params_size: S")
    .SetShapeFn([](InferenceContext* c) {
      c->set_output(0, c->Vector(1));
@ -119,6 +125,7 @@ REGISTER_OP("CudnnRNN")
    .Input("input_h: T")
    .Input("input_c: T")
    .Input("params: T")
+    .SetIsStateful()
    .Output("output: T")
    .Output("output_h: T")
    .Output("output_c: T")
@ -127,7 +134,7 @@ REGISTER_OP("CudnnRNN")
    .Attr(kRNNModeAttrs)
    .Attr(kRNNInputModeAttrs)
    .Attr(kRNNDirectionAttrs)
-    .Attr("dropout: float")
+    .Attr("dropout: float = 0.0")
    .Attr("seed: int = 0")
    .Attr("seed2: int = 0")
    .Attr("is_training: bool = true")
@ -158,7 +165,8 @@ REGISTER_OP("CudnnRNN")
 Computes the RNN from the input and initial states, with respect to the params
 buffer.
 )doc",
-                         kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(), R"doc(
+                         kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(),
+                         R"doc(
 is_training: Indicates whether this operation is used for inferenece or
    training.
 reserve_space: an opaque tensor that can be used in backprop calculation. It
@ -185,6 +193,9 @@ REGISTER_OP("CudnnRNNBackprop")
    .Attr(kRNNModeAttrs)
    .Attr(kRNNInputModeAttrs)
    .Attr(kRNNDirectionAttrs)
+    .Attr("dropout: float = 0.0")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
    .SetShapeFn([](InferenceContext* c) {
      auto input_shape = c->input(0);
      auto input_h_shape = c->input(1);
@ -199,7 +210,8 @@ REGISTER_OP("CudnnRNNBackprop")
    .Doc(strings::StrCat(R"doc(
 Compute the backprop of both data and weights in a RNN.
 )doc",
-                         kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(), R"doc(
+                         kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(),
+                         R"doc(
 output_backprop: A 3-D tensor with the same shape as output in the forward pass.
 output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
    pass.
@ -228,6 +240,9 @@ REGISTER_OP("CudnnRNNParamsToCanonical")
    .Attr(kRNNModeAttrs)
    .Attr(kRNNInputModeAttrs)
    .Attr(kRNNDirectionAttrs)
+    .Attr("dropout: float = 0.0")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
    .SetShapeFn([](InferenceContext* c) {
      ShapeHandle unused;
      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &unused));
@ -268,6 +283,9 @@ REGISTER_OP("CudnnRNNCanonicalToParams")
    .Attr(kRNNModeAttrs)
    .Attr(kRNNInputModeAttrs)
    .Attr(kRNNDirectionAttrs)
+    .Attr("dropout: float = 0.0")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
    .SetShapeFn([](InferenceContext* c) {
      c->set_output(0, c->Vector(InferenceContext::kUnknownDim));
      return Status::OK();
@ -281,7 +299,6 @@ upcoming training or inferences.
 num_params: number of parameter sets for all layers.
    Each layer may contain multiple parameter sets, with each set consisting of
    a weight matrix and a bias vector.
-)doc",
-                         kCudnnRNNCommonAttrs));
+)doc", kCudnnRNNCommonAttrs));

 }  // namespace tensorflow
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
@ -38,15 +38,24 @@ from tensorflow.python.training import saver as saver_lib

 class CudnnRNNTest(TensorFlowTestCase):

-  def _CreateModel(self, rnn_mode, num_layers, num_units, input_size):
+  def _CreateModel(self,
+                   rnn_mode,
+                   num_layers,
+                   num_units,
+                   input_size,
+                   dropout=0.):
    if rnn_mode == "lstm":
-      model = cudnn_rnn_ops.CudnnLSTM(num_layers, num_units, input_size)
+      model = cudnn_rnn_ops.CudnnLSTM(
+          num_layers, num_units, input_size, dropout=dropout)
    elif rnn_mode == "gru":
-      model = cudnn_rnn_ops.CudnnGRU(num_layers, num_units, input_size)
+      model = cudnn_rnn_ops.CudnnGRU(
+          num_layers, num_units, input_size, dropout=dropout)
    elif rnn_mode == "rnn_tanh":
-      model = cudnn_rnn_ops.CudnnRNNTanh(num_layers, num_units, input_size)
+      model = cudnn_rnn_ops.CudnnRNNTanh(
+          num_layers, num_units, input_size, dropout=dropout)
    elif rnn_mode == "rnn_relu":
-      model = cudnn_rnn_ops.CudnnRNNRelu(num_layers, num_units, input_size)
+      model = cudnn_rnn_ops.CudnnRNNRelu(
+          num_layers, num_units, input_size, dropout=dropout)
    else:
      raise ValueError("Invalid rnn_mode: %s" % rnn_mode)
    return model
@ -174,9 +183,11 @@ class CudnnRNNTest(TensorFlowTestCase):
        self._testOneLSTMParamsSize(num_layers, num_units, input_size)

  def _testOneSimpleInference(self, rnn_mode, num_layers, num_units, input_size,
-                              batch_size, seq_length, dir_count, expected,
-                              tolerance):
-    model = self._CreateModel(rnn_mode, num_layers, num_units, input_size)
+                              batch_size, seq_length, dir_count, dropout,
+                              expected, tolerance):
+    random_seed.set_random_seed(5678)
+    model = self._CreateModel(rnn_mode, num_layers, num_units, input_size,
+                              dropout)
    has_input_c = (rnn_mode == "lstm")
    params_size_t = model.params_size()
    input_data = array_ops.ones([seq_length, batch_size, input_size])
@ -206,18 +217,24 @@ class CudnnRNNTest(TensorFlowTestCase):
    with self.test_session(use_gpu=True) as sess:
      sess.run(variables.global_variables_initializer())
      total_sum_v = sess.run([total_sum])
+
      self.assertAllClose(
          total_sum_v[0], expected, atol=tolerance, rtol=tolerance)

  @unittest.skipUnless(test.is_built_with_cuda(),
                       "Test only applicable when running on GPUs")
  def testSimpleInference(self):
+    # Cudnn scales result for dropout during training, therefore dropout has no
+    # impact for inference results.
+    # (lstm, gru, rnn_tanh are saturated in the test. rnn_relu case is most
+    # demonstrative of the dropout-invariant nature of CudnnRnn.)
    test_configs = [
-        [
-            "lstm",
-            231833.22,
-            1e-2,
-            {
+        {
+            "rnn_mode": "lstm",
+            "dropout": [0., 0.5, 1.],
+            "expected": 231833.22,
+            "tolerance": 1e-2,
+            "shape": {
                "num_layers": 4,
                "num_units": 200,
                "input_size": 200,
@ -225,12 +242,13 @@ class CudnnRNNTest(TensorFlowTestCase):
                "seq_length": 10,
                "dir_count": 1,
            },
-        ],
-        [
-            "gru",
-            56000,
-            1e-2,
-            {
+        },
+        {
+            "rnn_mode": "gru",
+            "dropout": [0., 0.5, 1.],
+            "expected": 56000,
+            "tolerance": 1e-2,
+            "shape": {
                "num_layers": 4,
                "num_units": 200,
                "input_size": 200,
@ -238,12 +256,13 @@ class CudnnRNNTest(TensorFlowTestCase):
                "seq_length": 10,
                "dir_count": 1,
            },
-        ],
-        [
-            "rnn_tanh",
-            56000,
-            1e-2,
-            {
+        },
+        {
+            "rnn_mode": "rnn_tanh",
+            "dropout": [0., 0.5, 1.],
+            "expected": 56000,
+            "tolerance": 1e-2,
+            "shape": {
                "num_layers": 4,
                "num_units": 200,
                "input_size": 200,
@ -251,12 +270,13 @@ class CudnnRNNTest(TensorFlowTestCase):
                "seq_length": 10,
                "dir_count": 1,
            },
-        ],
-        [
-            "rnn_relu",
-            130688,
-            1e-2,
-            {
+        },
+        {
+            "rnn_mode": "rnn_relu",
+            "dropout": [0., 0.5, 1.],
+            "expected": 130688,
+            "tolerance": 1e-2,
+            "shape": {
                "num_layers": 2,
                "num_units": 8,
                "input_size": 4,
@ -264,24 +284,32 @@ class CudnnRNNTest(TensorFlowTestCase):
                "seq_length": 2,
                "dir_count": 1,
            },
-        ],
+        },
    ]
    with ops.Graph().as_default():
      for config in test_configs:
-        rnn_mode = config[0]
-        expected = config[1]
-        tolerance = config[2]
-        shapes = config[3]
-        self._testOneSimpleInference(rnn_mode, shapes["num_layers"],
-                                     shapes["num_units"], shapes["input_size"],
-                                     shapes["batch_size"], shapes["seq_length"],
-                                     shapes["dir_count"], expected, tolerance)
+        rnn_mode = config["rnn_mode"]
+        dropout_list = config.get("dropout", [0.])
+        expected = config["expected"]
+        tolerance = config["tolerance"]
+        shape = config["shape"]
+        for dropout in dropout_list:
+          self._testOneSimpleInference(
+              rnn_mode, shape["num_layers"], shape["num_units"],
+              shape["input_size"], shape["batch_size"], shape["seq_length"],
+              shape["dir_count"], dropout, expected, tolerance)

  def _testOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size,
-                             batch_size, seq_length, dir_count, tolerance):
+                             batch_size, seq_length, dir_count, dropout,
+                             tolerance):
+    # Gradient checking runs two forward ops with almost the same input. Need to
+    # make sure the drop patterns across the two runs are the same.
+    old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False))
+    os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True)
    has_input_c = (rnn_mode == "lstm")
    random_seed.set_random_seed(1234)
-    model = self._CreateModel(rnn_mode, num_layers, num_units, input_size)
+    model = self._CreateModel(rnn_mode, num_layers, num_units, input_size,
+                              dropout)
    params_size_t = model.params_size()
    input_data = variables.Variable(
        random_ops.random_uniform([seq_length, batch_size, input_size]))
@ -294,6 +322,7 @@ class CudnnRNNTest(TensorFlowTestCase):
      input_c = variables.Variable(
          random_ops.random_uniform(
              [num_layers * dir_count, batch_size, num_units]))
+
      output, output_h, output_c = model(
          input_data=input_data,
          input_h=input_h,
@ -322,18 +351,22 @@ class CudnnRNNTest(TensorFlowTestCase):
      sess.run(variables.global_variables_initializer())
      all_inputs = [entry[0] for entry in inputs_and_shapes]
      all_shapes = [entry[1] for entry in inputs_and_shapes]
+
      err = gradient_checker.compute_gradient_error(all_inputs, all_shapes,
                                                    total_sum, [1])
+
      self.assertLess(err, tolerance)
+      os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state

  @unittest.skipUnless(test.is_built_with_cuda(),
                       "Test only applicable when running on GPUs")
  def testSimpleTraining(self):
    test_configs = [
-        [
-            "lstm",
-            1e-2,
-            {
+        {
+            "rnn_mode": "lstm",
+            "dropout": [0., 0.5, 1.],
+            "tolerance": 1e-2,
+            "shape": {
                "num_layers": 2,
                "num_units": 3,
                "input_size": 4,
@ -341,11 +374,12 @@ class CudnnRNNTest(TensorFlowTestCase):
                "seq_length": 4,
                "dir_count": 1,
            },
-        ],
-        [
-            "gru",
-            4e-3,
-            {
+        },
+        {
+            "rnn_mode": "gru",
+            "dropout": [0., 0.5, 1.],
+            "tolerance": 4e-3,
+            "shape": {
                "num_layers": 2,
                "num_units": 3,
                "input_size": 4,
@ -353,11 +387,12 @@ class CudnnRNNTest(TensorFlowTestCase):
                "seq_length": 4,
                "dir_count": 1,
            },
-        ],
-        [
-            "rnn_tanh",
-            5e-3,
-            {
+        },
+        {
+            "rnn_mode": "rnn_tanh",
+            "dropout": [0., 0.5, 1.],
+            "tolerance": 5e-3,
+            "shape": {
                "num_layers": 2,
                "num_units": 3,
                "input_size": 4,
@ -365,11 +400,12 @@ class CudnnRNNTest(TensorFlowTestCase):
                "seq_length": 4,
                "dir_count": 1,
            },
-        ],
-        [
-            "rnn_relu",
-            3e-1,
-            {
+        },
+        {
+            "rnn_mode": "rnn_relu",
+            "dropout": [0., 0.5, 1.],
+            "tolerance": 4e-1,
+            "shape": {
                "num_layers": 2,
                "num_units": 3,
                "input_size": 4,
@ -377,17 +413,19 @@ class CudnnRNNTest(TensorFlowTestCase):
                "seq_length": 4,
                "dir_count": 1,
            },
-        ],
+        },
    ]
    with ops.Graph().as_default():
      for config in test_configs:
-        rnn_mode = config[0]
-        tolerance = config[1]
-        shape = config[2]
-        self._testOneSimpleTraining(rnn_mode, shape["num_layers"],
-                                    shape["num_units"], shape["input_size"],
-                                    shape["batch_size"], shape["seq_length"],
-                                    shape["dir_count"], tolerance)
+        rnn_mode = config["rnn_mode"]
+        dropout_list = config.get("dropout", [0.])
+        tolerance = config["tolerance"]
+        shape = config["shape"]
+        for dropout in dropout_list:
+          self._testOneSimpleTraining(rnn_mode, shape["num_layers"],
+                                      shape["num_units"], shape["input_size"],
+                                      shape["batch_size"], shape["seq_length"],
+                                      shape["dir_count"], dropout, tolerance)


 if __name__ == "__main__":
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@ -23,13 +23,13 @@ from tensorflow.contrib.util import loader
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.training import saver

-
 _cudnn_rnn_ops_so = loader.load_op_library(
    resource_loader.get_path_to_datafile("_cudnn_rnn_ops.so"))

@ -110,12 +110,12 @@ class RNNParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
    if not isinstance(params, tuple):
      params = (params,)
    assign_ops = [
-        state_ops.assign(
-            variable, param, validate_shape=False)
+        state_ops.assign(variable, param, validate_shape=False)
        for variable, param in zip(self._variables, params)
    ]
    return control_flow_ops.group(*assign_ops)

+
 _cudnn_rnn_common_doc_string = """
  Cudnn RNN has an opaque parameter buffer that can be used for inference and
  training. But it is possible that the layout of the parameter buffers
@ -163,8 +163,7 @@ class _CudnnRNN(object):
               input_mode="auto_select",
               direction="unidirectional",
               dropout=0.,
-               seed=0,
-               seed2=0):
+               seed=0):
    """Creates a CudnnRNN model from model spec.

    Args:
@ -183,8 +182,8 @@ class _CudnnRNN(object):
      direction: the direction model that the model operates. Could be either
          'unidirectional' or 'bidirectional'
      dropout: whether to enable dropout. With it is 0, dropout is disabled.
-      seed: the first part of a seed that is used to initialize dropout.
-      seed2: the second part of a seed that is used to initialize dropout.
+      seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+          for behavior.
    """
    self._num_layers = num_layers
    self._num_units = num_units
@ -193,8 +192,10 @@ class _CudnnRNN(object):
    self._input_mode = input_mode
    self._direction = direction
    self._dropout = dropout
-    self._seed = seed
-    self._seed2 = seed2
+    # get graph and op seed.
+    self._seed, self._seed2 = random_seed.get_seed(seed)
+    if self._seed is None and self._seed2 is None:
+      self._seed, self._seed2 = 0, 0

  def params_size(self):
    """Calculates the size of the opaque parameter buffer needed for this model.
@ -208,6 +209,9 @@ class _CudnnRNN(object):
        input_size=self._input_size,
        T=dtypes.float32,
        S=dtypes.int32,
+        dropout=self._dropout,
+        seed=self._seed,
+        seed2=self._seed2,
        rnn_mode=self._rnn_mode,
        input_mode=self._input_mode,
        direction=self._direction)[0]
@ -258,6 +262,9 @@ class _CudnnRNN(object):
        num_units=self._num_units,
        input_size=self._input_size,
        params=params,
+        dropout=self._dropout,
+        seed=self._seed,
+        seed2=self._seed2,
        num_params=self._num_layers * self._NUM_PARAMS_PER_LAYER,
        rnn_mode=self._rnn_mode,
        input_mode=self._input_mode,
@ -280,6 +287,9 @@ class _CudnnRNN(object):
        input_size=self._input_size,
        weights=weights,
        biases=biases,
+        dropout=self._dropout,
+        seed=self._seed,
+        seed2=self._seed2,
        rnn_mode=self._rnn_mode,
        input_mode=self._input_mode,
        direction=self._direction)
@ -299,8 +309,7 @@ class CudnnLSTM(_CudnnRNN):
               input_mode="auto_select",
               direction="unidirectional",
               dropout=0.,
-               seed=0,
-               seed2=0):
+               seed=0):
    """Creates a Cudnn LSTM model from model spec.

    Args:
@ -317,8 +326,7 @@ class CudnnLSTM(_CudnnRNN):
      direction: the direction model that the model operates. Could be either
          'unidirectional' or 'bidirectional'
      dropout: whether to enable dropout. With it is 0, dropout is disabled.
-      seed: the first part of a seed that is used to initialize dropout.
-      seed2: the second part of a seed that is used to initialize dropout.
+      seed: the seed used for initializing dropout.
    """
    super(CudnnLSTM, self).__init__(
        "lstm",
@ -328,8 +336,7 @@ class CudnnLSTM(_CudnnRNN):
        input_mode=input_mode,
        direction=direction,
        dropout=dropout,
-        seed=seed,
-        seed2=seed2)
+        seed=seed)

  def __call__(self, input_data, input_h, input_c, params, is_training=True):
    """Runs the forward step for the Cudnn LSTM model.
@ -346,11 +353,8 @@ class CudnnLSTM(_CudnnRNN):
      output_h: the final state for h.
      output_c: the final state for c.
    """
-    output, output_h, output_c = super(CudnnLSTM, self).__call__(input_data,
-                                                                 input_h,
-                                                                 input_c,
-                                                                 params,
-                                                                 is_training)
+    output, output_h, output_c = super(CudnnLSTM, self).__call__(
+        input_data, input_h, input_c, params, is_training=is_training)
    return (output, output_h, output_c)


@ -365,8 +369,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
               input_mode="auto_select",
               direction="unidirectional",
               dropout=0.,
-               seed=0,
-               seed2=0):
+               seed=0):
    """Creates a Cudnn RNN model from model without hidden-state C.

    Args:
@ -383,8 +386,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
      direction: the direction model that the model operates. Could be either
          'unidirectional' or 'bidirectional'
      dropout: whether to enable dropout. With it is 0, dropout is disabled.
-      seed: the first part of a seed that is used to initialize dropout.
-      seed2: the second part of a seed that is used to initialize dropout.
+      seed: the seed used for initializing dropout.
    """
    super(_CudnnRNNNoInputC, self).__init__(
        self._rnn_mode,
@ -394,8 +396,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
        input_mode=input_mode,
        direction=direction,
        dropout=dropout,
-        seed=seed,
-        seed2=seed2)
+        seed=seed)

  def __call__(self, input_data, input_h, params, is_training=True):
    """Runs the forward step for the Cudnn LSTM model.
@ -459,6 +460,9 @@ def _cudnn_rnn_backward(op, *grad):
      output_h_backprop=grad[1],
      output_c_backprop=grad[2],
      reserve_space=op.outputs[3],
+      dropout=op.get_attr("dropout"),
+      seed=op.get_attr("seed"),
+      seed2=op.get_attr("seed2"),
      rnn_mode=op.get_attr("rnn_mode"),
      input_mode=op.get_attr("input_mode"),
      direction=op.get_attr("direction"))
--- a/tensorflow/contrib/linear_optimizer/BUILD
+++ b/tensorflow/contrib/linear_optimizer/BUILD
@ -111,13 +111,11 @@ py_library(
    srcs_version = "PY2AND3",
    deps = [
        ":sdca_ops_py",
-        ":sparse_feature_column_py",
        "//tensorflow/contrib/framework:framework_py",
        "//tensorflow/contrib/layers:layers_py",
        "//tensorflow/contrib/learn",
        "//tensorflow/python:array_ops",
        "//tensorflow/python:dtypes",
-        "//tensorflow/python:math_ops",
        "//tensorflow/python:sparse_tensor",
        "//tensorflow/python:tensor_util",
        "//tensorflow/python:training",
--- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
@ -24,13 +24,10 @@ from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 from tensorflow.contrib.linear_optimizer.python import sdca_optimizer
-from tensorflow.contrib.linear_optimizer.python.ops import sdca_ops
-from tensorflow.contrib.linear_optimizer.python.ops.sparse_feature_column import SparseFeatureColumn
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import session_run_hook

@ -76,131 +73,6 @@ def _add_bias_column(feature_columns, columns_to_tensors, bias_variable,
  columns_to_variables[bias_column] = [bias_variable]


-def _get_sdca_train_step(optimizer, columns_to_variables, weight_column_name,
-                         loss_type, features, targets, global_step):
-  """Returns the training operation of an SdcaModel optimizer."""
-
-  def _dense_tensor_to_sparse_feature_column(dense_tensor):
-    """Returns SparseFeatureColumn for the input dense_tensor."""
-    ignore_value = 0.0
-    sparse_indices = array_ops.where(
-        math_ops.not_equal(dense_tensor,
-                           math_ops.cast(ignore_value, dense_tensor.dtype)))
-    sparse_values = array_ops.gather_nd(dense_tensor, sparse_indices)
-    # TODO(sibyl-Aix6ihai, sibyl-vie3Poto): Makes this efficient, as now SDCA supports
-    # very sparse features with weights and not weights.
-    return SparseFeatureColumn(
-        array_ops.reshape(
-            array_ops.split(value=sparse_indices, num_or_size_splits=2,
-                            axis=1)[0], [-1]),
-        array_ops.reshape(
-            array_ops.split(value=sparse_indices, num_or_size_splits=2,
-                            axis=1)[1], [-1]),
-        array_ops.reshape(math_ops.to_float(sparse_values), [-1]))
-
-  def _training_examples_and_variables():
-    """Returns dictionaries for training examples and variables."""
-    batch_size = targets.get_shape()[0]
-
-    # Iterate over all feature columns and create appropriate lists for dense
-    # and sparse features as well as dense and sparse weights (variables) for
-    # SDCA.
-    # TODO(sibyl-vie3Poto): Reshape variables stored as values in column_to_variables
-    # dict as 1-dimensional tensors.
-    dense_features, sparse_features, sparse_feature_with_values = [], [], []
-    dense_feature_weights = []
-    sparse_feature_weights, sparse_feature_with_values_weights = [], []
-    for column in sorted(columns_to_variables.keys(), key=lambda x: x.key):
-      transformed_tensor = features[column]
-      if isinstance(column, layers.feature_column._RealValuedColumn):  # pylint: disable=protected-access
-        # A real-valued column corresponds to a dense feature in SDCA. A
-        # transformed tensor corresponding to a RealValuedColumn has rank 2
-        # (its shape is typically [batch_size, column.dimension]) and so it
-        # can be passed to SDCA as is.
-        dense_features.append(transformed_tensor)
-        # For real valued columns, the variables list contains exactly one
-        # element.
-        dense_feature_weights.append(columns_to_variables[column][0])
-      elif isinstance(column, layers.feature_column._BucketizedColumn):  # pylint: disable=protected-access
-        # A bucketized column corresponds to a sparse feature in SDCA. The
-        # bucketized feature is "sparsified" for SDCA by converting it to a
-        # SparseFeatureColumn respresenting the one-hot encoding of the
-        # bucketized feature.
-        #
-        # TODO(sibyl-vie3Poto): Explore whether it is more efficient to translate a
-        # bucketized feature column to a dense feature in SDCA. This will likely
-        # depend on the number of buckets.
-        dense_bucket_tensor = column._to_dnn_input_layer(transformed_tensor)  # pylint: disable=protected-access
-        sparse_feature_column = _dense_tensor_to_sparse_feature_column(
-            dense_bucket_tensor)
-        sparse_feature_with_values.append(sparse_feature_column)
-        # For bucketized columns, the variables list contains exactly one
-        # element.
-        sparse_feature_with_values_weights.append(
-            columns_to_variables[column][0])
-      elif isinstance(
-          column,
-          (
-              layers.feature_column._CrossedColumn,  # pylint: disable=protected-access
-              layers.feature_column._SparseColumn)):  # pylint: disable=protected-access
-        sparse_features.append(
-            SparseFeatureColumn(
-                array_ops.reshape(
-                    array_ops.split(
-                        value=transformed_tensor.indices,
-                        num_or_size_splits=2,
-                        axis=1)[0], [-1]),
-                array_ops.reshape(transformed_tensor.values, [-1]), None))
-        sparse_feature_weights.append(columns_to_variables[column][0])
-      elif isinstance(column, layers.feature_column._WeightedSparseColumn):  # pylint: disable=protected-access
-        id_tensor = column.id_tensor(transformed_tensor)
-        weight_tensor = column.weight_tensor(transformed_tensor)
-        sparse_feature_with_values.append(
-            SparseFeatureColumn(
-                array_ops.reshape(
-                    array_ops.split(
-                        value=id_tensor.indices, num_or_size_splits=2, axis=1)[
-                            0], [-1]),
-                array_ops.reshape(id_tensor.values, [-1]),
-                array_ops.reshape(weight_tensor.values, [-1])))
-        sparse_feature_with_values_weights.append(
-            columns_to_variables[column][0])
-      else:
-        raise ValueError("SDCAOptimizer does not support column type {}".format(
-            type(column).__name__))
-
-    example_weights = array_ops.reshape(
-        features[weight_column_name],
-        shape=[-1]) if weight_column_name else array_ops.ones([batch_size])
-    example_ids = features[optimizer.example_id_column]
-    sparse_feature_with_values.extend(sparse_features)
-    sparse_feature_with_values_weights.extend(sparse_feature_weights)
-    examples = dict(
-        sparse_features=sparse_feature_with_values,
-        dense_features=dense_features,
-        example_labels=math_ops.to_float(
-            array_ops.reshape(targets, shape=[-1])),
-        example_weights=example_weights,
-        example_ids=example_ids)
-    sdca_variables = dict(
-        sparse_features_weights=sparse_feature_with_values_weights,
-        dense_features_weights=dense_feature_weights)
-    return examples, sdca_variables
-
-  training_examples, training_variables = _training_examples_and_variables()
-  sdca_model = sdca_ops.SdcaModel(
-      examples=training_examples,
-      variables=training_variables,
-      options=dict(
-          symmetric_l1_regularization=optimizer.symmetric_l1_regularization,
-          symmetric_l2_regularization=optimizer.symmetric_l2_regularization,
-          num_loss_partitions=optimizer.num_loss_partitions,
-          num_table_shards=optimizer.num_table_shards,
-          loss_type=loss_type))
-  train_op = sdca_model.minimize(global_step=global_step)
-  return sdca_model, train_op
-
-
 def sdca_model_fn(features, labels, mode, params, config=None):
  """A model_fn for linear models that use the SDCA optimizer.

@ -283,9 +155,9 @@ def sdca_model_fn(features, labels, mode, params, config=None):

  def _train_op_fn(unused_loss):
    global_step = contrib_variables.get_global_step()
-    sdca_model, train_op = _get_sdca_train_step(optimizer, columns_to_variables,
-                                                weight_column_name, loss_type,
-                                                features, labels, global_step)
+    sdca_model, train_op = optimizer.get_train_step(
+        columns_to_variables, weight_column_name, loss_type, features, labels,
+        global_step)
    if update_weights_hook is not None:
      update_weights_hook.set_parameters(sdca_model, train_op)
    return train_op
--- a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
@ -99,16 +99,16 @@ class SDCAOptimizer(object):
  def symmetric_l2_regularization(self):
    return self._symmetric_l2_regularization

-  def get_train_step(self, columns_to_variables,
-                     weight_column_name, loss_type, features, targets,
-                     global_step):
+  def get_train_step(self, columns_to_variables, weight_column_name, loss_type,
+                     features, targets, global_step):
    """Returns the training operation of an SdcaModel optimizer."""

-    def _tensor_to_sparse_feature_column(dense_tensor):
+    def _dense_tensor_to_sparse_feature_column(dense_tensor):
      """Returns SparseFeatureColumn for the input dense_tensor."""
      ignore_value = 0.0
-      sparse_indices = array_ops.where(math_ops.not_equal(
-          dense_tensor, math_ops.cast(ignore_value, dense_tensor.dtype)))
+      sparse_indices = array_ops.where(
+          math_ops.not_equal(dense_tensor,
+                             math_ops.cast(ignore_value, dense_tensor.dtype)))
      sparse_values = array_ops.gather_nd(dense_tensor, sparse_indices)
      # TODO(sibyl-Aix6ihai, sibyl-vie3Poto): Makes this efficient, as now SDCA supports
      # very sparse features with weights and not weights.
@ -133,10 +133,9 @@ class SDCAOptimizer(object):
      dense_features, sparse_features, sparse_feature_with_values = [], [], []
      dense_feature_weights = []
      sparse_feature_weights, sparse_feature_with_values_weights = [], []
-      # pylint: disable=protected-access
      for column in sorted(columns_to_variables.keys(), key=lambda x: x.key):
        transformed_tensor = features[column]
-        if isinstance(column, layers.feature_column._RealValuedColumn):
+        if isinstance(column, layers.feature_column._RealValuedColumn):  # pylint: disable=protected-access
          # A real-valued column corresponds to a dense feature in SDCA. A
          # transformed tensor corresponding to a RealValuedColumn has rank 2
          # (its shape is typically [batch_size, column.dimension]) and so it
@ -145,22 +144,28 @@ class SDCAOptimizer(object):
          # For real valued columns, the variables list contains exactly one
          # element.
          dense_feature_weights.append(columns_to_variables[column][0])
-        elif isinstance(column, layers.feature_column._BucketizedColumn):
+        elif isinstance(column, layers.feature_column._BucketizedColumn):  # pylint: disable=protected-access
          # A bucketized column corresponds to a sparse feature in SDCA. The
          # bucketized feature is "sparsified" for SDCA by converting it to a
          # SparseFeatureColumn respresenting the one-hot encoding of the
          # bucketized feature.
-          dense_bucket_tensor = layers.input_from_feature_columns(
-              {column: transformed_tensor}, [column])
-          sparse_feature_column = _tensor_to_sparse_feature_column(
+          #
+          # TODO(sibyl-vie3Poto): Explore whether it is more efficient to translate a
+          # bucketized feature column to a dense feature in SDCA. This will
+          # likely depend on the number of buckets.
+          dense_bucket_tensor = column._to_dnn_input_layer(transformed_tensor)  # pylint: disable=protected-access
+          sparse_feature_column = _dense_tensor_to_sparse_feature_column(
              dense_bucket_tensor)
          sparse_feature_with_values.append(sparse_feature_column)
          # For bucketized columns, the variables list contains exactly one
          # element.
          sparse_feature_with_values_weights.append(
              columns_to_variables[column][0])
-        elif isinstance(column, (layers.feature_column._CrossedColumn,
-                                 layers.feature_column._SparseColumn)):
+        elif isinstance(
+            column,
+            (
+                layers.feature_column._CrossedColumn,  # pylint: disable=protected-access
+                layers.feature_column._SparseColumn)):  # pylint: disable=protected-access
          sparse_features.append(
              SparseFeatureColumn(
                  array_ops.reshape(
@ -168,10 +173,9 @@ class SDCAOptimizer(object):
                          value=transformed_tensor.indices,
                          num_or_size_splits=2,
                          axis=1)[0], [-1]),
-                  array_ops.reshape(transformed_tensor.values, [-1]),
-                  None))
+                  array_ops.reshape(transformed_tensor.values, [-1]), None))
          sparse_feature_weights.append(columns_to_variables[column][0])
-        elif isinstance(column, layers.feature_column._WeightedSparseColumn):
+        elif isinstance(column, layers.feature_column._WeightedSparseColumn):  # pylint: disable=protected-access
          id_tensor = column.id_tensor(transformed_tensor)
          weight_tensor = column.weight_tensor(transformed_tensor)
          sparse_feature_with_values.append(
@ -183,11 +187,10 @@ class SDCAOptimizer(object):
                  array_ops.reshape(id_tensor.values, [-1]),
                  array_ops.reshape(weight_tensor.values, [-1])))
          sparse_feature_with_values_weights.append(
-            columns_to_variables[column][0])
+              columns_to_variables[column][0])
        else:
          raise ValueError('SDCAOptimizer does not support column type %s.' %
                           type(column).__name__)
-      # pylint: enable=protected-access

      example_weights = array_ops.reshape(
          features[weight_column_name],
@ -195,12 +198,13 @@ class SDCAOptimizer(object):
      example_ids = features[self._example_id_column]
      sparse_feature_with_values.extend(sparse_features)
      sparse_feature_with_values_weights.extend(sparse_feature_weights)
-      examples = dict(sparse_features=sparse_feature_with_values,
-                      dense_features=dense_features,
-                      example_labels=math_ops.to_float(array_ops.reshape(
-                          targets, shape=[-1])),
-                      example_weights=example_weights,
-                      example_ids=example_ids)
+      examples = dict(
+          sparse_features=sparse_feature_with_values,
+          dense_features=dense_features,
+          example_labels=math_ops.to_float(
+              array_ops.reshape(targets, shape=[-1])),
+          example_weights=example_weights,
+          example_ids=example_ids)
      sdca_variables = dict(
          sparse_features_weights=sparse_feature_with_values_weights,
          dense_features_weights=dense_feature_weights)
--- a/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
@ -11,6 +11,7 @@ tensorflow/core/protobuf/cluster.pb.cc
 tensorflow/core/protobuf/config.pb.cc
 tensorflow/core/protobuf/rewriter_config.pb.cc
 tensorflow/core/protobuf/debug.pb.cc
+tensorflow/core/protobuf/device_properties.pb.cc
 tensorflow/core/lib/core/error_codes.pb.cc
 tensorflow/core/framework/versions.pb.cc
 tensorflow/core/framework/variable.pb.cc
@ -36,3 +37,4 @@ tensorflow/core/framework/attr_value.pb.cc
 tensorflow/core/framework/allocation_description.pb.cc
 tensorflow/core/example/feature.pb.cc
 tensorflow/core/example/example.pb.cc
+tensorflow/core/grappler/costs/op_performance_data.pb.cc
--- a/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
@ -10,6 +10,7 @@ tensorflow/core/protobuf/meta_graph.pb.h
 tensorflow/core/protobuf/cluster.pb.h
 tensorflow/core/protobuf/config.pb.h
 tensorflow/core/protobuf/debug.pb.h
+tensorflow/core/protobuf/device_properties.pb.h
 tensorflow/core/protobuf/rewriter_config.pb.h
 tensorflow/core/protobuf/tensor_bundle.pb.h
 tensorflow/core/lib/core/error_codes.pb.h
@ -37,3 +38,4 @@ tensorflow/core/framework/attr_value.pb.h
 tensorflow/core/framework/allocation_description.pb.h
 tensorflow/core/example/feature.pb.h
 tensorflow/core/example/example.pb.h
+tensorflow/core/grappler/costs/op_performance_data.pb.h
--- a/tensorflow/contrib/makefile/tf_proto_files.txt
+++ b/tensorflow/contrib/makefile/tf_proto_files.txt
@ -10,6 +10,7 @@ tensorflow/core/protobuf/meta_graph.proto
 tensorflow/core/protobuf/cluster.proto
 tensorflow/core/protobuf/config.proto
 tensorflow/core/protobuf/debug.proto
+tensorflow/core/protobuf/device_properties.proto
 tensorflow/core/protobuf/rewriter_config.proto
 tensorflow/core/protobuf/tensor_bundle.proto
 tensorflow/core/lib/core/error_codes.proto
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@ -8,11 +8,13 @@ exports_files(["LICENSE"])
 package(default_visibility = ["//tensorflow:__subpackages__"])

 load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")

 py_library(
    name = "opt_py",
    srcs = [
        "__init__.py",
+        "python/training/drop_stale_gradient_optimizer.py",
        "python/training/external_optimizer.py",
        "python/training/lazy_adam_optimizer.py",
        "python/training/moving_average_optimizer.py",
@ -104,6 +106,22 @@ py_test(
    ],
 )

+tf_py_test(
+    name = "drop_stale_gradient_optimizer_test",
+    srcs = ["python/training/drop_stale_gradient_optimizer_test.py"],
+    additional_deps = [
+        ":opt_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+    ],
+)
+
 filegroup(
    name = "all_files",
    srcs = glob(
--- a/tensorflow/contrib/opt/init.py
+++ b/tensorflow/contrib/opt/init.py
@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function

 # pylint: disable=wildcard-import
+from tensorflow.contrib.opt.python.training.drop_stale_gradient_optimizer import *
 from tensorflow.contrib.opt.python.training.external_optimizer import *
 from tensorflow.contrib.opt.python.training.lazy_adam_optimizer import *
 from tensorflow.contrib.opt.python.training.moving_average_optimizer import *
@ -27,7 +28,8 @@ from tensorflow.contrib.opt.python.training.variable_clipping_optimizer import *

 from tensorflow.python.util.all_util import remove_undocumented

-_allowed_symbols = ['ExternalOptimizerInterface',
+_allowed_symbols = ['DropStaleGradientOptimizer',
+                    'ExternalOptimizerInterface',
                    'LazyAdamOptimizer',
                    'MovingAverageOptimizer',
                    'ScipyOptimizerInterface',
--- a/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer.py
@ -0,0 +1,112 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Wrapper optimizer for checking and dropping stale gradients."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.summary import summary
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_util
+
+
+class DropStaleGradientOptimizer(optimizer.Optimizer):
+  """Wrapper optimizer that checks and drops stale gradient.
+
+  This optimizer records the global step for each worker before computing
+  gradients and compares it with the global step at the time of applying the
+  gradients. If the difference is larger than a threshold, it will drop all
+  the computed gradients.
+  """
+
+  def __init__(self,
+               opt,
+               staleness,
+               use_locking=False,
+               name="DropStaleGradient"):
+    """Constructs a new DropStaleGradientOptimizer.
+
+    Args:
+      opt: The actual optimizer that will be used to compute and apply the
+           gradients. Must be one of the Optimizer classes.
+      staleness: The maximum staleness allowed for the optimizer.
+      use_locking: If `True` use locks for clip update operations.
+      name: Optional name prefix for the operations created when applying
+            gradients. Defaults to "DropStaleGradient".
+    """
+    super(DropStaleGradientOptimizer, self).__init__(use_locking, name)
+    self._opt = opt
+    self._staleness = staleness
+
+  def compute_gradients(self, loss, *args, **kwargs):
+    # Record current global step for worker.
+    with ops.colocate_with(loss):
+      self._local_step = training_util.get_global_step() + 0
+
+    with ops.control_dependencies([self._local_step]):
+      loss = gen_array_ops.identity(loss)
+      return self._opt.compute_gradients(loss, *args, **kwargs)
+
+  def get_slot(self, *args, **kwargs):
+    return self._opt.get_slot(*args, **kwargs)
+
+  def get_slot_names(self, *args, **kwargs):
+    return self._opt.get_slot_names(*args, **kwargs)
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    gradients = []
+    # Number of stale gradients.
+    stale_counter = variable_scope.get_variable(
+        "stale_counter", [],
+        initializer=init_ops.zeros_initializer(),
+        trainable=False)
+
+    def _AcceptGradientOp():
+      with ops.control_dependencies(
+          [self._opt.apply_gradients(
+              grads_and_vars, global_step=global_step, name=name)]):
+        return gen_array_ops.identity(0.0)
+
+    def _DropGradientOp():
+      return gen_array_ops.identity(1.0)
+
+    for grad_and_var in grads_and_vars:
+      grad = grad_and_var[0]
+      if isinstance(grad, ops.Tensor):
+        gradients.append(grad)
+      else:
+        gradients.append(grad.op)
+
+    with ops.control_dependencies(gradients), ops.colocate_with(global_step):
+      staleness = gen_array_ops.reshape(
+          global_step - self._local_step, shape=())
+      conditional_update = stale_counter.assign_add(control_flow_ops.cond(
+          gen_math_ops.less_equal(staleness, self._staleness),
+          _AcceptGradientOp, _DropGradientOp))
+
+    summary.scalar(
+        "Gradient staleness percentage",
+        stale_counter / (math_ops.cast(global_step + 1, dtypes.float32)))
+    return conditional_update
--- a/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer_test.py
@ -0,0 +1,297 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for DropStaleGradientOptimizer."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import portpicker
+
+from tensorflow.contrib.opt.python.training import drop_stale_gradient_optimizer
+from tensorflow.python.client import session
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import server_lib
+from tensorflow.python.training import training_util
+
+
+# Creates the workers and return their sessions, graphs, train_ops.
+def _get_workers(num_workers, staleness):
+  worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
+  cluster_dict = {
+      'worker': ['localhost:%s' % port for port in worker_ports],
+      'ps': ['localhost:%s' % portpicker.pick_unused_port()]
+  }
+  cs = server_lib.ClusterSpec(cluster_dict)
+  workers = [
+      server_lib.Server(
+          cs, job_name='worker', task_index=ix, start=True)
+      for ix in range(num_workers)
+  ]
+  server_lib.Server(cs, job_name='ps', task_index=0, start=True)
+
+  sessions = []
+  graphs = []
+  train_ops = []
+
+  # To simulate stale cases, maintaining two queues for computing and
+  # applying gradients respectively. In the phase of computing gradients,
+  # all workers except chief worker compute gradients together and chief worker
+  # computes after all other worers' computing finished. In the phase of
+  # applying gradients, chief worker will first apply gradients, then all other
+  # workers will apply gradients one by one. Therefore, the chief worker will
+  # always have 0 staleness, each of all other workers will have a unique
+  # staleness value from [1, num_workers).
+  for worker_id in range(num_workers):
+    graph = ops.Graph()
+    with graph.as_default():
+      global_step = training_util.create_global_step()
+      var_0 = variables.Variable(0.0, name='v0')
+      var_1 = variables.Variable(1.0, name='v1')
+      compute_gradients_queue = data_flow_ops.FIFOQueue(
+          -1, global_step.dtype.base_dtype, shapes=(),
+          name='compute_gradients_queue', shared_name='compute_gradients_queue')
+      apply_gradients_queue = data_flow_ops.FIFOQueue(
+          -1, global_step.dtype.base_dtype, shapes=(),
+          name='apply_gradients_queue', shared_name='apply_gradients_queue')
+
+      # Gradients for loss on var_0 and var_1 will be 1.0.
+      loss = 0 - var_0 - var_1
+      sgd_opt = gradient_descent.GradientDescentOptimizer(1.0)
+      stale_check_opt = (
+          drop_stale_gradient_optimizer.DropStaleGradientOptimizer(
+              sgd_opt, staleness))
+
+      # Compute gradients.
+      if worker_id == 0:
+        with ops.control_dependencies(
+            [compute_gradients_queue.dequeue_many(num_workers - 1)]):
+          grad_and_vars = stale_check_opt.compute_gradients(loss)
+      else:
+        grad_and_vars = stale_check_opt.compute_gradients(loss)
+        with ops.control_dependencies([t[0] for t in grad_and_vars]):
+          worker_enqueue_op = compute_gradients_queue.enqueue(global_step)
+
+      # Apply gradients.
+      if worker_id == 0:
+        with ops.control_dependencies(
+            [stale_check_opt.apply_gradients(grad_and_vars, global_step)]):
+          train_op = apply_gradients_queue.enqueue(global_step)
+      else:
+        with ops.control_dependencies([worker_enqueue_op]):
+          with ops.control_dependencies([apply_gradients_queue.dequeue()]):
+            with ops.control_dependencies(
+                [stale_check_opt.apply_gradients(
+                    grad_and_vars, global_step)]):
+              train_op = apply_gradients_queue.enqueue(global_step)
+
+      sess = session.Session(workers[worker_id].target)
+
+    sessions.append(sess)
+    graphs.append(graph)
+    train_ops.append(train_op)
+
+  return sessions, graphs, train_ops
+
+
+class DropStaleGradientOptimizerTest(test.TestCase):
+
+  def _run(self, train_op, sess):
+    sess.run(train_op)
+
+  def test1Worker(self):
+    num_workers = 1
+    sessions, graphs, train_ops = _get_workers(num_workers, 0)
+    with graphs[0].as_default():
+      sessions[0].run(variables.global_variables_initializer())
+    global_step = training_util.get_global_step(graphs[0])
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    sessions[0].run(train_ops[0])
+
+    # Verify the updated value after 1 step.
+    self.assertAllEqual(1, sessions[0].run(global_step))
+    self.assertAllEqual(0.0 + 1.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0 + 1.0, sessions[0].run(var_1))
+    self.assertAllEqual(1, sessions[0].run(global_step))
+
+  def test1WorkerNegativeStaleness(self):
+    num_workers = 1
+    sessions, graphs, train_ops = _get_workers(num_workers, -1)
+    with graphs[0].as_default():
+      sessions[0].run(variables.global_variables_initializer())
+    global_step = training_util.get_global_step(graphs[0])
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    sessions[0].run(train_ops[0])
+
+    # Verify no updates because max staleness is negative.
+    self.assertAllEqual(0, sessions[0].run(global_step))
+    self.assertAllEqual(1.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+
+  def test2WorkersStaleness0(self):
+    num_workers = 2
+    sessions, graphs, train_ops = _get_workers(num_workers, 0)
+    with graphs[0].as_default():
+      sessions[0].run(variables.global_variables_initializer())
+    global_step = training_util.get_global_step(graphs[0])
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    thread_0 = self.checkedThread(
+        target=self._run, args=(train_ops[0], sessions[0]))
+    thread_1 = self.checkedThread(
+        target=self._run, args=(train_ops[1], sessions[1]))
+    thread_0.start()
+    thread_1.start()
+    thread_0.join()
+    thread_1.join()
+
+    # With 2 workers and max staleness set to 0, only cheif worker will update
+    # var_0 and var_1.
+    self.assertAllEqual(1, sessions[0].run(global_step))
+    self.assertAllEqual(1.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0.0 + 1.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0 + 1.0, sessions[0].run(var_1))
+
+  def test2WorkersStaleness1(self):
+    num_workers = 2
+    sessions, graphs, train_ops = _get_workers(num_workers, 1)
+    with graphs[0].as_default():
+      sessions[0].run(variables.global_variables_initializer())
+    global_step = training_util.get_global_step(graphs[0])
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    thread_0 = self.checkedThread(
+        target=self._run, args=(train_ops[0], sessions[0]))
+    thread_1 = self.checkedThread(
+        target=self._run, args=(train_ops[1], sessions[1]))
+    thread_0.start()
+    thread_1.start()
+    thread_0.join()
+    thread_1.join()
+
+    # With 2 workers and max staleness set to 1, both workers will update
+    # var_0 and var_1.
+    self.assertAllEqual(2, sessions[0].run(global_step))
+    self.assertAllEqual(0.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0.0 + 2.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0 + 2.0, sessions[0].run(var_1))
+
+  def test3WorkersStaleness0(self):
+    num_workers = 3
+    sessions, graphs, train_ops = _get_workers(num_workers, 0)
+    with graphs[0].as_default():
+      sessions[0].run(variables.global_variables_initializer())
+    global_step = training_util.get_global_step(graphs[0])
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    thread_0 = self.checkedThread(
+        target=self._run, args=(train_ops[0], sessions[0]))
+    thread_1 = self.checkedThread(
+        target=self._run, args=(train_ops[1], sessions[1]))
+    thread_2 = self.checkedThread(
+        target=self._run, args=(train_ops[2], sessions[2]))
+    thread_0.start()
+    thread_1.start()
+    thread_2.start()
+    thread_0.join()
+    thread_1.join()
+    thread_2.join()
+
+    # With 3 workers and max staleness set to 0, only cheif worker will update
+    # var_0 and var_1.
+    self.assertAllEqual(1, sessions[0].run(global_step))
+    self.assertAllEqual(2.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0.0 + 1.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0 + 1.0, sessions[0].run(var_1))
+
+  def test3WorkersStaleness1(self):
+    num_workers = 3
+    sessions, graphs, train_ops = _get_workers(num_workers, 1)
+    with graphs[0].as_default():
+      sessions[0].run(variables.global_variables_initializer())
+    global_step = training_util.get_global_step(graphs[0])
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    thread_0 = self.checkedThread(
+        target=self._run, args=(train_ops[0], sessions[0]))
+    thread_1 = self.checkedThread(
+        target=self._run, args=(train_ops[1], sessions[1]))
+    thread_2 = self.checkedThread(
+        target=self._run, args=(train_ops[2], sessions[2]))
+    thread_0.start()
+    thread_1.start()
+    thread_2.start()
+    thread_0.join()
+    thread_1.join()
+    thread_2.join()
+
+    # With 3 workers and max staleness set to 1, chief worker and only one of
+    # the two other workers will update var_0 and var_1.
+    self.assertAllEqual(2, sessions[0].run(global_step))
+    self.assertAllEqual(1.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0.0 + 2.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0 + 2.0, sessions[0].run(var_1))
+
+
+if __name__ == '__main__':
+  test.main()
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@ -849,14 +849,12 @@ class RNNCellTest(test.TestCase):
      batch_size = 3
      input_size = 4
      expected_state_c = np.array(
-          [[2.954548e-01, 8.354891e-04],
-           [2.834632e-01, 8.158963e-01],
-           [2.291694e-01, 1.325745e-04]],
+          [[0.00072015, 0.00036633], [0.00083481, 0.00047266],
+           [0.00085111, 0.00053054]],
          dtype=np.float32)
      expected_state_h = np.array(
-          [[2.116566e-01, 5.985238e-04],
-           [2.137760e-01, 6.153145e-01],
-           [1.742966e-01, 1.008306e-04]],
+          [[0.0005159, 0.00026243], [0.00062958, 0.00035646],
+           [0.00064732, 0.00040351]],
          dtype=np.float32)
      with variable_scope.variable_scope(
          "root", initializer=init_ops.constant_initializer(0.5)):
--- a/tensorflow/contrib/tfprof/README.md
+++ b/tensorflow/contrib/tfprof/README.md
@ -11,7 +11,12 @@ Consultants: Jon Shlens, Pete Warden
 1.  Measure model parameters, float operations, tensor shapes.
 2.  Measure op execution times, requested memory size and device placement.
 3.  Inspect checkpoint tensors' shapes and their values.
-4.  Explore model based on name scope or graph structure.
+4.  3 ways to view and explore TensorFlow model profiles
+
+    *  Organize by Python code call stack.
+    *  Organize by TensorFlow operation name scope hierarchies.
+    *  Organize by TensorFlow operation inputs/outputs graph.
+
 5.  Selectively grouping/filtering/accounting/ordering ops.

 tfprof can be used as Python API, Interactive CLI and One-shot Script.
@ -28,7 +33,8 @@ param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
    tfprof_options=tf.contrib.tfprof.model_analyzer.
        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)

-# param_stats is tensorflow.tfprof.TFProfNode proto. It organize the statistics
+# param_stats is tensorflow.tfprof.TFGraphNodeProto proto.
+# It organize the statistics
 # of each graph node in tree scructure. Let's print the root below.
 sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
 ```
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/BUILD
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/BUILD
@ -21,16 +21,34 @@ py_test(
    name = "model_analyzer_test",
    srcs = ["model_analyzer_test.py"],
    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
    deps = [
        ":model_analyzer",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
+        ":model_analyzer_testlib",
        "//tensorflow/python:client",
        "//tensorflow/python:client_testlib",
        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:variables",
+    ],
+)
+
+py_library(
+    name = "model_analyzer_testlib",
+    srcs = ["model_analyzer_testlib.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":model_analyzer",
+        "//tensorflow/contrib/rnn:rnn_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
        "//tensorflow/python:nn_ops",
        "//tensorflow/python:platform",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:training",
        "//tensorflow/python:variable_scope",
        "//tensorflow/python:variables",
    ],
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py
@ -123,7 +123,7 @@ def print_model_analysis(graph,
  """Print model statistics.

    Prints the model statistics to stdout. Also returns the results
-    in a TFProfNode proto. See go/tfprof or run tfprof tool:
+    in a TFGraphNodeProto proto. See go/tfprof or run tfprof tool:
    'bazel run third_party/tensorflow/tools/tfprof help'

    Examples:
@ -142,15 +142,19 @@ def print_model_analysis(graph,
              'micros' and 'bytes'.
    op_log: tensorflow::tfprof::OpLog proto. users can use this proto to
            group together ops and use a op_type to select the group.
-    tfprof_cmd: string. Either 'scope' or 'graph'. 'scope' view organize
-                ops using their name scopes. 'graph' view organize ops using
-                their graph inputs.
+    tfprof_cmd: string. Either 'scope', 'graph', 'code'.
+                'scope' view organize outputs using ops' name scope.
+                'graph' view organize outputs using op's inputs/outputs.
+                'code' view organize outputs using Python call stack.
    tfprof_options: See 'tfprof help' for details.
  Returns:
-    TFProfNode proto. Side effect: a formatted output to stdout.
+    If tfprof_cmd is 'scope' or 'graph', returns TFGraphNodeProto proto.
+    If tfprof_cmd is 'code', returns TFCodeNodeProto proto.
+    Side effect: a formatted output to stdout.
  """
  # pylint: disable=protected-access
-  op_log = tfprof_logger._merge_default_with_oplog(graph, op_log, run_meta)
+  op_log = tfprof_logger._merge_default_with_oplog(
+      graph, op_log, run_meta, add_trace=tfprof_cmd == 'code')
  # pylint: enable=protected-access
  opts = tfprof_options_pb2.OptionsProto()
  opts.max_depth = tfprof_options['max_depth']
@ -178,11 +182,24 @@ def print_model_analysis(graph,
  opts.dump_to_file = tfprof_options['dump_to_file']

  run_meta_str = run_meta.SerializeToString() if run_meta else b''
-  op_log_str = op_log.SerializeToString() if op_log else b''

-  tfprof_node = tfprof_output_pb2.TFProfNode()
-  tfprof_node.ParseFromString(
-      print_mdl.PrintModelAnalysis(
-          graph.as_graph_def().SerializeToString(), run_meta_str, op_log_str,
-          tfprof_cmd.encode('utf-8'), opts.SerializeToString()))
+  if tfprof_cmd == 'code':
+    tfprof_node = tfprof_output_pb2.TFCodeNodeProto()
+    tfprof_node.ParseFromString(
+        print_mdl.PrintModelAnalysis(
+            graph.as_graph_def().SerializeToString(),
+            run_meta_str,
+            op_log.SerializeToString(),
+            tfprof_cmd.encode('utf-8'),
+            opts.SerializeToString()))
+  else:
+    tfprof_node = tfprof_output_pb2.TFGraphNodeProto()
+    tfprof_node.ParseFromString(
+        print_mdl.PrintModelAnalysis(
+            graph.as_graph_def().SerializeToString(),
+            run_meta_str,
+            op_log.SerializeToString(),
+            tfprof_cmd.encode('utf-8'),
+            opts.SerializeToString()))
+
  return tfprof_node
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py
@ -18,49 +18,27 @@ from __future__ import division
 from __future__ import print_function

 import os
-
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test

 # XXX: this depends on pywrap_tensorflow and must come later
 from tensorflow.contrib.tfprof.python.tools.tfprof import model_analyzer
+from tensorflow.contrib.tfprof.python.tools.tfprof import model_analyzer_testlib as lib


 class PrintModelAnalysisTest(test.TestCase):

-  def _BuildSmallModel(self):
-    image = array_ops.zeros([2, 6, 6, 3])
-    _ = variable_scope.get_variable(
-        'ScalarW', [],
-        dtypes.float32,
-        initializer=init_ops.random_normal_initializer(stddev=0.001))
-    kernel = variable_scope.get_variable(
-        'DW', [3, 3, 3, 6],
-        dtypes.float32,
-        initializer=init_ops.random_normal_initializer(stddev=0.001))
-    x = nn_ops.conv2d(image, kernel, [1, 2, 2, 1], padding='SAME')
-    kernel = variable_scope.get_variable(
-        'DW2', [2, 2, 6, 12],
-        dtypes.float32,
-        initializer=init_ops.random_normal_initializer(stddev=0.001))
-    x = nn_ops.conv2d(x, kernel, [1, 2, 2, 1], padding='SAME')
-    return x
-
  def testDumpToFile(self):
+    ops.reset_default_graph()
    opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS
    opts['dump_to_file'] = os.path.join(test.get_temp_dir(), 'dump')

    with session.Session() as sess, ops.device('/cpu:0'):
-      _ = self._BuildSmallModel()
+      _ = lib.BuildSmallModel()
      model_analyzer.print_model_analysis(sess.graph, tfprof_options=opts)

      with gfile.Open(opts['dump_to_file'], 'r') as f:
@ -71,6 +49,7 @@ class PrintModelAnalysisTest(test.TestCase):
                         f.read())

  def testSelectEverything(self):
+    ops.reset_default_graph()
    opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS
    opts['dump_to_file'] = os.path.join(test.get_temp_dir(), 'dump')
    opts['account_type_regexes'] = ['.*']
@ -78,8 +57,10 @@ class PrintModelAnalysisTest(test.TestCase):
        'bytes', 'params', 'float_ops', 'num_hidden_ops', 'device', 'op_types'
    ]

-    with session.Session() as sess, ops.device('/cpu:0'):
-      x = self._BuildSmallModel()
+    config = config_pb2.ConfigProto(
+        graph_options=config_pb2.GraphOptions(build_cost_model=1))
+    with session.Session(config=config) as sess, ops.device('/cpu:0'):
+      x = lib.BuildSmallModel()

      sess.run(variables.global_variables_initializer())
      run_meta = config_pb2.RunMetadata()
@ -98,6 +79,118 @@ class PrintModelAnalysisTest(test.TestCase):
            f.read())
        # pylint: enable=line-too-long

+  def testSimpleCodeView(self):
+    ops.reset_default_graph()
+    opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
+    opts['dump_to_file'] = os.path.join(test.get_temp_dir(), 'dump')
+    opts['account_type_regexes'] = ['.*']
+    opts['show_name_regexes'] = ['.*model_analyzer_testlib.*']
+    opts['account_displayed_op_only'] = False
+    # TODO(xpan): Test 'micros'. Since the execution time changes each run,
+    # it's a bit difficult to test it now.
+    opts['select'] = [
+        'bytes', 'params', 'float_ops', 'num_hidden_ops', 'device',
+    ]
+
+    config = config_pb2.ConfigProto(
+        graph_options=config_pb2.GraphOptions(build_cost_model=1))
+    with session.Session(config=config) as sess, ops.device('/cpu:0'):
+      x = lib.BuildSmallModel()
+
+      sess.run(variables.global_variables_initializer())
+      run_meta = config_pb2.RunMetadata()
+      _ = sess.run(x,
+                   options=config_pb2.RunOptions(
+                       trace_level=config_pb2.RunOptions.FULL_TRACE),
+                   run_metadata=run_meta)
+
+      model_analyzer.print_model_analysis(
+          sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
+
+      with gfile.Open(opts['dump_to_file'], 'r') as f:
+        # pylint: disable=line-too-long
+        self.assertEqual(
+            '_TFProfRoot (0/451 params, 0/10.44k flops, 0B/5.28KB)\n  model_analyzer_testlib.py:33:BuildSmallModel:image = array_ops... (0/0 params, 0/0 flops, 0B/864B)\n  model_analyzer_testlib.py:37:BuildSmallModel:initializer=init_... (0/1 params, 0/0 flops, 0B/0B)\n  model_analyzer_testlib.py:41:BuildSmallModel:initializer=init_... (0/162 params, 0/0 flops, 0B/1.30KB)\n  model_analyzer_testlib.py:42:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/5.83k flops, 0B/432B)\n  model_analyzer_testlib.py:46:BuildSmallModel:initializer=init_... (0/288 params, 0/0 flops, 0B/2.30KB)\n  model_analyzer_testlib.py:47:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/4.61k flops, 0B/384B)\n',
+            f.read())
+        # pylint: enable=line-too-long
+
+  def testComplexCodeView(self):
+    ops.reset_default_graph()
+    opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
+    opts['dump_to_file'] = os.path.join(test.get_temp_dir(), 'dump')
+    opts['account_type_regexes'] = ['.*']
+    opts['show_name_regexes'] = ['.*model_analyzer_testlib.py.*']
+    opts['account_displayed_op_only'] = False
+    opts['select'] = ['params', 'float_ops']
+
+    config = config_pb2.ConfigProto(
+        graph_options=config_pb2.GraphOptions(build_cost_model=1))
+    with session.Session(config=config) as sess, ops.device('/cpu:0'):
+      x = lib.BuildFullModel()
+
+      sess.run(variables.global_variables_initializer())
+      run_meta = config_pb2.RunMetadata()
+      _ = sess.run(x,
+                   options=config_pb2.RunOptions(
+                       trace_level=config_pb2.RunOptions.FULL_TRACE),
+                   run_metadata=run_meta)
+
+      tfprof_node = model_analyzer.print_model_analysis(
+          sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
+
+      # pylint: disable=line-too-long
+      with gfile.Open(opts['dump_to_file'], 'r') as f:
+        self.assertEqual(
+            '_TFProfRoot (0/2.84k params, 0/54.08k flops)\n  model_analyzer_testlib.py:56:BuildFullModel:seq.append(array_... (0/1.80k params, 0/41.76k flops)\n    model_analyzer_testlib.py:33:BuildSmallModel:image = array_ops... (0/0 params, 0/0 flops)\n    model_analyzer_testlib.py:37:BuildSmallModel:initializer=init_... (0/4 params, 0/0 flops)\n    model_analyzer_testlib.py:41:BuildSmallModel:initializer=init_... (0/648 params, 0/0 flops)\n    model_analyzer_testlib.py:42:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/23.33k flops)\n    model_analyzer_testlib.py:46:BuildSmallModel:initializer=init_... (0/1.15k params, 0/0 flops)\n    model_analyzer_testlib.py:47:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/18.43k flops)\n  model_analyzer_testlib.py:60:BuildFullModel:cell, array_ops.c... (0/1.04k params, 0/4.13k flops)\n  model_analyzer_testlib.py:62:BuildFullModel:target = array_op... (0/0 params, 0/0 flops)\n  model_analyzer_testlib.py:63:BuildFullModel:loss = nn_ops.l2_... (0/0 params, 0/0 flops)\n  model_analyzer_testlib.py:65:BuildFullModel:return sgd_op.min... (0/0 params, 0/8.19k flops)\n',
+            f.read())
+
+      self.assertLess(0, tfprof_node.total_exec_micros)
+      self.assertEqual(2844, tfprof_node.total_parameters)
+      self.assertEqual(54080, tfprof_node.total_float_ops)
+      self.assertEqual(5, len(tfprof_node.children))
+      self.assertEqual('_TFProfRoot', tfprof_node.name)
+      self.assertEqual('model_analyzer_testlib.py:56:BuildFullModel:seq.append(array_...',
+                       tfprof_node.children[0].name)
+      self.assertEqual('model_analyzer_testlib.py:60:BuildFullModel:cell, array_ops.c...',
+                       tfprof_node.children[1].name)
+      self.assertEqual('model_analyzer_testlib.py:62:BuildFullModel:target = array_op...',
+                       tfprof_node.children[2].name)
+      self.assertEqual('model_analyzer_testlib.py:63:BuildFullModel:loss = nn_ops.l2_...',
+                       tfprof_node.children[3].name)
+      self.assertEqual('model_analyzer_testlib.py:65:BuildFullModel:return sgd_op.min...',
+                       tfprof_node.children[4].name)
+      # pylint: enable=line-too-long
+
+  def testCodeViewLeafGraphNode(self):
+    ops.reset_default_graph()
+    opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
+    opts['account_type_regexes'] = ['.*']
+    opts['account_displayed_op_only'] = False
+    opts['select'] = [
+        'bytes', 'params', 'float_ops', 'num_hidden_ops', 'device'
+    ]
+
+    config = config_pb2.ConfigProto(
+        graph_options=config_pb2.GraphOptions(build_cost_model=1))
+    with session.Session(config=config) as sess, ops.device('/cpu:0'):
+      x = lib.BuildSmallModel()
+
+      sess.run(variables.global_variables_initializer())
+      run_meta = config_pb2.RunMetadata()
+      _ = sess.run(x,
+                   options=config_pb2.RunOptions(
+                       trace_level=config_pb2.RunOptions.FULL_TRACE),
+                   run_metadata=run_meta)
+
+      tfprof_node = model_analyzer.print_model_analysis(
+          sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
+
+      leaf = tfprof_node
+      while leaf.children:
+        self.assertEqual(0, len(leaf.graph_nodes))
+        leaf = leaf.children[0]
+      self.assertEqual(1, len(leaf.graph_nodes))
+

 if __name__ == '__main__':
  test.main()
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_testlib.py
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_testlib.py
@ -0,0 +1,67 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A test lib that defines some models."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.rnn.python.ops.core_rnn_cell import BasicRNNCell
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import gradient_descent
+
+
+def BuildSmallModel():
+  """Build a small forward conv model."""
+  image = array_ops.zeros([2, 6, 6, 3])
+  _ = variable_scope.get_variable(
+      'ScalarW', [],
+      dtypes.float32,
+      initializer=init_ops.random_normal_initializer(stddev=0.001))
+  kernel = variable_scope.get_variable(
+      'DW', [3, 3, 3, 6],
+      dtypes.float32,
+      initializer=init_ops.random_normal_initializer(stddev=0.001))
+  x = nn_ops.conv2d(image, kernel, [1, 2, 2, 1], padding='SAME')
+  kernel = variable_scope.get_variable(
+      'DW2', [2, 2, 6, 12],
+      dtypes.float32,
+      initializer=init_ops.random_normal_initializer(stddev=0.001))
+  x = nn_ops.conv2d(x, kernel, [1, 2, 2, 1], padding='SAME')
+  return x
+
+
+def BuildFullModel():
+  """Build the full model with conv,rnn,opt."""
+  seq = []
+  for i in range(4):
+    with variable_scope.variable_scope('inp_%d' % i):
+      seq.append(array_ops.reshape(BuildSmallModel(), [2, 1, -1]))
+
+  cell = BasicRNNCell(16, 48)
+  out = rnn.dynamic_rnn(
+      cell, array_ops.concat(seq, axis=1), dtype=dtypes.float32)[0]
+
+  target = array_ops.ones_like(out)
+  loss = nn_ops.l2_loss(math_ops.reduce_mean(target - out))
+  sgd_op = gradient_descent.GradientDescentOptimizer(1e-2)
+  return sgd_op.minimize(loss)
+
+
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/print_model_analysis_test.py
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/print_model_analysis_test.py
@ -96,12 +96,13 @@ class PrintModelAnalysisTest(test.TestCase):

    with session.Session() as sess, ops.device('/cpu:0'):
      _ = self._BuildSmallModel()
-      tfprof_pb = tfprof_output_pb2.TFProfNode()
+      tfprof_pb = tfprof_output_pb2.TFGraphNodeProto()
      tfprof_pb.ParseFromString(
-          print_mdl.PrintModelAnalysis(sess.graph.as_graph_def(
-          ).SerializeToString(), b'', b'', b'scope', opts.SerializeToString()))
+          print_mdl.PrintModelAnalysis(
+              sess.graph.as_graph_def().SerializeToString(),
+              b'', b'', b'scope', opts.SerializeToString()))

-      expected_pb = tfprof_output_pb2.TFProfNode()
+      expected_pb = tfprof_output_pb2.TFGraphNodeProto()
      text_format.Merge(r"""name: "_TFProfRoot"
      exec_micros: 0
      requested_bytes: 0
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py
@ -62,12 +62,13 @@ def _fill_missing_graph_shape(graph, run_meta):
  return graph


-def _get_logged_ops(graph, run_meta=None):
+def _get_logged_ops(graph, run_meta=None, add_trace=False):
  """Extract trainable model parameters and FLOPs for ops from a Graph.

  Args:
    graph: tf.Graph.
    run_meta: RunMetadata proto used to complete shape information.
+    add_trace: Whether to add op trace information.
  Returns:
    logged_ops: dict mapping from op_name to OpLogEntry.
  """
@ -76,21 +77,32 @@ def _get_logged_ops(graph, run_meta=None):

  op_missing_shape = 0
  logged_ops = {}
-  graph_def = graph.as_graph_def()
-  for node in graph_def.node:
+  for op in graph.get_operations():
    try:
-      stats = ops.get_stats_for_node_def(graph, node, REGISTERED_FLOP_STATS)
+      stats = ops.get_stats_for_node_def(
+          graph, op.node_def, REGISTERED_FLOP_STATS)
    except ValueError:
      # Catch Exception When shape is incomplete. Skip it.
      op_missing_shape += 1
      stats = None

-    if not stats or not stats.value:
-      continue
-    if node.name not in logged_ops:
-      entry = tfprof_log_pb2.OpLogEntry()
-      entry.name = node.name
+    entry = tfprof_log_pb2.OpLogEntry()
+    entry.name = op.name
+    add_entry = False
+    if stats and stats.value:
      entry.float_ops = int(stats.value)
+      add_entry = True
+
+    if add_trace:
+      for tb in op.traceback:
+        trace = entry.code_def.traces.add()
+        trace.file = tb[0] if tb[0] else 'none'
+        trace.lineno = tb[1] if tb[1] else -1
+        trace.function = tb[2] if tb[2] else 'none'
+        trace.line = tb[3] if tb[3] else 'none'
+      add_entry = True
+
+    if add_entry:
      logged_ops[entry.name] = entry

  for v in graph.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES):
@ -108,18 +120,21 @@ def _get_logged_ops(graph, run_meta=None):
  return logged_ops


-def _merge_default_with_oplog(graph, op_log=None, run_meta=None):
+def _merge_default_with_oplog(graph, op_log=None,
+                              run_meta=None,
+                              add_trace=False):
  """Merge the tfprof default extra info with caller's op_log.

  Args:
    graph: tf.Graph.
    op_log: OpLog proto.
    run_meta: RunMetadata proto used to complete shape information.
+    add_trace: Whether to add op trace information.
  Returns:
    tmp_op_log: Merged OpLog proto.
  """
  tmp_op_log = tfprof_log_pb2.OpLog()
-  logged_ops = _get_logged_ops(graph, run_meta)
+  logged_ops = _get_logged_ops(graph, run_meta, add_trace=add_trace)
  if not op_log:
    tmp_op_log.log_entries.extend(logged_ops.values())
  else:
@ -131,13 +146,16 @@ def _merge_default_with_oplog(graph, op_log=None, run_meta=None):
        all_ops[op_name].types.extend(entry.types)
        if entry.float_ops > 0 and all_ops[op_name].float_ops == 0:
          all_ops[op_name].float_ops = entry.float_ops
+        if entry.code_def.traces and not all_ops[op_name].code_def.traces:
+          all_ops[op_name].code_def.MergeFrom(entry.code_def)
      else:
        all_ops[op_name] = entry
    tmp_op_log.log_entries.extend(all_ops.values())
  return tmp_op_log


-def write_op_log(graph, log_dir, op_log=None, run_meta=None):
+def write_op_log(graph, log_dir, op_log=None, run_meta=None,
+                 add_trace=False):
  """Log provided 'op_log', and add additional model information below.

    The API also assigns ops in tf.trainable_variables() an op type called
@ -154,8 +172,9 @@ def write_op_log(graph, log_dir, op_log=None, run_meta=None):
        one is created.
    run_meta: (Optional) RunMetadata proto that helps flops computation using
        run time shape information.
+    add_trace: Whether to add op trace information. Used to support "code" view.
  """
-  op_log = _merge_default_with_oplog(graph, op_log, run_meta)
+  op_log = _merge_default_with_oplog(graph, op_log, run_meta, add_trace)

  with gfile.Open(os.path.join(log_dir, 'tfprof_log'), 'w') as log:
    log.write(op_log.SerializeToString())
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@ -156,6 +156,7 @@ CORE_PROTO_SRCS = [
    "protobuf/config.proto",
    "protobuf/cluster.proto",
    "protobuf/debug.proto",
+    "protobuf/device_properties.proto",
    "protobuf/queue_runner.proto",
    "protobuf/rewriter_config.proto",
    "protobuf/tensor_bundle.proto",
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@ -829,7 +829,8 @@ static bool ValidateInlining(const Node* node, const FunctionBody* fbody) {
 // Given a "caller" in "graph", which is a function call of a function
 // to "fbody". Replaces the "caller" with fbody->graph and connects
 // edges properly.
-static void InlineFunctionBody(Graph* g, Node* caller,
+static void InlineFunctionBody(const FunctionLibraryDefinition& flib_def,
+                               Graph* g, Node* caller,
                               const FunctionBody* fbody) {
  if (!ValidateInlining(caller, fbody)) {
    LOG(WARNING) << "Inlining mismatch: " << caller->DebugString() << " vs. "
@ -837,6 +838,23 @@ static void InlineFunctionBody(Graph* g, Node* caller,
    return;
  }

+  // Input edges. For data edges coming into "caller", we first compute the
+  // <src>:<src_output> for the i-th input in "inputs".
+  // If "caller" has any input control dependencies, we add a NoOp
+  // node "input_control_node", which depends on "caller"'s control inputs.
+  std::vector<Endpoint> inputs(caller->num_inputs());
+  Node* input_control_node = nullptr;
+  for (const Edge* e : caller->in_edges()) {
+    if (e->IsControlEdge()) {
+      if (input_control_node == nullptr) {
+        input_control_node = AddNoOp(g);
+      }
+      g->AddControlEdge(e->src(), input_control_node);
+    } else {
+      inputs[e->dst_input()] = {e->src(), e->src_output()};
+    }
+  }
+
  // Duplicate fbody->graph into 'g'.  First, we copy the nodes of
  // fbody->graph into 'g' except the source and sink nodes.  We copy
  // edges among nodes in 'fbody->graph'.
@ -850,8 +868,35 @@ static void InlineFunctionBody(Graph* g, Node* caller,
    CHECK(n->IsOp());
    NodeDef ndef = n->def();
    ndef.set_name(strings::StrCat(caller->name(), "/", ndef.name()));
-    node_map[n->id()] = g->AddNode(ndef, &s);
+    Node* clone = g->AddNode(ndef, &s);
    TF_CHECK_OK(s);
+    node_map[n->id()] = clone;
+
+    // If there is an input control node, and one of:
+    // a) the node has no data or control inputs, or
+    // b) the node is a function call or SymbolicGradient,
+    // then add a control edge from the input control node to the clone.
+    //
+    // We must not execute any nodes if the original function call would not
+    // have executed. This is especially critical when the function call is
+    // inside a control-flow construct like tf.cond(). Case (a) ensures that
+    // such nodes do not run.
+    //
+    // The purpose of case (b) is to ensure that instances of case (a) created
+    // by further inlining steps also receive the control dependency.
+    if (input_control_node) {
+      bool has_inputs = false;
+      for (const Edge* e : n->in_edges()) {
+        if (!e->src()->IsSource()) {
+          has_inputs = true;
+          break;
+        }
+      }
+      if (!has_inputs || flib_def.Find(clone->type_string()) != nullptr ||
+          clone->type_string() == "SymbolicGradient") {
+        g->AddControlEdge(input_control_node, clone);
+      }
+    }
  }
  for (const Edge* e : fbody->graph->edges()) {
    if (e->src()->IsSource() || e->src()->IsSink() || e->dst()->IsSource() ||
@ -865,29 +910,12 @@ static void InlineFunctionBody(Graph* g, Node* caller,

  // Connect input edges.
  //
-  // For data edges coming into "caller", we first compute the
-  // <src>:<src_output> for the i-th input in "inputs". We create one
-  // Identity node for each input. Then, we connect inputs[i] to to
-  // the i-th identity node added. The nodes that previously connects
-  // to the j-th output of i-th arg node are reconnected to th i-th
+  // We create one Identity node for each input. Then, we connect inputs[i] to
+  // the i-th identity node added. The nodes that previously connected
+  // to the j-th output of i-th arg node are reconnected to the i-th
  // identity node.
  //
-  // If "caller" has any input control dependencies, we add a NoOp
-  // node "input_control_node". This "input_control_node" depends on
-  // what "caller" depends on, and the added identity nodes depend on
-  // "input_control_node".
-  std::vector<Endpoint> inputs(caller->num_inputs());
-  Node* input_control_node = nullptr;
-  for (const Edge* e : caller->in_edges()) {
-    if (e->IsControlEdge()) {
-      if (input_control_node == nullptr) {
-        input_control_node = AddNoOp(g);
-      }
-      g->AddControlEdge(e->src(), input_control_node);
-    } else {
-      inputs[e->dst_input()] = {e->src(), e->src_output()};
-    }
-  }
+  // The added identity nodes depend on "input_control_node".
  for (std::size_t i = 0; i < fbody->arg_nodes.size(); ++i) {
    Node* arg = node_map[fbody->arg_nodes[i]->id()];
    Node* n = AddIdentity(g, inputs[i]);
@ -982,7 +1010,7 @@ bool ExpandInlineFunctions(FunctionLibraryRuntime* lib, Graph* graph) {
    candidates.push_back({node, fbody});
  }
  for (const auto& p : candidates) {
-    InlineFunctionBody(graph, p.first, p.second);
+    InlineFunctionBody(*fld, graph, p.first, p.second);
  }
  return !candidates.empty();
 }
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@ -391,6 +391,90 @@ TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctions) {
  }
 }

+// Verifies that control dependencies on the caller are added as control
+// dependencies on any function calls created by inlining.
+TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctionsWithControlDeps) {
+  Init({test::function::XTimesTwo(), test::function::XTimesFour()});
+
+  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  {
+    Scope s = Scope::NewRootScope();
+    TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
+    auto a = ops::_Arg(s.WithOpName("a"), DT_FLOAT, 0);
+    auto c = ops::NoOp(s.WithOpName("c"));
+    auto b = Call(&s, "b", "XTimesFour", {a});
+    s.graph()->AddControlEdge(c.operation.node(), b.node());
+    auto ret = ops::_Retval(s.WithOpName("b_RetVal"), b, 0);
+    TF_ASSERT_OK(s.ToGraph(g.get()));
+  }
+
+  ExpandInlineFunctions(lib_.get(), g.get());
+  {
+    Scope s = Scope::NewRootScope();
+    TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
+    auto a = ops::_Arg(s.WithOpName("a"), DT_FLOAT, 0);
+    auto c = ops::NoOp(s.WithOpName("c"));
+    auto func0 =
+        ops::NoOp(s.WithOpName("Func/_0").WithControlDependencies({c}));
+    auto func1 = ops::Identity(
+        s.WithOpName("Func/_1").WithControlDependencies({func0}), a);
+    auto b_x2 = Call(&s, "b/x2", "XTimesTwo", {func1});
+    s.graph()->AddControlEdge(func0.operation.node(), b_x2.node());
+    auto b_y = Call(&s, "b/y", "XTimesTwo", {b_x2});
+    s.graph()->AddControlEdge(func0.operation.node(), b_y.node());
+    auto func2 = ops::Identity(s.WithOpName("Func/_2"), b_y);
+    auto ret = ops::_Retval(s.WithOpName("b_RetVal"), func2, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+
+  ExpandInlineFunctions(lib_.get(), g.get());
+  {
+    Scope s = Scope::NewRootScope();
+    TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
+    auto a = ops::_Arg(s.WithOpName("a"), DT_FLOAT, 0);
+    auto c = ops::NoOp(s.WithOpName("c"));
+    auto func0 =
+        ops::NoOp(s.WithOpName("Func/_0").WithControlDependencies({c}));
+    auto func1 = ops::Identity(
+        s.WithOpName("Func/_1").WithControlDependencies({func0}), a);
+
+    auto func3 =
+        ops::NoOp(s.WithOpName("Func/_3").WithControlDependencies({func0}));
+    auto func4 = ops::Identity(
+        s.WithOpName("Func/_4").WithControlDependencies({func3}), func1);
+    auto b_x2_two = ops::Const(
+        s.WithOpName("b/x2/two").WithControlDependencies({func3}), 2LL);
+    auto b_x2_scale = ops::Cast(s.WithOpName("b/x2/scale"), b_x2_two, DT_FLOAT);
+    auto b_x2_y = ops::Mul(s.WithOpName("b/x2/y"), func4, b_x2_scale);
+    auto func5 = ops::Identity(s.WithOpName("Func/_5"), b_x2_y);
+
+    auto func6 =
+        ops::NoOp(s.WithOpName("Func/_6").WithControlDependencies({func0}));
+    auto func7 = ops::Identity(
+        s.WithOpName("Func/_7").WithControlDependencies({func6}), func5);
+    auto b_y_two = ops::Const(
+        s.WithOpName("b/y/two").WithControlDependencies({func6}), 2LL);
+    auto b_y_scale = ops::Cast(s.WithOpName("b/y/scale"), b_y_two, DT_FLOAT);
+    auto b_y_y = ops::Mul(s.WithOpName("b/y/y"), func7, b_y_scale);
+    auto func8 = ops::Identity(s.WithOpName("Func/_8"), b_y_y);
+
+    auto func2 = ops::Identity(s.WithOpName("Func/_2"), func8);
+    auto ret = ops::_Retval(s.WithOpName("b_RetVal"), func2, 0);
+
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+}
+
 TEST_F(FunctionLibraryRuntimeTest, OptimizeGraph) {
  Init({test::function::XTimesTwo(), test::function::XTimesFour(),
        test::function::XTimes16()});
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@ -567,15 +567,14 @@ int64 MinSystemMemory(int64 available_memory) {
  // We use the following heuristic for now:
  //
  // If the available_memory is < 2GiB, we allocate 200MiB to system memory.
-  // Otherwise, allocate 300MiB to system memory.
+  // Otherwise, allocate max(300MiB, 0.05 * available_memory) to system memory.
  //
-  // In the future we could be more sophisticated by using a table of
-  // devices.
+  // In the future we could be more sophisticated by using a table of devices.
  if (available_memory < (1LL << 31)) {
    // 200MiB
    return 209715200LL;
  } else {
-    // max(300 MiB, 0.95 * available_memory)
+    // max(300 MiB, 0.05 * available_memory)
    return std::max(314572800LL, static_cast<int64>(available_memory * 0.05));
  }
 }
--- a/tensorflow/core/grappler/clusters/BUILD
+++ b/tensorflow/core/grappler/clusters/BUILD
@ -60,6 +60,7 @@ cc_library(
        "//tensorflow/core:lib",
        "//tensorflow/core:protos_all_cc",
        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/costs:utils",
        "//tensorflow/core/kernels:ops_util",
    ],
 )
--- a/tensorflow/core/grappler/clusters/cluster.cc
+++ b/tensorflow/core/grappler/clusters/cluster.cc
@ -56,5 +56,15 @@ void Cluster::DisableDetailedStats(bool disable) {
  }
 }

+const std::vector<string> Cluster::GetDeviceNames() const {
+  std::vector<string> device_names;
+  device_names.reserve(devices_.size());
+  for (const auto& device : devices_) {
+    device_names.push_back(device.first);
+  }
+  std::sort(device_names.begin(), device_names.end());
+  return device_names;
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
--- a/tensorflow/core/grappler/clusters/cluster.h
+++ b/tensorflow/core/grappler/clusters/cluster.h
@ -17,13 +17,14 @@ limitations under the License.
 #define TENSORFLOW_GRAPPLER_CLUSTERS_CLUSTER_H_

 #include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>

-#include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"
 #include "tensorflow/core/public/session_options.h"

 namespace tensorflow {
@ -62,18 +63,14 @@ class Cluster {

  // Return the list of TensorFlow devices that are available to execute a
  // graph. This is empty until provision() is called.
-  const std::vector<DeviceAttributes>& GetDevices() const { return devices_; }
-
-  // Convenience method that returns the set of device names.
-  const std::vector<string> GetDeviceNames() const {
-    std::vector<string> device_names;
-    device_names.reserve(devices_.size());
-    for (const auto& device : devices_) {
-      device_names.push_back(device.name());
-    }
-    return device_names;
+  const std::unordered_map<string, DeviceProperties>& GetDevices() const {
+    return devices_;
  }

+  // Convenience method that returns the set of device names. These names are
+  // sorted alphabetically.
+  const std::vector<string> GetDeviceNames() const;
+
  // Prepare the session to run the specified grappler item. This include
  // initializing all the model variables.
  virtual Status Initialize(const GrapplerItem& item) = 0;
@ -85,7 +82,7 @@ class Cluster {
                     RunMetadata* metadata) = 0;

 protected:
-  std::vector<DeviceAttributes> devices_;
+  std::unordered_map<string, DeviceProperties> devices_;
  const int timeout_s_;
  SessionOptions options_;
  RunOptions run_options_;
--- a/tensorflow/core/grappler/clusters/single_machine.cc
+++ b/tensorflow/core/grappler/clusters/single_machine.cc
@ -19,6 +19,7 @@ limitations under the License.

 #include "tensorflow/cc/training/queue_runner.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/grappler/costs/utils.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
@ -66,16 +67,12 @@ Status SingleMachine::Provision() {
    return status;
  }

-  DeviceAttributes attr;
-  attr.set_name("/job:localhost/replica:0/task:0/cpu:0");
-  attr.set_device_type("CPU");
-  devices_.push_back(attr);
+  DeviceProperties attr = GetLocalCPUInfo();
+  devices_["/job:localhost/replica:0/task:0/cpu:0"] = GetLocalCPUInfo();

  for (int i = 0; i < num_gpus_; ++i) {
-    DeviceAttributes attr;
-    attr.set_name(strings::StrCat("/job:localhost/replica:0/task:0/gpu:", i));
-    attr.set_device_type("GPU");
-    devices_.push_back(attr);
+    devices_[strings::StrCat("/job:localhost/replica:0/task:0/gpu:", i)] =
+        GetLocalGPUInfo(i);
  }
  return Status::OK();
 }
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@ -25,7 +25,9 @@ tf_proto_library(
    name = "op_performance_data",
    srcs = ["op_performance_data.proto"],
    cc_api_version = 2,
-    protodeps = ["//tensorflow/core:protos_all"],
+    protodeps = [
+        "//tensorflow/core:protos_all",
+    ],
    visibility = ["//visibility:public"],
 )

@ -141,10 +143,10 @@ cc_library(
    hdrs = ["virtual_placer.h"],
    visibility = ["//visibility:public"],
    deps = [
-        ":op_performance_data_cc",
        ":utils",
        "//tensorflow/core:framework",
        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib",
        "//tensorflow/core:protos_all_cc",
        "//tensorflow/core/grappler:devices",
        "//tensorflow/core/grappler/clusters:cluster",
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
@ -73,7 +73,7 @@ Status AnalyticalCostEstimator::PredictCosts(const GraphDef& optimized_graph,
    std::vector<OpInfo::TensorProperties> inputs =
        properties.GetInputProperties(node->name());

-    OpInfo::DeviceProperties device = placer.get_device(*node);
+    DeviceProperties device = placer.get_device(*node);
    OpInfo op_info;
    op_info.set_op(node->op());
    *op_info.mutable_attr() = node->attr();
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@ -69,7 +69,7 @@ Costs OpLevelCostEstimator::PredictCosts(const OpInfo& op_features) const {
 }

 std::pair<double, double> OpLevelCostEstimator::GetDeviceInfo(
-    const OpInfo::DeviceProperties& device) const {
+    const DeviceProperties& device) const {
  double gflops = -1;
  double bandwidth = -1;
  if (device.bandwidth() > 0) {
@ -77,7 +77,7 @@ std::pair<double, double> OpLevelCostEstimator::GetDeviceInfo(
  }

  if (device.type() == "CPU") {
-    const OpInfo::DeviceProperties local_cpu = GetLocalCPUInfo();
+    const DeviceProperties local_cpu = GetLocalCPUInfo();
    // Check if vector instructions are available, and refine performance
    // prediction based on this.
    // Frequencies are stored in MHz in the DeviceProperties.
@ -90,7 +90,7 @@ std::pair<double, double> OpLevelCostEstimator::GetDeviceInfo(
      }
    }
  } else if (device.type() == "GPU") {
-    const OpInfo::DeviceProperties local_gpu = GetLocalGPUInfo(0);
+    const DeviceProperties local_gpu = GetLocalGPUInfo(0);
    const string architecture = local_gpu.environment().at("architecture");
    int cores_per_multiprocessor;
    if (architecture < "3") {
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@ -40,7 +40,7 @@ class OpLevelCostEstimator {
  // executed per second) and memory bandwith (in GigaBytes/second) for the
  // specified device.
  virtual std::pair<double, double> GetDeviceInfo(
-      const OpInfo::DeviceProperties& device) const;
+      const DeviceProperties& device) const;

  // For operations for which we haven't yet built estimates, returns a dummy
  // value based on input size.
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"

 namespace tensorflow {
 namespace grappler {
--- a/tensorflow/core/grappler/costs/op_performance_data.proto
+++ b/tensorflow/core/grappler/costs/op_performance_data.proto
@ -22,6 +22,7 @@ import "tensorflow/core/framework/tensor.proto";
 import "tensorflow/core/framework/tensor_shape.proto";
 import "tensorflow/core/framework/types.proto";
 import "tensorflow/core/framework/attr_value.proto";
+import "tensorflow/core/protobuf/device_properties.proto";

 // Description of an operation as well as the parameters expected to impact its
 // performance.
@ -41,36 +42,6 @@ message OpInfo {
  repeated TensorProperties inputs = 3;

  // Device on which the operation is run.
-  message DeviceProperties {
-    // Device type (CPU, GPU, ...)
-    string type = 1;
-    // Vendor (Intel, nvidia, ...)
-    string vendor = 2;
-    // Model (Haswell, K40, ...)
-    string model = 3;
-    // Core Frequency in Mhz
-    int64 frequency = 4;
-    // Number of cores
-    int64 num_cores = 5;
-    // Version of the tools and libraries used with this device (e.g. gcc 4.9,
-    // cudnn 5.1)
-    map<string, string> environment = 6;
-    // Number of registers per core.
-    int64 num_registers = 7;
-    // L1 cache size in bytes
-    int64 l1_cache_size = 8;
-    // L2 cache size in bytes
-    int64 l2_cache_size = 9;
-    // L3 cache size in bytes
-    int64 l3_cache_size = 10;
-    // Shared memory size per multiprocessor in bytes. This field is
-    // applicable to GPUs only.
-    int64 shared_memory_size_per_multiprocessor = 11;
-    // Memory size in bytes
-    int64 memory_size = 12;
-    // Memory bandwidth in KB/s
-    int64 bandwidth = 13;
-  }
  DeviceProperties device = 4;
 }

--- a/tensorflow/core/grappler/costs/utils.cc
+++ b/tensorflow/core/grappler/costs/utils.cc
@ -125,7 +125,7 @@ std::vector<OpInfo::TensorProperties> FindInputFeatures(
  return inputs;
 }

-OpInfo::DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
+DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
  DeviceNameUtils::ParsedName parsed;
  if (DeviceNameUtils::ParseFullName(node.device(), &parsed)) {
    if (parsed.type == "GPU") {
@ -134,13 +134,13 @@ OpInfo::DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
      return GetLocalCPUInfo();
    }
  }
-  OpInfo::DeviceProperties device;
+  DeviceProperties device;
  device.set_type("UNKNOWN");
  return device;
 }

-OpInfo::DeviceProperties GetLocalCPUInfo() {
-  OpInfo::DeviceProperties device;
+DeviceProperties GetLocalCPUInfo() {
+  DeviceProperties device;
  device.set_type("CPU");

  device.set_vendor(port::CPUVendorIDString());
@ -165,8 +165,8 @@ OpInfo::DeviceProperties GetLocalCPUInfo() {
  return device;
 }

-OpInfo::DeviceProperties GetLocalGPUInfo(int gpu_id) {
-  OpInfo::DeviceProperties device;
+DeviceProperties GetLocalGPUInfo(int gpu_id) {
+  DeviceProperties device;
  device.set_type("GPU");

 #if GOOGLE_CUDA
--- a/tensorflow/core/grappler/costs/utils.h
+++ b/tensorflow/core/grappler/costs/utils.h
@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/graph/types.h"
 #include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"

 namespace tensorflow {
 namespace grappler {
@ -40,14 +41,14 @@ std::vector<OpInfo::TensorProperties> FindInputFeatures(
    const std::unordered_map<string, const NodeDef*>& name_to_node);

 // Returns the DeviceProperties of the device on which 'node' runs.
-OpInfo::DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node);
+DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node);

 // Returns the DeviceProperties of the CPU on which grappler is running.
-OpInfo::DeviceProperties GetLocalCPUInfo();
+DeviceProperties GetLocalCPUInfo();

 // Returns the DeviceProperties for the specified GPU attached to the server on
 // which grappler is running.
-OpInfo::DeviceProperties GetLocalGPUInfo(int gpu_id);
+DeviceProperties GetLocalGPUInfo(int gpu_id);

 }  // end namespace grappler
 }  // end namespace tensorflow
--- a/tensorflow/core/grappler/costs/virtual_placer.cc
+++ b/tensorflow/core/grappler/costs/virtual_placer.cc
@ -18,35 +18,48 @@ limitations under the License.
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/costs/utils.h"
 #include "tensorflow/core/grappler/devices.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/util/device_name_utils.h"

 namespace tensorflow {
 namespace grappler {

 VirtualPlacer::VirtualPlacer(Cluster* cluster) : has_gpu_(false) {
-  devices_["CPU"] = GetLocalCPUInfo();
-  if (GetNumAvailableGPUs() > 0) {
-    has_gpu_ = true;
-    devices_["GPU"] = GetLocalGPUInfo(0);
+  devices_ = cluster->GetDevices();
+  for (const auto& device : cluster->GetDevices()) {
+    if (str_util::Lowercase(device.first).find("gpu") != string::npos) {
+      has_gpu_ = true;
+    }
  }
+
  unknown_device_.set_type("UNKNOWN");
 }

-const OpInfo::DeviceProperties& VirtualPlacer::get_device(
-    const NodeDef& node) const {
-  string device_type;
+const DeviceProperties& VirtualPlacer::get_device(const NodeDef& node) const {
  DeviceNameUtils::ParsedName parsed;
-  if (!node.device().empty() &&
-      DeviceNameUtils::ParseFullName(node.device(), &parsed)) {
-    device_type = parsed.type;
-  } else {
-    if (has_gpu_) {
-      device_type = "GPU";
-    } else {
-      device_type = "CPU";
+  if (!node.device().empty()) {
+    auto it = devices_.find(node.device());
+    if (it != devices_.end()) {
+      return it->second;
    }
+    if (DeviceNameUtils::ParseFullName(node.device(), &parsed)) {
+      string device_name =
+          strings::StrCat("/job:localhost/replica:0/task:0/",
+                          str_util::Lowercase(parsed.type), ":", parsed.id);
+      it = devices_.find(device_name);
+      if (it != devices_.end()) {
+        return it->second;
+      }
+    }
+    return unknown_device_;
  }
-  auto it = devices_.find(device_type);
+  string device;
+  if (has_gpu_) {
+    device = "/job:localhost/replica:0/task:0/gpu:0";
+  } else {
+    device = "/job:localhost/replica:0/task:0/cpu:0";
+  }
+  auto it = devices_.find(device);
  if (it == devices_.end()) {
    return unknown_device_;
  }
--- a/tensorflow/core/grappler/costs/virtual_placer.h
+++ b/tensorflow/core/grappler/costs/virtual_placer.h
@ -17,8 +17,8 @@ limitations under the License.
 #define TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_PLACER_H_

 #include <unordered_map>
-#include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"

 namespace tensorflow {
 class NodeDef;
@ -31,12 +31,12 @@ class VirtualPlacer {
 public:
  VirtualPlacer(Cluster* cluster);

-  const OpInfo::DeviceProperties& get_device(const NodeDef& node) const;
+  const DeviceProperties& get_device(const NodeDef& node) const;

 private:
-  std::unordered_map<string, OpInfo::DeviceProperties> devices_;
+  std::unordered_map<string, DeviceProperties> devices_;
  bool has_gpu_;
-  OpInfo::DeviceProperties unknown_device_;
+  DeviceProperties unknown_device_;
 };

 }  // namespace grappler
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@ -38,9 +38,9 @@ using Eigen::GpuDevice;
 // in NHWC format.
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
          int kKnownDepthMultiplier>
-__global__ void DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args,
-                                             const T* input, const T* filter,
-                                             T* output, int num_outputs) {
+__global__ void __launch_bounds__(1024, 2)
+    DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args, const T* input,
+                                 const T* filter, T* output, int num_outputs) {
  const int in_rows = args.in_rows;
  const int in_cols = args.in_cols;
  const int in_depth = args.in_depth;
@ -120,9 +120,9 @@ __global__ void DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args,
 // in NCHW format.
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
          int kKnownDepthMultiplier>
-__global__ void DepthwiseConv2dGPUKernelNCHW(const DepthwiseArgs args,
-                                             const T* input, const T* filter,
-                                             T* output, int num_outputs) {
+__global__ void __launch_bounds__(1024, 2)
+    DepthwiseConv2dGPUKernelNCHW(const DepthwiseArgs args, const T* input,
+                                 const T* filter, T* output, int num_outputs) {
  const int in_rows = args.in_rows;
  const int in_cols = args.in_cols;
  const int in_depth = args.in_depth;
@ -250,17 +250,34 @@ void LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args,
                              TensorFormat data_format) {
  const int num_outputs =
      args.batch * args.out_rows * args.out_cols * args.out_depth;
-  CudaLaunchConfig config = GetCudaLaunchConfig(num_outputs, d);
+  // The compile-time constant version runs faster with a single block.
+  const int max_block_count = kKnownFilterWidth < 0 || kKnownFilterHeight < 0 ||
+                                      kKnownDepthMultiplier < 0 ||
+                                      args.out_rows * args.out_cols <= 256
+                                  ? std::numeric_limits<int>::max()
+                                  : d.getNumCudaMultiProcessors();
  if (data_format == FORMAT_NHWC) {
+    CudaLaunchConfig config = GetCudaLaunchConfig(
+        num_outputs, d,
+        DepthwiseConv2dGPUKernelNHWC<T, kKnownFilterWidth, kKnownFilterHeight,
+                                     kKnownDepthMultiplier>,
+        0);
    DepthwiseConv2dGPUKernelNHWC<T, kKnownFilterWidth, kKnownFilterHeight,
                                 kKnownDepthMultiplier>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            args, input, filter, output, num_outputs);
+        <<<std::min(max_block_count, config.block_count),
+           config.thread_per_block, 0, d.stream()>>>(args, input, filter,
+                                                     output, num_outputs);
  } else if (data_format == FORMAT_NCHW) {
+    CudaLaunchConfig config = GetCudaLaunchConfig(
+        num_outputs, d,
+        DepthwiseConv2dGPUKernelNCHW<T, kKnownFilterWidth, kKnownFilterHeight,
+                                     kKnownDepthMultiplier>,
+        0);
    DepthwiseConv2dGPUKernelNCHW<T, kKnownFilterWidth, kKnownFilterHeight,
                                 kKnownDepthMultiplier>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            args, input, filter, output, num_outputs);
+        <<<std::min(max_block_count, config.block_count),
+           config.thread_per_block, 0, d.stream()>>>(args, input, filter,
+                                                     output, num_outputs);
  } else {
    assert(false);
  }
@ -288,9 +305,11 @@ template struct DepthwiseConv2dGPULaunch<double>;
 // A Cuda kernel to compute the depthwise convolution backprop w.r.t. input.
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
          int kKnownDepthMultiplier>
-__global__ void DepthwiseConv2dBackpropInputGPUKernelNHWC(
-    const DepthwiseArgs args, const T* out_backprop, const T* filter,
-    T* in_backprop, int num_in_backprop) {
+__global__ void __launch_bounds__(640, 2)
+    DepthwiseConv2dBackpropInputGPUKernelNHWC(const DepthwiseArgs args,
+                                              const T* out_backprop,
+                                              const T* filter, T* in_backprop,
+                                              int num_in_backprop) {
  const int in_rows = args.in_rows;
  const int in_cols = args.in_cols;
  const int in_depth = args.in_depth;
@ -350,7 +369,7 @@ __global__ void DepthwiseConv2dBackpropInputGPUKernelNHWC(

 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
          int kKnownDepthMultiplier>
-__global__ void __launch_bounds__(1024)
+__global__ void __launch_bounds__(640, 2)
    DepthwiseConv2dBackpropInputGPUKernelNCHW(const DepthwiseArgs args,
                                              const T* out_backprop,
                                              const T* filter, T* in_backprop,
@ -428,17 +447,22 @@ void LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& d,
                                           TensorFormat data_format) {
  const int num_in_backprop =
      args.batch * args.in_rows * args.in_cols * args.in_depth;
-  CudaLaunchConfig config = GetCudaLaunchConfig(num_in_backprop, d);
-  // Increase block count for when there are more warps/SM than threads/SM.
-  // TODO(csigg): this is pretty arbitraty and should be generalized using
-  // cudaOccupancyMaxPotentialBlockSize().
-  config.block_count *= 4;
  if (data_format == FORMAT_NHWC) {
+    CudaLaunchConfig config = GetCudaLaunchConfig(
+        num_in_backprop, d,
+        DepthwiseConv2dBackpropInputGPUKernelNHWC<
+            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
+        0);
    DepthwiseConv2dBackpropInputGPUKernelNHWC<
        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
            args, out_backprop, filter, in_backprop, num_in_backprop);
  } else if (data_format == FORMAT_NCHW) {
+    CudaLaunchConfig config = GetCudaLaunchConfig(
+        num_in_backprop, d,
+        DepthwiseConv2dBackpropInputGPUKernelNCHW<
+            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
+        0);
    DepthwiseConv2dBackpropInputGPUKernelNCHW<
        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
@ -475,9 +499,12 @@ template struct DepthwiseConv2dBackpropInputGPULaunch<double>;
 // A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
          int kKnownDepthMultiplier>
-__global__ void DepthwiseConv2dBackpropFilterGPUKernelNHWC(
-    const DepthwiseArgs args, const T* out_backprop, const T* input,
-    T* filter_backprop, int num_out_backprop) {
+__global__ void __launch_bounds__(640, 2)
+    DepthwiseConv2dBackpropFilterGPUKernelNHWC(const DepthwiseArgs args,
+                                               const T* out_backprop,
+                                               const T* input,
+                                               T* filter_backprop,
+                                               int num_out_backprop) {
  const int in_rows = args.in_rows;
  const int in_cols = args.in_cols;
  const int in_depth = args.in_depth;
@ -566,9 +593,12 @@ __global__ void DepthwiseConv2dBackpropFilterGPUKernelNHWC(
 // A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
          int kKnownDepthMultiplier>
-__global__ void DepthwiseConv2dBackpropFilterGPUKernelNCHW(
-    const DepthwiseArgs args, const T* out_backprop, const T* input,
-    T* filter_backprop, int num_out_backprop) {
+__global__ void __launch_bounds__(640, 2)
+    DepthwiseConv2dBackpropFilterGPUKernelNCHW(const DepthwiseArgs args,
+                                               const T* out_backprop,
+                                               const T* input,
+                                               T* filter_backprop,
+                                               int num_out_backprop) {
  const int in_rows = args.in_rows;
  const int in_cols = args.in_cols;
  const int in_depth = args.in_depth;
@ -669,13 +699,22 @@ void LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& d,
                                            TensorFormat data_format) {
  const int num_out_backprop =
      args.batch * args.out_rows * args.out_cols * args.out_depth;
-  CudaLaunchConfig config = GetCudaLaunchConfig(num_out_backprop, d);
  if (data_format == FORMAT_NHWC) {
+    CudaLaunchConfig config = GetCudaLaunchConfig(
+        num_out_backprop, d,
+        DepthwiseConv2dBackpropFilterGPUKernelNHWC<
+            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
+        0);
    DepthwiseConv2dBackpropFilterGPUKernelNHWC<
        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
            args, out_backprop, input, filter_backprop, num_out_backprop);
  } else if (data_format == FORMAT_NCHW) {
+    CudaLaunchConfig config = GetCudaLaunchConfig(
+        num_out_backprop, d,
+        DepthwiseConv2dBackpropFilterGPUKernelNCHW<
+            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
+        0);
    DepthwiseConv2dBackpropFilterGPUKernelNCHW<
        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
--- a/tensorflow/core/kernels/split_v_op.cc
+++ b/tensorflow/core/kernels/split_v_op.cc
@ -87,6 +87,12 @@ class SplitVOpBase : public OpKernel {
    // Special case 1: num_split == 1. Nothing to do.
    if (num_split == 1) {
      context->set_output(0, context->input(0));
+      OP_REQUIRES(
+          context, (*split_sizes_vec)[0] == input_size_split_dim,
+          errors::InvalidArgument("If there is only one output, it must have "
+                                  "the same size as the input. Input size: ",
+                                  input_size_split_dim,
+                                  " output size: ", (*split_sizes_vec)[0]));
      *done = true;
      return;
    }
--- a/tensorflow/core/kernels/variable_ops.h
+++ b/tensorflow/core/kernels/variable_ops.h
@ -127,6 +127,16 @@ class TemporaryVariableOp : public OpKernel {
    OP_REQUIRES_OK(context, rm->Create(context->step_container()->name(),
                                       var_name_, tmp_var));
    context->set_output_ref(0, &tmp_var->mu, &tmp_var->val);
+    if (context->track_allocations()) {
+      AllocatorAttributes attr;
+      if (context->allocate_on_host(attr)) {
+        context->record_host_persistent_memory_allocation(
+            tmp_var->val.AllocatedBytes());
+      } else {
+        context->record_device_persistent_memory_allocation(
+            tmp_var->val.AllocatedBytes());
+      }
+    }
  }

 private:
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@ -518,7 +518,17 @@ REGISTER_OP("SplitV")
      } else if (rank == 0) {
        // Throw error if input is a scalar.
        return errors::InvalidArgument("Can't split scalars");
-      } else if (size_splits == nullptr || !c->ValueKnown(split_dimension)) {
+      } else if (size_splits == nullptr && c->ValueKnown(split_dimension)) {
+        // If split dimension is known, but the sizes are unknown, then
+        // only the split dimension is unknown
+        output_shape = input;
+        TF_RETURN_IF_ERROR(c->ReplaceDim(output_shape,
+                                         c->Value(split_dimension),
+                                         c->UnknownDim(), &output_shape));
+        for (int i = 0; i < num_outputs; ++i) {
+          c->set_output(i, output_shape);
+        }
+      } else if (size_splits == nullptr && !c->ValueKnown(split_dimension)) {
        // If split dimension or tensor containing the split sizes is unknown,
        // then return unknown shapes of same rank as input.
        output_shape = c->UnknownShapeOfRank(rank);
@ -540,12 +550,37 @@ REGISTER_OP("SplitV")
          return errors::InvalidArgument(
              "Length of size_splits should be equal to num_outputs");
        }
+        int64_t cumsum_outputs = 0;
+        bool has_neg_one = false;
+        // If the sizes of the splits are known, then
+        // make sure that the sizes add up to the expected
+        // dimension size, with the possibility of a -1.
+        // Specify the full output shapes.
        for (int i = 0; i < num_outputs; ++i) {
          output_shape = c->UnknownShapeOfRank(rank);
          TF_RETURN_IF_ERROR(c->ReplaceDim(input, split_dim,
                                           c->MakeDim(data[i]), &output_shape));
          c->set_output(i, output_shape);
+          if (data[i] == -1 && !has_neg_one)
+            has_neg_one = true;
+          else if (data[i] == -1 && has_neg_one)
+            return errors::InvalidArgument("size_splits can only have one -1");
+          else
+            cumsum_outputs += data[i];
        }
+        auto split_dim_size = c->Value(c->Dim(input, split_dim));
+        if (has_neg_one) {
+          if (cumsum_outputs < split_dim_size)
+            cumsum_outputs = split_dim_size;
+          else
+            cumsum_outputs = split_dim_size + 1;
+        }
+        if (cumsum_outputs != c->Value(c->Dim(input, split_dim)))
+          return errors::InvalidArgument(
+              "Sum of output sizes must match "
+              "the size of the original Tensor along the split dimension "
+              "or the sum of the positive sizes must be less if it contains a "
+              "-1");
      }

      return Status::OK();
--- a/tensorflow/core/protobuf/device_properties.proto
+++ b/tensorflow/core/protobuf/device_properties.proto
@ -0,0 +1,51 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+syntax = "proto3";
+
+package tensorflow;
+option cc_enable_arenas = true;
+option java_outer_classname = "DevicePropertiesProtos";
+
+message DeviceProperties {
+  // Device type (CPU, GPU, ...)
+  string type = 1;
+  // Vendor (Intel, nvidia, ...)
+  string vendor = 2;
+  // Model (Haswell, K40, ...)
+  string model = 3;
+  // Core Frequency in Mhz
+  int64 frequency = 4;
+  // Number of cores
+  int64 num_cores = 5;
+  // Version of the tools and libraries used with this device (e.g. gcc 4.9,
+  // cudnn 5.1)
+  map<string, string> environment = 6;
+  // Number of registers per core.
+  int64 num_registers = 7;
+  // L1 cache size in bytes
+  int64 l1_cache_size = 8;
+  // L2 cache size in bytes
+  int64 l2_cache_size = 9;
+  // L3 cache size in bytes
+  int64 l3_cache_size = 10;
+  // Shared memory size per multiprocessor in bytes. This field is
+  // applicable to GPUs only.
+  int64 shared_memory_size_per_multiprocessor = 11;
+  // Memory size in bytes
+  int64 memory_size = 12;
+  // Memory bandwidth in KB/s
+  int64 bandwidth = 13;
+}
--- a/tensorflow/core/util/cuda_kernel_helper.h
+++ b/tensorflow/core/util/cuda_kernel_helper.h
@ -63,6 +63,28 @@ inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
  return config;
 }

+// Calculate the Cuda launch config we should use for a kernel launch. This
+// variant takes the resource limits of func into account to maximize occupancy.
+template <typename DeviceFunc>
+inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
+                                            const GPUDevice& d, DeviceFunc func,
+                                            size_t dynamic_shared_memory_size) {
+  int block_count = 0;
+  int thread_per_block = 0;
+  cudaOccupancyMaxPotentialBlockSize(&block_count, &thread_per_block, func,
+                                     dynamic_shared_memory_size,
+                                     work_element_count);
+  block_count =
+      std::min(block_count,
+               (work_element_count + thread_per_block - 1) / thread_per_block);
+
+  CudaLaunchConfig config;
+  config.virtual_thread_count = work_element_count;
+  config.thread_per_block = thread_per_block;
+  config.block_count = block_count;
+  return config;
+}
+
 struct Cuda2DLaunchConfig {
  dim3 virtual_thread_count;
  dim3 thread_per_block;
--- a/tensorflow/opensource_only/eigen.threadpool
+++ b/tensorflow/opensource_only/eigen.threadpool
@ -0,0 +1 @@
+#include "unsupported/Eigen/CXX11/ThreadPool"
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@ -393,6 +393,17 @@ def bucketized_column(source_column, boundaries):
  `boundaries=[0., 1., 2.]` generates buckets `(-inf, 0.)`, `[0., 1.)`,
  `[1., 2.)`, and `[2., +inf)`.

+  For example, if the inputs are
+    `boundaries` = [0, 10, 100]
+    input tensor = [[-5, 10000]
+                    [150,   10]
+                    [5,    100]]
+
+  then the output will be
+    output = [[0, 3]
+              [3, 2]
+              [1, 3]]
+
  Example:

  ```python
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@ -324,6 +324,48 @@ class FunctionTest(test.TestCase):
                                   "assertion"):
        _ = MyFn(100.0).eval()

+  def testControlFlowStrictness(self):
+    """Inlined functions must not execute in a untaken control flow branch."""
+
+    @function.Defun(dtypes.int32)
+    def AssertFail(x):
+      # Assertion that always fails and does not have a data dependency on `x`.
+      assert_false = control_flow_ops.Assert(False, [42])
+      with ops.control_dependencies([assert_false]):
+        return array_ops.identity(x)
+
+    with ops.device("CPU"):
+      pred = array_ops.placeholder(dtypes.bool)
+      x = array_ops.placeholder(dtypes.int32)
+      cond = control_flow_ops.cond(pred, lambda: x + 1, lambda: AssertFail(x))
+      # pylint: disable=unnecessary-lambda
+      loop = control_flow_ops.while_loop(lambda y: pred,
+                                         lambda y: AssertFail(y), [x])
+      # pylint: enable=unnecessary-lambda
+
+    # Enables inlining.
+    config = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
+        optimizer_options=config_pb2.OptimizerOptions(
+            opt_level=config_pb2.OptimizerOptions.L0,
+            do_common_subexpression_elimination=True,
+            do_function_inlining=True,
+            do_constant_folding=True)))
+
+    with session.Session(config=config) as sess:
+      # Since the 'False' branch is not taken, the assertion should not fire.
+      self.assertEqual(4, sess.run(cond, {pred: True, x: 3}))
+
+      # The assertion should still fire if the False branch is taken.
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "assertion"):
+        sess.run(cond, {pred: False, x: 3})
+
+      # Similarly for loops.
+      self.assertEqual(3, sess.run(loop, {pred: False, x: 3}))
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "assertion"):
+        sess.run(loop, {pred: True, x: 3})
+
  def testVar(self):

    @function.Defun(dtypes.float32)
--- a/tensorflow/python/kernel_tests/bias_op_test.py
+++ b/tensorflow/python/kernel_tests/bias_op_test.py
@ -184,8 +184,11 @@ class BiasAddTest(test.TestCase):
      if dtype == dtypes.float64:
        threshold = 1e-10
      self.assertAllClose(tensor_jacob_t, tensor_jacob_n, threshold, threshold)
-      self.assertAllClose(bias_jacob_t, bias_jacob_n, threshold, threshold)
-      self.assertAllClose(grad_jacob_t, grad_jacob_n, threshold, threshold)
+      # TODO(annarev): Re-add assertion for float16, float32 dtypes and NCHW
+      # once we figure out why this check started failing with cuda mavx.
+      if dtype == dtypes.float64 or data_format != "NCHW":
+        self.assertAllClose(bias_jacob_t, bias_jacob_n, threshold, threshold)
+        self.assertAllClose(grad_jacob_t, grad_jacob_n, threshold, threshold)

  def testGradientTensor(self):
    for (data_format, use_gpu) in GetTestConfigs():
--- a/tensorflow/python/kernel_tests/split_op_test.py
+++ b/tensorflow/python/kernel_tests/split_op_test.py
@ -22,6 +22,7 @@ import numpy as np

 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
@ -40,6 +41,42 @@ class SplitOpTest(test.TestCase):
      data -= 1j * data
    return data

+  def testShapeInference(self):
+    model_input = array_ops.placeholder(dtypes.float32, shape=(1, 10))
+
+    # check that we fail during static shape inference if sizes are known
+    with self.assertRaises(ValueError):
+      # pylint: disable=expression-not-assigned
+      array_ops.split(model_input, [4], axis=1)[0]
+      # pylint: enable=expression-not-assigned
+
+    model_input = array_ops.placeholder(dtypes.float32)
+    inp = np.zeros((1, 10))
+    # check that we still fail at runtime if the shapes were unknown
+    with self.test_session(use_gpu=False) as sess:
+      with self.assertRaises(errors_impl.InvalidArgumentError):
+        sess.run(array_ops.split(model_input, [4]), {model_input: inp})
+
+    # test that we can pass a scalar Tensor as num_splits
+    with self.test_session(use_gpu=False) as sess:
+      result = sess.run(
+          array_ops.split(
+              array_ops.ones([4, 4]),
+              num_or_size_splits=array_ops.ones([2, 2]).get_shape()[1],
+              axis=0))
+
+    self.assertEqual(result[0].shape, (2, 4))
+    self.assertEqual(result[1].shape, (2, 4))
+
+    # test that none split dimensions remain, even if we don't know how
+    # the split_dim will be split, but we do know the axis
+    result = array_ops.split(
+        array_ops.ones([5, 2]), array_ops.constant([2, 1, 2]) * 1, axis=0)
+
+    self.assertEqual(result[0].shape[1], 2)
+    self.assertEqual(result[1].shape[1], 2)
+    self.assertEqual(result[2].shape[1], 2)
+
  def testExplicitNum(self):
    size_splits = array_ops.placeholder(dtype=dtypes.int32, shape=[None])

--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@ -84,7 +84,6 @@ from __future__ import print_function

 import sys
 import numpy as np
-import six

 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import constant_op
@ -1165,13 +1164,14 @@ def sparse_mask(a, mask_indices, name=None):
 def split(value, num_or_size_splits, axis=0, num=None, name="split"):
  """Splits a tensor into sub tensors.

-  If `num_or_size_splits` is a scalar, `num_split`, then splits `value` along
-  dimension `axis` into `num_split` smaller tensors.
+  If `num_or_size_splits` is an integer type, `num_split`, then splits `value`
+  along dimension `axis` into `num_split` smaller tensors.
  Requires that `num_split` evenly divides `value.shape[axis]`.

-  If `num_or_size_splits` is a tensor, `size_splits`, then splits `value` into
-  `len(size_splits)` pieces. The shape of the `i`-th piece has the same size as
-  the `value` except along dimension `axis` where the size is `size_splits[i]`.
+  If `num_or_size_splits` is not an integer type, it is presumed to be a Tensor
+  `size_splits`, then splits `value` into `len(size_splits)` pieces. The shape
+  of the `i`-th piece has the same size as the `value` except along dimension
+  `axis` where the size is `size_splits[i]`.

  For example:

@ -1189,11 +1189,11 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):

  Args:
    value: The `Tensor` to split.
-    num_or_size_splits: Either an integer indicating the number of splits along
-      split_dim or a 1-D Tensor containing the sizes of each output tensor
-      along split_dim. If an integer then it must evenly divide
-      `value.shape[axis]`; otherwise the sum of sizes along the split
-      dimension must match that of the `value`.
+    num_or_size_splits: Either a 0-D integer `Tensor` indicating the number of
+      splits along split_dim or a 1-D integer `Tensor` integer tensor containing
+      the sizes of each output tensor along split_dim. If a scalar then it must
+      evenly divide `value.shape[axis]`; otherwise the sum of sizes along the
+      split dimension must match that of the `value`.
    axis: A 0-D `int32` `Tensor`. The dimension along which to split.
      Must be in the range `[0, rank(value))`. Defaults to 0.
    num: Optional, used to specify the number of outputs when it cannot be
@ -1209,11 +1209,11 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
  Raises:
    ValueError: If `num` is unspecified and cannot be inferred.
  """
-  if isinstance(num_or_size_splits, six.integer_types):
+  size_splits = ops.convert_to_tensor(num_or_size_splits)
+  if size_splits.get_shape().ndims == 0 and size_splits.dtype.is_integer:
    return gen_array_ops._split(
        split_dim=axis, num_split=num_or_size_splits, value=value, name=name)
  else:
-    size_splits = ops.convert_to_tensor(num_or_size_splits)
    if num is None:
      size_splits_shape = size_splits.get_shape()
      num = size_splits_shape.dims[0]
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@ -1622,6 +1622,11 @@ class CondContext(ControlFlowContext):
          # pylint: enable=protected-access
      for x in op.outputs:
        self._values.add(x.name)
+      # pylint: disable=protected-access
+      if op.graph._is_function(op.type) or op.type == "SymbolicGradient":
+        op._add_control_input(self._pivot.op)
+      # pylint: enable=protected-access
+
    if self._outer_context or not IsLoopExit(op):
      op.graph.prevent_fetching(op)

@ -2147,8 +2152,13 @@ class WhileContext(ControlFlowContext):
  def _MaybeAddControlDependency(self, op):
    """Add a control input to the op if it only depends on loop invariants."""
    def _IsOpFree(op):
+      """Determines if `op` needs a control dependency."""
      if op.control_inputs:
        return False
+      # pylint: disable=protected-access
+      if op.graph._is_function(op.type) or op.type == "SymbolicGradient":
+        return True
+      # pylint: enable=protected-access
      for x in op.inputs:
        if not _IsLoopConstantEnter(x.op):
          return False
--- a/tensorflow/tensorboard/backend/event_processing/plugin_asset_util.py
+++ b/tensorflow/tensorboard/backend/event_processing/plugin_asset_util.py
@ -30,6 +30,11 @@ def _IsDirectory(parent, item):
  return gfile.IsDirectory(os.path.join(parent, item))


+def PluginDirectory(logdir, plugin_name):
+  """Returns the plugin directory for plugin_name."""
+  return os.path.join(logdir, _PLUGINS_DIR, plugin_name)
+
+
 def ListPlugins(logdir):
  """List all the plugins that have registered assets in logdir.

@ -61,7 +66,7 @@ def ListAssets(logdir, plugin_name):
    not exist (either because the logdir doesn't exist, or because the plugin
    didn't register) an empty list is returned.
  """
-  plugin_dir = os.path.join(logdir, _PLUGINS_DIR, plugin_name)
+  plugin_dir = PluginDirectory(logdir, plugin_name)
  if not gfile.IsDirectory(plugin_dir):
    return []
  entries = gfile.ListDirectory(plugin_dir)
@ -83,7 +88,7 @@ def RetrieveAsset(logdir, plugin_name, asset_name):
    KeyError: if the asset does not exist.
  """

-  asset_path = os.path.join(logdir, _PLUGINS_DIR, plugin_name, asset_name)
+  asset_path = os.path.join(PluginDirectory(logdir, plugin_name), asset_name)
  try:
    with gfile.Open(asset_path, "r") as f:
      return f.read()
--- a/tensorflow/tensorboard/backend/event_processing/plugin_asset_util_test.py
+++ b/tensorflow/tensorboard/backend/event_processing/plugin_asset_util_test.py
@ -50,6 +50,11 @@ class PluginGamma(GenericContentPlugin):

 class PluginAssetUtilitiesTest(test.TestCase):

+  def testGetPluginDirectory(self):
+    self.assertEqual(
+        os.path.join("logdir", "plugins", "x"),
+        plugin_asset_util.PluginDirectory("logdir", "x"))
+
  def testNonExistentDirectory(self):
    tempdir = self.get_temp_dir()
    fake_dir = os.path.join(tempdir, "nonexistent_dir")
--- a/tensorflow/tensorboard/components/tf_dashboard_common/BUILD
+++ b/tensorflow/tensorboard/components/tf_dashboard_common/BUILD
@ -46,6 +46,7 @@ tensorboard_typescript_genrule(
    ],
    typings = [
        "@org_definitelytyped//:d3.d.ts",
+        "@org_definitelytyped//:lodash.d.ts",
        "//tensorflow/tensorboard/components/vz_sorting:ts_typings",
    ],
 )
--- a/tensorflow/tensorboard/components/tf_dashboard_common/categorizer.ts
+++ b/tensorflow/tensorboard/components/tf_dashboard_common/categorizer.ts
@ -72,24 +72,31 @@ module Categorizer {
      if (tags.length === 0) {
        return [];
      }
-      let sortedTags = tags.slice().sort(VZ.Sorting.compareTagNames);
-      let categories: Category[] = [];
-      let currentCategory = {
-        name: extractor(sortedTags[0]),
-        tags: [],
-      };
-      sortedTags.forEach((t: string) => {
-        let topLevel = extractor(t);
-        if (currentCategory.name !== topLevel) {
-          categories.push(currentCategory);
-          currentCategory = {
+
+      // Maps between top-level name and category. We use the mapping to avoid
+      // duplicating categories per run.
+      const categoryMapping: {[key: string]: Category} = {};
+
+      tags.forEach((t: string) => {
+        const topLevel = extractor(t);
+        if (!categoryMapping[topLevel]) {
+          const newCategory = {
            name: topLevel,
            tags: [],
          };
+          categoryMapping[topLevel] = newCategory;
        }
-        currentCategory.tags.push(t);
+
+        categoryMapping[topLevel].tags.push(t);
+      });
+
+      // Sort categories into alphabetical order.
+      const categories =
+          _.map(_.keys(categoryMapping).sort(), key => categoryMapping[key]);
+      _.forEach(categories, (category) => {
+        // Sort the tags within each category.
+        category.tags.sort(VZ.Sorting.compareTagNames);
      });
-      categories.push(currentCategory);
      return categories;
    };
  }
--- a/tensorflow/tensorboard/components/tf_dashboard_common/test/categorizerTest.ts
+++ b/tensorflow/tensorboard/components/tf_dashboard_common/test/categorizerTest.ts
@ -62,6 +62,18 @@ module Categorizer {
        assert.deepEqual(
            topLevelNamespaceCategorizer(['a']), [{name: 'a', tags: ['a']}]);
      });
+
+      it('only create 1 category per run', () => {
+        // TensorBoard separates runs from tags using the / and _ characters
+        // *only* during sorting. The categorizer should group all tags under
+        // their correct categories - and create only 1 category per run.
+        const tags = ['foo/bar', 'foo_in_between_run/baz', 'foo/quux'];
+        const expected = [
+          {name: 'foo', tags: ['foo/bar', 'foo/quux']},
+          {name: 'foo_in_between_run', tags: ['foo_in_between_run/baz']},
+        ];
+        assert.deepEqual(topLevelNamespaceCategorizer(tags), expected);
+      });
    });

    describe('customCategorizer', () => {
--- a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-categorizer.ts
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-categorizer.ts
@ -73,24 +73,31 @@ function extractorToCategorizer(extractor: (s: string) => string): Categorizer {
    if (tags.length === 0) {
      return [];
    }
-    let sortedTags = tags.slice().sort(compareTagNames);
-    let categories: Category[] = [];
-    let currentCategory = {
-      name: extractor(sortedTags[0]),
-      tags: [],
-    };
-    sortedTags.forEach((t: string) => {
-      let topLevel = extractor(t);
-      if (currentCategory.name !== topLevel) {
-        categories.push(currentCategory);
-        currentCategory = {
+
+    // Maps between top-level name and category. We use the mapping to avoid
+    // duplicating categories per run.
+    const categoryMapping: {[key: string]: Category} = {};
+
+    tags.forEach((t: string) => {
+      const topLevel = extractor(t);
+      if (!categoryMapping[topLevel]) {
+        const newCategory = {
          name: topLevel,
          tags: [],
        };
+        categoryMapping[topLevel] = newCategory;
      }
-      currentCategory.tags.push(t);
+
+      categoryMapping[topLevel].tags.push(t);
+    });
+
+    // Sort categories into alphabetical order.
+    const categories =
+        _.map(_.keys(categoryMapping).sort(), key => categoryMapping[key]);
+    _.forEach(categories, (category) => {
+      // Sort the tags within each category.
+      category.tags.sort(compareTagNames);
    });
-    categories.push(currentCategory);
    return categories;
  };
 }
@ -180,4 +187,4 @@ Polymer({
      this._setCategories(categories);
    })
  },
-});
+});
--- a/tensorflow/tensorboard/components/tf_graph_loader/BUILD
+++ b/tensorflow/tensorboard/components/tf_graph_loader/BUILD
@ -0,0 +1,46 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+webfiles(
+    name = "tf_graph_loader",
+    srcs = [
+        "tf-graph-loader.html",
+    ],
+    path = "/tf-graph-loader",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_common",
+        "@org_polymer",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-graph-loader.html",
+    ],
+    destdir = "tf-graph-loader",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_common:legacy",
+    ],
+)
+
+# This is needed despite how this component lacks TypeScript files because
+# components/BUILD seeks a legacy_ts rule in this package.
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [],
+)
--- a/tensorflow/tensorboard/components/tf_graph_loader/demo/BUILD
+++ b/tensorflow/tensorboard/components/tf_graph_loader/demo/BUILD
@ -0,0 +1,24 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_loader/demo
+webfiles(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-graph-loader/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_loader",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
--- a/tensorflow/tensorboard/components/tf_graph_loader/demo/data/graph.pbtxt
+++ b/tensorflow/tensorboard/components/tf_graph_loader/demo/data/graph.pbtxt
--- a/tensorflow/tensorboard/components/tf_graph_loader/demo/index.html
+++ b/tensorflow/tensorboard/components/tf_graph_loader/demo/index.html
@ -0,0 +1,78 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<html>
+  <head>
+    <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+    <link rel="import" href="../tf-graph-loader.html">
+    <link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+    <title>TF Graph Loader Demo</title>
+  </head>
+  <body>
+    <demo-snippet>
+      <template>
+        <dom-module id="tf-graph-loader-demo">
+          <template>
+             <tf-graph-loader id="loader"
+                datasets="[[_datasets]]"
+                selected-dataset="[[_selectedDataset]]"
+                progress="{{_progress}}"></tf-graph-loader>
+          </template>
+          <script>
+            Polymer({
+              is: "tf-graph-loader-demo",
+              properties: {
+                // We tell the graph loader to load a specific pbtxt file.
+                _datasets: {
+                  type: Array,
+                  value: [{
+                    "name": "Graph with XLA Clusters Specified",
+                    "path": "data/graph.pbtxt"
+                  }],
+                },
+                _selectedDataset: {
+                  type: Number,
+                  value: 0,
+                },
+
+                // This property will be updated by the graph loader.
+                _progress: {
+                  type: Object,
+                },
+              },
+              observers: [
+                '_progressUpdated(_progress)',
+              ],
+              _progressUpdated(progress) {
+                // console.log the progress.
+                console.log('Progress updated.', progress);
+
+                // The graph has loaded. console.log it.
+                if (progress.value == 100) {
+                  console.log('graph', this.$.loader.outGraph);
+                }
+              },
+            });
+          </script>
+        </dom-module>
+        <!-- The graph loader lacks visual elements. -->
+        <tf-graph-loader-demo></tf-graph-loader-demo>
+      </template>
+    </demo-snippet>
+  </body>
+</html>
--- a/tensorflow/tensorboard/components/tf_graph_loader/tf-graph-loader.html
+++ b/tensorflow/tensorboard/components/tf_graph_loader/tf-graph-loader.html
@ -16,6 +16,7 @@ limitations under the License.
 -->

 <link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-graph-common/tf-graph-common.html">

 <!--
 An element which provides a filter parsing for pbtxt to graph output.
--- a/tensorflow/tools/tfprof/README.md
+++ b/tensorflow/tools/tfprof/README.md
@ -10,12 +10,17 @@ Consultants: Jon Shlens, Pete Warden
 1.  Measure model parameters, float operations, tensor shapes.
 2.  Measure op execution times, requested memory size and device placement.
 3.  Inspect checkpoint tensors' shapes and their values.
-4.  Explore model based on name scope or graph structure.
+4.  3 ways to view and explore TensorFlow model profiles
+
+    *  Organize by Python code call stack.
+    *  Organize by TensorFlow operation name scope hierarchies.
+    *  Organize by TensorFlow operation inputs/outputs graph.
+
 5.  Selectively grouping/filtering/accounting/ordering ops.

 [Python API Tutorials](#python-api-tutorials): It can be called directly from
 Python codes. Results are either printed
-to stdout or dumped to file. tensorflow.tfprof.TFProfNode proto is returned from
+to stdout or dumped to file. tensorflow.tfprof.TFGraphNodeProto proto is returned from
 the API to allow users to perform further analysis.

 [CLI Tutorials](#cli-tutorials):
@ -33,13 +38,23 @@ tfprof is part of TensorFlow core. Simply ```import tensorflow as tf```.
 ### Examine the shapes and sizes of all trainable Variables.
 ```python
 # Print trainable variable parameter statistics to stdout.
+# By default, statistics are associated with each graph node.
 param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
    tf.get_default_graph(),
    tfprof_options=tf.contrib.tfprof.model_analyzer.
        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)

-# param_stats is tensorflow.tfprof.TFProfNode proto. It organize the statistics
-# of each graph node in tree scructure. Let's print the root below.
+
+# Set tfprof_cmd='code' to associate statistics with Python codes.
+opts = tf.contrib.tfprof.model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS
+opts['show_name_regexes'] = ['.*my_code1.py.*', '.*my_code2.py.*']
+param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
+    tf.get_default_graph(),
+    tfprof_cmd='code'
+    tfprof_options=opts)
+
+# param_stats is tensorflow.tfprof.TFGraphNodeProto proto.
+# Let's print the root below.
 sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
 ```

@ -84,8 +99,20 @@ Finally, you may run `print_model_analysis` to explore the timing and memory
 demands of the model.

 ``` python
+# See model_analyzer_test.py for more examples.
+#
 # Print to stdout an analysis of the memory usage and the timing information
-# from running the graph broken down by operations.
+# broken down by python codes.
+opts = tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY.copy()
+opts['show_name_regexes'] = ['.*my_code.py.*']
+tf.contrib.tfprof.model_analyzer.print_model_analysis(
+    tf.get_default_graph(),
+    run_meta=run_metadata,
+    tfprof_cmd='code',
+    tfprof_options=opts)
+
+# Print to stdout an analysis of the memory usage and the timing information
+# broken down by operations.
 tf.contrib.tfprof.model_analyzer.print_model_analysis(
    tf.get_default_graph(),
    run_meta=run_metadata,
@ -138,9 +165,9 @@ bazel-bin/tensorflow/tools/tfprof/tfprof \
    --run_meta_path=run_meta \
    --checkpoint_path=model.ckpt
 #
-# tfprof_log is used to define customized op types and float ops.
+# tfprof_log is used to define customized op types, float ops and code traces.
 # Use tfprof_logger.write_op_log() to create tfprof_log.
-# See 11) in Examples section on generating tfprof_log file.
+# See 12) in Examples section on generating tfprof_log file.
 bazel-bin/tensorflow/tools/tfprof/tfprof \
    --graph_path=graph.pbtxt \
    --run_meta_path=run_meta \
@ -174,7 +201,28 @@ tfprof>
 -dump_to_file
 ```

-3) I want to see the `BatchNorm`'s gamma value in checkpoint.
+3) I want to see which line of my python codes costs most time!
+
+```shell
+# Requires --graph_path --op_log_path
+tfprof> code -max_depth 1000 -show_name_regexes .*model_analyzer.*py.* -select micros -account_type_regexes .* -order_by micros
+_TFProfRoot (0us/22.44ms)
+  model_analyzer_test.py:149:run_filename_as_m...:none (0us/22.44ms)
+    model_analyzer_test.py:33:_run_code_in_main:none (0us/22.44ms)
+      model_analyzer_test.py:208:<module>:test.main() (0us/22.44ms)
+        model_analyzer_test.py:132:testComplexCodeView:x = lib.BuildFull... (0us/22.44ms)
+          model_analyzer_testlib.py:63:BuildFullModel:return sgd_op.min... (0us/21.83ms)
+          model_analyzer_testlib.py:58:BuildFullModel:cell, array_ops.c... (0us/333us)
+          model_analyzer_testlib.py:54:BuildFullModel:seq.append(array_... (0us/254us)
+            model_analyzer_testlib.py:42:BuildSmallModel:x = nn_ops.conv2d... (0us/134us)
+            model_analyzer_testlib.py:46:BuildSmallModel:initializer=init_... (0us/40us)
+            ...
+          model_analyzer_testlib.py:61:BuildFullModel:loss = nn_ops.l2_... (0us/28us)
+          model_analyzer_testlib.py:60:BuildFullModel:target = array_op... (0us/0us)
+        model_analyzer_test.py:134:testComplexCodeView:sess.run(variable... (0us/0us)
+```
+
+4) I want to see the `BatchNorm`'s gamma value in checkpoint.

 ```shell
 # Requires --graph_path, --checkpoint_path.
@ -186,7 +234,7 @@ _TFProfRoot ()
 [1.57 1.83 1.30 1.25 1.59 1.14 1.26 0.82 1.19 1.10 1.48 1.01 0.82 1.23 1.21 1.14 ],
 ```

-4) I want to see my checkpoint tensors shape and number of parameters.
+5) I want to see my checkpoint tensors shape and number of parameters.

 ```shell
 # Requires --graph_path, --checkpoint_path.
@ -205,7 +253,7 @@ _TFProfRoot (--/930.58k params)
  unit_last/final_bn/moving_variance (64, 64/64 params)
 ```

-5) I defined an op named ‘cost’ to calculate the loss. I want to know what ops
+6) I defined an op named ‘cost’ to calculate the loss. I want to know what ops
 it depends on take a long time to run. Hint: Use the ‘graph’ command to explore
 graph dependencies.

@ -221,7 +269,7 @@ _TFProfRoot (0us/3.61sec)
  unit_3_3/sub2/conv2/Conv2D (10.26ms/3.60sec)
 ```

-6) I want to know the expensive operations during the back propagation.
+7) I want to know the expensive operations during the back propagation.
 Hint: tensorflow prepend ‘gradient’ to your defined name scopes. Use the ‘scope’
 command to explore based on name scope hierarchies.

@ -238,7 +286,7 @@ _TFProfRoot (0us/2.29sec)
  ...
 ```

-7) Show the number of float operations in the model.
+8) Show the number of float operations in the model.
 Note: float operations calculation depends on
 1) op.RegisterStatistics. If an op doesn’t
 have RegisterStatistics defined, its float operations cannot be counted.
@ -263,7 +311,7 @@ _TFProfRoot (0/17.63b flops)
  ...
 ```

-8) Show the number of parameters of all `tf.trainable_variables()` in the model.
+9) Show the number of parameters of all `tf.trainable_variables()` in the model.

 ```shell
 # Requires --graph_path --op_log_path.
@ -283,7 +331,7 @@ generated by write_op_log() Python API. write_op_log() help users create some
 common op types implicitly. Users can define their own op types and log it
 through the write_op_log() API.

-9) What if I’m lazy and don’t want to define op type? I have given my ops
+109) What if I’m lazy and don’t want to define op type? I have given my ops
 well-defined names in my model’s code. And want to use names to select a group
 of ops. Let’s try it!

@ -301,7 +349,7 @@ in terminal. Otherwise, tfprof accounts all ops matched by
 `-account_type_regexes` recursively even if they are hidden due to some
 options such as -max_depth.

-10) TensorFlow has built-in op types. For example, built-in op type `Variable`
+11) TensorFlow has built-in op types. For example, built-in op type `Variable`
 seems to include `Variable's` created by your model. However, be careful when
 depending on it because TensorFlow creates extra `Variable` ops implicitly and
 the implicitly created ops can have the same prefix as the `Variable's` you
@ -327,7 +375,7 @@ _TFProfRoot (--/930.58k params)
 ```


-11) A example of defining extra op type for ops using `OpLog`
+12) A example of defining extra op type for ops using `OpLog`

 First, in Python code, create an `OpLog` proto and add op type
 information to it:
--- a/tensorflow/tools/tfprof/internal/BUILD
+++ b/tensorflow/tools/tfprof/internal/BUILD
@ -15,6 +15,7 @@ cc_library(
    srcs = ["tfprof_stats.cc"],
    hdrs = ["tfprof_stats.h"],
    deps = [
+        ":tfprof_code",
        ":tfprof_graph",
        ":tfprof_node",
        ":tfprof_options",
@ -61,6 +62,27 @@ cc_library(
    ],
 )

+cc_library(
+    name = "tfprof_code",
+    srcs = ["tfprof_code.cc"],
+    hdrs = ["tfprof_code.h"],
+    deps = [
+        ":tfprof_constants",
+        ":tfprof_node",
+        ":tfprof_options",
+        ":tfprof_show_code",
+        ":tfprof_tensor",
+        ":tfprof_utils",
+        "//tensorflow/c:c_api",
+        "//tensorflow/c:checkpoint_reader",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:regexp_internal",
+        "//tensorflow/tools/tfprof:protos_all_cc",
+    ],
+)
+
 cc_library(
    name = "tfprof_graph",
    srcs = ["tfprof_graph.cc"],
@ -98,6 +120,26 @@ cc_library(
    ],
 )

+cc_library(
+    name = "tfprof_show_code",
+    srcs = ["tfprof_show_code.cc"],
+    hdrs = ["tfprof_show_code.h"],
+    deps = [
+        ":tfprof_constants",
+        ":tfprof_node",
+        ":tfprof_options",
+        ":tfprof_scope",
+        ":tfprof_show",
+        ":tfprof_tensor",
+        ":tfprof_utils",
+        "//tensorflow/c:checkpoint_reader",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:regexp_internal",
+        "//tensorflow/tools/tfprof:protos_all_cc",
+    ],
+)
+
 tf_cc_test(
    name = "tfprof_show_test",
    srcs = ["tfprof_show_test.cc"],
--- a/tensorflow/tools/tfprof/internal/print_model_analysis.cc
+++ b/tensorflow/tools/tfprof/internal/print_model_analysis.cc
@ -40,13 +40,13 @@ string PrintModelAnalysis(const string* graph, const string* run_meta,
  graph_ptr->ParseFromString(*graph);

  std::unique_ptr<RunMetadata> run_meta_ptr;
-  if (run_meta) {
+  if (run_meta && !run_meta->empty()) {
    run_meta_ptr.reset(new RunMetadata());
    run_meta_ptr->ParseFromString(*run_meta);
  }

  std::unique_ptr<OpLog> op_log_ptr;
-  if (op_log) {
+  if (op_log && !op_log->empty()) {
    op_log_ptr.reset(new OpLog());
    op_log_ptr->ParseFromString(*op_log);
  }
@ -58,16 +58,27 @@ string PrintModelAnalysis(const string* graph, const string* run_meta,

  Options opts = Options::FromProtoStr(*options);

+  // TODO(xpan): We should have dump_to_file/print_stdout/etc to control
+  // side-effects independently instead of one controlling the other.
  if (opts.dump_to_file.empty()) {
    printf("\n=========================Options=============================\n");
    printf("%s", opts.ToString().c_str());
    printf("\n==================Model Analysis Report======================\n");
-    TFProfNode root(tf_stats.PrintGraph(*command, opts));
+    string ret = "";
+    if (*command == kCmds[2]) {
+      ret = tf_stats.PrintCode(opts).SerializeAsString();
+    } else {
+      ret = tf_stats.PrintGraph(*command, opts).SerializeAsString();
+    }
    printf("\n======================End of Report==========================\n");
    fflush(stdout);
-    return root.SerializeAsString();
+    return ret;
+  }
+  if (*command == kCmds[2]) {
+    return tf_stats.PrintCode(opts).SerializeAsString();
+  } else {
+    return tf_stats.PrintGraph(*command, opts).SerializeAsString();
  }
-  return tf_stats.PrintGraph(*command, opts).SerializeAsString();
 }
 }  // namespace tfprof
 }  // namespace tensorflow
--- a/Show More
+++ b/Show More
				`@ -0,0 +1 @@`
				`#include "unsupported/Eigen/CXX11/ThreadPool"`