[XLA] Convert XLA to use xla::se as a namespace alias for ::stream_executor.
PiperOrigin-RevId: 193301997
Commit: d77a621a57 (parent: 41e2cd187b)
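For readers skimming the diff below, the change amounts to declaring one namespace alias inside namespace xla and then shortening spellings throughout the tree. A minimal, self-contained sketch of the idea (the forward declaration and the ConfigureForPlatform signature are hypothetical, for illustration only; per the diff, the real alias is pulled in via tensorflow/compiler/xla/types.h):

    // Illustrative sketch only, not TensorFlow source.
    namespace stream_executor {
    class Platform;  // hypothetical forward declaration so the sketch is self-contained
    }  // namespace stream_executor

    namespace xla {
    // The alias this commit introduces: XLA code can now write se::Platform,
    // se::Stream, se::StreamExecutor, ... instead of perftools::gputools::...
    // or per-file "namespace se = ::perftools::gputools;" aliases.
    namespace se = ::stream_executor;

    // Hypothetical signature, shown only to illustrate the shorter spelling.
    void ConfigureForPlatform(se::Platform* platform);
    }  // namespace xla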
@@ -443,6 +443,9 @@ cc_library(
srcs = ["executable_run_options.cc"],
hdrs = ["executable_run_options.h"],
visibility = ["//visibility:public"],
deps = [
":types",
],
)

cc_library(
@@ -23,22 +23,19 @@ limitations under the License.

namespace xla {

LocalClientOptions::LocalClientOptions(perftools::gputools::Platform* platform,
LocalClientOptions::LocalClientOptions(se::Platform* platform,
int number_of_replicas,
int intra_op_parallelism_threads)
: platform_(platform),
number_of_replicas_(number_of_replicas),
intra_op_parallelism_threads_(intra_op_parallelism_threads) {}

LocalClientOptions& LocalClientOptions::set_platform(
perftools::gputools::Platform* platform) {
LocalClientOptions& LocalClientOptions::set_platform(se::Platform* platform) {
platform_ = platform;
return *this;
}

perftools::gputools::Platform* LocalClientOptions::platform() const {
return platform_;
}
se::Platform* LocalClientOptions::platform() const { return platform_; }

LocalClientOptions& LocalClientOptions::set_number_of_replicas(
int number_of_replicas) {
@@ -69,7 +66,7 @@ ClientLibrary::ClientLibrary() = default;
ClientLibrary::~ClientLibrary() = default;

/* static */ StatusOr<LocalClient*> ClientLibrary::GetOrCreateLocalClient(
perftools::gputools::Platform* platform) {
se::Platform* platform) {
LocalClientOptions default_options;
default_options.set_platform(platform);
return GetOrCreateLocalClient(default_options);
@@ -77,7 +74,7 @@ ClientLibrary::~ClientLibrary() = default;

/* static */ StatusOr<LocalClient*> ClientLibrary::GetOrCreateLocalClient(
const LocalClientOptions& options) {
perftools::gputools::Platform* platform = options.platform();
se::Platform* platform = options.platform();
int replica_count = options.number_of_replicas();
ClientLibrary& client_library = Singleton();
tensorflow::mutex_lock lock(client_library.service_mutex_);
@@ -115,7 +112,7 @@ ClientLibrary::~ClientLibrary() = default;
}

/* static */ LocalService* ClientLibrary::GetXlaService(
perftools::gputools::Platform* platform) {
se::Platform* platform) {
ClientLibrary& client_library = Singleton();
tensorflow::mutex_lock lock(client_library.service_mutex_);
auto it = client_library.local_instances_.find(platform->id());
@@ -124,8 +121,7 @@ ClientLibrary::~ClientLibrary() = default;
}

/* static */ StatusOr<CompileOnlyClient*>
ClientLibrary::GetOrCreateCompileOnlyClient(
perftools::gputools::Platform* platform) {
ClientLibrary::GetOrCreateCompileOnlyClient(se::Platform* platform) {
ClientLibrary& client_library = Singleton();
tensorflow::mutex_lock lock(client_library.service_mutex_);
@@ -43,13 +43,13 @@ namespace xla {
// Options to configure the local client when it is created.
class LocalClientOptions {
public:
LocalClientOptions(perftools::gputools::Platform* platform = nullptr,
LocalClientOptions(se::Platform* platform = nullptr,
int number_of_replicas = 1,
int intra_op_parallelism_threads = -1);

// Set the platform backing the service, or nullptr for the default platform.
LocalClientOptions& set_platform(perftools::gputools::Platform* platform);
perftools::gputools::Platform* platform() const;
LocalClientOptions& set_platform(se::Platform* platform);
se::Platform* platform() const;

// Set the number of replicas to use when compiling replicated
// programs.
@@ -61,7 +61,7 @@ class LocalClientOptions {
int intra_op_parallelism_threads() const;

private:
perftools::gputools::Platform* platform_;
se::Platform* platform_;
int number_of_replicas_;
int intra_op_parallelism_threads_;
};
@@ -74,7 +74,7 @@ class ClientLibrary {
// platform : The platform the underlying XLA service should target. If
// null then default platform is used.
static StatusOr<LocalClient*> GetOrCreateLocalClient(
perftools::gputools::Platform* platform = nullptr);
se::Platform* platform = nullptr);
static StatusOr<LocalClient*> GetOrCreateLocalClient(
const LocalClientOptions& options);

@@ -84,14 +84,14 @@ class ClientLibrary {

// Returns the service from the service thread. Only used in unit tests to
// access user computations from client.
static LocalService* GetXlaService(perftools::gputools::Platform* platform);
static LocalService* GetXlaService(se::Platform* platform);

// Singleton constructor-or-accessor for compile-only clients. Arguments:
//
// platform : The platform the underlying XLA service should target. If
// null then default platform is used.
static StatusOr<CompileOnlyClient*> GetOrCreateCompileOnlyClient(
perftools::gputools::Platform* platform = nullptr);
se::Platform* platform = nullptr);

// Clears the local instance and compile only instance caches. The client
// pointers returned by the previous GetOrCreateLocalClient() or
@@ -120,12 +120,10 @@ class ClientLibrary {
};

tensorflow::mutex service_mutex_;  // Guards the singleton creation state.
std::unordered_map<perftools::gputools::Platform::Id,
std::unique_ptr<LocalInstance>>
std::unordered_map<se::Platform::Id, std::unique_ptr<LocalInstance>>
local_instances_ GUARDED_BY(service_mutex_);

std::unordered_map<perftools::gputools::Platform::Id,
std::unique_ptr<CompileOnlyInstance>>
std::unordered_map<se::Platform::Id, std::unique_ptr<CompileOnlyInstance>>
compile_only_instances_ GUARDED_BY(service_mutex_);

TF_DISALLOW_COPY_AND_ASSIGN(ClientLibrary);
@@ -24,8 +24,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/source_map_util.h"
#include "tensorflow/compiler/xla/status_macros.h"

namespace se = ::perftools::gputools;

using xla::source_map_util::InvalidParameterArgument;

namespace xla {
@@ -167,7 +167,7 @@ class LocalClient : public Client {
StatusOr<int> ReplicaNumberToDeviceOrdinal(int replica_number);

// Returns the platform that the underlying service targets.
perftools::gputools::Platform* platform() const;
se::Platform* platform() const;

// Returns the number of devices on the system of the service platform
// type. Not all devices may be supported by the service (see
@@ -29,7 +29,7 @@ namespace xla {

// Returns a string that represents the device in terms of platform and ordinal;
// e.g. the first CUDA device will be "cuda:0"
string DeviceIdentifier(perftools::gputools::StreamExecutor* stream_exec) {
string DeviceIdentifier(se::StreamExecutor* stream_exec) {
return tensorflow::strings::StrCat(stream_exec->platform()->Name(), ":",
stream_exec->device_ordinal());
}
@@ -16,6 +16,9 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
#define TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_

// Pulls in the ::stream_executor -> ::xla::se namespace alias.
#include "tensorflow/compiler/xla/types.h"

// These classes are forward declared so that ExecutableRunOptions can be linked
// into an XLA-compiled binary without having to link all of the pointed-to
// objects (e.g., for an ahead-of-time compiled CPU binary, the gpu tools don't
@@ -37,10 +40,6 @@ struct ThreadPoolDevice;

namespace xla {

// TODO(b/77980417): Once the perftools::gputools -> stream_executor migration
// is complete, add "using namespace se = stream_executor" here and
// s/stream_executor/se::/ to match our idiom elsewhere.

class DeviceMemoryAllocator;
class DeviceAssignment;
class ExecutionProfile;
@@ -20,7 +20,7 @@ limitations under the License.
namespace xla {

/* static */ StatusOr<std::unique_ptr<GRPCService>> GRPCService::NewService(
perftools::gputools::Platform* platform) {
se::Platform* platform) {
std::unique_ptr<GRPCService> grpc_service(new GRPCService());
TF_ASSIGN_OR_RETURN(grpc_service->service_,
::xla::Service::NewService(platform));
@@ -29,7 +29,7 @@ class GRPCService : public grpc::XlaService::Service {
// that the service should target. If platform is null then the default
// platform is used.
static StatusOr<std::unique_ptr<GRPCService>> NewService(
perftools::gputools::Platform* platform = nullptr);
se::Platform* platform = nullptr);

::grpc::Status Computation(::grpc::ServerContext* context,
const ComputationRequest* arg,
@@ -204,7 +204,7 @@ StatusOr<std::vector<const ShapedBuffer*>> AllocationTracker::ResolveInternal(
}

void AllocationTracker::AddAllocationOrIncrementRefCount(
perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal) {
se::DeviceMemoryBase device_memory, int device_ordinal) {
AllocationMap& allocation_map = opaque_to_allocation_map_[device_ordinal];
auto it = allocation_map.find(device_memory.opaque());
if (it == allocation_map.end()) {
@@ -215,8 +215,8 @@ void AllocationTracker::AddAllocationOrIncrementRefCount(
}
}

Status AllocationTracker::DecrementRefCount(
perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal) {
Status AllocationTracker::DecrementRefCount(se::DeviceMemoryBase device_memory,
int device_ordinal) {
AllocationMap& allocation_map = opaque_to_allocation_map_[device_ordinal];
auto it = allocation_map.find(device_memory.opaque());
TF_RET_CHECK(it != allocation_map.end());
@@ -77,7 +77,7 @@ class AllocationTracker {
// Data structure encapsulating single memory allocation on the device.
struct Allocation {
// The pointer to this allocation.
perftools::gputools::DeviceMemoryBase device_memory;
se::DeviceMemoryBase device_memory;

// The device that the memory is allocated on.
int device_ordinal;
@@ -103,13 +103,13 @@ class AllocationTracker {

// Adds the given device address to the allocation tracker, or if it already
// exists, then increment it's reference count.
void AddAllocationOrIncrementRefCount(
perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal)
void AddAllocationOrIncrementRefCount(se::DeviceMemoryBase device_memory,
int device_ordinal)
EXCLUSIVE_LOCKS_REQUIRED(mutex_);

// Decrements the reference count of the given device memory. Then, if it is
// zero, deallocate the memory.
Status DecrementRefCount(perftools::gputools::DeviceMemoryBase device_memory,
Status DecrementRefCount(se::DeviceMemoryBase device_memory,
int device_ordinal) EXCLUSIVE_LOCKS_REQUIRED(mutex_);

// A map from device memory opaque value to allocation. One such map is
@@ -36,19 +36,14 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace se = ::perftools::gputools;

namespace xla {

BackendOptions& BackendOptions::set_platform(
perftools::gputools::Platform* platform) {
BackendOptions& BackendOptions::set_platform(se::Platform* platform) {
platform_ = platform;
return *this;
}

perftools::gputools::Platform* BackendOptions::platform() const {
return platform_;
}
se::Platform* BackendOptions::platform() const { return platform_; }

BackendOptions& BackendOptions::set_intra_op_parallelism_threads(
int num_threads) {
@@ -77,7 +72,7 @@ struct Backend::EigenThreadPoolWrapper {

/* static */ StatusOr<std::unique_ptr<Backend>> Backend::CreateBackend(
const BackendOptions& options) {
perftools::gputools::Platform* platform = options.platform();
se::Platform* platform = options.platform();
TF_ASSIGN_OR_RETURN(auto compiler, Compiler::GetForPlatform(platform));
TF_ASSIGN_OR_RETURN(auto stream_executors,
PlatformUtil::GetStreamExecutors(platform));
@@ -121,7 +116,7 @@ StatusOr<Backend::StreamPtr> Backend::BorrowStream(
}

Backend::Backend(
perftools::gputools::Platform* platform, Compiler* compiler,
se::Platform* platform, Compiler* compiler,
tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors,
TransferManager* transfer_manager, ComputationPlacer* computation_placer,
int intra_op_parallelism_threads)
@@ -178,7 +173,7 @@ tensorflow::thread::ThreadPool* Backend::eigen_intra_op_thread_pool() const {
return intra_op_thread_pool_wrapper_->pool.get();
}

StatusOr<perftools::gputools::StreamExecutor*> Backend::stream_executor(
StatusOr<se::StreamExecutor*> Backend::stream_executor(
int device_ordinal) const {
if (device_ordinal < 0 ||
device_ordinal > stream_executors_.back()->device_ordinal()) {
@@ -201,9 +196,9 @@ StatusOr<bool> Backend::devices_equivalent(int device_ordinal_a,
// bit crude but works for GPUs which is the important case where we compile
// an executable for one GPU and want to know if it will run (well) on
// another.
TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * executor_a,
TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor_a,
stream_executor(device_ordinal_a));
TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * executor_b,
TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor_b,
stream_executor(device_ordinal_b));
return (executor_a->GetDeviceDescription().name() ==
executor_b->GetDeviceDescription().name());
@@ -44,8 +44,8 @@ namespace xla {
class BackendOptions {
public:
// Set the platform backing the backend, or nullptr for the default platform.
BackendOptions& set_platform(perftools::gputools::Platform* platform);
perftools::gputools::Platform* platform() const;
BackendOptions& set_platform(se::Platform* platform);
se::Platform* platform() const;

// Sets the thread pool size for parallel execution of an individual operator.
// The default value of -1 will result in initializing the thread pool with
@@ -54,7 +54,7 @@ class BackendOptions {
int intra_op_parallelism_threads() const;

private:
perftools::gputools::Platform* platform_ = nullptr;
se::Platform* platform_ = nullptr;
int intra_op_parallelism_threads_ = -1;
};

@@ -66,7 +66,7 @@ class BackendOptions {
// StreamPtr stream = backend->BorrowStream().ConsumeValueOrDie();
class Backend {
public:
using StreamPtr = Pool<perftools::gputools::Stream>::SmartPtr;
using StreamPtr = Pool<se::Stream>::SmartPtr;

// Creates a new backend.
static StatusOr<std::unique_ptr<Backend>> CreateBackend(
@@ -79,7 +79,7 @@ class Backend {
~Backend();

// Accessors for the various objects.
perftools::gputools::Platform* platform() const { return platform_; }
se::Platform* platform() const { return platform_; }
Compiler* compiler() const { return compiler_; }
DeviceMemoryAllocator* memory_allocator() const {
return memory_allocator_.get();
@@ -96,19 +96,17 @@ class Backend {

// Returns stream executors of all supported devices for this backend. The
// executors are ordered by the device ordinal.
const std::vector<perftools::gputools::StreamExecutor*>& stream_executors()
const {
const std::vector<se::StreamExecutor*>& stream_executors() const {
return stream_executors_;
}

// Returns the stream executor for the given device ordinal.
StatusOr<perftools::gputools::StreamExecutor*> stream_executor(
int device_ordinal) const;
StatusOr<se::StreamExecutor*> stream_executor(int device_ordinal) const;

// Returns the stream executor for the default device ordinal. This stream
// executor can only be used when the number of computations is 1 (replication
// can be > 1).
perftools::gputools::StreamExecutor* default_stream_executor() const {
se::StreamExecutor* default_stream_executor() const {
CHECK(!stream_executors_.empty());
return stream_executors_[0];
}
@@ -117,8 +115,7 @@ class Backend {
// internal pool, or by constructing/initializating it, and returns the result
// to the caller.
StatusOr<StreamPtr> BorrowStream(int device_ordinal);
StatusOr<StreamPtr> BorrowStream(
perftools::gputools::StreamExecutor* executor);
StatusOr<StreamPtr> BorrowStream(se::StreamExecutor* executor);

// Returns a function to borrow a stream, as `BorrowStream` above does.
// Purely for convenience, the caller could rather make this anonymous
@@ -157,29 +154,26 @@ class Backend {

private:
struct EigenThreadPoolWrapper;
Backend(perftools::gputools::Platform* platform, Compiler* compiler,
tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
stream_executors,
Backend(se::Platform* platform, Compiler* compiler,
tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors,
TransferManager* transfer_manager,
ComputationPlacer* computation_placer,
int intra_op_parallelism_threads);
Backend(const Backend&) = delete;
Backend& operator=(const Backend&) = delete;

perftools::gputools::Platform* platform_;
se::Platform* platform_;
Compiler* compiler_;
TransferManager* transfer_manager_;
ComputationPlacer* computation_placer_;

// Vector of stream executors. stream_executors_[0] is the default executor.
std::vector<perftools::gputools::StreamExecutor*> stream_executors_;
std::vector<se::StreamExecutor*> stream_executors_;

tensorflow::mutex mu_;

// Mapping from stream executor to stream pools, used by `BorrowStream` above.
std::map<perftools::gputools::StreamExecutor*,
Pool<perftools::gputools::Stream>>
stream_pools_ GUARDED_BY(mu_);
std::map<se::StreamExecutor*, Pool<se::Stream>> stream_pools_ GUARDED_BY(mu_);

// The default memory allocator to use.
std::unique_ptr<StreamExecutorMemoryAllocator> memory_allocator_;
@@ -37,7 +37,7 @@ limitations under the License.
namespace xla {

/* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
CompileOnlyService::NewService(perftools::gputools::Platform* platform) {
CompileOnlyService::NewService(se::Platform* platform) {
ServiceOptions default_options;
default_options.set_platform(platform);
return NewService(default_options);
@@ -45,7 +45,7 @@ CompileOnlyService::NewService(perftools::gputools::Platform* platform) {

/* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
CompileOnlyService::NewService(const ServiceOptions& options) {
perftools::gputools::Platform* platform = options.platform();
se::Platform* platform = options.platform();
if (platform == nullptr) {
TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
}
@@ -34,7 +34,7 @@ class CompileOnlyService : public Service {
// platform that the service should target. If platform is null then the
// default platform is used.
static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
perftools::gputools::Platform* platform);
se::Platform* platform);
static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
const ServiceOptions& options);
@@ -23,26 +23,21 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"

namespace se = ::perftools::gputools;

namespace xla {

/* static */ tensorflow::mutex Compiler::platform_compiler_mutex_(
tensorflow::LINKER_INITIALIZED);

/* static */ std::map<perftools::gputools::Platform::Id,
Compiler::CompilerFactory>*
/* static */ std::map<se::Platform::Id, Compiler::CompilerFactory>*
Compiler::GetPlatformCompilerFactories() {
static auto* r =
new std::map<perftools::gputools::Platform::Id, CompilerFactory>;
static auto* r = new std::map<se::Platform::Id, CompilerFactory>;
return r;
}

/* static */
std::map<perftools::gputools::Platform::Id, std::unique_ptr<Compiler>>*
std::map<se::Platform::Id, std::unique_ptr<Compiler>>*
Compiler::GetPlatformCompilers() {
static auto* r = new std::map<perftools::gputools::Platform::Id,
std::unique_ptr<Compiler>>;
static auto* r = new std::map<se::Platform::Id, std::unique_ptr<Compiler>>;
return r;
}
@@ -70,7 +70,7 @@ class AotCompilationOptions {
virtual ~AotCompilationOptions() = default;

// Returns the ID of the platform to which these options apply.
virtual perftools::gputools::Platform::Id PlatformId() const = 0;
virtual se::Platform::Id PlatformId() const = 0;

// Optional allocator that may be used for allocating temp space on the device
// during compilation.
@@ -109,7 +109,7 @@ class Compiler {
virtual ~Compiler() {}

// Returns the ID of the platform that this compiler targets.
virtual perftools::gputools::Platform::Id PlatformId() const = 0;
virtual se::Platform::Id PlatformId() const = 0;

// Runs Hlo passes to optimize the given Hlo module, returns the optimized
// module.
@@ -120,8 +120,7 @@ class Compiler {
// algorithm over those buffers, to see which variant is fastest. Any space
// allocated should be deallocated before this function returns.
virtual StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* executor,
std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
DeviceMemoryAllocator* device_allocator) = 0;

// Compiles the HLO module for execution on a device given by the executor,
@@ -137,8 +136,7 @@ class Compiler {
//
// Use the overload below to compile computations that run in parallel.
virtual StatusOr<std::unique_ptr<Executable>> RunBackend(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* executor,
std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
DeviceMemoryAllocator* device_allocator) = 0;

// Compiles a set of HLO modules that can run in parallel, potentially
@@ -151,8 +149,7 @@ class Compiler {
// modules to RunHloPasses and RunBackends.
virtual StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
std::vector<std::unique_ptr<HloModule>> modules,
std::vector<std::vector<perftools::gputools::StreamExecutor*>>
stream_exec,
std::vector<std::vector<se::StreamExecutor*>> stream_exec,
DeviceMemoryAllocator* device_allocator) = 0;

// Compiles the HLO module for ahead-of-time execution. This is intended for
@@ -171,14 +168,12 @@ class Compiler {
// be a singleton, so no ownership is transferred.
//
// Precondition: a platform kind must not be registered more than once.
static void RegisterCompilerFactory(
perftools::gputools::Platform::Id platform_id,
CompilerFactory compiler_factory);
static void RegisterCompilerFactory(se::Platform::Id platform_id,
CompilerFactory compiler_factory);

// Returns the compiler singleton pointer if it is available for the given
// platform, or an error status if it is not.
static StatusOr<Compiler*> GetForPlatform(
const perftools::gputools::Platform* platform);
static StatusOr<Compiler*> GetForPlatform(const se::Platform* platform);

// Returns a function that computes the size in bytes of the logical
// buffer that contains a shape.
@@ -198,12 +193,12 @@ class Compiler {
static tensorflow::mutex platform_compiler_mutex_;

// Map from platform kind to compiler factory.
static std::map<perftools::gputools::Platform::Id, CompilerFactory>*
static std::map<se::Platform::Id, CompilerFactory>*
GetPlatformCompilerFactories();

// Map from platform kind to compiler instance, if we made one already (based
// on the factories above).
static std::map<perftools::gputools::Platform::Id, std::unique_ptr<Compiler>>*
static std::map<se::Platform::Id, std::unique_ptr<Compiler>>*
GetPlatformCompilers();
};
@@ -32,8 +32,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace se = ::perftools::gputools;

namespace xla {

Status DeviceAssignment::Serialize(DeviceAssignmentProto* proto) const {
@@ -132,11 +130,9 @@ StatusOr<DeviceAssignment> ComputationPlacer::AssignDevices(
ComputationPlacer::platform_computation_placer_mutex_(
tensorflow::LINKER_INITIALIZED);

/* static */ std::map<perftools::gputools::Platform::Id,
ComputationPlacer::State>*
/* static */ std::map<se::Platform::Id, ComputationPlacer::State>*
ComputationPlacer::GetPlatformComputationPlacers() {
static auto* r =
new std::map<perftools::gputools::Platform::Id, ComputationPlacer::State>;
static auto* r = new std::map<se::Platform::Id, ComputationPlacer::State>;
return r;
}

@@ -147,10 +143,10 @@ static std::unique_ptr<xla::ComputationPlacer> CreateComputationPlacer() {
}

static bool InitModule() {
xla::ComputationPlacer::RegisterComputationPlacer(se::host::kHostPlatformId,
&CreateComputationPlacer);
xla::ComputationPlacer::RegisterComputationPlacer(se::cuda::kCudaPlatformId,
&CreateComputationPlacer);
xla::ComputationPlacer::RegisterComputationPlacer(
stream_executor::host::kHostPlatformId, &CreateComputationPlacer);
xla::ComputationPlacer::RegisterComputationPlacer(
stream_executor::cuda::kCudaPlatformId, &CreateComputationPlacer);
return true;
}
static bool module_initialized = InitModule();
@@ -80,13 +80,13 @@ class ComputationPlacer {

// Registers a computation placer creation function for a particular platform.
static void RegisterComputationPlacer(
perftools::gputools::Platform::Id platform_id,
se::Platform::Id platform_id,
ComputationPlacerCreationFunction creation_function);

// Returns the computation placer singleton pointer if it is available for the
// given platform, or an error status if it is not.
static StatusOr<ComputationPlacer*> GetForPlatform(
const perftools::gputools::Platform* platform);
const se::Platform* platform);

private:
// The mutex that guards the platform-to-computation placer map.
@@ -101,10 +101,9 @@ class ComputationPlacer {
};

// Map from platform kind to computation placer singleton.
static std::map<perftools::gputools::Platform::Id, State>*
GetPlatformComputationPlacers();
static std::map<se::Platform::Id, State>* GetPlatformComputationPlacers();

perftools::gputools::Platform::Id platform_id_;
se::Platform::Id platform_id_;

TF_DISALLOW_COPY_AND_ASSIGN(ComputationPlacer);
};
@@ -100,8 +100,6 @@ limitations under the License.
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace cpu {

@@ -440,8 +438,7 @@ Status VerifyLlvmModule(const llvm::Module& llvm_module) {
} // namespace

StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* /*stream_exec*/,
std::unique_ptr<HloModule> module, se::StreamExecutor* /*stream_exec*/,
DeviceMemoryAllocator* /*device_allocator*/) {
VLOG(2) << "Before optimization:";
XLA_VLOG_LINES(2, module->ToString());
@@ -454,8 +451,7 @@ StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
}

StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* stream_exec,
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* /*device_allocator*/) {
const string timer_message =
"Compiling [" + module->name() + "] for CPU using JIT";
@@ -938,9 +934,9 @@ HloCostAnalysis::ShapeSizeFunction CpuCompiler::ShapeSizeBytesFunction() const {
} // namespace xla

static bool InitModule() {
xla::Compiler::RegisterCompilerFactory(se::host::kHostPlatformId, []() {
return xla::MakeUnique<xla::cpu::CpuCompiler>();
});
xla::Compiler::RegisterCompilerFactory(
stream_executor::host::kHostPlatformId,
[]() { return xla::MakeUnique<xla::cpu::CpuCompiler>(); });
return true;
}
static bool module_initialized = InitModule();
@@ -53,7 +53,7 @@ class CpuAotCompilationOptions : public AotCompilationOptions {
RelocationModel relocation_model);
~CpuAotCompilationOptions() override;

perftools::gputools::Platform::Id PlatformId() const override;
se::Platform::Id PlatformId() const override;

// The triple used for compilation, similar to clang's -target flag.
const string& triple() const { return triple_; }
@@ -112,25 +112,23 @@ class CpuCompiler : public LLVMCompiler {
// Bring in
// StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
// std::vector<std::unique_ptr<HloModule>> modules,
// std::vector<std::vector<perftools::gputools::StreamExecutor*>>
// std::vector<std::vector<se::StreamExecutor*>>
// stream_execs)
using LLVMCompiler::Compile;

StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* stream_exec,
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) override;

StatusOr<std::unique_ptr<Executable>> RunBackend(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* stream_exec,
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) override;

StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
const AotCompilationOptions& options) override;

perftools::gputools::Platform::Id PlatformId() const override;
se::Platform::Id PlatformId() const override;

HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override;
@@ -45,8 +45,6 @@ limitations under the License.
#include "tensorflow/core/platform/types.h"
#include "tensorflow/stream_executor/host/host_stream.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace cpu {

@@ -75,7 +73,7 @@ CpuExecutable::CpuExecutable(

Status CpuExecutable::AllocateBuffers(
DeviceMemoryAllocator* memory_allocator, int device_ordinal,
std::vector<perftools::gputools::DeviceMemoryBase>* buffers) {
std::vector<se::DeviceMemoryBase>* buffers) {
CHECK_EQ(buffers->size(), assignment_->Allocations().size());
VLOG(3) << "Allocating " << assignment_->Allocations().size()
<< " allocations for module " << module().name();
@@ -247,8 +245,7 @@ static Status DeallocateTempBuffers(

StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::CreateResultShapedBuffer(
const ServiceExecutableRunOptions* run_options,
tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
allocated_buffers,
tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> allocated_buffers,
std::vector<bool>* buffers_in_result) {
se::Stream* stream = run_options->stream();
auto result_buffer = MakeUnique<ShapedBuffer>(
@@ -322,7 +319,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteAsyncOnStream(
"supported on CPU.");
}

auto* host_stream = dynamic_cast<perftools::gputools::host::HostStream*>(
auto* host_stream = dynamic_cast<se::host::HostStream*>(
run_options->stream()->implementation());
se::Stream* stream = run_options->stream();
DeviceMemoryAllocator* memory_allocator = run_options->allocator();
@@ -90,17 +90,16 @@ class CpuExecutable : public Executable {
// assignment. Each vector element corresponds to a particular Index. If
// a vector element already contains a non-null DeviceMemoryBase, then no
// buffer is assigned for this element.
Status AllocateBuffers(
DeviceMemoryAllocator* memory_allocator, int device_ordinal,
std::vector<perftools::gputools::DeviceMemoryBase>* buffers);
Status AllocateBuffers(DeviceMemoryAllocator* memory_allocator,
int device_ordinal,
std::vector<se::DeviceMemoryBase>* buffers);

// Calls the generated function performing the computation with the given
// arguments using the supplied buffers.
Status ExecuteComputeFunction(
const ExecutableRunOptions* run_options,
tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
buffers,
tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
HloExecutionProfile* hlo_execution_profile);

// Create a ShapedBuffer for holding the result of the computation. The
@@ -111,8 +110,7 @@ class CpuExecutable : public Executable {
// the returned ShapedBuffer).
StatusOr<std::unique_ptr<ShapedBuffer>> CreateResultShapedBuffer(
const ServiceExecutableRunOptions* run_options,
tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
allocated_buffers,
tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> allocated_buffers,
std::vector<bool>* buffers_in_result);

// Returns the points-to set of the root instruction of the entry
@@ -34,8 +34,6 @@ limitations under the License.
#include "tensorflow/core/platform/notification.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace se = ::perftools::gputools;

namespace xla {

namespace {
@@ -241,21 +239,20 @@ Status CpuTransferManager::TransferLiteralFromOutfeed(
}

StatusOr<Shape> CpuTransferManager::TransferTupleBuffersFromOutfeed(
perftools::gputools::StreamExecutor* executor,
se::StreamExecutor* executor,
tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data) {
return TransferBuffersFromOutfeedInternal(executor, buffer_data,
/*is_tuple=*/true);
}

StatusOr<Shape> CpuTransferManager::TransferArrayBufferFromOutfeed(
perftools::gputools::StreamExecutor* executor, void* destination,
int64 size_bytes) {
se::StreamExecutor* executor, void* destination, int64 size_bytes) {
return TransferBuffersFromOutfeedInternal(
executor, {{destination, size_bytes}}, /*is_tuple=*/false);
}

StatusOr<Shape> CpuTransferManager::TransferBuffersFromOutfeedInternal(
perftools::gputools::StreamExecutor* executor,
se::StreamExecutor* executor,
tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data,
bool is_tuple) {
std::vector<std::unique_ptr<CpuOutfeedBuffer>> buffers;
@@ -306,8 +303,8 @@ static std::unique_ptr<xla::TransferManager> CreateCpuTransferManager() {
}

static bool InitModule() {
xla::TransferManager::RegisterTransferManager(se::host::kHostPlatformId,
&CreateCpuTransferManager);
xla::TransferManager::RegisterTransferManager(
stream_executor::host::kHostPlatformId, &CreateCpuTransferManager);
return true;
}
static bool module_initialized = InitModule();
@@ -37,36 +37,35 @@ class CpuTransferManager : public GenericTransferManager {
CpuTransferManager();
~CpuTransferManager() override {}

Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
Status TransferLiteralToInfeed(se::StreamExecutor* executor,
const Literal& literal) override;
Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
int64 size, const void* source) override;
Status TransferLiteralFromOutfeed(
perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
Literal* literal) override;
Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size,
const void* source) override;
Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
const Shape& literal_shape,
Literal* literal) override;

private:
// Transfers infeed data to device. InfeedBuffer->Done() must be
// called to clean up the memory allocated for InfeedBuffer.
StatusOr<cpu::runtime::XfeedBuffer*> TransferBufferToInfeedInternal(
perftools::gputools::StreamExecutor* executor, int64 size,
const void* source);
se::StreamExecutor* executor, int64 size, const void* source);

// Helper that transfers a tuple of element buffers from the device's outfeed.
StatusOr<Shape> TransferTupleBuffersFromOutfeed(
perftools::gputools::StreamExecutor* executor,
se::StreamExecutor* executor,
tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data);

// Helper that transfers an array buffer from the device's outfeed.
StatusOr<Shape> TransferArrayBufferFromOutfeed(
perftools::gputools::StreamExecutor* executor, void* destination,
int64 size_bytes);
StatusOr<Shape> TransferArrayBufferFromOutfeed(se::StreamExecutor* executor,
void* destination,
int64 size_bytes);

// On success, returns the shape that was transferred from the outfeed -- if
// is_tuple is true, the returned shape will be a tuple of the returned shapes
// for the given buffers.
StatusOr<Shape> TransferBuffersFromOutfeedInternal(
perftools::gputools::StreamExecutor* executor,
se::StreamExecutor* executor,
tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data,
bool is_tuple);
@@ -49,8 +49,6 @@ limitations under the License.
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/types.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace cpu {

@@ -325,7 +323,7 @@ const void** Executor::GetOperandBuffers(HloInstruction* instruction) {

Status ParallelCpuExecutable::AllocateBuffers(
DeviceMemoryAllocator* memory_allocator, int device_ordinal,
std::vector<perftools::gputools::DeviceMemoryBase>* buffers) {
std::vector<se::DeviceMemoryBase>* buffers) {
CHECK_EQ(buffers->size(), assignment_->Allocations().size());
VLOG(3) << "Allocating " << assignment_->Allocations().size()
<< " allocations for module " << module().name();
@@ -89,17 +89,16 @@ class ParallelCpuExecutable : public Executable {
// assignment. Each vector element corresponds to a particular Index. If
// a vector element already contains a non-null DeviceMemoryBase, then no
// buffer is assigned for this element.
Status AllocateBuffers(
DeviceMemoryAllocator* memory_allocator, int device_ordinal,
std::vector<perftools::gputools::DeviceMemoryBase>* buffers);
Status AllocateBuffers(DeviceMemoryAllocator* memory_allocator,
int device_ordinal,
std::vector<se::DeviceMemoryBase>* buffers);

// Calls the generated functions in 'function_names_', performing the
// computation with the given arguments using the supplied buffers.
Status ExecuteComputeFunctions(
const ServiceExecutableRunOptions* run_options,
tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
buffers,
tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
HloExecutionProfile* hlo_execution_profile);

// Returns the points-to set of the root instruction of the entry
@@ -24,19 +24,16 @@ limitations under the License.
namespace xla {

StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator(
const perftools::gputools::Platform* platform,
tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
stream_executors)
const se::Platform* platform,
tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors)
: DeviceMemoryAllocator(platform),
stream_executors_(stream_executors.begin(), stream_executors.end()) {}

StatusOr<perftools::gputools::DeviceMemoryBase>
StreamExecutorMemoryAllocator::Allocate(int device_ordinal, uint64 size,
bool retry_on_failure) {
TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * stream_executor,
StatusOr<se::DeviceMemoryBase> StreamExecutorMemoryAllocator::Allocate(
int device_ordinal, uint64 size, bool retry_on_failure) {
TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
GetStreamExecutor(device_ordinal));
perftools::gputools::DeviceMemoryBase result =
stream_executor->AllocateArray<uint8>(size);
se::DeviceMemoryBase result = stream_executor->AllocateArray<uint8>(size);
if (size > 0 && result == nullptr) {
return ResourceExhausted(
"Failed to allocate request for %s (%lluB) on device ordinal %d",
@@ -47,22 +44,22 @@ StreamExecutorMemoryAllocator::Allocate(int device_ordinal, uint64 size,
}

tensorflow::Status StreamExecutorMemoryAllocator::Deallocate(
int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) {
int device_ordinal, se::DeviceMemoryBase* mem) {
if (!mem->is_null()) {
TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * stream_executor,
TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
GetStreamExecutor(device_ordinal));
// We make a local copy of 'mem' so the original is not zeroed out by the
// Deallocate() call below. This gives us a better chance of
// catching double-free bugs, since Deallocate silently succeeds for null
// values.
perftools::gputools::DeviceMemoryBase mem_copy(*mem);
se::DeviceMemoryBase mem_copy(*mem);
stream_executor->Deallocate(&mem_copy);
}
return tensorflow::Status::OK();
}

StatusOr<perftools::gputools::StreamExecutor*>
StreamExecutorMemoryAllocator::GetStreamExecutor(int device_ordinal) {
StatusOr<se::StreamExecutor*> StreamExecutorMemoryAllocator::GetStreamExecutor(
int device_ordinal) {
if (device_ordinal < 0) {
return InvalidArgument("device ordinal value (%d) must be non-negative",
device_ordinal);
@@ -33,7 +33,7 @@ class DeviceMemoryAllocator {
public:
// Parameter platform indicates which platform the allocator allocates memory
// on. Must be non-null.
explicit DeviceMemoryAllocator(const perftools::gputools::Platform* platform)
explicit DeviceMemoryAllocator(const se::Platform* platform)
: platform_(platform) {}
virtual ~DeviceMemoryAllocator() {}

@@ -43,20 +43,20 @@ class DeviceMemoryAllocator {
// has only performance impact.
// Allocate() should return a null pointer for a size-0 allocation.
// Deallocate() must be a no-op for null pointers.
virtual StatusOr<perftools::gputools::DeviceMemoryBase> Allocate(
virtual StatusOr<se::DeviceMemoryBase> Allocate(
int device_ordinal, uint64 size, bool retry_on_failure = true) = 0;
virtual tensorflow::Status Deallocate(
int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) = 0;
virtual tensorflow::Status Deallocate(int device_ordinal,
se::DeviceMemoryBase* mem) = 0;

// Return the platform that the allocator allocates memory on.
const perftools::gputools::Platform* platform() const { return platform_; }
const se::Platform* platform() const { return platform_; }

// Can we call Deallocate() as soon as a computation has been scheduled on
// a stream, or do we have to wait for the computation to complete first?
virtual bool AllowsAsynchronousDeallocation() const = 0;

protected:
const perftools::gputools::Platform* platform_;
const se::Platform* platform_;
};

// Default memory allocator for a platform which uses
@@ -64,25 +64,23 @@ class DeviceMemoryAllocator {
class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator {
public:
StreamExecutorMemoryAllocator(
const perftools::gputools::Platform* platform,
tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
stream_executors);
const se::Platform* platform,
tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors);

StatusOr<perftools::gputools::DeviceMemoryBase> Allocate(
StatusOr<se::DeviceMemoryBase> Allocate(
int device_ordinal, uint64 size, bool retry_on_failure = true) override;
tensorflow::Status Deallocate(
int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) override;
tensorflow::Status Deallocate(int device_ordinal,
se::DeviceMemoryBase* mem) override;

bool AllowsAsynchronousDeallocation() const override;

private:
StatusOr<perftools::gputools::StreamExecutor*> GetStreamExecutor(
int device_ordinal);
StatusOr<se::StreamExecutor*> GetStreamExecutor(int device_ordinal);

// A vector indexed by device ordinal of StreamExecutors for each device of
// the allocator's platform type. If an element is nullptr, then the device
// with the respective device ordinal is not supported by XLA.
std::vector<perftools::gputools::StreamExecutor*> stream_executors_;
std::vector<se::StreamExecutor*> stream_executors_;
};

} // namespace xla
@@ -61,10 +61,10 @@ Executable::ExecuteOnStreams(
StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper(
const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile,
ArraySlice<const ShapedBuffer*> arguments) {
perftools::gputools::Stream* stream = run_options->stream();
std::unique_ptr<perftools::gputools::Timer> timer;
se::Stream* stream = run_options->stream();
std::unique_ptr<se::Timer> timer;
if (profile != nullptr) {
timer.reset(new perftools::gputools::Timer(stream->parent()));
timer.reset(new se::Timer(stream->parent()));
stream->InitTimer(timer.get()).ThenStartTimer(timer.get());
}

@@ -90,7 +90,7 @@ class Executable {
// has completed.
virtual Status PopulateExecutionProfile(
HloExecutionProfile* hlo_execution_profile,
perftools::gputools::StreamExecutor* executor) {
se::StreamExecutor* executor) {
return Status::OK();
}
@@ -32,8 +32,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace se = ::perftools::gputools;

namespace xla {

GenericTransferManager::GenericTransferManager(se::Platform::Id platform_id,
@@ -45,9 +43,9 @@ se::Platform::Id GenericTransferManager::PlatformId() const {
}

Status GenericTransferManager::WriteSingleTupleIndexTable(
perftools::gputools::StreamExecutor* executor,
se::StreamExecutor* executor,
tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> elements,
const Shape& shape, perftools::gputools::DeviceMemoryBase* region) {
const Shape& shape, se::DeviceMemoryBase* region) {
TF_RET_CHECK(elements.size() == ShapeUtil::TupleElementCount(shape));

std::vector<const void*> element_pointers;
@@ -144,20 +142,19 @@ Status GenericTransferManager::TransferLiteralToInfeed(
}

Status GenericTransferManager::TransferBufferToInfeed(
perftools::gputools::StreamExecutor* executor, int64 size,
const void* source) {
se::StreamExecutor* executor, int64 size, const void* source) {
return Unimplemented("Generic transfer to Infeed");
}

Status GenericTransferManager::TransferLiteralFromOutfeed(
perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
se::StreamExecutor* executor, const Shape& literal_shape,
Literal* literal) {
return Unimplemented(
"Outfeed is not supported on this platform (b/30467474)");
}

Status GenericTransferManager::ResetDevices(
tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
tensorflow::gtl::ArraySlice<se::StreamExecutor*>
/*executors*/) {
return Unimplemented(
"Device reset is not yet supported on this platform (b/30481585)");
@@ -36,46 +36,41 @@ namespace xla {
// infeed.
class GenericTransferManager : public TransferManager {
public:
GenericTransferManager(perftools::gputools::Platform::Id platform_id,
size_t pointer_size);
GenericTransferManager(se::Platform::Id platform_id, size_t pointer_size);
~GenericTransferManager() override {}

perftools::gputools::Platform::Id PlatformId() const override;
se::Platform::Id PlatformId() const override;

StatusOr<std::unique_ptr<Literal>> TransferLiteralFromDevice(
perftools::gputools::StreamExecutor* executor,
const ShapedBuffer& device_buffer) override;
se::StreamExecutor* executor, const ShapedBuffer& device_buffer) override;

Status TransferLiteralToDevice(perftools::gputools::StreamExecutor* executor,
Status TransferLiteralToDevice(se::StreamExecutor* executor,
const Literal& literal,
const ShapedBuffer& device_buffer) override;

Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
Status TransferLiteralToInfeed(se::StreamExecutor* executor,
const Literal& literal) override;
Status TransferLiteralFromOutfeed(
perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
Literal* literal) override;
Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
const Shape& literal_shape,
Literal* literal) override;

Status ResetDevices(
tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
executors) override;
tensorflow::gtl::ArraySlice<se::StreamExecutor*> executors) override;

int64 GetByteSizeRequirement(const Shape& shape) const override;

protected:
Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
int64 size, const void* source) override;
Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size,
const void* source) override;

Status WriteSingleTupleIndexTable(
perftools::gputools::StreamExecutor* executor,
tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
elements,
const Shape& shape,
perftools::gputools::DeviceMemoryBase* region) override;
se::StreamExecutor* executor,
tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> elements,
const Shape& shape, se::DeviceMemoryBase* region) override;

private:
// The platform this transfer manager targets.
const perftools::gputools::Platform::Id platform_id_;
const se::Platform::Id platform_id_;

// The size in bytes of pointers on this platform.
const size_t pointer_size_;
@@ -28,8 +28,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {

@@ -41,7 +41,7 @@ class BufferAllocations {
// user-specified result buffers) to the given buffer index. The builder
// will skip allocating buffers for registered buffer indices.
void RegisterBuffer(BufferAllocation::Index index,
perftools::gputools::DeviceMemoryBase address);
se::DeviceMemoryBase address);

// Builds a BufferAllocations object from the given buffer assignment.
// `memory_allocator` is what this function uses to allocate device memory.
@@ -52,8 +52,7 @@ class BufferAllocations {
DeviceMemoryAllocator* memory_allocator);

private:
std::map<BufferAllocation::Index, perftools::gputools::DeviceMemoryBase>
registered_buffers_;
std::map<BufferAllocation::Index, se::DeviceMemoryBase> registered_buffers_;
};

BufferAllocations(const BufferAllocations&) = delete;
@@ -65,22 +64,20 @@ class BufferAllocations {
// Returns the device address of buffer `buffer_index`. `buffer_index` must be
// a valid index, i.e., in [0, buffer_count). This function returns null if
// `buffer_index` is not assigned to a buffer address.
perftools::gputools::DeviceMemoryBase GetDeviceAddress(
se::DeviceMemoryBase GetDeviceAddress(
BufferAllocation::Index buffer_index) const;

// Same as above, but also adjusts the returned address for the offset and
// size contained in the given slice.
perftools::gputools::DeviceMemoryBase GetDeviceAddress(
se::DeviceMemoryBase GetDeviceAddress(
const BufferAllocation::Slice& buffer_slice) const;

perftools::gputools::DeviceMemoryBase GetTempBufferBase() const {
return temp_buffer_base_;
}
se::DeviceMemoryBase GetTempBufferBase() const { return temp_buffer_base_; }

// Tears down all buffers allocated by this object that are not in
// `live_addresses`.
tensorflow::Status TearDown(
const std::set<perftools::gputools::DeviceMemoryBase>& live_addresses,
const std::set<se::DeviceMemoryBase>& live_addresses,
const BufferAssignment& buffer_assignment);

private:
@@ -92,15 +89,15 @@ class BufferAllocations {

// Sets the device address of buffer `buffer_index`.
void SetBuffer(BufferAllocation::Index buffer_index,
perftools::gputools::DeviceMemoryBase buffer);
se::DeviceMemoryBase buffer);

// An array of device pointers that stores the address of each buffer
// indexed by Index. Each element can point to a temporary buffer, an
// input buffer, or nullptr if no buffer is needed for that Index.
std::vector<perftools::gputools::DeviceMemoryBase> buffers_;
std::vector<se::DeviceMemoryBase> buffers_;

// The base address of the memory block that contains all temporary buffers.
perftools::gputools::DeviceMemoryBase temp_buffer_base_;
se::DeviceMemoryBase temp_buffer_base_;

int device_ordinal_;
@ -42,11 +42,10 @@ Status ConditionalThunk::Initialize(const GpuExecutable& executable) {
|
||||
}
|
||||
|
||||
Status ConditionalThunk::ExecuteOnStream(
|
||||
const BufferAllocations& buffer_allocations,
|
||||
perftools::gputools::Stream* stream) {
|
||||
const BufferAllocations& buffer_allocations, se::Stream* stream) {
|
||||
// Copy the predicate value from device.
|
||||
bool predicate;
|
||||
perftools::gputools::DeviceMemoryBase predicate_address =
|
||||
se::DeviceMemoryBase predicate_address =
|
||||
buffer_allocations.GetDeviceAddress(predicate_buffer_index_);
|
||||
stream->ThenMemcpy(&predicate, predicate_address, sizeof(bool));
|
||||
|
||||
|
@ -49,7 +49,7 @@ class ConditionalThunk : public Thunk {
|
||||
|
||||
Status Initialize(const GpuExecutable& executable) override;
|
||||
Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
|
||||
perftools::gputools::Stream* stream) override;
|
||||
se::Stream* stream) override;
|
||||
|
||||
private:
|
||||
BufferAllocation::Slice predicate_buffer_index_;
@ -25,8 +25,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {

@ -66,23 +66,21 @@ class ConvolutionThunk : public Thunk {

// Does the convolution for the thunk on "stream".
Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;

private:
class ScratchAllocator;

Status Convolve(
const perftools::gputools::dnn::BatchDescriptor& input_descriptor,
perftools::gputools::DeviceMemory<float> input_data,
const perftools::gputools::dnn::FilterDescriptor& filter_descriptor,
perftools::gputools::DeviceMemory<float> filter_data,
const perftools::gputools::dnn::BatchDescriptor& output_descriptor,
perftools::gputools::DeviceMemory<float> output_data,
const perftools::gputools::dnn::ConvolutionDescriptor&
convolution_descriptor,
const perftools::gputools::dnn::AlgorithmConfig& algorithm_config,
perftools::gputools::Stream* stream, ScratchAllocator* scratch_allocator,
perftools::gputools::dnn::ProfileResult* profile_result);
Status Convolve(const se::dnn::BatchDescriptor& input_descriptor,
se::DeviceMemory<float> input_data,
const se::dnn::FilterDescriptor& filter_descriptor,
se::DeviceMemory<float> filter_data,
const se::dnn::BatchDescriptor& output_descriptor,
se::DeviceMemory<float> output_data,
const se::dnn::ConvolutionDescriptor& convolution_descriptor,
const se::dnn::AlgorithmConfig& algorithm_config,
se::Stream* stream, ScratchAllocator* scratch_allocator,
se::dnn::ProfileResult* profile_result);

const CudnnConvKind convolution_kind_;

@ -30,9 +30,8 @@ HostToDeviceCopyThunk::HostToDeviceCopyThunk(
mem_size_(mem_size) {}

tensorflow::Status HostToDeviceCopyThunk::ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) {
perftools::gputools::DeviceMemoryBase destination_data =
const BufferAllocations& buffer_allocations, se::Stream* stream) {
se::DeviceMemoryBase destination_data =
buffer_allocations.GetDeviceAddress(destination_buffer_);
stream->ThenMemcpy(&destination_data, source_address_, mem_size_);
return tensorflow::Status::OK();

@ -48,11 +47,10 @@ DeviceToDeviceCopyThunk::DeviceToDeviceCopyThunk(
mem_size_(mem_size) {}

tensorflow::Status DeviceToDeviceCopyThunk::ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) {
perftools::gputools::DeviceMemoryBase destination_data =
const BufferAllocations& buffer_allocations, se::Stream* stream) {
se::DeviceMemoryBase destination_data =
buffer_allocations.GetDeviceAddress(destination_buffer_);
perftools::gputools::DeviceMemoryBase source_data =
se::DeviceMemoryBase source_data =
buffer_allocations.GetDeviceAddress(source_buffer_);
stream->ThenMemcpy(&destination_data, source_data, mem_size_);
return tensorflow::Status::OK();

@ -40,8 +40,7 @@ class HostToDeviceCopyThunk : public Thunk {
HostToDeviceCopyThunk& operator=(const HostToDeviceCopyThunk&) = delete;

tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;

private:
const void* source_address_;

@ -64,8 +63,7 @@ class DeviceToDeviceCopyThunk : public Thunk {
DeviceToDeviceCopyThunk& operator=(const DeviceToDeviceCopyThunk&) = delete;

tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;

private:
const BufferAllocation::Slice source_buffer_;
@ -28,7 +28,6 @@ limitations under the License.
namespace xla {
namespace gpu {

namespace se = ::perftools::gputools;
namespace dnn = se::dnn;

static std::pair<dnn::BatchDescriptor /*input_desc*/,

@ -60,7 +60,7 @@ class CudnnBatchNormForwardInferenceThunk : public Thunk {
const CudnnBatchNormForwardInferenceThunk&) = delete;

Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;

private:
BufferAllocation::Slice operand_;

@ -90,7 +90,7 @@ class CudnnBatchNormForwardTrainingThunk : public Thunk {
const CudnnBatchNormForwardTrainingThunk&) = delete;

Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;

private:
BufferAllocation::Slice operand_;

@ -123,7 +123,7 @@ class CudnnBatchNormBackwardThunk : public Thunk {
delete;

Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;

private:
BufferAllocation::Slice operand_;

@ -24,8 +24,6 @@ namespace xla {
namespace gpu {
namespace {

namespace se = perftools::gputools;

using se::DeviceMemoryBase;
using se::dnn::AlgorithmConfig;
using se::dnn::AlgorithmDesc;

@ -33,9 +33,8 @@ class CudnnConvolutionAlgorithmPicker : public HloPassInterface {
// If the `allocator` parameter is not null, we will use it to allocate temp
// memory while timing the various convolution algorithms. If it's null,
// we'll use the default allocator on the StreamExecutor.
CudnnConvolutionAlgorithmPicker(
perftools::gputools::StreamExecutor* stream_exec,
DeviceMemoryAllocator* allocator)
CudnnConvolutionAlgorithmPicker(se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* allocator)
: stream_exec_(stream_exec), allocator_(allocator) {}

tensorflow::StringPiece name() const override {

@ -52,7 +51,7 @@ class CudnnConvolutionAlgorithmPicker : public HloPassInterface {
const Shape& output_shape, const Window& window,
const ConvolutionDimensionNumbers& dnums, HloInstruction* instr);

perftools::gputools::StreamExecutor* stream_exec_;  // never null
se::StreamExecutor* stream_exec_;  // never null
DeviceMemoryAllocator* allocator_;  // may be null
};

@ -22,8 +22,6 @@ namespace xla {
namespace gpu {
namespace {

namespace se = ::perftools::gputools;

using se::DeviceMemory;
using se::DeviceMemoryBase;
using se::Stream;

@ -215,14 +213,12 @@ string CudnnConvKindToString(CudnnConvKind kind) {

Status RunCudnnConvolution(
CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
perftools::gputools::DeviceMemoryBase filter_buf,
perftools::gputools::DeviceMemoryBase output_buf,
perftools::gputools::DeviceMemoryBase scratch_buf, const Window& window,
const Shape& output_shape, se::DeviceMemoryBase input_buf,
se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
se::DeviceMemoryBase scratch_buf, const Window& window,
const ConvolutionDimensionNumbers& dnums,
perftools::gputools::dnn::AlgorithmConfig algorithm,
perftools::gputools::Stream* stream,
perftools::gputools::dnn::ProfileResult* profile_result) {
se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
se::dnn::ProfileResult* profile_result) {
ScratchBufAllocator scratch_allocator(scratch_buf);
return RunCudnnConvolution(kind, input_shape, filter_shape, output_shape,
input_buf, filter_buf, output_buf,

@ -232,14 +228,12 @@ Status RunCudnnConvolution(

Status RunCudnnConvolution(
CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
perftools::gputools::DeviceMemoryBase filter_buf,
perftools::gputools::DeviceMemoryBase output_buf,
perftools::gputools::ScratchAllocator* scratch_allocator,
const Window& window, const ConvolutionDimensionNumbers& dnums,
perftools::gputools::dnn::AlgorithmConfig algorithm,
perftools::gputools::Stream* stream,
perftools::gputools::dnn::ProfileResult* profile_result) {
const Shape& output_shape, se::DeviceMemoryBase input_buf,
se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
se::ScratchAllocator* scratch_allocator, const Window& window,
const ConvolutionDimensionNumbers& dnums,
se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
se::dnn::ProfileResult* profile_result) {
PrimitiveType output_primitive_type = output_shape.element_type();
CHECK(output_primitive_type == F32 || output_primitive_type == F16)
<< ShapeUtil::HumanString(output_shape);
@ -72,25 +72,21 @@ string CudnnConvKindToString(CudnnConvKind kind);
// that size, if you like.
Status RunCudnnConvolution(
CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
perftools::gputools::DeviceMemoryBase filter_buf,
perftools::gputools::DeviceMemoryBase output_buf,
perftools::gputools::DeviceMemoryBase scratch_buf, const Window& window,
const Shape& output_shape, se::DeviceMemoryBase input_buf,
se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
se::DeviceMemoryBase scratch_buf, const Window& window,
const ConvolutionDimensionNumbers& dnums,
perftools::gputools::dnn::AlgorithmConfig algorithm,
perftools::gputools::Stream* stream,
perftools::gputools::dnn::ProfileResult* profile_result = nullptr);
se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
se::dnn::ProfileResult* profile_result = nullptr);

Status RunCudnnConvolution(
CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
perftools::gputools::DeviceMemoryBase filter_buf,
perftools::gputools::DeviceMemoryBase output_buf,
perftools::gputools::ScratchAllocator* scratch_allocator,
const Window& window, const ConvolutionDimensionNumbers& dnums,
perftools::gputools::dnn::AlgorithmConfig algorithm,
perftools::gputools::Stream* stream,
perftools::gputools::dnn::ProfileResult* profile_result = nullptr);
const Shape& output_shape, se::DeviceMemoryBase input_buf,
se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
se::ScratchAllocator* scratch_allocator, const Window& window,
const ConvolutionDimensionNumbers& dnums,
se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
se::dnn::ProfileResult* profile_result = nullptr);

} // namespace gpu
} // namespace xla

@ -24,8 +24,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {

@ -34,24 +34,24 @@ namespace gpu {
// released on destruction.
//
// Not thread-safe in that AllocateBytes, destructor are not locked.
class FftScratchAllocator : public perftools::gputools::ScratchAllocator {
class FftScratchAllocator : public se::ScratchAllocator {
public:
FftScratchAllocator(int device_ordinal,
DeviceMemoryAllocator* memory_allocator);

~FftScratchAllocator() override;

int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override;
int64 GetMemoryLimitInBytes(se::Stream* stream) override;

int64 TotalAllocatedBytes() { return total_allocated_bytes_; }

perftools::gputools::port::StatusOr<perftools::gputools::DeviceMemory<uint8>>
AllocateBytes(perftools::gputools::Stream* stream, int64 byte_size) override;
se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
se::Stream* stream, int64 byte_size) override;

private:
const int device_ordinal_;
DeviceMemoryAllocator* memory_allocator_;
std::vector<perftools::gputools::DeviceMemoryBase> allocated_buffers_;
std::vector<se::DeviceMemoryBase> allocated_buffers_;
int64 total_allocated_bytes_ = 0;
};

@ -74,16 +74,15 @@ class FftThunk : public Thunk {

// Does the FFT for the thunk on "stream".
tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;

private:
const perftools::gputools::fft::Type fft_type_;
const se::fft::Type fft_type_;
const std::vector<int64> fft_length_;

float scale_factor_;

std::unique_ptr<perftools::gputools::fft::Plan> fft_plan_;
std::unique_ptr<se::fft::Plan> fft_plan_;

const BufferAllocation::Slice input_buffer_;
const BufferAllocation::Slice output_buffer_;
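The hunks above also rewrite nested qualifiers such as perftools::gputools::dnn::..., ...::fft::..., and ...::port::StatusOr to se::dnn::..., se::fft::..., and se::port::.... A namespace alias covers nested namespaces automatically, which is what makes that rewrite purely mechanical. A small self-contained illustration with made-up stand-in types:

// Made-up stand-ins for nested StreamExecutor namespaces.
namespace stream_executor {
namespace dnn {
struct AlgorithmConfig {};
}  // namespace dnn
namespace fft {
enum class Type { kC2CForward };
}  // namespace fft
}  // namespace stream_executor

namespace xla {
namespace se = ::stream_executor;

// Nested namespaces resolve through the alias, so se::dnn:: and se::fft::
// behave exactly like the fully qualified spellings.
se::dnn::AlgorithmConfig config;
se::fft::Type type = se::fft::Type::kC2CForward;
}  // namespace xla

int main() { return 0; }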
@ -36,8 +36,7 @@ tensorflow::Status ForThunk::Initialize(const GpuExecutable& executable) {
}

tensorflow::Status ForThunk::ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) {
const BufferAllocations& buffer_allocations, se::Stream* stream) {
for (int64 i = 0; i < loop_limit_; ++i) {
// Invoke loop body thunk sequence.
TF_RETURN_IF_ERROR(

@ -38,8 +38,7 @@ class ForThunk : public Thunk {

tensorflow::Status Initialize(const GpuExecutable& executable) override;
tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;

private:
const int64 loop_limit_;

@ -22,8 +22,6 @@ limitations under the License.
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/types.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {

@ -50,14 +50,12 @@ class GemmThunk : public Thunk {

// Does the gemm operation for the thunk on "stream", which must be non-null.
tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;

// Returns true if we'll perform autotuning if run on the given stream. If
// so, we want the GPU to be quiescent during autotuning, so as not to
// introduce noise in our results.
bool ShouldHaltAllActivityBeforeRunning(
perftools::gputools::Stream* stream) override {
bool ShouldHaltAllActivityBeforeRunning(se::Stream* stream) override {
return autotune_results_.count(
stream->parent()->GetDeviceDescription().name()) != 0;
}

@ -79,8 +77,7 @@ class GemmThunk : public Thunk {
// results. The map's value is the best algorithm we've found for this thunk
// on this device, or an error if none of the algorithms worked and we should
// use the regular gemm without an algorithm.
std::unordered_map<string,
StatusOr<::perftools::gputools::blas::AlgorithmType>>
std::unordered_map<string, StatusOr<se::blas::AlgorithmType>>
autotune_results_;
};

@ -91,8 +91,6 @@ limitations under the License.
#include "tensorflow/core/platform/tracing.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {

@ -779,9 +777,9 @@ se::Platform::Id GpuCompiler::PlatformId() const {
} // namespace xla

static bool InitModule() {
xla::Compiler::RegisterCompilerFactory(se::cuda::kCudaPlatformId, []() {
return xla::MakeUnique<xla::gpu::GpuCompiler>();
});
xla::Compiler::RegisterCompilerFactory(
stream_executor::cuda::kCudaPlatformId,
[]() { return xla::MakeUnique<xla::gpu::GpuCompiler>(); });
return true;
}
static bool module_initialized = InitModule();
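Note how the InitModule() registrations above switch from se::cuda::kCudaPlatformId to the fully qualified stream_executor::cuda::kCudaPlatformId: these functions sit at file scope, outside namespace xla, so once the file-local alias is gone the short spelling is not visible there. A small self-contained sketch of that visibility rule (identifiers are illustrative, not the real TensorFlow constants):

// Stand-in for the platform-id constant.
namespace stream_executor {
namespace cuda {
constexpr int kCudaPlatformId = 42;
}  // namespace cuda
}  // namespace stream_executor

namespace xla {
namespace se = ::stream_executor;

// Inside namespace xla the alias is in scope, so the short form compiles.
constexpr int FromInsideXla() { return se::cuda::kCudaPlatformId; }
}  // namespace xla

// At file scope the alias is not visible; the full name must be spelled out,
// just as the updated InitModule() bodies do.
static int FromFileScope() { return stream_executor::cuda::kCudaPlatformId; }

int main() { return xla::FromInsideXla() == FromFileScope() ? 0 : 1; }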
@ -45,25 +45,23 @@ class GpuCompiler : public LLVMCompiler {
// Bring in
// StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
// std::vector<std::unique_ptr<HloModule>> modules,
// std::vector<std::vector<perftools::gputools::StreamExecutor*>>
// std::vector<std::vector<se::StreamExecutor*>>
// stream_execs)
using LLVMCompiler::Compile;

StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* stream_exec,
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) override;

StatusOr<std::unique_ptr<Executable>> RunBackend(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* stream_exec,
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) override;

StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> module,
AotCompilationOptions const& options) override;

perftools::gputools::Platform::Id PlatformId() const override;
se::Platform::Id PlatformId() const override;

HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override {
// Capture just the pointer size, not the entire GpuCompiler object.

@ -34,8 +34,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {
namespace {

@ -324,7 +322,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
this->assignment_->GetUniqueSlice(src_hlo, sources[0]->index()));
CHECK(!slice.allocation()->is_entry_computation_parameter());

perftools::gputools::DeviceMemoryBase src_base =
se::DeviceMemoryBase src_base =
buffer_allocations->GetDeviceAddress(slice.index());
CHECK(!src_base.is_null() || src_base.size() == 0);
*device_memory = src_base;

@ -33,8 +33,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace se = ::perftools::gputools;

namespace xla {

// TODO(b/30467474) Once GPU infeed implementation settles, consider

@ -153,8 +151,8 @@ static std::unique_ptr<xla::TransferManager> CreateGpuTransferManager() {
}

static bool InitModule() {
xla::TransferManager::RegisterTransferManager(se::cuda::kCudaPlatformId,
&CreateGpuTransferManager);
xla::TransferManager::RegisterTransferManager(
stream_executor::cuda::kCudaPlatformId, &CreateGpuTransferManager);
return true;
}
static bool module_initialized = InitModule();

@ -36,21 +36,20 @@ class GpuTransferManager : public GenericTransferManager {
GpuTransferManager();
~GpuTransferManager() override {}

Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
Status TransferLiteralToInfeed(se::StreamExecutor* executor,
const Literal& literal) override;
Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
int64 size, const void* source) override;
Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size,
const void* source) override;

private:
// Initiates the infeed data transfers. InfeedBuffer->Done() must be
// called to clean up the memory allocated for InfeedBuffer.
StatusOr<gpu::InfeedBuffer*> TransferBufferToInfeedInternal(
perftools::gputools::StreamExecutor* executor, int64 size,
const void* source);
se::StreamExecutor* executor, int64 size, const void* source);

// Enqueues infeed data buffers with the infeed manager after their
// transfer completes.
Status EnqueueBuffersToInfeed(perftools::gputools::StreamExecutor* executor,
Status EnqueueBuffersToInfeed(se::StreamExecutor* executor,
std::vector<gpu::InfeedBuffer*> buffers);

TF_DISALLOW_COPY_AND_ASSIGN(GpuTransferManager);
@ -19,8 +19,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/core/platform/logging.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {

@ -46,7 +46,7 @@ namespace gpu {
// the client. The client manages the memory of the buffer.
class InfeedBuffer {
public:
InfeedBuffer(perftools::gputools::StreamExecutor* executor, int64 length)
InfeedBuffer(se::StreamExecutor* executor, int64 length)
: executor_(executor), length_(length) {
device_memory_ = executor_->AllocateArray<uint8>(length);
CHECK(!device_memory_.is_null());

@ -60,14 +60,12 @@ class InfeedBuffer {
// client to manage memory for the infeed buffers.
void Done() { delete this; }

perftools::gputools::DeviceMemoryBase* device_memory() {
return &device_memory_;
}
se::DeviceMemoryBase* device_memory() { return &device_memory_; }

private:
perftools::gputools::StreamExecutor* executor_; // Not owned.
se::StreamExecutor* executor_; // Not owned.
const int64 length_;
perftools::gputools::DeviceMemoryBase device_memory_;
se::DeviceMemoryBase device_memory_;
};

// Client-side class used to enqueue infeed buffers.

@ -100,8 +98,7 @@ class InfeedManager {
// new stream on the first invocation. On subsequent invocations, if
// the cached executor is not the same as the requested executor,
// returns null.
perftools::gputools::Stream* GetStream(
perftools::gputools::StreamExecutor* executor);
se::Stream* GetStream(se::StreamExecutor* executor);

private:
// TODO(b/30467474): Revisit if this mutex becomes a point of

@ -121,10 +118,10 @@ class InfeedManager {
tensorflow::gtl::FlatSet<const InfeedBuffer*> dequeued_buffer_;

// Cached host to device stream for queuing infeed data.
std::unique_ptr<perftools::gputools::Stream> host_to_device_stream_;
std::unique_ptr<se::Stream> host_to_device_stream_;

// Executor that the host_to_device_stream belongs to. Not owned.
perftools::gputools::StreamExecutor* host_to_device_executor_;
se::StreamExecutor* host_to_device_executor_;
};

// Singleton creator-or-accessor: Returns the GPU infeed manager.

@ -31,10 +31,10 @@ InfeedThunk::InfeedThunk(
destination_buffer_(destination_buffer) {}

Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) {
se::Stream* stream) {
VLOG(2) << "Infeeding to GPU ";

perftools::gputools::DeviceMemoryBase destination_address =
se::DeviceMemoryBase destination_address =
buffer_allocations.GetDeviceAddress(destination_buffer_);

InfeedManager* infeed_manager = GetOrCreateInfeedManager();

@ -45,7 +45,7 @@ Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
std::vector<void*> tuple_element_addresses;
for (BufferAllocation::Slice tuple_element_buffer :
tuple_element_buffers_) {
perftools::gputools::DeviceMemoryBase tuple_element_address =
se::DeviceMemoryBase tuple_element_address =
buffer_allocations.GetDeviceAddress(tuple_element_buffer);

InfeedBuffer* buffer = infeed_manager->BlockingDequeueBuffer();

@ -44,7 +44,7 @@ class InfeedThunk : public Thunk {
InfeedThunk& operator=(const InfeedThunk&) = delete;

Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;

private:
const std::vector<BufferAllocation::Slice> tuple_element_buffers_;
@ -32,7 +32,7 @@ class IrEmitterContext {
public:
IrEmitterContext(const HloModule* hlo_module,
const BufferAssignment* buffer_assignment,
const perftools::gputools::DeviceDescription* device_desc,
const se::DeviceDescription* device_desc,
llvm::Module* llvm_module)
: hlo_module_(hlo_module),
buffer_assignment_(buffer_assignment),

@ -47,7 +47,7 @@ class IrEmitterContext {
const BufferAssignment& buffer_assignment() const {
return *buffer_assignment_;
}
const perftools::gputools::DeviceDescription& device_description() const {
const se::DeviceDescription& device_description() const {
return *device_desc_;
}
llvm::Module* llvm_module() { return llvm_module_; }

@ -56,7 +56,7 @@ class IrEmitterContext {
private:
const HloModule* hlo_module_;
const BufferAssignment* buffer_assignment_;
const perftools::gputools::DeviceDescription* device_desc_;
const se::DeviceDescription* device_desc_;
llvm::Module* llvm_module_;
NameUniquer name_uniquer_;
};

@ -23,8 +23,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {

@ -61,8 +61,7 @@ class KernelThunk : public Thunk {

// Executes the kernel for the thunk on "stream", which must be non-null.
tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;

private:
// Buffers passed to the kernel as arguments.

@ -82,13 +81,11 @@ class KernelThunk : public Thunk {
// Describes how to load this kernel. ExecuteOnStream reuses this loader
// specification for all executions.
mutable tensorflow::mutex mutex_;
std::unique_ptr<perftools::gputools::MultiKernelLoaderSpec> loader_spec_
GUARDED_BY(mutex_);
std::unique_ptr<se::MultiKernelLoaderSpec> loader_spec_ GUARDED_BY(mutex_);

// Loaded kernels for each `StreamExecutor`
std::unordered_map<perftools::gputools::StreamExecutor*,
perftools::gputools::KernelBase>
kernel_cache_ GUARDED_BY(mutex_);
std::unordered_map<se::StreamExecutor*, se::KernelBase> kernel_cache_
GUARDED_BY(mutex_);
};

} // namespace gpu

@ -19,8 +19,6 @@ limitations under the License.
namespace xla {
namespace gpu {

namespace se = ::perftools::gputools;

Status MemzeroThunk::ExecuteOnStream(
const BufferAllocations& buffer_allocations, se::Stream* stream) {
se::DeviceMemoryBase dest_data = buffer_allocations.GetDeviceAddress(dest_);

@ -36,7 +36,7 @@ class MemzeroThunk : public Thunk {
: Thunk(Kind::kMemzero, hlo), dest_(dest) {}

Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;

private:
const BufferAllocation::Slice dest_;

@ -52,7 +52,7 @@ class Memset32BitValueThunk : public Thunk {
: Thunk(Kind::kMemset32BitValue, hlo), value_(value), dest_(dest) {}

Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;

private:
uint32 value_;
@ -29,8 +29,6 @@ limitations under the License.
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/logging.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {

@ -57,8 +57,7 @@ std::ostream& operator<<(std::ostream& out,
const LaunchDimensions& launch_dims);

LaunchDimensions CalculateLaunchDimensions(
const Shape& shape,
const perftools::gputools::DeviceDescription& device_desc,
const Shape& shape, const se::DeviceDescription& device_desc,
int unroll_factor = 1);

} // namespace gpu

@ -33,8 +33,7 @@ tensorflow::Status SequentialThunk::Initialize(
}

tensorflow::Status SequentialThunk::ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) {
const BufferAllocations& buffer_allocations, se::Stream* stream) {
for (const auto& thunk : thunks_) {
TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream));
}

@ -40,8 +40,7 @@ class SequentialThunk : public Thunk {

tensorflow::Status Initialize(const GpuExecutable& executable) override;
tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;

private:
// The list of sub-thunks.

@ -85,8 +85,7 @@ class Thunk {
// This value is not required to be constant for a given Thunk. For example,
// a Thunk that performs autotuning may return true for its first run and
// false thereafter.
virtual bool ShouldHaltAllActivityBeforeRunning(
perftools::gputools::Stream* /*stream*/) {
virtual bool ShouldHaltAllActivityBeforeRunning(se::Stream* /*stream*/) {
return false;
}

@ -104,8 +103,7 @@ class Thunk {
// called after Initialize and can be called multiple times over Thunk's
// lifetime. Stream argument must be non-null.
virtual tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) = 0;
const BufferAllocations& buffer_allocations, se::Stream* stream) = 0;

private:
Kind kind_;

@ -17,8 +17,6 @@ limitations under the License.

#include "tensorflow/compiler/xla/util.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {

@ -46,8 +46,7 @@ class TupleThunk : public Thunk {
TupleThunk& operator=(const TupleThunk&) = delete;

tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;

private:
const std::vector<BufferAllocation::Slice> tuple_element_buffers_;

@ -41,8 +41,8 @@ Status WhileThunk::Initialize(const GpuExecutable& executable) {
}

Status WhileThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) {
perftools::gputools::DeviceMemoryBase condition_result_data =
se::Stream* stream) {
se::DeviceMemoryBase condition_result_data =
buffer_allocations.GetDeviceAddress(condition_result_buffer_index_);

while (true) {

@ -47,7 +47,7 @@ class WhileThunk : public Thunk {

Status Initialize(const GpuExecutable& executable) override;
Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;

private:
const BufferAllocation::Slice condition_result_buffer_index_;
@ -88,7 +88,7 @@ std::unique_ptr<HloProfilePrinterData> CreateHloProfilePrinterData(
// down how much time each HLO took.
class HloExecutionProfile {
public:
using DeviceDescription = perftools::gputools::DeviceDescription;
using DeviceDescription = se::DeviceDescription;

HloExecutionProfile(const HloProfilePrinterData* hlo_profile_printer_data,
const HloProfileIndexMap* hlo_profile_index_map);

@ -30,8 +30,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"

namespace se = ::perftools::gputools;

namespace xla {

/*static*/ StatusOr<std::unique_ptr<HloModule>>

@ -80,7 +80,7 @@ class HloRunner {
bool run_hlo_passes = false;
};

explicit HloRunner(::perftools::gputools::Platform* platform);
explicit HloRunner(se::Platform* platform);

~HloRunner();

@ -149,8 +149,7 @@ class HloRunner {
// will be used to configure the replication parameters. Replicated executions
// should pass the device_assignment parameter.
ServiceExecutableRunOptions GetServiceRunOptionsForDevice(
int64 device, ::perftools::gputools::Stream* stream,
DeviceAssignment* device_assignment);
int64 device, se::Stream* stream, DeviceAssignment* device_assignment);

std::unique_ptr<Backend> backend_;
};

@ -41,9 +41,6 @@ limitations under the License.
namespace xla {
namespace interpreter {

namespace se = ::perftools::gputools;
namespace sep = ::perftools::gputools::interpreter;

Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {
HloPassPipeline pipeline("Interpreter");

@ -96,7 +93,7 @@ InterpreterCompiler::CompileAheadOfTime(
}

se::Platform::Id InterpreterCompiler::PlatformId() const {
return sep::kXlaInterpreterPlatformId;
return se::interpreter::kXlaInterpreterPlatformId;
}

HloCostAnalysis::ShapeSizeFunction InterpreterCompiler::ShapeSizeBytesFunction()

@ -109,11 +106,12 @@ static std::unique_ptr<xla::ComputationPlacer> CreateComputationPlacer() {
}

static bool InitModule() {
xla::Compiler::RegisterCompilerFactory(sep::kXlaInterpreterPlatformId, []() {
return xla::MakeUnique<xla::interpreter::InterpreterCompiler>();
});
xla::Compiler::RegisterCompilerFactory(
se::interpreter::kXlaInterpreterPlatformId, []() {
return xla::MakeUnique<xla::interpreter::InterpreterCompiler>();
});
xla::ComputationPlacer::RegisterComputationPlacer(
sep::kXlaInterpreterPlatformId, &CreateComputationPlacer);
se::interpreter::kXlaInterpreterPlatformId, &CreateComputationPlacer);
return true;
}

@ -44,19 +44,16 @@ class InterpreterCompiler : public Compiler {
~InterpreterCompiler() override {}

StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
std::unique_ptr<HloModule> hlo_module,
perftools::gputools::StreamExecutor* stream_exec,
std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) override;

StatusOr<std::unique_ptr<Executable>> RunBackend(
std::unique_ptr<HloModule> hlo_module,
perftools::gputools::StreamExecutor* stream_exec,
std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) override;

StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
std::vector<std::unique_ptr<HloModule>> hlo_modules,
std::vector<std::vector<perftools::gputools::StreamExecutor*>>
stream_exec,
std::vector<std::vector<se::StreamExecutor*>> stream_exec,
DeviceMemoryAllocator* device_allocator) override;

StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>

@ -65,7 +62,7 @@ class InterpreterCompiler : public Compiler {

HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override;

perftools::gputools::Platform::Id PlatformId() const override;
se::Platform::Id PlatformId() const override;

private:
Status RunHloOptimization(HloModule* hlo_module);
@ -38,8 +38,6 @@ limitations under the License.
namespace xla {
namespace interpreter {

namespace se = ::perftools::gputools;

InterpreterExecutable::InterpreterExecutable(
std::unique_ptr<const HloModule> hlo_module)
: Executable(std::move(hlo_module), /*hlo_profile_printer=*/nullptr,

@ -19,8 +19,7 @@ limitations under the License.

#include "tensorflow/compiler/xla/status_macros.h"

namespace perftools {
namespace gputools {
namespace stream_executor {
namespace interpreter {

host::HostStream *AsExecutorStream(Stream *stream) {

@ -119,5 +118,4 @@ DeviceDescription *XlaInterpreterExecutor::PopulateDeviceDescription() const {
}

} // namespace interpreter
} // namespace gputools
} // namespace perftools
} // namespace stream_executor
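The interpreter files above are the other half of the migration: code that used to live in the nested perftools { gputools { namespaces now lives directly in stream_executor. Presumably (an assumption, not shown in this diff) a compatibility alias keeps the old spelling compiling while callers migrate; the usual shape of such an alias, sketched with a made-up type:

// New home of the library.
namespace stream_executor {
class Stream {};
}  // namespace stream_executor

// Assumed backwards-compatibility shim: the legacy nested namespace simply
// forwards to the new one, so both spellings name the same type.
namespace perftools {
namespace gputools = ::stream_executor;
}  // namespace perftools

int main() {
  perftools::gputools::Stream legacy_spelling;
  stream_executor::Stream new_spelling;
  (void)legacy_spelling;
  (void)new_spelling;
  return 0;
}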
@ -44,8 +44,7 @@ limitations under the License.
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/timer.h"

namespace perftools {
namespace gputools {
namespace stream_executor {
namespace interpreter {

using Args = tensorflow::gtl::ArraySlice<DeviceMemoryBase>;

@ -213,7 +212,6 @@ class XlaInterpreterExecutor : public internal::StreamExecutorInterface {
};

} // namespace interpreter
} // namespace gputools
} // namespace perftools
} // namespace stream_executor

#endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTOR_H_

@ -21,12 +21,10 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/interpreter/platform_id.h"
#include "tensorflow/compiler/xla/service/transfer_manager.h"

namespace sei = ::perftools::gputools::interpreter;

namespace xla {

InterpreterTransferManager::InterpreterTransferManager()
: GenericTransferManager(sei::kXlaInterpreterPlatformId,
: GenericTransferManager(se::interpreter::kXlaInterpreterPlatformId,
/*pointer_size=*/sizeof(void*)) {}

} // namespace xla

@ -38,7 +36,8 @@ CreateInterpreterTransferManager() {

static bool InitModule() {
xla::TransferManager::RegisterTransferManager(
sei::kXlaInterpreterPlatformId, &CreateInterpreterTransferManager);
stream_executor::interpreter::kXlaInterpreterPlatformId,
&CreateInterpreterTransferManager);
return true;
}

@ -28,11 +28,7 @@ limitations under the License.
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/platform.h"

namespace se = ::perftools::gputools;
namespace sep = ::perftools::gputools::interpreter;

namespace perftools {
namespace gputools {
namespace stream_executor {
namespace interpreter {

XlaInterpreterPlatform::XlaInterpreterPlatform() : name_("Interpreter") {}

@ -99,16 +95,16 @@ void XlaInterpreterPlatform::UnregisterTraceListener(TraceListener* listener) {
}

static void InitializeXlaInterpreterPlatform() {
std::unique_ptr<se::Platform> platform(new sep::XlaInterpreterPlatform);
SE_CHECK_OK(se::MultiPlatformManager::RegisterPlatform(std::move(platform)));
std::unique_ptr<Platform> platform(new XlaInterpreterPlatform);
SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
}

} // namespace interpreter
} // namespace gputools
} // namespace perftools
} // namespace stream_executor

REGISTER_MODULE_INITIALIZER(interpreter_platform,
sep::InitializeXlaInterpreterPlatform());
REGISTER_MODULE_INITIALIZER(
interpreter_platform,
stream_executor::interpreter::InitializeXlaInterpreterPlatform());

DECLARE_MODULE_INITIALIZER(multi_platform_manager);

@ -23,8 +23,7 @@ limitations under the License.
#include "tensorflow/stream_executor/stream_executor.h"
#include "tensorflow/stream_executor/trace_listener.h"

namespace perftools {
namespace gputools {
namespace stream_executor {
namespace interpreter {

class XlaInterpreterPlatform : public Platform {

@ -64,7 +63,6 @@ class XlaInterpreterPlatform : public Platform {
};

} // namespace interpreter
} // namespace gputools
} // namespace perftools
} // namespace stream_executor

#endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_PLATFORM_H_
@ -14,12 +14,10 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/service/interpreter/platform_id.h"

namespace perftools {
namespace gputools {
namespace stream_executor {
namespace interpreter {

PLATFORM_DEFINE_ID(kXlaInterpreterPlatformId);

} // namespace interpreter
} // namespace gputools
} // namespace perftools
} // namespace stream_executor

@ -18,14 +18,12 @@ limitations under the License.

#include "tensorflow/stream_executor/platform.h"

namespace perftools {
namespace gputools {
namespace stream_executor {
namespace interpreter {

extern const Platform::Id kXlaInterpreterPlatformId;

} // namespace interpreter
} // namespace gputools
} // namespace perftools
} // namespace stream_executor

#endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_PLATFORM_ID_H_

@ -23,7 +23,7 @@ limitations under the License.
namespace xla {
StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile(
std::vector<std::unique_ptr<HloModule>> modules,
std::vector<std::vector<perftools::gputools::StreamExecutor*>> stream_execs,
std::vector<std::vector<se::StreamExecutor*>> stream_execs,
DeviceMemoryAllocator* device_allocator) {
// Tensorflow tries to enable the following behaviors in all its threads:
//

@ -60,19 +60,18 @@ class LLVMCompiler : public Compiler {
// Bring in
// StatusOr<std::unique_ptr<Executable>> RunBackend(
// std::unique_ptr<HloModule> module,
// perftools::gputools::StreamExecutor* stream_exec,
// se::StreamExecutor* stream_exec,
// DeviceMemoryAllocator* device_allocator)
// StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
// std::unique_ptr<HloModule> module,
// perftools::gputools::StreamExecutor* stream_exec,
// se::StreamExecutor* stream_exec,
// DeviceMemoryAllocator* device_allocator)
using Compiler::RunBackend;
using Compiler::RunHloPasses;

StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
std::vector<std::unique_ptr<HloModule>> modules,
std::vector<std::vector<perftools::gputools::StreamExecutor*>>
stream_execs,
std::vector<std::vector<se::StreamExecutor*>> stream_execs,
DeviceMemoryAllocator* device_allocator) override;

protected:

@ -43,13 +43,11 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace se = ::perftools::gputools;

namespace xla {

/* static */ StatusOr<std::unique_ptr<LocalService>> LocalService::NewService(
const ServiceOptions& options) {
perftools::gputools::Platform* platform = options.platform();
se::Platform* platform = options.platform();
if (platform == nullptr) {
TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
}

@ -29,8 +29,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace se = ::perftools::gputools;

namespace xla {

using tensorflow::str_util::Lowercase;
@ -34,29 +34,27 @@ class PlatformUtil {
//
// Note that, even if a platform is present with zero devices, if we *do* have
// compilation support for it, it will be returned in this sequence.
static StatusOr<std::vector<perftools::gputools::Platform*>>
GetSupportedPlatforms();
static StatusOr<std::vector<se::Platform*>> GetSupportedPlatforms();

// Convenience function which returns the default supported platform for
// tests. If exactly one supported platform is present, then this platform is
// the default platform. If exactly two platforms are present and one of them
// is the interpreter platform, then the other platform is the default
// platform. Otherwise returns an error.
static StatusOr<perftools::gputools::Platform*> GetDefaultPlatform();
static StatusOr<se::Platform*> GetDefaultPlatform();

// Convenience function which returns the sole supported platform. If
// exactly one supported platform is present, then this platform is the
// default platform. Otherwise returns an error.
static StatusOr<perftools::gputools::Platform*> GetSolePlatform();
static StatusOr<se::Platform*> GetSolePlatform();

// Returns the platform according to the given name. Returns error if there is
// no such platform.
static StatusOr<perftools::gputools::Platform*> GetPlatform(
const string& platform_name);
static StatusOr<se::Platform*> GetPlatform(const string& platform_name);

// Returns exactly one platform that does not have given name. Returns error
// if there is no such platform, or there are multiple such platforms.
static StatusOr<perftools::gputools::Platform*> GetPlatformExceptFor(
static StatusOr<se::Platform*> GetPlatformExceptFor(
const string& platform_name);

// Returns a vector of StreamExecutors for the given platform. The vector is

@ -64,8 +62,8 @@ class PlatformUtil {
// element is nullptr, then the device is present by not supported by XLA.
//
// If the platform has no visible devices, a not-found error is returned.
static StatusOr<std::vector<perftools::gputools::StreamExecutor*>>
GetStreamExecutors(perftools::gputools::Platform* platform);
static StatusOr<std::vector<se::StreamExecutor*>> GetStreamExecutors(
se::Platform* platform);

private:
TF_DISALLOW_COPY_AND_ASSIGN(PlatformUtil);
@ -54,8 +54,6 @@ limitations under the License.
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/types.h"

namespace se = ::perftools::gputools;

using ::tensorflow::strings::Printf;
using ::tensorflow::strings::StrCat;
using ::xla::source_map_util::InvalidParameterArgument;

@ -95,15 +93,12 @@ tensorflow::Status RecordResult(const ShapedBuffer& result,

} // namespace

ServiceOptions& ServiceOptions::set_platform(
perftools::gputools::Platform* platform) {
ServiceOptions& ServiceOptions::set_platform(se::Platform* platform) {
platform_ = platform;
return *this;
}

perftools::gputools::Platform* ServiceOptions::platform() const {
return platform_;
}
se::Platform* ServiceOptions::platform() const { return platform_; }

ServiceOptions& ServiceOptions::set_number_of_replicas(int number_of_replicas) {
number_of_replicas_ = number_of_replicas;

@ -123,7 +118,7 @@ int ServiceOptions::intra_op_parallelism_threads() const {
}

/* static */ StatusOr<std::unique_ptr<Service>> Service::NewService(
perftools::gputools::Platform* platform) {
se::Platform* platform) {
ServiceOptions default_options;
default_options.set_platform(platform);
return NewService(default_options);

@ -131,7 +126,7 @@ int ServiceOptions::intra_op_parallelism_threads() const {

/* static */ StatusOr<std::unique_ptr<Service>> Service::NewService(
const ServiceOptions& options) {
perftools::gputools::Platform* platform = options.platform();
se::Platform* platform = options.platform();
std::unique_ptr<Backend> execute_backend;
if (platform == nullptr) {
TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());

@ -235,8 +230,7 @@ tensorflow::Status Service::ValidateResultShapeWithLayout(
StatusOr<std::vector<std::vector<const ShapedBuffer*>>>
Service::ResolveAndValidateArguments(
tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
stream_executors) {
tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors) {
CHECK_EQ(options_.number_of_replicas(), stream_executors.size());
std::vector<std::vector<const ShapedBuffer*>> replicated_arguments;
replicated_arguments.resize(options_.number_of_replicas());

@ -349,8 +343,7 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
std::vector<VersionedComputationHandle> versioned_handles,
std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
Backend* backend,
std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
DeviceMemoryAllocator* device_allocator) {
VLOG(1) << Printf("BuildExecutable on service %p", this);

@ -412,8 +405,7 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
const std::vector<const HloModuleProto*>& module_protos,
std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
Backend* backend,
std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
DeviceMemoryAllocator* device_allocator) {
VLOG(1) << Printf("BuildExecutable on service %p", this);

@ -493,7 +485,7 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
const VersionedComputationHandle& versioned_handle,
std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile,
se::StreamExecutor* executor, ExecutionProfile* profile,
DeviceMemoryAllocator* device_allocator) {
std::shared_ptr<Executable> executable =
compilation_cache_.LookUp(versioned_handle, *module_config);

@ -541,7 +533,7 @@ Service::ExecuteParallelAndRegisterResult(
// Streams where the computation are launched, so we can wait on the streams
// to complete.
std::vector<Pool<se::Stream>::SmartPtr> streams;
std::vector<std::unique_ptr<perftools::gputools::Timer>> timers;
std::vector<std::unique_ptr<se::Timer>> timers;

// Global data handles for the computation results, one for each computation.
std::vector<GlobalDataHandle> result_handles;
|
||||
@ -565,8 +557,7 @@ Service::ExecuteParallelAndRegisterResult(
|
||||
streams.push_back(std::move(stream));
|
||||
|
||||
if (replica == 0 && profile != nullptr) {
|
||||
timers.emplace_back(
|
||||
new perftools::gputools::Timer(streams.back()->parent()));
|
||||
timers.emplace_back(new se::Timer(streams.back()->parent()));
|
||||
streams.back()
|
||||
->InitTimer(timers.back().get())
|
||||
.ThenStartTimer(timers.back().get());
|
||||
@ -734,9 +725,9 @@ tensorflow::Status Service::SetReturnValue(const SetReturnValueRequest* arg,
|
||||
return computation->SetReturnValue(arg->operand());
|
||||
}
|
||||
|
||||
StatusOr<std::vector<perftools::gputools::StreamExecutor*>>
|
||||
Service::GetExecutors(const ExecutionOptions& execution_options,
|
||||
int64 requests_size, int64 request_index) const {
|
||||
StatusOr<std::vector<se::StreamExecutor*>> Service::GetExecutors(
|
||||
const ExecutionOptions& execution_options, int64 requests_size,
|
||||
int64 request_index) const {
|
||||
if (execution_options.device_handles().empty()) {
|
||||
return FailedPrecondition(
|
||||
"device handles must be given to execute parallel computations");
|
||||
@ -748,7 +739,7 @@ Service::GetExecutors(const ExecutionOptions& execution_options,
|
||||
"handles.",
|
||||
requests_size, request_index, execution_options.device_handles_size());
|
||||
}
|
||||
std::vector<perftools::gputools::StreamExecutor*> executors;
|
||||
std::vector<se::StreamExecutor*> executors;
|
||||
for (const auto& device_handle : execution_options.device_handles()) {
|
||||
TF_ASSIGN_OR_RETURN(auto replicas,
|
||||
Replicas(*execute_backend_, device_handle));
|
||||
@ -780,7 +771,7 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
|
||||
VLOG(1) << "running execute-parallel request: " << arg->ShortDebugString();
|
||||
|
||||
std::vector<std::vector<std::vector<const ShapedBuffer*>>> all_arguments;
|
||||
std::vector<std::vector<perftools::gputools::StreamExecutor*>> all_executors;
|
||||
std::vector<std::vector<se::StreamExecutor*>> all_executors;
|
||||
std::vector<VersionedComputationHandle> versioned_handles;
|
||||
std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
|
||||
std::vector<string> computation_names;
|
||||
@ -891,7 +882,7 @@ tensorflow::Status Service::ExecuteGraphParallel(
|
||||
VLOG(1) << "running execute-graph-parallel request";
|
||||
|
||||
std::vector<std::vector<std::vector<const ShapedBuffer*>>> all_arguments;
|
||||
std::vector<std::vector<perftools::gputools::StreamExecutor*>> all_executors;
|
||||
std::vector<std::vector<se::StreamExecutor*>> all_executors;
|
||||
std::vector<const HloModuleProto*> module_protos;
|
||||
std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
|
||||
std::vector<string> computation_names;
|
||||
@ -1953,9 +1944,9 @@ DeviceHandle Service::SingleComputationDeviceHandle() const {
|
||||
return device_handle;
|
||||
}
|
||||
|
||||
StatusOr<std::vector<perftools::gputools::StreamExecutor*>> Service::Replicas(
|
||||
StatusOr<std::vector<se::StreamExecutor*>> Service::Replicas(
|
||||
const Backend& backend, const DeviceHandle& device_handle) const {
|
||||
std::vector<perftools::gputools::StreamExecutor*> replicas;
|
||||
std::vector<se::StreamExecutor*> replicas;
|
||||
for (int replica = 0; replica < options_.number_of_replicas(); ++replica) {
|
||||
// From the computation placer, find out the device ids of the replicas for
|
||||
// the given device handle.
|
||||
|
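Editorial note: every Service hunk above is the same mechanical substitution, with fully spelled-out perftools::gputools types becoming se:: types. Below is a minimal sketch of the kind of alias these call sites now rely on; the forward declarations and the idea that the alias sits in a single shared XLA header are assumptions of this sketch, not lines from the commit.

// Hypothetical shared header; the exact file that declares the alias is not
// shown in this excerpt.
namespace stream_executor {
class Platform;
class Stream;
class StreamExecutor;
class Timer;
}  // namespace stream_executor

namespace xla {
// One alias, visible throughout namespace xla, so declarations can be written
// as se::StreamExecutor*, se::Platform*, se::Stream, se::Timer, and so on.
namespace se = ::stream_executor;
}  // namespace xla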
@ -53,8 +53,8 @@ namespace xla {
class ServiceOptions {
public:
// Set the platform backing the service, or nullptr for the default platform.
ServiceOptions& set_platform(perftools::gputools::Platform* platform);
perftools::gputools::Platform* platform() const;
ServiceOptions& set_platform(se::Platform* platform);
se::Platform* platform() const;

// Set the number of replicas to use when compiling replicated
// programs.
@ -66,7 +66,7 @@ class ServiceOptions {
int intra_op_parallelism_threads() const;

private:
perftools::gputools::Platform* platform_ = nullptr;
se::Platform* platform_ = nullptr;
int number_of_replicas_ = 1;
int intra_op_parallelism_threads_ = -1;
};
@ -79,7 +79,7 @@ class Service : public ServiceInterface {
public:
// Factory method for creating a new Service.
static StatusOr<std::unique_ptr<Service>> NewService(
perftools::gputools::Platform* platform = nullptr);
se::Platform* platform = nullptr);
static StatusOr<std::unique_ptr<Service>> NewService(
const ServiceOptions& options);

@ -286,7 +286,7 @@ class Service : public ServiceInterface {
ExecuteResponse* result);

// Prepare the executors for executing parallel.
StatusOr<std::vector<perftools::gputools::StreamExecutor*>> GetExecutors(
StatusOr<std::vector<se::StreamExecutor*>> GetExecutors(
const ExecutionOptions& execution_options, int64 requests_size,
int64 request_index) const;

@ -310,8 +310,7 @@ class Service : public ServiceInterface {
StatusOr<std::vector<std::vector<const ShapedBuffer*>>>
ResolveAndValidateArguments(
tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
stream_executors);
tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors);

// Create a Hlo module config for the given program shape and arguments.
// execution_options is optional; if not given a default is used.
@ -329,7 +328,7 @@ class Service : public ServiceInterface {
StatusOr<std::unique_ptr<Executable>> BuildExecutable(
const VersionedComputationHandle& versioned_handle,
std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
perftools::gputools::StreamExecutor* executor,
se::StreamExecutor* executor,
DeviceMemoryAllocator* device_allocator = nullptr);

// Builds an Executable for the given HLO module proto.
@ -338,7 +337,7 @@ class Service : public ServiceInterface {
StatusOr<std::unique_ptr<Executable>> BuildExecutable(
const HloModuleProto& module_proto,
std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
perftools::gputools::StreamExecutor* executor,
se::StreamExecutor* executor,
DeviceMemoryAllocator* device_allocator = nullptr);

// Same as BuildExecutable() above, but builds a list of Executables for the
@ -346,14 +345,12 @@ class Service : public ServiceInterface {
StatusOr<std::vector<std::unique_ptr<Executable>>> BuildExecutables(
std::vector<VersionedComputationHandle> versioned_handles,
std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
Backend* backend,
std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
DeviceMemoryAllocator* device_allocator);
StatusOr<std::vector<std::unique_ptr<Executable>>> BuildExecutables(
const std::vector<const HloModuleProto*>& module_protos,
std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
Backend* backend,
std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
DeviceMemoryAllocator* device_allocator);

// Similar to BuildExecutable, but look in the compilation cache for the
@ -362,7 +359,7 @@ class Service : public ServiceInterface {
StatusOr<std::shared_ptr<Executable>> BuildAndCacheExecutable(
const VersionedComputationHandle& versioned_handle,
std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile,
se::StreamExecutor* executor, ExecutionProfile* profile,
DeviceMemoryAllocator* device_allocator = nullptr);

// Runs the given executable with the given arguments and register the result
@ -411,7 +408,7 @@ class Service : public ServiceInterface {
// Returns the stream executors assigned to the replicas represented by the
// given device handle. Each device_handle is a virtual replicated device that
// represents a set of physical devices for the replicas.
StatusOr<std::vector<perftools::gputools::StreamExecutor*>> Replicas(
StatusOr<std::vector<se::StreamExecutor*>> Replicas(
const Backend& backend, const DeviceHandle& device_handle) const;

Status MaybeDumpHloModule(const HloModule& module) const;
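Editorial note: for orientation, here is a hedged usage sketch of the public factory declared above, written with the same TF_ASSIGN_OR_RETURN macro that appears in the implementation hunks. The wrapper function, its name, and the include path are scaffolding assumed for this example.

#include <memory>

#include "tensorflow/compiler/xla/service/service.h"  // assumed path for this sketch

namespace xla {
// Builds a Service on the default platform (the nullptr default of NewService).
Status MakeDefaultService(std::unique_ptr<Service>* service) {
  TF_ASSIGN_OR_RETURN(*service, Service::NewService(/*platform=*/nullptr));
  return Status::OK();
}
}  // namespace xla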
@ -28,7 +28,7 @@ namespace xla {
class ServiceExecutableRunOptions {
public:
using StreamBorrower =
std::function<StatusOr<Pool<perftools::gputools::Stream>::SmartPtr>(int)>;
std::function<StatusOr<Pool<se::Stream>::SmartPtr>(int)>;

ServiceExecutableRunOptions()
: ServiceExecutableRunOptions(ExecutableRunOptions()) {}
@ -45,14 +45,13 @@ class ServiceExecutableRunOptions {
ExecutableRunOptions* mutable_run_options() { return &run_options_; }

// Delegate to `ExecutableRunOptions` member.
perftools::gputools::Stream* stream() const { return run_options_.stream(); }
se::Stream* stream() const { return run_options_.stream(); }
DeviceMemoryAllocator* allocator() const { return run_options_.allocator(); }
int device_ordinal() const { return run_options_.device_ordinal(); }

// Borrows a stream and returns a smart pointer which returns the stream on
// destruction.
StatusOr<Pool<perftools::gputools::Stream>::SmartPtr> BorrowStream(
int device_ordinal) const {
StatusOr<Pool<se::Stream>::SmartPtr> BorrowStream(int device_ordinal) const {
return borrow_stream_
? borrow_stream_(device_ordinal)
: Status(tensorflow::error::UNIMPLEMENTED, "No stream cache");
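Editorial note: the StreamBorrower/BorrowStream pair above implements a borrow-a-stream-and-return-it-on-destruction pattern. XLA's Pool<se::Stream>::SmartPtr is not reproduced here; the sketch below shows the same idea using only the standard library, so every name in it is a placeholder for illustration, and a real pool must outlive its borrowers.

#include <functional>
#include <memory>
#include <vector>

// Stand-in for a stream type; in the header above this would be se::Stream.
struct FakeStream {};

// Toy pool: Borrow() hands out a unique_ptr whose deleter puts the object
// back on the free list instead of destroying it.
class StreamPool {
 public:
  using SmartPtr =
      std::unique_ptr<FakeStream, std::function<void(FakeStream*)>>;

  SmartPtr Borrow() {
    FakeStream* s;
    if (free_.empty()) {
      s = new FakeStream();
    } else {
      s = free_.back().release();
      free_.pop_back();
    }
    return SmartPtr(s, [this](FakeStream* done) {
      free_.emplace_back(done);  // returned to the pool, not deleted
    });
  }

 private:
  std::vector<std::unique_ptr<FakeStream>> free_;
};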
@ -28,8 +28,6 @@ limitations under the License.
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/logging.h"

namespace se = ::perftools::gputools;

namespace xla {

using ::tensorflow::strings::Appendf;
@ -146,7 +144,7 @@ ScopedShapedBuffer::~ScopedShapedBuffer() {

std::unique_ptr<ShapedBuffer> ScopedShapedBuffer::release() {
auto shaped_buffer = MakeUnique<ShapedBuffer>(std::move(*this));
buffers_ = ShapeTree<perftools::gputools::DeviceMemoryBase>();
buffers_ = ShapeTree<se::DeviceMemoryBase>();
return shaped_buffer;
}
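Editorial note: this .cc file previously carried its own file-local namespace se = ::perftools::gputools;. With the alias presumably declared once inside namespace xla in a shared header, the per-file declaration becomes redundant and is deleted. A minimal scoping sketch follows, with throwaway names invented for the example.

// Toy names; this only illustrates where a namespace alias is visible.
namespace real { struct Widget {}; }

namespace outer {
namespace nick = ::real;     // alias declared inside 'outer'
nick::Widget inside_outer;   // fine: visible anywhere in 'outer' after this
}  // namespace outer

// nick::Widget outside_outer;  // would not compile: the alias is scoped to
                                // 'outer' and does not leak globally
outer::nick::Widget also_ok;    // but it can still be reached as outer::nick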
@ -41,8 +41,7 @@ class ShapedBuffer {
// determines the number of device allocations (DeviceMemoryBase) held by the
// ShapedBuffer.
ShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape,
const perftools::gputools::Platform* platform,
int device_ordinal);
const se::Platform* platform, int device_ordinal);

// Returns the shape of the on-host representation of the data held by this
// ShapedBuffer.
@ -52,35 +51,29 @@ class ShapedBuffer {
// ShapedBuffer.
const Shape& on_device_shape() const { return on_device_shape_; }

const perftools::gputools::Platform* platform() const { return platform_; }
const se::Platform* platform() const { return platform_; }
int device_ordinal() const { return device_ordinal_; }

// Return the root buffer of the shape (shape index {}).
const perftools::gputools::DeviceMemoryBase& root_buffer() const {
const se::DeviceMemoryBase& root_buffer() const {
return buffer(/*index=*/{});
}

// Returns the buffer at the given shape index where index is defined as in
// ShapeUtil::GetSubshape.
const perftools::gputools::DeviceMemoryBase& buffer(
const ShapeIndex& index) const {
const se::DeviceMemoryBase& buffer(const ShapeIndex& index) const {
return buffers_.element(index);
}

// Sets the device memory buffer at the given index.
void set_buffer(const perftools::gputools::DeviceMemoryBase& buffer,
const ShapeIndex& index) {
void set_buffer(const se::DeviceMemoryBase& buffer, const ShapeIndex& index) {
*buffers_.mutable_element(index) = buffer;
}

// Returns the underlying ShapeTree containing all the device addresses in the
// ShapedBuffer.
const ShapeTree<perftools::gputools::DeviceMemoryBase>& buffers() const {
return buffers_;
}
ShapeTree<perftools::gputools::DeviceMemoryBase>& buffers() {
return buffers_;
}
const ShapeTree<se::DeviceMemoryBase>& buffers() const { return buffers_; }
ShapeTree<se::DeviceMemoryBase>& buffers() { return buffers_; }

// Set all device memory pointers in the object to null.
void clear();
@ -101,13 +94,13 @@ class ShapedBuffer {
Shape on_device_shape_;

// The platform the memory is allocated on.
const perftools::gputools::Platform* platform_;
const se::Platform* platform_;

// The device the memory is allocated on.
int device_ordinal_;

// The tree of device buffers. Its shape is on_device_shape().
ShapeTree<perftools::gputools::DeviceMemoryBase> buffers_;
ShapeTree<se::DeviceMemoryBase> buffers_;
};

std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer);
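Editorial note: since the accessors above change only in spelling, here is a hedged usage sketch built solely from what the header shows (root_buffer, buffer, set_buffer, and the empty shape index {} for the root). The function name and include path are assumptions of this sketch.

#include "tensorflow/compiler/xla/service/shaped_buffer.h"  // assumed path

namespace xla {
// Copies the root device pointer of `src` into the root slot of `dst`.
// This aliases the allocation; it does not copy device memory.
void AliasRootBuffer(const ShapedBuffer& src, ShapedBuffer* dst) {
  const se::DeviceMemoryBase& root = src.root_buffer();  // same as src.buffer({})
  dst->set_buffer(root, /*index=*/{});
}
}  // namespace xla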
@ -25,24 +25,20 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"

namespace se = ::perftools::gputools;

namespace xla {
/* static */ tensorflow::mutex
TransferManager::platform_transfer_manager_mutex_(
tensorflow::LINKER_INITIALIZED);

/* static */ std::map<perftools::gputools::Platform::Id,
TransferManager::State>*
/* static */ std::map<se::Platform::Id, TransferManager::State>*
TransferManager::GetPlatformTransferManagers() {
static auto* r =
new std::map<perftools::gputools::Platform::Id, TransferManager::State>;
static auto* r = new std::map<se::Platform::Id, TransferManager::State>;
return r;
}

Status TransferManager::TransferArrayToDevice(
perftools::gputools::StreamExecutor* executor, const Literal& literal,
const perftools::gputools::DeviceMemoryBase& dest) {
se::StreamExecutor* executor, const Literal& literal,
const se::DeviceMemoryBase& dest) {
const Shape on_device_shape = HostShapeToDeviceShape(literal.shape());
TF_RET_CHECK(ShapeUtil::IsArray(on_device_shape))
<< "On-device representation of "
@ -61,8 +57,8 @@ Status TransferManager::TransferArrayToDevice(
}

StatusOr<std::unique_ptr<Literal>> TransferManager::TransferArrayFromDevice(
perftools::gputools::StreamExecutor* executor, const Shape& shape,
const perftools::gputools::DeviceMemoryBase& source) {
se::StreamExecutor* executor, const Shape& shape,
const se::DeviceMemoryBase& source) {
TF_RET_CHECK(ShapeUtil::Equal(HostShapeToDeviceShape(shape), shape))
<< "Shape " << ShapeUtil::HumanString(shape)
<< " has a differently shaped representation on-device: "
@ -112,8 +108,7 @@ StatusOr<std::unique_ptr<Literal>> TransferManager::TransferArrayFromDevice(
}

Status TransferManager::WriteTupleIndexTables(
perftools::gputools::StreamExecutor* executor,
const ShapedBuffer& device_buffer) {
se::StreamExecutor* executor, const ShapedBuffer& device_buffer) {
VLOG(2) << "Writing tuple index tables for " << device_buffer;

TF_RET_CHECK(executor->device_ordinal() == device_buffer.device_ordinal());
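Editorial note: GetPlatformTransferManagers above is the usual function-local static registry keyed by se::Platform::Id. A generic sketch of that pattern follows; the key and value types here are placeholders, not XLA's.

#include <map>

// Placeholder key/value types; in the code above the key is se::Platform::Id
// and the value is TransferManager::State.
using PlatformId = const void*;
struct Entry { int payload = 0; };

// Function-local static: constructed on first use, shared by every caller in
// the process, and intentionally never destroyed.
std::map<PlatformId, Entry>* GetRegistry() {
  static auto* registry = new std::map<PlatformId, Entry>;
  return registry;
}

// Typical use: register and later look up an entry by platform id.
void RegisterEntry(PlatformId id, Entry entry) { (*GetRegistry())[id] = entry; }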