From d77a621a571d8ab0d69f2682586674e6dff4ec4e Mon Sep 17 00:00:00 2001
From: Justin Lebar
Date: Tue, 17 Apr 2018 21:04:35 -0700
Subject: [PATCH] [XLA] Convert XLA to use xla::se as a namespace alias for ::stream_executor.

PiperOrigin-RevId: 193301997
---
 tensorflow/compiler/xla/BUILD | 3 +
 .../compiler/xla/client/client_library.cc | 18 ++---
 .../compiler/xla/client/client_library.h | 20 +++---
 .../compiler/xla/client/local_client.cc | 2 -
 tensorflow/compiler/xla/client/local_client.h | 2 +-
 tensorflow/compiler/xla/device_util.h | 2 +-
 .../compiler/xla/executable_run_options.h | 7 +-
 tensorflow/compiler/xla/rpc/grpc_service.cc | 2 +-
 tensorflow/compiler/xla/rpc/grpc_service.h | 2 +-
 .../xla/service/allocation_tracker.cc | 6 +-
 .../compiler/xla/service/allocation_tracker.h | 8 +--
 tensorflow/compiler/xla/service/backend.cc | 19 ++---
 tensorflow/compiler/xla/service/backend.h | 34 ++++-----
 .../xla/service/compile_only_service.cc | 4 +-
 .../xla/service/compile_only_service.h | 2 +-
 tensorflow/compiler/xla/service/compiler.cc | 13 ++--
 tensorflow/compiler/xla/service/compiler.h | 25 +++----
 .../xla/service/computation_placer.cc | 16 ++---
 .../compiler/xla/service/computation_placer.h | 9 ++-
 .../compiler/xla/service/cpu/cpu_compiler.cc | 14 ++--
 .../compiler/xla/service/cpu/cpu_compiler.h | 12 ++--
 .../xla/service/cpu/cpu_executable.cc | 9 +--
 .../compiler/xla/service/cpu/cpu_executable.h | 12 ++--
 .../xla/service/cpu/cpu_transfer_manager.cc | 13 ++--
 .../xla/service/cpu/cpu_transfer_manager.h | 25 ++++---
 .../service/cpu/parallel_cpu_executable.cc | 4 +-
 .../xla/service/cpu/parallel_cpu_executable.h | 9 ++-
 .../xla/service/device_memory_allocator.cc | 25 +++----
 .../xla/service/device_memory_allocator.h | 28 ++++----
 tensorflow/compiler/xla/service/executable.cc | 6 +-
 tensorflow/compiler/xla/service/executable.h | 2 +-
 .../xla/service/generic_transfer_manager.cc | 13 ++--
 .../xla/service/generic_transfer_manager.h | 35 ++++------
 .../xla/service/gpu/buffer_allocations.cc | 2 -
 .../xla/service/gpu/buffer_allocations.h | 21 +++---
 .../xla/service/gpu/conditional_thunk.cc | 5 +-
 .../xla/service/gpu/conditional_thunk.h | 2 +-
 .../xla/service/gpu/convolution_thunk.cc | 2 -
 .../xla/service/gpu/convolution_thunk.h | 24 +++----
 .../compiler/xla/service/gpu/copy_thunk.cc | 12 ++--
 .../compiler/xla/service/gpu/copy_thunk.h | 6 +-
 .../xla/service/gpu/cudnn_batchnorm_thunk.cc | 1 -
 .../xla/service/gpu/cudnn_batchnorm_thunk.h | 6 +-
 .../gpu/cudnn_convolution_algorithm_picker.cc | 2 -
 .../gpu/cudnn_convolution_algorithm_picker.h | 7 +-
 .../service/gpu/cudnn_convolution_runner.cc | 28 +++-----
 .../service/gpu/cudnn_convolution_runner.h | 26 +++----
 .../compiler/xla/service/gpu/fft_thunk.cc | 2 -
 .../compiler/xla/service/gpu/fft_thunk.h | 17 +++--
 .../compiler/xla/service/gpu/for_thunk.cc | 3 +-
 .../compiler/xla/service/gpu/for_thunk.h | 3 +-
 .../compiler/xla/service/gpu/gemm_thunk.cc | 2 -
 .../compiler/xla/service/gpu/gemm_thunk.h | 9 +--
 .../compiler/xla/service/gpu/gpu_compiler.cc | 8 +--
 .../compiler/xla/service/gpu/gpu_compiler.h | 10 ++-
 .../xla/service/gpu/gpu_executable.cc | 4 +-
 .../xla/service/gpu/gpu_transfer_manager.cc | 6 +-
 .../xla/service/gpu/gpu_transfer_manager.h | 11 ++-
 .../xla/service/gpu/infeed_manager.cc | 2 -
 .../compiler/xla/service/gpu/infeed_manager.h | 17 ++---
 .../compiler/xla/service/gpu/infeed_thunk.cc | 6 +-
 .../compiler/xla/service/gpu/infeed_thunk.h | 2 +-
 .../xla/service/gpu/ir_emitter_context.h | 6 +-
 .../compiler/xla/service/gpu/kernel_thunk.cc | 2 -
 .../compiler/xla/service/gpu/kernel_thunk.h | 11 ++-
 .../compiler/xla/service/gpu/memset_thunk.cc | 2 -
 .../compiler/xla/service/gpu/memset_thunk.h | 4 +-
 .../xla/service/gpu/partition_assignment.cc | 2 -
 .../xla/service/gpu/partition_assignment.h | 3 +-
 .../xla/service/gpu/sequential_thunk.cc | 3 +-
 .../xla/service/gpu/sequential_thunk.h | 3 +-
 tensorflow/compiler/xla/service/gpu/thunk.h | 6 +-
 .../compiler/xla/service/gpu/tuple_thunk.cc | 2 -
 .../compiler/xla/service/gpu/tuple_thunk.h | 3 +-
 .../compiler/xla/service/gpu/while_thunk.cc | 4 +-
 .../compiler/xla/service/gpu/while_thunk.h | 2 +-
 .../xla/service/hlo_execution_profile.h | 2 +-
 tensorflow/compiler/xla/service/hlo_runner.cc | 2 -
 tensorflow/compiler/xla/service/hlo_runner.h | 5 +-
 .../xla/service/interpreter/compiler.cc | 14 ++--
 .../xla/service/interpreter/compiler.h | 11 ++-
 .../xla/service/interpreter/executable.cc | 2 -
 .../xla/service/interpreter/executor.cc | 6 +-
 .../xla/service/interpreter/executor.h | 6 +-
 .../interpreter_transfer_manager.cc | 7 +-
 .../xla/service/interpreter/platform.cc | 18 ++---
 .../xla/service/interpreter/platform.h | 6 +-
 .../xla/service/interpreter/platform_id.cc | 6 +-
 .../xla/service/interpreter/platform_id.h | 6 +-
 .../compiler/xla/service/llvm_compiler.cc | 2 +-
 .../compiler/xla/service/llvm_compiler.h | 7 +-
 .../compiler/xla/service/local_service.cc | 4 +-
 .../compiler/xla/service/platform_util.cc | 2 -
 .../compiler/xla/service/platform_util.h | 16 ++---
 tensorflow/compiler/xla/service/service.cc | 45 +++++-------
 tensorflow/compiler/xla/service/service.h | 27 ++++----
 .../service/service_executable_run_options.h | 7 +-
 .../compiler/xla/service/shaped_buffer.cc | 4 +-
 .../compiler/xla/service/shaped_buffer.h | 25 +++----
 .../compiler/xla/service/transfer_manager.cc | 19 ++---
 .../compiler/xla/service/transfer_manager.h | 69 +++++++++----------
 .../xla/tests/bitcast_convert_test.cc | 2 +-
 .../xla/tests/client_library_test_base.cc | 5 +-
 .../xla/tests/client_library_test_base.h | 5 +-
 .../xla/tests/compute_constant_test.cc | 8 +--
 tensorflow/compiler/xla/tests/convert_test.cc | 2 +-
 .../compiler/xla/tests/dynamic_ops_test.cc | 2 -
 tensorflow/compiler/xla/tests/fusion_test.cc | 2 -
 .../compiler/xla/tests/hlo_test_base.cc | 2 -
 tensorflow/compiler/xla/tests/hlo_test_base.h | 3 +-
 .../compiler/xla/tests/llvm_compiler_test.cc | 4 +-
 .../xla/tests/local_client_execute_test.cc | 2 -
 .../xla/tests/local_client_test_base.cc | 14 ++--
 .../xla/tests/local_client_test_base.h | 18 +++--
 tensorflow/compiler/xla/tests/map_test.cc | 2 +-
 tensorflow/compiler/xla/tests/test_utils.cc | 4 +-
 tensorflow/compiler/xla/tests/test_utils.h | 3 +-
 .../xla/tests/vector_ops_simple_test.cc | 2 +-
 tensorflow/compiler/xla/tests/while_test.cc | 2 -
 .../xla/tests/xla_hlo_profile_test.cc | 2 +-
 tensorflow/compiler/xla/types.h | 4 +-
 121 files changed, 443 insertions(+), 663 deletions(-)

diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 751777222fc..88f37433a55 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -443,6 +443,9 @@ cc_library(
     srcs = ["executable_run_options.cc"],
     hdrs = ["executable_run_options.h"],
     visibility = ["//visibility:public"],
+    deps = [
+        ":types",
+    ],
 )
 
 cc_library(
diff --git a/tensorflow/compiler/xla/client/client_library.cc b/tensorflow/compiler/xla/client/client_library.cc
index b1663bc8157..803a9e40094 100644
--- a/tensorflow/compiler/xla/client/client_library.cc
+++ b/tensorflow/compiler/xla/client/client_library.cc
@@ -23,22 +23,19 @@
limitations under the License. namespace xla { -LocalClientOptions::LocalClientOptions(perftools::gputools::Platform* platform, +LocalClientOptions::LocalClientOptions(se::Platform* platform, int number_of_replicas, int intra_op_parallelism_threads) : platform_(platform), number_of_replicas_(number_of_replicas), intra_op_parallelism_threads_(intra_op_parallelism_threads) {} -LocalClientOptions& LocalClientOptions::set_platform( - perftools::gputools::Platform* platform) { +LocalClientOptions& LocalClientOptions::set_platform(se::Platform* platform) { platform_ = platform; return *this; } -perftools::gputools::Platform* LocalClientOptions::platform() const { - return platform_; -} +se::Platform* LocalClientOptions::platform() const { return platform_; } LocalClientOptions& LocalClientOptions::set_number_of_replicas( int number_of_replicas) { @@ -69,7 +66,7 @@ ClientLibrary::ClientLibrary() = default; ClientLibrary::~ClientLibrary() = default; /* static */ StatusOr ClientLibrary::GetOrCreateLocalClient( - perftools::gputools::Platform* platform) { + se::Platform* platform) { LocalClientOptions default_options; default_options.set_platform(platform); return GetOrCreateLocalClient(default_options); @@ -77,7 +74,7 @@ ClientLibrary::~ClientLibrary() = default; /* static */ StatusOr ClientLibrary::GetOrCreateLocalClient( const LocalClientOptions& options) { - perftools::gputools::Platform* platform = options.platform(); + se::Platform* platform = options.platform(); int replica_count = options.number_of_replicas(); ClientLibrary& client_library = Singleton(); tensorflow::mutex_lock lock(client_library.service_mutex_); @@ -115,7 +112,7 @@ ClientLibrary::~ClientLibrary() = default; } /* static */ LocalService* ClientLibrary::GetXlaService( - perftools::gputools::Platform* platform) { + se::Platform* platform) { ClientLibrary& client_library = Singleton(); tensorflow::mutex_lock lock(client_library.service_mutex_); auto it = client_library.local_instances_.find(platform->id()); @@ -124,8 +121,7 @@ ClientLibrary::~ClientLibrary() = default; } /* static */ StatusOr -ClientLibrary::GetOrCreateCompileOnlyClient( - perftools::gputools::Platform* platform) { +ClientLibrary::GetOrCreateCompileOnlyClient(se::Platform* platform) { ClientLibrary& client_library = Singleton(); tensorflow::mutex_lock lock(client_library.service_mutex_); diff --git a/tensorflow/compiler/xla/client/client_library.h b/tensorflow/compiler/xla/client/client_library.h index a6f30d82e43..3ad558fa532 100644 --- a/tensorflow/compiler/xla/client/client_library.h +++ b/tensorflow/compiler/xla/client/client_library.h @@ -43,13 +43,13 @@ namespace xla { // Options to configure the local client when it is created. class LocalClientOptions { public: - LocalClientOptions(perftools::gputools::Platform* platform = nullptr, + LocalClientOptions(se::Platform* platform = nullptr, int number_of_replicas = 1, int intra_op_parallelism_threads = -1); // Set the platform backing the service, or nullptr for the default platform. - LocalClientOptions& set_platform(perftools::gputools::Platform* platform); - perftools::gputools::Platform* platform() const; + LocalClientOptions& set_platform(se::Platform* platform); + se::Platform* platform() const; // Set the number of replicas to use when compiling replicated // programs. 
@@ -61,7 +61,7 @@ class LocalClientOptions { int intra_op_parallelism_threads() const; private: - perftools::gputools::Platform* platform_; + se::Platform* platform_; int number_of_replicas_; int intra_op_parallelism_threads_; }; @@ -74,7 +74,7 @@ class ClientLibrary { // platform : The platform the underlying XLA service should target. If // null then default platform is used. static StatusOr GetOrCreateLocalClient( - perftools::gputools::Platform* platform = nullptr); + se::Platform* platform = nullptr); static StatusOr GetOrCreateLocalClient( const LocalClientOptions& options); @@ -84,14 +84,14 @@ class ClientLibrary { // Returns the service from the service thread. Only used in unit tests to // access user computations from client. - static LocalService* GetXlaService(perftools::gputools::Platform* platform); + static LocalService* GetXlaService(se::Platform* platform); // Singleton constructor-or-accessor for compile-only clients. Arguments: // // platform : The platform the underlying XLA service should target. If // null then default platform is used. static StatusOr GetOrCreateCompileOnlyClient( - perftools::gputools::Platform* platform = nullptr); + se::Platform* platform = nullptr); // Clears the local instance and compile only instance caches. The client // pointers returned by the previous GetOrCreateLocalClient() or @@ -120,12 +120,10 @@ class ClientLibrary { }; tensorflow::mutex service_mutex_; // Guards the singleton creation state. - std::unordered_map> + std::unordered_map> local_instances_ GUARDED_BY(service_mutex_); - std::unordered_map> + std::unordered_map> compile_only_instances_ GUARDED_BY(service_mutex_); TF_DISALLOW_COPY_AND_ASSIGN(ClientLibrary); diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index 30594243dcf..d951c44cb92 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -24,8 +24,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/source_map_util.h" #include "tensorflow/compiler/xla/status_macros.h" -namespace se = ::perftools::gputools; - using xla::source_map_util::InvalidParameterArgument; namespace xla { diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h index 98ee7c62c94..42812b936f2 100644 --- a/tensorflow/compiler/xla/client/local_client.h +++ b/tensorflow/compiler/xla/client/local_client.h @@ -167,7 +167,7 @@ class LocalClient : public Client { StatusOr ReplicaNumberToDeviceOrdinal(int replica_number); // Returns the platform that the underlying service targets. - perftools::gputools::Platform* platform() const; + se::Platform* platform() const; // Returns the number of devices on the system of the service platform // type. Not all devices may be supported by the service (see diff --git a/tensorflow/compiler/xla/device_util.h b/tensorflow/compiler/xla/device_util.h index 23a622b1ad0..1a51fdee680 100644 --- a/tensorflow/compiler/xla/device_util.h +++ b/tensorflow/compiler/xla/device_util.h @@ -29,7 +29,7 @@ namespace xla { // Returns a string that represents the device in terms of platform and ordinal; // e.g. 
the first CUDA device will be "cuda:0" -string DeviceIdentifier(perftools::gputools::StreamExecutor* stream_exec) { +string DeviceIdentifier(se::StreamExecutor* stream_exec) { return tensorflow::strings::StrCat(stream_exec->platform()->Name(), ":", stream_exec->device_ordinal()); } diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h index 1a095a82cca..a306ae16ba4 100644 --- a/tensorflow/compiler/xla/executable_run_options.h +++ b/tensorflow/compiler/xla/executable_run_options.h @@ -16,6 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_ #define TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_ +// Pulls in the ::stream_executor -> ::xla::se namespace alias. +#include "tensorflow/compiler/xla/types.h" + // These classes are forward declared so that ExecutableRunOptions can be linked // into an XLA-compiled binary without having to link all of the pointed-to // objects (e.g., for an ahead-of-time compiled CPU binary, the gpu tools don't @@ -37,10 +40,6 @@ struct ThreadPoolDevice; namespace xla { -// TODO(b/77980417): Once the perftools::gputools -> stream_executor migration -// is complete, add "using namespace se = stream_executor" here and -// s/stream_executor/se::/ to match our idiom elsewhere. - class DeviceMemoryAllocator; class DeviceAssignment; class ExecutionProfile; diff --git a/tensorflow/compiler/xla/rpc/grpc_service.cc b/tensorflow/compiler/xla/rpc/grpc_service.cc index 414829d6e76..0b100bd108e 100644 --- a/tensorflow/compiler/xla/rpc/grpc_service.cc +++ b/tensorflow/compiler/xla/rpc/grpc_service.cc @@ -20,7 +20,7 @@ limitations under the License. namespace xla { /* static */ StatusOr> GRPCService::NewService( - perftools::gputools::Platform* platform) { + se::Platform* platform) { std::unique_ptr grpc_service(new GRPCService()); TF_ASSIGN_OR_RETURN(grpc_service->service_, ::xla::Service::NewService(platform)); diff --git a/tensorflow/compiler/xla/rpc/grpc_service.h b/tensorflow/compiler/xla/rpc/grpc_service.h index 7c9e484517e..fad74375bd5 100644 --- a/tensorflow/compiler/xla/rpc/grpc_service.h +++ b/tensorflow/compiler/xla/rpc/grpc_service.h @@ -29,7 +29,7 @@ class GRPCService : public grpc::XlaService::Service { // that the service should target. If platform is null then the default // platform is used. 
static StatusOr> NewService( - perftools::gputools::Platform* platform = nullptr); + se::Platform* platform = nullptr); ::grpc::Status Computation(::grpc::ServerContext* context, const ComputationRequest* arg, diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc index 4f819a743c4..359582a78c3 100644 --- a/tensorflow/compiler/xla/service/allocation_tracker.cc +++ b/tensorflow/compiler/xla/service/allocation_tracker.cc @@ -204,7 +204,7 @@ StatusOr> AllocationTracker::ResolveInternal( } void AllocationTracker::AddAllocationOrIncrementRefCount( - perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal) { + se::DeviceMemoryBase device_memory, int device_ordinal) { AllocationMap& allocation_map = opaque_to_allocation_map_[device_ordinal]; auto it = allocation_map.find(device_memory.opaque()); if (it == allocation_map.end()) { @@ -215,8 +215,8 @@ void AllocationTracker::AddAllocationOrIncrementRefCount( } } -Status AllocationTracker::DecrementRefCount( - perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal) { +Status AllocationTracker::DecrementRefCount(se::DeviceMemoryBase device_memory, + int device_ordinal) { AllocationMap& allocation_map = opaque_to_allocation_map_[device_ordinal]; auto it = allocation_map.find(device_memory.opaque()); TF_RET_CHECK(it != allocation_map.end()); diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h index 038aee8541b..60e93358efb 100644 --- a/tensorflow/compiler/xla/service/allocation_tracker.h +++ b/tensorflow/compiler/xla/service/allocation_tracker.h @@ -77,7 +77,7 @@ class AllocationTracker { // Data structure encapsulating single memory allocation on the device. struct Allocation { // The pointer to this allocation. - perftools::gputools::DeviceMemoryBase device_memory; + se::DeviceMemoryBase device_memory; // The device that the memory is allocated on. int device_ordinal; @@ -103,13 +103,13 @@ class AllocationTracker { // Adds the given device address to the allocation tracker, or if it already // exists, then increment it's reference count. - void AddAllocationOrIncrementRefCount( - perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal) + void AddAllocationOrIncrementRefCount(se::DeviceMemoryBase device_memory, + int device_ordinal) EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Decrements the reference count of the given device memory. Then, if it is // zero, deallocate the memory. - Status DecrementRefCount(perftools::gputools::DeviceMemoryBase device_memory, + Status DecrementRefCount(se::DeviceMemoryBase device_memory, int device_ordinal) EXCLUSIVE_LOCKS_REQUIRED(mutex_); // A map from device memory opaque value to allocation. One such map is diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc index 05f2d062784..a582dbffd68 100644 --- a/tensorflow/compiler/xla/service/backend.cc +++ b/tensorflow/compiler/xla/service/backend.cc @@ -36,19 +36,14 @@ limitations under the License. 
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" -namespace se = ::perftools::gputools; - namespace xla { -BackendOptions& BackendOptions::set_platform( - perftools::gputools::Platform* platform) { +BackendOptions& BackendOptions::set_platform(se::Platform* platform) { platform_ = platform; return *this; } -perftools::gputools::Platform* BackendOptions::platform() const { - return platform_; -} +se::Platform* BackendOptions::platform() const { return platform_; } BackendOptions& BackendOptions::set_intra_op_parallelism_threads( int num_threads) { @@ -77,7 +72,7 @@ struct Backend::EigenThreadPoolWrapper { /* static */ StatusOr> Backend::CreateBackend( const BackendOptions& options) { - perftools::gputools::Platform* platform = options.platform(); + se::Platform* platform = options.platform(); TF_ASSIGN_OR_RETURN(auto compiler, Compiler::GetForPlatform(platform)); TF_ASSIGN_OR_RETURN(auto stream_executors, PlatformUtil::GetStreamExecutors(platform)); @@ -121,7 +116,7 @@ StatusOr Backend::BorrowStream( } Backend::Backend( - perftools::gputools::Platform* platform, Compiler* compiler, + se::Platform* platform, Compiler* compiler, tensorflow::gtl::ArraySlice stream_executors, TransferManager* transfer_manager, ComputationPlacer* computation_placer, int intra_op_parallelism_threads) @@ -178,7 +173,7 @@ tensorflow::thread::ThreadPool* Backend::eigen_intra_op_thread_pool() const { return intra_op_thread_pool_wrapper_->pool.get(); } -StatusOr Backend::stream_executor( +StatusOr Backend::stream_executor( int device_ordinal) const { if (device_ordinal < 0 || device_ordinal > stream_executors_.back()->device_ordinal()) { @@ -201,9 +196,9 @@ StatusOr Backend::devices_equivalent(int device_ordinal_a, // bit crude but works for GPUs which is the important case where we compile // an executable for one GPU and want to know if it will run (well) on // another. - TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * executor_a, + TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor_a, stream_executor(device_ordinal_a)); - TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * executor_b, + TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor_b, stream_executor(device_ordinal_b)); return (executor_a->GetDeviceDescription().name() == executor_b->GetDeviceDescription().name()); diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h index b5ca483b727..d32a0a400d8 100644 --- a/tensorflow/compiler/xla/service/backend.h +++ b/tensorflow/compiler/xla/service/backend.h @@ -44,8 +44,8 @@ namespace xla { class BackendOptions { public: // Set the platform backing the backend, or nullptr for the default platform. - BackendOptions& set_platform(perftools::gputools::Platform* platform); - perftools::gputools::Platform* platform() const; + BackendOptions& set_platform(se::Platform* platform); + se::Platform* platform() const; // Sets the thread pool size for parallel execution of an individual operator. 
// The default value of -1 will result in initializing the thread pool with @@ -54,7 +54,7 @@ class BackendOptions { int intra_op_parallelism_threads() const; private: - perftools::gputools::Platform* platform_ = nullptr; + se::Platform* platform_ = nullptr; int intra_op_parallelism_threads_ = -1; }; @@ -66,7 +66,7 @@ class BackendOptions { // StreamPtr stream = backend->BorrowStream().ConsumeValueOrDie(); class Backend { public: - using StreamPtr = Pool::SmartPtr; + using StreamPtr = Pool::SmartPtr; // Creates a new backend. static StatusOr> CreateBackend( @@ -79,7 +79,7 @@ class Backend { ~Backend(); // Accessors for the various objects. - perftools::gputools::Platform* platform() const { return platform_; } + se::Platform* platform() const { return platform_; } Compiler* compiler() const { return compiler_; } DeviceMemoryAllocator* memory_allocator() const { return memory_allocator_.get(); @@ -96,19 +96,17 @@ class Backend { // Returns stream executors of all supported devices for this backend. The // executors are ordered by the device ordinal. - const std::vector& stream_executors() - const { + const std::vector& stream_executors() const { return stream_executors_; } // Returns the stream executor for the given device ordinal. - StatusOr stream_executor( - int device_ordinal) const; + StatusOr stream_executor(int device_ordinal) const; // Returns the stream executor for the default device ordinal. This stream // executor can only be used when the number of computations is 1 (replication // can be > 1). - perftools::gputools::StreamExecutor* default_stream_executor() const { + se::StreamExecutor* default_stream_executor() const { CHECK(!stream_executors_.empty()); return stream_executors_[0]; } @@ -117,8 +115,7 @@ class Backend { // internal pool, or by constructing/initializating it, and returns the result // to the caller. StatusOr BorrowStream(int device_ordinal); - StatusOr BorrowStream( - perftools::gputools::StreamExecutor* executor); + StatusOr BorrowStream(se::StreamExecutor* executor); // Returns a function to borrow a stream, as `BorrowStream` above does. // Purely for convenience, the caller could rather make this anonymous @@ -157,29 +154,26 @@ class Backend { private: struct EigenThreadPoolWrapper; - Backend(perftools::gputools::Platform* platform, Compiler* compiler, - tensorflow::gtl::ArraySlice - stream_executors, + Backend(se::Platform* platform, Compiler* compiler, + tensorflow::gtl::ArraySlice stream_executors, TransferManager* transfer_manager, ComputationPlacer* computation_placer, int intra_op_parallelism_threads); Backend(const Backend&) = delete; Backend& operator=(const Backend&) = delete; - perftools::gputools::Platform* platform_; + se::Platform* platform_; Compiler* compiler_; TransferManager* transfer_manager_; ComputationPlacer* computation_placer_; // Vector of stream executors. stream_executors_[0] is the default executor. - std::vector stream_executors_; + std::vector stream_executors_; tensorflow::mutex mu_; // Mapping from stream executor to stream pools, used by `BorrowStream` above. - std::map> - stream_pools_ GUARDED_BY(mu_); + std::map> stream_pools_ GUARDED_BY(mu_); // The default memory allocator to use. 
std::unique_ptr memory_allocator_; diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc index fb70ea53157..c9f78a0f9f1 100644 --- a/tensorflow/compiler/xla/service/compile_only_service.cc +++ b/tensorflow/compiler/xla/service/compile_only_service.cc @@ -37,7 +37,7 @@ limitations under the License. namespace xla { /* static */ StatusOr> -CompileOnlyService::NewService(perftools::gputools::Platform* platform) { +CompileOnlyService::NewService(se::Platform* platform) { ServiceOptions default_options; default_options.set_platform(platform); return NewService(default_options); @@ -45,7 +45,7 @@ CompileOnlyService::NewService(perftools::gputools::Platform* platform) { /* static */ StatusOr> CompileOnlyService::NewService(const ServiceOptions& options) { - perftools::gputools::Platform* platform = options.platform(); + se::Platform* platform = options.platform(); if (platform == nullptr) { TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform()); } diff --git a/tensorflow/compiler/xla/service/compile_only_service.h b/tensorflow/compiler/xla/service/compile_only_service.h index dd8de42a0fc..c10609e67fc 100644 --- a/tensorflow/compiler/xla/service/compile_only_service.h +++ b/tensorflow/compiler/xla/service/compile_only_service.h @@ -34,7 +34,7 @@ class CompileOnlyService : public Service { // platform that the service should target. If platform is null then the // default platform is used. static StatusOr> NewService( - perftools::gputools::Platform* platform); + se::Platform* platform); static StatusOr> NewService( const ServiceOptions& options); diff --git a/tensorflow/compiler/xla/service/compiler.cc b/tensorflow/compiler/xla/service/compiler.cc index 0392d4af48a..8b01a6c4b50 100644 --- a/tensorflow/compiler/xla/service/compiler.cc +++ b/tensorflow/compiler/xla/service/compiler.cc @@ -23,26 +23,21 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" -namespace se = ::perftools::gputools; - namespace xla { /* static */ tensorflow::mutex Compiler::platform_compiler_mutex_( tensorflow::LINKER_INITIALIZED); -/* static */ std::map* +/* static */ std::map* Compiler::GetPlatformCompilerFactories() { - static auto* r = - new std::map; + static auto* r = new std::map; return r; } /* static */ -std::map>* +std::map>* Compiler::GetPlatformCompilers() { - static auto* r = new std::map>; + static auto* r = new std::map>; return r; } diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h index b4b53ae2ed4..5c14591d93c 100644 --- a/tensorflow/compiler/xla/service/compiler.h +++ b/tensorflow/compiler/xla/service/compiler.h @@ -70,7 +70,7 @@ class AotCompilationOptions { virtual ~AotCompilationOptions() = default; // Returns the ID of the platform to which these options apply. - virtual perftools::gputools::Platform::Id PlatformId() const = 0; + virtual se::Platform::Id PlatformId() const = 0; // Optional allocator that may be used for allocating temp space on the device // during compilation. @@ -109,7 +109,7 @@ class Compiler { virtual ~Compiler() {} // Returns the ID of the platform that this compiler targets. - virtual perftools::gputools::Platform::Id PlatformId() const = 0; + virtual se::Platform::Id PlatformId() const = 0; // Runs Hlo passes to optimize the given Hlo module, returns the optimized // module. 
@@ -120,8 +120,7 @@ class Compiler { // algorithm over those buffers, to see which variant is fastest. Any space // allocated should be deallocated before this function returns. virtual StatusOr> RunHloPasses( - std::unique_ptr module, - perftools::gputools::StreamExecutor* executor, + std::unique_ptr module, se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator) = 0; // Compiles the HLO module for execution on a device given by the executor, @@ -137,8 +136,7 @@ class Compiler { // // Use the overload below to compile computations that run in parallel. virtual StatusOr> RunBackend( - std::unique_ptr module, - perftools::gputools::StreamExecutor* executor, + std::unique_ptr module, se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator) = 0; // Compiles a set of HLO modules that can run in parallel, potentially @@ -151,8 +149,7 @@ class Compiler { // modules to RunHloPasses and RunBackends. virtual StatusOr>> Compile( std::vector> modules, - std::vector> - stream_exec, + std::vector> stream_exec, DeviceMemoryAllocator* device_allocator) = 0; // Compiles the HLO module for ahead-of-time execution. This is intended for @@ -171,14 +168,12 @@ class Compiler { // be a singleton, so no ownership is transferred. // // Precondition: a platform kind must not be registered more than once. - static void RegisterCompilerFactory( - perftools::gputools::Platform::Id platform_id, - CompilerFactory compiler_factory); + static void RegisterCompilerFactory(se::Platform::Id platform_id, + CompilerFactory compiler_factory); // Returns the compiler singleton pointer if it is available for the given // platform, or an error status if it is not. - static StatusOr GetForPlatform( - const perftools::gputools::Platform* platform); + static StatusOr GetForPlatform(const se::Platform* platform); // Returns a function that computes the size in bytes of the logical // buffer that contains a shape. @@ -198,12 +193,12 @@ class Compiler { static tensorflow::mutex platform_compiler_mutex_; // Map from platform kind to compiler factory. - static std::map* + static std::map* GetPlatformCompilerFactories(); // Map from platform kind to compiler instance, if we made one already (based // on the factories above). - static std::map>* + static std::map>* GetPlatformCompilers(); }; diff --git a/tensorflow/compiler/xla/service/computation_placer.cc b/tensorflow/compiler/xla/service/computation_placer.cc index 657fba6b623..7c1bacff92b 100644 --- a/tensorflow/compiler/xla/service/computation_placer.cc +++ b/tensorflow/compiler/xla/service/computation_placer.cc @@ -32,8 +32,6 @@ limitations under the License. 
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" -namespace se = ::perftools::gputools; - namespace xla { Status DeviceAssignment::Serialize(DeviceAssignmentProto* proto) const { @@ -132,11 +130,9 @@ StatusOr ComputationPlacer::AssignDevices( ComputationPlacer::platform_computation_placer_mutex_( tensorflow::LINKER_INITIALIZED); -/* static */ std::map* +/* static */ std::map* ComputationPlacer::GetPlatformComputationPlacers() { - static auto* r = - new std::map; + static auto* r = new std::map; return r; } @@ -147,10 +143,10 @@ static std::unique_ptr CreateComputationPlacer() { } static bool InitModule() { - xla::ComputationPlacer::RegisterComputationPlacer(se::host::kHostPlatformId, - &CreateComputationPlacer); - xla::ComputationPlacer::RegisterComputationPlacer(se::cuda::kCudaPlatformId, - &CreateComputationPlacer); + xla::ComputationPlacer::RegisterComputationPlacer( + stream_executor::host::kHostPlatformId, &CreateComputationPlacer); + xla::ComputationPlacer::RegisterComputationPlacer( + stream_executor::cuda::kCudaPlatformId, &CreateComputationPlacer); return true; } static bool module_initialized = InitModule(); diff --git a/tensorflow/compiler/xla/service/computation_placer.h b/tensorflow/compiler/xla/service/computation_placer.h index 737ccabaa7a..737d00e93ec 100644 --- a/tensorflow/compiler/xla/service/computation_placer.h +++ b/tensorflow/compiler/xla/service/computation_placer.h @@ -80,13 +80,13 @@ class ComputationPlacer { // Registers a computation placer creation function for a particular platform. static void RegisterComputationPlacer( - perftools::gputools::Platform::Id platform_id, + se::Platform::Id platform_id, ComputationPlacerCreationFunction creation_function); // Returns the computation placer singleton pointer if it is available for the // given platform, or an error status if it is not. static StatusOr GetForPlatform( - const perftools::gputools::Platform* platform); + const se::Platform* platform); private: // The mutex that guards the platform-to-computation placer map. @@ -101,10 +101,9 @@ class ComputationPlacer { }; // Map from platform kind to computation placer singleton. - static std::map* - GetPlatformComputationPlacers(); + static std::map* GetPlatformComputationPlacers(); - perftools::gputools::Platform::Id platform_id_; + se::Platform::Id platform_id_; TF_DISALLOW_COPY_AND_ASSIGN(ComputationPlacer); }; diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index e43777c5e5e..e8472fd36b3 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -100,8 +100,6 @@ limitations under the License. 
#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" -namespace se = ::perftools::gputools; - namespace xla { namespace cpu { @@ -440,8 +438,7 @@ Status VerifyLlvmModule(const llvm::Module& llvm_module) { } // namespace StatusOr> CpuCompiler::RunHloPasses( - std::unique_ptr module, - perftools::gputools::StreamExecutor* /*stream_exec*/, + std::unique_ptr module, se::StreamExecutor* /*stream_exec*/, DeviceMemoryAllocator* /*device_allocator*/) { VLOG(2) << "Before optimization:"; XLA_VLOG_LINES(2, module->ToString()); @@ -454,8 +451,7 @@ StatusOr> CpuCompiler::RunHloPasses( } StatusOr> CpuCompiler::RunBackend( - std::unique_ptr module, - perftools::gputools::StreamExecutor* stream_exec, + std::unique_ptr module, se::StreamExecutor* stream_exec, DeviceMemoryAllocator* /*device_allocator*/) { const string timer_message = "Compiling [" + module->name() + "] for CPU using JIT"; @@ -938,9 +934,9 @@ HloCostAnalysis::ShapeSizeFunction CpuCompiler::ShapeSizeBytesFunction() const { } // namespace xla static bool InitModule() { - xla::Compiler::RegisterCompilerFactory(se::host::kHostPlatformId, []() { - return xla::MakeUnique(); - }); + xla::Compiler::RegisterCompilerFactory( + stream_executor::host::kHostPlatformId, + []() { return xla::MakeUnique(); }); return true; } static bool module_initialized = InitModule(); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h index 3498139ab95..151af38438a 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h @@ -53,7 +53,7 @@ class CpuAotCompilationOptions : public AotCompilationOptions { RelocationModel relocation_model); ~CpuAotCompilationOptions() override; - perftools::gputools::Platform::Id PlatformId() const override; + se::Platform::Id PlatformId() const override; // The triple used for compilation, similar to clang's -target flag. const string& triple() const { return triple_; } @@ -112,25 +112,23 @@ class CpuCompiler : public LLVMCompiler { // Bring in // StatusOr>> Compile( // std::vector> modules, - // std::vector> + // std::vector> // stream_execs) using LLVMCompiler::Compile; StatusOr> RunHloPasses( - std::unique_ptr module, - perftools::gputools::StreamExecutor* stream_exec, + std::unique_ptr module, se::StreamExecutor* stream_exec, DeviceMemoryAllocator* device_allocator) override; StatusOr> RunBackend( - std::unique_ptr module, - perftools::gputools::StreamExecutor* stream_exec, + std::unique_ptr module, se::StreamExecutor* stream_exec, DeviceMemoryAllocator* device_allocator) override; StatusOr>> CompileAheadOfTime(std::vector> modules, const AotCompilationOptions& options) override; - perftools::gputools::Platform::Id PlatformId() const override; + se::Platform::Id PlatformId() const override; HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override; diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index c053703c352..aee62a4935e 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -45,8 +45,6 @@ limitations under the License. 
#include "tensorflow/core/platform/types.h" #include "tensorflow/stream_executor/host/host_stream.h" -namespace se = ::perftools::gputools; - namespace xla { namespace cpu { @@ -75,7 +73,7 @@ CpuExecutable::CpuExecutable( Status CpuExecutable::AllocateBuffers( DeviceMemoryAllocator* memory_allocator, int device_ordinal, - std::vector* buffers) { + std::vector* buffers) { CHECK_EQ(buffers->size(), assignment_->Allocations().size()); VLOG(3) << "Allocating " << assignment_->Allocations().size() << " allocations for module " << module().name(); @@ -247,8 +245,7 @@ static Status DeallocateTempBuffers( StatusOr> CpuExecutable::CreateResultShapedBuffer( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice - allocated_buffers, + tensorflow::gtl::ArraySlice allocated_buffers, std::vector* buffers_in_result) { se::Stream* stream = run_options->stream(); auto result_buffer = MakeUnique( @@ -322,7 +319,7 @@ StatusOr> CpuExecutable::ExecuteAsyncOnStream( "supported on CPU."); } - auto* host_stream = dynamic_cast( + auto* host_stream = dynamic_cast( run_options->stream()->implementation()); se::Stream* stream = run_options->stream(); DeviceMemoryAllocator* memory_allocator = run_options->allocator(); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h index d3502b3a03e..c3c2820c26c 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h @@ -90,17 +90,16 @@ class CpuExecutable : public Executable { // assignment. Each vector element corresponds to a particular Index. If // a vector element already contains a non-null DeviceMemoryBase, then no // buffer is assigned for this element. - Status AllocateBuffers( - DeviceMemoryAllocator* memory_allocator, int device_ordinal, - std::vector* buffers); + Status AllocateBuffers(DeviceMemoryAllocator* memory_allocator, + int device_ordinal, + std::vector* buffers); // Calls the generated function performing the computation with the given // arguments using the supplied buffers. Status ExecuteComputeFunction( const ExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, - tensorflow::gtl::ArraySlice - buffers, + tensorflow::gtl::ArraySlice buffers, HloExecutionProfile* hlo_execution_profile); // Create a ShapedBuffer for holding the result of the computation. The @@ -111,8 +110,7 @@ class CpuExecutable : public Executable { // the returned ShapedBuffer). StatusOr> CreateResultShapedBuffer( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice - allocated_buffers, + tensorflow::gtl::ArraySlice allocated_buffers, std::vector* buffers_in_result); // Returns the points-to set of the root instruction of the entry diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc index f5e61aef534..9b39e7f5765 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc @@ -34,8 +34,6 @@ limitations under the License. 
#include "tensorflow/core/platform/notification.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" -namespace se = ::perftools::gputools; - namespace xla { namespace { @@ -241,21 +239,20 @@ Status CpuTransferManager::TransferLiteralFromOutfeed( } StatusOr CpuTransferManager::TransferTupleBuffersFromOutfeed( - perftools::gputools::StreamExecutor* executor, + se::StreamExecutor* executor, tensorflow::gtl::ArraySlice> buffer_data) { return TransferBuffersFromOutfeedInternal(executor, buffer_data, /*is_tuple=*/true); } StatusOr CpuTransferManager::TransferArrayBufferFromOutfeed( - perftools::gputools::StreamExecutor* executor, void* destination, - int64 size_bytes) { + se::StreamExecutor* executor, void* destination, int64 size_bytes) { return TransferBuffersFromOutfeedInternal( executor, {{destination, size_bytes}}, /*is_tuple=*/false); } StatusOr CpuTransferManager::TransferBuffersFromOutfeedInternal( - perftools::gputools::StreamExecutor* executor, + se::StreamExecutor* executor, tensorflow::gtl::ArraySlice> buffer_data, bool is_tuple) { std::vector> buffers; @@ -306,8 +303,8 @@ static std::unique_ptr CreateCpuTransferManager() { } static bool InitModule() { - xla::TransferManager::RegisterTransferManager(se::host::kHostPlatformId, - &CreateCpuTransferManager); + xla::TransferManager::RegisterTransferManager( + stream_executor::host::kHostPlatformId, &CreateCpuTransferManager); return true; } static bool module_initialized = InitModule(); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h index 6c7524d9471..3ecb0d23649 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h @@ -37,36 +37,35 @@ class CpuTransferManager : public GenericTransferManager { CpuTransferManager(); ~CpuTransferManager() override {} - Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor, + Status TransferLiteralToInfeed(se::StreamExecutor* executor, const Literal& literal) override; - Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor, - int64 size, const void* source) override; - Status TransferLiteralFromOutfeed( - perftools::gputools::StreamExecutor* executor, const Shape& literal_shape, - Literal* literal) override; + Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size, + const void* source) override; + Status TransferLiteralFromOutfeed(se::StreamExecutor* executor, + const Shape& literal_shape, + Literal* literal) override; private: // Transfers infeed data to device. InfeedBuffer->Done() must be // called to clean up the memory allocated for InfeedBuffer. StatusOr TransferBufferToInfeedInternal( - perftools::gputools::StreamExecutor* executor, int64 size, - const void* source); + se::StreamExecutor* executor, int64 size, const void* source); // Helper that transfers a tuple of element buffers from the device's outfeed. StatusOr TransferTupleBuffersFromOutfeed( - perftools::gputools::StreamExecutor* executor, + se::StreamExecutor* executor, tensorflow::gtl::ArraySlice> buffer_data); // Helper that transfers an array buffer from the device's outfeed. 
- StatusOr TransferArrayBufferFromOutfeed( - perftools::gputools::StreamExecutor* executor, void* destination, - int64 size_bytes); + StatusOr TransferArrayBufferFromOutfeed(se::StreamExecutor* executor, + void* destination, + int64 size_bytes); // On success, returns the shape that was transferred from the outfeed -- if // is_tuple is true, the returned shape will be a tuple of the returned shapes // for the given buffers. StatusOr TransferBuffersFromOutfeedInternal( - perftools::gputools::StreamExecutor* executor, + se::StreamExecutor* executor, tensorflow::gtl::ArraySlice> buffer_data, bool is_tuple); diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc index 07a9f0efcb6..2d0f1d0be5f 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc @@ -49,8 +49,6 @@ limitations under the License. #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/types.h" -namespace se = ::perftools::gputools; - namespace xla { namespace cpu { @@ -325,7 +323,7 @@ const void** Executor::GetOperandBuffers(HloInstruction* instruction) { Status ParallelCpuExecutable::AllocateBuffers( DeviceMemoryAllocator* memory_allocator, int device_ordinal, - std::vector* buffers) { + std::vector* buffers) { CHECK_EQ(buffers->size(), assignment_->Allocations().size()); VLOG(3) << "Allocating " << assignment_->Allocations().size() << " allocations for module " << module().name(); diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h index 87c0a3df458..d87ba57a1e4 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h +++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h @@ -89,17 +89,16 @@ class ParallelCpuExecutable : public Executable { // assignment. Each vector element corresponds to a particular Index. If // a vector element already contains a non-null DeviceMemoryBase, then no // buffer is assigned for this element. - Status AllocateBuffers( - DeviceMemoryAllocator* memory_allocator, int device_ordinal, - std::vector* buffers); + Status AllocateBuffers(DeviceMemoryAllocator* memory_allocator, + int device_ordinal, + std::vector* buffers); // Calls the generated functions in 'function_names_', performing the // computation with the given arguments using the supplied buffers. Status ExecuteComputeFunctions( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, - tensorflow::gtl::ArraySlice - buffers, + tensorflow::gtl::ArraySlice buffers, HloExecutionProfile* hlo_execution_profile); // Returns the points-to set of the root instruction of the entry diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.cc b/tensorflow/compiler/xla/service/device_memory_allocator.cc index 78e7aa48acc..35db4fd2a22 100644 --- a/tensorflow/compiler/xla/service/device_memory_allocator.cc +++ b/tensorflow/compiler/xla/service/device_memory_allocator.cc @@ -24,19 +24,16 @@ limitations under the License. 
namespace xla { StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator( - const perftools::gputools::Platform* platform, - tensorflow::gtl::ArraySlice - stream_executors) + const se::Platform* platform, + tensorflow::gtl::ArraySlice stream_executors) : DeviceMemoryAllocator(platform), stream_executors_(stream_executors.begin(), stream_executors.end()) {} -StatusOr -StreamExecutorMemoryAllocator::Allocate(int device_ordinal, uint64 size, - bool retry_on_failure) { - TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * stream_executor, +StatusOr StreamExecutorMemoryAllocator::Allocate( + int device_ordinal, uint64 size, bool retry_on_failure) { + TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor, GetStreamExecutor(device_ordinal)); - perftools::gputools::DeviceMemoryBase result = - stream_executor->AllocateArray(size); + se::DeviceMemoryBase result = stream_executor->AllocateArray(size); if (size > 0 && result == nullptr) { return ResourceExhausted( "Failed to allocate request for %s (%lluB) on device ordinal %d", @@ -47,22 +44,22 @@ StreamExecutorMemoryAllocator::Allocate(int device_ordinal, uint64 size, } tensorflow::Status StreamExecutorMemoryAllocator::Deallocate( - int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) { + int device_ordinal, se::DeviceMemoryBase* mem) { if (!mem->is_null()) { - TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * stream_executor, + TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor, GetStreamExecutor(device_ordinal)); // We make a local copy of 'mem' so the original is not zeroed out by the // Deallocate() call below. This gives us a better chance of // catching double-free bugs, since Deallocate silently succeeds for null // values. - perftools::gputools::DeviceMemoryBase mem_copy(*mem); + se::DeviceMemoryBase mem_copy(*mem); stream_executor->Deallocate(&mem_copy); } return tensorflow::Status::OK(); } -StatusOr -StreamExecutorMemoryAllocator::GetStreamExecutor(int device_ordinal) { +StatusOr StreamExecutorMemoryAllocator::GetStreamExecutor( + int device_ordinal) { if (device_ordinal < 0) { return InvalidArgument("device ordinal value (%d) must be non-negative", device_ordinal); diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.h b/tensorflow/compiler/xla/service/device_memory_allocator.h index 39dfad84c1c..240acf89739 100644 --- a/tensorflow/compiler/xla/service/device_memory_allocator.h +++ b/tensorflow/compiler/xla/service/device_memory_allocator.h @@ -33,7 +33,7 @@ class DeviceMemoryAllocator { public: // Parameter platform indicates which platform the allocator allocates memory // on. Must be non-null. - explicit DeviceMemoryAllocator(const perftools::gputools::Platform* platform) + explicit DeviceMemoryAllocator(const se::Platform* platform) : platform_(platform) {} virtual ~DeviceMemoryAllocator() {} @@ -43,20 +43,20 @@ class DeviceMemoryAllocator { // has only performance impact. // Allocate() should return a null pointer for a size-0 allocation. // Deallocate() must be a no-op for null pointers. - virtual StatusOr Allocate( + virtual StatusOr Allocate( int device_ordinal, uint64 size, bool retry_on_failure = true) = 0; - virtual tensorflow::Status Deallocate( - int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) = 0; + virtual tensorflow::Status Deallocate(int device_ordinal, + se::DeviceMemoryBase* mem) = 0; // Return the platform that the allocator allocates memory on. 
- const perftools::gputools::Platform* platform() const { return platform_; } + const se::Platform* platform() const { return platform_; } // Can we call Deallocate() as soon as a computation has been scheduled on // a stream, or do we have to wait for the computation to complete first? virtual bool AllowsAsynchronousDeallocation() const = 0; protected: - const perftools::gputools::Platform* platform_; + const se::Platform* platform_; }; // Default memory allocator for a platform which uses @@ -64,25 +64,23 @@ class DeviceMemoryAllocator { class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator { public: StreamExecutorMemoryAllocator( - const perftools::gputools::Platform* platform, - tensorflow::gtl::ArraySlice - stream_executors); + const se::Platform* platform, + tensorflow::gtl::ArraySlice stream_executors); - StatusOr Allocate( + StatusOr Allocate( int device_ordinal, uint64 size, bool retry_on_failure = true) override; - tensorflow::Status Deallocate( - int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) override; + tensorflow::Status Deallocate(int device_ordinal, + se::DeviceMemoryBase* mem) override; bool AllowsAsynchronousDeallocation() const override; private: - StatusOr GetStreamExecutor( - int device_ordinal); + StatusOr GetStreamExecutor(int device_ordinal); // A vector indexed by device ordinal of StreamExecutors for each device of // the allocator's platform type. If an element is nullptr, then the device // with the respective device ordinal is not supported by XLA. - std::vector stream_executors_; + std::vector stream_executors_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc index 471d2fd6ceb..caa46686be1 100644 --- a/tensorflow/compiler/xla/service/executable.cc +++ b/tensorflow/compiler/xla/service/executable.cc @@ -61,10 +61,10 @@ Executable::ExecuteOnStreams( StatusOr> Executable::ExecuteOnStreamWrapper( const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile, ArraySlice arguments) { - perftools::gputools::Stream* stream = run_options->stream(); - std::unique_ptr timer; + se::Stream* stream = run_options->stream(); + std::unique_ptr timer; if (profile != nullptr) { - timer.reset(new perftools::gputools::Timer(stream->parent())); + timer.reset(new se::Timer(stream->parent())); stream->InitTimer(timer.get()).ThenStartTimer(timer.get()); } diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index a157235f8af..6f4cd99767f 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -90,7 +90,7 @@ class Executable { // has completed. virtual Status PopulateExecutionProfile( HloExecutionProfile* hlo_execution_profile, - perftools::gputools::StreamExecutor* executor) { + se::StreamExecutor* executor) { return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc index a99e2b7794a..ddb687314ee 100644 --- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc @@ -32,8 +32,6 @@ limitations under the License. 
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" -namespace se = ::perftools::gputools; - namespace xla { GenericTransferManager::GenericTransferManager(se::Platform::Id platform_id, @@ -45,9 +43,9 @@ se::Platform::Id GenericTransferManager::PlatformId() const { } Status GenericTransferManager::WriteSingleTupleIndexTable( - perftools::gputools::StreamExecutor* executor, + se::StreamExecutor* executor, tensorflow::gtl::ArraySlice elements, - const Shape& shape, perftools::gputools::DeviceMemoryBase* region) { + const Shape& shape, se::DeviceMemoryBase* region) { TF_RET_CHECK(elements.size() == ShapeUtil::TupleElementCount(shape)); std::vector element_pointers; @@ -144,20 +142,19 @@ Status GenericTransferManager::TransferLiteralToInfeed( } Status GenericTransferManager::TransferBufferToInfeed( - perftools::gputools::StreamExecutor* executor, int64 size, - const void* source) { + se::StreamExecutor* executor, int64 size, const void* source) { return Unimplemented("Generic transfer to Infeed"); } Status GenericTransferManager::TransferLiteralFromOutfeed( - perftools::gputools::StreamExecutor* executor, const Shape& literal_shape, + se::StreamExecutor* executor, const Shape& literal_shape, Literal* literal) { return Unimplemented( "Outfeed is not supported on this platform (b/30467474)"); } Status GenericTransferManager::ResetDevices( - tensorflow::gtl::ArraySlice + tensorflow::gtl::ArraySlice /*executors*/) { return Unimplemented( "Device reset is not yet supported on this platform (b/30481585)"); diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.h b/tensorflow/compiler/xla/service/generic_transfer_manager.h index 63a7c820cf4..0579099de40 100644 --- a/tensorflow/compiler/xla/service/generic_transfer_manager.h +++ b/tensorflow/compiler/xla/service/generic_transfer_manager.h @@ -36,46 +36,41 @@ namespace xla { // infeed. 
class GenericTransferManager : public TransferManager { public: - GenericTransferManager(perftools::gputools::Platform::Id platform_id, - size_t pointer_size); + GenericTransferManager(se::Platform::Id platform_id, size_t pointer_size); ~GenericTransferManager() override {} - perftools::gputools::Platform::Id PlatformId() const override; + se::Platform::Id PlatformId() const override; StatusOr> TransferLiteralFromDevice( - perftools::gputools::StreamExecutor* executor, - const ShapedBuffer& device_buffer) override; + se::StreamExecutor* executor, const ShapedBuffer& device_buffer) override; - Status TransferLiteralToDevice(perftools::gputools::StreamExecutor* executor, + Status TransferLiteralToDevice(se::StreamExecutor* executor, const Literal& literal, const ShapedBuffer& device_buffer) override; - Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor, + Status TransferLiteralToInfeed(se::StreamExecutor* executor, const Literal& literal) override; - Status TransferLiteralFromOutfeed( - perftools::gputools::StreamExecutor* executor, const Shape& literal_shape, - Literal* literal) override; + Status TransferLiteralFromOutfeed(se::StreamExecutor* executor, + const Shape& literal_shape, + Literal* literal) override; Status ResetDevices( - tensorflow::gtl::ArraySlice - executors) override; + tensorflow::gtl::ArraySlice executors) override; int64 GetByteSizeRequirement(const Shape& shape) const override; protected: - Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor, - int64 size, const void* source) override; + Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size, + const void* source) override; Status WriteSingleTupleIndexTable( - perftools::gputools::StreamExecutor* executor, - tensorflow::gtl::ArraySlice - elements, - const Shape& shape, - perftools::gputools::DeviceMemoryBase* region) override; + se::StreamExecutor* executor, + tensorflow::gtl::ArraySlice elements, + const Shape& shape, se::DeviceMemoryBase* region) override; private: // The platform this transfer manager targets. - const perftools::gputools::Platform::Id platform_id_; + const se::Platform::Id platform_id_; // The size in bytes of pointers on this platform. const size_t pointer_size_; diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc index 2029c303d47..837f05244f7 100644 --- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc +++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc @@ -28,8 +28,6 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" -namespace se = ::perftools::gputools; - namespace xla { namespace gpu { diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h index ea7f0eb3745..c2fc35be4ca 100644 --- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h +++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h @@ -41,7 +41,7 @@ class BufferAllocations { // user-specified result buffers) to the given buffer index. The builder // will skip allocating buffers for registered buffer indices. void RegisterBuffer(BufferAllocation::Index index, - perftools::gputools::DeviceMemoryBase address); + se::DeviceMemoryBase address); // Builds a BufferAllocations object from the given buffer assignment. // `memory_allocator` is what this function uses to allocate device memory. 
@@ -52,8 +52,7 @@ class BufferAllocations { DeviceMemoryAllocator* memory_allocator); private: - std::map - registered_buffers_; + std::map registered_buffers_; }; BufferAllocations(const BufferAllocations&) = delete; @@ -65,22 +64,20 @@ class BufferAllocations { // Returns the device address of buffer `buffer_index`. `buffer_index` must be // a valid index, i.e., in [0, buffer_count). This function returns null if // `buffer_index` is not assigned to a buffer address. - perftools::gputools::DeviceMemoryBase GetDeviceAddress( + se::DeviceMemoryBase GetDeviceAddress( BufferAllocation::Index buffer_index) const; // Same as above, but also adjusts the returned address for the offset and // size contained in the given slice. - perftools::gputools::DeviceMemoryBase GetDeviceAddress( + se::DeviceMemoryBase GetDeviceAddress( const BufferAllocation::Slice& buffer_slice) const; - perftools::gputools::DeviceMemoryBase GetTempBufferBase() const { - return temp_buffer_base_; - } + se::DeviceMemoryBase GetTempBufferBase() const { return temp_buffer_base_; } // Tears down all buffers allocated by this object that are not in // `live_addresses`. tensorflow::Status TearDown( - const std::set& live_addresses, + const std::set& live_addresses, const BufferAssignment& buffer_assignment); private: @@ -92,15 +89,15 @@ class BufferAllocations { // Sets the device address of buffer `buffer_index`. void SetBuffer(BufferAllocation::Index buffer_index, - perftools::gputools::DeviceMemoryBase buffer); + se::DeviceMemoryBase buffer); // An array of device pointers that stores the address of each buffer // indexed by Index. Each element can point to a temporary buffer, an // input buffer, or nullptr if no buffer is needed for that Index. - std::vector buffers_; + std::vector buffers_; // The base address of the memory block that contains all temporary buffers. - perftools::gputools::DeviceMemoryBase temp_buffer_base_; + se::DeviceMemoryBase temp_buffer_base_; int device_ordinal_; diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc index 790ca535b11..dce8de2e301 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc @@ -42,11 +42,10 @@ Status ConditionalThunk::Initialize(const GpuExecutable& executable) { } Status ConditionalThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) { + const BufferAllocations& buffer_allocations, se::Stream* stream) { // Copy the predicate value from device. 
bool predicate; - perftools::gputools::DeviceMemoryBase predicate_address = + se::DeviceMemoryBase predicate_address = buffer_allocations.GetDeviceAddress(predicate_buffer_index_); stream->ThenMemcpy(&predicate, predicate_address, sizeof(bool)); diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h index 7725c46a3b4..e40872688fd 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h @@ -49,7 +49,7 @@ class ConditionalThunk : public Thunk { Status Initialize(const GpuExecutable& executable) override; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + se::Stream* stream) override; private: BufferAllocation::Slice predicate_buffer_index_; diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc index 461747b699b..64d3b84b8c7 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc @@ -25,8 +25,6 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" -namespace se = ::perftools::gputools; - namespace xla { namespace gpu { diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h index 900d9cb6243..6d845025b1a 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h @@ -66,23 +66,21 @@ class ConvolutionThunk : public Thunk { // Does the convolution for the thunk on "stream". 
Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + se::Stream* stream) override; private: class ScratchAllocator; - Status Convolve( - const perftools::gputools::dnn::BatchDescriptor& input_descriptor, - perftools::gputools::DeviceMemory input_data, - const perftools::gputools::dnn::FilterDescriptor& filter_descriptor, - perftools::gputools::DeviceMemory filter_data, - const perftools::gputools::dnn::BatchDescriptor& output_descriptor, - perftools::gputools::DeviceMemory output_data, - const perftools::gputools::dnn::ConvolutionDescriptor& - convolution_descriptor, - const perftools::gputools::dnn::AlgorithmConfig& algorithm_config, - perftools::gputools::Stream* stream, ScratchAllocator* scratch_allocator, - perftools::gputools::dnn::ProfileResult* profile_result); + Status Convolve(const se::dnn::BatchDescriptor& input_descriptor, + se::DeviceMemory input_data, + const se::dnn::FilterDescriptor& filter_descriptor, + se::DeviceMemory filter_data, + const se::dnn::BatchDescriptor& output_descriptor, + se::DeviceMemory output_data, + const se::dnn::ConvolutionDescriptor& convolution_descriptor, + const se::dnn::AlgorithmConfig& algorithm_config, + se::Stream* stream, ScratchAllocator* scratch_allocator, + se::dnn::ProfileResult* profile_result); const CudnnConvKind convolution_kind_; diff --git a/tensorflow/compiler/xla/service/gpu/copy_thunk.cc b/tensorflow/compiler/xla/service/gpu/copy_thunk.cc index f4498663b1c..bf912fbd14d 100644 --- a/tensorflow/compiler/xla/service/gpu/copy_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/copy_thunk.cc @@ -30,9 +30,8 @@ HostToDeviceCopyThunk::HostToDeviceCopyThunk( mem_size_(mem_size) {} tensorflow::Status HostToDeviceCopyThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) { - perftools::gputools::DeviceMemoryBase destination_data = + const BufferAllocations& buffer_allocations, se::Stream* stream) { + se::DeviceMemoryBase destination_data = buffer_allocations.GetDeviceAddress(destination_buffer_); stream->ThenMemcpy(&destination_data, source_address_, mem_size_); return tensorflow::Status::OK(); @@ -48,11 +47,10 @@ DeviceToDeviceCopyThunk::DeviceToDeviceCopyThunk( mem_size_(mem_size) {} tensorflow::Status DeviceToDeviceCopyThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) { - perftools::gputools::DeviceMemoryBase destination_data = + const BufferAllocations& buffer_allocations, se::Stream* stream) { + se::DeviceMemoryBase destination_data = buffer_allocations.GetDeviceAddress(destination_buffer_); - perftools::gputools::DeviceMemoryBase source_data = + se::DeviceMemoryBase source_data = buffer_allocations.GetDeviceAddress(source_buffer_); stream->ThenMemcpy(&destination_data, source_data, mem_size_); return tensorflow::Status::OK(); diff --git a/tensorflow/compiler/xla/service/gpu/copy_thunk.h b/tensorflow/compiler/xla/service/gpu/copy_thunk.h index e2783fd2552..2e7eb5f3445 100644 --- a/tensorflow/compiler/xla/service/gpu/copy_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/copy_thunk.h @@ -40,8 +40,7 @@ class HostToDeviceCopyThunk : public Thunk { HostToDeviceCopyThunk& operator=(const HostToDeviceCopyThunk&) = delete; tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + const BufferAllocations& buffer_allocations, se::Stream* stream) override; private: const void* source_address_; 
@@ -64,8 +63,7 @@ class DeviceToDeviceCopyThunk : public Thunk { DeviceToDeviceCopyThunk& operator=(const DeviceToDeviceCopyThunk&) = delete; tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + const BufferAllocations& buffer_allocations, se::Stream* stream) override; private: const BufferAllocation::Slice source_buffer_; diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc index 58d9c8caff3..68099fd6384 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc @@ -28,7 +28,6 @@ limitations under the License. namespace xla { namespace gpu { -namespace se = ::perftools::gputools; namespace dnn = se::dnn; static std::pair> - AllocateBytes(perftools::gputools::Stream* stream, int64 byte_size) override; + se::port::StatusOr> AllocateBytes( + se::Stream* stream, int64 byte_size) override; private: const int device_ordinal_; DeviceMemoryAllocator* memory_allocator_; - std::vector allocated_buffers_; + std::vector allocated_buffers_; int64 total_allocated_bytes_ = 0; }; @@ -74,16 +74,15 @@ class FftThunk : public Thunk { // Does the FFT for the thunk on "stream". tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + const BufferAllocations& buffer_allocations, se::Stream* stream) override; private: - const perftools::gputools::fft::Type fft_type_; + const se::fft::Type fft_type_; const std::vector fft_length_; float scale_factor_; - std::unique_ptr fft_plan_; + std::unique_ptr fft_plan_; const BufferAllocation::Slice input_buffer_; const BufferAllocation::Slice output_buffer_; diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.cc b/tensorflow/compiler/xla/service/gpu/for_thunk.cc index 283d21ca222..6e6966df398 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.cc @@ -36,8 +36,7 @@ tensorflow::Status ForThunk::Initialize(const GpuExecutable& executable) { } tensorflow::Status ForThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) { + const BufferAllocations& buffer_allocations, se::Stream* stream) { for (int64 i = 0; i < loop_limit_; ++i) { // Invoke loop body thunk sequence. TF_RETURN_IF_ERROR( diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.h b/tensorflow/compiler/xla/service/gpu/for_thunk.h index 832494d17e9..c78d1c50686 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.h @@ -38,8 +38,7 @@ class ForThunk : public Thunk { tensorflow::Status Initialize(const GpuExecutable& executable) override; tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + const BufferAllocations& buffer_allocations, se::Stream* stream) override; private: const int64 loop_limit_; diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc index 38668ff455a..0ec12f52d8b 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc @@ -22,8 +22,6 @@ limitations under the License. 
#include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" -namespace se = ::perftools::gputools; - namespace xla { namespace gpu { diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h index df3edcefef8..a18f425bc38 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h @@ -50,14 +50,12 @@ class GemmThunk : public Thunk { // Does the gemm operation for the thunk on "stream", which must be non-null. tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + const BufferAllocations& buffer_allocations, se::Stream* stream) override; // Returns true if we'll perform autotuning if run on the given stream. If // so, we want the GPU to be quiescent during autotuning, so as not to // introduce noise in our results. - bool ShouldHaltAllActivityBeforeRunning( - perftools::gputools::Stream* stream) override { + bool ShouldHaltAllActivityBeforeRunning(se::Stream* stream) override { return autotune_results_.count( stream->parent()->GetDeviceDescription().name()) != 0; } @@ -79,8 +77,7 @@ class GemmThunk : public Thunk { // results. The map's value is the best algorithm we've found for this thunk // on this device, or an error if none of the algorithms worked and we should // use the regular gemm without an algorithm. - std::unordered_map> + std::unordered_map> autotune_results_; }; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index 07be2a0cf90..30bfc9351a5 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -91,8 +91,6 @@ limitations under the License. 
#include "tensorflow/core/platform/tracing.h" #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" -namespace se = ::perftools::gputools; - namespace xla { namespace gpu { @@ -779,9 +777,9 @@ se::Platform::Id GpuCompiler::PlatformId() const { } // namespace xla static bool InitModule() { - xla::Compiler::RegisterCompilerFactory(se::cuda::kCudaPlatformId, []() { - return xla::MakeUnique(); - }); + xla::Compiler::RegisterCompilerFactory( + stream_executor::cuda::kCudaPlatformId, + []() { return xla::MakeUnique(); }); return true; } static bool module_initialized = InitModule(); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h index c352d4d8462..f3b02ae5d88 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h @@ -45,25 +45,23 @@ class GpuCompiler : public LLVMCompiler { // Bring in // StatusOr>> Compile( // std::vector> modules, - // std::vector> + // std::vector> // stream_execs) using LLVMCompiler::Compile; StatusOr> RunHloPasses( - std::unique_ptr module, - perftools::gputools::StreamExecutor* stream_exec, + std::unique_ptr module, se::StreamExecutor* stream_exec, DeviceMemoryAllocator* device_allocator) override; StatusOr> RunBackend( - std::unique_ptr module, - perftools::gputools::StreamExecutor* stream_exec, + std::unique_ptr module, se::StreamExecutor* stream_exec, DeviceMemoryAllocator* device_allocator) override; StatusOr>> CompileAheadOfTime(std::vector> module, AotCompilationOptions const& options) override; - perftools::gputools::Platform::Id PlatformId() const override; + se::Platform::Id PlatformId() const override; HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override { // Capture just the pointer size, not the entire GpuCompiler object. diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 28f93447953..5676d4de8e3 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -34,8 +34,6 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" -namespace se = ::perftools::gputools; - namespace xla { namespace gpu { namespace { @@ -324,7 +322,7 @@ StatusOr> GpuExecutable::ExecuteOnStream( this->assignment_->GetUniqueSlice(src_hlo, sources[0]->index())); CHECK(!slice.allocation()->is_entry_computation_parameter()); - perftools::gputools::DeviceMemoryBase src_base = + se::DeviceMemoryBase src_base = buffer_allocations->GetDeviceAddress(slice.index()); CHECK(!src_base.is_null() || src_base.size() == 0); *device_memory = src_base; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc index af9897769fd..f13727ca9b6 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc @@ -33,8 +33,6 @@ limitations under the License. 
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" -namespace se = ::perftools::gputools; - namespace xla { // TODO(b/30467474) Once GPU infeed implementation settles, consider @@ -153,8 +151,8 @@ static std::unique_ptr CreateGpuTransferManager() { } static bool InitModule() { - xla::TransferManager::RegisterTransferManager(se::cuda::kCudaPlatformId, - &CreateGpuTransferManager); + xla::TransferManager::RegisterTransferManager( + stream_executor::cuda::kCudaPlatformId, &CreateGpuTransferManager); return true; } static bool module_initialized = InitModule(); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h index 9aa369c6683..d040a999752 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h @@ -36,21 +36,20 @@ class GpuTransferManager : public GenericTransferManager { GpuTransferManager(); ~GpuTransferManager() override {} - Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor, + Status TransferLiteralToInfeed(se::StreamExecutor* executor, const Literal& literal) override; - Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor, - int64 size, const void* source) override; + Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size, + const void* source) override; private: // Initiates the infeed data transfers. InfeedBuffer->Done() must be // called to clean up the memory allocated for InfeedBuffer. StatusOr TransferBufferToInfeedInternal( - perftools::gputools::StreamExecutor* executor, int64 size, - const void* source); + se::StreamExecutor* executor, int64 size, const void* source); // Enqueues infeed data buffers with the infeed manager after their // transfer completes. - Status EnqueueBuffersToInfeed(perftools::gputools::StreamExecutor* executor, + Status EnqueueBuffersToInfeed(se::StreamExecutor* executor, std::vector buffers); TF_DISALLOW_COPY_AND_ASSIGN(GpuTransferManager); diff --git a/tensorflow/compiler/xla/service/gpu/infeed_manager.cc b/tensorflow/compiler/xla/service/gpu/infeed_manager.cc index ee5b447c9cd..3ddc1c0789d 100644 --- a/tensorflow/compiler/xla/service/gpu/infeed_manager.cc +++ b/tensorflow/compiler/xla/service/gpu/infeed_manager.cc @@ -19,8 +19,6 @@ limitations under the License. #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/core/platform/logging.h" -namespace se = ::perftools::gputools; - namespace xla { namespace gpu { diff --git a/tensorflow/compiler/xla/service/gpu/infeed_manager.h b/tensorflow/compiler/xla/service/gpu/infeed_manager.h index 73d5a5ce354..d5f2216d460 100644 --- a/tensorflow/compiler/xla/service/gpu/infeed_manager.h +++ b/tensorflow/compiler/xla/service/gpu/infeed_manager.h @@ -46,7 +46,7 @@ namespace gpu { // the client. The client manages the memory of the buffer. class InfeedBuffer { public: - InfeedBuffer(perftools::gputools::StreamExecutor* executor, int64 length) + InfeedBuffer(se::StreamExecutor* executor, int64 length) : executor_(executor), length_(length) { device_memory_ = executor_->AllocateArray(length); CHECK(!device_memory_.is_null()); @@ -60,14 +60,12 @@ class InfeedBuffer { // client to manage memory for the infeed buffers. 
void Done() { delete this; } - perftools::gputools::DeviceMemoryBase* device_memory() { - return &device_memory_; - } + se::DeviceMemoryBase* device_memory() { return &device_memory_; } private: - perftools::gputools::StreamExecutor* executor_; // Not owned. + se::StreamExecutor* executor_; // Not owned. const int64 length_; - perftools::gputools::DeviceMemoryBase device_memory_; + se::DeviceMemoryBase device_memory_; }; // Client-side class used to enqueue infeed buffers. @@ -100,8 +98,7 @@ class InfeedManager { // new stream on the first invocation. On subsequent invocations, if // the cached executor is not the same as the requested executor, // returns null. - perftools::gputools::Stream* GetStream( - perftools::gputools::StreamExecutor* executor); + se::Stream* GetStream(se::StreamExecutor* executor); private: // TODO(b/30467474): Revisit if this mutex becomes a point of @@ -121,10 +118,10 @@ class InfeedManager { tensorflow::gtl::FlatSet dequeued_buffer_; // Cached host to device stream for queuing infeed data. - std::unique_ptr host_to_device_stream_; + std::unique_ptr host_to_device_stream_; // Executor that the host_to_device_stream belongs to. Not owned. - perftools::gputools::StreamExecutor* host_to_device_executor_; + se::StreamExecutor* host_to_device_executor_; }; // Singleton creator-or-accessor: Returns the GPU infeed manager. diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc index 2ac95ceb692..ea34d5b30c9 100644 --- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc @@ -31,10 +31,10 @@ InfeedThunk::InfeedThunk( destination_buffer_(destination_buffer) {} Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) { + se::Stream* stream) { VLOG(2) << "Infeeding to GPU "; - perftools::gputools::DeviceMemoryBase destination_address = + se::DeviceMemoryBase destination_address = buffer_allocations.GetDeviceAddress(destination_buffer_); InfeedManager* infeed_manager = GetOrCreateInfeedManager(); @@ -45,7 +45,7 @@ Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, std::vector tuple_element_addresses; for (BufferAllocation::Slice tuple_element_buffer : tuple_element_buffers_) { - perftools::gputools::DeviceMemoryBase tuple_element_address = + se::DeviceMemoryBase tuple_element_address = buffer_allocations.GetDeviceAddress(tuple_element_buffer); InfeedBuffer* buffer = infeed_manager->BlockingDequeueBuffer(); diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h index 86918705fa0..93713cb12de 100644 --- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h @@ -44,7 +44,7 @@ class InfeedThunk : public Thunk { InfeedThunk& operator=(const InfeedThunk&) = delete; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + se::Stream* stream) override; private: const std::vector tuple_element_buffers_; diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h index 3790ed313b9..a78b4ff8307 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h @@ -32,7 +32,7 @@ class IrEmitterContext { public: IrEmitterContext(const HloModule* hlo_module, const 
BufferAssignment* buffer_assignment, - const perftools::gputools::DeviceDescription* device_desc, + const se::DeviceDescription* device_desc, llvm::Module* llvm_module) : hlo_module_(hlo_module), buffer_assignment_(buffer_assignment), @@ -47,7 +47,7 @@ class IrEmitterContext { const BufferAssignment& buffer_assignment() const { return *buffer_assignment_; } - const perftools::gputools::DeviceDescription& device_description() const { + const se::DeviceDescription& device_description() const { return *device_desc_; } llvm::Module* llvm_module() { return llvm_module_; } @@ -56,7 +56,7 @@ class IrEmitterContext { private: const HloModule* hlo_module_; const BufferAssignment* buffer_assignment_; - const perftools::gputools::DeviceDescription* device_desc_; + const se::DeviceDescription* device_desc_; llvm::Module* llvm_module_; NameUniquer name_uniquer_; }; diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc index c24dc1457f8..d376ef7a245 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc @@ -23,8 +23,6 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" -namespace se = ::perftools::gputools; - namespace xla { namespace gpu { diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h index df8971b083f..b556befe66b 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h @@ -61,8 +61,7 @@ class KernelThunk : public Thunk { // Executes the kernel for the thunk on "stream", which must be non-null. tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + const BufferAllocations& buffer_allocations, se::Stream* stream) override; private: // Buffers passed to the kernel as arguments. @@ -82,13 +81,11 @@ class KernelThunk : public Thunk { // Describes how to load this kernel. ExecuteOnStream reuses this loader // specification for all executions. mutable tensorflow::mutex mutex_; - std::unique_ptr loader_spec_ - GUARDED_BY(mutex_); + std::unique_ptr loader_spec_ GUARDED_BY(mutex_); // Loaded kernels for each `StreamExecutor` - std::unordered_map - kernel_cache_ GUARDED_BY(mutex_); + std::unordered_map kernel_cache_ + GUARDED_BY(mutex_); }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/memset_thunk.cc b/tensorflow/compiler/xla/service/gpu/memset_thunk.cc index 18e673542c5..d4100a898b5 100644 --- a/tensorflow/compiler/xla/service/gpu/memset_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/memset_thunk.cc @@ -19,8 +19,6 @@ limitations under the License. 
namespace xla { namespace gpu { -namespace se = ::perftools::gputools; - Status MemzeroThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream) { se::DeviceMemoryBase dest_data = buffer_allocations.GetDeviceAddress(dest_); diff --git a/tensorflow/compiler/xla/service/gpu/memset_thunk.h b/tensorflow/compiler/xla/service/gpu/memset_thunk.h index b4bb74d1dd6..51c332d287d 100644 --- a/tensorflow/compiler/xla/service/gpu/memset_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/memset_thunk.h @@ -36,7 +36,7 @@ class MemzeroThunk : public Thunk { : Thunk(Kind::kMemzero, hlo), dest_(dest) {} Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + se::Stream* stream) override; private: const BufferAllocation::Slice dest_; @@ -52,7 +52,7 @@ class Memset32BitValueThunk : public Thunk { : Thunk(Kind::kMemset32BitValue, hlo), value_(value), dest_(dest) {} Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + se::Stream* stream) override; private: uint32 value_; diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc index 5283d51cd10..d3fd0544fb6 100644 --- a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc +++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc @@ -29,8 +29,6 @@ limitations under the License. #include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" -namespace se = ::perftools::gputools; - namespace xla { namespace gpu { diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.h b/tensorflow/compiler/xla/service/gpu/partition_assignment.h index 42d2d2af2e3..c125474edb1 100644 --- a/tensorflow/compiler/xla/service/gpu/partition_assignment.h +++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.h @@ -57,8 +57,7 @@ std::ostream& operator<<(std::ostream& out, const LaunchDimensions& launch_dims); LaunchDimensions CalculateLaunchDimensions( - const Shape& shape, - const perftools::gputools::DeviceDescription& device_desc, + const Shape& shape, const se::DeviceDescription& device_desc, int unroll_factor = 1); } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc index d8a43091d40..c8510808f10 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc @@ -33,8 +33,7 @@ tensorflow::Status SequentialThunk::Initialize( } tensorflow::Status SequentialThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) { + const BufferAllocations& buffer_allocations, se::Stream* stream) { for (const auto& thunk : thunks_) { TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream)); } diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h index 32c5b748aba..df17b8d67b8 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h @@ -40,8 +40,7 @@ class SequentialThunk : public Thunk { tensorflow::Status Initialize(const GpuExecutable& executable) override; tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + const BufferAllocations& 
buffer_allocations, se::Stream* stream) override; private: // The list of sub-thunks. diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h index 9eea958d121..a0c785ed913 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk.h +++ b/tensorflow/compiler/xla/service/gpu/thunk.h @@ -85,8 +85,7 @@ class Thunk { // This value is not required to be constant for a given Thunk. For example, // a Thunk that performs autotuning may return true for its first run and // false thereafter. - virtual bool ShouldHaltAllActivityBeforeRunning( - perftools::gputools::Stream* /*stream*/) { + virtual bool ShouldHaltAllActivityBeforeRunning(se::Stream* /*stream*/) { return false; } @@ -104,8 +103,7 @@ class Thunk { // called after Initialize and can be called multiple times over Thunk's // lifetime. Stream argument must be non-null. virtual tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) = 0; + const BufferAllocations& buffer_allocations, se::Stream* stream) = 0; private: Kind kind_; diff --git a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc index bd65e72393a..ecb54857ccc 100644 --- a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc @@ -17,8 +17,6 @@ limitations under the License. #include "tensorflow/compiler/xla/util.h" -namespace se = ::perftools::gputools; - namespace xla { namespace gpu { diff --git a/tensorflow/compiler/xla/service/gpu/tuple_thunk.h b/tensorflow/compiler/xla/service/gpu/tuple_thunk.h index 3b1a4963285..8b459c29a13 100644 --- a/tensorflow/compiler/xla/service/gpu/tuple_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/tuple_thunk.h @@ -46,8 +46,7 @@ class TupleThunk : public Thunk { TupleThunk& operator=(const TupleThunk&) = delete; tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + const BufferAllocations& buffer_allocations, se::Stream* stream) override; private: const std::vector tuple_element_buffers_; diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc index c21559af6d2..a9f3d619a3f 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc @@ -41,8 +41,8 @@ Status WhileThunk::Initialize(const GpuExecutable& executable) { } Status WhileThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) { - perftools::gputools::DeviceMemoryBase condition_result_data = + se::Stream* stream) { + se::DeviceMemoryBase condition_result_data = buffer_allocations.GetDeviceAddress(condition_result_buffer_index_); while (true) { diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.h b/tensorflow/compiler/xla/service/gpu/while_thunk.h index 4c9f45de9e4..e589ca78a7e 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.h @@ -47,7 +47,7 @@ class WhileThunk : public Thunk { Status Initialize(const GpuExecutable& executable) override; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + se::Stream* stream) override; private: const BufferAllocation::Slice condition_result_buffer_index_; diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h 
b/tensorflow/compiler/xla/service/hlo_execution_profile.h index 6fb91b9bef9..be989846ef5 100644 --- a/tensorflow/compiler/xla/service/hlo_execution_profile.h +++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h @@ -88,7 +88,7 @@ std::unique_ptr CreateHloProfilePrinterData( // down how much time each HLO took. class HloExecutionProfile { public: - using DeviceDescription = perftools::gputools::DeviceDescription; + using DeviceDescription = se::DeviceDescription; HloExecutionProfile(const HloProfilePrinterData* hlo_profile_printer_data, const HloProfileIndexMap* hlo_profile_index_map); diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc index 2e834a79d9f..171477299e4 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.cc +++ b/tensorflow/compiler/xla/service/hlo_runner.cc @@ -30,8 +30,6 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" -namespace se = ::perftools::gputools; - namespace xla { /*static*/ StatusOr> diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h index f54fb44766e..53f7c6fe4a0 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.h +++ b/tensorflow/compiler/xla/service/hlo_runner.h @@ -80,7 +80,7 @@ class HloRunner { bool run_hlo_passes = false; }; - explicit HloRunner(::perftools::gputools::Platform* platform); + explicit HloRunner(se::Platform* platform); ~HloRunner(); @@ -149,8 +149,7 @@ class HloRunner { // will be used to configure the replication parameters. Replicated executions // should pass the device_assignment parameter. ServiceExecutableRunOptions GetServiceRunOptionsForDevice( - int64 device, ::perftools::gputools::Stream* stream, - DeviceAssignment* device_assignment); + int64 device, se::Stream* stream, DeviceAssignment* device_assignment); std::unique_ptr backend_; }; diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc index 5b9bf5faf36..76b3ecad26f 100644 --- a/tensorflow/compiler/xla/service/interpreter/compiler.cc +++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc @@ -41,9 +41,6 @@ limitations under the License. 
namespace xla { namespace interpreter { -namespace se = ::perftools::gputools; -namespace sep = ::perftools::gputools::interpreter; - Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) { HloPassPipeline pipeline("Interpreter"); @@ -96,7 +93,7 @@ InterpreterCompiler::CompileAheadOfTime( } se::Platform::Id InterpreterCompiler::PlatformId() const { - return sep::kXlaInterpreterPlatformId; + return se::interpreter::kXlaInterpreterPlatformId; } HloCostAnalysis::ShapeSizeFunction InterpreterCompiler::ShapeSizeBytesFunction() @@ -109,11 +106,12 @@ static std::unique_ptr CreateComputationPlacer() { } static bool InitModule() { - xla::Compiler::RegisterCompilerFactory(sep::kXlaInterpreterPlatformId, []() { - return xla::MakeUnique(); - }); + xla::Compiler::RegisterCompilerFactory( + se::interpreter::kXlaInterpreterPlatformId, []() { + return xla::MakeUnique(); + }); xla::ComputationPlacer::RegisterComputationPlacer( - sep::kXlaInterpreterPlatformId, &CreateComputationPlacer); + se::interpreter::kXlaInterpreterPlatformId, &CreateComputationPlacer); return true; } diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.h b/tensorflow/compiler/xla/service/interpreter/compiler.h index c8660c04d86..e90ae3e8185 100644 --- a/tensorflow/compiler/xla/service/interpreter/compiler.h +++ b/tensorflow/compiler/xla/service/interpreter/compiler.h @@ -44,19 +44,16 @@ class InterpreterCompiler : public Compiler { ~InterpreterCompiler() override {} StatusOr> RunHloPasses( - std::unique_ptr hlo_module, - perftools::gputools::StreamExecutor* stream_exec, + std::unique_ptr hlo_module, se::StreamExecutor* stream_exec, DeviceMemoryAllocator* device_allocator) override; StatusOr> RunBackend( - std::unique_ptr hlo_module, - perftools::gputools::StreamExecutor* stream_exec, + std::unique_ptr hlo_module, se::StreamExecutor* stream_exec, DeviceMemoryAllocator* device_allocator) override; StatusOr>> Compile( std::vector> hlo_modules, - std::vector> - stream_exec, + std::vector> stream_exec, DeviceMemoryAllocator* device_allocator) override; StatusOr>> @@ -65,7 +62,7 @@ class InterpreterCompiler : public Compiler { HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override; - perftools::gputools::Platform::Id PlatformId() const override; + se::Platform::Id PlatformId() const override; private: Status RunHloOptimization(HloModule* hlo_module); diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc index 883063d0f07..acfa79ea750 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.cc +++ b/tensorflow/compiler/xla/service/interpreter/executable.cc @@ -38,8 +38,6 @@ limitations under the License. namespace xla { namespace interpreter { -namespace se = ::perftools::gputools; - InterpreterExecutable::InterpreterExecutable( std::unique_ptr hlo_module) : Executable(std::move(hlo_module), /*hlo_profile_printer=*/nullptr, diff --git a/tensorflow/compiler/xla/service/interpreter/executor.cc b/tensorflow/compiler/xla/service/interpreter/executor.cc index 3caf9e7b82b..97e9fa2c8e8 100644 --- a/tensorflow/compiler/xla/service/interpreter/executor.cc +++ b/tensorflow/compiler/xla/service/interpreter/executor.cc @@ -19,8 +19,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/status_macros.h" -namespace perftools { -namespace gputools { +namespace stream_executor { namespace interpreter { host::HostStream *AsExecutorStream(Stream *stream) { @@ -119,5 +118,4 @@ DeviceDescription *XlaInterpreterExecutor::PopulateDeviceDescription() const { } } // namespace interpreter -} // namespace gputools -} // namespace perftools +} // namespace stream_executor diff --git a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h index 77426b0820d..9b109022fbf 100644 --- a/tensorflow/compiler/xla/service/interpreter/executor.h +++ b/tensorflow/compiler/xla/service/interpreter/executor.h @@ -44,8 +44,7 @@ limitations under the License. #include "tensorflow/stream_executor/stream_executor_internal.h" #include "tensorflow/stream_executor/timer.h" -namespace perftools { -namespace gputools { +namespace stream_executor { namespace interpreter { using Args = tensorflow::gtl::ArraySlice; @@ -213,7 +212,6 @@ class XlaInterpreterExecutor : public internal::StreamExecutorInterface { }; } // namespace interpreter -} // namespace gputools -} // namespace perftools +} // namespace stream_executor #endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTOR_H_ diff --git a/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc b/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc index 3cf8506d1c4..d27cd7502f1 100644 --- a/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc @@ -21,12 +21,10 @@ limitations under the License. #include "tensorflow/compiler/xla/service/interpreter/platform_id.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" -namespace sei = ::perftools::gputools::interpreter; - namespace xla { InterpreterTransferManager::InterpreterTransferManager() - : GenericTransferManager(sei::kXlaInterpreterPlatformId, + : GenericTransferManager(se::interpreter::kXlaInterpreterPlatformId, /*pointer_size=*/sizeof(void*)) {} } // namespace xla @@ -38,7 +36,8 @@ CreateInterpreterTransferManager() { static bool InitModule() { xla::TransferManager::RegisterTransferManager( - sei::kXlaInterpreterPlatformId, &CreateInterpreterTransferManager); + stream_executor::interpreter::kXlaInterpreterPlatformId, + &CreateInterpreterTransferManager); return true; } diff --git a/tensorflow/compiler/xla/service/interpreter/platform.cc b/tensorflow/compiler/xla/service/interpreter/platform.cc index 015e00e1e8e..ce2f4d378c0 100644 --- a/tensorflow/compiler/xla/service/interpreter/platform.cc +++ b/tensorflow/compiler/xla/service/interpreter/platform.cc @@ -28,11 +28,7 @@ limitations under the License. 
#include "tensorflow/stream_executor/multi_platform_manager.h" #include "tensorflow/stream_executor/platform.h" -namespace se = ::perftools::gputools; -namespace sep = ::perftools::gputools::interpreter; - -namespace perftools { -namespace gputools { +namespace stream_executor { namespace interpreter { XlaInterpreterPlatform::XlaInterpreterPlatform() : name_("Interpreter") {} @@ -99,16 +95,16 @@ void XlaInterpreterPlatform::UnregisterTraceListener(TraceListener* listener) { } static void InitializeXlaInterpreterPlatform() { - std::unique_ptr platform(new sep::XlaInterpreterPlatform); - SE_CHECK_OK(se::MultiPlatformManager::RegisterPlatform(std::move(platform))); + std::unique_ptr platform(new XlaInterpreterPlatform); + SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform))); } } // namespace interpreter -} // namespace gputools -} // namespace perftools +} // namespace stream_executor -REGISTER_MODULE_INITIALIZER(interpreter_platform, - sep::InitializeXlaInterpreterPlatform()); +REGISTER_MODULE_INITIALIZER( + interpreter_platform, + stream_executor::interpreter::InitializeXlaInterpreterPlatform()); DECLARE_MODULE_INITIALIZER(multi_platform_manager); diff --git a/tensorflow/compiler/xla/service/interpreter/platform.h b/tensorflow/compiler/xla/service/interpreter/platform.h index 2f71b29be44..d68c5aa20dd 100644 --- a/tensorflow/compiler/xla/service/interpreter/platform.h +++ b/tensorflow/compiler/xla/service/interpreter/platform.h @@ -23,8 +23,7 @@ limitations under the License. #include "tensorflow/stream_executor/stream_executor.h" #include "tensorflow/stream_executor/trace_listener.h" -namespace perftools { -namespace gputools { +namespace stream_executor { namespace interpreter { class XlaInterpreterPlatform : public Platform { @@ -64,7 +63,6 @@ class XlaInterpreterPlatform : public Platform { }; } // namespace interpreter -} // namespace gputools -} // namespace perftools +} // namespace stream_executor #endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_PLATFORM_H_ diff --git a/tensorflow/compiler/xla/service/interpreter/platform_id.cc b/tensorflow/compiler/xla/service/interpreter/platform_id.cc index b7fb365b70d..3272396ce50 100644 --- a/tensorflow/compiler/xla/service/interpreter/platform_id.cc +++ b/tensorflow/compiler/xla/service/interpreter/platform_id.cc @@ -14,12 +14,10 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/service/interpreter/platform_id.h" -namespace perftools { -namespace gputools { +namespace stream_executor { namespace interpreter { PLATFORM_DEFINE_ID(kXlaInterpreterPlatformId); } // namespace interpreter -} // namespace gputools -} // namespace perftools +} // namespace stream_executor diff --git a/tensorflow/compiler/xla/service/interpreter/platform_id.h b/tensorflow/compiler/xla/service/interpreter/platform_id.h index 292f958449b..a6cc10bcc1e 100644 --- a/tensorflow/compiler/xla/service/interpreter/platform_id.h +++ b/tensorflow/compiler/xla/service/interpreter/platform_id.h @@ -18,14 +18,12 @@ limitations under the License. 
#include "tensorflow/stream_executor/platform.h" -namespace perftools { -namespace gputools { +namespace stream_executor { namespace interpreter { extern const Platform::Id kXlaInterpreterPlatformId; } // namespace interpreter -} // namespace gputools -} // namespace perftools +} // namespace stream_executor #endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_PLATFORM_ID_H_ diff --git a/tensorflow/compiler/xla/service/llvm_compiler.cc b/tensorflow/compiler/xla/service/llvm_compiler.cc index 911b243fe28..b17c9d50450 100644 --- a/tensorflow/compiler/xla/service/llvm_compiler.cc +++ b/tensorflow/compiler/xla/service/llvm_compiler.cc @@ -23,7 +23,7 @@ limitations under the License. namespace xla { StatusOr>> LLVMCompiler::Compile( std::vector> modules, - std::vector> stream_execs, + std::vector> stream_execs, DeviceMemoryAllocator* device_allocator) { // Tensorflow tries to enable the following behaviors in all its threads: // diff --git a/tensorflow/compiler/xla/service/llvm_compiler.h b/tensorflow/compiler/xla/service/llvm_compiler.h index d74e81bb7f6..f1c623508c5 100644 --- a/tensorflow/compiler/xla/service/llvm_compiler.h +++ b/tensorflow/compiler/xla/service/llvm_compiler.h @@ -60,19 +60,18 @@ class LLVMCompiler : public Compiler { // Bring in // StatusOr> RunBackend( // std::unique_ptr module, - // perftools::gputools::StreamExecutor* stream_exec, + // se::StreamExecutor* stream_exec, // DeviceMemoryAllocator* device_allocator) // StatusOr> RunHloPasses( // std::unique_ptr module, - // perftools::gputools::StreamExecutor* stream_exec, + // se::StreamExecutor* stream_exec, // DeviceMemoryAllocator* device_allocator) using Compiler::RunBackend; using Compiler::RunHloPasses; StatusOr>> Compile( std::vector> modules, - std::vector> - stream_execs, + std::vector> stream_execs, DeviceMemoryAllocator* device_allocator) override; protected: diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc index 499f280211a..0fa40617386 100644 --- a/tensorflow/compiler/xla/service/local_service.cc +++ b/tensorflow/compiler/xla/service/local_service.cc @@ -43,13 +43,11 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" -namespace se = ::perftools::gputools; - namespace xla { /* static */ StatusOr> LocalService::NewService( const ServiceOptions& options) { - perftools::gputools::Platform* platform = options.platform(); + se::Platform* platform = options.platform(); if (platform == nullptr) { TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform()); } diff --git a/tensorflow/compiler/xla/service/platform_util.cc b/tensorflow/compiler/xla/service/platform_util.cc index aa974ee61a2..7c63c0acc77 100644 --- a/tensorflow/compiler/xla/service/platform_util.cc +++ b/tensorflow/compiler/xla/service/platform_util.cc @@ -29,8 +29,6 @@ limitations under the License. 
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" -namespace se = ::perftools::gputools; - namespace xla { using tensorflow::str_util::Lowercase; diff --git a/tensorflow/compiler/xla/service/platform_util.h b/tensorflow/compiler/xla/service/platform_util.h index 69188820a70..571451ba43a 100644 --- a/tensorflow/compiler/xla/service/platform_util.h +++ b/tensorflow/compiler/xla/service/platform_util.h @@ -34,29 +34,27 @@ class PlatformUtil { // // Note that, even if a platform is present with zero devices, if we *do* have // compilation support for it, it will be returned in this sequence. - static StatusOr> - GetSupportedPlatforms(); + static StatusOr> GetSupportedPlatforms(); // Convenience function which returns the default supported platform for // tests. If exactly one supported platform is present, then this platform is // the default platform. If exactly two platforms are present and one of them // is the interpreter platform, then the other platform is the default // platform. Otherwise returns an error. - static StatusOr GetDefaultPlatform(); + static StatusOr GetDefaultPlatform(); // Convenience function which returns the sole supported platform. If // exactly one supported platform is present, then this platform is the // default platform. Otherwise returns an error. - static StatusOr GetSolePlatform(); + static StatusOr GetSolePlatform(); // Returns the platform according to the given name. Returns error if there is // no such platform. - static StatusOr GetPlatform( - const string& platform_name); + static StatusOr GetPlatform(const string& platform_name); // Returns exactly one platform that does not have given name. Returns error // if there is no such platform, or there are multiple such platforms. - static StatusOr GetPlatformExceptFor( + static StatusOr GetPlatformExceptFor( const string& platform_name); // Returns a vector of StreamExecutors for the given platform. The vector is @@ -64,8 +62,8 @@ class PlatformUtil { // element is nullptr, then the device is present by not supported by XLA. // // If the platform has no visible devices, a not-found error is returned. - static StatusOr> - GetStreamExecutors(perftools::gputools::Platform* platform); + static StatusOr> GetStreamExecutors( + se::Platform* platform); private: TF_DISALLOW_COPY_AND_ASSIGN(PlatformUtil); diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index 52500e4e790..2df59c35564 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -54,8 +54,6 @@ limitations under the License. 
#include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" -namespace se = ::perftools::gputools; - using ::tensorflow::strings::Printf; using ::tensorflow::strings::StrCat; using ::xla::source_map_util::InvalidParameterArgument; @@ -95,15 +93,12 @@ tensorflow::Status RecordResult(const ShapedBuffer& result, } // namespace -ServiceOptions& ServiceOptions::set_platform( - perftools::gputools::Platform* platform) { +ServiceOptions& ServiceOptions::set_platform(se::Platform* platform) { platform_ = platform; return *this; } -perftools::gputools::Platform* ServiceOptions::platform() const { - return platform_; -} +se::Platform* ServiceOptions::platform() const { return platform_; } ServiceOptions& ServiceOptions::set_number_of_replicas(int number_of_replicas) { number_of_replicas_ = number_of_replicas; @@ -123,7 +118,7 @@ int ServiceOptions::intra_op_parallelism_threads() const { } /* static */ StatusOr> Service::NewService( - perftools::gputools::Platform* platform) { + se::Platform* platform) { ServiceOptions default_options; default_options.set_platform(platform); return NewService(default_options); @@ -131,7 +126,7 @@ int ServiceOptions::intra_op_parallelism_threads() const { /* static */ StatusOr> Service::NewService( const ServiceOptions& options) { - perftools::gputools::Platform* platform = options.platform(); + se::Platform* platform = options.platform(); std::unique_ptr execute_backend; if (platform == nullptr) { TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform()); @@ -235,8 +230,7 @@ tensorflow::Status Service::ValidateResultShapeWithLayout( StatusOr>> Service::ResolveAndValidateArguments( tensorflow::gtl::ArraySlice arguments, - tensorflow::gtl::ArraySlice - stream_executors) { + tensorflow::gtl::ArraySlice stream_executors) { CHECK_EQ(options_.number_of_replicas(), stream_executors.size()); std::vector> replicated_arguments; replicated_arguments.resize(options_.number_of_replicas()); @@ -349,8 +343,7 @@ StatusOr> Service::CreateModuleConfig( StatusOr>> Service::BuildExecutables( std::vector versioned_handles, std::vector> module_configs, - Backend* backend, - std::vector> executors, + Backend* backend, std::vector> executors, DeviceMemoryAllocator* device_allocator) { VLOG(1) << Printf("BuildExecutable on service %p", this); @@ -412,8 +405,7 @@ StatusOr>> Service::BuildExecutables( StatusOr>> Service::BuildExecutables( const std::vector& module_protos, std::vector> module_configs, - Backend* backend, - std::vector> executors, + Backend* backend, std::vector> executors, DeviceMemoryAllocator* device_allocator) { VLOG(1) << Printf("BuildExecutable on service %p", this); @@ -493,7 +485,7 @@ StatusOr> Service::BuildExecutable( StatusOr> Service::BuildAndCacheExecutable( const VersionedComputationHandle& versioned_handle, std::unique_ptr module_config, Backend* backend, - perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile, + se::StreamExecutor* executor, ExecutionProfile* profile, DeviceMemoryAllocator* device_allocator) { std::shared_ptr executable = compilation_cache_.LookUp(versioned_handle, *module_config); @@ -541,7 +533,7 @@ Service::ExecuteParallelAndRegisterResult( // Streams where the computation are launched, so we can wait on the streams // to complete. std::vector::SmartPtr> streams; - std::vector> timers; + std::vector> timers; // Global data handles for the computation results, one for each computation. 
std::vector result_handles; @@ -565,8 +557,7 @@ Service::ExecuteParallelAndRegisterResult( streams.push_back(std::move(stream)); if (replica == 0 && profile != nullptr) { - timers.emplace_back( - new perftools::gputools::Timer(streams.back()->parent())); + timers.emplace_back(new se::Timer(streams.back()->parent())); streams.back() ->InitTimer(timers.back().get()) .ThenStartTimer(timers.back().get()); @@ -734,9 +725,9 @@ tensorflow::Status Service::SetReturnValue(const SetReturnValueRequest* arg, return computation->SetReturnValue(arg->operand()); } -StatusOr> -Service::GetExecutors(const ExecutionOptions& execution_options, - int64 requests_size, int64 request_index) const { +StatusOr> Service::GetExecutors( + const ExecutionOptions& execution_options, int64 requests_size, + int64 request_index) const { if (execution_options.device_handles().empty()) { return FailedPrecondition( "device handles must be given to execute parallel computations"); @@ -748,7 +739,7 @@ Service::GetExecutors(const ExecutionOptions& execution_options, "handles.", requests_size, request_index, execution_options.device_handles_size()); } - std::vector executors; + std::vector executors; for (const auto& device_handle : execution_options.device_handles()) { TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_, device_handle)); @@ -780,7 +771,7 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg, VLOG(1) << "running execute-parallel request: " << arg->ShortDebugString(); std::vector>> all_arguments; - std::vector> all_executors; + std::vector> all_executors; std::vector versioned_handles; std::vector> module_configs; std::vector computation_names; @@ -891,7 +882,7 @@ tensorflow::Status Service::ExecuteGraphParallel( VLOG(1) << "running execute-graph-parallel request"; std::vector>> all_arguments; - std::vector> all_executors; + std::vector> all_executors; std::vector module_protos; std::vector> module_configs; std::vector computation_names; @@ -1953,9 +1944,9 @@ DeviceHandle Service::SingleComputationDeviceHandle() const { return device_handle; } -StatusOr> Service::Replicas( +StatusOr> Service::Replicas( const Backend& backend, const DeviceHandle& device_handle) const { - std::vector replicas; + std::vector replicas; for (int replica = 0; replica < options_.number_of_replicas(); ++replica) { // From the computation placer, find out the device ids of the replicas for // the given device handle. diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h index e399f1ac190..476bd0597de 100644 --- a/tensorflow/compiler/xla/service/service.h +++ b/tensorflow/compiler/xla/service/service.h @@ -53,8 +53,8 @@ namespace xla { class ServiceOptions { public: // Set the platform backing the service, or nullptr for the default platform. - ServiceOptions& set_platform(perftools::gputools::Platform* platform); - perftools::gputools::Platform* platform() const; + ServiceOptions& set_platform(se::Platform* platform); + se::Platform* platform() const; // Set the number of replicas to use when compiling replicated // programs. @@ -66,7 +66,7 @@ class ServiceOptions { int intra_op_parallelism_threads() const; private: - perftools::gputools::Platform* platform_ = nullptr; + se::Platform* platform_ = nullptr; int number_of_replicas_ = 1; int intra_op_parallelism_threads_ = -1; }; @@ -79,7 +79,7 @@ class Service : public ServiceInterface { public: // Factory method for creating a new Service. 
static StatusOr> NewService( - perftools::gputools::Platform* platform = nullptr); + se::Platform* platform = nullptr); static StatusOr> NewService( const ServiceOptions& options); @@ -286,7 +286,7 @@ class Service : public ServiceInterface { ExecuteResponse* result); // Prepare the executors for executing parallel. - StatusOr> GetExecutors( + StatusOr> GetExecutors( const ExecutionOptions& execution_options, int64 requests_size, int64 request_index) const; @@ -310,8 +310,7 @@ class Service : public ServiceInterface { StatusOr>> ResolveAndValidateArguments( tensorflow::gtl::ArraySlice arguments, - tensorflow::gtl::ArraySlice - stream_executors); + tensorflow::gtl::ArraySlice stream_executors); // Create a Hlo module config for the given program shape and arguments. // execution_options is optional; if not given a default is used. @@ -329,7 +328,7 @@ class Service : public ServiceInterface { StatusOr> BuildExecutable( const VersionedComputationHandle& versioned_handle, std::unique_ptr module_config, Backend* backend, - perftools::gputools::StreamExecutor* executor, + se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator = nullptr); // Builds an Executable for the given HLO module proto. @@ -338,7 +337,7 @@ class Service : public ServiceInterface { StatusOr> BuildExecutable( const HloModuleProto& module_proto, std::unique_ptr module_config, Backend* backend, - perftools::gputools::StreamExecutor* executor, + se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator = nullptr); // Same as BuildExecutable() above, but builds a list of Executables for the @@ -346,14 +345,12 @@ class Service : public ServiceInterface { StatusOr>> BuildExecutables( std::vector versioned_handles, std::vector> module_configs, - Backend* backend, - std::vector> executors, + Backend* backend, std::vector> executors, DeviceMemoryAllocator* device_allocator); StatusOr>> BuildExecutables( const std::vector& module_protos, std::vector> module_configs, - Backend* backend, - std::vector> executors, + Backend* backend, std::vector> executors, DeviceMemoryAllocator* device_allocator); // Similar to BuildExecutable, but look in the compilation cache for the @@ -362,7 +359,7 @@ class Service : public ServiceInterface { StatusOr> BuildAndCacheExecutable( const VersionedComputationHandle& versioned_handle, std::unique_ptr module_config, Backend* backend, - perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile, + se::StreamExecutor* executor, ExecutionProfile* profile, DeviceMemoryAllocator* device_allocator = nullptr); // Runs the given executable with the given arguments and register the result @@ -411,7 +408,7 @@ class Service : public ServiceInterface { // Returns the stream executors assigned to the replicas represented by the // given device handle. Each device_handle is a virtual replicated device that // represents a set of physical devices for the replicas. 
- StatusOr> Replicas( + StatusOr> Replicas( const Backend& backend, const DeviceHandle& device_handle) const; Status MaybeDumpHloModule(const HloModule& module) const; diff --git a/tensorflow/compiler/xla/service/service_executable_run_options.h b/tensorflow/compiler/xla/service/service_executable_run_options.h index 6c1f8feac7e..7f3910cdb03 100644 --- a/tensorflow/compiler/xla/service/service_executable_run_options.h +++ b/tensorflow/compiler/xla/service/service_executable_run_options.h @@ -28,7 +28,7 @@ namespace xla { class ServiceExecutableRunOptions { public: using StreamBorrower = - std::function::SmartPtr>(int)>; + std::function::SmartPtr>(int)>; ServiceExecutableRunOptions() : ServiceExecutableRunOptions(ExecutableRunOptions()) {} @@ -45,14 +45,13 @@ class ServiceExecutableRunOptions { ExecutableRunOptions* mutable_run_options() { return &run_options_; } // Delegate to `ExecutableRunOptions` member. - perftools::gputools::Stream* stream() const { return run_options_.stream(); } + se::Stream* stream() const { return run_options_.stream(); } DeviceMemoryAllocator* allocator() const { return run_options_.allocator(); } int device_ordinal() const { return run_options_.device_ordinal(); } // Borrows a stream and returns a smart pointer which returns the stream on // destruction. - StatusOr::SmartPtr> BorrowStream( - int device_ordinal) const { + StatusOr::SmartPtr> BorrowStream(int device_ordinal) const { return borrow_stream_ ? borrow_stream_(device_ordinal) : Status(tensorflow::error::UNIMPLEMENTED, "No stream cache"); diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc index 6e9986165f7..10a2aa2b30f 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.cc +++ b/tensorflow/compiler/xla/service/shaped_buffer.cc @@ -28,8 +28,6 @@ limitations under the License. #include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" -namespace se = ::perftools::gputools; - namespace xla { using ::tensorflow::strings::Appendf; @@ -146,7 +144,7 @@ ScopedShapedBuffer::~ScopedShapedBuffer() { std::unique_ptr ScopedShapedBuffer::release() { auto shaped_buffer = MakeUnique(std::move(*this)); - buffers_ = ShapeTree(); + buffers_ = ShapeTree(); return shaped_buffer; } diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h index b816df8385e..62ba8f27342 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.h +++ b/tensorflow/compiler/xla/service/shaped_buffer.h @@ -41,8 +41,7 @@ class ShapedBuffer { // determines the number of device allocations (DeviceMemoryBase) held by the // ShapedBuffer. ShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape, - const perftools::gputools::Platform* platform, - int device_ordinal); + const se::Platform* platform, int device_ordinal); // Returns the shape of the on-host representation of the data held by this // ShapedBuffer. @@ -52,35 +51,29 @@ class ShapedBuffer { // ShapedBuffer. const Shape& on_device_shape() const { return on_device_shape_; } - const perftools::gputools::Platform* platform() const { return platform_; } + const se::Platform* platform() const { return platform_; } int device_ordinal() const { return device_ordinal_; } // Return the root buffer of the shape (shape index {}). 
- const perftools::gputools::DeviceMemoryBase& root_buffer() const { + const se::DeviceMemoryBase& root_buffer() const { return buffer(/*index=*/{}); } // Returns the buffer at the given shape index where index is defined as in // ShapeUtil::GetSubshape. - const perftools::gputools::DeviceMemoryBase& buffer( - const ShapeIndex& index) const { + const se::DeviceMemoryBase& buffer(const ShapeIndex& index) const { return buffers_.element(index); } // Sets the device memory buffer at the given index. - void set_buffer(const perftools::gputools::DeviceMemoryBase& buffer, - const ShapeIndex& index) { + void set_buffer(const se::DeviceMemoryBase& buffer, const ShapeIndex& index) { *buffers_.mutable_element(index) = buffer; } // Returns the underlying ShapeTree containing all the device addresses in the // ShapedBuffer. - const ShapeTree& buffers() const { - return buffers_; - } - ShapeTree& buffers() { - return buffers_; - } + const ShapeTree& buffers() const { return buffers_; } + ShapeTree& buffers() { return buffers_; } // Set all device memory pointers in the object to null. void clear(); @@ -101,13 +94,13 @@ class ShapedBuffer { Shape on_device_shape_; // The platform the memory is allocated on. - const perftools::gputools::Platform* platform_; + const se::Platform* platform_; // The device the memory is allocated on. int device_ordinal_; // The tree of device buffers. Its shape is on_device_shape(). - ShapeTree buffers_; + ShapeTree buffers_; }; std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer); diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc index 2f36e2b16e0..be8231b73c0 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.cc +++ b/tensorflow/compiler/xla/service/transfer_manager.cc @@ -25,24 +25,20 @@ limitations under the License. 
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" -namespace se = ::perftools::gputools; - namespace xla { /* static */ tensorflow::mutex TransferManager::platform_transfer_manager_mutex_( tensorflow::LINKER_INITIALIZED); -/* static */ std::map* +/* static */ std::map* TransferManager::GetPlatformTransferManagers() { - static auto* r = - new std::map; + static auto* r = new std::map; return r; } Status TransferManager::TransferArrayToDevice( - perftools::gputools::StreamExecutor* executor, const Literal& literal, - const perftools::gputools::DeviceMemoryBase& dest) { + se::StreamExecutor* executor, const Literal& literal, + const se::DeviceMemoryBase& dest) { const Shape on_device_shape = HostShapeToDeviceShape(literal.shape()); TF_RET_CHECK(ShapeUtil::IsArray(on_device_shape)) << "On-device representation of " @@ -61,8 +57,8 @@ Status TransferManager::TransferArrayToDevice( } StatusOr> TransferManager::TransferArrayFromDevice( - perftools::gputools::StreamExecutor* executor, const Shape& shape, - const perftools::gputools::DeviceMemoryBase& source) { + se::StreamExecutor* executor, const Shape& shape, + const se::DeviceMemoryBase& source) { TF_RET_CHECK(ShapeUtil::Equal(HostShapeToDeviceShape(shape), shape)) << "Shape " << ShapeUtil::HumanString(shape) << " has a differently shaped representation on-device: " @@ -112,8 +108,7 @@ StatusOr> TransferManager::TransferArrayFromDevice( } Status TransferManager::WriteTupleIndexTables( - perftools::gputools::StreamExecutor* executor, - const ShapedBuffer& device_buffer) { + se::StreamExecutor* executor, const ShapedBuffer& device_buffer) { VLOG(2) << "Writing tuple index tables for " << device_buffer; TF_RET_CHECK(executor->device_ordinal() == device_buffer.device_ordinal()); diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h index 9f2b5c4aecf..410d2af7af6 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.h +++ b/tensorflow/compiler/xla/service/transfer_manager.h @@ -42,7 +42,7 @@ class TransferManager { virtual ~TransferManager() {} // Returns the ID of the platform that this transfer manager acts on. - virtual perftools::gputools::Platform::Id PlatformId() const = 0; + virtual se::Platform::Id PlatformId() const = 0; // Returns the shape of the on-device representation for the given shape on // the host. This is intended for use with ShapedBuffer where buffers are @@ -58,48 +58,45 @@ class TransferManager { // DeviceShape(literal_shape) must be compatible, but need not have the same // layout. virtual StatusOr> TransferLiteralFromDevice( - perftools::gputools::StreamExecutor* executor, - const ShapedBuffer& device_buffer) = 0; + se::StreamExecutor* executor, const ShapedBuffer& device_buffer) = 0; // Transfers the given literal into the previously allocated device memory // represented by the given ShapedBuffer using the given executor. The shape // of the ShapedBuffer and DeviceShape(literal.shape()) must be compatible, // but need not have the same layout - virtual Status TransferLiteralToDevice( - perftools::gputools::StreamExecutor* executor, const Literal& literal, - const ShapedBuffer& device_buffer) = 0; + virtual Status TransferLiteralToDevice(se::StreamExecutor* executor, + const Literal& literal, + const ShapedBuffer& device_buffer) = 0; // Convenience methods for transferring an array to or from the device at a // known address. 
This avoids having to construct a ShapedBuffer just to // transfer an array at a known address. - Status TransferArrayToDevice( - perftools::gputools::StreamExecutor* executor, const Literal& literal, - const perftools::gputools::DeviceMemoryBase& dest); + Status TransferArrayToDevice(se::StreamExecutor* executor, + const Literal& literal, + const se::DeviceMemoryBase& dest); StatusOr> TransferArrayFromDevice( - perftools::gputools::StreamExecutor* executor, const Shape& shape, - const perftools::gputools::DeviceMemoryBase& source); + se::StreamExecutor* executor, const Shape& shape, + const se::DeviceMemoryBase& source); // Transfers the given literal into the Infeed interface of the device, // using the given executor. - virtual Status TransferLiteralToInfeed( - perftools::gputools::StreamExecutor* executor, - const Literal& literal) = 0; + virtual Status TransferLiteralToInfeed(se::StreamExecutor* executor, + const Literal& literal) = 0; // Transfers the given literal from the Outfeed interface of the device, // using the given executor. - virtual Status TransferLiteralFromOutfeed( - perftools::gputools::StreamExecutor* executor, const Shape& literal_shape, - Literal* literal) = 0; + virtual Status TransferLiteralFromOutfeed(se::StreamExecutor* executor, + const Shape& literal_shape, + Literal* literal) = 0; // Resets the devices associated with this transfer manager. virtual Status ResetDevices( - tensorflow::gtl::ArraySlice - executor) = 0; + tensorflow::gtl::ArraySlice executor) = 0; // Given an allocated ShapedBuffer, constructs the tuple index table(s) in // each buffer of the given ShapedBuffer corresponding to tuple shapes. If the // ShapedBuffer is array-shaped this method does nothing. - Status WriteTupleIndexTables(perftools::gputools::StreamExecutor* executor, + Status WriteTupleIndexTables(se::StreamExecutor* executor, const ShapedBuffer& device_buffer); // Determines the byte size requirement for the given shape on the underlying @@ -127,13 +124,13 @@ class TransferManager { // Precondition: a platform kind must not be registered more than once. typedef std::unique_ptr (*TransferManagerCreationFunction)(); static void RegisterTransferManager( - perftools::gputools::Platform::Id platform_id, + se::Platform::Id platform_id, TransferManagerCreationFunction transfer_manager); // Returns the transfer manager singleton pointer if it is available for the // given platform, or an error status if it is not. static StatusOr GetForPlatform( - const perftools::gputools::Platform* platform); + const se::Platform* platform); protected: // Transfer a memory block of the given size from 'source' buffer to the @@ -143,35 +140,32 @@ class TransferManager { // // source is the source data that must be in the target-dependent layout that // the Infeed HLO used in the computation expects. - virtual Status TransferBufferToInfeed( - perftools::gputools::StreamExecutor* executor, int64 size, - const void* source) = 0; + virtual Status TransferBufferToInfeed(se::StreamExecutor* executor, + int64 size, const void* source) = 0; // Transfer a memory block of the given size from the device source into the // 'destination' buffer. // // size is the size to transfer to destination in bytes. 
- virtual Status TransferBufferFromDevice( - perftools::gputools::StreamExecutor* executor, - const perftools::gputools::DeviceMemoryBase& source, int64 size, - void* destination); + virtual Status TransferBufferFromDevice(se::StreamExecutor* executor, + const se::DeviceMemoryBase& source, + int64 size, void* destination); // Transfer a memory block of the given size from 'source' buffer to the given // destination of the device. // // size is the size to transfer from source in bytes. - virtual Status TransferBufferToDevice( - perftools::gputools::StreamExecutor* executor, int64 size, - const void* source, perftools::gputools::DeviceMemoryBase* destination); + virtual Status TransferBufferToDevice(se::StreamExecutor* executor, + int64 size, const void* source, + se::DeviceMemoryBase* destination); // Writes the given device-memory pointers in 'elements' to the given region // to construct a tuple index table in the platform-specific tuple // representation. virtual Status WriteSingleTupleIndexTable( - perftools::gputools::StreamExecutor* executor, - tensorflow::gtl::ArraySlice - elements, - const Shape& shape, perftools::gputools::DeviceMemoryBase* region) = 0; + se::StreamExecutor* executor, + tensorflow::gtl::ArraySlice elements, + const Shape& shape, se::DeviceMemoryBase* region) = 0; private: // The mutex that guards the platform-to-transfer manager map. @@ -186,8 +180,7 @@ class TransferManager { }; // Map from platform kind to transfer manager singleton. - static std::map* - GetPlatformTransferManagers(); + static std::map* GetPlatformTransferManagers(); }; } // namespace xla diff --git a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc index 777ac167a3c..bff60f25ec8 100644 --- a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc +++ b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc @@ -34,7 +34,7 @@ namespace { class BitcastConvertTest : public ClientLibraryTestBase { public: - explicit BitcastConvertTest(perftools::gputools::Platform* platform = nullptr) + explicit BitcastConvertTest(se::Platform* platform = nullptr) : ClientLibraryTestBase(platform) { mutable_debug_options()->add_xla_disable_hlo_passes("algsimp"); mutable_debug_options()->add_xla_disable_hlo_passes("inline"); diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc index 312d8f284d3..69389dae3f2 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.cc +++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc @@ -32,8 +32,6 @@ limitations under the License. 
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" -namespace se = ::perftools::gputools; - namespace xla { namespace { @@ -59,8 +57,7 @@ se::Platform* GetReferencePlatform() { } // namespace ClientLibraryTestBase::ClientLibraryTestBase( - perftools::gputools::Platform* platform, - const LocalClientOptions& client_options) + se::Platform* platform, const LocalClientOptions& client_options) : client_(GetOrCreateLocalClientOrDie(client_options)), execution_options_(CreateDefaultExecutionOptions()) { CHECK_EQ(platform, client_options.platform()); diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h index b3212dd2282..481d7c5c25a 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.h +++ b/tensorflow/compiler/xla/tests/client_library_test_base.h @@ -64,11 +64,10 @@ std::vector ExpandUseBfloat16( // A client library test establishes an in-process XLA client connection. class ClientLibraryTestBase : public ::testing::Test { protected: - explicit ClientLibraryTestBase( - perftools::gputools::Platform* platform = nullptr); + explicit ClientLibraryTestBase(se::Platform* platform = nullptr); // Creates a new ClientLibraryTestBase with custom client options. - ClientLibraryTestBase(perftools::gputools::Platform* platform, + ClientLibraryTestBase(se::Platform* platform, const LocalClientOptions& client_options); // Returns the name of the test currently being run. diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc index c15d808f1dd..7ea82a791f7 100644 --- a/tensorflow/compiler/xla/tests/compute_constant_test.cc +++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc @@ -47,16 +47,14 @@ ClientType client_types[] = {ClientType::kLocal, ClientType::kCompileOnly}; class ComputeConstantTest : public ::testing::Test { public: - explicit ComputeConstantTest( - perftools::gputools::Platform* platform = nullptr) + explicit ComputeConstantTest(se::Platform* platform = nullptr) : platform_(platform) {} string TestName() const { return ::testing::UnitTest::GetInstance()->current_test_info()->name(); } - Client* ClientOrDie(::perftools::gputools::Platform* platform, - ClientType client_type) { + Client* ClientOrDie(se::Platform* platform, ClientType client_type) { if (client_type == ClientType::kLocal) { StatusOr result = ClientLibrary::GetOrCreateLocalClient(platform); @@ -107,7 +105,7 @@ class ComputeConstantTest : public ::testing::Test { return result.ok() ? 
result.ValueOrDie() : false; } - perftools::gputools::Platform* platform_; + se::Platform* platform_; }; TEST_F(ComputeConstantTest, ScalarInt32Literal) { diff --git a/tensorflow/compiler/xla/tests/convert_test.cc b/tensorflow/compiler/xla/tests/convert_test.cc index 0842a8918bc..e67a30d76c2 100644 --- a/tensorflow/compiler/xla/tests/convert_test.cc +++ b/tensorflow/compiler/xla/tests/convert_test.cc @@ -36,7 +36,7 @@ namespace { class ConvertTest : public ClientLibraryTestBase { public: - explicit ConvertTest(perftools::gputools::Platform* platform = nullptr) + explicit ConvertTest(se::Platform* platform = nullptr) : ClientLibraryTestBase(platform) { mutable_debug_options()->add_xla_disable_hlo_passes("algsimp"); mutable_debug_options()->add_xla_disable_hlo_passes("inline"); diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc index 5f00c340028..464b8cbebb1 100644 --- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc +++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc @@ -35,8 +35,6 @@ limitations under the License. #include "tensorflow/core/platform/test_benchmark.h" #include "tensorflow/core/platform/types.h" -namespace se = ::perftools::gputools; - namespace xla { namespace { diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc index a292eab1d19..ed16963b40b 100644 --- a/tensorflow/compiler/xla/tests/fusion_test.cc +++ b/tensorflow/compiler/xla/tests/fusion_test.cc @@ -50,8 +50,6 @@ limitations under the License. using tensorflow::gtl::ArraySlice; -namespace se = ::perftools::gputools; - namespace xla { namespace { diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc index 21f71fc91bb..c5afe0c3e05 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.cc +++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc @@ -35,8 +35,6 @@ limitations under the License. #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" -namespace se = ::perftools::gputools; - namespace xla { namespace { diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h index 3e8e2360bb3..28d7ab09cb6 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.h +++ b/tensorflow/compiler/xla/tests/hlo_test_base.h @@ -76,8 +76,7 @@ class HloTestBase : public ::testing::Test { // If your test doesn't use interpreter as the reference backend, you can use // this constructor. Note that your test target is responsible for linking in // both needed backends. 
- HloTestBase(::perftools::gputools::Platform* test_platform, - ::perftools::gputools::Platform* reference_platform); + HloTestBase(se::Platform* test_platform, se::Platform* reference_platform); ~HloTestBase() override {} diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc index 7e92439c494..2f46ee0be21 100644 --- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc +++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc @@ -43,7 +43,7 @@ class LLVMCompilerTest : public ::testing::Test { ~LLVMCompilerTest() override {} protected: - using Platform = ::perftools::gputools::Platform; + using Platform = se::Platform; explicit LLVMCompilerTest(string platform_name) : platform_name_(std::move(platform_name)) {} @@ -95,7 +95,7 @@ class LLVMCompilerTest : public ::testing::Test { modules.push_back(hlo_module->Clone()); modules.push_back(std::move(hlo_module)); - std::vector> executors; + std::vector> executors; executors.push_back({backend_->default_stream_executor()}); executors.push_back({backend_->default_stream_executor()}); diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc index 2462ea39f91..373dd3c5df4 100644 --- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc +++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc @@ -43,8 +43,6 @@ limitations under the License. #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" -namespace se = ::perftools::gputools; - namespace xla { namespace { diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc index 96b976d25d7..29fd985acfc 100644 --- a/tensorflow/compiler/xla/tests/local_client_test_base.cc +++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc @@ -35,8 +35,9 @@ namespace xla { /* static */ TestAllocator* LocalClientTestBase::allocator_; -StatusOr TestAllocator::Allocate( - int device_ordinal, uint64 size, bool retry_on_failure) { +StatusOr TestAllocator::Allocate(int device_ordinal, + uint64 size, + bool retry_on_failure) { VLOG(2) << "Allocate(" << device_ordinal << ", " << size << ")"; { tensorflow::mutex_lock lock(count_mutex_); @@ -46,8 +47,8 @@ StatusOr TestAllocator::Allocate( return StreamExecutorMemoryAllocator::Allocate(device_ordinal, size); } -tensorflow::Status TestAllocator::Deallocate( - int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) { +tensorflow::Status TestAllocator::Deallocate(int device_ordinal, + se::DeviceMemoryBase* mem) { VLOG(2) << "Deallocate(" << device_ordinal << ")"; { tensorflow::mutex_lock lock(count_mutex_); @@ -88,7 +89,7 @@ int64 TestAllocator::deallocation_count(int device_ordinal) const { } /* static */ TestAllocator* LocalClientTestBase::GetOrCreateAllocator( - perftools::gputools::Platform* platform) { + se::Platform* platform) { static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED); tensorflow::mutex_lock lock(mu); @@ -115,8 +116,7 @@ struct LocalClientTestBase::EigenThreadPoolWrapper { std::unique_ptr device; }; -LocalClientTestBase::LocalClientTestBase( - perftools::gputools::Platform* platform) +LocalClientTestBase::LocalClientTestBase(se::Platform* platform) : local_client_( ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie()), thread_pool_wrapper_(new EigenThreadPoolWrapper()) { diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h 
b/tensorflow/compiler/xla/tests/local_client_test_base.h index f0c73f04f6e..7555d5e8938 100644 --- a/tensorflow/compiler/xla/tests/local_client_test_base.h +++ b/tensorflow/compiler/xla/tests/local_client_test_base.h @@ -41,15 +41,15 @@ namespace xla { class TestAllocator : public StreamExecutorMemoryAllocator { public: - explicit TestAllocator(perftools::gputools::Platform* platform) + explicit TestAllocator(se::Platform* platform) : StreamExecutorMemoryAllocator( platform, PlatformUtil::GetStreamExecutors(platform).ValueOrDie()) { } - StatusOr Allocate( - int device_ordinal, uint64 size, bool retry_on_failure) override; - tensorflow::Status Deallocate( - int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) override; + StatusOr Allocate(int device_ordinal, uint64 size, + bool retry_on_failure) override; + tensorflow::Status Deallocate(int device_ordinal, + se::DeviceMemoryBase* mem) override; // Return the number of allocations that have been performed. int64 allocation_count() const; @@ -75,12 +75,10 @@ class TestAllocator : public StreamExecutorMemoryAllocator { class LocalClientTestBase : public ::testing::Test { protected: struct EigenThreadPoolWrapper; - explicit LocalClientTestBase( - perftools::gputools::Platform* platform = nullptr); + explicit LocalClientTestBase(se::Platform* platform = nullptr); virtual ~LocalClientTestBase(); - static TestAllocator* GetOrCreateAllocator( - perftools::gputools::Platform* platform); + static TestAllocator* GetOrCreateAllocator(se::Platform* platform); // Copy the given literal onto the default device and return a // ScopedShapedBuffer. Convenience wrapper around @@ -128,7 +126,7 @@ class LocalClientTestBase : public ::testing::Test { // of the process. So make the allocator static. static TestAllocator* allocator_; - perftools::gputools::StreamExecutor* stream_executor_; + se::StreamExecutor* stream_executor_; TransferManager* transfer_manager_; LocalClient* local_client_; diff --git a/tensorflow/compiler/xla/tests/map_test.cc b/tensorflow/compiler/xla/tests/map_test.cc index efe6cc67872..8fabcaca1b9 100644 --- a/tensorflow/compiler/xla/tests/map_test.cc +++ b/tensorflow/compiler/xla/tests/map_test.cc @@ -41,7 +41,7 @@ namespace { class MapTest : public ClientLibraryTestBase { public: - explicit MapTest(perftools::gputools::Platform* platform = nullptr) + explicit MapTest(se::Platform* platform = nullptr) : ClientLibraryTestBase(platform) { mutable_debug_options()->add_xla_disable_hlo_passes("algsimp"); mutable_debug_options()->add_xla_disable_hlo_passes("inline"); diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc index cda1989fad6..997a1d82737 100644 --- a/tensorflow/compiler/xla/tests/test_utils.cc +++ b/tensorflow/compiler/xla/tests/test_utils.cc @@ -339,8 +339,8 @@ StatusOr>> MakeFakeArguments( return std::move(arguments); } -Status VerifyHloModule(const perftools::gputools::Platform& platform, - HloModule* const module, bool allow_mixed_precision) { +Status VerifyHloModule(const se::Platform& platform, HloModule* const module, + bool allow_mixed_precision) { return HloVerifier(allow_mixed_precision).Run(module).status(); } diff --git a/tensorflow/compiler/xla/tests/test_utils.h b/tensorflow/compiler/xla/tests/test_utils.h index b5ab779574f..30c147910ca 100644 --- a/tensorflow/compiler/xla/tests/test_utils.h +++ b/tensorflow/compiler/xla/tests/test_utils.h @@ -68,8 +68,7 @@ StatusOr>> MakeFakeArguments( // Check that a given module satisfies various constraints before trying to 
 // execute it.
-Status VerifyHloModule(const perftools::gputools::Platform& platform,
-                       HloModule* const module,
+Status VerifyHloModule(const se::Platform& platform, HloModule* const module,
                        bool allow_mixed_precision = false);
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
index b52c718814d..697d78fe6e9 100644
--- a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
@@ -39,7 +39,7 @@ namespace {
 class VecOpsSimpleTest : public ClientLibraryTestBase {
  public:
-  explicit VecOpsSimpleTest(perftools::gputools::Platform* platform = nullptr)
+  explicit VecOpsSimpleTest(se::Platform* platform = nullptr)
       : ClientLibraryTestBase(platform) {
     mutable_debug_options()->add_xla_disable_hlo_passes("algsimp");
     mutable_debug_options()->add_xla_disable_hlo_passes("inline");
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index 89ce2ce797f..1e18b567995 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -37,8 +37,6 @@ limitations under the License.
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
-namespace se = ::perftools::gputools;
-
 namespace xla {
 namespace {
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index ff3418a128e..efb00d56c58 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -34,7 +34,7 @@ limitations under the License.
 namespace xla {
 namespace {
-namespace se = ::perftools::gputools;
+
 namespace gtl = ::tensorflow::gtl;
 class HloProfileTest : public ClientLibraryTestBase {};
diff --git a/tensorflow/compiler/xla/types.h b/tensorflow/compiler/xla/types.h
index 20f3f1b957c..b645acb700b 100644
--- a/tensorflow/compiler/xla/types.h
+++ b/tensorflow/compiler/xla/types.h
@@ -49,9 +49,7 @@ using ::Eigen::half;
 // Alias namespace ::stream_executor as ::xla::se.
 namespace stream_executor {}
 namespace xla {
-// TODO(b/77980417): Uncomment this once all namespace aliases named 'se' are
-// removed in ::xla.
-// namespace se = ::stream_executor;
+namespace se = ::stream_executor;
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_TYPES_H_
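
The mechanism the whole patch relies on is the small types.h change directly above: ::stream_executor is (re)declared, then aliased as se inside namespace xla, so any XLA source that includes xla/types.h can write se::Platform, se::StreamExecutor, or se::DeviceMemoryBase without spelling out perftools::gputools or keeping a file-local alias. The sketch below is not XLA code; it is a minimal, self-contained illustration of that aliasing pattern, in which the Platform struct and the PrintPlatformName function are invented stand-ins for the real StreamExecutor types.

#include <iostream>

// Stand-in declaration for the real library. In XLA the real types come from
// tensorflow/core/platform/stream_executor_no_cuda.h; this struct exists only
// so the example compiles on its own.
namespace stream_executor {
struct Platform {
  const char* Name() const { return "example-platform"; }  // invented stand-in
};
}  // namespace stream_executor

// Mirrors the line in xla/types.h: an empty re-declaration of the namespace
// makes the alias below legal even in translation units that have not pulled
// in any StreamExecutor headers yet.
namespace stream_executor {}

namespace xla {
namespace se = ::stream_executor;  // the alias this patch enables

// Signatures inside namespace xla can now say se::Platform* instead of
// perftools::gputools::Platform* or a file-local "namespace se" alias.
void PrintPlatformName(const se::Platform* platform) {
  std::cout << platform->Name() << "\n";
}
}  // namespace xla

int main() {
  stream_executor::Platform platform;
  xla::PrintPlatformName(&platform);
  return 0;
}

Centralizing the alias in types.h is what allows the scattered file-local "namespace se = ::perftools::gputools;" definitions to be deleted throughout the hunks above while every se:: reference keeps resolving.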