[XLA] Convert XLA to use xla::se as a namespace alias for ::stream_executor.
PiperOrigin-RevId: 193301997
Commit: d77a621a57 (parent: 41e2cd187b)
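For readers skimming the diff below, the change amounts to declaring one namespace alias inside namespace xla and then shortening spellings throughout the tree. A minimal, self-contained sketch of the idea (the forward declaration and the ConfigureForPlatform signature are hypothetical, for illustration only; per the diff, the real alias is pulled in via tensorflow/compiler/xla/types.h):

    // Illustrative sketch only, not TensorFlow source.
    namespace stream_executor {
    class Platform;  // hypothetical forward declaration so the sketch is self-contained
    }  // namespace stream_executor

    namespace xla {
    // The alias this commit introduces: XLA code can now write se::Platform,
    // se::Stream, se::StreamExecutor, ... instead of perftools::gputools::...
    // or per-file "namespace se = ::perftools::gputools;" aliases.
    namespace se = ::stream_executor;

    // Hypothetical signature, shown only to illustrate the shorter spelling.
    void ConfigureForPlatform(se::Platform* platform);
    }  // namespace xla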
@@ -443,6 +443,9 @@ cc_library(
srcs = ["executable_run_options.cc"],
hdrs = ["executable_run_options.h"],
visibility = ["//visibility:public"],
deps = [
":types",
],
)

cc_library(
@@ -23,22 +23,19 @@ limitations under the License.

namespace xla {

LocalClientOptions::LocalClientOptions(perftools::gputools::Platform* platform,
LocalClientOptions::LocalClientOptions(se::Platform* platform,
int number_of_replicas,
int intra_op_parallelism_threads)
: platform_(platform),
number_of_replicas_(number_of_replicas),
intra_op_parallelism_threads_(intra_op_parallelism_threads) {}

LocalClientOptions& LocalClientOptions::set_platform(
perftools::gputools::Platform* platform) {
LocalClientOptions& LocalClientOptions::set_platform(se::Platform* platform) {
platform_ = platform;
return *this;
}

perftools::gputools::Platform* LocalClientOptions::platform() const {
return platform_;
}
se::Platform* LocalClientOptions::platform() const { return platform_; }

LocalClientOptions& LocalClientOptions::set_number_of_replicas(
int number_of_replicas) {
@@ -69,7 +66,7 @@ ClientLibrary::ClientLibrary() = default;
ClientLibrary::~ClientLibrary() = default;

/* static */ StatusOr<LocalClient*> ClientLibrary::GetOrCreateLocalClient(
perftools::gputools::Platform* platform) {
se::Platform* platform) {
LocalClientOptions default_options;
default_options.set_platform(platform);
return GetOrCreateLocalClient(default_options);
@@ -77,7 +74,7 @@ ClientLibrary::~ClientLibrary() = default;

/* static */ StatusOr<LocalClient*> ClientLibrary::GetOrCreateLocalClient(
const LocalClientOptions& options) {
perftools::gputools::Platform* platform = options.platform();
se::Platform* platform = options.platform();
int replica_count = options.number_of_replicas();
ClientLibrary& client_library = Singleton();
tensorflow::mutex_lock lock(client_library.service_mutex_);
@@ -115,7 +112,7 @@ ClientLibrary::~ClientLibrary() = default;
}

/* static */ LocalService* ClientLibrary::GetXlaService(
perftools::gputools::Platform* platform) {
se::Platform* platform) {
ClientLibrary& client_library = Singleton();
tensorflow::mutex_lock lock(client_library.service_mutex_);
auto it = client_library.local_instances_.find(platform->id());
@@ -124,8 +121,7 @@ ClientLibrary::~ClientLibrary() = default;
}

/* static */ StatusOr<CompileOnlyClient*>
ClientLibrary::GetOrCreateCompileOnlyClient(
perftools::gputools::Platform* platform) {
ClientLibrary::GetOrCreateCompileOnlyClient(se::Platform* platform) {
ClientLibrary& client_library = Singleton();
tensorflow::mutex_lock lock(client_library.service_mutex_);
@@ -43,13 +43,13 @@ namespace xla {
// Options to configure the local client when it is created.
class LocalClientOptions {
public:
LocalClientOptions(perftools::gputools::Platform* platform = nullptr,
LocalClientOptions(se::Platform* platform = nullptr,
int number_of_replicas = 1,
int intra_op_parallelism_threads = -1);

// Set the platform backing the service, or nullptr for the default platform.
LocalClientOptions& set_platform(perftools::gputools::Platform* platform);
perftools::gputools::Platform* platform() const;
LocalClientOptions& set_platform(se::Platform* platform);
se::Platform* platform() const;

// Set the number of replicas to use when compiling replicated
// programs.
@@ -61,7 +61,7 @@ class LocalClientOptions {
int intra_op_parallelism_threads() const;

private:
perftools::gputools::Platform* platform_;
se::Platform* platform_;
int number_of_replicas_;
int intra_op_parallelism_threads_;
};
@@ -74,7 +74,7 @@ class ClientLibrary {
// platform : The platform the underlying XLA service should target. If
// null then default platform is used.
static StatusOr<LocalClient*> GetOrCreateLocalClient(
perftools::gputools::Platform* platform = nullptr);
se::Platform* platform = nullptr);
static StatusOr<LocalClient*> GetOrCreateLocalClient(
const LocalClientOptions& options);

@@ -84,14 +84,14 @@ class ClientLibrary {

// Returns the service from the service thread. Only used in unit tests to
// access user computations from client.
static LocalService* GetXlaService(perftools::gputools::Platform* platform);
static LocalService* GetXlaService(se::Platform* platform);

// Singleton constructor-or-accessor for compile-only clients. Arguments:
//
// platform : The platform the underlying XLA service should target. If
// null then default platform is used.
static StatusOr<CompileOnlyClient*> GetOrCreateCompileOnlyClient(
perftools::gputools::Platform* platform = nullptr);
se::Platform* platform = nullptr);

// Clears the local instance and compile only instance caches. The client
// pointers returned by the previous GetOrCreateLocalClient() or
@@ -120,12 +120,10 @@ class ClientLibrary {
};

tensorflow::mutex service_mutex_;  // Guards the singleton creation state.
std::unordered_map<perftools::gputools::Platform::Id,
std::unique_ptr<LocalInstance>>
std::unordered_map<se::Platform::Id, std::unique_ptr<LocalInstance>>
local_instances_ GUARDED_BY(service_mutex_);

std::unordered_map<perftools::gputools::Platform::Id,
std::unique_ptr<CompileOnlyInstance>>
std::unordered_map<se::Platform::Id, std::unique_ptr<CompileOnlyInstance>>
compile_only_instances_ GUARDED_BY(service_mutex_);

TF_DISALLOW_COPY_AND_ASSIGN(ClientLibrary);
@@ -24,8 +24,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/source_map_util.h"
#include "tensorflow/compiler/xla/status_macros.h"

namespace se = ::perftools::gputools;

using xla::source_map_util::InvalidParameterArgument;

namespace xla {
@@ -167,7 +167,7 @@ class LocalClient : public Client {
StatusOr<int> ReplicaNumberToDeviceOrdinal(int replica_number);

// Returns the platform that the underlying service targets.
perftools::gputools::Platform* platform() const;
se::Platform* platform() const;

// Returns the number of devices on the system of the service platform
// type. Not all devices may be supported by the service (see
@@ -29,7 +29,7 @@ namespace xla {

// Returns a string that represents the device in terms of platform and ordinal;
// e.g. the first CUDA device will be "cuda:0"
string DeviceIdentifier(perftools::gputools::StreamExecutor* stream_exec) {
string DeviceIdentifier(se::StreamExecutor* stream_exec) {
return tensorflow::strings::StrCat(stream_exec->platform()->Name(), ":",
stream_exec->device_ordinal());
}
@@ -16,6 +16,9 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
#define TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_

// Pulls in the ::stream_executor -> ::xla::se namespace alias.
#include "tensorflow/compiler/xla/types.h"

// These classes are forward declared so that ExecutableRunOptions can be linked
// into an XLA-compiled binary without having to link all of the pointed-to
// objects (e.g., for an ahead-of-time compiled CPU binary, the gpu tools don't
@@ -37,10 +40,6 @@ struct ThreadPoolDevice;

namespace xla {

// TODO(b/77980417): Once the perftools::gputools -> stream_executor migration
// is complete, add "using namespace se = stream_executor" here and
// s/stream_executor/se::/ to match our idiom elsewhere.

class DeviceMemoryAllocator;
class DeviceAssignment;
class ExecutionProfile;
@@ -20,7 +20,7 @@ limitations under the License.
namespace xla {

/* static */ StatusOr<std::unique_ptr<GRPCService>> GRPCService::NewService(
perftools::gputools::Platform* platform) {
se::Platform* platform) {
std::unique_ptr<GRPCService> grpc_service(new GRPCService());
TF_ASSIGN_OR_RETURN(grpc_service->service_,
::xla::Service::NewService(platform));
@@ -29,7 +29,7 @@ class GRPCService : public grpc::XlaService::Service {
// that the service should target. If platform is null then the default
// platform is used.
static StatusOr<std::unique_ptr<GRPCService>> NewService(
perftools::gputools::Platform* platform = nullptr);
se::Platform* platform = nullptr);

::grpc::Status Computation(::grpc::ServerContext* context,
const ComputationRequest* arg,
@@ -204,7 +204,7 @@ StatusOr<std::vector<const ShapedBuffer*>> AllocationTracker::ResolveInternal(
}

void AllocationTracker::AddAllocationOrIncrementRefCount(
perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal) {
se::DeviceMemoryBase device_memory, int device_ordinal) {
AllocationMap& allocation_map = opaque_to_allocation_map_[device_ordinal];
auto it = allocation_map.find(device_memory.opaque());
if (it == allocation_map.end()) {
@@ -215,8 +215,8 @@ void AllocationTracker::AddAllocationOrIncrementRefCount(
}
}

Status AllocationTracker::DecrementRefCount(
perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal) {
Status AllocationTracker::DecrementRefCount(se::DeviceMemoryBase device_memory,
int device_ordinal) {
AllocationMap& allocation_map = opaque_to_allocation_map_[device_ordinal];
auto it = allocation_map.find(device_memory.opaque());
TF_RET_CHECK(it != allocation_map.end());
@@ -77,7 +77,7 @@ class AllocationTracker {
// Data structure encapsulating single memory allocation on the device.
struct Allocation {
// The pointer to this allocation.
perftools::gputools::DeviceMemoryBase device_memory;
se::DeviceMemoryBase device_memory;

// The device that the memory is allocated on.
int device_ordinal;
@@ -103,13 +103,13 @@ class AllocationTracker {

// Adds the given device address to the allocation tracker, or if it already
// exists, then increment it's reference count.
void AddAllocationOrIncrementRefCount(
perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal)
void AddAllocationOrIncrementRefCount(se::DeviceMemoryBase device_memory,
int device_ordinal)
EXCLUSIVE_LOCKS_REQUIRED(mutex_);

// Decrements the reference count of the given device memory. Then, if it is
// zero, deallocate the memory.
Status DecrementRefCount(perftools::gputools::DeviceMemoryBase device_memory,
Status DecrementRefCount(se::DeviceMemoryBase device_memory,
int device_ordinal) EXCLUSIVE_LOCKS_REQUIRED(mutex_);

// A map from device memory opaque value to allocation. One such map is
@@ -36,19 +36,14 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace se = ::perftools::gputools;

namespace xla {

BackendOptions& BackendOptions::set_platform(
perftools::gputools::Platform* platform) {
BackendOptions& BackendOptions::set_platform(se::Platform* platform) {
platform_ = platform;
return *this;
}

perftools::gputools::Platform* BackendOptions::platform() const {
return platform_;
}
se::Platform* BackendOptions::platform() const { return platform_; }

BackendOptions& BackendOptions::set_intra_op_parallelism_threads(
int num_threads) {
@@ -77,7 +72,7 @@ struct Backend::EigenThreadPoolWrapper {

/* static */ StatusOr<std::unique_ptr<Backend>> Backend::CreateBackend(
const BackendOptions& options) {
perftools::gputools::Platform* platform = options.platform();
se::Platform* platform = options.platform();
TF_ASSIGN_OR_RETURN(auto compiler, Compiler::GetForPlatform(platform));
TF_ASSIGN_OR_RETURN(auto stream_executors,
PlatformUtil::GetStreamExecutors(platform));
@@ -121,7 +116,7 @@ StatusOr<Backend::StreamPtr> Backend::BorrowStream(
}

Backend::Backend(
perftools::gputools::Platform* platform, Compiler* compiler,
se::Platform* platform, Compiler* compiler,
tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors,
TransferManager* transfer_manager, ComputationPlacer* computation_placer,
int intra_op_parallelism_threads)
@@ -178,7 +173,7 @@ tensorflow::thread::ThreadPool* Backend::eigen_intra_op_thread_pool() const {
return intra_op_thread_pool_wrapper_->pool.get();
}

StatusOr<perftools::gputools::StreamExecutor*> Backend::stream_executor(
StatusOr<se::StreamExecutor*> Backend::stream_executor(
int device_ordinal) const {
if (device_ordinal < 0 ||
device_ordinal > stream_executors_.back()->device_ordinal()) {
@@ -201,9 +196,9 @@ StatusOr<bool> Backend::devices_equivalent(int device_ordinal_a,
// bit crude but works for GPUs which is the important case where we compile
// an executable for one GPU and want to know if it will run (well) on
// another.
TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * executor_a,
TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor_a,
stream_executor(device_ordinal_a));
TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * executor_b,
TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor_b,
stream_executor(device_ordinal_b));
return (executor_a->GetDeviceDescription().name() ==
executor_b->GetDeviceDescription().name());
@@ -44,8 +44,8 @@ namespace xla {
class BackendOptions {
public:
// Set the platform backing the backend, or nullptr for the default platform.
BackendOptions& set_platform(perftools::gputools::Platform* platform);
perftools::gputools::Platform* platform() const;
BackendOptions& set_platform(se::Platform* platform);
se::Platform* platform() const;

// Sets the thread pool size for parallel execution of an individual operator.
// The default value of -1 will result in initializing the thread pool with
@@ -54,7 +54,7 @@ class BackendOptions {
int intra_op_parallelism_threads() const;

private:
perftools::gputools::Platform* platform_ = nullptr;
se::Platform* platform_ = nullptr;
int intra_op_parallelism_threads_ = -1;
};

@@ -66,7 +66,7 @@ class BackendOptions {
// StreamPtr stream = backend->BorrowStream().ConsumeValueOrDie();
class Backend {
public:
using StreamPtr = Pool<perftools::gputools::Stream>::SmartPtr;
using StreamPtr = Pool<se::Stream>::SmartPtr;

// Creates a new backend.
static StatusOr<std::unique_ptr<Backend>> CreateBackend(
@@ -79,7 +79,7 @@ class Backend {
~Backend();

// Accessors for the various objects.
perftools::gputools::Platform* platform() const { return platform_; }
se::Platform* platform() const { return platform_; }
Compiler* compiler() const { return compiler_; }
DeviceMemoryAllocator* memory_allocator() const {
return memory_allocator_.get();
@@ -96,19 +96,17 @@ class Backend {

// Returns stream executors of all supported devices for this backend. The
// executors are ordered by the device ordinal.
const std::vector<perftools::gputools::StreamExecutor*>& stream_executors()
const {
const std::vector<se::StreamExecutor*>& stream_executors() const {
return stream_executors_;
}

// Returns the stream executor for the given device ordinal.
StatusOr<perftools::gputools::StreamExecutor*> stream_executor(
int device_ordinal) const;
StatusOr<se::StreamExecutor*> stream_executor(int device_ordinal) const;

// Returns the stream executor for the default device ordinal. This stream
// executor can only be used when the number of computations is 1 (replication
// can be > 1).
perftools::gputools::StreamExecutor* default_stream_executor() const {
se::StreamExecutor* default_stream_executor() const {
CHECK(!stream_executors_.empty());
return stream_executors_[0];
}
@@ -117,8 +115,7 @@ class Backend {
// internal pool, or by constructing/initializating it, and returns the result
// to the caller.
StatusOr<StreamPtr> BorrowStream(int device_ordinal);
StatusOr<StreamPtr> BorrowStream(
perftools::gputools::StreamExecutor* executor);
StatusOr<StreamPtr> BorrowStream(se::StreamExecutor* executor);

// Returns a function to borrow a stream, as `BorrowStream` above does.
// Purely for convenience, the caller could rather make this anonymous
@@ -157,29 +154,26 @@ class Backend {

private:
struct EigenThreadPoolWrapper;
Backend(perftools::gputools::Platform* platform, Compiler* compiler,
tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
stream_executors,
Backend(se::Platform* platform, Compiler* compiler,
tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors,
TransferManager* transfer_manager,
ComputationPlacer* computation_placer,
int intra_op_parallelism_threads);
Backend(const Backend&) = delete;
Backend& operator=(const Backend&) = delete;

perftools::gputools::Platform* platform_;
se::Platform* platform_;
Compiler* compiler_;
TransferManager* transfer_manager_;
ComputationPlacer* computation_placer_;

// Vector of stream executors. stream_executors_[0] is the default executor.
std::vector<perftools::gputools::StreamExecutor*> stream_executors_;
std::vector<se::StreamExecutor*> stream_executors_;

tensorflow::mutex mu_;

// Mapping from stream executor to stream pools, used by `BorrowStream` above.
std::map<perftools::gputools::StreamExecutor*,
Pool<perftools::gputools::Stream>>
stream_pools_ GUARDED_BY(mu_);
std::map<se::StreamExecutor*, Pool<se::Stream>> stream_pools_ GUARDED_BY(mu_);

// The default memory allocator to use.
std::unique_ptr<StreamExecutorMemoryAllocator> memory_allocator_;
@@ -37,7 +37,7 @@ limitations under the License.
namespace xla {

/* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
CompileOnlyService::NewService(perftools::gputools::Platform* platform) {
CompileOnlyService::NewService(se::Platform* platform) {
ServiceOptions default_options;
default_options.set_platform(platform);
return NewService(default_options);
@@ -45,7 +45,7 @@ CompileOnlyService::NewService(perftools::gputools::Platform* platform) {

/* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
CompileOnlyService::NewService(const ServiceOptions& options) {
perftools::gputools::Platform* platform = options.platform();
se::Platform* platform = options.platform();
if (platform == nullptr) {
TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
}
@@ -34,7 +34,7 @@ class CompileOnlyService : public Service {
// platform that the service should target. If platform is null then the
// default platform is used.
static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
perftools::gputools::Platform* platform);
se::Platform* platform);
static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
const ServiceOptions& options);
@@ -23,26 +23,21 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"

namespace se = ::perftools::gputools;

namespace xla {

/* static */ tensorflow::mutex Compiler::platform_compiler_mutex_(
tensorflow::LINKER_INITIALIZED);

/* static */ std::map<perftools::gputools::Platform::Id,
Compiler::CompilerFactory>*
/* static */ std::map<se::Platform::Id, Compiler::CompilerFactory>*
Compiler::GetPlatformCompilerFactories() {
static auto* r =
new std::map<perftools::gputools::Platform::Id, CompilerFactory>;
static auto* r = new std::map<se::Platform::Id, CompilerFactory>;
return r;
}

/* static */
std::map<perftools::gputools::Platform::Id, std::unique_ptr<Compiler>>*
std::map<se::Platform::Id, std::unique_ptr<Compiler>>*
Compiler::GetPlatformCompilers() {
static auto* r = new std::map<perftools::gputools::Platform::Id,
std::unique_ptr<Compiler>>;
static auto* r = new std::map<se::Platform::Id, std::unique_ptr<Compiler>>;
return r;
}
@@ -70,7 +70,7 @@ class AotCompilationOptions {
virtual ~AotCompilationOptions() = default;

// Returns the ID of the platform to which these options apply.
virtual perftools::gputools::Platform::Id PlatformId() const = 0;
virtual se::Platform::Id PlatformId() const = 0;

// Optional allocator that may be used for allocating temp space on the device
// during compilation.
@@ -109,7 +109,7 @@ class Compiler {
virtual ~Compiler() {}

// Returns the ID of the platform that this compiler targets.
virtual perftools::gputools::Platform::Id PlatformId() const = 0;
virtual se::Platform::Id PlatformId() const = 0;

// Runs Hlo passes to optimize the given Hlo module, returns the optimized
// module.
@@ -120,8 +120,7 @@ class Compiler {
// algorithm over those buffers, to see which variant is fastest. Any space
// allocated should be deallocated before this function returns.
virtual StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* executor,
std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
DeviceMemoryAllocator* device_allocator) = 0;

// Compiles the HLO module for execution on a device given by the executor,
@@ -137,8 +136,7 @@ class Compiler {
//
// Use the overload below to compile computations that run in parallel.
virtual StatusOr<std::unique_ptr<Executable>> RunBackend(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* executor,
std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
DeviceMemoryAllocator* device_allocator) = 0;

// Compiles a set of HLO modules that can run in parallel, potentially
@@ -151,8 +149,7 @@ class Compiler {
// modules to RunHloPasses and RunBackends.
virtual StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
std::vector<std::unique_ptr<HloModule>> modules,
std::vector<std::vector<perftools::gputools::StreamExecutor*>>
stream_exec,
std::vector<std::vector<se::StreamExecutor*>> stream_exec,
DeviceMemoryAllocator* device_allocator) = 0;

// Compiles the HLO module for ahead-of-time execution. This is intended for
@@ -171,14 +168,12 @@ class Compiler {
// be a singleton, so no ownership is transferred.
//
// Precondition: a platform kind must not be registered more than once.
static void RegisterCompilerFactory(
perftools::gputools::Platform::Id platform_id,
CompilerFactory compiler_factory);
static void RegisterCompilerFactory(se::Platform::Id platform_id,
CompilerFactory compiler_factory);

// Returns the compiler singleton pointer if it is available for the given
// platform, or an error status if it is not.
static StatusOr<Compiler*> GetForPlatform(
const perftools::gputools::Platform* platform);
static StatusOr<Compiler*> GetForPlatform(const se::Platform* platform);

// Returns a function that computes the size in bytes of the logical
// buffer that contains a shape.
@@ -198,12 +193,12 @@ class Compiler {
static tensorflow::mutex platform_compiler_mutex_;

// Map from platform kind to compiler factory.
static std::map<perftools::gputools::Platform::Id, CompilerFactory>*
static std::map<se::Platform::Id, CompilerFactory>*
GetPlatformCompilerFactories();

// Map from platform kind to compiler instance, if we made one already (based
// on the factories above).
static std::map<perftools::gputools::Platform::Id, std::unique_ptr<Compiler>>*
static std::map<se::Platform::Id, std::unique_ptr<Compiler>>*
GetPlatformCompilers();
};
@@ -32,8 +32,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace se = ::perftools::gputools;

namespace xla {

Status DeviceAssignment::Serialize(DeviceAssignmentProto* proto) const {
@@ -132,11 +130,9 @@ StatusOr<DeviceAssignment> ComputationPlacer::AssignDevices(
ComputationPlacer::platform_computation_placer_mutex_(
tensorflow::LINKER_INITIALIZED);

/* static */ std::map<perftools::gputools::Platform::Id,
ComputationPlacer::State>*
/* static */ std::map<se::Platform::Id, ComputationPlacer::State>*
ComputationPlacer::GetPlatformComputationPlacers() {
static auto* r =
new std::map<perftools::gputools::Platform::Id, ComputationPlacer::State>;
static auto* r = new std::map<se::Platform::Id, ComputationPlacer::State>;
return r;
}

@@ -147,10 +143,10 @@ static std::unique_ptr<xla::ComputationPlacer> CreateComputationPlacer() {
}

static bool InitModule() {
xla::ComputationPlacer::RegisterComputationPlacer(se::host::kHostPlatformId,
&CreateComputationPlacer);
xla::ComputationPlacer::RegisterComputationPlacer(se::cuda::kCudaPlatformId,
&CreateComputationPlacer);
xla::ComputationPlacer::RegisterComputationPlacer(
stream_executor::host::kHostPlatformId, &CreateComputationPlacer);
xla::ComputationPlacer::RegisterComputationPlacer(
stream_executor::cuda::kCudaPlatformId, &CreateComputationPlacer);
return true;
}
static bool module_initialized = InitModule();
@@ -80,13 +80,13 @@ class ComputationPlacer {

// Registers a computation placer creation function for a particular platform.
static void RegisterComputationPlacer(
perftools::gputools::Platform::Id platform_id,
se::Platform::Id platform_id,
ComputationPlacerCreationFunction creation_function);

// Returns the computation placer singleton pointer if it is available for the
// given platform, or an error status if it is not.
static StatusOr<ComputationPlacer*> GetForPlatform(
const perftools::gputools::Platform* platform);
const se::Platform* platform);

private:
// The mutex that guards the platform-to-computation placer map.
@@ -101,10 +101,9 @@ class ComputationPlacer {
};

// Map from platform kind to computation placer singleton.
static std::map<perftools::gputools::Platform::Id, State>*
GetPlatformComputationPlacers();
static std::map<se::Platform::Id, State>* GetPlatformComputationPlacers();

perftools::gputools::Platform::Id platform_id_;
se::Platform::Id platform_id_;

TF_DISALLOW_COPY_AND_ASSIGN(ComputationPlacer);
};
@@ -100,8 +100,6 @@ limitations under the License.
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace cpu {

@@ -440,8 +438,7 @@ Status VerifyLlvmModule(const llvm::Module& llvm_module) {
} // namespace

StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* /*stream_exec*/,
std::unique_ptr<HloModule> module, se::StreamExecutor* /*stream_exec*/,
DeviceMemoryAllocator* /*device_allocator*/) {
VLOG(2) << "Before optimization:";
XLA_VLOG_LINES(2, module->ToString());
@@ -454,8 +451,7 @@ StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
}

StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* stream_exec,
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* /*device_allocator*/) {
const string timer_message =
"Compiling [" + module->name() + "] for CPU using JIT";
@@ -938,9 +934,9 @@ HloCostAnalysis::ShapeSizeFunction CpuCompiler::ShapeSizeBytesFunction() const {
} // namespace xla

static bool InitModule() {
xla::Compiler::RegisterCompilerFactory(se::host::kHostPlatformId, []() {
return xla::MakeUnique<xla::cpu::CpuCompiler>();
});
xla::Compiler::RegisterCompilerFactory(
stream_executor::host::kHostPlatformId,
[]() { return xla::MakeUnique<xla::cpu::CpuCompiler>(); });
return true;
}
static bool module_initialized = InitModule();
@@ -53,7 +53,7 @@ class CpuAotCompilationOptions : public AotCompilationOptions {
RelocationModel relocation_model);
~CpuAotCompilationOptions() override;

perftools::gputools::Platform::Id PlatformId() const override;
se::Platform::Id PlatformId() const override;

// The triple used for compilation, similar to clang's -target flag.
const string& triple() const { return triple_; }
@@ -112,25 +112,23 @@ class CpuCompiler : public LLVMCompiler {
// Bring in
// StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
// std::vector<std::unique_ptr<HloModule>> modules,
// std::vector<std::vector<perftools::gputools::StreamExecutor*>>
// std::vector<std::vector<se::StreamExecutor*>>
// stream_execs)
using LLVMCompiler::Compile;

StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* stream_exec,
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) override;

StatusOr<std::unique_ptr<Executable>> RunBackend(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* stream_exec,
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) override;

StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
const AotCompilationOptions& options) override;

perftools::gputools::Platform::Id PlatformId() const override;
se::Platform::Id PlatformId() const override;

HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override;
@@ -45,8 +45,6 @@ limitations under the License.
#include "tensorflow/core/platform/types.h"
#include "tensorflow/stream_executor/host/host_stream.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace cpu {

@@ -75,7 +73,7 @@ CpuExecutable::CpuExecutable(

Status CpuExecutable::AllocateBuffers(
DeviceMemoryAllocator* memory_allocator, int device_ordinal,
std::vector<perftools::gputools::DeviceMemoryBase>* buffers) {
std::vector<se::DeviceMemoryBase>* buffers) {
CHECK_EQ(buffers->size(), assignment_->Allocations().size());
VLOG(3) << "Allocating " << assignment_->Allocations().size()
<< " allocations for module " << module().name();
@@ -247,8 +245,7 @@ static Status DeallocateTempBuffers(

StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::CreateResultShapedBuffer(
const ServiceExecutableRunOptions* run_options,
tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
allocated_buffers,
tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> allocated_buffers,
std::vector<bool>* buffers_in_result) {
se::Stream* stream = run_options->stream();
auto result_buffer = MakeUnique<ShapedBuffer>(
@@ -322,7 +319,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteAsyncOnStream(
"supported on CPU.");
}

auto* host_stream = dynamic_cast<perftools::gputools::host::HostStream*>(
auto* host_stream = dynamic_cast<se::host::HostStream*>(
run_options->stream()->implementation());
se::Stream* stream = run_options->stream();
DeviceMemoryAllocator* memory_allocator = run_options->allocator();
@@ -90,17 +90,16 @@ class CpuExecutable : public Executable {
// assignment. Each vector element corresponds to a particular Index. If
// a vector element already contains a non-null DeviceMemoryBase, then no
// buffer is assigned for this element.
Status AllocateBuffers(
DeviceMemoryAllocator* memory_allocator, int device_ordinal,
std::vector<perftools::gputools::DeviceMemoryBase>* buffers);
Status AllocateBuffers(DeviceMemoryAllocator* memory_allocator,
int device_ordinal,
std::vector<se::DeviceMemoryBase>* buffers);

// Calls the generated function performing the computation with the given
// arguments using the supplied buffers.
Status ExecuteComputeFunction(
const ExecutableRunOptions* run_options,
tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
buffers,
tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
HloExecutionProfile* hlo_execution_profile);

// Create a ShapedBuffer for holding the result of the computation. The
@@ -111,8 +110,7 @@ class CpuExecutable : public Executable {
// the returned ShapedBuffer).
StatusOr<std::unique_ptr<ShapedBuffer>> CreateResultShapedBuffer(
const ServiceExecutableRunOptions* run_options,
tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
allocated_buffers,
tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> allocated_buffers,
std::vector<bool>* buffers_in_result);

// Returns the points-to set of the root instruction of the entry
@@ -34,8 +34,6 @@ limitations under the License.
#include "tensorflow/core/platform/notification.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace se = ::perftools::gputools;

namespace xla {

namespace {
@@ -241,21 +239,20 @@ Status CpuTransferManager::TransferLiteralFromOutfeed(
}

StatusOr<Shape> CpuTransferManager::TransferTupleBuffersFromOutfeed(
perftools::gputools::StreamExecutor* executor,
se::StreamExecutor* executor,
tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data) {
return TransferBuffersFromOutfeedInternal(executor, buffer_data,
/*is_tuple=*/true);
}

StatusOr<Shape> CpuTransferManager::TransferArrayBufferFromOutfeed(
perftools::gputools::StreamExecutor* executor, void* destination,
int64 size_bytes) {
se::StreamExecutor* executor, void* destination, int64 size_bytes) {
return TransferBuffersFromOutfeedInternal(
executor, {{destination, size_bytes}}, /*is_tuple=*/false);
}

StatusOr<Shape> CpuTransferManager::TransferBuffersFromOutfeedInternal(
perftools::gputools::StreamExecutor* executor,
se::StreamExecutor* executor,
tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data,
bool is_tuple) {
std::vector<std::unique_ptr<CpuOutfeedBuffer>> buffers;
@@ -306,8 +303,8 @@ static std::unique_ptr<xla::TransferManager> CreateCpuTransferManager() {
}

static bool InitModule() {
xla::TransferManager::RegisterTransferManager(se::host::kHostPlatformId,
&CreateCpuTransferManager);
xla::TransferManager::RegisterTransferManager(
stream_executor::host::kHostPlatformId, &CreateCpuTransferManager);
return true;
}
static bool module_initialized = InitModule();
@@ -37,36 +37,35 @@ class CpuTransferManager : public GenericTransferManager {
CpuTransferManager();
~CpuTransferManager() override {}

Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
Status TransferLiteralToInfeed(se::StreamExecutor* executor,
const Literal& literal) override;
Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
int64 size, const void* source) override;
Status TransferLiteralFromOutfeed(
perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
Literal* literal) override;
Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size,
const void* source) override;
Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
const Shape& literal_shape,
Literal* literal) override;

private:
// Transfers infeed data to device. InfeedBuffer->Done() must be
// called to clean up the memory allocated for InfeedBuffer.
StatusOr<cpu::runtime::XfeedBuffer*> TransferBufferToInfeedInternal(
perftools::gputools::StreamExecutor* executor, int64 size,
const void* source);
se::StreamExecutor* executor, int64 size, const void* source);

// Helper that transfers a tuple of element buffers from the device's outfeed.
StatusOr<Shape> TransferTupleBuffersFromOutfeed(
perftools::gputools::StreamExecutor* executor,
se::StreamExecutor* executor,
tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data);

// Helper that transfers an array buffer from the device's outfeed.
StatusOr<Shape> TransferArrayBufferFromOutfeed(
perftools::gputools::StreamExecutor* executor, void* destination,
int64 size_bytes);
StatusOr<Shape> TransferArrayBufferFromOutfeed(se::StreamExecutor* executor,
void* destination,
int64 size_bytes);

// On success, returns the shape that was transferred from the outfeed -- if
// is_tuple is true, the returned shape will be a tuple of the returned shapes
// for the given buffers.
StatusOr<Shape> TransferBuffersFromOutfeedInternal(
perftools::gputools::StreamExecutor* executor,
se::StreamExecutor* executor,
tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data,
bool is_tuple);
@@ -49,8 +49,6 @@ limitations under the License.
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/types.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace cpu {

@@ -325,7 +323,7 @@ const void** Executor::GetOperandBuffers(HloInstruction* instruction) {

Status ParallelCpuExecutable::AllocateBuffers(
DeviceMemoryAllocator* memory_allocator, int device_ordinal,
std::vector<perftools::gputools::DeviceMemoryBase>* buffers) {
std::vector<se::DeviceMemoryBase>* buffers) {
CHECK_EQ(buffers->size(), assignment_->Allocations().size());
VLOG(3) << "Allocating " << assignment_->Allocations().size()
<< " allocations for module " << module().name();
@@ -89,17 +89,16 @@ class ParallelCpuExecutable : public Executable {
// assignment. Each vector element corresponds to a particular Index. If
// a vector element already contains a non-null DeviceMemoryBase, then no
// buffer is assigned for this element.
Status AllocateBuffers(
DeviceMemoryAllocator* memory_allocator, int device_ordinal,
std::vector<perftools::gputools::DeviceMemoryBase>* buffers);
Status AllocateBuffers(DeviceMemoryAllocator* memory_allocator,
int device_ordinal,
std::vector<se::DeviceMemoryBase>* buffers);

// Calls the generated functions in 'function_names_', performing the
// computation with the given arguments using the supplied buffers.
Status ExecuteComputeFunctions(
const ServiceExecutableRunOptions* run_options,
tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
buffers,
tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
HloExecutionProfile* hlo_execution_profile);

// Returns the points-to set of the root instruction of the entry
@@ -24,19 +24,16 @@ limitations under the License.
namespace xla {

StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator(
const perftools::gputools::Platform* platform,
tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
stream_executors)
const se::Platform* platform,
tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors)
: DeviceMemoryAllocator(platform),
stream_executors_(stream_executors.begin(), stream_executors.end()) {}

StatusOr<perftools::gputools::DeviceMemoryBase>
StreamExecutorMemoryAllocator::Allocate(int device_ordinal, uint64 size,
bool retry_on_failure) {
TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * stream_executor,
StatusOr<se::DeviceMemoryBase> StreamExecutorMemoryAllocator::Allocate(
int device_ordinal, uint64 size, bool retry_on_failure) {
TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
GetStreamExecutor(device_ordinal));
perftools::gputools::DeviceMemoryBase result =
stream_executor->AllocateArray<uint8>(size);
se::DeviceMemoryBase result = stream_executor->AllocateArray<uint8>(size);
if (size > 0 && result == nullptr) {
return ResourceExhausted(
"Failed to allocate request for %s (%lluB) on device ordinal %d",
@@ -47,22 +44,22 @@ StreamExecutorMemoryAllocator::Allocate(int device_ordinal, uint64 size,
}

tensorflow::Status StreamExecutorMemoryAllocator::Deallocate(
int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) {
int device_ordinal, se::DeviceMemoryBase* mem) {
if (!mem->is_null()) {
TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * stream_executor,
TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
GetStreamExecutor(device_ordinal));
// We make a local copy of 'mem' so the original is not zeroed out by the
// Deallocate() call below. This gives us a better chance of
// catching double-free bugs, since Deallocate silently succeeds for null
// values.
perftools::gputools::DeviceMemoryBase mem_copy(*mem);
se::DeviceMemoryBase mem_copy(*mem);
stream_executor->Deallocate(&mem_copy);
}
return tensorflow::Status::OK();
}

StatusOr<perftools::gputools::StreamExecutor*>
StreamExecutorMemoryAllocator::GetStreamExecutor(int device_ordinal) {
StatusOr<se::StreamExecutor*> StreamExecutorMemoryAllocator::GetStreamExecutor(
int device_ordinal) {
if (device_ordinal < 0) {
return InvalidArgument("device ordinal value (%d) must be non-negative",
device_ordinal);
@@ -33,7 +33,7 @@ class DeviceMemoryAllocator {
public:
// Parameter platform indicates which platform the allocator allocates memory
// on. Must be non-null.
explicit DeviceMemoryAllocator(const perftools::gputools::Platform* platform)
explicit DeviceMemoryAllocator(const se::Platform* platform)
: platform_(platform) {}
virtual ~DeviceMemoryAllocator() {}

@@ -43,20 +43,20 @@ class DeviceMemoryAllocator {
// has only performance impact.
// Allocate() should return a null pointer for a size-0 allocation.
// Deallocate() must be a no-op for null pointers.
virtual StatusOr<perftools::gputools::DeviceMemoryBase> Allocate(
virtual StatusOr<se::DeviceMemoryBase> Allocate(
int device_ordinal, uint64 size, bool retry_on_failure = true) = 0;
virtual tensorflow::Status Deallocate(
int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) = 0;
virtual tensorflow::Status Deallocate(int device_ordinal,
se::DeviceMemoryBase* mem) = 0;

// Return the platform that the allocator allocates memory on.
const perftools::gputools::Platform* platform() const { return platform_; }
const se::Platform* platform() const { return platform_; }

// Can we call Deallocate() as soon as a computation has been scheduled on
// a stream, or do we have to wait for the computation to complete first?
virtual bool AllowsAsynchronousDeallocation() const = 0;

protected:
const perftools::gputools::Platform* platform_;
const se::Platform* platform_;
};

// Default memory allocator for a platform which uses
@@ -64,25 +64,23 @@ class DeviceMemoryAllocator {
class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator {
public:
StreamExecutorMemoryAllocator(
const perftools::gputools::Platform* platform,
tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
stream_executors);
const se::Platform* platform,
tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors);

StatusOr<perftools::gputools::DeviceMemoryBase> Allocate(
StatusOr<se::DeviceMemoryBase> Allocate(
int device_ordinal, uint64 size, bool retry_on_failure = true) override;
tensorflow::Status Deallocate(
int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) override;
tensorflow::Status Deallocate(int device_ordinal,
se::DeviceMemoryBase* mem) override;

bool AllowsAsynchronousDeallocation() const override;

private:
StatusOr<perftools::gputools::StreamExecutor*> GetStreamExecutor(
int device_ordinal);
StatusOr<se::StreamExecutor*> GetStreamExecutor(int device_ordinal);

// A vector indexed by device ordinal of StreamExecutors for each device of
// the allocator's platform type. If an element is nullptr, then the device
// with the respective device ordinal is not supported by XLA.
std::vector<perftools::gputools::StreamExecutor*> stream_executors_;
std::vector<se::StreamExecutor*> stream_executors_;
};

} // namespace xla
@@ -61,10 +61,10 @@ Executable::ExecuteOnStreams(
StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper(
const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile,
ArraySlice<const ShapedBuffer*> arguments) {
perftools::gputools::Stream* stream = run_options->stream();
std::unique_ptr<perftools::gputools::Timer> timer;
se::Stream* stream = run_options->stream();
std::unique_ptr<se::Timer> timer;
if (profile != nullptr) {
timer.reset(new perftools::gputools::Timer(stream->parent()));
timer.reset(new se::Timer(stream->parent()));
stream->InitTimer(timer.get()).ThenStartTimer(timer.get());
}

@@ -90,7 +90,7 @@ class Executable {
// has completed.
virtual Status PopulateExecutionProfile(
HloExecutionProfile* hlo_execution_profile,
perftools::gputools::StreamExecutor* executor) {
se::StreamExecutor* executor) {
return Status::OK();
}
@@ -32,8 +32,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace se = ::perftools::gputools;

namespace xla {

GenericTransferManager::GenericTransferManager(se::Platform::Id platform_id,
@@ -45,9 +43,9 @@ se::Platform::Id GenericTransferManager::PlatformId() const {
}

Status GenericTransferManager::WriteSingleTupleIndexTable(
perftools::gputools::StreamExecutor* executor,
se::StreamExecutor* executor,
tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> elements,
const Shape& shape, perftools::gputools::DeviceMemoryBase* region) {
const Shape& shape, se::DeviceMemoryBase* region) {
TF_RET_CHECK(elements.size() == ShapeUtil::TupleElementCount(shape));

std::vector<const void*> element_pointers;
@@ -144,20 +142,19 @@ Status GenericTransferManager::TransferLiteralToInfeed(
}

Status GenericTransferManager::TransferBufferToInfeed(
perftools::gputools::StreamExecutor* executor, int64 size,
const void* source) {
se::StreamExecutor* executor, int64 size, const void* source) {
return Unimplemented("Generic transfer to Infeed");
}

Status GenericTransferManager::TransferLiteralFromOutfeed(
perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
se::StreamExecutor* executor, const Shape& literal_shape,
Literal* literal) {
return Unimplemented(
"Outfeed is not supported on this platform (b/30467474)");
}

Status GenericTransferManager::ResetDevices(
tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
tensorflow::gtl::ArraySlice<se::StreamExecutor*>
/*executors*/) {
return Unimplemented(
"Device reset is not yet supported on this platform (b/30481585)");
@@ -36,46 +36,41 @@ namespace xla {
// infeed.
class GenericTransferManager : public TransferManager {
public:
GenericTransferManager(perftools::gputools::Platform::Id platform_id,
size_t pointer_size);
GenericTransferManager(se::Platform::Id platform_id, size_t pointer_size);
~GenericTransferManager() override {}

perftools::gputools::Platform::Id PlatformId() const override;
se::Platform::Id PlatformId() const override;

StatusOr<std::unique_ptr<Literal>> TransferLiteralFromDevice(
perftools::gputools::StreamExecutor* executor,
const ShapedBuffer& device_buffer) override;
se::StreamExecutor* executor, const ShapedBuffer& device_buffer) override;

Status TransferLiteralToDevice(perftools::gputools::StreamExecutor* executor,
Status TransferLiteralToDevice(se::StreamExecutor* executor,
const Literal& literal,
const ShapedBuffer& device_buffer) override;

Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
Status TransferLiteralToInfeed(se::StreamExecutor* executor,
const Literal& literal) override;
Status TransferLiteralFromOutfeed(
perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
Literal* literal) override;
Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
const Shape& literal_shape,
Literal* literal) override;

Status ResetDevices(
tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
executors) override;
tensorflow::gtl::ArraySlice<se::StreamExecutor*> executors) override;

int64 GetByteSizeRequirement(const Shape& shape) const override;

protected:
Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
int64 size, const void* source) override;
Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size,
const void* source) override;

Status WriteSingleTupleIndexTable(
perftools::gputools::StreamExecutor* executor,
tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
elements,
const Shape& shape,
perftools::gputools::DeviceMemoryBase* region) override;
se::StreamExecutor* executor,
tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> elements,
const Shape& shape, se::DeviceMemoryBase* region) override;

private:
// The platform this transfer manager targets.
const perftools::gputools::Platform::Id platform_id_;
const se::Platform::Id platform_id_;

// The size in bytes of pointers on this platform.
const size_t pointer_size_;
@@ -28,8 +28,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {

@@ -41,7 +41,7 @@ class BufferAllocations {
// user-specified result buffers) to the given buffer index. The builder
// will skip allocating buffers for registered buffer indices.
void RegisterBuffer(BufferAllocation::Index index,
perftools::gputools::DeviceMemoryBase address);
se::DeviceMemoryBase address);

// Builds a BufferAllocations object from the given buffer assignment.
// `memory_allocator` is what this function uses to allocate device memory.
@@ -52,8 +52,7 @@ class BufferAllocations {
DeviceMemoryAllocator* memory_allocator);

private:
std::map<BufferAllocation::Index, perftools::gputools::DeviceMemoryBase>
registered_buffers_;
std::map<BufferAllocation::Index, se::DeviceMemoryBase> registered_buffers_;
};

BufferAllocations(const BufferAllocations&) = delete;
@@ -65,22 +64,20 @@ class BufferAllocations {
// Returns the device address of buffer `buffer_index`. `buffer_index` must be
// a valid index, i.e., in [0, buffer_count). This function returns null if
// `buffer_index` is not assigned to a buffer address.
perftools::gputools::DeviceMemoryBase GetDeviceAddress(
se::DeviceMemoryBase GetDeviceAddress(
BufferAllocation::Index buffer_index) const;

// Same as above, but also adjusts the returned address for the offset and
// size contained in the given slice.
perftools::gputools::DeviceMemoryBase GetDeviceAddress(
se::DeviceMemoryBase GetDeviceAddress(
const BufferAllocation::Slice& buffer_slice) const;

perftools::gputools::DeviceMemoryBase GetTempBufferBase() const {
return temp_buffer_base_;
}
se::DeviceMemoryBase GetTempBufferBase() const { return temp_buffer_base_; }

// Tears down all buffers allocated by this object that are not in
// `live_addresses`.
tensorflow::Status TearDown(
const std::set<perftools::gputools::DeviceMemoryBase>& live_addresses,
const std::set<se::DeviceMemoryBase>& live_addresses,
const BufferAssignment& buffer_assignment);

private:
@@ -92,15 +89,15 @@ class BufferAllocations {

// Sets the device address of buffer `buffer_index`.
void SetBuffer(BufferAllocation::Index buffer_index,
perftools::gputools::DeviceMemoryBase buffer);
se::DeviceMemoryBase buffer);

// An array of device pointers that stores the address of each buffer
// indexed by Index. Each element can point to a temporary buffer, an
// input buffer, or nullptr if no buffer is needed for that Index.
std::vector<perftools::gputools::DeviceMemoryBase> buffers_;
std::vector<se::DeviceMemoryBase> buffers_;

// The base address of the memory block that contains all temporary buffers.
perftools::gputools::DeviceMemoryBase temp_buffer_base_;
se::DeviceMemoryBase temp_buffer_base_;

int device_ordinal_;
@ -42,11 +42,10 @@ Status ConditionalThunk::Initialize(const GpuExecutable& executable) {
|
||||
}
|
||||
|
||||
Status ConditionalThunk::ExecuteOnStream(
|
||||
const BufferAllocations& buffer_allocations,
|
||||
perftools::gputools::Stream* stream) {
|
||||
const BufferAllocations& buffer_allocations, se::Stream* stream) {
|
||||
// Copy the predicate value from device.
|
||||
bool predicate;
|
||||
perftools::gputools::DeviceMemoryBase predicate_address =
|
||||
se::DeviceMemoryBase predicate_address =
|
||||
buffer_allocations.GetDeviceAddress(predicate_buffer_index_);
|
||||
stream->ThenMemcpy(&predicate, predicate_address, sizeof(bool));
|
||||
|
||||
|
@ -49,7 +49,7 @@ class ConditionalThunk : public Thunk {
|
||||
|
||||
Status Initialize(const GpuExecutable& executable) override;
|
||||
Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
|
||||
perftools::gputools::Stream* stream) override;
|
||||
se::Stream* stream) override;
|
||||
|
||||
private:
|
||||
BufferAllocation::Slice predicate_buffer_index_;
@ -25,8 +25,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {

@ -66,23 +66,21 @@ class ConvolutionThunk : public Thunk {

// Does the convolution for the thunk on "stream".
Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;

private:
class ScratchAllocator;

Status Convolve(
const perftools::gputools::dnn::BatchDescriptor& input_descriptor,
perftools::gputools::DeviceMemory<float> input_data,
const perftools::gputools::dnn::FilterDescriptor& filter_descriptor,
perftools::gputools::DeviceMemory<float> filter_data,
const perftools::gputools::dnn::BatchDescriptor& output_descriptor,
perftools::gputools::DeviceMemory<float> output_data,
const perftools::gputools::dnn::ConvolutionDescriptor&
convolution_descriptor,
const perftools::gputools::dnn::AlgorithmConfig& algorithm_config,
perftools::gputools::Stream* stream, ScratchAllocator* scratch_allocator,
perftools::gputools::dnn::ProfileResult* profile_result);
Status Convolve(const se::dnn::BatchDescriptor& input_descriptor,
se::DeviceMemory<float> input_data,
const se::dnn::FilterDescriptor& filter_descriptor,
se::DeviceMemory<float> filter_data,
const se::dnn::BatchDescriptor& output_descriptor,
se::DeviceMemory<float> output_data,
const se::dnn::ConvolutionDescriptor& convolution_descriptor,
const se::dnn::AlgorithmConfig& algorithm_config,
se::Stream* stream, ScratchAllocator* scratch_allocator,
se::dnn::ProfileResult* profile_result);

const CudnnConvKind convolution_kind_;

@ -30,9 +30,8 @@ HostToDeviceCopyThunk::HostToDeviceCopyThunk(
mem_size_(mem_size) {}

tensorflow::Status HostToDeviceCopyThunk::ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) {
perftools::gputools::DeviceMemoryBase destination_data =
const BufferAllocations& buffer_allocations, se::Stream* stream) {
se::DeviceMemoryBase destination_data =
buffer_allocations.GetDeviceAddress(destination_buffer_);
stream->ThenMemcpy(&destination_data, source_address_, mem_size_);
return tensorflow::Status::OK();

@ -48,11 +47,10 @@ DeviceToDeviceCopyThunk::DeviceToDeviceCopyThunk(
mem_size_(mem_size) {}

tensorflow::Status DeviceToDeviceCopyThunk::ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) {
perftools::gputools::DeviceMemoryBase destination_data =
const BufferAllocations& buffer_allocations, se::Stream* stream) {
se::DeviceMemoryBase destination_data =
buffer_allocations.GetDeviceAddress(destination_buffer_);
perftools::gputools::DeviceMemoryBase source_data =
se::DeviceMemoryBase source_data =
buffer_allocations.GetDeviceAddress(source_buffer_);
stream->ThenMemcpy(&destination_data, source_data, mem_size_);
return tensorflow::Status::OK();

@ -40,8 +40,7 @@ class HostToDeviceCopyThunk : public Thunk {
HostToDeviceCopyThunk& operator=(const HostToDeviceCopyThunk&) = delete;

tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;

private:
const void* source_address_;

@ -64,8 +63,7 @@ class DeviceToDeviceCopyThunk : public Thunk {
DeviceToDeviceCopyThunk& operator=(const DeviceToDeviceCopyThunk&) = delete;

tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;

private:
const BufferAllocation::Slice source_buffer_;
@ -28,7 +28,6 @@ limitations under the License.
namespace xla {
namespace gpu {

namespace se = ::perftools::gputools;
namespace dnn = se::dnn;

static std::pair<dnn::BatchDescriptor /*input_desc*/,

@ -60,7 +60,7 @@ class CudnnBatchNormForwardInferenceThunk : public Thunk {
const CudnnBatchNormForwardInferenceThunk&) = delete;

Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;

private:
BufferAllocation::Slice operand_;

@ -90,7 +90,7 @@ class CudnnBatchNormForwardTrainingThunk : public Thunk {
const CudnnBatchNormForwardTrainingThunk&) = delete;

Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;

private:
BufferAllocation::Slice operand_;

@ -123,7 +123,7 @@ class CudnnBatchNormBackwardThunk : public Thunk {
delete;

Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;

private:
BufferAllocation::Slice operand_;

@ -24,8 +24,6 @@ namespace xla {
namespace gpu {
namespace {

namespace se = perftools::gputools;

using se::DeviceMemoryBase;
using se::dnn::AlgorithmConfig;
using se::dnn::AlgorithmDesc;

@ -33,9 +33,8 @@ class CudnnConvolutionAlgorithmPicker : public HloPassInterface {
// If the `allocator` parameter is not null, we will use it to allocate temp
// memory while timing the various convolution algorithms. If it's null,
// we'll use the default allocator on the StreamExecutor.
CudnnConvolutionAlgorithmPicker(
perftools::gputools::StreamExecutor* stream_exec,
DeviceMemoryAllocator* allocator)
CudnnConvolutionAlgorithmPicker(se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* allocator)
: stream_exec_(stream_exec), allocator_(allocator) {}

tensorflow::StringPiece name() const override {

@ -52,7 +51,7 @@ class CudnnConvolutionAlgorithmPicker : public HloPassInterface {
const Shape& output_shape, const Window& window,
const ConvolutionDimensionNumbers& dnums, HloInstruction* instr);

perftools::gputools::StreamExecutor* stream_exec_;  // never null
se::StreamExecutor* stream_exec_;  // never null
DeviceMemoryAllocator* allocator_;  // may be null
};

@ -22,8 +22,6 @@ namespace xla {
namespace gpu {
namespace {

namespace se = ::perftools::gputools;

using se::DeviceMemory;
using se::DeviceMemoryBase;
using se::Stream;

@ -215,14 +213,12 @@ string CudnnConvKindToString(CudnnConvKind kind) {

Status RunCudnnConvolution(
CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
perftools::gputools::DeviceMemoryBase filter_buf,
perftools::gputools::DeviceMemoryBase output_buf,
perftools::gputools::DeviceMemoryBase scratch_buf, const Window& window,
const Shape& output_shape, se::DeviceMemoryBase input_buf,
se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
se::DeviceMemoryBase scratch_buf, const Window& window,
const ConvolutionDimensionNumbers& dnums,
perftools::gputools::dnn::AlgorithmConfig algorithm,
perftools::gputools::Stream* stream,
perftools::gputools::dnn::ProfileResult* profile_result) {
se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
se::dnn::ProfileResult* profile_result) {
ScratchBufAllocator scratch_allocator(scratch_buf);
return RunCudnnConvolution(kind, input_shape, filter_shape, output_shape,
input_buf, filter_buf, output_buf,

@ -232,14 +228,12 @@ Status RunCudnnConvolution(

Status RunCudnnConvolution(
CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
perftools::gputools::DeviceMemoryBase filter_buf,
perftools::gputools::DeviceMemoryBase output_buf,
perftools::gputools::ScratchAllocator* scratch_allocator,
const Window& window, const ConvolutionDimensionNumbers& dnums,
perftools::gputools::dnn::AlgorithmConfig algorithm,
perftools::gputools::Stream* stream,
perftools::gputools::dnn::ProfileResult* profile_result) {
const Shape& output_shape, se::DeviceMemoryBase input_buf,
se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
se::ScratchAllocator* scratch_allocator, const Window& window,
const ConvolutionDimensionNumbers& dnums,
se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
se::dnn::ProfileResult* profile_result) {
PrimitiveType output_primitive_type = output_shape.element_type();
CHECK(output_primitive_type == F32 || output_primitive_type == F16)
<< ShapeUtil::HumanString(output_shape);
@ -72,25 +72,21 @@ string CudnnConvKindToString(CudnnConvKind kind);
// that size, if you like.
Status RunCudnnConvolution(
CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
perftools::gputools::DeviceMemoryBase filter_buf,
perftools::gputools::DeviceMemoryBase output_buf,
perftools::gputools::DeviceMemoryBase scratch_buf, const Window& window,
const Shape& output_shape, se::DeviceMemoryBase input_buf,
se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
se::DeviceMemoryBase scratch_buf, const Window& window,
const ConvolutionDimensionNumbers& dnums,
perftools::gputools::dnn::AlgorithmConfig algorithm,
perftools::gputools::Stream* stream,
perftools::gputools::dnn::ProfileResult* profile_result = nullptr);
se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
se::dnn::ProfileResult* profile_result = nullptr);

Status RunCudnnConvolution(
CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
perftools::gputools::DeviceMemoryBase filter_buf,
perftools::gputools::DeviceMemoryBase output_buf,
perftools::gputools::ScratchAllocator* scratch_allocator,
const Window& window, const ConvolutionDimensionNumbers& dnums,
perftools::gputools::dnn::AlgorithmConfig algorithm,
perftools::gputools::Stream* stream,
perftools::gputools::dnn::ProfileResult* profile_result = nullptr);
const Shape& output_shape, se::DeviceMemoryBase input_buf,
se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
se::ScratchAllocator* scratch_allocator, const Window& window,
const ConvolutionDimensionNumbers& dnums,
se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
se::dnn::ProfileResult* profile_result = nullptr);

} // namespace gpu
} // namespace xla

@ -24,8 +24,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {

@ -34,24 +34,24 @@ namespace gpu {
// released on destruction.
//
// Not thread-safe in that AllocateBytes, destructor are not locked.
class FftScratchAllocator : public perftools::gputools::ScratchAllocator {
class FftScratchAllocator : public se::ScratchAllocator {
public:
FftScratchAllocator(int device_ordinal,
DeviceMemoryAllocator* memory_allocator);

~FftScratchAllocator() override;

int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override;
int64 GetMemoryLimitInBytes(se::Stream* stream) override;

int64 TotalAllocatedBytes() { return total_allocated_bytes_; }

perftools::gputools::port::StatusOr<perftools::gputools::DeviceMemory<uint8>>
AllocateBytes(perftools::gputools::Stream* stream, int64 byte_size) override;
se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
se::Stream* stream, int64 byte_size) override;

private:
const int device_ordinal_;
DeviceMemoryAllocator* memory_allocator_;
std::vector<perftools::gputools::DeviceMemoryBase> allocated_buffers_;
std::vector<se::DeviceMemoryBase> allocated_buffers_;
int64 total_allocated_bytes_ = 0;
};

@ -74,16 +74,15 @@ class FftThunk : public Thunk {

// Does the FFT for the thunk on "stream".
tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;

private:
const perftools::gputools::fft::Type fft_type_;
const se::fft::Type fft_type_;
const std::vector<int64> fft_length_;

float scale_factor_;

std::unique_ptr<perftools::gputools::fft::Plan> fft_plan_;
std::unique_ptr<se::fft::Plan> fft_plan_;

const BufferAllocation::Slice input_buffer_;
const BufferAllocation::Slice output_buffer_;
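The hunks above also rewrite nested qualifiers such as perftools::gputools::dnn::..., ...::fft::..., and ...::port::StatusOr to se::dnn::..., se::fft::..., and se::port::.... A namespace alias covers nested namespaces automatically, which is what makes that rewrite purely mechanical. A small self-contained illustration with made-up stand-in types:

// Made-up stand-ins for nested StreamExecutor namespaces.
namespace stream_executor {
namespace dnn {
struct AlgorithmConfig {};
}  // namespace dnn
namespace fft {
enum class Type { kC2CForward };
}  // namespace fft
}  // namespace stream_executor

namespace xla {
namespace se = ::stream_executor;

// Nested namespaces resolve through the alias, so se::dnn:: and se::fft::
// behave exactly like the fully qualified spellings.
se::dnn::AlgorithmConfig config;
se::fft::Type type = se::fft::Type::kC2CForward;
}  // namespace xla

int main() { return 0; }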
@ -36,8 +36,7 @@ tensorflow::Status ForThunk::Initialize(const GpuExecutable& executable) {
}

tensorflow::Status ForThunk::ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) {
const BufferAllocations& buffer_allocations, se::Stream* stream) {
for (int64 i = 0; i < loop_limit_; ++i) {
// Invoke loop body thunk sequence.
TF_RETURN_IF_ERROR(

@ -38,8 +38,7 @@ class ForThunk : public Thunk {

tensorflow::Status Initialize(const GpuExecutable& executable) override;
tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;

private:
const int64 loop_limit_;

@ -22,8 +22,6 @@ limitations under the License.
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/types.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {

@ -50,14 +50,12 @@ class GemmThunk : public Thunk {

// Does the gemm operation for the thunk on "stream", which must be non-null.
tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;

// Returns true if we'll perform autotuning if run on the given stream. If
// so, we want the GPU to be quiescent during autotuning, so as not to
// introduce noise in our results.
bool ShouldHaltAllActivityBeforeRunning(
perftools::gputools::Stream* stream) override {
bool ShouldHaltAllActivityBeforeRunning(se::Stream* stream) override {
return autotune_results_.count(
stream->parent()->GetDeviceDescription().name()) != 0;
}

@ -79,8 +77,7 @@ class GemmThunk : public Thunk {
// results. The map's value is the best algorithm we've found for this thunk
// on this device, or an error if none of the algorithms worked and we should
// use the regular gemm without an algorithm.
std::unordered_map<string,
StatusOr<::perftools::gputools::blas::AlgorithmType>>
std::unordered_map<string, StatusOr<se::blas::AlgorithmType>>
autotune_results_;
};

@ -91,8 +91,6 @@ limitations under the License.
#include "tensorflow/core/platform/tracing.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {

@ -779,9 +777,9 @@ se::Platform::Id GpuCompiler::PlatformId() const {
} // namespace xla

static bool InitModule() {
xla::Compiler::RegisterCompilerFactory(se::cuda::kCudaPlatformId, []() {
return xla::MakeUnique<xla::gpu::GpuCompiler>();
});
xla::Compiler::RegisterCompilerFactory(
stream_executor::cuda::kCudaPlatformId,
[]() { return xla::MakeUnique<xla::gpu::GpuCompiler>(); });
return true;
}
static bool module_initialized = InitModule();
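Note how the InitModule() registrations above switch from se::cuda::kCudaPlatformId to the fully qualified stream_executor::cuda::kCudaPlatformId: these functions sit at file scope, outside namespace xla, so once the file-local alias is gone the short spelling is not visible there. A small self-contained sketch of that visibility rule (identifiers are illustrative, not the real TensorFlow constants):

// Stand-in for the platform-id constant.
namespace stream_executor {
namespace cuda {
constexpr int kCudaPlatformId = 42;
}  // namespace cuda
}  // namespace stream_executor

namespace xla {
namespace se = ::stream_executor;

// Inside namespace xla the alias is in scope, so the short form compiles.
constexpr int FromInsideXla() { return se::cuda::kCudaPlatformId; }
}  // namespace xla

// At file scope the alias is not visible; the full name must be spelled out,
// just as the updated InitModule() bodies do.
static int FromFileScope() { return stream_executor::cuda::kCudaPlatformId; }

int main() { return xla::FromInsideXla() == FromFileScope() ? 0 : 1; }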
@ -45,25 +45,23 @@ class GpuCompiler : public LLVMCompiler {
// Bring in
// StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
// std::vector<std::unique_ptr<HloModule>> modules,
// std::vector<std::vector<perftools::gputools::StreamExecutor*>>
// std::vector<std::vector<se::StreamExecutor*>>
// stream_execs)
using LLVMCompiler::Compile;

StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* stream_exec,
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) override;

StatusOr<std::unique_ptr<Executable>> RunBackend(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* stream_exec,
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) override;

StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> module,
AotCompilationOptions const& options) override;

perftools::gputools::Platform::Id PlatformId() const override;
se::Platform::Id PlatformId() const override;

HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override {
// Capture just the pointer size, not the entire GpuCompiler object.

@ -34,8 +34,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {
namespace {

@ -324,7 +322,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
this->assignment_->GetUniqueSlice(src_hlo, sources[0]->index()));
CHECK(!slice.allocation()->is_entry_computation_parameter());

perftools::gputools::DeviceMemoryBase src_base =
se::DeviceMemoryBase src_base =
buffer_allocations->GetDeviceAddress(slice.index());
CHECK(!src_base.is_null() || src_base.size() == 0);
*device_memory = src_base;

@ -33,8 +33,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace se = ::perftools::gputools;

namespace xla {

// TODO(b/30467474) Once GPU infeed implementation settles, consider

@ -153,8 +151,8 @@ static std::unique_ptr<xla::TransferManager> CreateGpuTransferManager() {
}

static bool InitModule() {
xla::TransferManager::RegisterTransferManager(se::cuda::kCudaPlatformId,
&CreateGpuTransferManager);
xla::TransferManager::RegisterTransferManager(
stream_executor::cuda::kCudaPlatformId, &CreateGpuTransferManager);
return true;
}
static bool module_initialized = InitModule();

@ -36,21 +36,20 @@ class GpuTransferManager : public GenericTransferManager {
GpuTransferManager();
~GpuTransferManager() override {}

Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
Status TransferLiteralToInfeed(se::StreamExecutor* executor,
const Literal& literal) override;
Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
int64 size, const void* source) override;
Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size,
const void* source) override;

private:
// Initiates the infeed data transfers. InfeedBuffer->Done() must be
// called to clean up the memory allocated for InfeedBuffer.
StatusOr<gpu::InfeedBuffer*> TransferBufferToInfeedInternal(
perftools::gputools::StreamExecutor* executor, int64 size,
const void* source);
se::StreamExecutor* executor, int64 size, const void* source);

// Enqueues infeed data buffers with the infeed manager after their
// transfer completes.
Status EnqueueBuffersToInfeed(perftools::gputools::StreamExecutor* executor,
Status EnqueueBuffersToInfeed(se::StreamExecutor* executor,
std::vector<gpu::InfeedBuffer*> buffers);

TF_DISALLOW_COPY_AND_ASSIGN(GpuTransferManager);
@ -19,8 +19,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/core/platform/logging.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {

@ -46,7 +46,7 @@ namespace gpu {
// the client. The client manages the memory of the buffer.
class InfeedBuffer {
public:
InfeedBuffer(perftools::gputools::StreamExecutor* executor, int64 length)
InfeedBuffer(se::StreamExecutor* executor, int64 length)
: executor_(executor), length_(length) {
device_memory_ = executor_->AllocateArray<uint8>(length);
CHECK(!device_memory_.is_null());

@ -60,14 +60,12 @@ class InfeedBuffer {
// client to manage memory for the infeed buffers.
void Done() { delete this; }

perftools::gputools::DeviceMemoryBase* device_memory() {
return &device_memory_;
}
se::DeviceMemoryBase* device_memory() { return &device_memory_; }

private:
perftools::gputools::StreamExecutor* executor_; // Not owned.
se::StreamExecutor* executor_; // Not owned.
const int64 length_;
perftools::gputools::DeviceMemoryBase device_memory_;
se::DeviceMemoryBase device_memory_;
};

// Client-side class used to enqueue infeed buffers.

@ -100,8 +98,7 @@ class InfeedManager {
// new stream on the first invocation. On subsequent invocations, if
// the cached executor is not the same as the requested executor,
// returns null.
perftools::gputools::Stream* GetStream(
perftools::gputools::StreamExecutor* executor);
se::Stream* GetStream(se::StreamExecutor* executor);

private:
// TODO(b/30467474): Revisit if this mutex becomes a point of

@ -121,10 +118,10 @@ class InfeedManager {
tensorflow::gtl::FlatSet<const InfeedBuffer*> dequeued_buffer_;

// Cached host to device stream for queuing infeed data.
std::unique_ptr<perftools::gputools::Stream> host_to_device_stream_;
std::unique_ptr<se::Stream> host_to_device_stream_;

// Executor that the host_to_device_stream belongs to. Not owned.
perftools::gputools::StreamExecutor* host_to_device_executor_;
se::StreamExecutor* host_to_device_executor_;
};

// Singleton creator-or-accessor: Returns the GPU infeed manager.

@ -31,10 +31,10 @@ InfeedThunk::InfeedThunk(
destination_buffer_(destination_buffer) {}

Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) {
se::Stream* stream) {
VLOG(2) << "Infeeding to GPU ";

perftools::gputools::DeviceMemoryBase destination_address =
se::DeviceMemoryBase destination_address =
buffer_allocations.GetDeviceAddress(destination_buffer_);

InfeedManager* infeed_manager = GetOrCreateInfeedManager();

@ -45,7 +45,7 @@ Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
std::vector<void*> tuple_element_addresses;
for (BufferAllocation::Slice tuple_element_buffer :
tuple_element_buffers_) {
perftools::gputools::DeviceMemoryBase tuple_element_address =
se::DeviceMemoryBase tuple_element_address =
buffer_allocations.GetDeviceAddress(tuple_element_buffer);

InfeedBuffer* buffer = infeed_manager->BlockingDequeueBuffer();

@ -44,7 +44,7 @@ class InfeedThunk : public Thunk {
InfeedThunk& operator=(const InfeedThunk&) = delete;

Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;

private:
const std::vector<BufferAllocation::Slice> tuple_element_buffers_;
@ -32,7 +32,7 @@ class IrEmitterContext {
public:
IrEmitterContext(const HloModule* hlo_module,
const BufferAssignment* buffer_assignment,
const perftools::gputools::DeviceDescription* device_desc,
const se::DeviceDescription* device_desc,
llvm::Module* llvm_module)
: hlo_module_(hlo_module),
buffer_assignment_(buffer_assignment),

@ -47,7 +47,7 @@ class IrEmitterContext {
const BufferAssignment& buffer_assignment() const {
return *buffer_assignment_;
}
const perftools::gputools::DeviceDescription& device_description() const {
const se::DeviceDescription& device_description() const {
return *device_desc_;
}
llvm::Module* llvm_module() { return llvm_module_; }

@ -56,7 +56,7 @@ class IrEmitterContext {
private:
const HloModule* hlo_module_;
const BufferAssignment* buffer_assignment_;
const perftools::gputools::DeviceDescription* device_desc_;
const se::DeviceDescription* device_desc_;
llvm::Module* llvm_module_;
NameUniquer name_uniquer_;
};

@ -23,8 +23,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {

@ -61,8 +61,7 @@ class KernelThunk : public Thunk {

// Executes the kernel for the thunk on "stream", which must be non-null.
tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;

private:
// Buffers passed to the kernel as arguments.

@ -82,13 +81,11 @@ class KernelThunk : public Thunk {
// Describes how to load this kernel. ExecuteOnStream reuses this loader
// specification for all executions.
mutable tensorflow::mutex mutex_;
std::unique_ptr<perftools::gputools::MultiKernelLoaderSpec> loader_spec_
GUARDED_BY(mutex_);
std::unique_ptr<se::MultiKernelLoaderSpec> loader_spec_ GUARDED_BY(mutex_);

// Loaded kernels for each `StreamExecutor`
std::unordered_map<perftools::gputools::StreamExecutor*,
perftools::gputools::KernelBase>
kernel_cache_ GUARDED_BY(mutex_);
std::unordered_map<se::StreamExecutor*, se::KernelBase> kernel_cache_
GUARDED_BY(mutex_);
};

} // namespace gpu

@ -19,8 +19,6 @@ limitations under the License.
namespace xla {
namespace gpu {

namespace se = ::perftools::gputools;

Status MemzeroThunk::ExecuteOnStream(
const BufferAllocations& buffer_allocations, se::Stream* stream) {
se::DeviceMemoryBase dest_data = buffer_allocations.GetDeviceAddress(dest_);

@ -36,7 +36,7 @@ class MemzeroThunk : public Thunk {
: Thunk(Kind::kMemzero, hlo), dest_(dest) {}

Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;

private:
const BufferAllocation::Slice dest_;

@ -52,7 +52,7 @@ class Memset32BitValueThunk : public Thunk {
: Thunk(Kind::kMemset32BitValue, hlo), value_(value), dest_(dest) {}

Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;

private:
uint32 value_;
@ -29,8 +29,6 @@ limitations under the License.
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/logging.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {

@ -57,8 +57,7 @@ std::ostream& operator<<(std::ostream& out,
const LaunchDimensions& launch_dims);

LaunchDimensions CalculateLaunchDimensions(
const Shape& shape,
const perftools::gputools::DeviceDescription& device_desc,
const Shape& shape, const se::DeviceDescription& device_desc,
int unroll_factor = 1);

} // namespace gpu

@ -33,8 +33,7 @@ tensorflow::Status SequentialThunk::Initialize(
}

tensorflow::Status SequentialThunk::ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) {
const BufferAllocations& buffer_allocations, se::Stream* stream) {
for (const auto& thunk : thunks_) {
TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream));
}

@ -40,8 +40,7 @@ class SequentialThunk : public Thunk {

tensorflow::Status Initialize(const GpuExecutable& executable) override;
tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;

private:
// The list of sub-thunks.

@ -85,8 +85,7 @@ class Thunk {
// This value is not required to be constant for a given Thunk. For example,
// a Thunk that performs autotuning may return true for its first run and
// false thereafter.
virtual bool ShouldHaltAllActivityBeforeRunning(
perftools::gputools::Stream* /*stream*/) {
virtual bool ShouldHaltAllActivityBeforeRunning(se::Stream* /*stream*/) {
return false;
}

@ -104,8 +103,7 @@ class Thunk {
// called after Initialize and can be called multiple times over Thunk's
// lifetime. Stream argument must be non-null.
virtual tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) = 0;
const BufferAllocations& buffer_allocations, se::Stream* stream) = 0;

private:
Kind kind_;

@ -17,8 +17,6 @@ limitations under the License.

#include "tensorflow/compiler/xla/util.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {

@ -46,8 +46,7 @@ class TupleThunk : public Thunk {
TupleThunk& operator=(const TupleThunk&) = delete;

tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;

private:
const std::vector<BufferAllocation::Slice> tuple_element_buffers_;

@ -41,8 +41,8 @@ Status WhileThunk::Initialize(const GpuExecutable& executable) {
}

Status WhileThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) {
perftools::gputools::DeviceMemoryBase condition_result_data =
se::Stream* stream) {
se::DeviceMemoryBase condition_result_data =
buffer_allocations.GetDeviceAddress(condition_result_buffer_index_);

while (true) {

@ -47,7 +47,7 @@ class WhileThunk : public Thunk {

Status Initialize(const GpuExecutable& executable) override;
Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;

private:
const BufferAllocation::Slice condition_result_buffer_index_;
@ -88,7 +88,7 @@ std::unique_ptr<HloProfilePrinterData> CreateHloProfilePrinterData(
// down how much time each HLO took.
class HloExecutionProfile {
public:
using DeviceDescription = perftools::gputools::DeviceDescription;
using DeviceDescription = se::DeviceDescription;

HloExecutionProfile(const HloProfilePrinterData* hlo_profile_printer_data,
const HloProfileIndexMap* hlo_profile_index_map);

@ -30,8 +30,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"

namespace se = ::perftools::gputools;

namespace xla {

/*static*/ StatusOr<std::unique_ptr<HloModule>>

@ -80,7 +80,7 @@ class HloRunner {
bool run_hlo_passes = false;
};

explicit HloRunner(::perftools::gputools::Platform* platform);
explicit HloRunner(se::Platform* platform);

~HloRunner();

@ -149,8 +149,7 @@ class HloRunner {
// will be used to configure the replication parameters. Replicated executions
// should pass the device_assignment parameter.
ServiceExecutableRunOptions GetServiceRunOptionsForDevice(
int64 device, ::perftools::gputools::Stream* stream,
DeviceAssignment* device_assignment);
int64 device, se::Stream* stream, DeviceAssignment* device_assignment);

std::unique_ptr<Backend> backend_;
};

@ -41,9 +41,6 @@ limitations under the License.
namespace xla {
namespace interpreter {

namespace se = ::perftools::gputools;
namespace sep = ::perftools::gputools::interpreter;

Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {
HloPassPipeline pipeline("Interpreter");

@ -96,7 +93,7 @@ InterpreterCompiler::CompileAheadOfTime(
}

se::Platform::Id InterpreterCompiler::PlatformId() const {
return sep::kXlaInterpreterPlatformId;
return se::interpreter::kXlaInterpreterPlatformId;
}

HloCostAnalysis::ShapeSizeFunction InterpreterCompiler::ShapeSizeBytesFunction()

@ -109,11 +106,12 @@ static std::unique_ptr<xla::ComputationPlacer> CreateComputationPlacer() {
}

static bool InitModule() {
xla::Compiler::RegisterCompilerFactory(sep::kXlaInterpreterPlatformId, []() {
return xla::MakeUnique<xla::interpreter::InterpreterCompiler>();
});
xla::Compiler::RegisterCompilerFactory(
se::interpreter::kXlaInterpreterPlatformId, []() {
return xla::MakeUnique<xla::interpreter::InterpreterCompiler>();
});
xla::ComputationPlacer::RegisterComputationPlacer(
sep::kXlaInterpreterPlatformId, &CreateComputationPlacer);
se::interpreter::kXlaInterpreterPlatformId, &CreateComputationPlacer);
return true;
}

@ -44,19 +44,16 @@ class InterpreterCompiler : public Compiler {
~InterpreterCompiler() override {}

StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
std::unique_ptr<HloModule> hlo_module,
perftools::gputools::StreamExecutor* stream_exec,
std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) override;

StatusOr<std::unique_ptr<Executable>> RunBackend(
std::unique_ptr<HloModule> hlo_module,
perftools::gputools::StreamExecutor* stream_exec,
std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) override;

StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
std::vector<std::unique_ptr<HloModule>> hlo_modules,
std::vector<std::vector<perftools::gputools::StreamExecutor*>>
stream_exec,
std::vector<std::vector<se::StreamExecutor*>> stream_exec,
DeviceMemoryAllocator* device_allocator) override;

StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>

@ -65,7 +62,7 @@ class InterpreterCompiler : public Compiler {

HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override;

perftools::gputools::Platform::Id PlatformId() const override;
se::Platform::Id PlatformId() const override;

private:
Status RunHloOptimization(HloModule* hlo_module);
@ -38,8 +38,6 @@ limitations under the License.
namespace xla {
namespace interpreter {

namespace se = ::perftools::gputools;

InterpreterExecutable::InterpreterExecutable(
std::unique_ptr<const HloModule> hlo_module)
: Executable(std::move(hlo_module), /*hlo_profile_printer=*/nullptr,

@ -19,8 +19,7 @@ limitations under the License.

#include "tensorflow/compiler/xla/status_macros.h"

namespace perftools {
namespace gputools {
namespace stream_executor {
namespace interpreter {

host::HostStream *AsExecutorStream(Stream *stream) {

@ -119,5 +118,4 @@ DeviceDescription *XlaInterpreterExecutor::PopulateDeviceDescription() const {
}

} // namespace interpreter
} // namespace gputools
} // namespace perftools
} // namespace stream_executor
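The interpreter files above are the other half of the migration: code that used to live in the nested perftools { gputools { namespaces now lives directly in stream_executor. Presumably (an assumption, not shown in this diff) a compatibility alias keeps the old spelling compiling while callers migrate; the usual shape of such an alias, sketched with a made-up type:

// New home of the library.
namespace stream_executor {
class Stream {};
}  // namespace stream_executor

// Assumed backwards-compatibility shim: the legacy nested namespace simply
// forwards to the new one, so both spellings name the same type.
namespace perftools {
namespace gputools = ::stream_executor;
}  // namespace perftools

int main() {
  perftools::gputools::Stream legacy_spelling;
  stream_executor::Stream new_spelling;
  (void)legacy_spelling;
  (void)new_spelling;
  return 0;
}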
@ -44,8 +44,7 @@ limitations under the License.
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/timer.h"

namespace perftools {
namespace gputools {
namespace stream_executor {
namespace interpreter {

using Args = tensorflow::gtl::ArraySlice<DeviceMemoryBase>;

@ -213,7 +212,6 @@ class XlaInterpreterExecutor : public internal::StreamExecutorInterface {
};

} // namespace interpreter
} // namespace gputools
} // namespace perftools
} // namespace stream_executor

#endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTOR_H_

@ -21,12 +21,10 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/interpreter/platform_id.h"
#include "tensorflow/compiler/xla/service/transfer_manager.h"

namespace sei = ::perftools::gputools::interpreter;

namespace xla {

InterpreterTransferManager::InterpreterTransferManager()
: GenericTransferManager(sei::kXlaInterpreterPlatformId,
: GenericTransferManager(se::interpreter::kXlaInterpreterPlatformId,
/*pointer_size=*/sizeof(void*)) {}

} // namespace xla

@ -38,7 +36,8 @@ CreateInterpreterTransferManager() {

static bool InitModule() {
xla::TransferManager::RegisterTransferManager(
sei::kXlaInterpreterPlatformId, &CreateInterpreterTransferManager);
stream_executor::interpreter::kXlaInterpreterPlatformId,
&CreateInterpreterTransferManager);
return true;
}

@ -28,11 +28,7 @@ limitations under the License.
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/platform.h"

namespace se = ::perftools::gputools;
namespace sep = ::perftools::gputools::interpreter;

namespace perftools {
namespace gputools {
namespace stream_executor {
namespace interpreter {

XlaInterpreterPlatform::XlaInterpreterPlatform() : name_("Interpreter") {}

@ -99,16 +95,16 @@ void XlaInterpreterPlatform::UnregisterTraceListener(TraceListener* listener) {
}

static void InitializeXlaInterpreterPlatform() {
std::unique_ptr<se::Platform> platform(new sep::XlaInterpreterPlatform);
SE_CHECK_OK(se::MultiPlatformManager::RegisterPlatform(std::move(platform)));
std::unique_ptr<Platform> platform(new XlaInterpreterPlatform);
SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
}

} // namespace interpreter
} // namespace gputools
} // namespace perftools
} // namespace stream_executor

REGISTER_MODULE_INITIALIZER(interpreter_platform,
sep::InitializeXlaInterpreterPlatform());
REGISTER_MODULE_INITIALIZER(
interpreter_platform,
stream_executor::interpreter::InitializeXlaInterpreterPlatform());

DECLARE_MODULE_INITIALIZER(multi_platform_manager);

@ -23,8 +23,7 @@ limitations under the License.
#include "tensorflow/stream_executor/stream_executor.h"
#include "tensorflow/stream_executor/trace_listener.h"

namespace perftools {
namespace gputools {
namespace stream_executor {
namespace interpreter {

class XlaInterpreterPlatform : public Platform {

@ -64,7 +63,6 @@ class XlaInterpreterPlatform : public Platform {
};

} // namespace interpreter
} // namespace gputools
} // namespace perftools
} // namespace stream_executor

#endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_PLATFORM_H_
@ -14,12 +14,10 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/service/interpreter/platform_id.h"

namespace perftools {
namespace gputools {
namespace stream_executor {
namespace interpreter {

PLATFORM_DEFINE_ID(kXlaInterpreterPlatformId);

} // namespace interpreter
} // namespace gputools
} // namespace perftools
} // namespace stream_executor

@ -18,14 +18,12 @@ limitations under the License.

#include "tensorflow/stream_executor/platform.h"

namespace perftools {
namespace gputools {
namespace stream_executor {
namespace interpreter {

extern const Platform::Id kXlaInterpreterPlatformId;

} // namespace interpreter
} // namespace gputools
} // namespace perftools
} // namespace stream_executor

#endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_PLATFORM_ID_H_

@ -23,7 +23,7 @@ limitations under the License.
namespace xla {
StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile(
std::vector<std::unique_ptr<HloModule>> modules,
std::vector<std::vector<perftools::gputools::StreamExecutor*>> stream_execs,
std::vector<std::vector<se::StreamExecutor*>> stream_execs,
DeviceMemoryAllocator* device_allocator) {
// Tensorflow tries to enable the following behaviors in all its threads:
//

@ -60,19 +60,18 @@ class LLVMCompiler : public Compiler {
// Bring in
// StatusOr<std::unique_ptr<Executable>> RunBackend(
// std::unique_ptr<HloModule> module,
// perftools::gputools::StreamExecutor* stream_exec,
// se::StreamExecutor* stream_exec,
// DeviceMemoryAllocator* device_allocator)
// StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
// std::unique_ptr<HloModule> module,
// perftools::gputools::StreamExecutor* stream_exec,
// se::StreamExecutor* stream_exec,
// DeviceMemoryAllocator* device_allocator)
using Compiler::RunBackend;
using Compiler::RunHloPasses;

StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
std::vector<std::unique_ptr<HloModule>> modules,
std::vector<std::vector<perftools::gputools::StreamExecutor*>>
stream_execs,
std::vector<std::vector<se::StreamExecutor*>> stream_execs,
DeviceMemoryAllocator* device_allocator) override;

protected:

@ -43,13 +43,11 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace se = ::perftools::gputools;

namespace xla {

/* static */ StatusOr<std::unique_ptr<LocalService>> LocalService::NewService(
const ServiceOptions& options) {
perftools::gputools::Platform* platform = options.platform();
se::Platform* platform = options.platform();
if (platform == nullptr) {
TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
}

@ -29,8 +29,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace se = ::perftools::gputools;

namespace xla {

using tensorflow::str_util::Lowercase;
@ -34,29 +34,27 @@ class PlatformUtil {
//
// Note that, even if a platform is present with zero devices, if we *do* have
// compilation support for it, it will be returned in this sequence.
static StatusOr<std::vector<perftools::gputools::Platform*>>
GetSupportedPlatforms();
static StatusOr<std::vector<se::Platform*>> GetSupportedPlatforms();

// Convenience function which returns the default supported platform for
// tests. If exactly one supported platform is present, then this platform is
// the default platform. If exactly two platforms are present and one of them
// is the interpreter platform, then the other platform is the default
// platform. Otherwise returns an error.
static StatusOr<perftools::gputools::Platform*> GetDefaultPlatform();
static StatusOr<se::Platform*> GetDefaultPlatform();

// Convenience function which returns the sole supported platform. If
// exactly one supported platform is present, then this platform is the
// default platform. Otherwise returns an error.
static StatusOr<perftools::gputools::Platform*> GetSolePlatform();
static StatusOr<se::Platform*> GetSolePlatform();

// Returns the platform according to the given name. Returns error if there is
// no such platform.
static StatusOr<perftools::gputools::Platform*> GetPlatform(
const string& platform_name);
static StatusOr<se::Platform*> GetPlatform(const string& platform_name);

// Returns exactly one platform that does not have given name. Returns error
// if there is no such platform, or there are multiple such platforms.
static StatusOr<perftools::gputools::Platform*> GetPlatformExceptFor(
static StatusOr<se::Platform*> GetPlatformExceptFor(
const string& platform_name);

// Returns a vector of StreamExecutors for the given platform. The vector is

@ -64,8 +62,8 @@ class PlatformUtil {
// element is nullptr, then the device is present by not supported by XLA.
//
// If the platform has no visible devices, a not-found error is returned.
static StatusOr<std::vector<perftools::gputools::StreamExecutor*>>
GetStreamExecutors(perftools::gputools::Platform* platform);
static StatusOr<std::vector<se::StreamExecutor*>> GetStreamExecutors(
se::Platform* platform);

private:
TF_DISALLOW_COPY_AND_ASSIGN(PlatformUtil);
@ -54,8 +54,6 @@ limitations under the License.
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/types.h"

namespace se = ::perftools::gputools;

using ::tensorflow::strings::Printf;
using ::tensorflow::strings::StrCat;
using ::xla::source_map_util::InvalidParameterArgument;

@ -95,15 +93,12 @@ tensorflow::Status RecordResult(const ShapedBuffer& result,

} // namespace

ServiceOptions& ServiceOptions::set_platform(
perftools::gputools::Platform* platform) {
ServiceOptions& ServiceOptions::set_platform(se::Platform* platform) {
platform_ = platform;
return *this;
}

perftools::gputools::Platform* ServiceOptions::platform() const {
return platform_;
}
se::Platform* ServiceOptions::platform() const { return platform_; }

ServiceOptions& ServiceOptions::set_number_of_replicas(int number_of_replicas) {
number_of_replicas_ = number_of_replicas;

@ -123,7 +118,7 @@ int ServiceOptions::intra_op_parallelism_threads() const {
}

/* static */ StatusOr<std::unique_ptr<Service>> Service::NewService(
perftools::gputools::Platform* platform) {
se::Platform* platform) {
ServiceOptions default_options;
default_options.set_platform(platform);
return NewService(default_options);

@ -131,7 +126,7 @@ int ServiceOptions::intra_op_parallelism_threads() const {

/* static */ StatusOr<std::unique_ptr<Service>> Service::NewService(
const ServiceOptions& options) {
perftools::gputools::Platform* platform = options.platform();
se::Platform* platform = options.platform();
std::unique_ptr<Backend> execute_backend;
if (platform == nullptr) {
TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());

@ -235,8 +230,7 @@ tensorflow::Status Service::ValidateResultShapeWithLayout(
StatusOr<std::vector<std::vector<const ShapedBuffer*>>>
Service::ResolveAndValidateArguments(
tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
stream_executors) {
tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors) {
CHECK_EQ(options_.number_of_replicas(), stream_executors.size());
std::vector<std::vector<const ShapedBuffer*>> replicated_arguments;
replicated_arguments.resize(options_.number_of_replicas());

@ -349,8 +343,7 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
std::vector<VersionedComputationHandle> versioned_handles,
std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
Backend* backend,
std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
DeviceMemoryAllocator* device_allocator) {
VLOG(1) << Printf("BuildExecutable on service %p", this);

@ -412,8 +405,7 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
const std::vector<const HloModuleProto*>& module_protos,
std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
Backend* backend,
std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
DeviceMemoryAllocator* device_allocator) {
VLOG(1) << Printf("BuildExecutable on service %p", this);

@ -493,7 +485,7 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
const VersionedComputationHandle& versioned_handle,
std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile,
se::StreamExecutor* executor, ExecutionProfile* profile,
DeviceMemoryAllocator* device_allocator) {
std::shared_ptr<Executable> executable =
compilation_cache_.LookUp(versioned_handle, *module_config);

@ -541,7 +533,7 @@ Service::ExecuteParallelAndRegisterResult(
// Streams where the computation are launched, so we can wait on the streams
// to complete.
std::vector<Pool<se::Stream>::SmartPtr> streams;
std::vector<std::unique_ptr<perftools::gputools::Timer>> timers;
std::vector<std::unique_ptr<se::Timer>> timers;

// Global data handles for the computation results, one for each computation.
std::vector<GlobalDataHandle> result_handles;
|
||||
@ -565,8 +557,7 @@ Service::ExecuteParallelAndRegisterResult(
|
||||
streams.push_back(std::move(stream));
|
||||
|
||||
if (replica == 0 && profile != nullptr) {
|
||||
timers.emplace_back(
|
||||
new perftools::gputools::Timer(streams.back()->parent()));
|
||||
timers.emplace_back(new se::Timer(streams.back()->parent()));
|
||||
streams.back()
|
||||
->InitTimer(timers.back().get())
|
||||
.ThenStartTimer(timers.back().get());
|
||||
@ -734,9 +725,9 @@ tensorflow::Status Service::SetReturnValue(const SetReturnValueRequest* arg,
|
||||
return computation->SetReturnValue(arg->operand());
|
||||
}
|
||||
|
||||
StatusOr<std::vector<perftools::gputools::StreamExecutor*>>
|
||||
Service::GetExecutors(const ExecutionOptions& execution_options,
|
||||
int64 requests_size, int64 request_index) const {
|
||||
StatusOr<std::vector<se::StreamExecutor*>> Service::GetExecutors(
|
||||
const ExecutionOptions& execution_options, int64 requests_size,
|
||||
int64 request_index) const {
|
||||
if (execution_options.device_handles().empty()) {
|
||||
return FailedPrecondition(
|
||||
"device handles must be given to execute parallel computations");
|
||||
@ -748,7 +739,7 @@ Service::GetExecutors(const ExecutionOptions& execution_options,
|
||||
"handles.",
|
||||
requests_size, request_index, execution_options.device_handles_size());
|
||||
}
|
||||
std::vector<perftools::gputools::StreamExecutor*> executors;
|
||||
std::vector<se::StreamExecutor*> executors;
|
||||
for (const auto& device_handle : execution_options.device_handles()) {
|
||||
TF_ASSIGN_OR_RETURN(auto replicas,
|
||||
Replicas(*execute_backend_, device_handle));
|
||||
@ -780,7 +771,7 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
|
||||
VLOG(1) << "running execute-parallel request: " << arg->ShortDebugString();
|
||||
|
||||
std::vector<std::vector<std::vector<const ShapedBuffer*>>> all_arguments;
|
||||
std::vector<std::vector<perftools::gputools::StreamExecutor*>> all_executors;
|
||||
std::vector<std::vector<se::StreamExecutor*>> all_executors;
|
||||
std::vector<VersionedComputationHandle> versioned_handles;
|
||||
std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
|
||||
std::vector<string> computation_names;
|
||||
@ -891,7 +882,7 @@ tensorflow::Status Service::ExecuteGraphParallel(
|
||||
VLOG(1) << "running execute-graph-parallel request";
|
||||
|
||||
std::vector<std::vector<std::vector<const ShapedBuffer*>>> all_arguments;
|
||||
std::vector<std::vector<perftools::gputools::StreamExecutor*>> all_executors;
|
||||
std::vector<std::vector<se::StreamExecutor*>> all_executors;
|
||||
std::vector<const HloModuleProto*> module_protos;
|
||||
std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
|
||||
std::vector<string> computation_names;
|
||||
@ -1953,9 +1944,9 @@ DeviceHandle Service::SingleComputationDeviceHandle() const {
|
||||
return device_handle;
|
||||
}
|
||||
|
||||
StatusOr<std::vector<perftools::gputools::StreamExecutor*>> Service::Replicas(
|
||||
StatusOr<std::vector<se::StreamExecutor*>> Service::Replicas(
|
||||
const Backend& backend, const DeviceHandle& device_handle) const {
|
||||
std::vector<perftools::gputools::StreamExecutor*> replicas;
|
||||
std::vector<se::StreamExecutor*> replicas;
|
||||
for (int replica = 0; replica < options_.number_of_replicas(); ++replica) {
|
||||
// From the computation placer, find out the device ids of the replicas for
|
||||
// the given device handle.
|
||||
|
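Editorial note: every Service hunk above is the same mechanical substitution, with fully spelled-out perftools::gputools types becoming se:: types. Below is a minimal sketch of the kind of alias these call sites now rely on; the forward declarations and the idea that the alias sits in a single shared XLA header are assumptions of this sketch, not lines from the commit.

// Hypothetical shared header; the exact file that declares the alias is not
// shown in this excerpt.
namespace stream_executor {
class Platform;
class Stream;
class StreamExecutor;
class Timer;
}  // namespace stream_executor

namespace xla {
// One alias, visible throughout namespace xla, so declarations can be written
// as se::StreamExecutor*, se::Platform*, se::Stream, se::Timer, and so on.
namespace se = ::stream_executor;
}  // namespace xla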
@ -53,8 +53,8 @@ namespace xla {
class ServiceOptions {
public:
// Set the platform backing the service, or nullptr for the default platform.
ServiceOptions& set_platform(perftools::gputools::Platform* platform);
perftools::gputools::Platform* platform() const;
ServiceOptions& set_platform(se::Platform* platform);
se::Platform* platform() const;

// Set the number of replicas to use when compiling replicated
// programs.
@ -66,7 +66,7 @@ class ServiceOptions {
int intra_op_parallelism_threads() const;

private:
perftools::gputools::Platform* platform_ = nullptr;
se::Platform* platform_ = nullptr;
int number_of_replicas_ = 1;
int intra_op_parallelism_threads_ = -1;
};
@ -79,7 +79,7 @@ class Service : public ServiceInterface {
public:
// Factory method for creating a new Service.
static StatusOr<std::unique_ptr<Service>> NewService(
perftools::gputools::Platform* platform = nullptr);
se::Platform* platform = nullptr);
static StatusOr<std::unique_ptr<Service>> NewService(
const ServiceOptions& options);

@ -286,7 +286,7 @@ class Service : public ServiceInterface {
ExecuteResponse* result);

// Prepare the executors for executing parallel.
StatusOr<std::vector<perftools::gputools::StreamExecutor*>> GetExecutors(
StatusOr<std::vector<se::StreamExecutor*>> GetExecutors(
const ExecutionOptions& execution_options, int64 requests_size,
int64 request_index) const;

@ -310,8 +310,7 @@ class Service : public ServiceInterface {
StatusOr<std::vector<std::vector<const ShapedBuffer*>>>
ResolveAndValidateArguments(
tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
stream_executors);
tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors);

// Create a Hlo module config for the given program shape and arguments.
// execution_options is optional; if not given a default is used.
@ -329,7 +328,7 @@ class Service : public ServiceInterface {
StatusOr<std::unique_ptr<Executable>> BuildExecutable(
const VersionedComputationHandle& versioned_handle,
std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
perftools::gputools::StreamExecutor* executor,
se::StreamExecutor* executor,
DeviceMemoryAllocator* device_allocator = nullptr);

// Builds an Executable for the given HLO module proto.
@ -338,7 +337,7 @@ class Service : public ServiceInterface {
StatusOr<std::unique_ptr<Executable>> BuildExecutable(
const HloModuleProto& module_proto,
std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
perftools::gputools::StreamExecutor* executor,
se::StreamExecutor* executor,
DeviceMemoryAllocator* device_allocator = nullptr);

// Same as BuildExecutable() above, but builds a list of Executables for the
@ -346,14 +345,12 @@ class Service : public ServiceInterface {
StatusOr<std::vector<std::unique_ptr<Executable>>> BuildExecutables(
std::vector<VersionedComputationHandle> versioned_handles,
std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
Backend* backend,
std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
DeviceMemoryAllocator* device_allocator);
StatusOr<std::vector<std::unique_ptr<Executable>>> BuildExecutables(
const std::vector<const HloModuleProto*>& module_protos,
std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
Backend* backend,
std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
DeviceMemoryAllocator* device_allocator);

// Similar to BuildExecutable, but look in the compilation cache for the
@ -362,7 +359,7 @@ class Service : public ServiceInterface {
StatusOr<std::shared_ptr<Executable>> BuildAndCacheExecutable(
const VersionedComputationHandle& versioned_handle,
std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile,
se::StreamExecutor* executor, ExecutionProfile* profile,
DeviceMemoryAllocator* device_allocator = nullptr);

// Runs the given executable with the given arguments and register the result
@ -411,7 +408,7 @@ class Service : public ServiceInterface {
// Returns the stream executors assigned to the replicas represented by the
// given device handle. Each device_handle is a virtual replicated device that
// represents a set of physical devices for the replicas.
StatusOr<std::vector<perftools::gputools::StreamExecutor*>> Replicas(
StatusOr<std::vector<se::StreamExecutor*>> Replicas(
const Backend& backend, const DeviceHandle& device_handle) const;

Status MaybeDumpHloModule(const HloModule& module) const;
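Editorial note: for orientation, here is a hedged usage sketch of the public factory declared above, written with the same TF_ASSIGN_OR_RETURN macro that appears in the implementation hunks. The wrapper function, its name, and the include path are scaffolding assumed for this example.

#include <memory>

#include "tensorflow/compiler/xla/service/service.h"  // assumed path for this sketch

namespace xla {
// Builds a Service on the default platform (the nullptr default of NewService).
Status MakeDefaultService(std::unique_ptr<Service>* service) {
  TF_ASSIGN_OR_RETURN(*service, Service::NewService(/*platform=*/nullptr));
  return Status::OK();
}
}  // namespace xla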
@ -28,7 +28,7 @@ namespace xla {
class ServiceExecutableRunOptions {
public:
using StreamBorrower =
std::function<StatusOr<Pool<perftools::gputools::Stream>::SmartPtr>(int)>;
std::function<StatusOr<Pool<se::Stream>::SmartPtr>(int)>;

ServiceExecutableRunOptions()
: ServiceExecutableRunOptions(ExecutableRunOptions()) {}
@ -45,14 +45,13 @@ class ServiceExecutableRunOptions {
ExecutableRunOptions* mutable_run_options() { return &run_options_; }

// Delegate to `ExecutableRunOptions` member.
perftools::gputools::Stream* stream() const { return run_options_.stream(); }
se::Stream* stream() const { return run_options_.stream(); }
DeviceMemoryAllocator* allocator() const { return run_options_.allocator(); }
int device_ordinal() const { return run_options_.device_ordinal(); }

// Borrows a stream and returns a smart pointer which returns the stream on
// destruction.
StatusOr<Pool<perftools::gputools::Stream>::SmartPtr> BorrowStream(
int device_ordinal) const {
StatusOr<Pool<se::Stream>::SmartPtr> BorrowStream(int device_ordinal) const {
return borrow_stream_
? borrow_stream_(device_ordinal)
: Status(tensorflow::error::UNIMPLEMENTED, "No stream cache");
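Editorial note: the StreamBorrower/BorrowStream pair above implements a borrow-a-stream-and-return-it-on-destruction pattern. XLA's Pool<se::Stream>::SmartPtr is not reproduced here; the sketch below shows the same idea using only the standard library, so every name in it is a placeholder for illustration, and a real pool must outlive its borrowers.

#include <functional>
#include <memory>
#include <vector>

// Stand-in for a stream type; in the header above this would be se::Stream.
struct FakeStream {};

// Toy pool: Borrow() hands out a unique_ptr whose deleter puts the object
// back on the free list instead of destroying it.
class StreamPool {
 public:
  using SmartPtr =
      std::unique_ptr<FakeStream, std::function<void(FakeStream*)>>;

  SmartPtr Borrow() {
    FakeStream* s;
    if (free_.empty()) {
      s = new FakeStream();
    } else {
      s = free_.back().release();
      free_.pop_back();
    }
    return SmartPtr(s, [this](FakeStream* done) {
      free_.emplace_back(done);  // returned to the pool, not deleted
    });
  }

 private:
  std::vector<std::unique_ptr<FakeStream>> free_;
};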
@ -28,8 +28,6 @@ limitations under the License.
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/logging.h"

namespace se = ::perftools::gputools;

namespace xla {

using ::tensorflow::strings::Appendf;
@ -146,7 +144,7 @@ ScopedShapedBuffer::~ScopedShapedBuffer() {

std::unique_ptr<ShapedBuffer> ScopedShapedBuffer::release() {
auto shaped_buffer = MakeUnique<ShapedBuffer>(std::move(*this));
buffers_ = ShapeTree<perftools::gputools::DeviceMemoryBase>();
buffers_ = ShapeTree<se::DeviceMemoryBase>();
return shaped_buffer;
}
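Editorial note: this .cc file previously carried its own file-local namespace se = ::perftools::gputools;. With the alias presumably declared once inside namespace xla in a shared header, the per-file declaration becomes redundant and is deleted. A minimal scoping sketch follows, with throwaway names invented for the example.

// Toy names; this only illustrates where a namespace alias is visible.
namespace real { struct Widget {}; }

namespace outer {
namespace nick = ::real;     // alias declared inside 'outer'
nick::Widget inside_outer;   // fine: visible anywhere in 'outer' after this
}  // namespace outer

// nick::Widget outside_outer;  // would not compile: the alias is scoped to
                                // 'outer' and does not leak globally
outer::nick::Widget also_ok;    // but it can still be reached as outer::nick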
@ -41,8 +41,7 @@ class ShapedBuffer {
// determines the number of device allocations (DeviceMemoryBase) held by the
// ShapedBuffer.
ShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape,
const perftools::gputools::Platform* platform,
int device_ordinal);
const se::Platform* platform, int device_ordinal);

// Returns the shape of the on-host representation of the data held by this
// ShapedBuffer.
@ -52,35 +51,29 @@ class ShapedBuffer {
// ShapedBuffer.
const Shape& on_device_shape() const { return on_device_shape_; }

const perftools::gputools::Platform* platform() const { return platform_; }
const se::Platform* platform() const { return platform_; }
int device_ordinal() const { return device_ordinal_; }

// Return the root buffer of the shape (shape index {}).
const perftools::gputools::DeviceMemoryBase& root_buffer() const {
const se::DeviceMemoryBase& root_buffer() const {
return buffer(/*index=*/{});
}

// Returns the buffer at the given shape index where index is defined as in
// ShapeUtil::GetSubshape.
const perftools::gputools::DeviceMemoryBase& buffer(
const ShapeIndex& index) const {
const se::DeviceMemoryBase& buffer(const ShapeIndex& index) const {
return buffers_.element(index);
}

// Sets the device memory buffer at the given index.
void set_buffer(const perftools::gputools::DeviceMemoryBase& buffer,
const ShapeIndex& index) {
void set_buffer(const se::DeviceMemoryBase& buffer, const ShapeIndex& index) {
*buffers_.mutable_element(index) = buffer;
}

// Returns the underlying ShapeTree containing all the device addresses in the
// ShapedBuffer.
const ShapeTree<perftools::gputools::DeviceMemoryBase>& buffers() const {
return buffers_;
}
ShapeTree<perftools::gputools::DeviceMemoryBase>& buffers() {
return buffers_;
}
const ShapeTree<se::DeviceMemoryBase>& buffers() const { return buffers_; }
ShapeTree<se::DeviceMemoryBase>& buffers() { return buffers_; }

// Set all device memory pointers in the object to null.
void clear();
@ -101,13 +94,13 @@ class ShapedBuffer {
Shape on_device_shape_;

// The platform the memory is allocated on.
const perftools::gputools::Platform* platform_;
const se::Platform* platform_;

// The device the memory is allocated on.
int device_ordinal_;

// The tree of device buffers. Its shape is on_device_shape().
ShapeTree<perftools::gputools::DeviceMemoryBase> buffers_;
ShapeTree<se::DeviceMemoryBase> buffers_;
};

std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer);
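Editorial note: since the accessors above change only in spelling, here is a hedged usage sketch built solely from what the header shows (root_buffer, buffer, set_buffer, and the empty shape index {} for the root). The function name and include path are assumptions of this sketch.

#include "tensorflow/compiler/xla/service/shaped_buffer.h"  // assumed path

namespace xla {
// Copies the root device pointer of `src` into the root slot of `dst`.
// This aliases the allocation; it does not copy device memory.
void AliasRootBuffer(const ShapedBuffer& src, ShapedBuffer* dst) {
  const se::DeviceMemoryBase& root = src.root_buffer();  // same as src.buffer({})
  dst->set_buffer(root, /*index=*/{});
}
}  // namespace xla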
@ -25,24 +25,20 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"

namespace se = ::perftools::gputools;

namespace xla {
/* static */ tensorflow::mutex
TransferManager::platform_transfer_manager_mutex_(
tensorflow::LINKER_INITIALIZED);

/* static */ std::map<perftools::gputools::Platform::Id,
TransferManager::State>*
/* static */ std::map<se::Platform::Id, TransferManager::State>*
TransferManager::GetPlatformTransferManagers() {
static auto* r =
new std::map<perftools::gputools::Platform::Id, TransferManager::State>;
static auto* r = new std::map<se::Platform::Id, TransferManager::State>;
return r;
}

Status TransferManager::TransferArrayToDevice(
perftools::gputools::StreamExecutor* executor, const Literal& literal,
const perftools::gputools::DeviceMemoryBase& dest) {
se::StreamExecutor* executor, const Literal& literal,
const se::DeviceMemoryBase& dest) {
const Shape on_device_shape = HostShapeToDeviceShape(literal.shape());
TF_RET_CHECK(ShapeUtil::IsArray(on_device_shape))
<< "On-device representation of "
@ -61,8 +57,8 @@ Status TransferManager::TransferArrayToDevice(
}

StatusOr<std::unique_ptr<Literal>> TransferManager::TransferArrayFromDevice(
perftools::gputools::StreamExecutor* executor, const Shape& shape,
const perftools::gputools::DeviceMemoryBase& source) {
se::StreamExecutor* executor, const Shape& shape,
const se::DeviceMemoryBase& source) {
TF_RET_CHECK(ShapeUtil::Equal(HostShapeToDeviceShape(shape), shape))
<< "Shape " << ShapeUtil::HumanString(shape)
<< " has a differently shaped representation on-device: "
@ -112,8 +108,7 @@ StatusOr<std::unique_ptr<Literal>> TransferManager::TransferArrayFromDevice(
}

Status TransferManager::WriteTupleIndexTables(
perftools::gputools::StreamExecutor* executor,
const ShapedBuffer& device_buffer) {
se::StreamExecutor* executor, const ShapedBuffer& device_buffer) {
VLOG(2) << "Writing tuple index tables for " << device_buffer;

TF_RET_CHECK(executor->device_ordinal() == device_buffer.device_ordinal());
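Editorial note: GetPlatformTransferManagers above is the usual function-local static registry keyed by se::Platform::Id. A generic sketch of that pattern follows; the key and value types here are placeholders, not XLA's.

#include <map>

// Placeholder key/value types; in the code above the key is se::Platform::Id
// and the value is TransferManager::State.
using PlatformId = const void*;
struct Entry { int payload = 0; };

// Function-local static: constructed on first use, shared by every caller in
// the process, and intentionally never destroyed.
std::map<PlatformId, Entry>* GetRegistry() {
  static auto* registry = new std::map<PlatformId, Entry>;
  return registry;
}

// Typical use: register and later look up an entry by platform id.
void RegisterEntry(PlatformId id, Entry entry) { (*GetRegistry())[id] = entry; }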