[XLA] Convert XLA to use xla::se as a namespace alias for ::stream_executor.

PiperOrigin-RevId: 193301997
Justin Lebar 2018-04-17 21:04:35 -07:00 committed by TensorFlower Gardener
parent 41e2cd187b
commit d77a621a57
121 changed files with 443 additions and 663 deletions
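For readers skimming the hunks below: the entire change rides on one namespace alias that XLA's common headers export. The executable_run_options.h hunk notes that the alias is pulled in via tensorflow/compiler/xla/types.h, so a minimal sketch of the declaration, assuming it lives inside the xla namespace in that header as the include comment suggests, looks like this:

    // Sketch only: the exact placement inside types.h is an assumption; a C++
    // namespace alias of this shape is what the commit message describes.
    namespace xla {
    namespace se = ::stream_executor;  // lets XLA code spell se::Platform, se::Stream, ...
    }  // namespace xla

With that alias in scope, every perftools::gputools:: spelling in the diffs below collapses to se::.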


@ -443,6 +443,9 @@ cc_library(
srcs = ["executable_run_options.cc"],
hdrs = ["executable_run_options.h"],
visibility = ["//visibility:public"],
deps = [
":types",
],
)
cc_library(


@ -23,22 +23,19 @@ limitations under the License.
namespace xla {
LocalClientOptions::LocalClientOptions(perftools::gputools::Platform* platform,
LocalClientOptions::LocalClientOptions(se::Platform* platform,
int number_of_replicas,
int intra_op_parallelism_threads)
: platform_(platform),
number_of_replicas_(number_of_replicas),
intra_op_parallelism_threads_(intra_op_parallelism_threads) {}
LocalClientOptions& LocalClientOptions::set_platform(
perftools::gputools::Platform* platform) {
LocalClientOptions& LocalClientOptions::set_platform(se::Platform* platform) {
platform_ = platform;
return *this;
}
perftools::gputools::Platform* LocalClientOptions::platform() const {
return platform_;
}
se::Platform* LocalClientOptions::platform() const { return platform_; }
LocalClientOptions& LocalClientOptions::set_number_of_replicas(
int number_of_replicas) {
@ -69,7 +66,7 @@ ClientLibrary::ClientLibrary() = default;
ClientLibrary::~ClientLibrary() = default;
/* static */ StatusOr<LocalClient*> ClientLibrary::GetOrCreateLocalClient(
perftools::gputools::Platform* platform) {
se::Platform* platform) {
LocalClientOptions default_options;
default_options.set_platform(platform);
return GetOrCreateLocalClient(default_options);
@ -77,7 +74,7 @@ ClientLibrary::~ClientLibrary() = default;
/* static */ StatusOr<LocalClient*> ClientLibrary::GetOrCreateLocalClient(
const LocalClientOptions& options) {
perftools::gputools::Platform* platform = options.platform();
se::Platform* platform = options.platform();
int replica_count = options.number_of_replicas();
ClientLibrary& client_library = Singleton();
tensorflow::mutex_lock lock(client_library.service_mutex_);
@ -115,7 +112,7 @@ ClientLibrary::~ClientLibrary() = default;
}
/* static */ LocalService* ClientLibrary::GetXlaService(
perftools::gputools::Platform* platform) {
se::Platform* platform) {
ClientLibrary& client_library = Singleton();
tensorflow::mutex_lock lock(client_library.service_mutex_);
auto it = client_library.local_instances_.find(platform->id());
@ -124,8 +121,7 @@ ClientLibrary::~ClientLibrary() = default;
}
/* static */ StatusOr<CompileOnlyClient*>
ClientLibrary::GetOrCreateCompileOnlyClient(
perftools::gputools::Platform* platform) {
ClientLibrary::GetOrCreateCompileOnlyClient(se::Platform* platform) {
ClientLibrary& client_library = Singleton();
tensorflow::mutex_lock lock(client_library.service_mutex_);


@ -43,13 +43,13 @@ namespace xla {
// Options to configure the local client when it is created.
class LocalClientOptions {
public:
LocalClientOptions(perftools::gputools::Platform* platform = nullptr,
LocalClientOptions(se::Platform* platform = nullptr,
int number_of_replicas = 1,
int intra_op_parallelism_threads = -1);
// Set the platform backing the service, or nullptr for the default platform.
LocalClientOptions& set_platform(perftools::gputools::Platform* platform);
perftools::gputools::Platform* platform() const;
LocalClientOptions& set_platform(se::Platform* platform);
se::Platform* platform() const;
// Set the number of replicas to use when compiling replicated
// programs.
@ -61,7 +61,7 @@ class LocalClientOptions {
int intra_op_parallelism_threads() const;
private:
perftools::gputools::Platform* platform_;
se::Platform* platform_;
int number_of_replicas_;
int intra_op_parallelism_threads_;
};
@ -74,7 +74,7 @@ class ClientLibrary {
// platform : The platform the underlying XLA service should target. If
// null then default platform is used.
static StatusOr<LocalClient*> GetOrCreateLocalClient(
perftools::gputools::Platform* platform = nullptr);
se::Platform* platform = nullptr);
static StatusOr<LocalClient*> GetOrCreateLocalClient(
const LocalClientOptions& options);
@ -84,14 +84,14 @@ class ClientLibrary {
// Returns the service from the service thread. Only used in unit tests to
// access user computations from client.
static LocalService* GetXlaService(perftools::gputools::Platform* platform);
static LocalService* GetXlaService(se::Platform* platform);
// Singleton constructor-or-accessor for compile-only clients. Arguments:
//
// platform : The platform the underlying XLA service should target. If
// null then default platform is used.
static StatusOr<CompileOnlyClient*> GetOrCreateCompileOnlyClient(
perftools::gputools::Platform* platform = nullptr);
se::Platform* platform = nullptr);
// Clears the local instance and compile only instance caches. The client
// pointers returned by the previous GetOrCreateLocalClient() or
@ -120,12 +120,10 @@ class ClientLibrary {
};
tensorflow::mutex service_mutex_; // Guards the singleton creation state.
std::unordered_map<perftools::gputools::Platform::Id,
std::unique_ptr<LocalInstance>>
std::unordered_map<se::Platform::Id, std::unique_ptr<LocalInstance>>
local_instances_ GUARDED_BY(service_mutex_);
std::unordered_map<perftools::gputools::Platform::Id,
std::unique_ptr<CompileOnlyInstance>>
std::unordered_map<se::Platform::Id, std::unique_ptr<CompileOnlyInstance>>
compile_only_instances_ GUARDED_BY(service_mutex_);
TF_DISALLOW_COPY_AND_ASSIGN(ClientLibrary);


@ -24,8 +24,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/source_map_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
namespace se = ::perftools::gputools;
using xla::source_map_util::InvalidParameterArgument;
namespace xla {


@ -167,7 +167,7 @@ class LocalClient : public Client {
StatusOr<int> ReplicaNumberToDeviceOrdinal(int replica_number);
// Returns the platform that the underlying service targets.
perftools::gputools::Platform* platform() const;
se::Platform* platform() const;
// Returns the number of devices on the system of the service platform
// type. Not all devices may be supported by the service (see

View File

@ -29,7 +29,7 @@ namespace xla {
// Returns a string that represents the device in terms of platform and ordinal;
// e.g. the first CUDA device will be "cuda:0"
string DeviceIdentifier(perftools::gputools::StreamExecutor* stream_exec) {
string DeviceIdentifier(se::StreamExecutor* stream_exec) {
return tensorflow::strings::StrCat(stream_exec->platform()->Name(), ":",
stream_exec->device_ordinal());
}


@ -16,6 +16,9 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
#define TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
// Pulls in the ::stream_executor -> ::xla::se namespace alias.
#include "tensorflow/compiler/xla/types.h"
// These classes are forward declared so that ExecutableRunOptions can be linked
// into an XLA-compiled binary without having to link all of the pointed-to
// objects (e.g., for an ahead-of-time compiled CPU binary, the gpu tools don't
@ -37,10 +40,6 @@ struct ThreadPoolDevice;
namespace xla {
// TODO(b/77980417): Once the perftools::gputools -> stream_executor migration
// is complete, add "using namespace se = stream_executor" here and
// s/stream_executor/se::/ to match our idiom elsewhere.
class DeviceMemoryAllocator;
class DeviceAssignment;
class ExecutionProfile;


@ -20,7 +20,7 @@ limitations under the License.
namespace xla {
/* static */ StatusOr<std::unique_ptr<GRPCService>> GRPCService::NewService(
perftools::gputools::Platform* platform) {
se::Platform* platform) {
std::unique_ptr<GRPCService> grpc_service(new GRPCService());
TF_ASSIGN_OR_RETURN(grpc_service->service_,
::xla::Service::NewService(platform));


@ -29,7 +29,7 @@ class GRPCService : public grpc::XlaService::Service {
// that the service should target. If platform is null then the default
// platform is used.
static StatusOr<std::unique_ptr<GRPCService>> NewService(
perftools::gputools::Platform* platform = nullptr);
se::Platform* platform = nullptr);
::grpc::Status Computation(::grpc::ServerContext* context,
const ComputationRequest* arg,


@ -204,7 +204,7 @@ StatusOr<std::vector<const ShapedBuffer*>> AllocationTracker::ResolveInternal(
}
void AllocationTracker::AddAllocationOrIncrementRefCount(
perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal) {
se::DeviceMemoryBase device_memory, int device_ordinal) {
AllocationMap& allocation_map = opaque_to_allocation_map_[device_ordinal];
auto it = allocation_map.find(device_memory.opaque());
if (it == allocation_map.end()) {
@ -215,8 +215,8 @@ void AllocationTracker::AddAllocationOrIncrementRefCount(
}
}
Status AllocationTracker::DecrementRefCount(
perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal) {
Status AllocationTracker::DecrementRefCount(se::DeviceMemoryBase device_memory,
int device_ordinal) {
AllocationMap& allocation_map = opaque_to_allocation_map_[device_ordinal];
auto it = allocation_map.find(device_memory.opaque());
TF_RET_CHECK(it != allocation_map.end());


@ -77,7 +77,7 @@ class AllocationTracker {
// Data structure encapsulating single memory allocation on the device.
struct Allocation {
// The pointer to this allocation.
perftools::gputools::DeviceMemoryBase device_memory;
se::DeviceMemoryBase device_memory;
// The device that the memory is allocated on.
int device_ordinal;
@ -103,13 +103,13 @@ class AllocationTracker {
// Adds the given device address to the allocation tracker, or if it already
// exists, then increment it's reference count.
void AddAllocationOrIncrementRefCount(
perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal)
void AddAllocationOrIncrementRefCount(se::DeviceMemoryBase device_memory,
int device_ordinal)
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Decrements the reference count of the given device memory. Then, if it is
// zero, deallocate the memory.
Status DecrementRefCount(perftools::gputools::DeviceMemoryBase device_memory,
Status DecrementRefCount(se::DeviceMemoryBase device_memory,
int device_ordinal) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// A map from device memory opaque value to allocation. One such map is


@ -36,19 +36,14 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
namespace se = ::perftools::gputools;
namespace xla {
BackendOptions& BackendOptions::set_platform(
perftools::gputools::Platform* platform) {
BackendOptions& BackendOptions::set_platform(se::Platform* platform) {
platform_ = platform;
return *this;
}
perftools::gputools::Platform* BackendOptions::platform() const {
return platform_;
}
se::Platform* BackendOptions::platform() const { return platform_; }
BackendOptions& BackendOptions::set_intra_op_parallelism_threads(
int num_threads) {
@ -77,7 +72,7 @@ struct Backend::EigenThreadPoolWrapper {
/* static */ StatusOr<std::unique_ptr<Backend>> Backend::CreateBackend(
const BackendOptions& options) {
perftools::gputools::Platform* platform = options.platform();
se::Platform* platform = options.platform();
TF_ASSIGN_OR_RETURN(auto compiler, Compiler::GetForPlatform(platform));
TF_ASSIGN_OR_RETURN(auto stream_executors,
PlatformUtil::GetStreamExecutors(platform));
@ -121,7 +116,7 @@ StatusOr<Backend::StreamPtr> Backend::BorrowStream(
}
Backend::Backend(
perftools::gputools::Platform* platform, Compiler* compiler,
se::Platform* platform, Compiler* compiler,
tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors,
TransferManager* transfer_manager, ComputationPlacer* computation_placer,
int intra_op_parallelism_threads)
@ -178,7 +173,7 @@ tensorflow::thread::ThreadPool* Backend::eigen_intra_op_thread_pool() const {
return intra_op_thread_pool_wrapper_->pool.get();
}
StatusOr<perftools::gputools::StreamExecutor*> Backend::stream_executor(
StatusOr<se::StreamExecutor*> Backend::stream_executor(
int device_ordinal) const {
if (device_ordinal < 0 ||
device_ordinal > stream_executors_.back()->device_ordinal()) {
@ -201,9 +196,9 @@ StatusOr<bool> Backend::devices_equivalent(int device_ordinal_a,
// bit crude but works for GPUs which is the important case where we compile
// an executable for one GPU and want to know if it will run (well) on
// another.
TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * executor_a,
TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor_a,
stream_executor(device_ordinal_a));
TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * executor_b,
TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor_b,
stream_executor(device_ordinal_b));
return (executor_a->GetDeviceDescription().name() ==
executor_b->GetDeviceDescription().name());


@ -44,8 +44,8 @@ namespace xla {
class BackendOptions {
public:
// Set the platform backing the backend, or nullptr for the default platform.
BackendOptions& set_platform(perftools::gputools::Platform* platform);
perftools::gputools::Platform* platform() const;
BackendOptions& set_platform(se::Platform* platform);
se::Platform* platform() const;
// Sets the thread pool size for parallel execution of an individual operator.
// The default value of -1 will result in initializing the thread pool with
@ -54,7 +54,7 @@ class BackendOptions {
int intra_op_parallelism_threads() const;
private:
perftools::gputools::Platform* platform_ = nullptr;
se::Platform* platform_ = nullptr;
int intra_op_parallelism_threads_ = -1;
};
@ -66,7 +66,7 @@ class BackendOptions {
// StreamPtr stream = backend->BorrowStream().ConsumeValueOrDie();
class Backend {
public:
using StreamPtr = Pool<perftools::gputools::Stream>::SmartPtr;
using StreamPtr = Pool<se::Stream>::SmartPtr;
// Creates a new backend.
static StatusOr<std::unique_ptr<Backend>> CreateBackend(
@ -79,7 +79,7 @@ class Backend {
~Backend();
// Accessors for the various objects.
perftools::gputools::Platform* platform() const { return platform_; }
se::Platform* platform() const { return platform_; }
Compiler* compiler() const { return compiler_; }
DeviceMemoryAllocator* memory_allocator() const {
return memory_allocator_.get();
@ -96,19 +96,17 @@ class Backend {
// Returns stream executors of all supported devices for this backend. The
// executors are ordered by the device ordinal.
const std::vector<perftools::gputools::StreamExecutor*>& stream_executors()
const {
const std::vector<se::StreamExecutor*>& stream_executors() const {
return stream_executors_;
}
// Returns the stream executor for the given device ordinal.
StatusOr<perftools::gputools::StreamExecutor*> stream_executor(
int device_ordinal) const;
StatusOr<se::StreamExecutor*> stream_executor(int device_ordinal) const;
// Returns the stream executor for the default device ordinal. This stream
// executor can only be used when the number of computations is 1 (replication
// can be > 1).
perftools::gputools::StreamExecutor* default_stream_executor() const {
se::StreamExecutor* default_stream_executor() const {
CHECK(!stream_executors_.empty());
return stream_executors_[0];
}
@ -117,8 +115,7 @@ class Backend {
// internal pool, or by constructing/initializating it, and returns the result
// to the caller.
StatusOr<StreamPtr> BorrowStream(int device_ordinal);
StatusOr<StreamPtr> BorrowStream(
perftools::gputools::StreamExecutor* executor);
StatusOr<StreamPtr> BorrowStream(se::StreamExecutor* executor);
// Returns a function to borrow a stream, as `BorrowStream` above does.
// Purely for convenience, the caller could rather make this anonymous
@ -157,29 +154,26 @@ class Backend {
private:
struct EigenThreadPoolWrapper;
Backend(perftools::gputools::Platform* platform, Compiler* compiler,
tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
stream_executors,
Backend(se::Platform* platform, Compiler* compiler,
tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors,
TransferManager* transfer_manager,
ComputationPlacer* computation_placer,
int intra_op_parallelism_threads);
Backend(const Backend&) = delete;
Backend& operator=(const Backend&) = delete;
perftools::gputools::Platform* platform_;
se::Platform* platform_;
Compiler* compiler_;
TransferManager* transfer_manager_;
ComputationPlacer* computation_placer_;
// Vector of stream executors. stream_executors_[0] is the default executor.
std::vector<perftools::gputools::StreamExecutor*> stream_executors_;
std::vector<se::StreamExecutor*> stream_executors_;
tensorflow::mutex mu_;
// Mapping from stream executor to stream pools, used by `BorrowStream` above.
std::map<perftools::gputools::StreamExecutor*,
Pool<perftools::gputools::Stream>>
stream_pools_ GUARDED_BY(mu_);
std::map<se::StreamExecutor*, Pool<se::Stream>> stream_pools_ GUARDED_BY(mu_);
// The default memory allocator to use.
std::unique_ptr<StreamExecutorMemoryAllocator> memory_allocator_;


@ -37,7 +37,7 @@ limitations under the License.
namespace xla {
/* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
CompileOnlyService::NewService(perftools::gputools::Platform* platform) {
CompileOnlyService::NewService(se::Platform* platform) {
ServiceOptions default_options;
default_options.set_platform(platform);
return NewService(default_options);
@ -45,7 +45,7 @@ CompileOnlyService::NewService(perftools::gputools::Platform* platform) {
/* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
CompileOnlyService::NewService(const ServiceOptions& options) {
perftools::gputools::Platform* platform = options.platform();
se::Platform* platform = options.platform();
if (platform == nullptr) {
TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
}


@ -34,7 +34,7 @@ class CompileOnlyService : public Service {
// platform that the service should target. If platform is null then the
// default platform is used.
static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
perftools::gputools::Platform* platform);
se::Platform* platform);
static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
const ServiceOptions& options);


@ -23,26 +23,21 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
namespace se = ::perftools::gputools;
namespace xla {
/* static */ tensorflow::mutex Compiler::platform_compiler_mutex_(
tensorflow::LINKER_INITIALIZED);
/* static */ std::map<perftools::gputools::Platform::Id,
Compiler::CompilerFactory>*
/* static */ std::map<se::Platform::Id, Compiler::CompilerFactory>*
Compiler::GetPlatformCompilerFactories() {
static auto* r =
new std::map<perftools::gputools::Platform::Id, CompilerFactory>;
static auto* r = new std::map<se::Platform::Id, CompilerFactory>;
return r;
}
/* static */
std::map<perftools::gputools::Platform::Id, std::unique_ptr<Compiler>>*
std::map<se::Platform::Id, std::unique_ptr<Compiler>>*
Compiler::GetPlatformCompilers() {
static auto* r = new std::map<perftools::gputools::Platform::Id,
std::unique_ptr<Compiler>>;
static auto* r = new std::map<se::Platform::Id, std::unique_ptr<Compiler>>;
return r;
}


@ -70,7 +70,7 @@ class AotCompilationOptions {
virtual ~AotCompilationOptions() = default;
// Returns the ID of the platform to which these options apply.
virtual perftools::gputools::Platform::Id PlatformId() const = 0;
virtual se::Platform::Id PlatformId() const = 0;
// Optional allocator that may be used for allocating temp space on the device
// during compilation.
@ -109,7 +109,7 @@ class Compiler {
virtual ~Compiler() {}
// Returns the ID of the platform that this compiler targets.
virtual perftools::gputools::Platform::Id PlatformId() const = 0;
virtual se::Platform::Id PlatformId() const = 0;
// Runs Hlo passes to optimize the given Hlo module, returns the optimized
// module.
@ -120,8 +120,7 @@ class Compiler {
// algorithm over those buffers, to see which variant is fastest. Any space
// allocated should be deallocated before this function returns.
virtual StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* executor,
std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
DeviceMemoryAllocator* device_allocator) = 0;
// Compiles the HLO module for execution on a device given by the executor,
@ -137,8 +136,7 @@ class Compiler {
//
// Use the overload below to compile computations that run in parallel.
virtual StatusOr<std::unique_ptr<Executable>> RunBackend(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* executor,
std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
DeviceMemoryAllocator* device_allocator) = 0;
// Compiles a set of HLO modules that can run in parallel, potentially
@ -151,8 +149,7 @@ class Compiler {
// modules to RunHloPasses and RunBackends.
virtual StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
std::vector<std::unique_ptr<HloModule>> modules,
std::vector<std::vector<perftools::gputools::StreamExecutor*>>
stream_exec,
std::vector<std::vector<se::StreamExecutor*>> stream_exec,
DeviceMemoryAllocator* device_allocator) = 0;
// Compiles the HLO module for ahead-of-time execution. This is intended for
@ -171,14 +168,12 @@ class Compiler {
// be a singleton, so no ownership is transferred.
//
// Precondition: a platform kind must not be registered more than once.
static void RegisterCompilerFactory(
perftools::gputools::Platform::Id platform_id,
CompilerFactory compiler_factory);
static void RegisterCompilerFactory(se::Platform::Id platform_id,
CompilerFactory compiler_factory);
// Returns the compiler singleton pointer if it is available for the given
// platform, or an error status if it is not.
static StatusOr<Compiler*> GetForPlatform(
const perftools::gputools::Platform* platform);
static StatusOr<Compiler*> GetForPlatform(const se::Platform* platform);
// Returns a function that computes the size in bytes of the logical
// buffer that contains a shape.
@ -198,12 +193,12 @@ class Compiler {
static tensorflow::mutex platform_compiler_mutex_;
// Map from platform kind to compiler factory.
static std::map<perftools::gputools::Platform::Id, CompilerFactory>*
static std::map<se::Platform::Id, CompilerFactory>*
GetPlatformCompilerFactories();
// Map from platform kind to compiler instance, if we made one already (based
// on the factories above).
static std::map<perftools::gputools::Platform::Id, std::unique_ptr<Compiler>>*
static std::map<se::Platform::Id, std::unique_ptr<Compiler>>*
GetPlatformCompilers();
};


@ -32,8 +32,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
namespace se = ::perftools::gputools;
namespace xla {
Status DeviceAssignment::Serialize(DeviceAssignmentProto* proto) const {
@ -132,11 +130,9 @@ StatusOr<DeviceAssignment> ComputationPlacer::AssignDevices(
ComputationPlacer::platform_computation_placer_mutex_(
tensorflow::LINKER_INITIALIZED);
/* static */ std::map<perftools::gputools::Platform::Id,
ComputationPlacer::State>*
/* static */ std::map<se::Platform::Id, ComputationPlacer::State>*
ComputationPlacer::GetPlatformComputationPlacers() {
static auto* r =
new std::map<perftools::gputools::Platform::Id, ComputationPlacer::State>;
static auto* r = new std::map<se::Platform::Id, ComputationPlacer::State>;
return r;
}
@ -147,10 +143,10 @@ static std::unique_ptr<xla::ComputationPlacer> CreateComputationPlacer() {
}
static bool InitModule() {
xla::ComputationPlacer::RegisterComputationPlacer(se::host::kHostPlatformId,
&CreateComputationPlacer);
xla::ComputationPlacer::RegisterComputationPlacer(se::cuda::kCudaPlatformId,
&CreateComputationPlacer);
xla::ComputationPlacer::RegisterComputationPlacer(
stream_executor::host::kHostPlatformId, &CreateComputationPlacer);
xla::ComputationPlacer::RegisterComputationPlacer(
stream_executor::cuda::kCudaPlatformId, &CreateComputationPlacer);
return true;
}
static bool module_initialized = InitModule();


@ -80,13 +80,13 @@ class ComputationPlacer {
// Registers a computation placer creation function for a particular platform.
static void RegisterComputationPlacer(
perftools::gputools::Platform::Id platform_id,
se::Platform::Id platform_id,
ComputationPlacerCreationFunction creation_function);
// Returns the computation placer singleton pointer if it is available for the
// given platform, or an error status if it is not.
static StatusOr<ComputationPlacer*> GetForPlatform(
const perftools::gputools::Platform* platform);
const se::Platform* platform);
private:
// The mutex that guards the platform-to-computation placer map.
@ -101,10 +101,9 @@ class ComputationPlacer {
};
// Map from platform kind to computation placer singleton.
static std::map<perftools::gputools::Platform::Id, State>*
GetPlatformComputationPlacers();
static std::map<se::Platform::Id, State>* GetPlatformComputationPlacers();
perftools::gputools::Platform::Id platform_id_;
se::Platform::Id platform_id_;
TF_DISALLOW_COPY_AND_ASSIGN(ComputationPlacer);
};


@ -100,8 +100,6 @@ limitations under the License.
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
namespace se = ::perftools::gputools;
namespace xla {
namespace cpu {
@ -440,8 +438,7 @@ Status VerifyLlvmModule(const llvm::Module& llvm_module) {
} // namespace
StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* /*stream_exec*/,
std::unique_ptr<HloModule> module, se::StreamExecutor* /*stream_exec*/,
DeviceMemoryAllocator* /*device_allocator*/) {
VLOG(2) << "Before optimization:";
XLA_VLOG_LINES(2, module->ToString());
@ -454,8 +451,7 @@ StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
}
StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* stream_exec,
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* /*device_allocator*/) {
const string timer_message =
"Compiling [" + module->name() + "] for CPU using JIT";
@ -938,9 +934,9 @@ HloCostAnalysis::ShapeSizeFunction CpuCompiler::ShapeSizeBytesFunction() const {
} // namespace xla
static bool InitModule() {
xla::Compiler::RegisterCompilerFactory(se::host::kHostPlatformId, []() {
return xla::MakeUnique<xla::cpu::CpuCompiler>();
});
xla::Compiler::RegisterCompilerFactory(
stream_executor::host::kHostPlatformId,
[]() { return xla::MakeUnique<xla::cpu::CpuCompiler>(); });
return true;
}
static bool module_initialized = InitModule();


@ -53,7 +53,7 @@ class CpuAotCompilationOptions : public AotCompilationOptions {
RelocationModel relocation_model);
~CpuAotCompilationOptions() override;
perftools::gputools::Platform::Id PlatformId() const override;
se::Platform::Id PlatformId() const override;
// The triple used for compilation, similar to clang's -target flag.
const string& triple() const { return triple_; }
@ -112,25 +112,23 @@ class CpuCompiler : public LLVMCompiler {
// Bring in
// StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
// std::vector<std::unique_ptr<HloModule>> modules,
// std::vector<std::vector<perftools::gputools::StreamExecutor*>>
// std::vector<std::vector<se::StreamExecutor*>>
// stream_execs)
using LLVMCompiler::Compile;
StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* stream_exec,
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) override;
StatusOr<std::unique_ptr<Executable>> RunBackend(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* stream_exec,
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) override;
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
const AotCompilationOptions& options) override;
perftools::gputools::Platform::Id PlatformId() const override;
se::Platform::Id PlatformId() const override;
HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override;


@ -45,8 +45,6 @@ limitations under the License.
#include "tensorflow/core/platform/types.h"
#include "tensorflow/stream_executor/host/host_stream.h"
namespace se = ::perftools::gputools;
namespace xla {
namespace cpu {
@ -75,7 +73,7 @@ CpuExecutable::CpuExecutable(
Status CpuExecutable::AllocateBuffers(
DeviceMemoryAllocator* memory_allocator, int device_ordinal,
std::vector<perftools::gputools::DeviceMemoryBase>* buffers) {
std::vector<se::DeviceMemoryBase>* buffers) {
CHECK_EQ(buffers->size(), assignment_->Allocations().size());
VLOG(3) << "Allocating " << assignment_->Allocations().size()
<< " allocations for module " << module().name();
@ -247,8 +245,7 @@ static Status DeallocateTempBuffers(
StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::CreateResultShapedBuffer(
const ServiceExecutableRunOptions* run_options,
tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
allocated_buffers,
tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> allocated_buffers,
std::vector<bool>* buffers_in_result) {
se::Stream* stream = run_options->stream();
auto result_buffer = MakeUnique<ShapedBuffer>(
@ -322,7 +319,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteAsyncOnStream(
"supported on CPU.");
}
auto* host_stream = dynamic_cast<perftools::gputools::host::HostStream*>(
auto* host_stream = dynamic_cast<se::host::HostStream*>(
run_options->stream()->implementation());
se::Stream* stream = run_options->stream();
DeviceMemoryAllocator* memory_allocator = run_options->allocator();


@ -90,17 +90,16 @@ class CpuExecutable : public Executable {
// assignment. Each vector element corresponds to a particular Index. If
// a vector element already contains a non-null DeviceMemoryBase, then no
// buffer is assigned for this element.
Status AllocateBuffers(
DeviceMemoryAllocator* memory_allocator, int device_ordinal,
std::vector<perftools::gputools::DeviceMemoryBase>* buffers);
Status AllocateBuffers(DeviceMemoryAllocator* memory_allocator,
int device_ordinal,
std::vector<se::DeviceMemoryBase>* buffers);
// Calls the generated function performing the computation with the given
// arguments using the supplied buffers.
Status ExecuteComputeFunction(
const ExecutableRunOptions* run_options,
tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
buffers,
tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
HloExecutionProfile* hlo_execution_profile);
// Create a ShapedBuffer for holding the result of the computation. The
@ -111,8 +110,7 @@ class CpuExecutable : public Executable {
// the returned ShapedBuffer).
StatusOr<std::unique_ptr<ShapedBuffer>> CreateResultShapedBuffer(
const ServiceExecutableRunOptions* run_options,
tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
allocated_buffers,
tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> allocated_buffers,
std::vector<bool>* buffers_in_result);
// Returns the points-to set of the root instruction of the entry


@ -34,8 +34,6 @@ limitations under the License.
#include "tensorflow/core/platform/notification.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
namespace se = ::perftools::gputools;
namespace xla {
namespace {
@ -241,21 +239,20 @@ Status CpuTransferManager::TransferLiteralFromOutfeed(
}
StatusOr<Shape> CpuTransferManager::TransferTupleBuffersFromOutfeed(
perftools::gputools::StreamExecutor* executor,
se::StreamExecutor* executor,
tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data) {
return TransferBuffersFromOutfeedInternal(executor, buffer_data,
/*is_tuple=*/true);
}
StatusOr<Shape> CpuTransferManager::TransferArrayBufferFromOutfeed(
perftools::gputools::StreamExecutor* executor, void* destination,
int64 size_bytes) {
se::StreamExecutor* executor, void* destination, int64 size_bytes) {
return TransferBuffersFromOutfeedInternal(
executor, {{destination, size_bytes}}, /*is_tuple=*/false);
}
StatusOr<Shape> CpuTransferManager::TransferBuffersFromOutfeedInternal(
perftools::gputools::StreamExecutor* executor,
se::StreamExecutor* executor,
tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data,
bool is_tuple) {
std::vector<std::unique_ptr<CpuOutfeedBuffer>> buffers;
@ -306,8 +303,8 @@ static std::unique_ptr<xla::TransferManager> CreateCpuTransferManager() {
}
static bool InitModule() {
xla::TransferManager::RegisterTransferManager(se::host::kHostPlatformId,
&CreateCpuTransferManager);
xla::TransferManager::RegisterTransferManager(
stream_executor::host::kHostPlatformId, &CreateCpuTransferManager);
return true;
}
static bool module_initialized = InitModule();


@ -37,36 +37,35 @@ class CpuTransferManager : public GenericTransferManager {
CpuTransferManager();
~CpuTransferManager() override {}
Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
Status TransferLiteralToInfeed(se::StreamExecutor* executor,
const Literal& literal) override;
Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
int64 size, const void* source) override;
Status TransferLiteralFromOutfeed(
perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
Literal* literal) override;
Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size,
const void* source) override;
Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
const Shape& literal_shape,
Literal* literal) override;
private:
// Transfers infeed data to device. InfeedBuffer->Done() must be
// called to clean up the memory allocated for InfeedBuffer.
StatusOr<cpu::runtime::XfeedBuffer*> TransferBufferToInfeedInternal(
perftools::gputools::StreamExecutor* executor, int64 size,
const void* source);
se::StreamExecutor* executor, int64 size, const void* source);
// Helper that transfers a tuple of element buffers from the device's outfeed.
StatusOr<Shape> TransferTupleBuffersFromOutfeed(
perftools::gputools::StreamExecutor* executor,
se::StreamExecutor* executor,
tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data);
// Helper that transfers an array buffer from the device's outfeed.
StatusOr<Shape> TransferArrayBufferFromOutfeed(
perftools::gputools::StreamExecutor* executor, void* destination,
int64 size_bytes);
StatusOr<Shape> TransferArrayBufferFromOutfeed(se::StreamExecutor* executor,
void* destination,
int64 size_bytes);
// On success, returns the shape that was transferred from the outfeed -- if
// is_tuple is true, the returned shape will be a tuple of the returned shapes
// for the given buffers.
StatusOr<Shape> TransferBuffersFromOutfeedInternal(
perftools::gputools::StreamExecutor* executor,
se::StreamExecutor* executor,
tensorflow::gtl::ArraySlice<std::pair<void*, int64>> buffer_data,
bool is_tuple);


@ -49,8 +49,6 @@ limitations under the License.
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/types.h"
namespace se = ::perftools::gputools;
namespace xla {
namespace cpu {
@ -325,7 +323,7 @@ const void** Executor::GetOperandBuffers(HloInstruction* instruction) {
Status ParallelCpuExecutable::AllocateBuffers(
DeviceMemoryAllocator* memory_allocator, int device_ordinal,
std::vector<perftools::gputools::DeviceMemoryBase>* buffers) {
std::vector<se::DeviceMemoryBase>* buffers) {
CHECK_EQ(buffers->size(), assignment_->Allocations().size());
VLOG(3) << "Allocating " << assignment_->Allocations().size()
<< " allocations for module " << module().name();


@ -89,17 +89,16 @@ class ParallelCpuExecutable : public Executable {
// assignment. Each vector element corresponds to a particular Index. If
// a vector element already contains a non-null DeviceMemoryBase, then no
// buffer is assigned for this element.
Status AllocateBuffers(
DeviceMemoryAllocator* memory_allocator, int device_ordinal,
std::vector<perftools::gputools::DeviceMemoryBase>* buffers);
Status AllocateBuffers(DeviceMemoryAllocator* memory_allocator,
int device_ordinal,
std::vector<se::DeviceMemoryBase>* buffers);
// Calls the generated functions in 'function_names_', performing the
// computation with the given arguments using the supplied buffers.
Status ExecuteComputeFunctions(
const ServiceExecutableRunOptions* run_options,
tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
buffers,
tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
HloExecutionProfile* hlo_execution_profile);
// Returns the points-to set of the root instruction of the entry


@ -24,19 +24,16 @@ limitations under the License.
namespace xla {
StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator(
const perftools::gputools::Platform* platform,
tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
stream_executors)
const se::Platform* platform,
tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors)
: DeviceMemoryAllocator(platform),
stream_executors_(stream_executors.begin(), stream_executors.end()) {}
StatusOr<perftools::gputools::DeviceMemoryBase>
StreamExecutorMemoryAllocator::Allocate(int device_ordinal, uint64 size,
bool retry_on_failure) {
TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * stream_executor,
StatusOr<se::DeviceMemoryBase> StreamExecutorMemoryAllocator::Allocate(
int device_ordinal, uint64 size, bool retry_on_failure) {
TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
GetStreamExecutor(device_ordinal));
perftools::gputools::DeviceMemoryBase result =
stream_executor->AllocateArray<uint8>(size);
se::DeviceMemoryBase result = stream_executor->AllocateArray<uint8>(size);
if (size > 0 && result == nullptr) {
return ResourceExhausted(
"Failed to allocate request for %s (%lluB) on device ordinal %d",
@ -47,22 +44,22 @@ StreamExecutorMemoryAllocator::Allocate(int device_ordinal, uint64 size,
}
tensorflow::Status StreamExecutorMemoryAllocator::Deallocate(
int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) {
int device_ordinal, se::DeviceMemoryBase* mem) {
if (!mem->is_null()) {
TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * stream_executor,
TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
GetStreamExecutor(device_ordinal));
// We make a local copy of 'mem' so the original is not zeroed out by the
// Deallocate() call below. This gives us a better chance of
// catching double-free bugs, since Deallocate silently succeeds for null
// values.
perftools::gputools::DeviceMemoryBase mem_copy(*mem);
se::DeviceMemoryBase mem_copy(*mem);
stream_executor->Deallocate(&mem_copy);
}
return tensorflow::Status::OK();
}
StatusOr<perftools::gputools::StreamExecutor*>
StreamExecutorMemoryAllocator::GetStreamExecutor(int device_ordinal) {
StatusOr<se::StreamExecutor*> StreamExecutorMemoryAllocator::GetStreamExecutor(
int device_ordinal) {
if (device_ordinal < 0) {
return InvalidArgument("device ordinal value (%d) must be non-negative",
device_ordinal);


@ -33,7 +33,7 @@ class DeviceMemoryAllocator {
public:
// Parameter platform indicates which platform the allocator allocates memory
// on. Must be non-null.
explicit DeviceMemoryAllocator(const perftools::gputools::Platform* platform)
explicit DeviceMemoryAllocator(const se::Platform* platform)
: platform_(platform) {}
virtual ~DeviceMemoryAllocator() {}
@ -43,20 +43,20 @@ class DeviceMemoryAllocator {
// has only performance impact.
// Allocate() should return a null pointer for a size-0 allocation.
// Deallocate() must be a no-op for null pointers.
virtual StatusOr<perftools::gputools::DeviceMemoryBase> Allocate(
virtual StatusOr<se::DeviceMemoryBase> Allocate(
int device_ordinal, uint64 size, bool retry_on_failure = true) = 0;
virtual tensorflow::Status Deallocate(
int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) = 0;
virtual tensorflow::Status Deallocate(int device_ordinal,
se::DeviceMemoryBase* mem) = 0;
// Return the platform that the allocator allocates memory on.
const perftools::gputools::Platform* platform() const { return platform_; }
const se::Platform* platform() const { return platform_; }
// Can we call Deallocate() as soon as a computation has been scheduled on
// a stream, or do we have to wait for the computation to complete first?
virtual bool AllowsAsynchronousDeallocation() const = 0;
protected:
const perftools::gputools::Platform* platform_;
const se::Platform* platform_;
};
// Default memory allocator for a platform which uses
@ -64,25 +64,23 @@ class DeviceMemoryAllocator {
class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator {
public:
StreamExecutorMemoryAllocator(
const perftools::gputools::Platform* platform,
tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
stream_executors);
const se::Platform* platform,
tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors);
StatusOr<perftools::gputools::DeviceMemoryBase> Allocate(
StatusOr<se::DeviceMemoryBase> Allocate(
int device_ordinal, uint64 size, bool retry_on_failure = true) override;
tensorflow::Status Deallocate(
int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) override;
tensorflow::Status Deallocate(int device_ordinal,
se::DeviceMemoryBase* mem) override;
bool AllowsAsynchronousDeallocation() const override;
private:
StatusOr<perftools::gputools::StreamExecutor*> GetStreamExecutor(
int device_ordinal);
StatusOr<se::StreamExecutor*> GetStreamExecutor(int device_ordinal);
// A vector indexed by device ordinal of StreamExecutors for each device of
// the allocator's platform type. If an element is nullptr, then the device
// with the respective device ordinal is not supported by XLA.
std::vector<perftools::gputools::StreamExecutor*> stream_executors_;
std::vector<se::StreamExecutor*> stream_executors_;
};
} // namespace xla


@ -61,10 +61,10 @@ Executable::ExecuteOnStreams(
StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper(
const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile,
ArraySlice<const ShapedBuffer*> arguments) {
perftools::gputools::Stream* stream = run_options->stream();
std::unique_ptr<perftools::gputools::Timer> timer;
se::Stream* stream = run_options->stream();
std::unique_ptr<se::Timer> timer;
if (profile != nullptr) {
timer.reset(new perftools::gputools::Timer(stream->parent()));
timer.reset(new se::Timer(stream->parent()));
stream->InitTimer(timer.get()).ThenStartTimer(timer.get());
}


@ -90,7 +90,7 @@ class Executable {
// has completed.
virtual Status PopulateExecutionProfile(
HloExecutionProfile* hlo_execution_profile,
perftools::gputools::StreamExecutor* executor) {
se::StreamExecutor* executor) {
return Status::OK();
}


@ -32,8 +32,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
namespace se = ::perftools::gputools;
namespace xla {
GenericTransferManager::GenericTransferManager(se::Platform::Id platform_id,
@ -45,9 +43,9 @@ se::Platform::Id GenericTransferManager::PlatformId() const {
}
Status GenericTransferManager::WriteSingleTupleIndexTable(
perftools::gputools::StreamExecutor* executor,
se::StreamExecutor* executor,
tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> elements,
const Shape& shape, perftools::gputools::DeviceMemoryBase* region) {
const Shape& shape, se::DeviceMemoryBase* region) {
TF_RET_CHECK(elements.size() == ShapeUtil::TupleElementCount(shape));
std::vector<const void*> element_pointers;
@ -144,20 +142,19 @@ Status GenericTransferManager::TransferLiteralToInfeed(
}
Status GenericTransferManager::TransferBufferToInfeed(
perftools::gputools::StreamExecutor* executor, int64 size,
const void* source) {
se::StreamExecutor* executor, int64 size, const void* source) {
return Unimplemented("Generic transfer to Infeed");
}
Status GenericTransferManager::TransferLiteralFromOutfeed(
perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
se::StreamExecutor* executor, const Shape& literal_shape,
Literal* literal) {
return Unimplemented(
"Outfeed is not supported on this platform (b/30467474)");
}
Status GenericTransferManager::ResetDevices(
tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
tensorflow::gtl::ArraySlice<se::StreamExecutor*>
/*executors*/) {
return Unimplemented(
"Device reset is not yet supported on this platform (b/30481585)");


@ -36,46 +36,41 @@ namespace xla {
// infeed.
class GenericTransferManager : public TransferManager {
public:
GenericTransferManager(perftools::gputools::Platform::Id platform_id,
size_t pointer_size);
GenericTransferManager(se::Platform::Id platform_id, size_t pointer_size);
~GenericTransferManager() override {}
perftools::gputools::Platform::Id PlatformId() const override;
se::Platform::Id PlatformId() const override;
StatusOr<std::unique_ptr<Literal>> TransferLiteralFromDevice(
perftools::gputools::StreamExecutor* executor,
const ShapedBuffer& device_buffer) override;
se::StreamExecutor* executor, const ShapedBuffer& device_buffer) override;
Status TransferLiteralToDevice(perftools::gputools::StreamExecutor* executor,
Status TransferLiteralToDevice(se::StreamExecutor* executor,
const Literal& literal,
const ShapedBuffer& device_buffer) override;
Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
Status TransferLiteralToInfeed(se::StreamExecutor* executor,
const Literal& literal) override;
Status TransferLiteralFromOutfeed(
perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
Literal* literal) override;
Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
const Shape& literal_shape,
Literal* literal) override;
Status ResetDevices(
tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
executors) override;
tensorflow::gtl::ArraySlice<se::StreamExecutor*> executors) override;
int64 GetByteSizeRequirement(const Shape& shape) const override;
protected:
Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
int64 size, const void* source) override;
Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size,
const void* source) override;
Status WriteSingleTupleIndexTable(
perftools::gputools::StreamExecutor* executor,
tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
elements,
const Shape& shape,
perftools::gputools::DeviceMemoryBase* region) override;
se::StreamExecutor* executor,
tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> elements,
const Shape& shape, se::DeviceMemoryBase* region) override;
private:
// The platform this transfer manager targets.
const perftools::gputools::Platform::Id platform_id_;
const se::Platform::Id platform_id_;
// The size in bytes of pointers on this platform.
const size_t pointer_size_;


@ -28,8 +28,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
namespace se = ::perftools::gputools;
namespace xla {
namespace gpu {


@ -41,7 +41,7 @@ class BufferAllocations {
// user-specified result buffers) to the given buffer index. The builder
// will skip allocating buffers for registered buffer indices.
void RegisterBuffer(BufferAllocation::Index index,
perftools::gputools::DeviceMemoryBase address);
se::DeviceMemoryBase address);
// Builds a BufferAllocations object from the given buffer assignment.
// `memory_allocator` is what this function uses to allocate device memory.
@ -52,8 +52,7 @@ class BufferAllocations {
DeviceMemoryAllocator* memory_allocator);
private:
std::map<BufferAllocation::Index, perftools::gputools::DeviceMemoryBase>
registered_buffers_;
std::map<BufferAllocation::Index, se::DeviceMemoryBase> registered_buffers_;
};
BufferAllocations(const BufferAllocations&) = delete;
@ -65,22 +64,20 @@ class BufferAllocations {
// Returns the device address of buffer `buffer_index`. `buffer_index` must be
// a valid index, i.e., in [0, buffer_count). This function returns null if
// `buffer_index` is not assigned to a buffer address.
perftools::gputools::DeviceMemoryBase GetDeviceAddress(
se::DeviceMemoryBase GetDeviceAddress(
BufferAllocation::Index buffer_index) const;
// Same as above, but also adjusts the returned address for the offset and
// size contained in the given slice.
perftools::gputools::DeviceMemoryBase GetDeviceAddress(
se::DeviceMemoryBase GetDeviceAddress(
const BufferAllocation::Slice& buffer_slice) const;
perftools::gputools::DeviceMemoryBase GetTempBufferBase() const {
return temp_buffer_base_;
}
se::DeviceMemoryBase GetTempBufferBase() const { return temp_buffer_base_; }
// Tears down all buffers allocated by this object that are not in
// `live_addresses`.
tensorflow::Status TearDown(
const std::set<perftools::gputools::DeviceMemoryBase>& live_addresses,
const std::set<se::DeviceMemoryBase>& live_addresses,
const BufferAssignment& buffer_assignment);
private:
@ -92,15 +89,15 @@ class BufferAllocations {
// Sets the device address of buffer `buffer_index`.
void SetBuffer(BufferAllocation::Index buffer_index,
perftools::gputools::DeviceMemoryBase buffer);
se::DeviceMemoryBase buffer);
// An array of device pointers that stores the address of each buffer
// indexed by Index. Each element can point to a temporary buffer, an
// input buffer, or nullptr if no buffer is needed for that Index.
std::vector<perftools::gputools::DeviceMemoryBase> buffers_;
std::vector<se::DeviceMemoryBase> buffers_;
// The base address of the memory block that contains all temporary buffers.
perftools::gputools::DeviceMemoryBase temp_buffer_base_;
se::DeviceMemoryBase temp_buffer_base_;
int device_ordinal_;


@ -42,11 +42,10 @@ Status ConditionalThunk::Initialize(const GpuExecutable& executable) {
}
Status ConditionalThunk::ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) {
const BufferAllocations& buffer_allocations, se::Stream* stream) {
// Copy the predicate value from device.
bool predicate;
perftools::gputools::DeviceMemoryBase predicate_address =
se::DeviceMemoryBase predicate_address =
buffer_allocations.GetDeviceAddress(predicate_buffer_index_);
stream->ThenMemcpy(&predicate, predicate_address, sizeof(bool));


@ -49,7 +49,7 @@ class ConditionalThunk : public Thunk {
Status Initialize(const GpuExecutable& executable) override;
Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;
private:
BufferAllocation::Slice predicate_buffer_index_;


@ -25,8 +25,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
namespace se = ::perftools::gputools;
namespace xla {
namespace gpu {


@ -66,23 +66,21 @@ class ConvolutionThunk : public Thunk {
// Does the convolution for the thunk on "stream".
Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;
private:
class ScratchAllocator;
Status Convolve(
const perftools::gputools::dnn::BatchDescriptor& input_descriptor,
perftools::gputools::DeviceMemory<float> input_data,
const perftools::gputools::dnn::FilterDescriptor& filter_descriptor,
perftools::gputools::DeviceMemory<float> filter_data,
const perftools::gputools::dnn::BatchDescriptor& output_descriptor,
perftools::gputools::DeviceMemory<float> output_data,
const perftools::gputools::dnn::ConvolutionDescriptor&
convolution_descriptor,
const perftools::gputools::dnn::AlgorithmConfig& algorithm_config,
perftools::gputools::Stream* stream, ScratchAllocator* scratch_allocator,
perftools::gputools::dnn::ProfileResult* profile_result);
Status Convolve(const se::dnn::BatchDescriptor& input_descriptor,
se::DeviceMemory<float> input_data,
const se::dnn::FilterDescriptor& filter_descriptor,
se::DeviceMemory<float> filter_data,
const se::dnn::BatchDescriptor& output_descriptor,
se::DeviceMemory<float> output_data,
const se::dnn::ConvolutionDescriptor& convolution_descriptor,
const se::dnn::AlgorithmConfig& algorithm_config,
se::Stream* stream, ScratchAllocator* scratch_allocator,
se::dnn::ProfileResult* profile_result);
const CudnnConvKind convolution_kind_;


@ -30,9 +30,8 @@ HostToDeviceCopyThunk::HostToDeviceCopyThunk(
mem_size_(mem_size) {}
tensorflow::Status HostToDeviceCopyThunk::ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) {
perftools::gputools::DeviceMemoryBase destination_data =
const BufferAllocations& buffer_allocations, se::Stream* stream) {
se::DeviceMemoryBase destination_data =
buffer_allocations.GetDeviceAddress(destination_buffer_);
stream->ThenMemcpy(&destination_data, source_address_, mem_size_);
return tensorflow::Status::OK();
@ -48,11 +47,10 @@ DeviceToDeviceCopyThunk::DeviceToDeviceCopyThunk(
mem_size_(mem_size) {}
tensorflow::Status DeviceToDeviceCopyThunk::ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) {
perftools::gputools::DeviceMemoryBase destination_data =
const BufferAllocations& buffer_allocations, se::Stream* stream) {
se::DeviceMemoryBase destination_data =
buffer_allocations.GetDeviceAddress(destination_buffer_);
perftools::gputools::DeviceMemoryBase source_data =
se::DeviceMemoryBase source_data =
buffer_allocations.GetDeviceAddress(source_buffer_);
stream->ThenMemcpy(&destination_data, source_data, mem_size_);
return tensorflow::Status::OK();


@ -40,8 +40,7 @@ class HostToDeviceCopyThunk : public Thunk {
HostToDeviceCopyThunk& operator=(const HostToDeviceCopyThunk&) = delete;
tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;
private:
const void* source_address_;
@ -64,8 +63,7 @@ class DeviceToDeviceCopyThunk : public Thunk {
DeviceToDeviceCopyThunk& operator=(const DeviceToDeviceCopyThunk&) = delete;
tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;
private:
const BufferAllocation::Slice source_buffer_;


@ -28,7 +28,6 @@ limitations under the License.
namespace xla {
namespace gpu {
namespace se = ::perftools::gputools;
namespace dnn = se::dnn;
static std::pair<dnn::BatchDescriptor /*input_desc*/,


@ -60,7 +60,7 @@ class CudnnBatchNormForwardInferenceThunk : public Thunk {
const CudnnBatchNormForwardInferenceThunk&) = delete;
Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;
private:
BufferAllocation::Slice operand_;
@ -90,7 +90,7 @@ class CudnnBatchNormForwardTrainingThunk : public Thunk {
const CudnnBatchNormForwardTrainingThunk&) = delete;
Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;
private:
BufferAllocation::Slice operand_;
@ -123,7 +123,7 @@ class CudnnBatchNormBackwardThunk : public Thunk {
delete;
Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;
private:
BufferAllocation::Slice operand_;


@ -24,8 +24,6 @@ namespace xla {
namespace gpu {
namespace {
namespace se = perftools::gputools;
using se::DeviceMemoryBase;
using se::dnn::AlgorithmConfig;
using se::dnn::AlgorithmDesc;


@ -33,9 +33,8 @@ class CudnnConvolutionAlgorithmPicker : public HloPassInterface {
// If the `allocator` parameter is not null, we will use it to allocate temp
// memory while timing the various convolution algorithms. If it's null,
// we'll use the default allocator on the StreamExecutor.
CudnnConvolutionAlgorithmPicker(
perftools::gputools::StreamExecutor* stream_exec,
DeviceMemoryAllocator* allocator)
CudnnConvolutionAlgorithmPicker(se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* allocator)
: stream_exec_(stream_exec), allocator_(allocator) {}
tensorflow::StringPiece name() const override {
@ -52,7 +51,7 @@ class CudnnConvolutionAlgorithmPicker : public HloPassInterface {
const Shape& output_shape, const Window& window,
const ConvolutionDimensionNumbers& dnums, HloInstruction* instr);
perftools::gputools::StreamExecutor* stream_exec_; // never null
se::StreamExecutor* stream_exec_; // never null
DeviceMemoryAllocator* allocator_; // may be null
};
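
As a point of reference, a minimal sketch of how this pass might be added to an optimization pipeline under the new se:: alias. The header paths, the AddPass call site, and the helper name are assumptions for illustration, not part of this change.

// Sketch only; header paths and the call site are assumed.
#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h"
#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"

namespace xla {
namespace gpu {

// Passing a null `allocator` means autotuning temp buffers come from the
// StreamExecutor's default allocator, as the comment above states.
void AddConvAlgorithmPicker(HloPassPipeline* pipeline,
                            se::StreamExecutor* stream_exec,
                            DeviceMemoryAllocator* allocator) {
  pipeline->AddPass<CudnnConvolutionAlgorithmPicker>(stream_exec, allocator);
}

}  // namespace gpu
}  // namespace xla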

View File

@ -22,8 +22,6 @@ namespace xla {
namespace gpu {
namespace {
namespace se = ::perftools::gputools;
using se::DeviceMemory;
using se::DeviceMemoryBase;
using se::Stream;
@ -215,14 +213,12 @@ string CudnnConvKindToString(CudnnConvKind kind) {
Status RunCudnnConvolution(
CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
perftools::gputools::DeviceMemoryBase filter_buf,
perftools::gputools::DeviceMemoryBase output_buf,
perftools::gputools::DeviceMemoryBase scratch_buf, const Window& window,
const Shape& output_shape, se::DeviceMemoryBase input_buf,
se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
se::DeviceMemoryBase scratch_buf, const Window& window,
const ConvolutionDimensionNumbers& dnums,
perftools::gputools::dnn::AlgorithmConfig algorithm,
perftools::gputools::Stream* stream,
perftools::gputools::dnn::ProfileResult* profile_result) {
se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
se::dnn::ProfileResult* profile_result) {
ScratchBufAllocator scratch_allocator(scratch_buf);
return RunCudnnConvolution(kind, input_shape, filter_shape, output_shape,
input_buf, filter_buf, output_buf,
@ -232,14 +228,12 @@ Status RunCudnnConvolution(
Status RunCudnnConvolution(
CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
perftools::gputools::DeviceMemoryBase filter_buf,
perftools::gputools::DeviceMemoryBase output_buf,
perftools::gputools::ScratchAllocator* scratch_allocator,
const Window& window, const ConvolutionDimensionNumbers& dnums,
perftools::gputools::dnn::AlgorithmConfig algorithm,
perftools::gputools::Stream* stream,
perftools::gputools::dnn::ProfileResult* profile_result) {
const Shape& output_shape, se::DeviceMemoryBase input_buf,
se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
se::ScratchAllocator* scratch_allocator, const Window& window,
const ConvolutionDimensionNumbers& dnums,
se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
se::dnn::ProfileResult* profile_result) {
PrimitiveType output_primitive_type = output_shape.element_type();
CHECK(output_primitive_type == F32 || output_primitive_type == F16)
<< ShapeUtil::HumanString(output_shape);

View File

@ -72,25 +72,21 @@ string CudnnConvKindToString(CudnnConvKind kind);
// that size, if you like.
Status RunCudnnConvolution(
CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
perftools::gputools::DeviceMemoryBase filter_buf,
perftools::gputools::DeviceMemoryBase output_buf,
perftools::gputools::DeviceMemoryBase scratch_buf, const Window& window,
const Shape& output_shape, se::DeviceMemoryBase input_buf,
se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
se::DeviceMemoryBase scratch_buf, const Window& window,
const ConvolutionDimensionNumbers& dnums,
perftools::gputools::dnn::AlgorithmConfig algorithm,
perftools::gputools::Stream* stream,
perftools::gputools::dnn::ProfileResult* profile_result = nullptr);
se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
se::dnn::ProfileResult* profile_result = nullptr);
Status RunCudnnConvolution(
CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
const Shape& output_shape, perftools::gputools::DeviceMemoryBase input_buf,
perftools::gputools::DeviceMemoryBase filter_buf,
perftools::gputools::DeviceMemoryBase output_buf,
perftools::gputools::ScratchAllocator* scratch_allocator,
const Window& window, const ConvolutionDimensionNumbers& dnums,
perftools::gputools::dnn::AlgorithmConfig algorithm,
perftools::gputools::Stream* stream,
perftools::gputools::dnn::ProfileResult* profile_result = nullptr);
const Shape& output_shape, se::DeviceMemoryBase input_buf,
se::DeviceMemoryBase filter_buf, se::DeviceMemoryBase output_buf,
se::ScratchAllocator* scratch_allocator, const Window& window,
const ConvolutionDimensionNumbers& dnums,
se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
se::dnn::ProfileResult* profile_result = nullptr);
} // namespace gpu
} // namespace xla

View File

@ -24,8 +24,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
namespace se = ::perftools::gputools;
namespace xla {
namespace gpu {

View File

@ -34,24 +34,24 @@ namespace gpu {
// released on destruction.
//
// Not thread-safe: AllocateBytes and the destructor are not locked.
class FftScratchAllocator : public perftools::gputools::ScratchAllocator {
class FftScratchAllocator : public se::ScratchAllocator {
public:
FftScratchAllocator(int device_ordinal,
DeviceMemoryAllocator* memory_allocator);
~FftScratchAllocator() override;
int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override;
int64 GetMemoryLimitInBytes(se::Stream* stream) override;
int64 TotalAllocatedBytes() { return total_allocated_bytes_; }
perftools::gputools::port::StatusOr<perftools::gputools::DeviceMemory<uint8>>
AllocateBytes(perftools::gputools::Stream* stream, int64 byte_size) override;
se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
se::Stream* stream, int64 byte_size) override;
private:
const int device_ordinal_;
DeviceMemoryAllocator* memory_allocator_;
std::vector<perftools::gputools::DeviceMemoryBase> allocated_buffers_;
std::vector<se::DeviceMemoryBase> allocated_buffers_;
int64 total_allocated_bytes_ = 0;
};
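
A hedged usage sketch for the scratch allocator above, using only the members shown in this header; the surrounding stream/allocator plumbing and the InternalError helper are assumptions.

// Sketch: reserve FFT scratch space and log how much was handed out. The
// allocation is released when `allocator` goes out of scope, so real callers
// keep the allocator alive for the duration of the FFT.
Status AllocateFftScratch(se::Stream* stream,
                          DeviceMemoryAllocator* memory_allocator,
                          int device_ordinal, int64 byte_size) {
  FftScratchAllocator allocator(device_ordinal, memory_allocator);
  if (byte_size > allocator.GetMemoryLimitInBytes(stream)) {
    return InternalError("FFT scratch request exceeds the per-stream limit");
  }
  auto scratch_or = allocator.AllocateBytes(stream, byte_size);
  if (!scratch_or.ok()) {
    return InternalError("FFT scratch allocation failed");
  }
  se::DeviceMemory<uint8> scratch = scratch_or.ValueOrDie();
  VLOG(1) << "FFT scratch in use: " << allocator.TotalAllocatedBytes()
          << " bytes (" << scratch.size() << " bytes just allocated)";
  return Status::OK();
}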
@ -74,16 +74,15 @@ class FftThunk : public Thunk {
// Does the FFT for the thunk on "stream".
tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;
private:
const perftools::gputools::fft::Type fft_type_;
const se::fft::Type fft_type_;
const std::vector<int64> fft_length_;
float scale_factor_;
std::unique_ptr<perftools::gputools::fft::Plan> fft_plan_;
std::unique_ptr<se::fft::Plan> fft_plan_;
const BufferAllocation::Slice input_buffer_;
const BufferAllocation::Slice output_buffer_;

View File

@ -36,8 +36,7 @@ tensorflow::Status ForThunk::Initialize(const GpuExecutable& executable) {
}
tensorflow::Status ForThunk::ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) {
const BufferAllocations& buffer_allocations, se::Stream* stream) {
for (int64 i = 0; i < loop_limit_; ++i) {
// Invoke loop body thunk sequence.
TF_RETURN_IF_ERROR(

View File

@ -38,8 +38,7 @@ class ForThunk : public Thunk {
tensorflow::Status Initialize(const GpuExecutable& executable) override;
tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;
private:
const int64 loop_limit_;

View File

@ -22,8 +22,6 @@ limitations under the License.
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/types.h"
namespace se = ::perftools::gputools;
namespace xla {
namespace gpu {

View File

@ -50,14 +50,12 @@ class GemmThunk : public Thunk {
// Does the gemm operation for the thunk on "stream", which must be non-null.
tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;
// Returns true if we'll perform autotuning if run on the given stream. If
// so, we want the GPU to be quiescent during autotuning, so as not to
// introduce noise in our results.
bool ShouldHaltAllActivityBeforeRunning(
perftools::gputools::Stream* stream) override {
bool ShouldHaltAllActivityBeforeRunning(se::Stream* stream) override {
return autotune_results_.count(
stream->parent()->GetDeviceDescription().name()) != 0;
}
@ -79,8 +77,7 @@ class GemmThunk : public Thunk {
// results. The map's value is the best algorithm we've found for this thunk
// on this device, or an error if none of the algorithms worked and we should
// use the regular gemm without an algorithm.
std::unordered_map<string,
StatusOr<::perftools::gputools::blas::AlgorithmType>>
std::unordered_map<string, StatusOr<se::blas::AlgorithmType>>
autotune_results_;
};

View File

@ -91,8 +91,6 @@ limitations under the License.
#include "tensorflow/core/platform/tracing.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
namespace se = ::perftools::gputools;
namespace xla {
namespace gpu {
@ -779,9 +777,9 @@ se::Platform::Id GpuCompiler::PlatformId() const {
} // namespace xla
static bool InitModule() {
xla::Compiler::RegisterCompilerFactory(se::cuda::kCudaPlatformId, []() {
return xla::MakeUnique<xla::gpu::GpuCompiler>();
});
xla::Compiler::RegisterCompilerFactory(
stream_executor::cuda::kCudaPlatformId,
[]() { return xla::MakeUnique<xla::gpu::GpuCompiler>(); });
return true;
}
static bool module_initialized = InitModule();

View File

@ -45,25 +45,23 @@ class GpuCompiler : public LLVMCompiler {
// Bring in
// StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
// std::vector<std::unique_ptr<HloModule>> modules,
// std::vector<std::vector<perftools::gputools::StreamExecutor*>>
// std::vector<std::vector<se::StreamExecutor*>>
// stream_execs)
using LLVMCompiler::Compile;
StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* stream_exec,
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) override;
StatusOr<std::unique_ptr<Executable>> RunBackend(
std::unique_ptr<HloModule> module,
perftools::gputools::StreamExecutor* stream_exec,
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) override;
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> module,
AotCompilationOptions const& options) override;
perftools::gputools::Platform::Id PlatformId() const override;
se::Platform::Id PlatformId() const override;
HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override {
// Capture just the pointer size, not the entire GpuCompiler object.

View File

@ -34,8 +34,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
namespace se = ::perftools::gputools;
namespace xla {
namespace gpu {
namespace {
@ -324,7 +322,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
this->assignment_->GetUniqueSlice(src_hlo, sources[0]->index()));
CHECK(!slice.allocation()->is_entry_computation_parameter());
perftools::gputools::DeviceMemoryBase src_base =
se::DeviceMemoryBase src_base =
buffer_allocations->GetDeviceAddress(slice.index());
CHECK(!src_base.is_null() || src_base.size() == 0);
*device_memory = src_base;

View File

@ -33,8 +33,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
namespace se = ::perftools::gputools;
namespace xla {
// TODO(b/30467474) Once GPU infeed implementation settles, consider
@ -153,8 +151,8 @@ static std::unique_ptr<xla::TransferManager> CreateGpuTransferManager() {
}
static bool InitModule() {
xla::TransferManager::RegisterTransferManager(se::cuda::kCudaPlatformId,
&CreateGpuTransferManager);
xla::TransferManager::RegisterTransferManager(
stream_executor::cuda::kCudaPlatformId, &CreateGpuTransferManager);
return true;
}
static bool module_initialized = InitModule();

View File

@ -36,21 +36,20 @@ class GpuTransferManager : public GenericTransferManager {
GpuTransferManager();
~GpuTransferManager() override {}
Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
Status TransferLiteralToInfeed(se::StreamExecutor* executor,
const Literal& literal) override;
Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
int64 size, const void* source) override;
Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size,
const void* source) override;
private:
// Initiates the infeed data transfers. InfeedBuffer->Done() must be
// called to clean up the memory allocated for InfeedBuffer.
StatusOr<gpu::InfeedBuffer*> TransferBufferToInfeedInternal(
perftools::gputools::StreamExecutor* executor, int64 size,
const void* source);
se::StreamExecutor* executor, int64 size, const void* source);
// Enqueues infeed data buffers with the infeed manager after their
// transfer completes.
Status EnqueueBuffersToInfeed(perftools::gputools::StreamExecutor* executor,
Status EnqueueBuffersToInfeed(se::StreamExecutor* executor,
std::vector<gpu::InfeedBuffer*> buffers);
TF_DISALLOW_COPY_AND_ASSIGN(GpuTransferManager);

View File

@ -19,8 +19,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/core/platform/logging.h"
namespace se = ::perftools::gputools;
namespace xla {
namespace gpu {

View File

@ -46,7 +46,7 @@ namespace gpu {
// the client. The client manages the memory of the buffer.
class InfeedBuffer {
public:
InfeedBuffer(perftools::gputools::StreamExecutor* executor, int64 length)
InfeedBuffer(se::StreamExecutor* executor, int64 length)
: executor_(executor), length_(length) {
device_memory_ = executor_->AllocateArray<uint8>(length);
CHECK(!device_memory_.is_null());
@ -60,14 +60,12 @@ class InfeedBuffer {
// client to manage memory for the infeed buffers.
void Done() { delete this; }
perftools::gputools::DeviceMemoryBase* device_memory() {
return &device_memory_;
}
se::DeviceMemoryBase* device_memory() { return &device_memory_; }
private:
perftools::gputools::StreamExecutor* executor_; // Not owned.
se::StreamExecutor* executor_; // Not owned.
const int64 length_;
perftools::gputools::DeviceMemoryBase device_memory_;
se::DeviceMemoryBase device_memory_;
};
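
A hedged sketch of the intended lifetime of an InfeedBuffer, based only on the members shown above; how the buffer is filled and handed to the infeed manager is elided.

// Sketch: the client news up the buffer (which allocates device memory in
// the constructor) and must eventually call Done(), which self-deletes.
void EnqueueOneBuffer(se::StreamExecutor* executor, int64 length) {
  InfeedBuffer* buffer = new InfeedBuffer(executor, length);
  // ... copy `length` bytes into *buffer->device_memory() and enqueue the
  // buffer with the infeed manager ...
  // When the data has been consumed, whoever holds the pointer signals
  // completion:
  buffer->Done();  // deletes the buffer and its device allocation
}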
// Client-side class used to enqueue infeed buffers.
@ -100,8 +98,7 @@ class InfeedManager {
// new stream on the first invocation. On subsequent invocations, if
// the cached executor is not the same as the requested executor,
// returns null.
perftools::gputools::Stream* GetStream(
perftools::gputools::StreamExecutor* executor);
se::Stream* GetStream(se::StreamExecutor* executor);
private:
// TODO(b/30467474): Revisit if this mutex becomes a point of
@ -121,10 +118,10 @@ class InfeedManager {
tensorflow::gtl::FlatSet<const InfeedBuffer*> dequeued_buffer_;
// Cached host to device stream for queuing infeed data.
std::unique_ptr<perftools::gputools::Stream> host_to_device_stream_;
std::unique_ptr<se::Stream> host_to_device_stream_;
// Executor that the host_to_device_stream belongs to. Not owned.
perftools::gputools::StreamExecutor* host_to_device_executor_;
se::StreamExecutor* host_to_device_executor_;
};
// Singleton creator-or-accessor: Returns the GPU infeed manager.

View File

@ -31,10 +31,10 @@ InfeedThunk::InfeedThunk(
destination_buffer_(destination_buffer) {}
Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) {
se::Stream* stream) {
VLOG(2) << "Infeeding to GPU ";
perftools::gputools::DeviceMemoryBase destination_address =
se::DeviceMemoryBase destination_address =
buffer_allocations.GetDeviceAddress(destination_buffer_);
InfeedManager* infeed_manager = GetOrCreateInfeedManager();
@ -45,7 +45,7 @@ Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
std::vector<void*> tuple_element_addresses;
for (BufferAllocation::Slice tuple_element_buffer :
tuple_element_buffers_) {
perftools::gputools::DeviceMemoryBase tuple_element_address =
se::DeviceMemoryBase tuple_element_address =
buffer_allocations.GetDeviceAddress(tuple_element_buffer);
InfeedBuffer* buffer = infeed_manager->BlockingDequeueBuffer();

View File

@ -44,7 +44,7 @@ class InfeedThunk : public Thunk {
InfeedThunk& operator=(const InfeedThunk&) = delete;
Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;
private:
const std::vector<BufferAllocation::Slice> tuple_element_buffers_;

View File

@ -32,7 +32,7 @@ class IrEmitterContext {
public:
IrEmitterContext(const HloModule* hlo_module,
const BufferAssignment* buffer_assignment,
const perftools::gputools::DeviceDescription* device_desc,
const se::DeviceDescription* device_desc,
llvm::Module* llvm_module)
: hlo_module_(hlo_module),
buffer_assignment_(buffer_assignment),
@ -47,7 +47,7 @@ class IrEmitterContext {
const BufferAssignment& buffer_assignment() const {
return *buffer_assignment_;
}
const perftools::gputools::DeviceDescription& device_description() const {
const se::DeviceDescription& device_description() const {
return *device_desc_;
}
llvm::Module* llvm_module() { return llvm_module_; }
@ -56,7 +56,7 @@ class IrEmitterContext {
private:
const HloModule* hlo_module_;
const BufferAssignment* buffer_assignment_;
const perftools::gputools::DeviceDescription* device_desc_;
const se::DeviceDescription* device_desc_;
llvm::Module* llvm_module_;
NameUniquer name_uniquer_;
};

View File

@ -23,8 +23,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
namespace se = ::perftools::gputools;
namespace xla {
namespace gpu {

View File

@ -61,8 +61,7 @@ class KernelThunk : public Thunk {
// Executes the kernel for the thunk on "stream", which must be non-null.
tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;
private:
// Buffers passed to the kernel as arguments.
@ -82,13 +81,11 @@ class KernelThunk : public Thunk {
// Describes how to load this kernel. ExecuteOnStream reuses this loader
// specification for all executions.
mutable tensorflow::mutex mutex_;
std::unique_ptr<perftools::gputools::MultiKernelLoaderSpec> loader_spec_
GUARDED_BY(mutex_);
std::unique_ptr<se::MultiKernelLoaderSpec> loader_spec_ GUARDED_BY(mutex_);
// Loaded kernels for each `StreamExecutor`
std::unordered_map<perftools::gputools::StreamExecutor*,
perftools::gputools::KernelBase>
kernel_cache_ GUARDED_BY(mutex_);
std::unordered_map<se::StreamExecutor*, se::KernelBase> kernel_cache_
GUARDED_BY(mutex_);
};
} // namespace gpu
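
The loader-spec and kernel-cache pattern above generalizes well; below is a self-contained sketch of the same idea in plain standard C++. The types are deliberately placeholders, so nothing here is taken from the real KernelThunk.

#include <mutex>
#include <string>
#include <unordered_map>

// Stand-ins for se::StreamExecutor and se::KernelBase.
struct Executor {};
struct Kernel { std::string name; };

class KernelCache {
 public:
  // Returns the kernel for `exec`, loading it on first use. The mutex keeps
  // concurrent ExecuteOnStream-style callers safe, mirroring GUARDED_BY above.
  const Kernel& GetOrLoad(Executor* exec, const std::string& kernel_name) {
    std::lock_guard<std::mutex> lock(mutex_);
    auto it = cache_.find(exec);
    if (it == cache_.end()) {
      it = cache_.emplace(exec, Kernel{kernel_name}).first;
    }
    return it->second;
  }

 private:
  std::mutex mutex_;
  std::unordered_map<Executor*, Kernel> cache_;
};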

View File

@ -19,8 +19,6 @@ limitations under the License.
namespace xla {
namespace gpu {
namespace se = ::perftools::gputools;
Status MemzeroThunk::ExecuteOnStream(
const BufferAllocations& buffer_allocations, se::Stream* stream) {
se::DeviceMemoryBase dest_data = buffer_allocations.GetDeviceAddress(dest_);

View File

@ -36,7 +36,7 @@ class MemzeroThunk : public Thunk {
: Thunk(Kind::kMemzero, hlo), dest_(dest) {}
Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;
private:
const BufferAllocation::Slice dest_;
@ -52,7 +52,7 @@ class Memset32BitValueThunk : public Thunk {
: Thunk(Kind::kMemset32BitValue, hlo), value_(value), dest_(dest) {}
Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;
private:
uint32 value_;

View File

@ -29,8 +29,6 @@ limitations under the License.
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/logging.h"
namespace se = ::perftools::gputools;
namespace xla {
namespace gpu {

View File

@ -57,8 +57,7 @@ std::ostream& operator<<(std::ostream& out,
const LaunchDimensions& launch_dims);
LaunchDimensions CalculateLaunchDimensions(
const Shape& shape,
const perftools::gputools::DeviceDescription& device_desc,
const Shape& shape, const se::DeviceDescription& device_desc,
int unroll_factor = 1);
} // namespace gpu

View File

@ -33,8 +33,7 @@ tensorflow::Status SequentialThunk::Initialize(
}
tensorflow::Status SequentialThunk::ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) {
const BufferAllocations& buffer_allocations, se::Stream* stream) {
for (const auto& thunk : thunks_) {
TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream));
}

View File

@ -40,8 +40,7 @@ class SequentialThunk : public Thunk {
tensorflow::Status Initialize(const GpuExecutable& executable) override;
tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;
private:
// The list of sub-thunks.

View File

@ -85,8 +85,7 @@ class Thunk {
// This value is not required to be constant for a given Thunk. For example,
// a Thunk that performs autotuning may return true for its first run and
// false thereafter.
virtual bool ShouldHaltAllActivityBeforeRunning(
perftools::gputools::Stream* /*stream*/) {
virtual bool ShouldHaltAllActivityBeforeRunning(se::Stream* /*stream*/) {
return false;
}
@ -104,8 +103,7 @@ class Thunk {
// called after Initialize and can be called multiple times over Thunk's
// lifetime. Stream argument must be non-null.
virtual tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) = 0;
const BufferAllocations& buffer_allocations, se::Stream* stream) = 0;
private:
Kind kind_;
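
To make the contract above concrete, a hedged sketch of a trivial Thunk subclass. The Kind value and base-class constructor follow the MemzeroThunk shown earlier in this diff; the ThenMemZero call is an assumption about the StreamExecutor API.

// Sketch: a thunk that zeroes a single buffer. Per the contract above,
// ExecuteOnStream may run many times after Initialize and `stream` is
// never null.
class ZeroBufferThunk : public Thunk {
 public:
  ZeroBufferThunk(const HloInstruction* hlo,
                  const BufferAllocation::Slice& dest)
      : Thunk(Kind::kMemzero, hlo), dest_(dest) {}

  tensorflow::Status ExecuteOnStream(
      const BufferAllocations& buffer_allocations,
      se::Stream* stream) override {
    se::DeviceMemoryBase dest = buffer_allocations.GetDeviceAddress(dest_);
    stream->ThenMemZero(&dest, dest.size());
    return tensorflow::Status::OK();
  }

 private:
  const BufferAllocation::Slice dest_;
};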

View File

@ -17,8 +17,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/util.h"
namespace se = ::perftools::gputools;
namespace xla {
namespace gpu {

View File

@ -46,8 +46,7 @@ class TupleThunk : public Thunk {
TupleThunk& operator=(const TupleThunk&) = delete;
tensorflow::Status ExecuteOnStream(
const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
const BufferAllocations& buffer_allocations, se::Stream* stream) override;
private:
const std::vector<BufferAllocation::Slice> tuple_element_buffers_;

View File

@ -41,8 +41,8 @@ Status WhileThunk::Initialize(const GpuExecutable& executable) {
}
Status WhileThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) {
perftools::gputools::DeviceMemoryBase condition_result_data =
se::Stream* stream) {
se::DeviceMemoryBase condition_result_data =
buffer_allocations.GetDeviceAddress(condition_result_buffer_index_);
while (true) {

View File

@ -47,7 +47,7 @@ class WhileThunk : public Thunk {
Status Initialize(const GpuExecutable& executable) override;
Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
perftools::gputools::Stream* stream) override;
se::Stream* stream) override;
private:
const BufferAllocation::Slice condition_result_buffer_index_;

View File

@ -88,7 +88,7 @@ std::unique_ptr<HloProfilePrinterData> CreateHloProfilePrinterData(
// down how much time each HLO took.
class HloExecutionProfile {
public:
using DeviceDescription = perftools::gputools::DeviceDescription;
using DeviceDescription = se::DeviceDescription;
HloExecutionProfile(const HloProfilePrinterData* hlo_profile_printer_data,
const HloProfileIndexMap* hlo_profile_index_map);

View File

@ -30,8 +30,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
namespace se = ::perftools::gputools;
namespace xla {
/*static*/ StatusOr<std::unique_ptr<HloModule>>

View File

@ -80,7 +80,7 @@ class HloRunner {
bool run_hlo_passes = false;
};
explicit HloRunner(::perftools::gputools::Platform* platform);
explicit HloRunner(se::Platform* platform);
~HloRunner();
@ -149,8 +149,7 @@ class HloRunner {
// will be used to configure the replication parameters. Replicated executions
// should pass the device_assignment parameter.
ServiceExecutableRunOptions GetServiceRunOptionsForDevice(
int64 device, ::perftools::gputools::Stream* stream,
DeviceAssignment* device_assignment);
int64 device, se::Stream* stream, DeviceAssignment* device_assignment);
std::unique_ptr<Backend> backend_;
};

View File

@ -41,9 +41,6 @@ limitations under the License.
namespace xla {
namespace interpreter {
namespace se = ::perftools::gputools;
namespace sep = ::perftools::gputools::interpreter;
Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {
HloPassPipeline pipeline("Interpreter");
@ -96,7 +93,7 @@ InterpreterCompiler::CompileAheadOfTime(
}
se::Platform::Id InterpreterCompiler::PlatformId() const {
return sep::kXlaInterpreterPlatformId;
return se::interpreter::kXlaInterpreterPlatformId;
}
HloCostAnalysis::ShapeSizeFunction InterpreterCompiler::ShapeSizeBytesFunction()
@ -109,11 +106,12 @@ static std::unique_ptr<xla::ComputationPlacer> CreateComputationPlacer() {
}
static bool InitModule() {
xla::Compiler::RegisterCompilerFactory(sep::kXlaInterpreterPlatformId, []() {
return xla::MakeUnique<xla::interpreter::InterpreterCompiler>();
});
xla::Compiler::RegisterCompilerFactory(
se::interpreter::kXlaInterpreterPlatformId, []() {
return xla::MakeUnique<xla::interpreter::InterpreterCompiler>();
});
xla::ComputationPlacer::RegisterComputationPlacer(
sep::kXlaInterpreterPlatformId, &CreateComputationPlacer);
se::interpreter::kXlaInterpreterPlatformId, &CreateComputationPlacer);
return true;
}

View File

@ -44,19 +44,16 @@ class InterpreterCompiler : public Compiler {
~InterpreterCompiler() override {}
StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
std::unique_ptr<HloModule> hlo_module,
perftools::gputools::StreamExecutor* stream_exec,
std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) override;
StatusOr<std::unique_ptr<Executable>> RunBackend(
std::unique_ptr<HloModule> hlo_module,
perftools::gputools::StreamExecutor* stream_exec,
std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) override;
StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
std::vector<std::unique_ptr<HloModule>> hlo_modules,
std::vector<std::vector<perftools::gputools::StreamExecutor*>>
stream_exec,
std::vector<std::vector<se::StreamExecutor*>> stream_exec,
DeviceMemoryAllocator* device_allocator) override;
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
@ -65,7 +62,7 @@ class InterpreterCompiler : public Compiler {
HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override;
perftools::gputools::Platform::Id PlatformId() const override;
se::Platform::Id PlatformId() const override;
private:
Status RunHloOptimization(HloModule* hlo_module);

View File

@ -38,8 +38,6 @@ limitations under the License.
namespace xla {
namespace interpreter {
namespace se = ::perftools::gputools;
InterpreterExecutable::InterpreterExecutable(
std::unique_ptr<const HloModule> hlo_module)
: Executable(std::move(hlo_module), /*hlo_profile_printer=*/nullptr,

View File

@ -19,8 +19,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/status_macros.h"
namespace perftools {
namespace gputools {
namespace stream_executor {
namespace interpreter {
host::HostStream *AsExecutorStream(Stream *stream) {
@ -119,5 +118,4 @@ DeviceDescription *XlaInterpreterExecutor::PopulateDeviceDescription() const {
}
} // namespace interpreter
} // namespace gputools
} // namespace perftools
} // namespace stream_executor

View File

@ -44,8 +44,7 @@ limitations under the License.
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/timer.h"
namespace perftools {
namespace gputools {
namespace stream_executor {
namespace interpreter {
using Args = tensorflow::gtl::ArraySlice<DeviceMemoryBase>;
@ -213,7 +212,6 @@ class XlaInterpreterExecutor : public internal::StreamExecutorInterface {
};
} // namespace interpreter
} // namespace gputools
} // namespace perftools
} // namespace stream_executor
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTOR_H_

View File

@ -21,12 +21,10 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/interpreter/platform_id.h"
#include "tensorflow/compiler/xla/service/transfer_manager.h"
namespace sei = ::perftools::gputools::interpreter;
namespace xla {
InterpreterTransferManager::InterpreterTransferManager()
: GenericTransferManager(sei::kXlaInterpreterPlatformId,
: GenericTransferManager(se::interpreter::kXlaInterpreterPlatformId,
/*pointer_size=*/sizeof(void*)) {}
} // namespace xla
@ -38,7 +36,8 @@ CreateInterpreterTransferManager() {
static bool InitModule() {
xla::TransferManager::RegisterTransferManager(
sei::kXlaInterpreterPlatformId, &CreateInterpreterTransferManager);
stream_executor::interpreter::kXlaInterpreterPlatformId,
&CreateInterpreterTransferManager);
return true;
}

View File

@ -28,11 +28,7 @@ limitations under the License.
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/platform.h"
namespace se = ::perftools::gputools;
namespace sep = ::perftools::gputools::interpreter;
namespace perftools {
namespace gputools {
namespace stream_executor {
namespace interpreter {
XlaInterpreterPlatform::XlaInterpreterPlatform() : name_("Interpreter") {}
@ -99,16 +95,16 @@ void XlaInterpreterPlatform::UnregisterTraceListener(TraceListener* listener) {
}
static void InitializeXlaInterpreterPlatform() {
std::unique_ptr<se::Platform> platform(new sep::XlaInterpreterPlatform);
SE_CHECK_OK(se::MultiPlatformManager::RegisterPlatform(std::move(platform)));
std::unique_ptr<Platform> platform(new XlaInterpreterPlatform);
SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
}
} // namespace interpreter
} // namespace gputools
} // namespace perftools
} // namespace stream_executor
REGISTER_MODULE_INITIALIZER(interpreter_platform,
sep::InitializeXlaInterpreterPlatform());
REGISTER_MODULE_INITIALIZER(
interpreter_platform,
stream_executor::interpreter::InitializeXlaInterpreterPlatform());
DECLARE_MODULE_INITIALIZER(multi_platform_manager);

View File

@ -23,8 +23,7 @@ limitations under the License.
#include "tensorflow/stream_executor/stream_executor.h"
#include "tensorflow/stream_executor/trace_listener.h"
namespace perftools {
namespace gputools {
namespace stream_executor {
namespace interpreter {
class XlaInterpreterPlatform : public Platform {
@ -64,7 +63,6 @@ class XlaInterpreterPlatform : public Platform {
};
} // namespace interpreter
} // namespace gputools
} // namespace perftools
} // namespace stream_executor
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_PLATFORM_H_

View File

@ -14,12 +14,10 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/service/interpreter/platform_id.h"
namespace perftools {
namespace gputools {
namespace stream_executor {
namespace interpreter {
PLATFORM_DEFINE_ID(kXlaInterpreterPlatformId);
} // namespace interpreter
} // namespace gputools
} // namespace perftools
} // namespace stream_executor

View File

@ -18,14 +18,12 @@ limitations under the License.
#include "tensorflow/stream_executor/platform.h"
namespace perftools {
namespace gputools {
namespace stream_executor {
namespace interpreter {
extern const Platform::Id kXlaInterpreterPlatformId;
} // namespace interpreter
} // namespace gputools
} // namespace perftools
} // namespace stream_executor
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_PLATFORM_ID_H_

View File

@ -23,7 +23,7 @@ limitations under the License.
namespace xla {
StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile(
std::vector<std::unique_ptr<HloModule>> modules,
std::vector<std::vector<perftools::gputools::StreamExecutor*>> stream_execs,
std::vector<std::vector<se::StreamExecutor*>> stream_execs,
DeviceMemoryAllocator* device_allocator) {
// Tensorflow tries to enable the following behaviors in all its threads:
//

View File

@ -60,19 +60,18 @@ class LLVMCompiler : public Compiler {
// Bring in
// StatusOr<std::unique_ptr<Executable>> RunBackend(
// std::unique_ptr<HloModule> module,
// perftools::gputools::StreamExecutor* stream_exec,
// se::StreamExecutor* stream_exec,
// DeviceMemoryAllocator* device_allocator)
// StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
// std::unique_ptr<HloModule> module,
// perftools::gputools::StreamExecutor* stream_exec,
// se::StreamExecutor* stream_exec,
// DeviceMemoryAllocator* device_allocator)
using Compiler::RunBackend;
using Compiler::RunHloPasses;
StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
std::vector<std::unique_ptr<HloModule>> modules,
std::vector<std::vector<perftools::gputools::StreamExecutor*>>
stream_execs,
std::vector<std::vector<se::StreamExecutor*>> stream_execs,
DeviceMemoryAllocator* device_allocator) override;
protected:

View File

@ -43,13 +43,11 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
namespace se = ::perftools::gputools;
namespace xla {
/* static */ StatusOr<std::unique_ptr<LocalService>> LocalService::NewService(
const ServiceOptions& options) {
perftools::gputools::Platform* platform = options.platform();
se::Platform* platform = options.platform();
if (platform == nullptr) {
TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
}

View File

@ -29,8 +29,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
namespace se = ::perftools::gputools;
namespace xla {
using tensorflow::str_util::Lowercase;

View File

@ -34,29 +34,27 @@ class PlatformUtil {
//
// Note that, even if a platform is present with zero devices, if we *do* have
// compilation support for it, it will be returned in this sequence.
static StatusOr<std::vector<perftools::gputools::Platform*>>
GetSupportedPlatforms();
static StatusOr<std::vector<se::Platform*>> GetSupportedPlatforms();
// Convenience function which returns the default supported platform for
// tests. If exactly one supported platform is present, then this platform is
// the default platform. If exactly two platforms are present and one of them
// is the interpreter platform, then the other platform is the default
// platform. Otherwise returns an error.
static StatusOr<perftools::gputools::Platform*> GetDefaultPlatform();
static StatusOr<se::Platform*> GetDefaultPlatform();
// Convenience function which returns the sole supported platform. If
// exactly one supported platform is present, then this platform is the
// default platform. Otherwise returns an error.
static StatusOr<perftools::gputools::Platform*> GetSolePlatform();
static StatusOr<se::Platform*> GetSolePlatform();
// Returns the platform according to the given name. Returns error if there is
// no such platform.
static StatusOr<perftools::gputools::Platform*> GetPlatform(
const string& platform_name);
static StatusOr<se::Platform*> GetPlatform(const string& platform_name);
// Returns exactly one platform that does not have given name. Returns error
// if there is no such platform, or there are multiple such platforms.
static StatusOr<perftools::gputools::Platform*> GetPlatformExceptFor(
static StatusOr<se::Platform*> GetPlatformExceptFor(
const string& platform_name);
// Returns a vector of StreamExecutors for the given platform. The vector is
@ -64,8 +62,8 @@ class PlatformUtil {
// element is nullptr, then the device is present but not supported by XLA.
//
// If the platform has no visible devices, a not-found error is returned.
static StatusOr<std::vector<perftools::gputools::StreamExecutor*>>
GetStreamExecutors(perftools::gputools::Platform* platform);
static StatusOr<std::vector<se::StreamExecutor*>> GetStreamExecutors(
se::Platform* platform);
private:
TF_DISALLOW_COPY_AND_ASSIGN(PlatformUtil);
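
A hedged sketch of typical use of the helpers above; the TF_ASSIGN_OR_RETURN plumbing matches the rest of this diff, while the logging is purely illustrative.

// Sketch: pick the default platform and enumerate its usable executors.
Status ListDefaultDevices() {
  TF_ASSIGN_OR_RETURN(se::Platform* platform,
                      PlatformUtil::GetDefaultPlatform());
  TF_ASSIGN_OR_RETURN(std::vector<se::StreamExecutor*> executors,
                      PlatformUtil::GetStreamExecutors(platform));
  for (se::StreamExecutor* executor : executors) {
    // A null entry means the device is present but not supported by XLA.
    if (executor != nullptr) {
      VLOG(1) << "usable device ordinal: " << executor->device_ordinal();
    }
  }
  return Status::OK();
}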

View File

@ -54,8 +54,6 @@ limitations under the License.
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/types.h"
namespace se = ::perftools::gputools;
using ::tensorflow::strings::Printf;
using ::tensorflow::strings::StrCat;
using ::xla::source_map_util::InvalidParameterArgument;
@ -95,15 +93,12 @@ tensorflow::Status RecordResult(const ShapedBuffer& result,
} // namespace
ServiceOptions& ServiceOptions::set_platform(
perftools::gputools::Platform* platform) {
ServiceOptions& ServiceOptions::set_platform(se::Platform* platform) {
platform_ = platform;
return *this;
}
perftools::gputools::Platform* ServiceOptions::platform() const {
return platform_;
}
se::Platform* ServiceOptions::platform() const { return platform_; }
ServiceOptions& ServiceOptions::set_number_of_replicas(int number_of_replicas) {
number_of_replicas_ = number_of_replicas;
@ -123,7 +118,7 @@ int ServiceOptions::intra_op_parallelism_threads() const {
}
/* static */ StatusOr<std::unique_ptr<Service>> Service::NewService(
perftools::gputools::Platform* platform) {
se::Platform* platform) {
ServiceOptions default_options;
default_options.set_platform(platform);
return NewService(default_options);
@ -131,7 +126,7 @@ int ServiceOptions::intra_op_parallelism_threads() const {
/* static */ StatusOr<std::unique_ptr<Service>> Service::NewService(
const ServiceOptions& options) {
perftools::gputools::Platform* platform = options.platform();
se::Platform* platform = options.platform();
std::unique_ptr<Backend> execute_backend;
if (platform == nullptr) {
TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
@ -235,8 +230,7 @@ tensorflow::Status Service::ValidateResultShapeWithLayout(
StatusOr<std::vector<std::vector<const ShapedBuffer*>>>
Service::ResolveAndValidateArguments(
tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
stream_executors) {
tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors) {
CHECK_EQ(options_.number_of_replicas(), stream_executors.size());
std::vector<std::vector<const ShapedBuffer*>> replicated_arguments;
replicated_arguments.resize(options_.number_of_replicas());
@ -349,8 +343,7 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
std::vector<VersionedComputationHandle> versioned_handles,
std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
Backend* backend,
std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
DeviceMemoryAllocator* device_allocator) {
VLOG(1) << Printf("BuildExecutable on service %p", this);
@ -412,8 +405,7 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
const std::vector<const HloModuleProto*>& module_protos,
std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
Backend* backend,
std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
DeviceMemoryAllocator* device_allocator) {
VLOG(1) << Printf("BuildExecutable on service %p", this);
@ -493,7 +485,7 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
const VersionedComputationHandle& versioned_handle,
std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile,
se::StreamExecutor* executor, ExecutionProfile* profile,
DeviceMemoryAllocator* device_allocator) {
std::shared_ptr<Executable> executable =
compilation_cache_.LookUp(versioned_handle, *module_config);
@ -541,7 +533,7 @@ Service::ExecuteParallelAndRegisterResult(
// Streams where the computation are launched, so we can wait on the streams
// to complete.
std::vector<Pool<se::Stream>::SmartPtr> streams;
std::vector<std::unique_ptr<perftools::gputools::Timer>> timers;
std::vector<std::unique_ptr<se::Timer>> timers;
// Global data handles for the computation results, one for each computation.
std::vector<GlobalDataHandle> result_handles;
@ -565,8 +557,7 @@ Service::ExecuteParallelAndRegisterResult(
streams.push_back(std::move(stream));
if (replica == 0 && profile != nullptr) {
timers.emplace_back(
new perftools::gputools::Timer(streams.back()->parent()));
timers.emplace_back(new se::Timer(streams.back()->parent()));
streams.back()
->InitTimer(timers.back().get())
.ThenStartTimer(timers.back().get());
@ -734,9 +725,9 @@ tensorflow::Status Service::SetReturnValue(const SetReturnValueRequest* arg,
return computation->SetReturnValue(arg->operand());
}
StatusOr<std::vector<perftools::gputools::StreamExecutor*>>
Service::GetExecutors(const ExecutionOptions& execution_options,
int64 requests_size, int64 request_index) const {
StatusOr<std::vector<se::StreamExecutor*>> Service::GetExecutors(
const ExecutionOptions& execution_options, int64 requests_size,
int64 request_index) const {
if (execution_options.device_handles().empty()) {
return FailedPrecondition(
"device handles must be given to execute parallel computations");
@ -748,7 +739,7 @@ Service::GetExecutors(const ExecutionOptions& execution_options,
"handles.",
requests_size, request_index, execution_options.device_handles_size());
}
std::vector<perftools::gputools::StreamExecutor*> executors;
std::vector<se::StreamExecutor*> executors;
for (const auto& device_handle : execution_options.device_handles()) {
TF_ASSIGN_OR_RETURN(auto replicas,
Replicas(*execute_backend_, device_handle));
@ -780,7 +771,7 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
VLOG(1) << "running execute-parallel request: " << arg->ShortDebugString();
std::vector<std::vector<std::vector<const ShapedBuffer*>>> all_arguments;
std::vector<std::vector<perftools::gputools::StreamExecutor*>> all_executors;
std::vector<std::vector<se::StreamExecutor*>> all_executors;
std::vector<VersionedComputationHandle> versioned_handles;
std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
std::vector<string> computation_names;
@ -891,7 +882,7 @@ tensorflow::Status Service::ExecuteGraphParallel(
VLOG(1) << "running execute-graph-parallel request";
std::vector<std::vector<std::vector<const ShapedBuffer*>>> all_arguments;
std::vector<std::vector<perftools::gputools::StreamExecutor*>> all_executors;
std::vector<std::vector<se::StreamExecutor*>> all_executors;
std::vector<const HloModuleProto*> module_protos;
std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
std::vector<string> computation_names;
@ -1953,9 +1944,9 @@ DeviceHandle Service::SingleComputationDeviceHandle() const {
return device_handle;
}
StatusOr<std::vector<perftools::gputools::StreamExecutor*>> Service::Replicas(
StatusOr<std::vector<se::StreamExecutor*>> Service::Replicas(
const Backend& backend, const DeviceHandle& device_handle) const {
std::vector<perftools::gputools::StreamExecutor*> replicas;
std::vector<se::StreamExecutor*> replicas;
for (int replica = 0; replica < options_.number_of_replicas(); ++replica) {
// From the computation placer, find out the device ids of the replicas for
// the given device handle.

View File

@ -53,8 +53,8 @@ namespace xla {
class ServiceOptions {
public:
// Set the platform backing the service, or nullptr for the default platform.
ServiceOptions& set_platform(perftools::gputools::Platform* platform);
perftools::gputools::Platform* platform() const;
ServiceOptions& set_platform(se::Platform* platform);
se::Platform* platform() const;
// Set the number of replicas to use when compiling replicated
// programs.
@ -66,7 +66,7 @@ class ServiceOptions {
int intra_op_parallelism_threads() const;
private:
perftools::gputools::Platform* platform_ = nullptr;
se::Platform* platform_ = nullptr;
int number_of_replicas_ = 1;
int intra_op_parallelism_threads_ = -1;
};
@ -79,7 +79,7 @@ class Service : public ServiceInterface {
public:
// Factory method for creating a new Service.
static StatusOr<std::unique_ptr<Service>> NewService(
perftools::gputools::Platform* platform = nullptr);
se::Platform* platform = nullptr);
static StatusOr<std::unique_ptr<Service>> NewService(
const ServiceOptions& options);
@ -286,7 +286,7 @@ class Service : public ServiceInterface {
ExecuteResponse* result);
// Prepare the executors for executing parallel.
StatusOr<std::vector<perftools::gputools::StreamExecutor*>> GetExecutors(
StatusOr<std::vector<se::StreamExecutor*>> GetExecutors(
const ExecutionOptions& execution_options, int64 requests_size,
int64 request_index) const;
@ -310,8 +310,7 @@ class Service : public ServiceInterface {
StatusOr<std::vector<std::vector<const ShapedBuffer*>>>
ResolveAndValidateArguments(
tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
stream_executors);
tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors);
// Create a Hlo module config for the given program shape and arguments.
// execution_options is optional; if not given a default is used.
@ -329,7 +328,7 @@ class Service : public ServiceInterface {
StatusOr<std::unique_ptr<Executable>> BuildExecutable(
const VersionedComputationHandle& versioned_handle,
std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
perftools::gputools::StreamExecutor* executor,
se::StreamExecutor* executor,
DeviceMemoryAllocator* device_allocator = nullptr);
// Builds an Executable for the given HLO module proto.
@ -338,7 +337,7 @@ class Service : public ServiceInterface {
StatusOr<std::unique_ptr<Executable>> BuildExecutable(
const HloModuleProto& module_proto,
std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
perftools::gputools::StreamExecutor* executor,
se::StreamExecutor* executor,
DeviceMemoryAllocator* device_allocator = nullptr);
// Same as BuildExecutable() above, but builds a list of Executables for the
@ -346,14 +345,12 @@ class Service : public ServiceInterface {
StatusOr<std::vector<std::unique_ptr<Executable>>> BuildExecutables(
std::vector<VersionedComputationHandle> versioned_handles,
std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
Backend* backend,
std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
DeviceMemoryAllocator* device_allocator);
StatusOr<std::vector<std::unique_ptr<Executable>>> BuildExecutables(
const std::vector<const HloModuleProto*>& module_protos,
std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
Backend* backend,
std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
DeviceMemoryAllocator* device_allocator);
// Similar to BuildExecutable, but look in the compilation cache for the
@ -362,7 +359,7 @@ class Service : public ServiceInterface {
StatusOr<std::shared_ptr<Executable>> BuildAndCacheExecutable(
const VersionedComputationHandle& versioned_handle,
std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile,
se::StreamExecutor* executor, ExecutionProfile* profile,
DeviceMemoryAllocator* device_allocator = nullptr);
// Runs the given executable with the given arguments and register the result
@ -411,7 +408,7 @@ class Service : public ServiceInterface {
// Returns the stream executors assigned to the replicas represented by the
// given device handle. Each device_handle is a virtual replicated device that
// represents a set of physical devices for the replicas.
StatusOr<std::vector<perftools::gputools::StreamExecutor*>> Replicas(
StatusOr<std::vector<se::StreamExecutor*>> Replicas(
const Backend& backend, const DeviceHandle& device_handle) const;
Status MaybeDumpHloModule(const HloModule& module) const;

View File

@ -28,7 +28,7 @@ namespace xla {
class ServiceExecutableRunOptions {
public:
using StreamBorrower =
std::function<StatusOr<Pool<perftools::gputools::Stream>::SmartPtr>(int)>;
std::function<StatusOr<Pool<se::Stream>::SmartPtr>(int)>;
ServiceExecutableRunOptions()
: ServiceExecutableRunOptions(ExecutableRunOptions()) {}
@ -45,14 +45,13 @@ class ServiceExecutableRunOptions {
ExecutableRunOptions* mutable_run_options() { return &run_options_; }
// Delegate to `ExecutableRunOptions` member.
perftools::gputools::Stream* stream() const { return run_options_.stream(); }
se::Stream* stream() const { return run_options_.stream(); }
DeviceMemoryAllocator* allocator() const { return run_options_.allocator(); }
int device_ordinal() const { return run_options_.device_ordinal(); }
// Borrows a stream and returns a smart pointer which returns the stream on
// destruction.
StatusOr<Pool<perftools::gputools::Stream>::SmartPtr> BorrowStream(
int device_ordinal) const {
StatusOr<Pool<se::Stream>::SmartPtr> BorrowStream(int device_ordinal) const {
return borrow_stream_
? borrow_stream_(device_ordinal)
: Status(tensorflow::error::UNIMPLEMENTED, "No stream cache");
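
A hedged sketch of the borrowing pattern above; how the run options were populated is assumed.

// Sketch: borrow a stream for one enqueue. The SmartPtr returns the stream
// to the backend's pool when it goes out of scope, so no explicit release
// is needed.
Status EnqueueOnBorrowedStream(const ServiceExecutableRunOptions& options,
                               int device_ordinal) {
  TF_ASSIGN_OR_RETURN(Pool<se::Stream>::SmartPtr stream,
                      options.BorrowStream(device_ordinal));
  // ... enqueue device work on stream.get(); the caller synchronizes later ...
  return Status::OK();
}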

View File

@ -28,8 +28,6 @@ limitations under the License.
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/logging.h"
namespace se = ::perftools::gputools;
namespace xla {
using ::tensorflow::strings::Appendf;
@ -146,7 +144,7 @@ ScopedShapedBuffer::~ScopedShapedBuffer() {
std::unique_ptr<ShapedBuffer> ScopedShapedBuffer::release() {
auto shaped_buffer = MakeUnique<ShapedBuffer>(std::move(*this));
buffers_ = ShapeTree<perftools::gputools::DeviceMemoryBase>();
buffers_ = ShapeTree<se::DeviceMemoryBase>();
return shaped_buffer;
}

View File

@ -41,8 +41,7 @@ class ShapedBuffer {
// determines the number of device allocations (DeviceMemoryBase) held by the
// ShapedBuffer.
ShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape,
const perftools::gputools::Platform* platform,
int device_ordinal);
const se::Platform* platform, int device_ordinal);
// Returns the shape of the on-host representation of the data held by this
// ShapedBuffer.
@ -52,35 +51,29 @@ class ShapedBuffer {
// ShapedBuffer.
const Shape& on_device_shape() const { return on_device_shape_; }
const perftools::gputools::Platform* platform() const { return platform_; }
const se::Platform* platform() const { return platform_; }
int device_ordinal() const { return device_ordinal_; }
// Return the root buffer of the shape (shape index {}).
const perftools::gputools::DeviceMemoryBase& root_buffer() const {
const se::DeviceMemoryBase& root_buffer() const {
return buffer(/*index=*/{});
}
// Returns the buffer at the given shape index where index is defined as in
// ShapeUtil::GetSubshape.
const perftools::gputools::DeviceMemoryBase& buffer(
const ShapeIndex& index) const {
const se::DeviceMemoryBase& buffer(const ShapeIndex& index) const {
return buffers_.element(index);
}
// Sets the device memory buffer at the given index.
void set_buffer(const perftools::gputools::DeviceMemoryBase& buffer,
const ShapeIndex& index) {
void set_buffer(const se::DeviceMemoryBase& buffer, const ShapeIndex& index) {
*buffers_.mutable_element(index) = buffer;
}
// Returns the underlying ShapeTree containing all the device addresses in the
// ShapedBuffer.
const ShapeTree<perftools::gputools::DeviceMemoryBase>& buffers() const {
return buffers_;
}
ShapeTree<perftools::gputools::DeviceMemoryBase>& buffers() {
return buffers_;
}
const ShapeTree<se::DeviceMemoryBase>& buffers() const { return buffers_; }
ShapeTree<se::DeviceMemoryBase>& buffers() { return buffers_; }
// Set all device memory pointers in the object to null.
void clear();
@ -101,13 +94,13 @@ class ShapedBuffer {
Shape on_device_shape_;
// The platform the memory is allocated on.
const perftools::gputools::Platform* platform_;
const se::Platform* platform_;
// The device the memory is allocated on.
int device_ordinal_;
// The tree of device buffers. Its shape is on_device_shape().
ShapeTree<perftools::gputools::DeviceMemoryBase> buffers_;
ShapeTree<se::DeviceMemoryBase> buffers_;
};
std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer);
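
A hedged inspection sketch using only the accessors declared above; it assumes the buffer holds a tuple with at least one element.

// Sketch: read the root allocation and the first tuple element by shape index.
void InspectShapedBuffer(const ShapedBuffer& buffer) {
  const se::DeviceMemoryBase& root = buffer.root_buffer();   // shape index {}
  const se::DeviceMemoryBase& elem0 = buffer.buffer({0});    // first element
  VLOG(2) << "root bytes: " << root.size()
          << ", element 0 bytes: " << elem0.size();
}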

View File

@ -25,24 +25,20 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
namespace se = ::perftools::gputools;
namespace xla {
/* static */ tensorflow::mutex
TransferManager::platform_transfer_manager_mutex_(
tensorflow::LINKER_INITIALIZED);
/* static */ std::map<perftools::gputools::Platform::Id,
TransferManager::State>*
/* static */ std::map<se::Platform::Id, TransferManager::State>*
TransferManager::GetPlatformTransferManagers() {
static auto* r =
new std::map<perftools::gputools::Platform::Id, TransferManager::State>;
static auto* r = new std::map<se::Platform::Id, TransferManager::State>;
return r;
}
Status TransferManager::TransferArrayToDevice(
perftools::gputools::StreamExecutor* executor, const Literal& literal,
const perftools::gputools::DeviceMemoryBase& dest) {
se::StreamExecutor* executor, const Literal& literal,
const se::DeviceMemoryBase& dest) {
const Shape on_device_shape = HostShapeToDeviceShape(literal.shape());
TF_RET_CHECK(ShapeUtil::IsArray(on_device_shape))
<< "On-device representation of "
@ -61,8 +57,8 @@ Status TransferManager::TransferArrayToDevice(
}
StatusOr<std::unique_ptr<Literal>> TransferManager::TransferArrayFromDevice(
perftools::gputools::StreamExecutor* executor, const Shape& shape,
const perftools::gputools::DeviceMemoryBase& source) {
se::StreamExecutor* executor, const Shape& shape,
const se::DeviceMemoryBase& source) {
TF_RET_CHECK(ShapeUtil::Equal(HostShapeToDeviceShape(shape), shape))
<< "Shape " << ShapeUtil::HumanString(shape)
<< " has a differently shaped representation on-device: "
@ -112,8 +108,7 @@ StatusOr<std::unique_ptr<Literal>> TransferManager::TransferArrayFromDevice(
}
Status TransferManager::WriteTupleIndexTables(
perftools::gputools::StreamExecutor* executor,
const ShapedBuffer& device_buffer) {
se::StreamExecutor* executor, const ShapedBuffer& device_buffer) {
VLOG(2) << "Writing tuple index tables for " << device_buffer;
TF_RET_CHECK(executor->device_ordinal() == device_buffer.device_ordinal());
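
A hedged round-trip sketch built on the two array entry points above. Obtaining the TransferManager via GetForPlatform, and the pre-allocated destination buffer, are assumptions.

// Sketch: copy an array literal into a pre-allocated device buffer, then read
// it back. The shape must have the same on-host and on-device representation,
// which TransferArrayFromDevice checks.
StatusOr<std::unique_ptr<Literal>> RoundTripArray(
    se::StreamExecutor* executor, const Literal& literal,
    const se::DeviceMemoryBase& device_buffer) {
  TF_ASSIGN_OR_RETURN(TransferManager* transfer_manager,
                      TransferManager::GetForPlatform(executor->platform()));
  TF_RETURN_IF_ERROR(transfer_manager->TransferArrayToDevice(
      executor, literal, device_buffer));
  return transfer_manager->TransferArrayFromDevice(executor, literal.shape(),
                                                   device_buffer);
}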

Some files were not shown because too many files have changed in this diff.