[XLA/GPU] Remove uses of BufferAssignment in GpuExecutable.
PiperOrigin-RevId: 348539166
Change-Id: I97aa232cd81837ffcfd10e24fd8f8b3dfb7953fe
@@ -358,6 +358,14 @@ class BufferAssignment {
     return allocations_;
   }
 
+  // This is similar to copying Allocations(), but since it's moved out, it
+  // preserves the addresses. Since BufferAllocation::Slice keeps a
+  // BufferAllocation*, and some backends keep BufferAllocation::Slice in
+  // xla::Executables, migrating off the use of addresses can be hard.
+  std::vector<BufferAllocation> ReleaseAllocations() {
+    return std::move(allocations_);
+  }
+
   // Returns the total size allocation holding all temporary buffers.
   int64 temp_allocation_total_size() const {
     return temp_allocation_total_size_;
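The comment in this hunk carries the key invariant: move-constructing a std::vector transfers ownership of the heap storage, so the elements never change address and any BufferAllocation* held by a BufferAllocation::Slice stays valid. A minimal standalone sketch of that property (hypothetical names, not XLA code):

    #include <cassert>
    #include <utility>
    #include <vector>

    int main() {
      std::vector<int> assignment_owned = {1, 2, 3};
      // Analogous to the BufferAllocation* that a BufferAllocation::Slice keeps.
      int* element = &assignment_owned[0];
      // Analogous to ReleaseAllocations(): the heap buffer moves, the elements stay put.
      std::vector<int> released = std::move(assignment_owned);
      assert(element == &released[0]);  // the address survived the move
      return 0;
    }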
@@ -672,8 +672,8 @@ cc_library(
         "gpu_debug_info_manager.h",
     ],
     deps = [
-        "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_proto_cc",
         "//tensorflow/compiler/xla/service:hlo_proto_util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -685,15 +685,10 @@ tf_cc_test(
     srcs = ["gpu_debug_info_manager_test.cc"],
     tags = tf_cuda_tests_tags(),
     deps = [
-        ":gpu_constants",
         ":gpu_debug_info_manager",
-        ":gpu_hlo_schedule",
-        ":stream_assignment",
-        "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:hlo_proto_cc",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
 )
@@ -34,13 +34,13 @@ namespace gpu {
 
 Status BufferAllocations::TearDown(
     const std::set<se::DeviceMemoryBase>& live_addresses,
-    const BufferAssignment* buffer_assignment) {
+    absl::Span<const BufferAllocation> allocations) {
   // Deallocate temporary buffers, taking care to try to deallocate all of them
   // even if one of the deallocations fails.
   Status status;
-  const int64 num_buffers = buffer_assignment->Allocations().size();
+  const int64 num_buffers = allocations.size();
   for (BufferAllocation::Index i = 0; i < num_buffers; ++i) {
-    const BufferAllocation& allocation = buffer_assignment->GetAllocation(i);
+    const BufferAllocation& allocation = allocations[i];
     se::DeviceMemoryBase buffer_address = GetDeviceAddress(allocation.index());
     // Deallocate buffers marked "maybe_live_out" but aren't actually live out,
     // and temp buffers.
@@ -70,7 +70,7 @@ class BufferAllocations {
   // Tears down all buffers allocated by this object that are not in
   // `live_addresses`.
   Status TearDown(const std::set<se::DeviceMemoryBase>& live_addresses,
-                  const BufferAssignment* buffer_assignment);
+                  absl::Span<const BufferAllocation> allocations);
 
   std::string ToString() {
     std::string out;
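Both sides of TearDown now traffic in absl::Span<const BufferAllocation>: a non-owning {pointer, length} view over contiguous storage. The callee no longer cares whether the allocations live in a BufferAssignment or a plain vector; the caller just has to keep the storage alive across the call. A hedged sketch of the call pattern with a stand-in type (not the real TearDown):

    #include <vector>
    #include "absl/types/span.h"

    struct FakeAllocation { int index = 0; };  // stand-in for BufferAllocation

    // The callee only needs a contiguous, read-only view.
    int CountAllocations(absl::Span<const FakeAllocation> allocations) {
      return static_cast<int>(allocations.size());
    }

    int main() {
      std::vector<FakeAllocation> owned(4);
      // A vector converts to a span implicitly; no copy, no ownership transfer.
      return CountAllocations(owned) == 4 ? 0 : 1;
    }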
@@ -869,13 +869,18 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
       absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>;
   TF_ASSIGN_OR_RETURN(OutputInfoMap output_info,
                       GetOutputInfo(*module, *buffer_assignment));
+  auto buffer_assignment_proto =
+      std::make_unique<BufferAssignmentProto>(buffer_assignment->ToProto());
+  std::vector<BufferAllocation> allocations =
+      buffer_assignment->ReleaseAllocations();
 
   GpuVersion gpu_version = GetGpuVersion(stream_exec);
   auto* gpu_executable = new GpuExecutable(
       {std::move(backend_result.first), std::move(backend_result.second),
        gpu_version, std::move(thunk_schedule), std::move(constants),
-       std::move(output_info), std::move(module), std::move(buffer_assignment),
-       std::move(profile_printer), std::move(profile_index_map)});
+       std::move(output_info), std::move(module), std::move(allocations),
+       std::move(buffer_assignment_proto), std::move(profile_printer),
+       std::move(profile_index_map)});
   if (embed_ir_in_executable) {
     DCHECK_NE("", ir_module_string_before_opt);
     gpu_executable->set_ir_module_string(ir_module_string_before_opt);
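The ordering inside this hunk matters: ToProto() snapshots the assignment for debugging before ReleaseAllocations() moves the allocation vector out of it; reversed, the proto would describe an already-emptied object. A condensed sketch of the hand-off using hypothetical simplified types:

    #include <utility>
    #include <vector>

    // Stand-in for BufferAssignment's serialize-then-release sequence.
    struct FakeAssignment {
      std::vector<int> allocations;
      std::vector<int> ToProtoLike() const { return allocations; }   // deep copy
      std::vector<int> Release() { return std::move(allocations); }  // moves out
    };

    int main() {
      FakeAssignment assignment{{10, 20, 30}};
      auto debug_snapshot = assignment.ToProtoLike();  // 1) snapshot first
      auto allocations = assignment.Release();         // 2) then move out
      return (debug_snapshot.size() == 3 && allocations.size() == 3) ? 0 : 1;
    }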
@@ -22,7 +22,7 @@ namespace gpu {
 
 void GpuDebugInfoManager::RegisterModule(
     const ModuleIdentifier& module_id, std::shared_ptr<HloModule> hlo_module,
-    std::shared_ptr<const BufferAssignment> buffer_assignment) {
+    std::shared_ptr<const BufferAssignmentProto> buffer_assignment) {
   tensorflow::mutex_lock lock(mutex_);
   if (active_modules_.find(module_id) != active_modules_.end()) {
     active_modules_[module_id].instances.emplace_back(hlo_module,
@@ -40,7 +40,7 @@ void GpuDebugInfoManager::RegisterModule(
 // However during tracing, we will defer the cleanup after serialization.
 void GpuDebugInfoManager::UnregisterModule(
     const ModuleIdentifier& module_id, std::shared_ptr<HloModule> hlo_module,
-    std::shared_ptr<const BufferAssignment> buffer_assignment) {
+    std::shared_ptr<const BufferAssignmentProto> buffer_assignment) {
   tensorflow::mutex_lock lock(mutex_);
   CHECK(active_modules_.find(module_id) != active_modules_.end());
   GpuModuleEntry& active_module = active_modules_[module_id];
@@ -146,8 +146,10 @@ void GpuDebugInfoManager::StopTracing(
     // non-nullptr. Due to the inconvenience of creation of buffer_assignment
     // object in test, we set it to nullptr and guard this for it.
     if (m.instances[0].hlo_module && m.instances[0].buffer_assignment) {
-      info.hlo_proto = absl::make_unique<HloProto>(MakeHloProto(
-          *m.instances[0].hlo_module, *m.instances[0].buffer_assignment));
+      info.hlo_proto = absl::make_unique<HloProto>(
+          MakeHloProto(*m.instances[0].hlo_module));
+      *info.hlo_proto->mutable_buffer_assignment() =
+          *m.instances[0].buffer_assignment;
     }
     module_debug_info->emplace_back(std::move(info));
   }
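Since the manager now stores a BufferAssignmentProto instead of a live BufferAssignment, StopTracing fills the HloProto's buffer_assignment field by copy-assigning the stored sub-message rather than re-serializing. A sketch of that composition pattern with hypothetical stand-ins for the generated protobuf classes:

    #include <memory>

    struct FakeBufferAssignmentProto { int allocation_count = 0; };

    struct FakeHloProto {
      FakeBufferAssignmentProto buffer_assignment;
      // Protobuf-style accessor: expose the sub-message for in-place mutation.
      FakeBufferAssignmentProto* mutable_buffer_assignment() {
        return &buffer_assignment;
      }
    };

    int main() {
      auto stored = std::make_shared<const FakeBufferAssignmentProto>(
          FakeBufferAssignmentProto{42});
      FakeHloProto hlo_proto;
      *hlo_proto.mutable_buffer_assignment() = *stored;  // plain copy, no re-serialization
      return hlo_proto.buffer_assignment.allocation_count == 42 ? 0 : 1;
    }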
@@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_DEBUG_INFO_MANAGER_H_
 
 #include "absl/container/flat_hash_map.h"
-#include "tensorflow/compiler/xla/service/buffer_assignment.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -56,14 +56,14 @@ class GpuDebugInfoManager {
   // Modules with same module id can be registered and tracked separately.
   void RegisterModule(
       const ModuleIdentifier& module_id, std::shared_ptr<HloModule> hlo_module,
-      std::shared_ptr<const BufferAssignment> buffer_assignment);
+      std::shared_ptr<const BufferAssignmentProto> buffer_assignment);
 
   // Unregister an active module. When the last active module of the same
   // module id is out of scope, we remove it from our database.
   // However during tracing, we will defer the cleanup after serialization.
   void UnregisterModule(
       const ModuleIdentifier& module_id, std::shared_ptr<HloModule> hlo_module,
-      std::shared_ptr<const BufferAssignment> buffer_assignment);
+      std::shared_ptr<const BufferAssignmentProto> buffer_assignment);
 
   // Register when the module start execution on certain device.
   // TODO(jiesun): Do we need to track which device this is?
@@ -110,10 +110,10 @@ class GpuDebugInfoManager {
   // tracking, they need to be tracked separately.
   struct GpuModuleInstance {
     GpuModuleInstance(std::shared_ptr<HloModule> m,
-                      std::shared_ptr<const BufferAssignment> b)
+                      std::shared_ptr<const BufferAssignmentProto> b)
         : hlo_module(std::move(m)), buffer_assignment(std::move(b)) {}
     std::shared_ptr<HloModule> hlo_module;
-    std::shared_ptr<const BufferAssignment> buffer_assignment;
+    std::shared_ptr<const BufferAssignmentProto> buffer_assignment;
     bool active = true;
   };
 
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager.h"
 
-#include "tensorflow/compiler/xla/service/buffer_assignment.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 
 namespace xla {
@@ -30,7 +30,7 @@ class GpuDebugInfoManagerTest : public HloTestBase {
     int unique_id;
     string id;
     std::shared_ptr<HloModule> module;
-    std::shared_ptr<BufferAssignment> buffer_assignment;
+    std::shared_ptr<BufferAssignmentProto> buffer_assignment;
   };
 
   // Return unique id of this module.
@@ -62,18 +62,19 @@ GpuExecutable::GpuExecutable(GpuExecutable::Params params)
       binary_(std::move(params.binary)),
       gpu_version_(params.gpu_version),
       thunk_schedule_(std::move(params.thunk_schedule)),
-      assignment_(std::move(params.assignment)),
+      allocations_(std::move(params.allocations)),
+      debug_buffer_assignment_(std::move(params.debug_buffer_assignment)),
       constants_(std::move(params.constants)),
       output_info_(std::move(params.output_info)) {
-  CHECK(has_module() && assignment_);
+  CHECK(has_module());
   GpuDebugInfoManager::Get()->RegisterModule(module().name(), shared_module(),
-                                             assignment_);
+                                             debug_buffer_assignment_);
 }
 
 GpuExecutable::~GpuExecutable() {
-  CHECK(has_module() && assignment_);
+  CHECK(has_module());
   GpuDebugInfoManager::Get()->UnregisterModule(module().name(), shared_module(),
-                                               assignment_);
+                                               debug_buffer_assignment_);
 
   {
     // We could have issued host->device mem copies in ResolveConstantGlobals.
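The weakened CHECKs fall out of the ownership change: allocations_ is now a by-value std::vector, which cannot be null, so the only invariant left to assert is the presence of a module. An illustrative sketch with hypothetical types:

    #include <cassert>
    #include <memory>
    #include <utility>
    #include <vector>

    struct FakeExecutable {
      std::vector<int> allocations;  // by value: always valid, possibly empty
      std::unique_ptr<int> module;   // still nullable, so still checked
      explicit FakeExecutable(std::unique_ptr<int> m) : module(std::move(m)) {
        assert(module != nullptr);   // the one check that remains
      }
    };

    int main() {
      FakeExecutable executable(std::make_unique<int>(7));
      return executable.allocations.empty() ? 0 : 1;  // empty but well-formed
    }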
@@ -376,11 +377,11 @@ StatusOr<BufferAllocations> GpuExecutable::GenerateBufferAllocations(
       [&] { return std::string("Build buffer allocations"); },
       tensorflow::profiler::TraceMeLevel::kInfo);
 
-  const int64 num_buffers = assignment_->Allocations().size();
+  const int64 num_buffers = allocations_.size();
   std::vector<se::DeviceMemoryBase> buffers;
   buffers.reserve(num_buffers);
   for (int64 i = 0; i < num_buffers; ++i) {
-    const BufferAllocation& allocation = assignment_->GetAllocation(i);
+    const BufferAllocation& allocation = allocations_[i];
     TF_ASSIGN_OR_RETURN(
         se::DeviceMemoryBase buffer,
         BufferForAllocation(arguments, globals, allocation, memory_allocator,
@@ -537,7 +538,7 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
 
   // Free all temporary allocations.
   TF_RETURN_IF_ERROR(
-      buffer_allocations.TearDown(buffers_in_result, assignment_.get()));
+      buffer_allocations.TearDown(buffers_in_result, allocations_));
 
   // Free allocations for arguments.
   MarkToBeReleasedArguments(absl::MakeSpan(arguments), result);
@@ -551,9 +552,8 @@ int64 GpuExecutable::SizeOfGeneratedCodeInBytes() const {
     return -1;
   }
   int64 size = binary().size();
-  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
-       ++i) {
-    const BufferAllocation& allocation = assignment_->GetAllocation(i);
+  for (BufferAllocation::Index i = 0; i < allocations_.size(); ++i) {
+    const BufferAllocation& allocation = allocations_[i];
     if (allocation.is_constant()) {
       size += allocation.size();
     }
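A side note on the two loops above: with the allocations in a plain vector, the index-based iteration could equally be a range-based for. A stylistic sketch under that assumption, not part of the commit:

    #include <cstdint>
    #include <vector>

    struct FakeAllocation {  // stand-in for BufferAllocation
      bool is_constant() const { return constant; }
      int64_t size() const { return bytes; }
      bool constant = false;
      int64_t bytes = 0;
    };

    int64_t SizeOfConstants(const std::vector<FakeAllocation>& allocations) {
      int64_t size = 0;
      for (const FakeAllocation& allocation : allocations) {
        if (allocation.is_constant()) size += allocation.size();
      }
      return size;
    }

    int main() {
      std::vector<FakeAllocation> allocations = {{true, 128}, {false, 64}};
      return SizeOfConstants(allocations) == 128 ? 0 : 1;
    }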
@@ -71,7 +71,8 @@ class GpuExecutable : public Executable {
     std::vector<ConstantInfo> constants;
     absl::flat_hash_map<ShapeIndex, OutputInfo> output_info;
     std::unique_ptr<HloModule> hlo_module;
-    std::unique_ptr<const BufferAssignment> assignment;
+    std::vector<BufferAllocation> allocations;
+    std::unique_ptr<BufferAssignmentProto> debug_buffer_assignment;
     std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data = nullptr;
     std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map = nullptr;
   };
@@ -108,8 +109,8 @@ class GpuExecutable : public Executable {
       std::vector<ExecutionInput> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  std::shared_ptr<const BufferAssignment> GetBufferAssignment() const {
-    return assignment_;
+  absl::Span<const BufferAllocation> GetAllocations() const {
+    return allocations_;
   }
 
  private:
@@ -176,7 +177,9 @@ class GpuExecutable : public Executable {
 
   // Owns the buffer data at runtime. It provides information to allocate
   // memory for every output/temp buffers.
-  const std::shared_ptr<const BufferAssignment> assignment_;
+  const std::vector<BufferAllocation> allocations_;
+
+  std::shared_ptr<BufferAssignmentProto> debug_buffer_assignment_;
 
   // Cache of module handles and constant buffer allocation maps used by
   // `ResolveConstantGlobals`.
@@ -46,12 +46,9 @@ class GemmRewriteTest : public GpuCodegenTest {
         backend().default_stream_executor()->GetAllocator()));
     GpuExecutable* gpu_executable =
         static_cast<GpuExecutable*>(executable.get());
-    std::shared_ptr<const BufferAssignment> buffer_assignment =
-        gpu_executable->GetBufferAssignment();
-    CHECK_EQ(buffer_assignment->Allocations().size(),
-             expected_number_of_allocations)
-        << "Unexpected buffer assignment. Was:\n"
-        << buffer_assignment->ToString();
+    absl::Span<const BufferAllocation> allocations =
+        gpu_executable->GetAllocations();
+    CHECK_EQ(allocations.size(), expected_number_of_allocations);
   }
 };
 
@@ -589,13 +589,14 @@ StatusOr<std::unique_ptr<Executable>> MlirCompilerImpl::RunBackend(
 
   TF_ASSIGN_OR_RETURN(auto output_info,
                       xla::gpu::GetOutputInfo(*module, *buffer_assignment));
+  std::vector<BufferAllocation> allocations =
+      buffer_assignment->ReleaseAllocations();
 
   // TODO(b/137624192): Add profiling support.
   return {absl::make_unique<GpuExecutable>(GpuExecutable::Params{
       std::move(ptx), std::move(cubin), GetGpuVersion(stream_exec),
       std::move(thunk_schedule), std::vector<GpuExecutable::ConstantInfo>(),
-      std::move(output_info), std::move(module),
-      std::move(buffer_assignment)})};
+      std::move(output_info), std::move(module), std::move(allocations)})};
 }
 
 StatusOr<std::vector<std::unique_ptr<Executable>>> MlirCompilerImpl::Compile(