[XLA/GPU] Remove uses of BufferAssignment in GpuExecutable.

PiperOrigin-RevId: 348539166
Change-Id: I97aa232cd81837ffcfd10e24fd8f8b3dfb7953fe
Tim Shen 2020-12-21 15:37:48 -08:00 committed by TensorFlower Gardener
parent 722d7601e9
commit 63b8cdcb82
12 changed files with 58 additions and 47 deletions


@@ -358,6 +358,14 @@ class BufferAssignment {
     return allocations_;
   }
 
+  // This is similar to copying Allocations(), but since it's moved out, it
+  // preserves the addresses. Since BufferAllocation::Slice keeps a
+  // BufferAllocation*, and some backends keep BufferAllocation::Slice in
+  // xla::Executables, migrating off the use of addresses can be hard.
+  std::vector<BufferAllocation> ReleaseAllocations() {
+    return std::move(allocations_);
+  }
+
   // Returns the total size allocation holding all temporary buffers.
   int64 temp_allocation_total_size() const {
     return temp_allocation_total_size_;

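The comment on ReleaseAllocations() above is the crux of the change: BufferAllocation::Slice keeps a raw BufferAllocation*, so the allocations must keep their addresses when ownership moves from BufferAssignment into the executable. A minimal standalone sketch of that invariant follows; the Allocation and Slice types are illustrative stand-ins, not the XLA classes.

```cpp
#include <cassert>
#include <utility>
#include <vector>

struct Allocation { int index; };
// Like BufferAllocation::Slice: holds a raw pointer into someone else's vector.
struct Slice { const Allocation* allocation; };

int main() {
  std::vector<Allocation> assignment_owned = {{0}, {1}, {2}};
  Slice slice{&assignment_owned[1]};  // cached by an "executable"

  // Moving the vector transfers its heap storage, so element addresses
  // (and therefore the cached Slice pointer) stay valid. A copy would not
  // have this property: the copied elements live at new addresses.
  std::vector<Allocation> executable_owned = std::move(assignment_owned);
  assert(slice.allocation == &executable_owned[1]);
  return 0;
}
```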

@@ -672,8 +672,8 @@ cc_library(
         "gpu_debug_info_manager.h",
     ],
     deps = [
-        "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_proto_cc",
         "//tensorflow/compiler/xla/service:hlo_proto_util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -685,15 +685,10 @@ tf_cc_test(
     srcs = ["gpu_debug_info_manager_test.cc"],
     tags = tf_cuda_tests_tags(),
     deps = [
-        ":gpu_constants",
         ":gpu_debug_info_manager",
-        ":gpu_hlo_schedule",
-        ":stream_assignment",
-        "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:hlo_proto_cc",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
 )


@@ -34,13 +34,13 @@ namespace gpu {
 Status BufferAllocations::TearDown(
     const std::set<se::DeviceMemoryBase>& live_addresses,
-    const BufferAssignment* buffer_assignment) {
+    absl::Span<const BufferAllocation> allocations) {
   // Deallocate temporary buffers, taking care to try to deallocate all of them
   // even if one of the deallocations fails.
   Status status;
-  const int64 num_buffers = buffer_assignment->Allocations().size();
+  const int64 num_buffers = allocations.size();
   for (BufferAllocation::Index i = 0; i < num_buffers; ++i) {
-    const BufferAllocation& allocation = buffer_assignment->GetAllocation(i);
+    const BufferAllocation& allocation = allocations[i];
     se::DeviceMemoryBase buffer_address = GetDeviceAddress(allocation.index());
 
     // Deallocate buffers marked "maybe_live_out" but aren't actually live out,
     // and temp buffers.


@@ -70,7 +70,7 @@ class BufferAllocations {
   // Tears down all buffers allocated by this object that are not in
   // `live_addresses`.
   Status TearDown(const std::set<se::DeviceMemoryBase>& live_addresses,
-                  const BufferAssignment* buffer_assignment);
+                  absl::Span<const BufferAllocation> allocations);
 
   std::string ToString() {
     std::string out;

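TearDown() now takes an absl::Span<const BufferAllocation> rather than a BufferAssignment*. A point worth noting, which is a general Abseil property rather than part of this diff: a const span binds implicitly to a std::vector, so callers such as GpuExecutable can pass their allocations_ vector straight through. A hypothetical minimal example (CountLiveOut is made up for illustration):

```cpp
#include <vector>
#include "absl/types/span.h"

// Stand-in for the new TearDown-style signature: a read-only view, no copy.
int CountLiveOut(absl::Span<const int> allocations) {
  int count = 0;
  for (int a : allocations) {
    if (a > 0) ++count;  // pretend positive means "maybe live out"
  }
  return count;
}

int main() {
  std::vector<int> owned = {3, -1, 7};
  int live = CountLiveOut(owned);  // implicit vector -> Span conversion
  return live == 2 ? 0 : 1;
}
```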

@@ -869,13 +869,18 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
       absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>;
   TF_ASSIGN_OR_RETURN(OutputInfoMap output_info,
                       GetOutputInfo(*module, *buffer_assignment));
+  auto buffer_assignment_proto =
+      std::make_unique<BufferAssignmentProto>(buffer_assignment->ToProto());
+  std::vector<BufferAllocation> allocations =
+      buffer_assignment->ReleaseAllocations();
   GpuVersion gpu_version = GetGpuVersion(stream_exec);
   auto* gpu_executable = new GpuExecutable(
       {std::move(backend_result.first), std::move(backend_result.second),
        gpu_version, std::move(thunk_schedule), std::move(constants),
-       std::move(output_info), std::move(module), std::move(buffer_assignment),
-       std::move(profile_printer), std::move(profile_index_map)});
+       std::move(output_info), std::move(module), std::move(allocations),
+       std::move(buffer_assignment_proto), std::move(profile_printer),
+       std::move(profile_index_map)});
 
   if (embed_ir_in_executable) {
     DCHECK_NE("", ir_module_string_before_opt);
     gpu_executable->set_ir_module_string(ir_module_string_before_opt);


@@ -22,7 +22,7 @@ namespace gpu {
 
 void GpuDebugInfoManager::RegisterModule(
     const ModuleIdentifier& module_id, std::shared_ptr<HloModule> hlo_module,
-    std::shared_ptr<const BufferAssignment> buffer_assignment) {
+    std::shared_ptr<const BufferAssignmentProto> buffer_assignment) {
   tensorflow::mutex_lock lock(mutex_);
   if (active_modules_.find(module_id) != active_modules_.end()) {
     active_modules_[module_id].instances.emplace_back(hlo_module,
@@ -40,7 +40,7 @@ void GpuDebugInfoManager::RegisterModule(
 // However during tracing, we will defer the cleanup after serialization.
 void GpuDebugInfoManager::UnregisterModule(
     const ModuleIdentifier& module_id, std::shared_ptr<HloModule> hlo_module,
-    std::shared_ptr<const BufferAssignment> buffer_assignment) {
+    std::shared_ptr<const BufferAssignmentProto> buffer_assignment) {
   tensorflow::mutex_lock lock(mutex_);
   CHECK(active_modules_.find(module_id) != active_modules_.end());
   GpuModuleEntry& active_module = active_modules_[module_id];
@@ -146,8 +146,10 @@ void GpuDebugInfoManager::StopTracing(
       // non-nullptr. Due to the inconvenience of creation of buffer_assignment
       // object in test, we set it to nullptr and guard this for it.
       if (m.instances[0].hlo_module && m.instances[0].buffer_assignment) {
-        info.hlo_proto = absl::make_unique<HloProto>(MakeHloProto(
-            *m.instances[0].hlo_module, *m.instances[0].buffer_assignment));
+        info.hlo_proto = absl::make_unique<HloProto>(
+            MakeHloProto(*m.instances[0].hlo_module));
+        *info.hlo_proto->mutable_buffer_assignment() =
+            *m.instances[0].buffer_assignment;
       }
       module_debug_info->emplace_back(std::move(info));
     }


@@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_DEBUG_INFO_MANAGER_H_
 
 #include "absl/container/flat_hash_map.h"
-#include "tensorflow/compiler/xla/service/buffer_assignment.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -56,14 +56,14 @@ class GpuDebugInfoManager {
   // Modules with same module id can be registered and tracked separately.
   void RegisterModule(
       const ModuleIdentifier& module_id, std::shared_ptr<HloModule> hlo_module,
-      std::shared_ptr<const BufferAssignment> buffer_assignment);
+      std::shared_ptr<const BufferAssignmentProto> buffer_assignment);
 
   // Unregister an active module. When the last active module of the same
   // module id is out of scope, we remove it from our database.
   // However during tracing, we will defer the cleanup after serialization.
   void UnregisterModule(
       const ModuleIdentifier& module_id, std::shared_ptr<HloModule> hlo_module,
-      std::shared_ptr<const BufferAssignment> buffer_assignment);
+      std::shared_ptr<const BufferAssignmentProto> buffer_assignment);
 
   // Register when the module start execution on certain device.
   // TODO(jiesun): Do we need to track which device this is?
@@ -110,10 +110,10 @@ class GpuDebugInfoManager {
   // tracking, they need to be tracked separately.
   struct GpuModuleInstance {
     GpuModuleInstance(std::shared_ptr<HloModule> m,
-                      std::shared_ptr<const BufferAssignment> b)
+                      std::shared_ptr<const BufferAssignmentProto> b)
         : hlo_module(std::move(m)), buffer_assignment(std::move(b)) {}
 
     std::shared_ptr<HloModule> hlo_module;
-    std::shared_ptr<const BufferAssignment> buffer_assignment;
+    std::shared_ptr<const BufferAssignmentProto> buffer_assignment;
     bool active = true;
   };


@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager.h"
 
-#include "tensorflow/compiler/xla/service/buffer_assignment.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 
 namespace xla {
@@ -30,7 +30,7 @@ class GpuDebugInfoManagerTest : public HloTestBase {
     int unique_id;
     string id;
     std::shared_ptr<HloModule> module;
-    std::shared_ptr<BufferAssignment> buffer_assignment;
+    std::shared_ptr<BufferAssignmentProto> buffer_assignment;
   };
 
   // Return unique id of this module.


@@ -62,18 +62,19 @@ GpuExecutable::GpuExecutable(GpuExecutable::Params params)
       binary_(std::move(params.binary)),
       gpu_version_(params.gpu_version),
       thunk_schedule_(std::move(params.thunk_schedule)),
-      assignment_(std::move(params.assignment)),
+      allocations_(std::move(params.allocations)),
+      debug_buffer_assignment_(std::move(params.debug_buffer_assignment)),
       constants_(std::move(params.constants)),
       output_info_(std::move(params.output_info)) {
-  CHECK(has_module() && assignment_);
+  CHECK(has_module());
   GpuDebugInfoManager::Get()->RegisterModule(module().name(), shared_module(),
-                                             assignment_);
+                                             debug_buffer_assignment_);
 }
 
 GpuExecutable::~GpuExecutable() {
-  CHECK(has_module() && assignment_);
+  CHECK(has_module());
   GpuDebugInfoManager::Get()->UnregisterModule(module().name(), shared_module(),
-                                               assignment_);
+                                               debug_buffer_assignment_);
 
   {
     // We could have issued host->device mem copies in ResolveConstantGlobals.
@@ -376,11 +377,11 @@ StatusOr<BufferAllocations> GpuExecutable::GenerateBufferAllocations(
       [&] { return std::string("Build buffer allocations"); },
       tensorflow::profiler::TraceMeLevel::kInfo);
 
-  const int64 num_buffers = assignment_->Allocations().size();
+  const int64 num_buffers = allocations_.size();
   std::vector<se::DeviceMemoryBase> buffers;
   buffers.reserve(num_buffers);
   for (int64 i = 0; i < num_buffers; ++i) {
-    const BufferAllocation& allocation = assignment_->GetAllocation(i);
+    const BufferAllocation& allocation = allocations_[i];
     TF_ASSIGN_OR_RETURN(
         se::DeviceMemoryBase buffer,
         BufferForAllocation(arguments, globals, allocation, memory_allocator,
@@ -537,7 +538,7 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
 
   // Free all temporary allocations.
   TF_RETURN_IF_ERROR(
-      buffer_allocations.TearDown(buffers_in_result, assignment_.get()));
+      buffer_allocations.TearDown(buffers_in_result, allocations_));
 
   // Free allocations for arguments.
   MarkToBeReleasedArguments(absl::MakeSpan(arguments), result);
@@ -551,9 +552,8 @@ int64 GpuExecutable::SizeOfGeneratedCodeInBytes() const {
     return -1;
   }
   int64 size = binary().size();
-  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
-       ++i) {
-    const BufferAllocation& allocation = assignment_->GetAllocation(i);
+  for (BufferAllocation::Index i = 0; i < allocations_.size(); ++i) {
+    const BufferAllocation& allocation = allocations_[i];
     if (allocation.is_constant()) {
       size += allocation.size();
     }


@@ -71,7 +71,8 @@ class GpuExecutable : public Executable {
     std::vector<ConstantInfo> constants;
     absl::flat_hash_map<ShapeIndex, OutputInfo> output_info;
     std::unique_ptr<HloModule> hlo_module;
-    std::unique_ptr<const BufferAssignment> assignment;
+    std::vector<BufferAllocation> allocations;
+    std::unique_ptr<BufferAssignmentProto> debug_buffer_assignment;
    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data = nullptr;
    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map = nullptr;
   };
@@ -108,8 +109,8 @@ class GpuExecutable : public Executable {
       std::vector<ExecutionInput> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  std::shared_ptr<const BufferAssignment> GetBufferAssignment() const {
-    return assignment_;
+  absl::Span<const BufferAllocation> GetAllocations() const {
+    return allocations_;
   }
 
  private:
@@ -176,7 +177,9 @@ class GpuExecutable : public Executable {
 
   // Owns the buffer data at runtime. It provides information to allocate
   // memory for every output/temp buffers.
-  const std::shared_ptr<const BufferAssignment> assignment_;
+  const std::vector<BufferAllocation> allocations_;
+
+  std::shared_ptr<BufferAssignmentProto> debug_buffer_assignment_;
 
   // Cache of module handles and constant buffer allocation maps used by
   // `ResolveConstantGlobals`.


@@ -46,12 +46,9 @@ class GemmRewriteTest : public GpuCodegenTest {
             backend().default_stream_executor()->GetAllocator()));
     GpuExecutable* gpu_executable =
         static_cast<GpuExecutable*>(executable.get());
-    std::shared_ptr<const BufferAssignment> buffer_assignment =
-        gpu_executable->GetBufferAssignment();
-    CHECK_EQ(buffer_assignment->Allocations().size(),
-             expected_number_of_allocations)
-        << "Unexpected buffer assignment. Was:\n"
-        << buffer_assignment->ToString();
+    absl::Span<const BufferAllocation> allocations =
+        gpu_executable->GetAllocations();
+    CHECK_EQ(allocations.size(), expected_number_of_allocations);
   }
 };


@@ -589,13 +589,14 @@ StatusOr<std::unique_ptr<Executable>> MlirCompilerImpl::RunBackend(
 
   TF_ASSIGN_OR_RETURN(auto output_info,
                       xla::gpu::GetOutputInfo(*module, *buffer_assignment));
+  std::vector<BufferAllocation> allocations =
+      buffer_assignment->ReleaseAllocations();
 
   // TODO(b/137624192): Add profiling support.
   return {absl::make_unique<GpuExecutable>(GpuExecutable::Params{
       std::move(ptx), std::move(cubin), GetGpuVersion(stream_exec),
       std::move(thunk_schedule), std::vector<GpuExecutable::ConstantInfo>(),
-      std::move(output_info), std::move(module),
-      std::move(buffer_assignment)})};
+      std::move(output_info), std::move(module), std::move(allocations)})};
 }
 
 StatusOr<std::vector<std::unique_ptr<Executable>>> MlirCompilerImpl::Compile(