From dd619e62dcbe36aada5e3279942459e140fa721a Mon Sep 17 00:00:00 2001
From: Peter Hawkins
Date: Wed, 3 Jun 2020 21:08:13 -0700
Subject: [PATCH] [XLA:CPU] Implement SizeOfGeneratedCodeInBytes() on CPU.
 [XLA:GPU] Account for global constants in implementation of
 SizeOfGeneratedCodeInBytes() on GPU. [XLA] Make SizeOfGeneratedCodeInBytes()
 const.

PiperOrigin-RevId: 314660261
Change-Id: I0b76adcbb2d75dfb432599b38065d510ef093b87
---
 .../xla/service/cpu/cpu_executable.cc         |  4 ++++
 .../compiler/xla/service/cpu/cpu_executable.h |  2 ++
 .../xla/service/cpu/simple_orc_jit.cc         |  1 +
 .../compiler/xla/service/cpu/simple_orc_jit.h |  5 ++++
 tensorflow/compiler/xla/service/executable.cc |  2 +-
 tensorflow/compiler/xla/service/executable.h  |  2 +-
 .../xla/service/gpu/gpu_executable.cc         | 12 ++++++++--
 .../compiler/xla/service/gpu/gpu_executable.h |  2 +-
 .../xla/tests/local_client_execute_test.cc    | 23 +++++++++++++++++++
 9 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index f031daecb1f..d9a328a326e 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -376,5 +376,9 @@ const InstructionValueSet& CpuExecutable::GetRootValueSet() const {
       module().entry_computation()->root_instruction());
 }
 
+int64 CpuExecutable::SizeOfGeneratedCodeInBytes() const {
+  return jit_->SizeOfGeneratedCodeInBytes();
+}
+
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 4ec688c1016..310e30e41f5 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -81,6 +81,8 @@ class CpuExecutable : public Executable {
 
   const BufferAssignment& buffer_assignment() const { return *assignment_; }
 
+  int64 SizeOfGeneratedCodeInBytes() const override;
+
  private:
   // Creates an array suitable for passing as the "buffer_table" argument to the
   // JIT compiled function pointer.
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index 4cc9e373b3e..c38ee486af3 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -163,6 +163,7 @@ void SimpleOrcJIT::NotifyObjectFinalized(
   uint64_t key = static_cast<uint64_t>(
       reinterpret_cast<uintptr_t>(object.getData().data()));
   gdb_jit_event_listener_->notifyObjectLoaded(key, object, object_info);
+  size_of_generated_code_in_bytes_ += object.getData().size();
 }
 
 void SimpleOrcJIT::NotifyObjectFreed(const llvm::object::ObjectFile& object) {
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
index 66333fb65c0..9c470edbac2 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
@@ -88,6 +88,10 @@ class SimpleOrcJIT {
       const llvm::TargetOptions& target_options,
       llvm::CodeGenOpt::Level opt_level);
 
+  int64 SizeOfGeneratedCodeInBytes() const {
+    return size_of_generated_code_in_bytes_;
+  }
+
  private:
   llvm::JITSymbol ResolveRuntimeSymbol(const std::string& name);
 
@@ -103,6 +107,7 @@ class SimpleOrcJIT {
   std::shared_ptr<llvm::orc::SymbolResolver> symbol_resolver_;
   ObjLayerT object_layer_;
   CompileLayerT compile_layer_;
+  int64 size_of_generated_code_in_bytes_ = 0;
 
   // Non owning pointer to a JIT event listener that registers the JIT events
   // with an attached GDB.
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index f41c4b77cd1..0ab4a223916 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -245,6 +245,6 @@ StatusOr Executable::ExecuteAsyncOnStreamWrapper(
   return return_value;
 }
 
-int64 Executable::SizeOfGeneratedCodeInBytes() { return -1; }
+int64 Executable::SizeOfGeneratedCodeInBytes() const { return -1; }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 5d7bd26b01e..e6b26b4fdae 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -318,7 +318,7 @@ class Executable {
   // not supported by the executable.
   //
   // Does not include the size of used libraries (e.g. cuDNN, Eigen, etc.).
-  virtual int64 SizeOfGeneratedCodeInBytes();
+  virtual int64 SizeOfGeneratedCodeInBytes() const;
 
   // Dumping helpers.
   void set_hlo_proto(std::unique_ptr<HloProto> hlo_proto) {
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index a240a9dc65e..bf65df20544 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -510,13 +510,21 @@ const InstructionValueSet& GpuExecutable::GetRootValueSet() const {
       module().entry_computation()->root_instruction());
 }
 
-int64 GpuExecutable::SizeOfGeneratedCodeInBytes() {
+int64 GpuExecutable::SizeOfGeneratedCodeInBytes() const {
   // Non-empty PTX but empty cubin: compilation must have failed, return
   // "unknown".
   if (binary().empty() && !text_.empty()) {
     return -1;
   }
-  return binary().size();
+  int64 size = binary().size();
+  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
+       ++i) {
+    const BufferAllocation& allocation = assignment_->GetAllocation(i);
+    if (allocation.is_constant()) {
+      size += allocation.size();
+    }
+  }
+  return size;
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index 66a4e605821..0da446c9739 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -61,7 +61,7 @@ class GpuExecutable : public Executable {
                 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
   ~GpuExecutable() override;
 
-  int64 SizeOfGeneratedCodeInBytes() override;
+  int64 SizeOfGeneratedCodeInBytes() const override;
 
   // This should be called after set_ir_module_string.
   const string& ir_module_string() const { return ir_module_string_; }
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index 7d804e18ed3..fab1a53611f 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -806,6 +806,29 @@ XLA_TEST_F(LocalClientExecuteTest, CompilePartitionedExecutable) {
   EXPECT_EQ(2, executables.size());
 }
 
+XLA_TEST_F(LocalClientExecuteTest,
+           DISABLED_ON_INTERPRETER(SizeOfGeneratedCodeInBytes)) {
+  XlaBuilder builder(TestName());
+  auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {}), "x");
+  constexpr int size = 100000;
+  TF_ASSERT_OK_AND_ASSIGN(auto literal,
+                          LiteralUtil::CreateRandomLiteral<F32>(
+                              ShapeUtil::MakeShape(F32, {size}), 0.0, 1.0));
+  auto y = ConstantLiteral(&builder, literal);
+  Add(x, y);
+
+  Shape argument_layout =
+      ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{}, {});
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto executables,
+      local_client_->Compile(builder.Build().ValueOrDie(), {&argument_layout},
+                             ExecutableBuildOptions()));
+  EXPECT_EQ(1, executables.size());
+  // The executable should be at least as large as the constant it contains.
+  EXPECT_GT(executables.front()->executable()->SizeOfGeneratedCodeInBytes(),
+            int64{sizeof(float) * size});
+}
+
 XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion) {
   // Test copying Literals to the device as ShapedBuffers, then copying them
   // back again to Literals.
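
For illustration only, and not part of the patch: a minimal sketch of how a caller might query the new accessor through the LocalClient API, mirroring the test added above. The helper name GeneratedCodeSize and the trivial scalar computation are assumptions; the Compile and SizeOfGeneratedCodeInBytes calls follow the usage shown in local_client_execute_test.cc.

// Sketch only: assumes an xla::LocalClient* obtained elsewhere (e.g. from
// ClientLibrary::LocalClientOrDie()); error handling is elided via ValueOrDie.
#include "tensorflow/compiler/xla/client/local_client.h"
#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/shape_util.h"

xla::int64 GeneratedCodeSize(xla::LocalClient* client) {
  // Build a trivial scalar computation: f(x) = x + x.
  xla::XlaBuilder builder("generated_code_size_example");
  auto x = xla::Parameter(&builder, 0,
                          xla::ShapeUtil::MakeShape(xla::F32, {}), "x");
  xla::Add(x, x);

  xla::Shape argument_layout =
      xla::ShapeUtil::MakeShapeWithLayout(xla::F32, /*dimensions=*/{}, {});
  auto executables =
      client
          ->Compile(builder.Build().ValueOrDie(), {&argument_layout},
                    xla::ExecutableBuildOptions())
          .ValueOrDie();

  // -1 means the backend does not report a size (the Executable base-class
  // default); with this patch, CPU reports the size of the JITed object code
  // and GPU reports the binary size plus constant allocations.
  return executables.front()->executable()->SizeOfGeneratedCodeInBytes();
}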