From dd619e62dcbe36aada5e3279942459e140fa721a Mon Sep 17 00:00:00 2001
From: Peter Hawkins
Date: Wed, 3 Jun 2020 21:08:13 -0700
Subject: [PATCH] [XLA:CPU] Implement SizeOfGeneratedCodeInBytes() on CPU.
 [XLA:GPU] Account for global constants in implementation of
 SizeOfGeneratedCodeInBytes() on GPU. [XLA] Make SizeOfGeneratedCodeInBytes()
 const.

PiperOrigin-RevId: 314660261
Change-Id: I0b76adcbb2d75dfb432599b38065d510ef093b87
---
 .../xla/service/cpu/cpu_executable.cc         |  4 ++++
 .../compiler/xla/service/cpu/cpu_executable.h |  2 ++
 .../xla/service/cpu/simple_orc_jit.cc         |  1 +
 .../compiler/xla/service/cpu/simple_orc_jit.h |  5 ++++
 tensorflow/compiler/xla/service/executable.cc |  2 +-
 tensorflow/compiler/xla/service/executable.h  |  2 +-
 .../xla/service/gpu/gpu_executable.cc         | 12 ++++++++--
 .../compiler/xla/service/gpu/gpu_executable.h |  2 +-
 .../xla/tests/local_client_execute_test.cc    | 23 +++++++++++++++++++
 9 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index f031daecb1f..d9a328a326e 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -376,5 +376,9 @@ const InstructionValueSet& CpuExecutable::GetRootValueSet() const {
       module().entry_computation()->root_instruction());
 }
 
+int64 CpuExecutable::SizeOfGeneratedCodeInBytes() const {
+  return jit_->SizeOfGeneratedCodeInBytes();
+}
+
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 4ec688c1016..310e30e41f5 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -81,6 +81,8 @@ class CpuExecutable : public Executable {
 
   const BufferAssignment& buffer_assignment() const { return *assignment_; }
 
+  int64 SizeOfGeneratedCodeInBytes() const override;
+
  private:
   // Creates an array suitable for passing as the "buffer_table" argument to the
   // JIT compiled function pointer.
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index 4cc9e373b3e..c38ee486af3 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -163,6 +163,7 @@ void SimpleOrcJIT::NotifyObjectFinalized(
   uint64_t key = static_cast<uint64_t>(
       reinterpret_cast<uintptr_t>(object.getData().data()));
   gdb_jit_event_listener_->notifyObjectLoaded(key, object, object_info);
+  size_of_generated_code_in_bytes_ += object.getData().size();
 }
 
 void SimpleOrcJIT::NotifyObjectFreed(const llvm::object::ObjectFile& object) {
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
index 66333fb65c0..9c470edbac2 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
@@ -88,6 +88,10 @@ class SimpleOrcJIT {
       const llvm::TargetOptions& target_options,
       llvm::CodeGenOpt::Level opt_level);
 
+  int64 SizeOfGeneratedCodeInBytes() const {
+    return size_of_generated_code_in_bytes_;
+  }
+
  private:
   llvm::JITSymbol ResolveRuntimeSymbol(const std::string& name);
 
@@ -103,6 +107,7 @@ class SimpleOrcJIT {
   std::shared_ptr<llvm::orc::SymbolResolver> symbol_resolver_;
   ObjLayerT object_layer_;
   CompileLayerT compile_layer_;
+  int64 size_of_generated_code_in_bytes_ = 0;
 
   // Non owning pointer to a JIT event listener that registers the JIT events
   // with an attached GDB.
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index f41c4b77cd1..0ab4a223916 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -245,6 +245,6 @@ StatusOr Executable::ExecuteAsyncOnStreamWrapper(
   return return_value;
 }
 
-int64 Executable::SizeOfGeneratedCodeInBytes() { return -1; }
+int64 Executable::SizeOfGeneratedCodeInBytes() const { return -1; }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 5d7bd26b01e..e6b26b4fdae 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -318,7 +318,7 @@ class Executable {
   // not supported by the executable.
   //
   // Does not include the size of used libraries (e.g. cuDNN, Eigen, etc.).
-  virtual int64 SizeOfGeneratedCodeInBytes();
+  virtual int64 SizeOfGeneratedCodeInBytes() const;
 
   // Dumping helpers.
   void set_hlo_proto(std::unique_ptr<HloProto> hlo_proto) {
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index a240a9dc65e..bf65df20544 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -510,13 +510,21 @@ const InstructionValueSet& GpuExecutable::GetRootValueSet() const {
       module().entry_computation()->root_instruction());
 }
 
-int64 GpuExecutable::SizeOfGeneratedCodeInBytes() {
+int64 GpuExecutable::SizeOfGeneratedCodeInBytes() const {
   // Non-empty PTX but empty cubin: compilation must have failed, return
   // "unknown".
   if (binary().empty() && !text_.empty()) {
     return -1;
   }
-  return binary().size();
+  int64 size = binary().size();
+  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
+       ++i) {
+    const BufferAllocation& allocation = assignment_->GetAllocation(i);
+    if (allocation.is_constant()) {
+      size += allocation.size();
+    }
+  }
+  return size;
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index 66a4e605821..0da446c9739 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -61,7 +61,7 @@ class GpuExecutable : public Executable {
                 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
   ~GpuExecutable() override;
 
-  int64 SizeOfGeneratedCodeInBytes() override;
+  int64 SizeOfGeneratedCodeInBytes() const override;
 
   // This should be called after set_ir_module_string.
   const string& ir_module_string() const { return ir_module_string_; }
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index 7d804e18ed3..fab1a53611f 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -806,6 +806,29 @@ XLA_TEST_F(LocalClientExecuteTest, CompilePartitionedExecutable) {
   EXPECT_EQ(2, executables.size());
 }
 
+XLA_TEST_F(LocalClientExecuteTest,
+           DISABLED_ON_INTERPRETER(SizeOfGeneratedCodeInBytes)) {
+  XlaBuilder builder(TestName());
+  auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {}), "x");
+  constexpr int size = 100000;
+  TF_ASSERT_OK_AND_ASSIGN(auto literal,
+                          LiteralUtil::CreateRandomLiteral<F32>(
+                              ShapeUtil::MakeShape(F32, {size}), 0.0, 1.0));
+  auto y = ConstantLiteral(&builder, literal);
+  Add(x, y);
+
+  Shape argument_layout =
+      ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{}, {});
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto executables,
+      local_client_->Compile(builder.Build().ValueOrDie(), {&argument_layout},
+                             ExecutableBuildOptions()));
+  EXPECT_EQ(1, executables.size());
+  // The executable should be at least as large as the constant it contains.
+  EXPECT_GT(executables.front()->executable()->SizeOfGeneratedCodeInBytes(),
+            int64{sizeof(float) * size});
+}
+
 XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion) {
   // Test copying Literals to the device as ShapedBuffers, then copying them
   // back again to Literals.
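
For illustration only, and not part of the patch: a minimal sketch of how a caller might query the new accessor through the LocalClient API, mirroring the test added above. The helper name GeneratedCodeSize and the trivial scalar computation are assumptions; the Compile and SizeOfGeneratedCodeInBytes calls follow the usage shown in local_client_execute_test.cc.

// Sketch only: assumes an xla::LocalClient* obtained elsewhere (e.g. from
// ClientLibrary::LocalClientOrDie()); error handling is elided via ValueOrDie.
#include "tensorflow/compiler/xla/client/local_client.h"
#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/shape_util.h"

xla::int64 GeneratedCodeSize(xla::LocalClient* client) {
  // Build a trivial scalar computation: f(x) = x + x.
  xla::XlaBuilder builder("generated_code_size_example");
  auto x = xla::Parameter(&builder, 0,
                          xla::ShapeUtil::MakeShape(xla::F32, {}), "x");
  xla::Add(x, x);

  xla::Shape argument_layout =
      xla::ShapeUtil::MakeShapeWithLayout(xla::F32, /*dimensions=*/{}, {});
  auto executables =
      client
          ->Compile(builder.Build().ValueOrDie(), {&argument_layout},
                    xla::ExecutableBuildOptions())
          .ValueOrDie();

  // -1 means the backend does not report a size (the Executable base-class
  // default); with this patch, CPU reports the size of the JITed object code
  // and GPU reports the binary size plus constant allocations.
  return executables.front()->executable()->SizeOfGeneratedCodeInBytes();
}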