From 6a7391ca0231d4ee5426595cec8ecf181f2fc6eb Mon Sep 17 00:00:00 2001
From: Peter Hawkins
Date: Wed, 7 Oct 2020 18:43:53 -0700
Subject: [PATCH] [TPU] Don't pass host_shapes across the TPU API boundary.
 They can be computed from device shapes.

PiperOrigin-RevId: 335996516
Change-Id: I702ff81ce311042399ef87bddb5e0d68b7464331
---
 tensorflow/compiler/xla/service/executable.h               | 3 +++
 tensorflow/core/tpu/tpu_on_demand_compiler.cc              | 2 --
 tensorflow/stream_executor/tpu/c_api_conversions.cc        | 5 +----
 tensorflow/stream_executor/tpu/c_api_decl.h                | 2 --
 tensorflow/stream_executor/tpu/tpu_executable_interface.cc | 3 +--
 5 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 9216e5de85d..55d11a79011 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -153,6 +153,9 @@ class ExecutionOutput {
                   std::vector<se::OwningDeviceMemory> to_be_released)
       : result_(std::move(result)),
         to_be_released_(std::move(to_be_released)) {}
+  ExecutionOutput(Shape on_device_shape, se::DeviceMemoryAllocator* allocator,
+                  int device_ordinal)
+      : result_(std::move(on_device_shape), allocator, device_ordinal) {}
   ExecutionOutput(Shape on_host_shape, Shape on_device_shape,
                   se::DeviceMemoryAllocator* allocator, int device_ordinal)
       : result_(std::move(on_host_shape), std::move(on_device_shape), allocator,
diff --git a/tensorflow/core/tpu/tpu_on_demand_compiler.cc b/tensorflow/core/tpu/tpu_on_demand_compiler.cc
index 08153f62063..01ea9f5848a 100644
--- a/tensorflow/core/tpu/tpu_on_demand_compiler.cc
+++ b/tensorflow/core/tpu/tpu_on_demand_compiler.cc
@@ -119,7 +119,6 @@ class TpuExecutable : public TpuExecutableInterface {
     }
 
     ApiConverter::ToC(arg.shape(), &se_args[i]->dynamic_shape);
-    ApiConverter::ToC(arg.host_shape(), &se_args[i]->host_shape);
     const auto& unowned_indices = arg.unowned_indices();
     se_args[i]->unowned_indices_size = unowned_indices.size();
     se_args[i]->unowned_indices = new XLA_ShapeIndex[unowned_indices.size()];
@@ -142,7 +141,6 @@ class TpuExecutable : public TpuExecutableInterface {
   for (int i = 0; i < arguments.size(); ++i) {
     ApiConverter::Free(&se_args[i]->shape_tree.shape);
     ApiConverter::Free(&se_args[i]->dynamic_shape);
-    ApiConverter::Free(&se_args[i]->host_shape);
     delete[] se_args[i]->unowned_indices;
     delete[] se_args[i]->shape_tree.buffers;
     delete se_args[i];
diff --git a/tensorflow/stream_executor/tpu/c_api_conversions.cc b/tensorflow/stream_executor/tpu/c_api_conversions.cc
index 674a1fdfb68..0a7801f45fc 100644
--- a/tensorflow/stream_executor/tpu/c_api_conversions.cc
+++ b/tensorflow/stream_executor/tpu/c_api_conversions.cc
@@ -23,7 +23,6 @@ limitations under the License.
 namespace ApiConverter {
 
 xla::ShapedBuffer FromC(XLA_ShapedBuffer* c_buffer) {
-  xla::Shape xla_on_host_shape = ApiConverter::FromC(&c_buffer->on_host_shape);
   xla::Shape xla_on_device_shape =
       ApiConverter::FromC(&c_buffer->on_device_shape);
 
@@ -36,7 +35,7 @@ xla::ShapedBuffer FromC(XLA_ShapedBuffer* c_buffer) {
   }
 
   xla::ShapedBuffer xla_shaped_buffer(
-      xla_on_host_shape, xla_on_device_shape,
+      xla_on_device_shape,
       tensorflow::tpu::TpuPlatformInterface::GetRegisteredPlatform(),
       c_buffer->device_ordinal);
   xla_shaped_buffer.set_buffers(xla_shape_tree);
@@ -199,7 +198,6 @@ xla::MutableBorrowingLiteral FromC(XLA_Literal* c_literal) {
 }
 
 void ToC(const xla::ShapedBuffer& buffer, XLA_ShapedBuffer* c_device_buffer) {
-  ApiConverter::ToC(buffer.on_host_shape(), &c_device_buffer->on_host_shape);
   ApiConverter::ToC(buffer.on_device_shape(),
                     &c_device_buffer->on_device_shape);
   c_device_buffer->device_ordinal = buffer.device_ordinal();
@@ -226,7 +224,6 @@ void Free(XLA_Literal* c_literal) {
 
 void Free(XLA_ShapedBuffer* c_buffer) {
   ApiConverter::Free(&c_buffer->on_device_shape);
-  ApiConverter::Free(&c_buffer->on_host_shape);
   delete[] c_buffer->bases;
 }
 
diff --git a/tensorflow/stream_executor/tpu/c_api_decl.h b/tensorflow/stream_executor/tpu/c_api_decl.h
index 7953670dec7..dcb53823e0c 100644
--- a/tensorflow/stream_executor/tpu/c_api_decl.h
+++ b/tensorflow/stream_executor/tpu/c_api_decl.h
@@ -177,7 +177,6 @@ typedef struct XLA_Shape {
 
 // Represents a leaf node for a XLA shaped buffer.
 typedef struct XLA_ShapedBuffer {
-  XLA_Shape on_host_shape;
   XLA_Shape on_device_shape;
   int device_ordinal;
 
@@ -208,7 +207,6 @@ typedef struct SE_ExecutionInput {
   XLA_ShapeIndex* unowned_indices;
   int unowned_indices_size;
   XLA_Shape dynamic_shape;
-  XLA_Shape host_shape;
 } SE_ExecutionInput;
 
 typedef struct SE_ExecutionOutput {
diff --git a/tensorflow/stream_executor/tpu/tpu_executable_interface.cc b/tensorflow/stream_executor/tpu/tpu_executable_interface.cc
index af29f2e2b06..84ce8444420 100644
--- a/tensorflow/stream_executor/tpu/tpu_executable_interface.cc
+++ b/tensorflow/stream_executor/tpu/tpu_executable_interface.cc
@@ -90,8 +90,7 @@ TpuExecutableInterface::AllocateOutputMemoryWithInputReuse(
     }
   }
 
-  ExecutionOutput result(host_shape, std::move(device_shape), allocator,
-                         device_ordinal);
+  ExecutionOutput result(std::move(device_shape), allocator, device_ordinal);
   // Iterate through and allocate a buffer for each shape index, checking for
   // possible input buffer reuse.
   int64 reused_buffer_bytes = 0;
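
Note on the approach: the patch can stop passing host shapes across the C API
boundary because a host shape is derivable from the corresponding device
shape; the device shape only adds device-specific layout detail (such as
tiling and memory-space assignments) on top of the element type and
dimensions. Below is a minimal self-contained C++ sketch of that derivation.
The Layout and Shape structs and the DeviceShapeToHostShape function here are
hypothetical stand-ins for illustration, not XLA's real types; the actual
logic lives in XLA's shape machinery.

#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-ins for xla::Layout / xla::Shape (illustration only).
struct Layout {
  std::vector<int> minor_to_major;  // dimension ordering, kept on the host
  std::vector<int> tiles;           // device-only tiling information
  int memory_space;                 // device-only memory-space annotation
};

struct Shape {
  std::string element_type;
  std::vector<int> dimensions;
  Layout layout;
  std::vector<Shape> tuple_shapes;  // non-empty when this shape is a tuple
};

// Derives a host shape from a device shape by clearing the device-specific
// layout fields, recursing through tuple elements. Because this mapping is
// deterministic, carrying a separate host shape across the API is redundant;
// the patch above relies on exactly that.
Shape DeviceShapeToHostShape(Shape shape) {
  if (!shape.tuple_shapes.empty()) {
    for (Shape& element : shape.tuple_shapes) {
      element = DeviceShapeToHostShape(element);
    }
  } else {
    shape.layout.tiles.clear();
    shape.layout.memory_space = 0;  // reset to the default memory space
  }
  return shape;
}

int main() {
  // A tiled f32[128,256] device shape placed in memory space 1.
  Shape device_shape{"f32", {128, 256}, Layout{{1, 0}, {8, 128}, 1}, {}};
  Shape host_shape = DeviceShapeToHostShape(device_shape);
  std::cout << "device tiles: " << device_shape.layout.tiles.size()
            << ", host tiles: " << host_shape.layout.tiles.size() << "\n";
  return 0;
}

This is also why the diff can replace the two-shape ExecutionOutput and
ShapedBuffer constructors with single-shape ones: any caller that still needs
the host view can recompute it on demand rather than shipping both shapes
through every struct in the C API.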