Added support for some GPUOperations in ComputeTask.

A ComputeTask can now be created from a GPUOperation.
Added a test of Add created from a GPUOperation in the Metal backend.

PiperOrigin-RevId: 351860267
Change-Id: I0e622f6efdee5f40043f293034b4d84298387e9d
Raman Sarokin authored on 2021-01-14 13:02:23 -08:00; committed by TensorFlower Gardener
parent c1336b952d
commit a9aee8f713
8 changed files with 255 additions and 8 deletions
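
Taken together, the change lets the Metal backend run a GPUOperation through ComputeTask directly, alongside the existing ComputeTaskDescriptor path. A minimal sketch of the new flow, mirroring the ExecuteGPUOperation helper added to test_util.mm below (error handling and includes elided; `operation` stands for an already-built std::unique_ptr<GPUOperation> whose src/dst tensors were attached with SetSrc/SetDst, and `device` is a MetalDevice):

  ComputeTask gpu_task;
  gpu_task.Init(std::move(operation));           // ComputeTask takes ownership of the GPUOperation
  RETURN_IF_ERROR(gpu_task.CompileOp(&device));  // assemble code, apply Metal macros, build the pipeline state
  RETURN_IF_ERROR(gpu_task.UpdateOpParams());    // bind tensor refs, compute grid and work group counts

  id<MTLCommandQueue> command_queue = [device.device() newCommandQueue];
  id<MTLCommandBuffer> command_buffer = [command_queue commandBuffer];
  id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];
  gpu_task.EncodeOpWithEncoder(encoder);         // dispatchThreadgroups with the computed sizes
  [encoder endEncoding];
  [command_buffer commit];
  [command_buffer waitUntilCompleted];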


@@ -38,6 +38,9 @@ namespace gpu {
namespace cl {
class ClOperation;
}
namespace metal {
class ComputeTask;
}
// kCustom: default value
// GPUOperation::GetGridSize must be overloaded
@@ -145,6 +148,7 @@ class GPUOperation {
protected:
friend class cl::ClOperation;
friend class metal::ComputeTask;
friend flatbuffers::Offset<tflite::gpu::data::GPUOperation> Encode(
const GPUOperation& op, flatbuffers::FlatBufferBuilder* builder);
friend absl::Status Decode(const tflite::gpu::data::GPUOperation* fb_op,


@@ -121,6 +121,7 @@ objc_library(
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:types",
"//tensorflow/lite/delegates/gpu/common:util",
"//tensorflow/lite/delegates/gpu/common/task:gpu_operation",
"@com_google_absl//absl/strings",
],
)


@@ -33,11 +33,43 @@ limitations under the License.
namespace tflite {
namespace gpu {
namespace metal {
namespace {
int3 GetWorkGroupsCount(int grid_dimension, const int3& grid_size,
const int3& work_group_size,
const int3& work_group_launch_order) {
int3 work_groups_count;
if (grid_dimension == 1) {
work_groups_count.x = DivideRoundUp(grid_size.x, work_group_size.x);
work_groups_count.y = 1;
work_groups_count.z = 1;
} else if (grid_dimension == 2) {
int3 wgs;
wgs.x = DivideRoundUp(grid_size.x, work_group_size.x);
wgs.y = DivideRoundUp(grid_size.y, work_group_size.y);
work_groups_count.x = wgs[work_group_launch_order[0]];
work_groups_count.y = wgs[work_group_launch_order[1]];
work_groups_count.z = 1;
} else { // grid_dimension == 3
int3 wgs;
wgs.x = DivideRoundUp(grid_size.x, work_group_size.x);
wgs.y = DivideRoundUp(grid_size.y, work_group_size.y);
wgs.z = DivideRoundUp(grid_size.z, work_group_size.z);
work_groups_count.x = wgs[work_group_launch_order[0]];
work_groups_count.y = wgs[work_group_launch_order[1]];
work_groups_count.z = wgs[work_group_launch_order[2]];
}
return work_groups_count;
}
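// Worked example: grid_size = (20, 10, 5) with work_group_size = (8, 8, 4)
// gives wgs = (3, 2, 2); with work_group_launch_order = (1, 0, 2) the
// dispatched work_groups_count becomes (wgs[1], wgs[0], wgs[2]) = (2, 3, 2).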
} // namespace
void ComputeTask::Init(std::unique_ptr<ComputeTaskDescriptor>&& task_desc) {
task_desc_ = std::move(task_desc);
}
void ComputeTask::Init(std::unique_ptr<GPUOperation>&& operation) {
operation_ = std::move(operation);
}
absl::Status ComputeTask::Compile(CalculationsPrecision precision,
MetalDevice* device) {
task_desc_->AssembleCode();
@@ -111,6 +143,71 @@ absl::Status ComputeTask::Compile(CalculationsPrecision precision,
return absl::OkStatus();
}
absl::Status ComputeTask::CompileOp(MetalDevice* device) {
operation_->AssembleCode(device->GetInfo());
const std::map<std::string, std::string> linkables = {
{operation_->dst_tensors_names_[0], operation_->elementwise_code_}};
RETURN_IF_ERROR(metal_args_.Init(linkables, device, &operation_->args_,
&operation_->code_));
operation_->args_.ReleaseCPURepresentation();
NSString* storageType;
NSString* accumulatorType;
NSString* toAccumulatorType = @"";
NSString* toAccumulatorType2 = @"";
NSString* toAccumulatorType3 = @"";
NSString* toAccumulatorType4 = @"";
if (operation_->definition_.precision == CalculationsPrecision::F32) {
storageType = @"float";
accumulatorType = @"float";
} else {
// FP16
storageType = @"half";
if (operation_->definition_.precision == CalculationsPrecision::F32_F16) {
accumulatorType = @"float";
toAccumulatorType = @"float";
toAccumulatorType2 = @"float2";
toAccumulatorType3 = @"float3";
toAccumulatorType4 = @"float4";
} else {
accumulatorType = @"half";
}
}
NSDictionary<NSString*, NSString*>* macros = @{
@"FLT" : storageType,
@"FLT2" : [NSString stringWithFormat:@"%@2", storageType],
@"FLT3" : [NSString stringWithFormat:@"%@3", storageType],
@"FLT4" : [NSString stringWithFormat:@"%@4", storageType],
@"ACCUM_FLT" : accumulatorType,
@"ACCUM_FLT2" : [NSString stringWithFormat:@"%@2", accumulatorType],
@"ACCUM_FLT3" : [NSString stringWithFormat:@"%@3", accumulatorType],
@"ACCUM_FLT4" : [NSString stringWithFormat:@"%@4", accumulatorType],
@"TO_ACCUM_TYPE" : toAccumulatorType,
@"TO_ACCUM2_TYPE" : toAccumulatorType2,
@"TO_ACCUM3_TYPE" : toAccumulatorType3,
@"TO_ACCUM4_TYPE" : toAccumulatorType4,
@"MAIN_FUNCTION" : @"\"kernel void ComputeFunction\"",
@"GLOBAL_ID_0" : @"static_cast<int>(reserved_gid.x)",
@"GLOBAL_ID_1" : @"static_cast<int>(reserved_gid.y)",
@"GLOBAL_ID_2" : @"static_cast<int>(reserved_gid.z)",
@"INIT_FLT(value)" : [NSString stringWithFormat:@"%@(value)", storageType],
@"INIT_FLT4(value)" :
[NSString stringWithFormat:@"%@4(value)", storageType],
};
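// Illustration of the substitution: in F32_F16 precision a kernel line such as
//   ACCUM_FLT4 sum = TO_ACCUM4_TYPE(src);
// is rewritten to
//   float4 sum = float4(src);
// while storage-typed FLT4 values stay half4.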
NSString* code =
[NSString stringWithCString:operation_->code_.c_str()
encoding:[NSString defaultCStringEncoding]];
id<MTLComputePipelineState> program;
RETURN_IF_ERROR(CreateComputeProgram(device->device(), code,
@"ComputeFunction", macros, &program));
if (!program) {
return absl::InternalError("Unknown shader compilation error");
}
program_ = program;
return absl::OkStatus();
}
absl::Status ComputeTask::UpdateParams(const GpuInfo& gpu_info,
const std::vector<BHWC>& src_shapes,
const std::vector<BHWC>& dst_shapes) {
@@ -137,6 +234,33 @@ absl::Status ComputeTask::UpdateParams(const GpuInfo& gpu_info,
return absl::OkStatus();
}
absl::Status ComputeTask::UpdateOpParams() {
for (int i = 0; i < operation_->src_tensors_names_.size(); ++i) {
const auto* metal_spatial_tensor =
dynamic_cast<const MetalSpatialTensor*>(operation_->src_[i]);
if (!metal_spatial_tensor) {
return absl::InvalidArgumentError("Expected MetalSpatialTensor.");
}
RETURN_IF_ERROR(metal_args_.SetObjectRef(operation_->src_tensors_names_[i],
*metal_spatial_tensor));
}
for (int i = 0; i < operation_->dst_tensors_names_.size(); ++i) {
const auto* metal_spatial_tensor =
dynamic_cast<const MetalSpatialTensor*>(operation_->dst_[i]);
if (!metal_spatial_tensor) {
return absl::InvalidArgumentError("Expected MetalSpatialTensor.");
}
RETURN_IF_ERROR(metal_args_.SetObjectRef(operation_->dst_tensors_names_[i],
*metal_spatial_tensor));
}
RETURN_IF_ERROR(operation_->BindArguments(&metal_args_));
operation_->grid_size_ = operation_->GetGridSize();
operation_->work_groups_count_ = GetWorkGroupsCount(
operation_->grid_dimension_, operation_->grid_size_,
operation_->work_group_size_, operation_->work_group_launch_order_);
return absl::OkStatus();
}
void ComputeTask::EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder) {
// The dispatch call is intended to be skipped.
if (groups_count_.x * groups_count_.y * groups_count_.z == 0) {
@@ -145,8 +269,7 @@ void ComputeTask::EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder) {
[encoder setComputePipelineState:program_];
-  int bindIndex = 0;
-  metal_args_.Encode(encoder, bindIndex);
+  metal_args_.Encode(encoder, 0);
MTLSize groupsCount =
MTLSizeMake(groups_count_.x, groups_count_.y, groups_count_.z);
@@ -155,6 +278,19 @@ void ComputeTask::EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder) {
[encoder dispatchThreadgroups:groupsCount threadsPerThreadgroup:groupsSize];
}
void ComputeTask::EncodeOpWithEncoder(id<MTLComputeCommandEncoder> encoder) {
[encoder setComputePipelineState:program_];
metal_args_.Encode(encoder, 0);
MTLSize groupsCount, groupsSize;
groupsCount.width = operation_->work_groups_count_.x;
groupsCount.height = operation_->work_groups_count_.y;
groupsCount.depth = operation_->work_groups_count_.z;
groupsSize.width = operation_->work_group_size_.x;
groupsSize.height = operation_->work_group_size_.y;
groupsSize.depth = operation_->work_group_size_.z;
[encoder dispatchThreadgroups:groupsCount threadsPerThreadgroup:groupsSize];
}
void ComputeTask::SetSrcTensor(const MetalSpatialTensor& tensor, int index) {
auto status =
metal_args_.SetObjectRef(task_desc_->src_tensors_names[index], tensor);


@@ -27,6 +27,7 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/common/precision.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/metal/common.h"
#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
#include "tensorflow/lite/delegates/gpu/metal/metal_arguments.h"
@@ -49,25 +50,35 @@ class ComputeTask {
void Init(std::unique_ptr<ComputeTaskDescriptor>&& task_desc);
void Init(std::unique_ptr<GPUOperation>&& operation);
ComputeTaskDescriptor& GetTaskDesc() { return *task_desc_; }
const ComputeTaskDescriptor& GetTaskDesc() const { return *task_desc_; }
/// Returns empty string or error if shader can't be compiled.
absl::Status Compile(CalculationsPrecision precision, MetalDevice* device);
absl::Status CompileOp(MetalDevice* device);
/// Updates parameters for inputs/outputs/intermediate tensors
absl::Status UpdateParams(const GpuInfo& gpu_info,
const std::vector<BHWC>& src_shapes,
const std::vector<BHWC>& dst_shapes);
// should be called after changes of inputs/outputs.
absl::Status UpdateOpParams();
void EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder);
void EncodeOpWithEncoder(id<MTLComputeCommandEncoder> encoder);
void SetSrcTensor(const MetalSpatialTensor& tensor, int index);
void SetDstTensor(const MetalSpatialTensor& tensor, int index);
private:
std::unique_ptr<ComputeTaskDescriptor> task_desc_;
std::unique_ptr<GPUOperation> operation_ = nullptr;
id<MTLComputePipelineState> program_;
MetalArguments metal_args_;
uint3 groups_size_;


@@ -64,6 +64,7 @@ objc_library(
deps = [
":add",
":test_util",
"//tensorflow/lite/delegates/gpu/common/tasks:add_test_util",
],
)
@@ -812,6 +813,8 @@ objc_library(
"//tensorflow/lite/delegates/gpu/common:tensor",
"//tensorflow/lite/delegates/gpu/common:types",
"//tensorflow/lite/delegates/gpu/common:util",
"//tensorflow/lite/delegates/gpu/common/task:gpu_operation",
"//tensorflow/lite/delegates/gpu/common/task:testing_util",
"//tensorflow/lite/delegates/gpu/metal:common",
"//tensorflow/lite/delegates/gpu/metal:inference_context",
"//tensorflow/lite/delegates/gpu/metal:metal_device",
@@ -897,6 +900,7 @@ objc_library(
"//tensorflow/lite/delegates/gpu/common:shape",
"//tensorflow/lite/delegates/gpu/common:types",
"//tensorflow/lite/delegates/gpu/common:util",
"//tensorflow/lite/delegates/gpu/common/tasks:add_test_util",
"//tensorflow/lite/delegates/gpu/metal:common",
"//tensorflow/lite/delegates/gpu/metal:inference_context",
"//tensorflow/lite/kernels/internal:quantization_util",


@@ -27,6 +27,7 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/test_util.h"
#include "tensorflow/lite/delegates/gpu/common/tasks/add_test_util.h"
using ::tflite::gpu::ElementwiseAttributes;
using ::tflite::gpu::BHWC;
@@ -123,4 +124,22 @@ using ::tflite::gpu::metal::SingleOpModel;
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
}
- (void)testAddTwoEqualTensors {
tflite::gpu::metal::MetalExecutionEnvironment exec_env_;
auto status = AddTwoEqualTensorsTest(&exec_env_);
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
}
- (void)testAddFirstTensorHasMoreChannelsThanSecond {
tflite::gpu::metal::MetalExecutionEnvironment exec_env_;
auto status = AddFirstTensorHasMoreChannelsThanSecondTest(&exec_env_);
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
}
- (void)testAddFirstTensorHasLessChannelsThanSecond {
tflite::gpu::metal::MetalExecutionEnvironment exec_env_;
auto status = AddFirstTensorHasLessChannelsThanSecond(&exec_env_);
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
}
@end
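
The three test cases above delegate to the shared Add tests in common/tasks/add_test_util, which drive the new ExecuteGPUOperation overload end to end. Roughly, inside an absl::Status-returning helper taking a TestExecutionEnvironment* env, AddTwoEqualTensorsTest does something like the sketch below; the exact tensor shapes, values, and the CreateAdd signature are recalled from the common Add task and may differ slightly:

  TensorFloat32 src;
  src.shape = BHWC(1, 2, 2, 2);
  src.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
  OperationDef op_def;
  op_def.precision = CalculationsPrecision::F32;
  op_def.src_tensors.push_back({DataType::FLOAT32, TensorStorageType::BUFFER, Layout::HWC});
  op_def.src_tensors.push_back({DataType::FLOAT32, TensorStorageType::BUFFER, Layout::HWC});
  op_def.dst_tensors.push_back({DataType::FLOAT32, TensorStorageType::BUFFER, Layout::HWC});
  GPUOperation operation = CreateAdd(op_def, /*channels=*/{2, 2}, /*dst_channels=*/2);
  TensorFloat32 dst;
  RETURN_IF_ERROR(env->ExecuteGPUOperation(
      {src, src}, absl::make_unique<GPUOperation>(std::move(operation)),
      /*dst_sizes=*/{BHWC(1, 2, 2, 2)}, /*dst_cpu=*/{&dst}));
  // dst.data should now hold the element-wise sum, i.e. each input value doubled.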


@@ -184,6 +184,72 @@ MetalExecutionEnvironment::GetSupportedStoragesWithHWZeroClampSupport() const {
return {};
}
absl::Status MetalExecutionEnvironment::ExecuteGPUOperation(
const std::vector<TensorFloat32>& src_cpu,
std::unique_ptr<GPUOperation>&& operation,
const std::vector<BHWC>& dst_sizes,
const std::vector<TensorFloat32*>& dst_cpu) {
const OperationDef op_def = operation->GetDefinition();
std::vector<MetalSpatialTensor> src(src_cpu.size());
for (int i = 0; i < src_cpu.size(); ++i) {
auto src_shape = src_cpu[i].shape;
if (src_shape.b != 1 && !op_def.IsBatchSupported()) {
return absl::InvalidArgumentError(
"Layout doesn't have Batch dimension, but shape.b != 1");
}
RETURN_IF_ERROR(CreateTensor(device_.device(), src_shape,
op_def.src_tensors[i], &src[i]));
RETURN_IF_ERROR(src[i].WriteData(src_cpu[i]));
operation->SetSrc(&src[i], i);
}
std::vector<MetalSpatialTensor> dst(dst_cpu.size());
for (int i = 0; i < dst_cpu.size(); ++i) {
auto dst_shape = dst_sizes[i];
if (dst_shape.b != 1 && !op_def.IsBatchSupported()) {
return absl::InvalidArgumentError(
"Layout doesn't have Batch dimension, but shape.b != 1");
}
RETURN_IF_ERROR(CreateTensor(device_.device(), dst_shape,
op_def.dst_tensors[i], &dst[i]));
operation->SetDst(&dst[i], i);
}
std::vector<BHWC> src_shapes;
std::vector<BHWC> dst_shapes;
std::vector<ValueId> src_ids;
std::vector<ValueId> dst_ids;
for (int i = 0; i < src_cpu.size(); ++i) {
src_ids.push_back(i);
src_shapes.push_back(src_cpu[i].shape);
}
for (int i = 0; i < dst_cpu.size(); ++i) {
dst_ids.push_back(src_cpu.size() + i);
dst_shapes.push_back(dst_sizes[i]);
}
ComputeTask gpu_task;
gpu_task.Init(std::move(operation));
RETURN_IF_ERROR(gpu_task.CompileOp(&device_));
RETURN_IF_ERROR(gpu_task.UpdateOpParams());
id<MTLCommandQueue> command_queue = [device_.device() newCommandQueue];
id<MTLCommandBuffer> command_buffer = [command_queue commandBuffer];
id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];
gpu_task.EncodeOpWithEncoder(encoder);
[encoder endEncoding];
[command_buffer commit];
[command_buffer waitUntilCompleted];
for (int i = 0; i < dst_cpu.size(); ++i) {
dst_cpu[i]->shape = dst_sizes[i];
dst_cpu[i]->data = std::vector<float>(dst_sizes[i].DimensionsProduct(), 0);
RETURN_IF_ERROR(dst[i].ReadData(dst_cpu[i]));
}
return absl::OkStatus();
}
absl::Status MetalExecutionEnvironment::ExecuteGPUOperation(
const std::vector<TensorFloat32>& src_cpu,
std::unique_ptr<ComputeTaskDescriptor>&& operation,


@@ -24,6 +24,8 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/task/testing_util.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
#include "tensorflow/lite/delegates/gpu/metal/inference_context.h"
@@ -61,20 +63,24 @@ class SingleOpModel {
absl::Status CompareVectors(const std::vector<float>& reference,
const std::vector<float>& output, float max_error);
-class MetalExecutionEnvironment {
+class MetalExecutionEnvironment : public TestExecutionEnvironment {
 public:
  MetalExecutionEnvironment() = default;
  ~MetalExecutionEnvironment() = default;
-  std::vector<CalculationsPrecision> GetSupportedPrecisions() const;
-  std::vector<TensorStorageType> GetSupportedStorages() const;
+  std::vector<CalculationsPrecision> GetSupportedPrecisions() const override;
+  std::vector<TensorStorageType> GetSupportedStorages() const override;
  // returns storage types that support zero clamping when reading OOB in HW
  // (Height/Width) dimensions.
  std::vector<TensorStorageType> GetSupportedStoragesWithHWZeroClampSupport()
-      const;
+      const override;
const GpuInfo& GetGpuInfo() const { return device_.GetInfo(); }
absl::Status ExecuteGPUOperation(
const std::vector<TensorFloat32>& src_cpu,
std::unique_ptr<GPUOperation>&& operation,
const std::vector<BHWC>& dst_sizes,
const std::vector<TensorFloat32*>& dst_cpu) override;
absl::Status ExecuteGPUOperation(
const std::vector<TensorFloat32>& src_cpu,
std::unique_ptr<ComputeTaskDescriptor>&& operation,