From a2d2b5c3d5e0e2fef7cecc312127c71a3631c4b5 Mon Sep 17 00:00:00 2001
From: Raman Sarokin
Date: Thu, 10 Dec 2020 18:30:58 -0800
Subject: [PATCH] TFLComputeTask converted to c++ style ComputeTask.

PiperOrigin-RevId: 346908996
Change-Id: I8df1ee7b777a0f8b40580922b2fe91cb0d2983cd
---
 .../lite/delegates/gpu/metal/compute_task.h   |  77 +++++++--
 .../lite/delegates/gpu/metal/compute_task.mm  | 156 +++++++-----------
 .../delegates/gpu/metal/inference_context.mm  |  47 +++---
 3 files changed, 141 insertions(+), 139 deletions(-)

diff --git a/tensorflow/lite/delegates/gpu/metal/compute_task.h b/tensorflow/lite/delegates/gpu/metal/compute_task.h
index ccbb95b0271..aeb07324464 100644
--- a/tensorflow/lite/delegates/gpu/metal/compute_task.h
+++ b/tensorflow/lite/delegates/gpu/metal/compute_task.h
@@ -27,34 +27,77 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/common/precision.h"
 #include "tensorflow/lite/delegates/gpu/common/shape.h"
 #include "tensorflow/lite/delegates/gpu/common/status.h"
+#include "tensorflow/lite/delegates/gpu/metal/common.h"
 #include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
+#include "tensorflow/lite/delegates/gpu/metal/metal_arguments.h"
 #include "tensorflow/lite/delegates/gpu/metal/metal_spatial_tensor.h"
 
-@interface TFLComputeTask : NSObject
+namespace tflite {
+namespace gpu {
+namespace metal {
 
-/// Returns empty string or error if shader can't be compiled.
-- (absl::Status)compileWithDevice:(id<MTLDevice>)device
-                   taskDescriptor:(const tflite::gpu::metal::NodeDescriptor&)desc
-                        precision:(tflite::gpu::CalculationsPrecision)precision;
+class ComputeTask {
+ public:
+  ComputeTask() = default;
 
-/// Updates parameters for inputs/outputs/intermediate tensors
-- (absl::Status)updateParamsWithDevice:(id<MTLDevice>)device
-                          tensorShapes:(const std::map<tflite::gpu::ValueId, tflite::gpu::BHWC>&)
-                                           tensorShapes;
+  // Move only
+  ComputeTask(ComputeTask&& args) = default;
+  ComputeTask& operator=(ComputeTask&& args) = default;
+  ComputeTask(const ComputeTask&) = delete;
+  ComputeTask& operator=(const ComputeTask&) = delete;
 
-- (bool)hasInOutIds:(const std::set<::tflite::gpu::ValueId>&)ids;
+  /// Returns empty string or error if shader can't be compiled.
+  absl::Status CompileWithDevice(id<MTLDevice> device,
+                                 const NodeDescriptor& desc,
+                                 CalculationsPrecision precision);
 
-- (void)encodeWithEncoder:(id<MTLComputeCommandEncoder>)encoder;
+  /// Updates parameters for inputs/outputs/intermediate tensors
+  absl::Status UpdateParamsWithDevice(
+      id<MTLDevice> device, const std::map<ValueId, BHWC>& tensor_shapes);
 
-- (std::vector<tflite::gpu::ValueId>)getOutputIds;
-- (std::vector<tflite::gpu::ValueId>)getInputIds;
+  bool HasInOutIds(const std::set<ValueId>& ids) const;
 
-- (void)setSrcTensor:(const tflite::gpu::metal::MetalSpatialTensor&)tensor withIndex:(int)index;
+  void EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder);
 
-- (void)setDstTensor:(const tflite::gpu::metal::MetalSpatialTensor&)tensor withIndex:(int)index;
+  std::vector<ValueId> GetOutputIds() const;
+  std::vector<ValueId> GetInputIds() const;
 
-- (void)setDescription:(const std::string&)description;
+  void SetSrcTensor(const MetalSpatialTensor& tensor, int index);
 
-@end
+  void SetDstTensor(const MetalSpatialTensor& tensor, int index);
+
+  void SetDescription(const std::string& description);
+
+ private:
+  struct InputBuffer {
+    ValueId uid;
+    id<MTLBuffer> metal_handle;
+  };
+
+  struct OutputBuffer {
+    ValueId uid;
+    id<MTLBuffer> metal_handle;
+  };
+
+  struct UniformBuffer {
+    std::vector<uint8_t> data;
+    UniformsFunction data_function;
+  };
+
+  id<MTLComputePipelineState> program_;
+  std::vector<InputBuffer> input_buffers_;
+  std::vector<OutputBuffer> output_buffers_;
+  std::vector<id<MTLBuffer>> immutable_buffers_;
+  std::vector<UniformBuffer> uniform_buffers_;
+  uint3 groups_size_;
+  uint3 groups_count_;
+  DispatchParamsFunction resize_function_;
+  std::string description_;
+  MetalArguments metal_args_;
+};
+
+}  // namespace metal
+}  // namespace gpu
+}  // namespace tflite
 
 #endif  // TENSORFLOW_LITE_DELEGATES_GPU_METAL_COMPUTE_TASK_H_
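Why the class is move-only: ComputeTask owns Objective-C object handles (the id<MTLComputePipelineState> and id<MTLBuffer> members) plus a MetalArguments instance, and a copy would silently alias the same Metal objects from two tasks. Deleting the copy operations is also what lets inference_context.mm store tasks by value instead of holding TFLComputeTask pointers. A minimal usage sketch (illustrative only; `device`, `node_desc`, and `precision` stand in for caller-provided values, mirroring the inference_context.mm hunks further below):

    tflite::gpu::metal::ComputeTask task;
    RETURN_IF_ERROR(task.CompileWithDevice(device, node_desc, precision));
    task.SetDescription(node_desc.description);

    std::vector<tflite::gpu::metal::ComputeTask> tasks;
    tasks.emplace_back(std::move(task));  // must move: the copy ctor/assignment are deleted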
diff --git a/tensorflow/lite/delegates/gpu/metal/compute_task.mm b/tensorflow/lite/delegates/gpu/metal/compute_task.mm
index 7782c53ed3e..95b30832b58 100644
--- a/tensorflow/lite/delegates/gpu/metal/compute_task.mm
+++ b/tensorflow/lite/delegates/gpu/metal/compute_task.mm
@@ -27,55 +27,16 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/common/util.h"
 #include "tensorflow/lite/delegates/gpu/metal/common.h"
 
-using ::tflite::gpu::AlignByN;
-using ::tflite::gpu::BHWC;
-using ::tflite::gpu::HalfBits;
-using ::tflite::gpu::metal::ComputeTaskDescriptorPtr;
-using ::tflite::gpu::metal::CreateComputeProgram;
-using ::tflite::gpu::metal::DispatchParamsFunction;
-using ::tflite::gpu::CalculationsPrecision;
-using ::tflite::gpu::metal::UniformsFunction;
-using ::tflite::gpu::uint3;
-using ::tflite::gpu::ValueId;
+namespace tflite {
+namespace gpu {
+namespace metal {
 
-namespace {
-
-struct InputBuffer {
-  ValueId uid;
-  id<MTLBuffer> metalHandle;
-};
-
-struct OutputBuffer {
-  ValueId uid;
-  id<MTLBuffer> metalHandle;
-};
-
-struct UniformBuffer {
-  std::vector<uint8_t> data;
-  UniformsFunction dataFunction;
-};
-
-}  // namespace
-
-@implementation TFLComputeTask {
-  id<MTLComputePipelineState> _program;
-  std::vector<InputBuffer> _inputBuffers;
-  std::vector<OutputBuffer> _outputBuffers;
-  std::vector<id<MTLBuffer>> _immutableBuffers;
-  std::vector<UniformBuffer> _uniformBuffers;
-  uint3 _groupsSize;
-  uint3 _groupsCount;
-  DispatchParamsFunction _resizeFunction;
-  std::string _description;
-  tflite::gpu::metal::MetalArguments _metal_args;
-}
-
-- (absl::Status)compileWithDevice:(id<MTLDevice>)device
-                   taskDescriptor:(const tflite::gpu::metal::NodeDescriptor&)desc
-                        precision:(CalculationsPrecision)precision; {
+absl::Status ComputeTask::CompileWithDevice(id<MTLDevice> device,
+                                            const NodeDescriptor& desc,
+                                            CalculationsPrecision precision) {
   size_t offset = desc.task->src_tensors_names.size() + desc.task->uniform_buffers.size()
                   + desc.task->immutable_buffers.size() + 1;
-  RETURN_IF_ERROR(_metal_args.Init(device, offset, &desc.task->args, &desc.task->shader_source));
+  RETURN_IF_ERROR(metal_args_.Init(device, offset, &desc.task->args, &desc.task->shader_source));
   NSString* barrier;
   // simdgroup_barrier is supported on macOS 10.13+ and Metal shading language version 2.0
   if (@available(macOS 10.13, iOS 10.0, tvOS 10.0, *)) {
@@ -129,12 +90,12 @@ struct UniformBuffer {
     return absl::InternalError("Unknown shader compilation error");
   }
   for (auto& id : desc.src_tensors_ids) {
-    _inputBuffers.emplace_back(InputBuffer{id, nil});
+    input_buffers_.emplace_back(InputBuffer{id, nil});
   }
   for (auto& uniform : desc.task->uniform_buffers) {
-    _uniformBuffers.emplace_back(UniformBuffer{{}, uniform.data_function});
+    uniform_buffers_.emplace_back(UniformBuffer{{}, uniform.data_function});
   }
-  _outputBuffers.emplace_back(OutputBuffer{desc.dst_tensors_ids[0], nil});
+  output_buffers_.emplace_back(OutputBuffer{desc.dst_tensors_ids[0], nil});
   const bool f32_storage = precision == CalculationsPrecision::F32;
   for (auto& immutable : desc.task->immutable_buffers) {
     int padding = 4 * (f32_storage ? sizeof(float) : sizeof(HalfBits));
@@ -143,61 +104,60 @@ struct UniformBuffer {
     id<MTLBuffer> metalBuffer = [device newBufferWithBytes:immutable.data.data()
                                                     length:immutable.data.size()
                                                    options:MTLResourceStorageModeShared];
-    _immutableBuffers.emplace_back(metalBuffer);
+    immutable_buffers_.emplace_back(metalBuffer);
   }
-  _resizeFunction = desc.task->resize_function;
-  _program = program;
+  resize_function_ = desc.task->resize_function;
+  program_ = program;
   return absl::OkStatus();
 }
 
#include "tensorflow/lite/delegates/gpu/common/util.h" #include "tensorflow/lite/delegates/gpu/metal/common.h" -using ::tflite::gpu::AlignByN; -using ::tflite::gpu::BHWC; -using ::tflite::gpu::HalfBits; -using ::tflite::gpu::metal::ComputeTaskDescriptorPtr; -using ::tflite::gpu::metal::CreateComputeProgram; -using ::tflite::gpu::metal::DispatchParamsFunction; -using ::tflite::gpu::CalculationsPrecision; -using ::tflite::gpu::metal::UniformsFunction; -using ::tflite::gpu::uint3; -using ::tflite::gpu::ValueId; +namespace tflite { +namespace gpu { +namespace metal { -namespace { - -struct InputBuffer { - ValueId uid; - id metalHandle; -}; - -struct OutputBuffer { - ValueId uid; - id metalHandle; -}; - -struct UniformBuffer { - std::vector data; - UniformsFunction dataFunction; -}; - -} // namespace - -@implementation TFLComputeTask { - id _program; - std::vector _inputBuffers; - std::vector _outputBuffers; - std::vector> _immutableBuffers; - std::vector _uniformBuffers; - uint3 _groupsSize; - uint3 _groupsCount; - DispatchParamsFunction _resizeFunction; - std::string _description; - tflite::gpu::metal::MetalArguments _metal_args; -} - -- (absl::Status)compileWithDevice:(id)device - taskDescriptor:(const tflite::gpu::metal::NodeDescriptor&)desc - precision:(CalculationsPrecision)precision; { +absl::Status ComputeTask::CompileWithDevice(id device, + const NodeDescriptor& desc, + CalculationsPrecision precision) { size_t offset = desc.task->src_tensors_names.size() + desc.task->uniform_buffers.size() + desc.task->immutable_buffers.size() + 1; - RETURN_IF_ERROR(_metal_args.Init(device, offset, &desc.task->args, &desc.task->shader_source)); + RETURN_IF_ERROR(metal_args_.Init(device, offset, &desc.task->args, &desc.task->shader_source)); NSString* barrier; // simdgroup_barrier is supported on macOS 10.13+ and Metal shading language version 2.0 if (@available(macOS 10.13, iOS 10.0, tvOS 10.0, *)) { @@ -129,12 +90,12 @@ struct UniformBuffer { return absl::InternalError("Unknown shader compilation error"); } for (auto& id : desc.src_tensors_ids) { - _inputBuffers.emplace_back(InputBuffer{id, nil}); + input_buffers_.emplace_back(InputBuffer{id, nil}); } for (auto& uniform : desc.task->uniform_buffers) { - _uniformBuffers.emplace_back(UniformBuffer{{}, uniform.data_function}); + uniform_buffers_.emplace_back(UniformBuffer{{}, uniform.data_function}); } - _outputBuffers.emplace_back(OutputBuffer{desc.dst_tensors_ids[0], nil}); + output_buffers_.emplace_back(OutputBuffer{desc.dst_tensors_ids[0], nil}); const bool f32_storage = precision == CalculationsPrecision::F32; for (auto& immutable : desc.task->immutable_buffers) { int padding = 4 * (f32_storage ? 
@@ -205,66 +165,66 @@ struct UniformBuffer {
   return false;
 }
 
-- (void)encodeWithEncoder:(id<MTLComputeCommandEncoder>)encoder {
+void ComputeTask::EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder) {
   // The dispatch call is intended to be skipped.
-  if (_groupsCount.x * _groupsCount.y * _groupsCount.z == 0) {
+  if (groups_count_.x * groups_count_.y * groups_count_.z == 0) {
     return;
   }
-  [encoder setComputePipelineState:_program];
+  [encoder setComputePipelineState:program_];
 
   int bindIndex = 0;
-  for (const auto& buffer : _outputBuffers) {
-    [encoder setBuffer:buffer.metalHandle offset:0 atIndex:bindIndex];
+  for (const auto& buffer : output_buffers_) {
+    [encoder setBuffer:buffer.metal_handle offset:0 atIndex:bindIndex];
     bindIndex++;
   }
-  for (const auto& buffer : _inputBuffers) {
-    [encoder setBuffer:buffer.metalHandle offset:0 atIndex:bindIndex];
+  for (const auto& buffer : input_buffers_) {
+    [encoder setBuffer:buffer.metal_handle offset:0 atIndex:bindIndex];
    bindIndex++;
   }
-  for (auto& immutable : _immutableBuffers) {
+  for (auto& immutable : immutable_buffers_) {
     [encoder setBuffer:immutable offset:0 atIndex:bindIndex];
     bindIndex++;
   }
-  for (auto& uniform : _uniformBuffers) {
+  for (auto& uniform : uniform_buffers_) {
     [encoder setBytes:uniform.data.data() length:uniform.data.size() atIndex:bindIndex];
     bindIndex++;
   }
-  _metal_args.Encode(encoder, bindIndex);
+  metal_args_.Encode(encoder, bindIndex);
 
-  MTLSize groupsCount = MTLSizeMake(_groupsCount.x, _groupsCount.y, _groupsCount.z);
-  MTLSize groupsSize = MTLSizeMake(_groupsSize.x, _groupsSize.y, _groupsSize.z);
+  MTLSize groupsCount = MTLSizeMake(groups_count_.x, groups_count_.y, groups_count_.z);
+  MTLSize groupsSize = MTLSizeMake(groups_size_.x, groups_size_.y, groups_size_.z);
   [encoder dispatchThreadgroups:groupsCount threadsPerThreadgroup:groupsSize];
 }
 
-- (std::vector<ValueId>)getOutputIds {
+std::vector<ValueId> ComputeTask::GetOutputIds() const {
   std::vector<ValueId> result;
-  for (auto& buffer : _outputBuffers) {
+  for (auto& buffer : output_buffers_) {
     result.push_back(buffer.uid);
   }
   return result;
 }
 
-- (std::vector<ValueId>)getInputIds {
+std::vector<ValueId> ComputeTask::GetInputIds() const {
   std::vector<ValueId> result;
-  for (auto& buffer : _inputBuffers) {
+  for (auto& buffer : input_buffers_) {
     result.push_back(buffer.uid);
   }
   return result;
 }
 
-- (void)setSrcTensor:(const tflite::gpu::metal::MetalSpatialTensor&)tensor
-           withIndex:(int)index; {
-  _inputBuffers[index].metalHandle = tensor.GetBufferHandle();
+void ComputeTask::SetSrcTensor(const MetalSpatialTensor& tensor, int index) {
+  input_buffers_[index].metal_handle = tensor.GetBufferHandle();
 }
 
-- (void)setDstTensor:(const tflite::gpu::metal::MetalSpatialTensor&)tensor
-           withIndex:(int)index; {
-  _outputBuffers[index].metalHandle = tensor.GetBufferHandle();
+void ComputeTask::SetDstTensor(const MetalSpatialTensor& tensor, int index) {
  output_buffers_[index].metal_handle = tensor.GetBufferHandle();
 }
 
-- (void)setDescription:(const std::string&)description {
-  _description = description;
+void ComputeTask::SetDescription(const std::string& description) {
+  description_ = description;
 }
 
-@end
+}  // namespace metal
+}  // namespace gpu
+}  // namespace tflite
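Note on the binding contract: EncodeWithEncoder assigns consecutive bind indices in the order outputs, inputs, immutable buffers, uniforms, and then hands the next free index to MetalArguments. The `offset` computed at the top of CompileWithDevice (src tensors + uniform buffers + immutable buffers + 1) appears to be exactly that first free index, with the +1 accounting for the single output buffer, which is why it is passed to metal_args_.Init before the shader is compiled. A sketch of the expected per-run call order (illustrative; `task`, `device`, `encoder`, and `tensor_shapes` are assumed to be caller-provided):

    // After input shapes change, recompute uniforms and the dispatch grid first:
    RETURN_IF_ERROR(task.UpdateParamsWithDevice(device, tensor_shapes));
    // Then encode; a zero groups_count_ makes the dispatch a deliberate no-op:
    task.EncodeWithEncoder(encoder);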
diff --git a/tensorflow/lite/delegates/gpu/metal/inference_context.mm b/tensorflow/lite/delegates/gpu/metal/inference_context.mm
index a228813d0ca..dd9ef73c0d6 100644
--- a/tensorflow/lite/delegates/gpu/metal/inference_context.mm
+++ b/tensorflow/lite/delegates/gpu/metal/inference_context.mm
@@ -37,6 +37,7 @@ using ::tflite::gpu::DataType;
 using ::tflite::gpu::HalfBits;
 using ::tflite::gpu::int2;
 using ::tflite::gpu::MemoryStrategy;
+using ::tflite::gpu::metal::ComputeTask;
 using ::tflite::gpu::metal::ComputeTaskDescriptorPtr;
 using ::tflite::gpu::metal::MetalSpatialTensor;
 using ::tflite::gpu::TensorDescriptor;
@@ -60,7 +61,7 @@ void AddUsage(ValueId id, int task_index,
 }  // namespace
 
 @implementation TFLInferenceContext {
-  std::vector<TFLComputeTask*> _computeTasks;
+  std::vector<ComputeTask> _computeTasks;
   // contains indexes of _computeTasks
   std::vector<int> _taskIdsWithPreallocatedTensors;
   std::vector<ValueId> _inputIds;
@@ -85,17 +86,15 @@ void AddUsage(ValueId id, int task_index,
   _precision = precision;
   // Metal resources are created here.
   for (const auto& node : compiledModel.nodes) {
-    TFLComputeTask* task = [[TFLComputeTask alloc] init];
-    RETURN_IF_ERROR([task compileWithDevice:device
-                             taskDescriptor:node
-                                  precision:_precision]);
-    [task setDescription:node.description];
-    _computeTasks.emplace_back(task);
+    ComputeTask task;
+    RETURN_IF_ERROR(task.CompileWithDevice(device, node, _precision));
+    task.SetDescription(node.description);
+    _computeTasks.emplace_back(std::move(task));
   }
   _tensorShapes = compiledModel.tensor_shapes;
   for (auto& task : _computeTasks) {
     // The same device must be used here as well as on shader compilation stage.
-    RETURN_IF_ERROR([task updateParamsWithDevice:device tensorShapes:_tensorShapes]);
+    RETURN_IF_ERROR(task.UpdateParamsWithDevice(device, _tensorShapes));
   }
   RETURN_IF_ERROR([self allocateTensors:device]);
   return absl::OkStatus();
@@ -111,7 +110,7 @@ void AddUsage(ValueId id, int task_index,
   }
   for (int i = 0; i < _computeTasks.size(); ++i) {
     auto& task = _computeTasks[i];
-    if ([task hasInOutIds:preallocatedIds]) {
+    if (task.HasInOutIds(preallocatedIds)) {
       _taskIdsWithPreallocatedTensors.push_back(i);
     }
   }
@@ -144,15 +143,15 @@ void AddUsage(ValueId id, int task_index,
 
 - (void)bindTensorsToOperations {
   for (auto& task : _computeTasks) {
-    const auto& src_ids = [task getInputIds];
+    const auto& src_ids = task.GetInputIds();
     for (int i = 0; i < src_ids.size(); ++i) {
       MetalSpatialTensor* tensor = [self getTensor:src_ids[i]];
-      [task setSrcTensor:*tensor withIndex:i];
+      task.SetSrcTensor(*tensor, i);
     }
-    const auto& dst_ids = [task getOutputIds];
+    const auto& dst_ids = task.GetOutputIds();
     for (int i = 0; i < dst_ids.size(); ++i) {
       MetalSpatialTensor* tensor = [self getTensor:dst_ids[i]];
-      [task setDstTensor:*tensor withIndex:i];
+      task.SetDstTensor(*tensor, i);
     }
   }
 }
@@ -164,12 +163,12 @@ void AddUsage(ValueId id, int task_index,
     }
   }
   for (int op_index = 0; op_index < _computeTasks.size(); ++op_index) {
-    for (auto& tensor_id : [_computeTasks[op_index] getInputIds]) {
+    for (auto& tensor_id : _computeTasks[op_index].GetInputIds()) {
       if (_preallocatedTensors.find(tensor_id) == _preallocatedTensors.end()) {
         AddUsage(tensor_id, op_index, usages);
       }
     }
-    for (auto& tensor_id : [_computeTasks[op_index] getOutputIds]) {
+    for (auto& tensor_id : _computeTasks[op_index].GetOutputIds()) {
       if (_preallocatedTensors.find(tensor_id) == _preallocatedTensors.end()) {
         AddUsage(tensor_id, op_index, usages);
       }
@@ -239,8 +238,8 @@ void AddUsage(ValueId id, int task_index,
   descriptor.data_type = f32_storage ? DataType::FLOAT32 : DataType::FLOAT16;
   descriptor.layout = tflite::gpu::Layout::HWC;
   for (auto& task : _computeTasks) {
-    const std::vector<ValueId> input_ids = [task getInputIds];
-    const std::vector<ValueId> output_ids = [task getOutputIds];
+    const std::vector<ValueId> input_ids = task.GetInputIds();
+    const std::vector<ValueId> output_ids = task.GetOutputIds();
     std::vector<ValueId> all_ids = input_ids;
     all_ids.insert(all_ids.end(), output_ids.begin(), output_ids.end());
     for (auto& tensor_id : all_ids) {
@@ -263,7 +262,7 @@ void AddUsage(ValueId id, int task_index,
   [self updatePreallocatedTensors:inputOutputBuffers];
   for (int i = 0; i < _computeTasks.size(); ++i) {
     auto& task = _computeTasks[i];
-    [task encodeWithEncoder:commandEncoder];
+    task.EncodeWithEncoder(commandEncoder);
   }
 }
 
@@ -274,7 +273,7 @@ void AddUsage(ValueId id, int task_index,
   for (int i = 0; i < _computeTasks.size(); ++i) {
     id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
     auto& task = _computeTasks[i];
-    [task encodeWithEncoder:encoder];
+    task.EncodeWithEncoder(encoder);
     [encoder endEncoding];
   }
 }
@@ -288,7 +287,7 @@ void AddUsage(ValueId id, int task_index,
   for (int i = 0; i < _computeTasks.size(); ++i) {
     id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
     auto& task = _computeTasks[i];
-    [task encodeWithEncoder:encoder];
+    task.EncodeWithEncoder(encoder);
     [encoder endEncoding];
     if (i % flushPeriod == (flushPeriod - 1)) {
       [commandBuffer commit];
@@ -304,18 +303,18 @@ void AddUsage(ValueId id, int task_index,
   }
   for (auto& task_index : _taskIdsWithPreallocatedTensors) {
     auto& task = _computeTasks[task_index];
-    const auto& src_ids = [task getInputIds];
+    const auto& src_ids = task.GetInputIds();
     for (int i = 0; i < src_ids.size(); ++i) {
       const auto& it = _preallocatedTensors.find(src_ids[i]);
       if (it != _preallocatedTensors.end()) {
-        [task setSrcTensor:it->second withIndex:i];
+        task.SetSrcTensor(it->second, i);
       }
     }
-    const auto& dst_ids = [task getOutputIds];
+    const auto& dst_ids = task.GetOutputIds();
     for (int i = 0; i < dst_ids.size(); ++i) {
       const auto& it = _preallocatedTensors.find(dst_ids[i]);
       if (it != _preallocatedTensors.end()) {
-        [task setDstTensor:it->second withIndex:i];
+        task.SetDstTensor(it->second, i);
       }
     }
  }
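The three encode call sites above differ only in encoder granularity: one shared encoder for the whole graph, one encoder per task, and one encoder per task with a periodic commit. A sketch of the flush variant's overall shape (only the loop body is taken from the hunk above; the surrounding method signature and the command-buffer refresh after each commit are assumptions, not part of this patch):

    // Hypothetical driver loop: encode each task in its own encoder, commit in batches.
    for (int i = 0; i < _computeTasks.size(); ++i) {
      id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
      _computeTasks[i].EncodeWithEncoder(encoder);
      [encoder endEncoding];
      if (i % flushPeriod == (flushPeriod - 1)) {
        [commandBuffer commit];                        // flush this batch to the GPU
        commandBuffer = [commandQueue commandBuffer];  // assumed: start a fresh buffer
      }
    }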