From a2d2b5c3d5e0e2fef7cecc312127c71a3631c4b5 Mon Sep 17 00:00:00 2001
From: Raman Sarokin
Date: Thu, 10 Dec 2020 18:30:58 -0800
Subject: [PATCH] TFLComputeTask converted to c++ style ComputeTask.

PiperOrigin-RevId: 346908996
Change-Id: I8df1ee7b777a0f8b40580922b2fe91cb0d2983cd
---
 .../lite/delegates/gpu/metal/compute_task.h   |  77 +++++++--
 .../lite/delegates/gpu/metal/compute_task.mm  | 156 +++++++-----------
 .../delegates/gpu/metal/inference_context.mm  |  47 +++---
 3 files changed, 141 insertions(+), 139 deletions(-)

diff --git a/tensorflow/lite/delegates/gpu/metal/compute_task.h b/tensorflow/lite/delegates/gpu/metal/compute_task.h
index ccbb95b0271..aeb07324464 100644
--- a/tensorflow/lite/delegates/gpu/metal/compute_task.h
+++ b/tensorflow/lite/delegates/gpu/metal/compute_task.h
@@ -27,34 +27,77 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/common/precision.h"
 #include "tensorflow/lite/delegates/gpu/common/shape.h"
 #include "tensorflow/lite/delegates/gpu/common/status.h"
+#include "tensorflow/lite/delegates/gpu/metal/common.h"
 #include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
+#include "tensorflow/lite/delegates/gpu/metal/metal_arguments.h"
 #include "tensorflow/lite/delegates/gpu/metal/metal_spatial_tensor.h"
 
-@interface TFLComputeTask : NSObject
+namespace tflite {
+namespace gpu {
+namespace metal {
 
-/// Returns empty string or error if shader can't be compiled.
-- (absl::Status)compileWithDevice:(id<MTLDevice>)device
-                   taskDescriptor:(const tflite::gpu::metal::NodeDescriptor&)desc
-                        precision:(tflite::gpu::CalculationsPrecision)precision;
+class ComputeTask {
+ public:
+  ComputeTask() = default;
 
-/// Updates parameters for inputs/outputs/intermediate tensors
-- (absl::Status)updateParamsWithDevice:(id<MTLDevice>)device
-                          tensorShapes:(const std::map<tflite::gpu::ValueId, tflite::gpu::BHWC>&)
-                                           tensorShapes;
+  // Move only
+  ComputeTask(ComputeTask&& args) = default;
+  ComputeTask& operator=(ComputeTask&& args) = default;
+  ComputeTask(const ComputeTask&) = delete;
+  ComputeTask& operator=(const ComputeTask&) = delete;
 
-- (bool)hasInOutIds:(const std::set<::tflite::gpu::ValueId>&)ids;
+  /// Returns empty string or error if shader can't be compiled.
+  absl::Status CompileWithDevice(id<MTLDevice> device,
+                                 const NodeDescriptor& desc,
+                                 CalculationsPrecision precision);
 
-- (void)encodeWithEncoder:(id<MTLComputeCommandEncoder>)encoder;
+  /// Updates parameters for inputs/outputs/intermediate tensors
+  absl::Status UpdateParamsWithDevice(
+      id<MTLDevice> device, const std::map<ValueId, BHWC>& tensor_shapes);
 
-- (std::vector<tflite::gpu::ValueId>)getOutputIds;
-- (std::vector<tflite::gpu::ValueId>)getInputIds;
+  bool HasInOutIds(const std::set<ValueId>& ids) const;
 
-- (void)setSrcTensor:(const tflite::gpu::metal::MetalSpatialTensor&)tensor withIndex:(int)index;
+  void EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder);
 
-- (void)setDstTensor:(const tflite::gpu::metal::MetalSpatialTensor&)tensor withIndex:(int)index;
+  std::vector<ValueId> GetOutputIds() const;
+  std::vector<ValueId> GetInputIds() const;
 
-- (void)setDescription:(const std::string&)description;
+  void SetSrcTensor(const MetalSpatialTensor& tensor, int index);
 
-@end
+  void SetDstTensor(const MetalSpatialTensor& tensor, int index);
+
+  void SetDescription(const std::string& description);
+
+ private:
+  struct InputBuffer {
+    ValueId uid;
+    id<MTLBuffer> metal_handle;
+  };
+
+  struct OutputBuffer {
+    ValueId uid;
+    id<MTLBuffer> metal_handle;
+  };
+
+  struct UniformBuffer {
+    std::vector<uint8_t> data;
+    UniformsFunction data_function;
+  };
+
+  id<MTLComputePipelineState> program_;
+  std::vector<InputBuffer> input_buffers_;
+  std::vector<OutputBuffer> output_buffers_;
+  std::vector<id<MTLBuffer>> immutable_buffers_;
+  std::vector<UniformBuffer> uniform_buffers_;
+  uint3 groups_size_;
+  uint3 groups_count_;
+  DispatchParamsFunction resize_function_;
+  std::string description_;
+  MetalArguments metal_args_;
+};
+
+}  // namespace metal
+}  // namespace gpu
+}  // namespace tflite
 
 #endif  // TENSORFLOW_LITE_DELEGATES_GPU_METAL_COMPUTE_TASK_H_
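Why the class is move-only: ComputeTask owns Objective-C object handles (the id<MTLComputePipelineState> and id<MTLBuffer> members) plus a MetalArguments instance, and a copy would silently alias the same Metal objects from two tasks. Deleting the copy operations is also what lets inference_context.mm store tasks by value instead of holding TFLComputeTask pointers. A minimal usage sketch (illustrative only; `device`, `node_desc`, and `precision` stand in for caller-provided values, mirroring the inference_context.mm hunks further below):

    tflite::gpu::metal::ComputeTask task;
    RETURN_IF_ERROR(task.CompileWithDevice(device, node_desc, precision));
    task.SetDescription(node_desc.description);

    std::vector<tflite::gpu::metal::ComputeTask> tasks;
    tasks.emplace_back(std::move(task));  // must move: the copy ctor/assignment are deleted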
diff --git a/tensorflow/lite/delegates/gpu/metal/compute_task.mm b/tensorflow/lite/delegates/gpu/metal/compute_task.mm
index 7782c53ed3e..95b30832b58 100644
--- a/tensorflow/lite/delegates/gpu/metal/compute_task.mm
+++ b/tensorflow/lite/delegates/gpu/metal/compute_task.mm
@@ -27,55 +27,16 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/common/util.h"
 #include "tensorflow/lite/delegates/gpu/metal/common.h"
 
-using ::tflite::gpu::AlignByN;
-using ::tflite::gpu::BHWC;
-using ::tflite::gpu::HalfBits;
-using ::tflite::gpu::metal::ComputeTaskDescriptorPtr;
-using ::tflite::gpu::metal::CreateComputeProgram;
-using ::tflite::gpu::metal::DispatchParamsFunction;
-using ::tflite::gpu::CalculationsPrecision;
-using ::tflite::gpu::metal::UniformsFunction;
-using ::tflite::gpu::uint3;
-using ::tflite::gpu::ValueId;
+namespace tflite {
+namespace gpu {
+namespace metal {
 
-namespace {
-
-struct InputBuffer {
-  ValueId uid;
-  id<MTLBuffer> metalHandle;
-};
-
-struct OutputBuffer {
-  ValueId uid;
-  id<MTLBuffer> metalHandle;
-};
-
-struct UniformBuffer {
-  std::vector<uint8_t> data;
-  UniformsFunction dataFunction;
-};
-
-}  // namespace
-
-@implementation TFLComputeTask {
-  id<MTLComputePipelineState> _program;
-  std::vector<InputBuffer> _inputBuffers;
-  std::vector<OutputBuffer> _outputBuffers;
-  std::vector<id<MTLBuffer>> _immutableBuffers;
-  std::vector<UniformBuffer> _uniformBuffers;
-  uint3 _groupsSize;
-  uint3 _groupsCount;
-  DispatchParamsFunction _resizeFunction;
-  std::string _description;
-  tflite::gpu::metal::MetalArguments _metal_args;
-}
-
-- (absl::Status)compileWithDevice:(id<MTLDevice>)device
-                   taskDescriptor:(const tflite::gpu::metal::NodeDescriptor&)desc
-                        precision:(CalculationsPrecision)precision; {
+absl::Status ComputeTask::CompileWithDevice(id<MTLDevice> device,
+                                            const NodeDescriptor& desc,
+                                            CalculationsPrecision precision) {
   size_t offset = desc.task->src_tensors_names.size() + desc.task->uniform_buffers.size()
                   + desc.task->immutable_buffers.size() + 1;
-  RETURN_IF_ERROR(_metal_args.Init(device, offset, &desc.task->args, &desc.task->shader_source));
+  RETURN_IF_ERROR(metal_args_.Init(device, offset, &desc.task->args, &desc.task->shader_source));
   NSString* barrier;
   // simdgroup_barrier is supported on macOS 10.13+ and Metal shading language version 2.0
   if (@available(macOS 10.13, iOS 10.0, tvOS 10.0, *)) {
@@ -129,12 +90,12 @@ struct UniformBuffer {
     return absl::InternalError("Unknown shader compilation error");
   }
   for (auto& id : desc.src_tensors_ids) {
-    _inputBuffers.emplace_back(InputBuffer{id, nil});
+    input_buffers_.emplace_back(InputBuffer{id, nil});
   }
   for (auto& uniform : desc.task->uniform_buffers) {
-    _uniformBuffers.emplace_back(UniformBuffer{{}, uniform.data_function});
+    uniform_buffers_.emplace_back(UniformBuffer{{}, uniform.data_function});
   }
-  _outputBuffers.emplace_back(OutputBuffer{desc.dst_tensors_ids[0], nil});
+  output_buffers_.emplace_back(OutputBuffer{desc.dst_tensors_ids[0], nil});
   const bool f32_storage = precision == CalculationsPrecision::F32;
   for (auto& immutable : desc.task->immutable_buffers) {
     int padding = 4 * (f32_storage ? sizeof(float) : sizeof(HalfBits));
@@ -143,61 +104,60 @@ struct UniformBuffer {
     id<MTLBuffer> metalBuffer = [device newBufferWithBytes:immutable.data.data()
                                                     length:immutable.data.size()
                                                    options:MTLResourceStorageModeShared];
-    _immutableBuffers.emplace_back(metalBuffer);
+    immutable_buffers_.emplace_back(metalBuffer);
   }
-  _resizeFunction = desc.task->resize_function;
-  _program = program;
+  resize_function_ = desc.task->resize_function;
+  program_ = program;
   return absl::OkStatus();
 }
 
#include "tensorflow/lite/delegates/gpu/common/util.h" #include "tensorflow/lite/delegates/gpu/metal/common.h" -using ::tflite::gpu::AlignByN; -using ::tflite::gpu::BHWC; -using ::tflite::gpu::HalfBits; -using ::tflite::gpu::metal::ComputeTaskDescriptorPtr; -using ::tflite::gpu::metal::CreateComputeProgram; -using ::tflite::gpu::metal::DispatchParamsFunction; -using ::tflite::gpu::CalculationsPrecision; -using ::tflite::gpu::metal::UniformsFunction; -using ::tflite::gpu::uint3; -using ::tflite::gpu::ValueId; +namespace tflite { +namespace gpu { +namespace metal { -namespace { - -struct InputBuffer { - ValueId uid; - id metalHandle; -}; - -struct OutputBuffer { - ValueId uid; - id metalHandle; -}; - -struct UniformBuffer { - std::vector data; - UniformsFunction dataFunction; -}; - -} // namespace - -@implementation TFLComputeTask { - id _program; - std::vector _inputBuffers; - std::vector _outputBuffers; - std::vector> _immutableBuffers; - std::vector _uniformBuffers; - uint3 _groupsSize; - uint3 _groupsCount; - DispatchParamsFunction _resizeFunction; - std::string _description; - tflite::gpu::metal::MetalArguments _metal_args; -} - -- (absl::Status)compileWithDevice:(id)device - taskDescriptor:(const tflite::gpu::metal::NodeDescriptor&)desc - precision:(CalculationsPrecision)precision; { +absl::Status ComputeTask::CompileWithDevice(id device, + const NodeDescriptor& desc, + CalculationsPrecision precision) { size_t offset = desc.task->src_tensors_names.size() + desc.task->uniform_buffers.size() + desc.task->immutable_buffers.size() + 1; - RETURN_IF_ERROR(_metal_args.Init(device, offset, &desc.task->args, &desc.task->shader_source)); + RETURN_IF_ERROR(metal_args_.Init(device, offset, &desc.task->args, &desc.task->shader_source)); NSString* barrier; // simdgroup_barrier is supported on macOS 10.13+ and Metal shading language version 2.0 if (@available(macOS 10.13, iOS 10.0, tvOS 10.0, *)) { @@ -129,12 +90,12 @@ struct UniformBuffer { return absl::InternalError("Unknown shader compilation error"); } for (auto& id : desc.src_tensors_ids) { - _inputBuffers.emplace_back(InputBuffer{id, nil}); + input_buffers_.emplace_back(InputBuffer{id, nil}); } for (auto& uniform : desc.task->uniform_buffers) { - _uniformBuffers.emplace_back(UniformBuffer{{}, uniform.data_function}); + uniform_buffers_.emplace_back(UniformBuffer{{}, uniform.data_function}); } - _outputBuffers.emplace_back(OutputBuffer{desc.dst_tensors_ids[0], nil}); + output_buffers_.emplace_back(OutputBuffer{desc.dst_tensors_ids[0], nil}); const bool f32_storage = precision == CalculationsPrecision::F32; for (auto& immutable : desc.task->immutable_buffers) { int padding = 4 * (f32_storage ? 
@@ -205,66 +165,66 @@ struct UniformBuffer {
   return false;
 }
 
-- (void)encodeWithEncoder:(id<MTLComputeCommandEncoder>)encoder {
+void ComputeTask::EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder) {
   // The dispatch call is intended to be skipped.
-  if (_groupsCount.x * _groupsCount.y * _groupsCount.z == 0) {
+  if (groups_count_.x * groups_count_.y * groups_count_.z == 0) {
     return;
   }
-  [encoder setComputePipelineState:_program];
+  [encoder setComputePipelineState:program_];
 
   int bindIndex = 0;
-  for (const auto& buffer : _outputBuffers) {
-    [encoder setBuffer:buffer.metalHandle offset:0 atIndex:bindIndex];
+  for (const auto& buffer : output_buffers_) {
+    [encoder setBuffer:buffer.metal_handle offset:0 atIndex:bindIndex];
     bindIndex++;
   }
-  for (const auto& buffer : _inputBuffers) {
-    [encoder setBuffer:buffer.metalHandle offset:0 atIndex:bindIndex];
+  for (const auto& buffer : input_buffers_) {
+    [encoder setBuffer:buffer.metal_handle offset:0 atIndex:bindIndex];
    bindIndex++;
   }
-  for (auto& immutable : _immutableBuffers) {
+  for (auto& immutable : immutable_buffers_) {
     [encoder setBuffer:immutable offset:0 atIndex:bindIndex];
     bindIndex++;
   }
-  for (auto& uniform : _uniformBuffers) {
+  for (auto& uniform : uniform_buffers_) {
     [encoder setBytes:uniform.data.data() length:uniform.data.size() atIndex:bindIndex];
     bindIndex++;
   }
-  _metal_args.Encode(encoder, bindIndex);
+  metal_args_.Encode(encoder, bindIndex);
 
-  MTLSize groupsCount = MTLSizeMake(_groupsCount.x, _groupsCount.y, _groupsCount.z);
-  MTLSize groupsSize = MTLSizeMake(_groupsSize.x, _groupsSize.y, _groupsSize.z);
+  MTLSize groupsCount = MTLSizeMake(groups_count_.x, groups_count_.y, groups_count_.z);
+  MTLSize groupsSize = MTLSizeMake(groups_size_.x, groups_size_.y, groups_size_.z);
   [encoder dispatchThreadgroups:groupsCount threadsPerThreadgroup:groupsSize];
 }
 
-- (std::vector<ValueId>)getOutputIds {
+std::vector<ValueId> ComputeTask::GetOutputIds() const {
   std::vector<ValueId> result;
-  for (auto& buffer : _outputBuffers) {
+  for (auto& buffer : output_buffers_) {
     result.push_back(buffer.uid);
   }
   return result;
 }
 
-- (std::vector<ValueId>)getInputIds {
+std::vector<ValueId> ComputeTask::GetInputIds() const {
   std::vector<ValueId> result;
-  for (auto& buffer : _inputBuffers) {
+  for (auto& buffer : input_buffers_) {
     result.push_back(buffer.uid);
   }
   return result;
 }
 
-- (void)setSrcTensor:(const tflite::gpu::metal::MetalSpatialTensor&)tensor
-           withIndex:(int)index; {
-  _inputBuffers[index].metalHandle = tensor.GetBufferHandle();
+void ComputeTask::SetSrcTensor(const MetalSpatialTensor& tensor, int index) {
+  input_buffers_[index].metal_handle = tensor.GetBufferHandle();
 }
 
-- (void)setDstTensor:(const tflite::gpu::metal::MetalSpatialTensor&)tensor
-           withIndex:(int)index; {
-  _outputBuffers[index].metalHandle = tensor.GetBufferHandle();
+void ComputeTask::SetDstTensor(const MetalSpatialTensor& tensor, int index) {
  output_buffers_[index].metal_handle = tensor.GetBufferHandle();
 }
 
-- (void)setDescription:(const std::string&)description {
-  _description = description;
+void ComputeTask::SetDescription(const std::string& description) {
+  description_ = description;
 }
 
-@end
+}  // namespace metal
+}  // namespace gpu
+}  // namespace tflite
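Note on the binding contract: EncodeWithEncoder assigns consecutive bind indices in the order outputs, inputs, immutable buffers, uniforms, and then hands the next free index to MetalArguments. The `offset` computed at the top of CompileWithDevice (src tensors + uniform buffers + immutable buffers + 1) appears to be exactly that first free index, with the +1 accounting for the single output buffer, which is why it is passed to metal_args_.Init before the shader is compiled. A sketch of the expected per-run call order (illustrative; `task`, `device`, `encoder`, and `tensor_shapes` are assumed to be caller-provided):

    // After input shapes change, recompute uniforms and the dispatch grid first:
    RETURN_IF_ERROR(task.UpdateParamsWithDevice(device, tensor_shapes));
    // Then encode; a zero groups_count_ makes the dispatch a deliberate no-op:
    task.EncodeWithEncoder(encoder);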
diff --git a/tensorflow/lite/delegates/gpu/metal/inference_context.mm b/tensorflow/lite/delegates/gpu/metal/inference_context.mm
index a228813d0ca..dd9ef73c0d6 100644
--- a/tensorflow/lite/delegates/gpu/metal/inference_context.mm
+++ b/tensorflow/lite/delegates/gpu/metal/inference_context.mm
@@ -37,6 +37,7 @@ using ::tflite::gpu::DataType;
 using ::tflite::gpu::HalfBits;
 using ::tflite::gpu::int2;
 using ::tflite::gpu::MemoryStrategy;
+using ::tflite::gpu::metal::ComputeTask;
 using ::tflite::gpu::metal::ComputeTaskDescriptorPtr;
 using ::tflite::gpu::metal::MetalSpatialTensor;
 using ::tflite::gpu::TensorDescriptor;
@@ -60,7 +61,7 @@ void AddUsage(ValueId id, int task_index,
 }  // namespace
 
 @implementation TFLInferenceContext {
-  std::vector<TFLComputeTask*> _computeTasks;
+  std::vector<ComputeTask> _computeTasks;
   // contains indexes of _computeTasks
   std::vector<int> _taskIdsWithPreallocatedTensors;
   std::vector<ValueId> _inputIds;
@@ -85,17 +86,15 @@ void AddUsage(ValueId id, int task_index,
   _precision = precision;
   // Metal resources are created here.
   for (const auto& node : compiledModel.nodes) {
-    TFLComputeTask* task = [[TFLComputeTask alloc] init];
-    RETURN_IF_ERROR([task compileWithDevice:device
-                             taskDescriptor:node
-                                  precision:_precision]);
-    [task setDescription:node.description];
-    _computeTasks.emplace_back(task);
+    ComputeTask task;
+    RETURN_IF_ERROR(task.CompileWithDevice(device, node, _precision));
+    task.SetDescription(node.description);
+    _computeTasks.emplace_back(std::move(task));
   }
   _tensorShapes = compiledModel.tensor_shapes;
   for (auto& task : _computeTasks) {
     // The same device must be used here as well as on shader compilation stage.
-    RETURN_IF_ERROR([task updateParamsWithDevice:device tensorShapes:_tensorShapes]);
+    RETURN_IF_ERROR(task.UpdateParamsWithDevice(device, _tensorShapes));
   }
   RETURN_IF_ERROR([self allocateTensors:device]);
   return absl::OkStatus();
@@ -111,7 +110,7 @@ void AddUsage(ValueId id, int task_index,
   }
   for (int i = 0; i < _computeTasks.size(); ++i) {
     auto& task = _computeTasks[i];
-    if ([task hasInOutIds:preallocatedIds]) {
+    if (task.HasInOutIds(preallocatedIds)) {
       _taskIdsWithPreallocatedTensors.push_back(i);
     }
   }
@@ -144,15 +143,15 @@ void AddUsage(ValueId id, int task_index,
 
 - (void)bindTensorsToOperations {
   for (auto& task : _computeTasks) {
-    const auto& src_ids = [task getInputIds];
+    const auto& src_ids = task.GetInputIds();
     for (int i = 0; i < src_ids.size(); ++i) {
       MetalSpatialTensor* tensor = [self getTensor:src_ids[i]];
-      [task setSrcTensor:*tensor withIndex:i];
+      task.SetSrcTensor(*tensor, i);
     }
-    const auto& dst_ids = [task getOutputIds];
+    const auto& dst_ids = task.GetOutputIds();
     for (int i = 0; i < dst_ids.size(); ++i) {
       MetalSpatialTensor* tensor = [self getTensor:dst_ids[i]];
-      [task setDstTensor:*tensor withIndex:i];
+      task.SetDstTensor(*tensor, i);
     }
   }
 }
@@ -164,12 +163,12 @@ void AddUsage(ValueId id, int task_index,
     }
   }
   for (int op_index = 0; op_index < _computeTasks.size(); ++op_index) {
-    for (auto& tensor_id : [_computeTasks[op_index] getInputIds]) {
+    for (auto& tensor_id : _computeTasks[op_index].GetInputIds()) {
       if (_preallocatedTensors.find(tensor_id) == _preallocatedTensors.end()) {
         AddUsage(tensor_id, op_index, usages);
       }
     }
-    for (auto& tensor_id : [_computeTasks[op_index] getOutputIds]) {
+    for (auto& tensor_id : _computeTasks[op_index].GetOutputIds()) {
       if (_preallocatedTensors.find(tensor_id) == _preallocatedTensors.end()) {
         AddUsage(tensor_id, op_index, usages);
       }
@@ -239,8 +238,8 @@ void AddUsage(ValueId id, int task_index,
   descriptor.data_type = f32_storage ? DataType::FLOAT32 : DataType::FLOAT16;
   descriptor.layout = tflite::gpu::Layout::HWC;
   for (auto& task : _computeTasks) {
-    const std::vector<ValueId> input_ids = [task getInputIds];
-    const std::vector<ValueId> output_ids = [task getOutputIds];
+    const std::vector<ValueId> input_ids = task.GetInputIds();
+    const std::vector<ValueId> output_ids = task.GetOutputIds();
     std::vector<ValueId> all_ids = input_ids;
     all_ids.insert(all_ids.end(), output_ids.begin(), output_ids.end());
     for (auto& tensor_id : all_ids) {
@@ -263,7 +262,7 @@ void AddUsage(ValueId id, int task_index,
   [self updatePreallocatedTensors:inputOutputBuffers];
   for (int i = 0; i < _computeTasks.size(); ++i) {
     auto& task = _computeTasks[i];
-    [task encodeWithEncoder:commandEncoder];
+    task.EncodeWithEncoder(commandEncoder);
   }
 }
 
@@ -274,7 +273,7 @@ void AddUsage(ValueId id, int task_index,
   for (int i = 0; i < _computeTasks.size(); ++i) {
     id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
     auto& task = _computeTasks[i];
-    [task encodeWithEncoder:encoder];
+    task.EncodeWithEncoder(encoder);
     [encoder endEncoding];
   }
 }
@@ -288,7 +287,7 @@ void AddUsage(ValueId id, int task_index,
   for (int i = 0; i < _computeTasks.size(); ++i) {
     id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
     auto& task = _computeTasks[i];
-    [task encodeWithEncoder:encoder];
+    task.EncodeWithEncoder(encoder);
     [encoder endEncoding];
     if (i % flushPeriod == (flushPeriod - 1)) {
       [commandBuffer commit];
@@ -304,18 +303,18 @@ void AddUsage(ValueId id, int task_index,
   }
   for (auto& task_index : _taskIdsWithPreallocatedTensors) {
     auto& task = _computeTasks[task_index];
-    const auto& src_ids = [task getInputIds];
+    const auto& src_ids = task.GetInputIds();
     for (int i = 0; i < src_ids.size(); ++i) {
       const auto& it = _preallocatedTensors.find(src_ids[i]);
       if (it != _preallocatedTensors.end()) {
-        [task setSrcTensor:it->second withIndex:i];
+        task.SetSrcTensor(it->second, i);
       }
     }
-    const auto& dst_ids = [task getOutputIds];
+    const auto& dst_ids = task.GetOutputIds();
     for (int i = 0; i < dst_ids.size(); ++i) {
       const auto& it = _preallocatedTensors.find(dst_ids[i]);
       if (it != _preallocatedTensors.end()) {
-        [task setDstTensor:it->second withIndex:i];
+        task.SetDstTensor(it->second, i);
       }
     }
  }
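The three encode call sites above differ only in encoder granularity: one shared encoder for the whole graph, one encoder per task, and one encoder per task with a periodic commit. A sketch of the flush variant's overall shape (only the loop body is taken from the hunk above; the surrounding method signature and the command-buffer refresh after each commit are assumptions, not part of this patch):

    // Hypothetical driver loop: encode each task in its own encoder, commit in batches.
    for (int i = 0; i < _computeTasks.size(); ++i) {
      id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
      _computeTasks[i].EncodeWithEncoder(encoder);
      [encoder endEncoding];
      if (i % flushPeriod == (flushPeriod - 1)) {
        [commandBuffer commit];                        // flush this batch to the GPU
        commandBuffer = [commandQueue commandBuffer];  // assumed: start a fresh buffer
      }
    }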