diff --git a/tensorflow/lite/delegates/gpu/common/task/gpu_operation.h b/tensorflow/lite/delegates/gpu/common/task/gpu_operation.h
index f35682dad3a..cb02a173dd9 100644
--- a/tensorflow/lite/delegates/gpu/common/task/gpu_operation.h
+++ b/tensorflow/lite/delegates/gpu/common/task/gpu_operation.h
@@ -38,6 +38,9 @@ namespace gpu {
 namespace cl {
 class ClOperation;
 }
+namespace metal {
+class ComputeTask;
+}
 
 // kCustom: default value
 // GPUOperation::GetGridSize must be overloaded
@@ -145,6 +148,7 @@ class GPUOperation {
 
  protected:
   friend class cl::ClOperation;
+  friend class metal::ComputeTask;
   friend flatbuffers::Offset<tflite::gpu::data::GPUOperation> Encode(
       const GPUOperation& op, flatbuffers::FlatBufferBuilder* builder);
   friend absl::Status Decode(const tflite::gpu::data::GPUOperation* fb_op,
diff --git a/tensorflow/lite/delegates/gpu/metal/BUILD b/tensorflow/lite/delegates/gpu/metal/BUILD
index f46834398ec..eb179af712f 100644
--- a/tensorflow/lite/delegates/gpu/metal/BUILD
+++ b/tensorflow/lite/delegates/gpu/metal/BUILD
@@ -121,6 +121,7 @@ objc_library(
         "//tensorflow/lite/delegates/gpu/common:status",
         "//tensorflow/lite/delegates/gpu/common:types",
         "//tensorflow/lite/delegates/gpu/common:util",
+        "//tensorflow/lite/delegates/gpu/common/task:gpu_operation",
         "@com_google_absl//absl/strings",
     ],
 )
diff --git a/tensorflow/lite/delegates/gpu/metal/compute_task.cc b/tensorflow/lite/delegates/gpu/metal/compute_task.cc
index b76989f2094..9c0642bd36f 100644
--- a/tensorflow/lite/delegates/gpu/metal/compute_task.cc
+++ b/tensorflow/lite/delegates/gpu/metal/compute_task.cc
@@ -33,11 +33,43 @@ limitations under the License.
 namespace tflite {
 namespace gpu {
 namespace metal {
+namespace {
+int3 GetWorkGroupsCount(int grid_dimension, const int3& grid_size,
+                        const int3& work_group_size,
+                        const int3& work_group_launch_order) {
+  int3 work_groups_count;
+  if (grid_dimension == 1) {
+    work_groups_count.x = DivideRoundUp(grid_size.x, work_group_size.x);
+    work_groups_count.y = 1;
+    work_groups_count.z = 1;
+  } else if (grid_dimension == 2) {
+    int3 wgs;
+    wgs.x = DivideRoundUp(grid_size.x, work_group_size.x);
+    wgs.y = DivideRoundUp(grid_size.y, work_group_size.y);
+    work_groups_count.x = wgs[work_group_launch_order[0]];
+    work_groups_count.y = wgs[work_group_launch_order[1]];
+    work_groups_count.z = 1;
+  } else {  // grid_dimension == 3
+    int3 wgs;
+    wgs.x = DivideRoundUp(grid_size.x, work_group_size.x);
+    wgs.y = DivideRoundUp(grid_size.y, work_group_size.y);
+    wgs.z = DivideRoundUp(grid_size.z, work_group_size.z);
+    work_groups_count.x = wgs[work_group_launch_order[0]];
+    work_groups_count.y = wgs[work_group_launch_order[1]];
+    work_groups_count.z = wgs[work_group_launch_order[2]];
+  }
+  return work_groups_count;
+}
+}  // namespace
 
 void ComputeTask::Init(std::unique_ptr<ComputeTaskDescriptor>&& task_desc) {
   task_desc_ = std::move(task_desc);
 }
 
+void ComputeTask::Init(std::unique_ptr<GPUOperation>&& operation) {
+  operation_ = std::move(operation);
+}
+
 absl::Status ComputeTask::Compile(CalculationsPrecision precision,
                                   MetalDevice* device) {
   task_desc_->AssembleCode();
@@ -111,6 +143,71 @@ absl::Status ComputeTask::Compile(CalculationsPrecision precision,
   return absl::OkStatus();
 }
 
+absl::Status ComputeTask::CompileOp(MetalDevice* device) {
+  operation_->AssembleCode(device->GetInfo());
+  const std::map<std::string, std::string> linkables = {
+      {operation_->dst_tensors_names_[0], operation_->elementwise_code_}};
+  RETURN_IF_ERROR(metal_args_.Init(linkables, device, &operation_->args_,
+                                   &operation_->code_));
+
+  operation_->args_.ReleaseCPURepresentation();
+  NSString* storageType;
+  NSString* accumulatorType;
+  NSString* toAccumulatorType = @"";
+  NSString* toAccumulatorType2 = @"";
+  NSString* toAccumulatorType3 = @"";
+  NSString* toAccumulatorType4 = @"";
+  if (operation_->definition_.precision == CalculationsPrecision::F32) {
+    storageType = @"float";
+    accumulatorType = @"float";
+  } else {
+    // FP16
+    storageType = @"half";
+    if (operation_->definition_.precision == CalculationsPrecision::F32_F16) {
+      accumulatorType = @"float";
+      toAccumulatorType = @"float";
+      toAccumulatorType2 = @"float2";
+      toAccumulatorType3 = @"float3";
+      toAccumulatorType4 = @"float4";
+    } else {
+      accumulatorType = @"half";
+    }
+  }
+  NSDictionary<NSString*, NSString*>* macros = @{
+    @"FLT" : storageType,
+    @"FLT2" : [NSString stringWithFormat:@"%@2", storageType],
+    @"FLT3" : [NSString stringWithFormat:@"%@3", storageType],
+    @"FLT4" : [NSString stringWithFormat:@"%@4", storageType],
+    @"ACCUM_FLT" : accumulatorType,
+    @"ACCUM_FLT2" : [NSString stringWithFormat:@"%@2", accumulatorType],
+    @"ACCUM_FLT3" : [NSString stringWithFormat:@"%@3", accumulatorType],
+    @"ACCUM_FLT4" : [NSString stringWithFormat:@"%@4", accumulatorType],
+    @"TO_ACCUM_TYPE" : toAccumulatorType,
+    @"TO_ACCUM2_TYPE" : toAccumulatorType2,
+    @"TO_ACCUM3_TYPE" : toAccumulatorType3,
+    @"TO_ACCUM4_TYPE" : toAccumulatorType4,
+    @"MAIN_FUNCTION" : @"\"kernel void ComputeFunction\"",
+    @"GLOBAL_ID_0" : @"static_cast<int>(reserved_gid.x)",
+    @"GLOBAL_ID_1" : @"static_cast<int>(reserved_gid.y)",
+    @"GLOBAL_ID_2" : @"static_cast<int>(reserved_gid.z)",
+    @"INIT_FLT(value)" : [NSString stringWithFormat:@"%@(value)", storageType],
+    @"INIT_FLT4(value)" :
+        [NSString stringWithFormat:@"%@4(value)", storageType],
+  };
+
+  NSString* code =
+      [NSString stringWithCString:operation_->code_.c_str()
+                         encoding:[NSString defaultCStringEncoding]];
+  id<MTLComputePipelineState> program;
+  RETURN_IF_ERROR(CreateComputeProgram(device->device(), code,
+                                       @"ComputeFunction", macros, &program));
+  if (!program) {
+    return absl::InternalError("Unknown shader compilation error");
+  }
+  program_ = program;
+  return absl::OkStatus();
+}
+
 absl::Status ComputeTask::UpdateParams(const GpuInfo& gpu_info,
                                        const std::vector<BHWC>& src_shapes,
                                        const std::vector<BHWC>& dst_shapes) {
@@ -137,6 +234,33 @@ absl::Status ComputeTask::UpdateParams(const GpuInfo& gpu_info,
   return absl::OkStatus();
 }
 
+absl::Status ComputeTask::UpdateOpParams() {
+  for (int i = 0; i < operation_->src_tensors_names_.size(); ++i) {
+    const auto* metal_spatial_tensor =
+        dynamic_cast<const MetalSpatialTensor*>(operation_->src_[i]);
+    if (!metal_spatial_tensor) {
+      return absl::InvalidArgumentError("Expected MetalSpatialTensor.");
+    }
+    RETURN_IF_ERROR(metal_args_.SetObjectRef(operation_->src_tensors_names_[i],
+                                             *metal_spatial_tensor));
+  }
+  for (int i = 0; i < operation_->dst_tensors_names_.size(); ++i) {
+    const auto* metal_spatial_tensor =
+        dynamic_cast<const MetalSpatialTensor*>(operation_->dst_[i]);
+    if (!metal_spatial_tensor) {
+      return absl::InvalidArgumentError("Expected MetalSpatialTensor.");
+    }
+    RETURN_IF_ERROR(metal_args_.SetObjectRef(operation_->dst_tensors_names_[i],
+                                             *metal_spatial_tensor));
+  }
+  RETURN_IF_ERROR(operation_->BindArguments(&metal_args_));
+  operation_->grid_size_ = operation_->GetGridSize();
+  operation_->work_groups_count_ = GetWorkGroupsCount(
+      operation_->grid_dimension_, operation_->grid_size_,
+      operation_->work_group_size_, operation_->work_group_launch_order_);
+  return absl::OkStatus();
+}
+
 void ComputeTask::EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder) {
   // The dispatch call is intended to be skipped.
   if (groups_count_.x * groups_count_.y * groups_count_.z == 0) {
@@ -145,8 +269,7 @@ void ComputeTask::EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder) {
 
   [encoder setComputePipelineState:program_];
 
-  int bindIndex = 0;
-  metal_args_.Encode(encoder, bindIndex);
+  metal_args_.Encode(encoder, 0);
 
   MTLSize groupsCount =
       MTLSizeMake(groups_count_.x, groups_count_.y, groups_count_.z);
@@ -155,6 +278,19 @@ void ComputeTask::EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder) {
   [encoder dispatchThreadgroups:groupsCount threadsPerThreadgroup:groupsSize];
 }
 
+void ComputeTask::EncodeOpWithEncoder(id<MTLComputeCommandEncoder> encoder) {
+  [encoder setComputePipelineState:program_];
+  metal_args_.Encode(encoder, 0);
+  MTLSize groupsCount, groupsSize;
+  groupsCount.width = operation_->work_groups_count_.x;
+  groupsCount.height = operation_->work_groups_count_.y;
+  groupsCount.depth = operation_->work_groups_count_.z;
+  groupsSize.width = operation_->work_group_size_.x;
+  groupsSize.height = operation_->work_group_size_.y;
+  groupsSize.depth = operation_->work_group_size_.z;
+  [encoder dispatchThreadgroups:groupsCount threadsPerThreadgroup:groupsSize];
+}
+
 void ComputeTask::SetSrcTensor(const MetalSpatialTensor& tensor, int index) {
   auto status = metal_args_.SetObjectRef(task_desc_->src_tensors_names[index],
                                          tensor);
diff --git a/tensorflow/lite/delegates/gpu/metal/compute_task.h b/tensorflow/lite/delegates/gpu/metal/compute_task.h
index cc31edea26f..c8912892662 100644
--- a/tensorflow/lite/delegates/gpu/metal/compute_task.h
+++ b/tensorflow/lite/delegates/gpu/metal/compute_task.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/common/precision.h"
 #include "tensorflow/lite/delegates/gpu/common/shape.h"
 #include "tensorflow/lite/delegates/gpu/common/status.h"
+#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
 #include "tensorflow/lite/delegates/gpu/metal/common.h"
 #include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
 #include "tensorflow/lite/delegates/gpu/metal/metal_arguments.h"
@@ -49,25 +50,35 @@ class ComputeTask {
 
   void Init(std::unique_ptr<ComputeTaskDescriptor>&& task_desc);
 
+  void Init(std::unique_ptr<GPUOperation>&& operation);
+
   ComputeTaskDescriptor& GetTaskDesc() { return *task_desc_; }
   const ComputeTaskDescriptor& GetTaskDesc() const { return *task_desc_; }
 
   /// Returns empty string or error if shader can't be compiled.
   absl::Status Compile(CalculationsPrecision precision, MetalDevice* device);
 
+  absl::Status CompileOp(MetalDevice* device);
+
   /// Updates parameters for inputs/outputs/intermediate tensors
   absl::Status UpdateParams(const GpuInfo& gpu_info,
                             const std::vector<BHWC>& src_shapes,
                             const std::vector<BHWC>& dst_shapes);
 
+  // Should be called after changes to inputs/outputs.
+  absl::Status UpdateOpParams();
+
   void EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder);
 
+  void EncodeOpWithEncoder(id<MTLComputeCommandEncoder> encoder);
+
   void SetSrcTensor(const MetalSpatialTensor& tensor, int index);
   void SetDstTensor(const MetalSpatialTensor& tensor, int index);
 
  private:
   std::unique_ptr<ComputeTaskDescriptor> task_desc_;
+  std::unique_ptr<GPUOperation> operation_ = nullptr;
   id<MTLComputePipelineState> program_;
   MetalArguments metal_args_;
 
   uint3 groups_size_;
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD
index 2c81fdfbf6a..0ac07d2b202 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD
@@ -64,6 +64,7 @@ objc_library(
     deps = [
        ":add",
        ":test_util",
+        "//tensorflow/lite/delegates/gpu/common/tasks:add_test_util",
     ],
 )
 
@@ -812,6 +813,8 @@ objc_library(
         "//tensorflow/lite/delegates/gpu/common:tensor",
         "//tensorflow/lite/delegates/gpu/common:types",
         "//tensorflow/lite/delegates/gpu/common:util",
+        "//tensorflow/lite/delegates/gpu/common/task:gpu_operation",
+        "//tensorflow/lite/delegates/gpu/common/task:testing_util",
         "//tensorflow/lite/delegates/gpu/metal:common",
         "//tensorflow/lite/delegates/gpu/metal:inference_context",
         "//tensorflow/lite/delegates/gpu/metal:metal_device",
@@ -897,6 +900,7 @@ objc_library(
         "//tensorflow/lite/delegates/gpu/common:shape",
         "//tensorflow/lite/delegates/gpu/common:types",
         "//tensorflow/lite/delegates/gpu/common:util",
+        "//tensorflow/lite/delegates/gpu/common/tasks:add_test_util",
         "//tensorflow/lite/delegates/gpu/metal:common",
         "//tensorflow/lite/delegates/gpu/metal:inference_context",
         "//tensorflow/lite/kernels/internal:quantization_util",
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/add_test.mm b/tensorflow/lite/delegates/gpu/metal/kernels/add_test.mm
index e935a10eb56..c7ac3bd94e9 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/add_test.mm
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/add_test.mm
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/common/util.h"
 #include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
 #include "tensorflow/lite/delegates/gpu/metal/kernels/test_util.h"
+#include "tensorflow/lite/delegates/gpu/common/tasks/add_test_util.h"
 
 using ::tflite::gpu::ElementwiseAttributes;
 using ::tflite::gpu::BHWC;
@@ -123,4 +124,22 @@ using ::tflite::gpu::metal::SingleOpModel;
   XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
 }
 
+- (void)testAddTwoEqualTensors {
+  tflite::gpu::metal::MetalExecutionEnvironment exec_env_;
+  auto status = AddTwoEqualTensorsTest(&exec_env_);
+  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
+}
+
+- (void)testAddFirstTensorHasMoreChannelsThanSecond {
+  tflite::gpu::metal::MetalExecutionEnvironment exec_env_;
+  auto status = AddFirstTensorHasMoreChannelsThanSecondTest(&exec_env_);
+  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
+}
+
+- (void)testAddFirstTensorHasLessChannelsThanSecond {
+  tflite::gpu::metal::MetalExecutionEnvironment exec_env_;
+  auto status = AddFirstTensorHasLessChannelsThanSecondTest(&exec_env_);
+  XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
+}
+
 @end
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/test_util.cc b/tensorflow/lite/delegates/gpu/metal/kernels/test_util.cc
index 5fe993b55c0..d629ff40038 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/test_util.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/test_util.cc
@@ -184,6 +184,72 @@ MetalExecutionEnvironment::GetSupportedStoragesWithHWZeroClampSupport() const {
   return {};
 }
 
+absl::Status MetalExecutionEnvironment::ExecuteGPUOperation(
+    const std::vector<TensorFloat32>& src_cpu,
+    std::unique_ptr<GPUOperation>&& operation,
+    const std::vector<BHWC>& dst_sizes,
+    const std::vector<TensorFloat32*>& dst_cpu) {
+  const OperationDef op_def = operation->GetDefinition();
+  std::vector<MetalSpatialTensor> src(src_cpu.size());
+  for (int i = 0; i < src_cpu.size(); ++i) {
+    auto src_shape = src_cpu[i].shape;
+    if (src_shape.b != 1 && !op_def.IsBatchSupported()) {
+      return absl::InvalidArgumentError(
+          "Layout doesn't have Batch dimension, but shape.b != 1");
+    }
+    RETURN_IF_ERROR(CreateTensor(device_.device(), src_shape,
+                                 op_def.src_tensors[i], &src[i]));
+    RETURN_IF_ERROR(src[i].WriteData(src_cpu[i]));
+    operation->SetSrc(&src[i], i);
+  }
+
+  std::vector<MetalSpatialTensor> dst(dst_cpu.size());
+  for (int i = 0; i < dst_cpu.size(); ++i) {
+    auto dst_shape = dst_sizes[i];
+    if (dst_shape.b != 1 && !op_def.IsBatchSupported()) {
+      return absl::InvalidArgumentError(
+          "Layout doesn't have Batch dimension, but shape.b != 1");
+    }
+    RETURN_IF_ERROR(CreateTensor(device_.device(), dst_shape,
+                                 op_def.dst_tensors[i], &dst[i]));
+    operation->SetDst(&dst[i], i);
+  }
+
+  std::vector<BHWC> src_shapes;
+  std::vector<BHWC> dst_shapes;
+  std::vector<ValueId> src_ids;
+  std::vector<ValueId> dst_ids;
+  for (int i = 0; i < src_cpu.size(); ++i) {
+    src_ids.push_back(i);
+    src_shapes.push_back(src_cpu[i].shape);
+  }
+  for (int i = 0; i < dst_cpu.size(); ++i) {
+    dst_ids.push_back(src_cpu.size() + i);
+    dst_shapes.push_back(dst_sizes[i]);
+  }
+
+  ComputeTask gpu_task;
+  gpu_task.Init(std::move(operation));
+  RETURN_IF_ERROR(gpu_task.CompileOp(&device_));
+  RETURN_IF_ERROR(gpu_task.UpdateOpParams());
+
+  id<MTLCommandQueue> command_queue = [device_.device() newCommandQueue];
+  id<MTLCommandBuffer> command_buffer = [command_queue commandBuffer];
+  id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];
+  gpu_task.EncodeOpWithEncoder(encoder);
+  [encoder endEncoding];
+  [command_buffer commit];
+  [command_buffer waitUntilCompleted];
+
+  for (int i = 0; i < dst_cpu.size(); ++i) {
+    dst_cpu[i]->shape = dst_sizes[i];
+    dst_cpu[i]->data = std::vector<float>(dst_sizes[i].DimensionsProduct(), 0);
+    RETURN_IF_ERROR(dst[i].ReadData(dst_cpu[i]));
+  }
+
+  return absl::OkStatus();
+}
+
 absl::Status MetalExecutionEnvironment::ExecuteGPUOperation(
     const std::vector<TensorFloat32>& src_cpu,
     std::unique_ptr<ComputeTaskDescriptor>&& operation,
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/test_util.h b/tensorflow/lite/delegates/gpu/metal/kernels/test_util.h
index d9f46a4762e..e8ce371f178 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/test_util.h
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/test_util.h
@@ -24,6 +24,8 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/common/model.h"
 #include "tensorflow/lite/delegates/gpu/common/shape.h"
 #include "tensorflow/lite/delegates/gpu/common/status.h"
+#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
+#include "tensorflow/lite/delegates/gpu/common/task/testing_util.h"
 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
 #include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
 #include "tensorflow/lite/delegates/gpu/metal/inference_context.h"
@@ -61,20 +63,24 @@ class SingleOpModel {
 absl::Status CompareVectors(const std::vector<float>& reference,
                             const std::vector<float>& output, float max_error);
 
-class MetalExecutionEnvironment {
+class MetalExecutionEnvironment : public TestExecutionEnvironment {
  public:
   MetalExecutionEnvironment() = default;
   ~MetalExecutionEnvironment() = default;
 
-  std::vector<CalculationsPrecision> GetSupportedPrecisions() const;
-  std::vector<TensorStorageType> GetSupportedStorages() const;
-  // returns storage types that support zero clamping when reading OOB in HW
-  // (Height/Width) dimensions.
+  std::vector<CalculationsPrecision> GetSupportedPrecisions() const override;
+  std::vector<TensorStorageType> GetSupportedStorages() const override;
   std::vector<TensorStorageType> GetSupportedStoragesWithHWZeroClampSupport()
-      const;
+      const override;
 
   const GpuInfo& GetGpuInfo() const { return device_.GetInfo(); }
 
+  absl::Status ExecuteGPUOperation(
+      const std::vector<TensorFloat32>& src_cpu,
+      std::unique_ptr<GPUOperation>&& operation,
+      const std::vector<BHWC>& dst_sizes,
+      const std::vector<TensorFloat32*>& dst_cpu) override;
+
   absl::Status ExecuteGPUOperation(
       const std::vector<TensorFloat32>& src_cpu,
       std::unique_ptr<ComputeTaskDescriptor>&& operation,
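
Note on the work-group math: GetWorkGroupsCount ceil-divides each grid axis by the work-group size, then permutes the per-axis counts through work_group_launch_order before they reach dispatchThreadgroups. A minimal standalone C++ sketch of the same computation (not part of the patch; the grid and group sizes below are hypothetical):

#include <array>
#include <cstdio>

// Same rounding as tflite::gpu::DivideRoundUp.
int DivideRoundUp(int n, int d) { return (n + d - 1) / d; }

int main() {
  // Hypothetical 3D grid of 70x70x8 threads, 8x8x1 work groups,
  // launch order (2, 0, 1).
  const std::array<int, 3> grid_size = {70, 70, 8};
  const std::array<int, 3> work_group_size = {8, 8, 1};
  const std::array<int, 3> launch_order = {2, 0, 1};

  // Per-axis counts before reordering: ceil(70/8) = 9, ceil(70/8) = 9,
  // ceil(8/1) = 8.
  std::array<int, 3> wgs;
  for (int i = 0; i < 3; ++i) {
    wgs[i] = DivideRoundUp(grid_size[i], work_group_size[i]);
  }

  // The launch order decides which per-axis count lands in x/y/z:
  // (wgs[2], wgs[0], wgs[1]) = (8, 9, 9).
  std::array<int, 3> counts;
  for (int i = 0; i < 3; ++i) {
    counts[i] = wgs[launch_order[i]];
  }
  std::printf("work_groups_count = (%d, %d, %d)\n", counts[0], counts[1],
              counts[2]);
  return 0;
}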
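The NSDictionary of macros in CompileOp plays the role that preprocessor defines play in the OpenCL backend: it rewrites the backend-neutral FLT*/ACCUM_FLT*/TO_ACCUM*_TYPE tokens in GPUOperation::code_ into concrete MSL types selected by CalculationsPrecision. F32 stores and accumulates in float; F32_F16 stores in half but accumulates in float, which is why only that mode gets non-empty TO_ACCUM*_TYPE casts; F16 uses half throughout. A compact C++ sketch of that mapping, reduced to the scalar and 4-wide variants (not part of the patch):

#include <iostream>
#include <map>
#include <string>

enum class CalculationsPrecision { F32, F32_F16, F16 };

std::map<std::string, std::string> PrecisionMacros(CalculationsPrecision p) {
  const std::string storage =
      (p == CalculationsPrecision::F32) ? "float" : "half";
  const std::string accum =
      (p == CalculationsPrecision::F16) ? "half" : "float";
  // The TO_ACCUM*_TYPE macros are explicit casts only in mixed precision;
  // otherwise they expand to nothing.
  const bool mixed = (p == CalculationsPrecision::F32_F16);
  return {
      {"FLT", storage},
      {"FLT4", storage + "4"},
      {"ACCUM_FLT", accum},
      {"ACCUM_FLT4", accum + "4"},
      {"TO_ACCUM_TYPE", mixed ? "float" : ""},
      {"TO_ACCUM4_TYPE", mixed ? "float4" : ""},
  };
}

int main() {
  auto macros = PrecisionMacros(CalculationsPrecision::F32_F16);
  std::cout << "FLT4=" << macros["FLT4"]
            << " ACCUM_FLT4=" << macros["ACCUM_FLT4"] << "\n";
  // Prints: FLT4=half4 ACCUM_FLT4=float4
}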
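The new ExecuteGPUOperation overload is what lets the backend-neutral tests under common/tasks (such as the add_test_util functions wired up in the BUILD and add_test.mm changes above) drive the Metal backend through TestExecutionEnvironment. A hedged sketch of such a test follows. Only the ExecuteGPUOperation signature comes from this patch; CreateAdd, its channel arguments, and the OperationDef/TensorDescriptor setup are assumptions to verify against common/tasks/add.h:

// Sketch only; assumes the usual delegate headers plus absl/memory, and that
// CreateAdd has the signature
//   GPUOperation CreateAdd(const OperationDef&, const std::vector<int>&, int)
// as in common/tasks/add.h.
absl::Status AddTwoTensorsSketch(
    tflite::gpu::metal::MetalExecutionEnvironment* env) {
  using namespace tflite::gpu;  // NOLINT

  TensorFloat32 src0, src1;
  src0.shape = BHWC(1, 2, 2, 1);
  src0.data = {0.0f, 1.0f, 2.0f, 3.0f};
  src1.shape = src0.shape;
  src1.data = {4.0f, 5.0f, 6.0f, 7.0f};

  OperationDef op_def;
  op_def.precision = CalculationsPrecision::F32;
  const TensorDescriptor desc{DataType::FLOAT32, TensorStorageType::BUFFER,
                              Layout::HWC};
  op_def.src_tensors = {desc, desc};
  op_def.dst_tensors = {desc};

  // Any GPUOperation factory works the same way; CreateAdd is an assumption.
  auto op = absl::make_unique<GPUOperation>(
      CreateAdd(op_def, /*channels=*/{1, 1}, /*dst_channels=*/1));

  TensorFloat32 dst;
  RETURN_IF_ERROR(env->ExecuteGPUOperation(
      {src0, src1}, std::move(op), {BHWC(1, 2, 2, 1)}, {&dst}));
  // Expected element-wise sums: {4, 6, 8, 10}.
  return absl::OkStatus();
}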