Added support for some GPUOperations in ComputeTask.

A ComputeTask can now be created from a GPUOperation.
Added a test of Add created from a GPUOperation in the Metal backend.

PiperOrigin-RevId: 351860267
Change-Id: I0e622f6efdee5f40043f293034b4d84298387e9d
Raman Sarokin authored on 2021-01-14 13:02:23 -08:00; committed by TensorFlower Gardener
parent c1336b952d
commit a9aee8f713
8 changed files with 255 additions and 8 deletions
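
Taken together, the change lets the Metal backend run a GPUOperation through ComputeTask directly, alongside the existing ComputeTaskDescriptor path. A minimal sketch of the new flow, mirroring the ExecuteGPUOperation helper added to test_util.mm below (error handling and includes elided; `operation` stands for an already-built std::unique_ptr<GPUOperation> whose src/dst tensors were attached with SetSrc/SetDst, and `device` is a MetalDevice):

  ComputeTask gpu_task;
  gpu_task.Init(std::move(operation));           // ComputeTask takes ownership of the GPUOperation
  RETURN_IF_ERROR(gpu_task.CompileOp(&device));  // assemble code, apply Metal macros, build the pipeline state
  RETURN_IF_ERROR(gpu_task.UpdateOpParams());    // bind tensor refs, compute grid and work group counts

  id<MTLCommandQueue> command_queue = [device.device() newCommandQueue];
  id<MTLCommandBuffer> command_buffer = [command_queue commandBuffer];
  id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];
  gpu_task.EncodeOpWithEncoder(encoder);         // dispatchThreadgroups with the computed sizes
  [encoder endEncoding];
  [command_buffer commit];
  [command_buffer waitUntilCompleted];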


@@ -38,6 +38,9 @@ namespace gpu {
namespace cl {
class ClOperation;
}
namespace metal {
class ComputeTask;
}
// kCustom: default value
// GPUOperation::GetGridSize must be overloaded
@@ -145,6 +148,7 @@ class GPUOperation {
protected:
friend class cl::ClOperation;
friend class metal::ComputeTask;
friend flatbuffers::Offset<tflite::gpu::data::GPUOperation> Encode(
const GPUOperation& op, flatbuffers::FlatBufferBuilder* builder);
friend absl::Status Decode(const tflite::gpu::data::GPUOperation* fb_op,


@@ -121,6 +121,7 @@ objc_library(
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:types",
"//tensorflow/lite/delegates/gpu/common:util",
"//tensorflow/lite/delegates/gpu/common/task:gpu_operation",
"@com_google_absl//absl/strings",
],
)


@@ -33,11 +33,43 @@ limitations under the License.
namespace tflite {
namespace gpu {
namespace metal {
namespace {
int3 GetWorkGroupsCount(int grid_dimension, const int3& grid_size,
const int3& work_group_size,
const int3& work_group_launch_order) {
int3 work_groups_count;
if (grid_dimension == 1) {
work_groups_count.x = DivideRoundUp(grid_size.x, work_group_size.x);
work_groups_count.y = 1;
work_groups_count.z = 1;
} else if (grid_dimension == 2) {
int3 wgs;
wgs.x = DivideRoundUp(grid_size.x, work_group_size.x);
wgs.y = DivideRoundUp(grid_size.y, work_group_size.y);
work_groups_count.x = wgs[work_group_launch_order[0]];
work_groups_count.y = wgs[work_group_launch_order[1]];
work_groups_count.z = 1;
} else { // grid_dimension == 3
int3 wgs;
wgs.x = DivideRoundUp(grid_size.x, work_group_size.x);
wgs.y = DivideRoundUp(grid_size.y, work_group_size.y);
wgs.z = DivideRoundUp(grid_size.z, work_group_size.z);
work_groups_count.x = wgs[work_group_launch_order[0]];
work_groups_count.y = wgs[work_group_launch_order[1]];
work_groups_count.z = wgs[work_group_launch_order[2]];
}
return work_groups_count;
}
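// Worked example: grid_size = (20, 10, 5) with work_group_size = (8, 8, 4)
// gives wgs = (3, 2, 2); with work_group_launch_order = (1, 0, 2) the
// dispatched work_groups_count becomes (wgs[1], wgs[0], wgs[2]) = (2, 3, 2).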
} // namespace
void ComputeTask::Init(std::unique_ptr<ComputeTaskDescriptor>&& task_desc) {
task_desc_ = std::move(task_desc);
}
void ComputeTask::Init(std::unique_ptr<GPUOperation>&& operation) {
operation_ = std::move(operation);
}
absl::Status ComputeTask::Compile(CalculationsPrecision precision,
MetalDevice* device) {
task_desc_->AssembleCode();
@@ -111,6 +143,71 @@ absl::Status ComputeTask::Compile(CalculationsPrecision precision,
return absl::OkStatus();
}
absl::Status ComputeTask::CompileOp(MetalDevice* device) {
operation_->AssembleCode(device->GetInfo());
const std::map<std::string, std::string> linkables = {
{operation_->dst_tensors_names_[0], operation_->elementwise_code_}};
RETURN_IF_ERROR(metal_args_.Init(linkables, device, &operation_->args_,
&operation_->code_));
operation_->args_.ReleaseCPURepresentation();
NSString* storageType;
NSString* accumulatorType;
NSString* toAccumulatorType = @"";
NSString* toAccumulatorType2 = @"";
NSString* toAccumulatorType3 = @"";
NSString* toAccumulatorType4 = @"";
if (operation_->definition_.precision == CalculationsPrecision::F32) {
storageType = @"float";
accumulatorType = @"float";
} else {
// FP16
storageType = @"half";
if (operation_->definition_.precision == CalculationsPrecision::F32_F16) {
accumulatorType = @"float";
toAccumulatorType = @"float";
toAccumulatorType2 = @"float2";
toAccumulatorType3 = @"float3";
toAccumulatorType4 = @"float4";
} else {
accumulatorType = @"half";
}
}
NSDictionary<NSString*, NSString*>* macros = @{
@"FLT" : storageType,
@"FLT2" : [NSString stringWithFormat:@"%@2", storageType],
@"FLT3" : [NSString stringWithFormat:@"%@3", storageType],
@"FLT4" : [NSString stringWithFormat:@"%@4", storageType],
@"ACCUM_FLT" : accumulatorType,
@"ACCUM_FLT2" : [NSString stringWithFormat:@"%@2", accumulatorType],
@"ACCUM_FLT3" : [NSString stringWithFormat:@"%@3", accumulatorType],
@"ACCUM_FLT4" : [NSString stringWithFormat:@"%@4", accumulatorType],
@"TO_ACCUM_TYPE" : toAccumulatorType,
@"TO_ACCUM2_TYPE" : toAccumulatorType2,
@"TO_ACCUM3_TYPE" : toAccumulatorType3,
@"TO_ACCUM4_TYPE" : toAccumulatorType4,
@"MAIN_FUNCTION" : @"\"kernel void ComputeFunction\"",
@"GLOBAL_ID_0" : @"static_cast<int>(reserved_gid.x)",
@"GLOBAL_ID_1" : @"static_cast<int>(reserved_gid.y)",
@"GLOBAL_ID_2" : @"static_cast<int>(reserved_gid.z)",
@"INIT_FLT(value)" : [NSString stringWithFormat:@"%@(value)", storageType],
@"INIT_FLT4(value)" :
[NSString stringWithFormat:@"%@4(value)", storageType],
};
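// Illustration of the substitution: in F32_F16 precision a kernel line such as
//   ACCUM_FLT4 sum = TO_ACCUM4_TYPE(src);
// is rewritten to
//   float4 sum = float4(src);
// while storage-typed FLT4 values stay half4.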
NSString* code =
[NSString stringWithCString:operation_->code_.c_str()
encoding:[NSString defaultCStringEncoding]];
id<MTLComputePipelineState> program;
RETURN_IF_ERROR(CreateComputeProgram(device->device(), code,
@"ComputeFunction", macros, &program));
if (!program) {
return absl::InternalError("Unknown shader compilation error");
}
program_ = program;
return absl::OkStatus();
}
absl::Status ComputeTask::UpdateParams(const GpuInfo& gpu_info,
const std::vector<BHWC>& src_shapes,
const std::vector<BHWC>& dst_shapes) {
@@ -137,6 +234,33 @@ absl::Status ComputeTask::UpdateParams(const GpuInfo& gpu_info,
return absl::OkStatus();
}
absl::Status ComputeTask::UpdateOpParams() {
for (int i = 0; i < operation_->src_tensors_names_.size(); ++i) {
const auto* metal_spatial_tensor =
dynamic_cast<const MetalSpatialTensor*>(operation_->src_[i]);
if (!metal_spatial_tensor) {
return absl::InvalidArgumentError("Expected MetalSpatialTensor.");
}
RETURN_IF_ERROR(metal_args_.SetObjectRef(operation_->src_tensors_names_[i],
*metal_spatial_tensor));
}
for (int i = 0; i < operation_->dst_tensors_names_.size(); ++i) {
const auto* metal_spatial_tensor =
dynamic_cast<const MetalSpatialTensor*>(operation_->dst_[i]);
if (!metal_spatial_tensor) {
return absl::InvalidArgumentError("Expected MetalSpatialTensor.");
}
RETURN_IF_ERROR(metal_args_.SetObjectRef(operation_->dst_tensors_names_[i],
*metal_spatial_tensor));
}
RETURN_IF_ERROR(operation_->BindArguments(&metal_args_));
operation_->grid_size_ = operation_->GetGridSize();
operation_->work_groups_count_ = GetWorkGroupsCount(
operation_->grid_dimension_, operation_->grid_size_,
operation_->work_group_size_, operation_->work_group_launch_order_);
return absl::OkStatus();
}
void ComputeTask::EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder) {
// The dispatch call is intended to be skipped.
if (groups_count_.x * groups_count_.y * groups_count_.z == 0) {
@@ -145,8 +269,7 @@ void ComputeTask::EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder) {
[encoder setComputePipelineState:program_];
-  int bindIndex = 0;
-  metal_args_.Encode(encoder, bindIndex);
+  metal_args_.Encode(encoder, 0);
MTLSize groupsCount =
MTLSizeMake(groups_count_.x, groups_count_.y, groups_count_.z);
@@ -155,6 +278,19 @@ void ComputeTask::EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder) {
[encoder dispatchThreadgroups:groupsCount threadsPerThreadgroup:groupsSize];
}
void ComputeTask::EncodeOpWithEncoder(id<MTLComputeCommandEncoder> encoder) {
[encoder setComputePipelineState:program_];
metal_args_.Encode(encoder, 0);
MTLSize groupsCount, groupsSize;
groupsCount.width = operation_->work_groups_count_.x;
groupsCount.height = operation_->work_groups_count_.y;
groupsCount.depth = operation_->work_groups_count_.z;
groupsSize.width = operation_->work_group_size_.x;
groupsSize.height = operation_->work_group_size_.y;
groupsSize.depth = operation_->work_group_size_.z;
[encoder dispatchThreadgroups:groupsCount threadsPerThreadgroup:groupsSize];
}
void ComputeTask::SetSrcTensor(const MetalSpatialTensor& tensor, int index) {
auto status =
metal_args_.SetObjectRef(task_desc_->src_tensors_names[index], tensor);


@@ -27,6 +27,7 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/common/precision.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/metal/common.h"
#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
#include "tensorflow/lite/delegates/gpu/metal/metal_arguments.h"
@@ -49,25 +50,35 @@ class ComputeTask {
void Init(std::unique_ptr<ComputeTaskDescriptor>&& task_desc);
void Init(std::unique_ptr<GPUOperation>&& operation);
ComputeTaskDescriptor& GetTaskDesc() { return *task_desc_; }
const ComputeTaskDescriptor& GetTaskDesc() const { return *task_desc_; }
/// Returns empty string or error if shader can't be compiled.
absl::Status Compile(CalculationsPrecision precision, MetalDevice* device);
absl::Status CompileOp(MetalDevice* device);
/// Updates parameters for inputs/outputs/intermediate tensors
absl::Status UpdateParams(const GpuInfo& gpu_info,
const std::vector<BHWC>& src_shapes,
const std::vector<BHWC>& dst_shapes);
// should be called after changes of inputs/outputs.
absl::Status UpdateOpParams();
void EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder);
void EncodeOpWithEncoder(id<MTLComputeCommandEncoder> encoder);
void SetSrcTensor(const MetalSpatialTensor& tensor, int index);
void SetDstTensor(const MetalSpatialTensor& tensor, int index);
private:
std::unique_ptr<ComputeTaskDescriptor> task_desc_;
std::unique_ptr<GPUOperation> operation_ = nullptr;
id<MTLComputePipelineState> program_;
MetalArguments metal_args_;
uint3 groups_size_;


@@ -64,6 +64,7 @@ objc_library(
deps = [
":add",
":test_util",
"//tensorflow/lite/delegates/gpu/common/tasks:add_test_util",
],
)
@@ -812,6 +813,8 @@ objc_library(
"//tensorflow/lite/delegates/gpu/common:tensor",
"//tensorflow/lite/delegates/gpu/common:types",
"//tensorflow/lite/delegates/gpu/common:util",
"//tensorflow/lite/delegates/gpu/common/task:gpu_operation",
"//tensorflow/lite/delegates/gpu/common/task:testing_util",
"//tensorflow/lite/delegates/gpu/metal:common",
"//tensorflow/lite/delegates/gpu/metal:inference_context",
"//tensorflow/lite/delegates/gpu/metal:metal_device",
@@ -897,6 +900,7 @@ objc_library(
"//tensorflow/lite/delegates/gpu/common:shape",
"//tensorflow/lite/delegates/gpu/common:types",
"//tensorflow/lite/delegates/gpu/common:util",
"//tensorflow/lite/delegates/gpu/common/tasks:add_test_util",
"//tensorflow/lite/delegates/gpu/metal:common",
"//tensorflow/lite/delegates/gpu/metal:inference_context",
"//tensorflow/lite/kernels/internal:quantization_util",


@@ -27,6 +27,7 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/test_util.h"
#include "tensorflow/lite/delegates/gpu/common/tasks/add_test_util.h"
using ::tflite::gpu::ElementwiseAttributes;
using ::tflite::gpu::BHWC;
@@ -123,4 +124,22 @@ using ::tflite::gpu::metal::SingleOpModel;
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
}
- (void)testAddTwoEqualTensors {
tflite::gpu::metal::MetalExecutionEnvironment exec_env_;
auto status = AddTwoEqualTensorsTest(&exec_env_);
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
}
- (void)testAddFirstTensorHasMoreChannelsThanSecond {
tflite::gpu::metal::MetalExecutionEnvironment exec_env_;
auto status = AddFirstTensorHasMoreChannelsThanSecondTest(&exec_env_);
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
}
- (void)testAddFirstTensorHasLessChannelsThanSecond {
tflite::gpu::metal::MetalExecutionEnvironment exec_env_;
auto status = AddFirstTensorHasLessChannelsThanSecond(&exec_env_);
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
}
@end
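
The three test cases above delegate to the shared Add tests in common/tasks/add_test_util, which drive the new ExecuteGPUOperation overload end to end. Roughly, inside an absl::Status-returning helper taking a TestExecutionEnvironment* env, AddTwoEqualTensorsTest does something like the sketch below; the exact tensor shapes, values, and the CreateAdd signature are recalled from the common Add task and may differ slightly:

  TensorFloat32 src;
  src.shape = BHWC(1, 2, 2, 2);
  src.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
  OperationDef op_def;
  op_def.precision = CalculationsPrecision::F32;
  op_def.src_tensors.push_back({DataType::FLOAT32, TensorStorageType::BUFFER, Layout::HWC});
  op_def.src_tensors.push_back({DataType::FLOAT32, TensorStorageType::BUFFER, Layout::HWC});
  op_def.dst_tensors.push_back({DataType::FLOAT32, TensorStorageType::BUFFER, Layout::HWC});
  GPUOperation operation = CreateAdd(op_def, /*channels=*/{2, 2}, /*dst_channels=*/2);
  TensorFloat32 dst;
  RETURN_IF_ERROR(env->ExecuteGPUOperation(
      {src, src}, absl::make_unique<GPUOperation>(std::move(operation)),
      /*dst_sizes=*/{BHWC(1, 2, 2, 2)}, /*dst_cpu=*/{&dst}));
  // dst.data should now hold the element-wise sum, i.e. each input value doubled.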


@@ -184,6 +184,72 @@ MetalExecutionEnvironment::GetSupportedStoragesWithHWZeroClampSupport() const {
return {};
}
absl::Status MetalExecutionEnvironment::ExecuteGPUOperation(
const std::vector<TensorFloat32>& src_cpu,
std::unique_ptr<GPUOperation>&& operation,
const std::vector<BHWC>& dst_sizes,
const std::vector<TensorFloat32*>& dst_cpu) {
const OperationDef op_def = operation->GetDefinition();
std::vector<MetalSpatialTensor> src(src_cpu.size());
for (int i = 0; i < src_cpu.size(); ++i) {
auto src_shape = src_cpu[i].shape;
if (src_shape.b != 1 && !op_def.IsBatchSupported()) {
return absl::InvalidArgumentError(
"Layout doesn't have Batch dimension, but shape.b != 1");
}
RETURN_IF_ERROR(CreateTensor(device_.device(), src_shape,
op_def.src_tensors[i], &src[i]));
RETURN_IF_ERROR(src[i].WriteData(src_cpu[i]));
operation->SetSrc(&src[i], i);
}
std::vector<MetalSpatialTensor> dst(dst_cpu.size());
for (int i = 0; i < dst_cpu.size(); ++i) {
auto dst_shape = dst_sizes[i];
if (dst_shape.b != 1 && !op_def.IsBatchSupported()) {
return absl::InvalidArgumentError(
"Layout doesn't have Batch dimension, but shape.b != 1");
}
RETURN_IF_ERROR(CreateTensor(device_.device(), dst_shape,
op_def.dst_tensors[i], &dst[i]));
operation->SetDst(&dst[i], i);
}
std::vector<BHWC> src_shapes;
std::vector<BHWC> dst_shapes;
std::vector<ValueId> src_ids;
std::vector<ValueId> dst_ids;
for (int i = 0; i < src_cpu.size(); ++i) {
src_ids.push_back(i);
src_shapes.push_back(src_cpu[i].shape);
}
for (int i = 0; i < dst_cpu.size(); ++i) {
dst_ids.push_back(src_cpu.size() + i);
dst_shapes.push_back(dst_sizes[i]);
}
ComputeTask gpu_task;
gpu_task.Init(std::move(operation));
RETURN_IF_ERROR(gpu_task.CompileOp(&device_));
RETURN_IF_ERROR(gpu_task.UpdateOpParams());
id<MTLCommandQueue> command_queue = [device_.device() newCommandQueue];
id<MTLCommandBuffer> command_buffer = [command_queue commandBuffer];
id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];
gpu_task.EncodeOpWithEncoder(encoder);
[encoder endEncoding];
[command_buffer commit];
[command_buffer waitUntilCompleted];
for (int i = 0; i < dst_cpu.size(); ++i) {
dst_cpu[i]->shape = dst_sizes[i];
dst_cpu[i]->data = std::vector<float>(dst_sizes[i].DimensionsProduct(), 0);
RETURN_IF_ERROR(dst[i].ReadData(dst_cpu[i]));
}
return absl::OkStatus();
}
absl::Status MetalExecutionEnvironment::ExecuteGPUOperation(
const std::vector<TensorFloat32>& src_cpu,
std::unique_ptr<ComputeTaskDescriptor>&& operation,


@@ -24,6 +24,8 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/task/testing_util.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
#include "tensorflow/lite/delegates/gpu/metal/inference_context.h"
@@ -61,20 +63,24 @@ class SingleOpModel {
absl::Status CompareVectors(const std::vector<float>& reference,
const std::vector<float>& output, float max_error);
-class MetalExecutionEnvironment {
+class MetalExecutionEnvironment : public TestExecutionEnvironment {
 public:
  MetalExecutionEnvironment() = default;
  ~MetalExecutionEnvironment() = default;
-  std::vector<CalculationsPrecision> GetSupportedPrecisions() const;
-  std::vector<TensorStorageType> GetSupportedStorages() const;
+  std::vector<CalculationsPrecision> GetSupportedPrecisions() const override;
+  std::vector<TensorStorageType> GetSupportedStorages() const override;
  // returns storage types that support zero clamping when reading OOB in HW
  // (Height/Width) dimensions.
  std::vector<TensorStorageType> GetSupportedStoragesWithHWZeroClampSupport()
-      const;
+      const override;
const GpuInfo& GetGpuInfo() const { return device_.GetInfo(); }
absl::Status ExecuteGPUOperation(
const std::vector<TensorFloat32>& src_cpu,
std::unique_ptr<GPUOperation>&& operation,
const std::vector<BHWC>& dst_sizes,
const std::vector<TensorFloat32*>& dst_cpu) override;
absl::Status ExecuteGPUOperation(
const std::vector<TensorFloat32>& src_cpu,
std::unique_ptr<ComputeTaskDescriptor>&& operation,