Added support of some GPUOperations in ComputeTask.
ComputeTask can be created from GPUOperation now. Test of Add created from GPUOperation in Metal backend. PiperOrigin-RevId: 351860267 Change-Id: I0e622f6efdee5f40043f293034b4d84298387e9d
This commit is contained in:
parent
c1336b952d
commit
a9aee8f713
@ -38,6 +38,9 @@ namespace gpu {
|
||||
namespace cl {
|
||||
class ClOperation;
|
||||
}
|
||||
namespace metal {
|
||||
class ComputeTask;
|
||||
}
|
||||
|
||||
// kCustom: default value
|
||||
// GPUOperation::GetGridSize must be overloaded
|
||||
@ -145,6 +148,7 @@ class GPUOperation {
|
||||
|
||||
protected:
|
||||
friend class cl::ClOperation;
|
||||
friend class metal::ComputeTask;
|
||||
friend flatbuffers::Offset<tflite::gpu::data::GPUOperation> Encode(
|
||||
const GPUOperation& op, flatbuffers::FlatBufferBuilder* builder);
|
||||
friend absl::Status Decode(const tflite::gpu::data::GPUOperation* fb_op,
|
||||
|
@ -121,6 +121,7 @@ objc_library(
|
||||
"//tensorflow/lite/delegates/gpu/common:status",
|
||||
"//tensorflow/lite/delegates/gpu/common:types",
|
||||
"//tensorflow/lite/delegates/gpu/common:util",
|
||||
"//tensorflow/lite/delegates/gpu/common/task:gpu_operation",
|
||||
"@com_google_absl//absl/strings",
|
||||
],
|
||||
)
|
||||
|
@ -33,11 +33,43 @@ limitations under the License.
|
||||
namespace tflite {
|
||||
namespace gpu {
|
||||
namespace metal {
|
||||
namespace {
|
||||
int3 GetWorkGroupsCount(int grid_dimension, const int3& grid_size,
|
||||
const int3& work_group_size,
|
||||
const int3& work_group_launch_order) {
|
||||
int3 work_groups_count;
|
||||
if (grid_dimension == 1) {
|
||||
work_groups_count.x = DivideRoundUp(grid_size.x, work_group_size.x);
|
||||
work_groups_count.y = 1;
|
||||
work_groups_count.z = 1;
|
||||
} else if (grid_dimension == 2) {
|
||||
int3 wgs;
|
||||
wgs.x = DivideRoundUp(grid_size.x, work_group_size.x);
|
||||
wgs.y = DivideRoundUp(grid_size.y, work_group_size.y);
|
||||
work_groups_count.x = wgs[work_group_launch_order[0]];
|
||||
work_groups_count.y = wgs[work_group_launch_order[1]];
|
||||
work_groups_count.z = 1;
|
||||
} else { // grid_dimension == 3
|
||||
int3 wgs;
|
||||
wgs.x = DivideRoundUp(grid_size.x, work_group_size.x);
|
||||
wgs.y = DivideRoundUp(grid_size.y, work_group_size.y);
|
||||
wgs.z = DivideRoundUp(grid_size.z, work_group_size.z);
|
||||
work_groups_count.x = wgs[work_group_launch_order[0]];
|
||||
work_groups_count.y = wgs[work_group_launch_order[1]];
|
||||
work_groups_count.z = wgs[work_group_launch_order[2]];
|
||||
}
|
||||
return work_groups_count;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void ComputeTask::Init(std::unique_ptr<ComputeTaskDescriptor>&& task_desc) {
|
||||
task_desc_ = std::move(task_desc);
|
||||
}
|
||||
|
||||
void ComputeTask::Init(std::unique_ptr<GPUOperation>&& operation) {
|
||||
operation_ = std::move(operation);
|
||||
}
|
||||
|
||||
absl::Status ComputeTask::Compile(CalculationsPrecision precision,
|
||||
MetalDevice* device) {
|
||||
task_desc_->AssembleCode();
|
||||
@ -111,6 +143,71 @@ absl::Status ComputeTask::Compile(CalculationsPrecision precision,
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
absl::Status ComputeTask::CompileOp(MetalDevice* device) {
|
||||
operation_->AssembleCode(device->GetInfo());
|
||||
const std::map<std::string, std::string> linkables = {
|
||||
{operation_->dst_tensors_names_[0], operation_->elementwise_code_}};
|
||||
RETURN_IF_ERROR(metal_args_.Init(linkables, device, &operation_->args_,
|
||||
&operation_->code_));
|
||||
|
||||
operation_->args_.ReleaseCPURepresentation();
|
||||
NSString* storageType;
|
||||
NSString* accumulatorType;
|
||||
NSString* toAccumulatorType = @"";
|
||||
NSString* toAccumulatorType2 = @"";
|
||||
NSString* toAccumulatorType3 = @"";
|
||||
NSString* toAccumulatorType4 = @"";
|
||||
if (operation_->definition_.precision == CalculationsPrecision::F32) {
|
||||
storageType = @"float";
|
||||
accumulatorType = @"float";
|
||||
} else {
|
||||
// FP16
|
||||
storageType = @"half";
|
||||
if (operation_->definition_.precision == CalculationsPrecision::F32_F16) {
|
||||
accumulatorType = @"float";
|
||||
toAccumulatorType = @"float";
|
||||
toAccumulatorType2 = @"float2";
|
||||
toAccumulatorType3 = @"float3";
|
||||
toAccumulatorType4 = @"float4";
|
||||
} else {
|
||||
accumulatorType = @"half";
|
||||
}
|
||||
}
|
||||
NSDictionary<NSString*, NSString*>* macros = @{
|
||||
@"FLT" : storageType,
|
||||
@"FLT2" : [NSString stringWithFormat:@"%@2", storageType],
|
||||
@"FLT3" : [NSString stringWithFormat:@"%@3", storageType],
|
||||
@"FLT4" : [NSString stringWithFormat:@"%@4", storageType],
|
||||
@"ACCUM_FLT" : accumulatorType,
|
||||
@"ACCUM_FLT2" : [NSString stringWithFormat:@"%@2", accumulatorType],
|
||||
@"ACCUM_FLT3" : [NSString stringWithFormat:@"%@3", accumulatorType],
|
||||
@"ACCUM_FLT4" : [NSString stringWithFormat:@"%@4", accumulatorType],
|
||||
@"TO_ACCUM_TYPE" : toAccumulatorType,
|
||||
@"TO_ACCUM2_TYPE" : toAccumulatorType2,
|
||||
@"TO_ACCUM3_TYPE" : toAccumulatorType3,
|
||||
@"TO_ACCUM4_TYPE" : toAccumulatorType4,
|
||||
@"MAIN_FUNCTION" : @"\"kernel void ComputeFunction\"",
|
||||
@"GLOBAL_ID_0" : @"static_cast<int>(reserved_gid.x)",
|
||||
@"GLOBAL_ID_1" : @"static_cast<int>(reserved_gid.y)",
|
||||
@"GLOBAL_ID_2" : @"static_cast<int>(reserved_gid.z)",
|
||||
@"INIT_FLT(value)" : [NSString stringWithFormat:@"%@(value)", storageType],
|
||||
@"INIT_FLT4(value)" :
|
||||
[NSString stringWithFormat:@"%@4(value)", storageType],
|
||||
};
|
||||
|
||||
NSString* code =
|
||||
[NSString stringWithCString:operation_->code_.c_str()
|
||||
encoding:[NSString defaultCStringEncoding]];
|
||||
id<MTLComputePipelineState> program;
|
||||
RETURN_IF_ERROR(CreateComputeProgram(device->device(), code,
|
||||
@"ComputeFunction", macros, &program));
|
||||
if (!program) {
|
||||
return absl::InternalError("Unknown shader compilation error");
|
||||
}
|
||||
program_ = program;
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
absl::Status ComputeTask::UpdateParams(const GpuInfo& gpu_info,
|
||||
const std::vector<BHWC>& src_shapes,
|
||||
const std::vector<BHWC>& dst_shapes) {
|
||||
@ -137,6 +234,33 @@ absl::Status ComputeTask::UpdateParams(const GpuInfo& gpu_info,
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
absl::Status ComputeTask::UpdateOpParams() {
|
||||
for (int i = 0; i < operation_->src_tensors_names_.size(); ++i) {
|
||||
const auto* metal_spatial_tensor =
|
||||
dynamic_cast<const MetalSpatialTensor*>(operation_->src_[i]);
|
||||
if (!metal_spatial_tensor) {
|
||||
return absl::InvalidArgumentError("Expected MetalSpatialTensor.");
|
||||
}
|
||||
RETURN_IF_ERROR(metal_args_.SetObjectRef(operation_->src_tensors_names_[i],
|
||||
*metal_spatial_tensor));
|
||||
}
|
||||
for (int i = 0; i < operation_->dst_tensors_names_.size(); ++i) {
|
||||
const auto* metal_spatial_tensor =
|
||||
dynamic_cast<const MetalSpatialTensor*>(operation_->dst_[i]);
|
||||
if (!metal_spatial_tensor) {
|
||||
return absl::InvalidArgumentError("Expected MetalSpatialTensor.");
|
||||
}
|
||||
RETURN_IF_ERROR(metal_args_.SetObjectRef(operation_->dst_tensors_names_[i],
|
||||
*metal_spatial_tensor));
|
||||
}
|
||||
RETURN_IF_ERROR(operation_->BindArguments(&metal_args_));
|
||||
operation_->grid_size_ = operation_->GetGridSize();
|
||||
operation_->work_groups_count_ = GetWorkGroupsCount(
|
||||
operation_->grid_dimension_, operation_->grid_size_,
|
||||
operation_->work_group_size_, operation_->work_group_launch_order_);
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
void ComputeTask::EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder) {
|
||||
// The dispatch call is intended to be skipped.
|
||||
if (groups_count_.x * groups_count_.y * groups_count_.z == 0) {
|
||||
@ -145,8 +269,7 @@ void ComputeTask::EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder) {
|
||||
|
||||
[encoder setComputePipelineState:program_];
|
||||
|
||||
int bindIndex = 0;
|
||||
metal_args_.Encode(encoder, bindIndex);
|
||||
metal_args_.Encode(encoder, 0);
|
||||
|
||||
MTLSize groupsCount =
|
||||
MTLSizeMake(groups_count_.x, groups_count_.y, groups_count_.z);
|
||||
@ -155,6 +278,19 @@ void ComputeTask::EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder) {
|
||||
[encoder dispatchThreadgroups:groupsCount threadsPerThreadgroup:groupsSize];
|
||||
}
|
||||
|
||||
void ComputeTask::EncodeOpWithEncoder(id<MTLComputeCommandEncoder> encoder) {
|
||||
[encoder setComputePipelineState:program_];
|
||||
metal_args_.Encode(encoder, 0);
|
||||
MTLSize groupsCount, groupsSize;
|
||||
groupsCount.width = operation_->work_groups_count_.x;
|
||||
groupsCount.height = operation_->work_groups_count_.y;
|
||||
groupsCount.depth = operation_->work_groups_count_.z;
|
||||
groupsSize.width = operation_->work_group_size_.x;
|
||||
groupsSize.height = operation_->work_group_size_.y;
|
||||
groupsSize.depth = operation_->work_group_size_.z;
|
||||
[encoder dispatchThreadgroups:groupsCount threadsPerThreadgroup:groupsSize];
|
||||
}
|
||||
|
||||
void ComputeTask::SetSrcTensor(const MetalSpatialTensor& tensor, int index) {
|
||||
auto status =
|
||||
metal_args_.SetObjectRef(task_desc_->src_tensors_names[index], tensor);
|
||||
|
@ -27,6 +27,7 @@ limitations under the License.
|
||||
#include "tensorflow/lite/delegates/gpu/common/precision.h"
|
||||
#include "tensorflow/lite/delegates/gpu/common/shape.h"
|
||||
#include "tensorflow/lite/delegates/gpu/common/status.h"
|
||||
#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
|
||||
#include "tensorflow/lite/delegates/gpu/metal/common.h"
|
||||
#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
|
||||
#include "tensorflow/lite/delegates/gpu/metal/metal_arguments.h"
|
||||
@ -49,25 +50,35 @@ class ComputeTask {
|
||||
|
||||
void Init(std::unique_ptr<ComputeTaskDescriptor>&& task_desc);
|
||||
|
||||
void Init(std::unique_ptr<GPUOperation>&& operation);
|
||||
|
||||
ComputeTaskDescriptor& GetTaskDesc() { return *task_desc_; }
|
||||
const ComputeTaskDescriptor& GetTaskDesc() const { return *task_desc_; }
|
||||
|
||||
/// Returns empty string or error if shader can't be compiled.
|
||||
absl::Status Compile(CalculationsPrecision precision, MetalDevice* device);
|
||||
|
||||
absl::Status CompileOp(MetalDevice* device);
|
||||
|
||||
/// Updates parameters for inputs/outputs/intermediate tensors
|
||||
absl::Status UpdateParams(const GpuInfo& gpu_info,
|
||||
const std::vector<BHWC>& src_shapes,
|
||||
const std::vector<BHWC>& dst_shapes);
|
||||
|
||||
// should be called after changes of inputs/outputs.
|
||||
absl::Status UpdateOpParams();
|
||||
|
||||
void EncodeWithEncoder(id<MTLComputeCommandEncoder> encoder);
|
||||
|
||||
void EncodeOpWithEncoder(id<MTLComputeCommandEncoder> encoder);
|
||||
|
||||
void SetSrcTensor(const MetalSpatialTensor& tensor, int index);
|
||||
|
||||
void SetDstTensor(const MetalSpatialTensor& tensor, int index);
|
||||
|
||||
private:
|
||||
std::unique_ptr<ComputeTaskDescriptor> task_desc_;
|
||||
std::unique_ptr<GPUOperation> operation_ = nullptr;
|
||||
id<MTLComputePipelineState> program_;
|
||||
MetalArguments metal_args_;
|
||||
uint3 groups_size_;
|
||||
|
@ -64,6 +64,7 @@ objc_library(
|
||||
deps = [
|
||||
":add",
|
||||
":test_util",
|
||||
"//tensorflow/lite/delegates/gpu/common/tasks:add_test_util",
|
||||
],
|
||||
)
|
||||
|
||||
@ -812,6 +813,8 @@ objc_library(
|
||||
"//tensorflow/lite/delegates/gpu/common:tensor",
|
||||
"//tensorflow/lite/delegates/gpu/common:types",
|
||||
"//tensorflow/lite/delegates/gpu/common:util",
|
||||
"//tensorflow/lite/delegates/gpu/common/task:gpu_operation",
|
||||
"//tensorflow/lite/delegates/gpu/common/task:testing_util",
|
||||
"//tensorflow/lite/delegates/gpu/metal:common",
|
||||
"//tensorflow/lite/delegates/gpu/metal:inference_context",
|
||||
"//tensorflow/lite/delegates/gpu/metal:metal_device",
|
||||
@ -897,6 +900,7 @@ objc_library(
|
||||
"//tensorflow/lite/delegates/gpu/common:shape",
|
||||
"//tensorflow/lite/delegates/gpu/common:types",
|
||||
"//tensorflow/lite/delegates/gpu/common:util",
|
||||
"//tensorflow/lite/delegates/gpu/common/tasks:add_test_util",
|
||||
"//tensorflow/lite/delegates/gpu/metal:common",
|
||||
"//tensorflow/lite/delegates/gpu/metal:inference_context",
|
||||
"//tensorflow/lite/kernels/internal:quantization_util",
|
||||
|
@ -27,6 +27,7 @@ limitations under the License.
|
||||
#include "tensorflow/lite/delegates/gpu/common/util.h"
|
||||
#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
|
||||
#include "tensorflow/lite/delegates/gpu/metal/kernels/test_util.h"
|
||||
#include "tensorflow/lite/delegates/gpu/common/tasks/add_test_util.h"
|
||||
|
||||
using ::tflite::gpu::ElementwiseAttributes;
|
||||
using ::tflite::gpu::BHWC;
|
||||
@ -123,4 +124,22 @@ using ::tflite::gpu::metal::SingleOpModel;
|
||||
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
|
||||
}
|
||||
|
||||
- (void)testAddTwoEqualTensors {
|
||||
tflite::gpu::metal::MetalExecutionEnvironment exec_env_;
|
||||
auto status = AddTwoEqualTensorsTest(&exec_env_);
|
||||
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
|
||||
}
|
||||
|
||||
- (void)testAddFirstTensorHasMoreChannelsThanSecond {
|
||||
tflite::gpu::metal::MetalExecutionEnvironment exec_env_;
|
||||
auto status = AddFirstTensorHasMoreChannelsThanSecondTest(&exec_env_);
|
||||
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
|
||||
}
|
||||
|
||||
- (void)testAddFirstTensorHasLessChannelsThanSecond {
|
||||
tflite::gpu::metal::MetalExecutionEnvironment exec_env_;
|
||||
auto status = AddFirstTensorHasLessChannelsThanSecond(&exec_env_);
|
||||
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
|
||||
}
|
||||
|
||||
@end
|
||||
|
@ -184,6 +184,72 @@ MetalExecutionEnvironment::GetSupportedStoragesWithHWZeroClampSupport() const {
|
||||
return {};
|
||||
}
|
||||
|
||||
absl::Status MetalExecutionEnvironment::ExecuteGPUOperation(
|
||||
const std::vector<TensorFloat32>& src_cpu,
|
||||
std::unique_ptr<GPUOperation>&& operation,
|
||||
const std::vector<BHWC>& dst_sizes,
|
||||
const std::vector<TensorFloat32*>& dst_cpu) {
|
||||
const OperationDef op_def = operation->GetDefinition();
|
||||
std::vector<MetalSpatialTensor> src(src_cpu.size());
|
||||
for (int i = 0; i < src_cpu.size(); ++i) {
|
||||
auto src_shape = src_cpu[i].shape;
|
||||
if (src_shape.b != 1 && !op_def.IsBatchSupported()) {
|
||||
return absl::InvalidArgumentError(
|
||||
"Layout doesn't have Batch dimension, but shape.b != 1");
|
||||
}
|
||||
RETURN_IF_ERROR(CreateTensor(device_.device(), src_shape,
|
||||
op_def.src_tensors[i], &src[i]));
|
||||
RETURN_IF_ERROR(src[i].WriteData(src_cpu[i]));
|
||||
operation->SetSrc(&src[i], i);
|
||||
}
|
||||
|
||||
std::vector<MetalSpatialTensor> dst(dst_cpu.size());
|
||||
for (int i = 0; i < dst_cpu.size(); ++i) {
|
||||
auto dst_shape = dst_sizes[i];
|
||||
if (dst_shape.b != 1 && !op_def.IsBatchSupported()) {
|
||||
return absl::InvalidArgumentError(
|
||||
"Layout doesn't have Batch dimension, but shape.b != 1");
|
||||
}
|
||||
RETURN_IF_ERROR(CreateTensor(device_.device(), dst_shape,
|
||||
op_def.dst_tensors[i], &dst[i]));
|
||||
operation->SetDst(&dst[i], i);
|
||||
}
|
||||
|
||||
std::vector<BHWC> src_shapes;
|
||||
std::vector<BHWC> dst_shapes;
|
||||
std::vector<ValueId> src_ids;
|
||||
std::vector<ValueId> dst_ids;
|
||||
for (int i = 0; i < src_cpu.size(); ++i) {
|
||||
src_ids.push_back(i);
|
||||
src_shapes.push_back(src_cpu[i].shape);
|
||||
}
|
||||
for (int i = 0; i < dst_cpu.size(); ++i) {
|
||||
dst_ids.push_back(src_cpu.size() + i);
|
||||
dst_shapes.push_back(dst_sizes[i]);
|
||||
}
|
||||
|
||||
ComputeTask gpu_task;
|
||||
gpu_task.Init(std::move(operation));
|
||||
RETURN_IF_ERROR(gpu_task.CompileOp(&device_));
|
||||
RETURN_IF_ERROR(gpu_task.UpdateOpParams());
|
||||
|
||||
id<MTLCommandQueue> command_queue = [device_.device() newCommandQueue];
|
||||
id<MTLCommandBuffer> command_buffer = [command_queue commandBuffer];
|
||||
id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];
|
||||
gpu_task.EncodeOpWithEncoder(encoder);
|
||||
[encoder endEncoding];
|
||||
[command_buffer commit];
|
||||
[command_buffer waitUntilCompleted];
|
||||
|
||||
for (int i = 0; i < dst_cpu.size(); ++i) {
|
||||
dst_cpu[i]->shape = dst_sizes[i];
|
||||
dst_cpu[i]->data = std::vector<float>(dst_sizes[i].DimensionsProduct(), 0);
|
||||
RETURN_IF_ERROR(dst[i].ReadData(dst_cpu[i]));
|
||||
}
|
||||
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
absl::Status MetalExecutionEnvironment::ExecuteGPUOperation(
|
||||
const std::vector<TensorFloat32>& src_cpu,
|
||||
std::unique_ptr<ComputeTaskDescriptor>&& operation,
|
||||
|
@ -24,6 +24,8 @@ limitations under the License.
|
||||
#include "tensorflow/lite/delegates/gpu/common/model.h"
|
||||
#include "tensorflow/lite/delegates/gpu/common/shape.h"
|
||||
#include "tensorflow/lite/delegates/gpu/common/status.h"
|
||||
#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
|
||||
#include "tensorflow/lite/delegates/gpu/common/task/testing_util.h"
|
||||
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
|
||||
#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
|
||||
#include "tensorflow/lite/delegates/gpu/metal/inference_context.h"
|
||||
@ -61,20 +63,24 @@ class SingleOpModel {
|
||||
absl::Status CompareVectors(const std::vector<float>& reference,
|
||||
const std::vector<float>& output, float max_error);
|
||||
|
||||
class MetalExecutionEnvironment {
|
||||
class MetalExecutionEnvironment : public TestExecutionEnvironment {
|
||||
public:
|
||||
MetalExecutionEnvironment() = default;
|
||||
~MetalExecutionEnvironment() = default;
|
||||
|
||||
std::vector<CalculationsPrecision> GetSupportedPrecisions() const;
|
||||
std::vector<TensorStorageType> GetSupportedStorages() const;
|
||||
// returns storage types that support zero clamping when reading OOB in HW
|
||||
// (Height/Width) dimensions.
|
||||
std::vector<CalculationsPrecision> GetSupportedPrecisions() const override;
|
||||
std::vector<TensorStorageType> GetSupportedStorages() const override;
|
||||
std::vector<TensorStorageType> GetSupportedStoragesWithHWZeroClampSupport()
|
||||
const;
|
||||
const override;
|
||||
|
||||
const GpuInfo& GetGpuInfo() const { return device_.GetInfo(); }
|
||||
|
||||
absl::Status ExecuteGPUOperation(
|
||||
const std::vector<TensorFloat32>& src_cpu,
|
||||
std::unique_ptr<GPUOperation>&& operation,
|
||||
const std::vector<BHWC>& dst_sizes,
|
||||
const std::vector<TensorFloat32*>& dst_cpu) override;
|
||||
|
||||
absl::Status ExecuteGPUOperation(
|
||||
const std::vector<TensorFloat32>& src_cpu,
|
||||
std::unique_ptr<ComputeTaskDescriptor>&& operation,
|
||||
|
Loading…
Reference in New Issue
Block a user