From 5a251d2975e18d2d548bded1bb2c1871cc17d51f Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Mon, 25 Jan 2021 11:08:29 -0800 Subject: [PATCH] Metal Pooling kernels replaced with pooling tasks from common/tasks. PiperOrigin-RevId: 353686250 Change-Id: Ib65111414fcef072bd71383c069e222183c24e5b --- .../lite/delegates/gpu/metal/kernels/BUILD | 17 -- .../delegates/gpu/metal/kernels/pooling.cc | 182 ------------------ .../delegates/gpu/metal/kernels/pooling.h | 37 ---- .../gpu/metal/kernels/pooling_test.mm | 96 --------- .../lite/delegates/gpu/metal/selectors/BUILD | 1 + .../gpu/metal/selectors/operation_selector.cc | 28 +-- 6 files changed, 9 insertions(+), 352 deletions(-) delete mode 100644 tensorflow/lite/delegates/gpu/metal/kernels/pooling.cc delete mode 100644 tensorflow/lite/delegates/gpu/metal/kernels/pooling.h diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD index b0eeddb60b8..ce45ca29d18 100644 --- a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD @@ -22,7 +22,6 @@ cc_library( ":conv", ":depthwise_conv", ":fully_connected", - ":pooling", ":transpose_conv", ":winograd", ], @@ -296,28 +295,12 @@ ios_unit_test( deps = [":padding_test_lib"], ) -cc_library( - name = "pooling", - srcs = ["pooling.cc"], - hdrs = ["pooling.h"], - deps = [ - ":util", - "//tensorflow/lite/delegates/gpu/common:model", - "//tensorflow/lite/delegates/gpu/common:operations", - "//tensorflow/lite/delegates/gpu/common:shape", - "//tensorflow/lite/delegates/gpu/common:util", - "//tensorflow/lite/delegates/gpu/metal:compute_task_descriptor", - "@com_google_absl//absl/strings", - ], -) - objc_library( name = "pooling_test_lib", testonly = 1, srcs = ["pooling_test.mm"], sdk_frameworks = ["XCTest"], deps = [ - ":pooling", ":test_util", "//tensorflow/lite/delegates/gpu/common/tasks:pooling_test_util", ], diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/pooling.cc b/tensorflow/lite/delegates/gpu/metal/kernels/pooling.cc deleted file mode 100644 index d33630ecb83..00000000000 --- a/tensorflow/lite/delegates/gpu/metal/kernels/pooling.cc +++ /dev/null @@ -1,182 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/lite/delegates/gpu/metal/kernels/pooling.h" - -#include -#include -#include -#include -#include - -#include "absl/strings/substitute.h" -#include "tensorflow/lite/delegates/gpu/common/model.h" -#include "tensorflow/lite/delegates/gpu/common/operations.h" -#include "tensorflow/lite/delegates/gpu/common/shape.h" -#include "tensorflow/lite/delegates/gpu/common/util.h" -#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h" -#include "tensorflow/lite/delegates/gpu/metal/kernels/util.h" - -namespace tflite { -namespace gpu { -namespace metal { -namespace { - -std::string GetMaxPoolingCode() { - std::string shader_source = R"( -kernel void ComputeFunction($0 - uint3 gid[[thread_position_in_grid]]) { - if (static_cast(gid.x) >= args.dst_tensor.Width() || - static_cast(gid.y) >= args.dst_tensor.Height() || - static_cast(gid.z) >= args.dst_tensor.Slices()) { - return; - } - - FLT4 maximum = FLT4(-10000.0); - for (int ky = 0; ky < args.kernel_size_y; ++ky) { - for (int kx = 0; kx < args.kernel_size_x; ++kx) { - int c_x = int(gid.x) * args.stride_x - args.offset_x + kx; - int c_y = int(gid.y) * args.stride_y - args.offset_y + ky; - bool outside = c_x < 0 || c_y < 0 || c_x >= args.src_tensor.Width() || - c_y >= args.src_tensor.Height(); - FLT4 src_color = outside ? FLT4(-10000.0) : args.src_tensor.Read(c_x, c_y, gid.z); - maximum = max(maximum, src_color); - } - } - args.dst_tensor.Write(maximum, gid.x, gid.y, gid.z); -} - )"; - return shader_source; -} - -std::string GetMaxPoolingIndicesCode() { - std::string shader_source = R"( -kernel void ComputeFunction($0 - uint3 gid[[thread_position_in_grid]]) { - if (static_cast(gid.x) >= args.dst_tensor.Width() || - static_cast(gid.y) >= args.dst_tensor.Height() || - static_cast(gid.z) >= args.dst_tensor.Slices()) { - return; - } - - FLT4 maximum = FLT4(-10000.0); - ushort4 indexes = ushort4(0); - ushort index_counter = 0; - for (int ky = 0; ky < args.kernel_size_y; ++ky) { - for (int kx = 0; kx < args.kernel_size_x; ++kx) { - int c_x = int(gid.x) * args.stride_x - args.offset_x + kx; - int c_y = int(gid.y) * args.stride_y - args.offset_y + ky; - bool outside = c_x < 0 || c_y < 0 || c_x >= args.src_tensor.Width() || - c_y >= args.src_tensor.Height(); - FLT4 src_color = outside ? FLT4(-10000.0) : args.src_tensor.Read(c_x, c_y, gid.z); - if (src_color.x > maximum.x) { - indexes.x = index_counter; - maximum.x = src_color.x; - } - if (src_color.y > maximum.y) { - indexes.y = index_counter; - maximum.y = src_color.y; - } - if (src_color.z > maximum.z) { - indexes.z = index_counter; - maximum.z = src_color.z; - } - if (src_color.w > maximum.w) { - indexes.w = index_counter; - maximum.w = src_color.w; - } - index_counter++; - } - } - FLT4 value = static_cast(indexes); - args.dst_tensor.Write(value, gid.x, gid.y, gid.z); -} - )"; - return shader_source; -} - -std::string GetAveragePoolingCode() { - std::string shader_source = R"( -kernel void ComputeFunction($0 - uint tid[[thread_index_in_threadgroup]], - uint3 gid[[thread_position_in_grid]]) { - if (static_cast(gid.x) >= args.dst_tensor.Width() || - static_cast(gid.y) >= args.dst_tensor.Height() || - static_cast(gid.z) >= args.dst_tensor.Slices()) { - return; - } - - float4 sum = float4(0.0f); - float window_size = 0.0f; - for (int ky = 0; ky < args.kernel_size_y; ++ky) { - for (int kx = 0; kx < args.kernel_size_x; ++kx) { - int c_x = int(gid.x) * args.stride_x - args.offset_x + kx; - int c_y = int(gid.y) * args.stride_y - args.offset_y + ky; - bool outside = c_x < 0 || c_y < 0 || c_x >= args.src_tensor.Width() || - c_y >= args.src_tensor.Height(); - float4 src_color = outside ? float4(0.0f) : float4(args.src_tensor.Read(c_x, c_y, gid.z)); - window_size += outside ? 0.0f : 1.0f; - sum += src_color; - } - } - // If window_size==0, window covered nothing. This situation is a sign of - // incorrectly constructed operation. NaNs are expected as output. - FLT4 value = FLT4(sum / window_size); - args.dst_tensor.Write(value, gid.x, gid.y, gid.z); -} -)"; - return shader_source; -} - -} // namespace - -ComputeTaskDescriptor Pooling(const OperationDef& definition, - const Pooling2DAttributes& attr, - bool generate_indices) { - ComputeTaskDescriptor desc(definition); - if (attr.type == PoolingType::MAX) { - desc.shader_source = - generate_indices ? GetMaxPoolingIndicesCode() : GetMaxPoolingCode(); - } else if (attr.type == PoolingType::AVERAGE) { - desc.shader_source = GetAveragePoolingCode(); - } - - desc.AddSrcTensor("src_tensor", definition.src_tensors[0]); - desc.AddDstTensor("dst_tensor", definition.dst_tensors[0]); - - desc.args.AddInt("kernel_size_x", attr.kernel.w); - desc.args.AddInt("kernel_size_y", attr.kernel.h); - desc.args.AddInt("stride_x", attr.strides.w); - desc.args.AddInt("stride_y", attr.strides.h); - desc.args.AddInt("offset_x", attr.padding.prepended.w); - desc.args.AddInt("offset_y", attr.padding.prepended.h); - - desc.resize_function = [](const std::vector& src_shapes, - const std::vector& dst_shapes) { - const uint3 grid = uint3(dst_shapes[0].w, dst_shapes[0].h, - DivideRoundUp(dst_shapes[0].c, 4)); - const uint3 groups_size = GetWorkGroupSizeForGrid(grid); - int groups_x = DivideRoundUp(grid.x, groups_size.x); - int groups_y = DivideRoundUp(grid.y, groups_size.y); - int groups_z = DivideRoundUp(grid.z, groups_size.z); - return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z}); - }; - - return desc; -} - -} // namespace metal -} // namespace gpu -} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/pooling.h b/tensorflow/lite/delegates/gpu/metal/kernels/pooling.h deleted file mode 100644 index ee1050a8b2c..00000000000 --- a/tensorflow/lite/delegates/gpu/metal/kernels/pooling.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_POOLING_H_ -#define TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_POOLING_H_ - -#include - -#include "tensorflow/lite/delegates/gpu/common/model.h" -#include "tensorflow/lite/delegates/gpu/common/operations.h" -#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h" - -namespace tflite { -namespace gpu { -namespace metal { - -ComputeTaskDescriptor Pooling(const OperationDef& definition, - const Pooling2DAttributes& attr, - bool generate_indices); - -} // namespace metal -} // namespace gpu -} // namespace tflite - -#endif // TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_POOLING_H_ diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/pooling_test.mm b/tensorflow/lite/delegates/gpu/metal/kernels/pooling_test.mm index 9fe450053c3..a2f35f8cdcd 100644 --- a/tensorflow/lite/delegates/gpu/metal/kernels/pooling_test.mm +++ b/tensorflow/lite/delegates/gpu/metal/kernels/pooling_test.mm @@ -27,16 +27,6 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h" #include "tensorflow/lite/delegates/gpu/metal/kernels/test_util.h" -using ::tflite::gpu::BHWC; -using ::tflite::gpu::DataType; -using ::tflite::gpu::HW; -using ::tflite::gpu::OperationType; -using ::tflite::gpu::Pooling2DAttributes; -using ::tflite::gpu::PoolingType; -using ::tflite::gpu::TensorRef; -using ::tflite::gpu::metal::CompareVectors; -using ::tflite::gpu::metal::SingleOpModel; - @interface PoolingTest : XCTestCase @end @@ -44,92 +34,6 @@ using ::tflite::gpu::metal::SingleOpModel; tflite::gpu::metal::MetalExecutionEnvironment exec_env_; } -- (void)testPoolingMaxKernel2x2Stride2x2WithIndices { - TensorRef input; - input.type = DataType::FLOAT32; - input.ref = 0; - input.shape = BHWC(1, 4, 4, 1); - - TensorRef output; - output.type = DataType::FLOAT32; - output.ref = 1; - output.shape = BHWC(1, 2, 2, 1); - - TensorRef indices; - indices.type = DataType::INT32; - indices.ref = 2; - indices.shape = BHWC(1, 2, 2, 1); - - Pooling2DAttributes attr; - attr.kernel = HW(2, 2); - attr.padding.prepended = HW(0, 0); - attr.padding.appended = HW(0, 0); - attr.strides = HW(2, 2); - attr.type = PoolingType::MAX; - attr.output_indices = true; - - SingleOpModel model({ToString(OperationType::POOLING_2D), attr}, {input}, {output, indices}); - XCTAssertTrue(model.PopulateTensor(0, {1, 2, 1, 2, 3, 4, 3, 4, 7, 8, 7, 8, 5, 6, 5, 6})); - auto status = model.Invoke(); - XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str()); - status = CompareVectors({4, 4, 8, 8}, model.GetOutput(0), 1e-6f); - XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str()); - status = CompareVectors({3, 3, 1, 1}, model.GetOutput(1), 1e-6f); - XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str()); -} - -- (void)testPoolingMaxKernel2x2Stride2x2WithoutIndices { - TensorRef input; - input.type = DataType::FLOAT32; - input.ref = 0; - input.shape = BHWC(1, 4, 4, 1); - - TensorRef output; - output.type = DataType::FLOAT32; - output.ref = 1; - output.shape = BHWC(1, 2, 2, 1); - - Pooling2DAttributes attr; - attr.kernel = HW(2, 2); - attr.padding.prepended = HW(0, 0); - attr.padding.appended = HW(0, 0); - attr.strides = HW(2, 2); - attr.type = PoolingType::MAX; - - SingleOpModel model({ToString(OperationType::POOLING_2D), attr}, {input}, {output}); - XCTAssertTrue(model.PopulateTensor(0, {1, 2, 1, 2, 3, 4, 3, 4, 7, 8, 7, 8, 5, 6, 5, 6})); - auto status = model.Invoke(); - XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str()); - status = CompareVectors({4, 4, 8, 8}, model.GetOutput(0), 1e-6f); - XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str()); -} - -- (void)testPoolingAverageKernel2x2Stride2x2 { - TensorRef input; - input.type = DataType::FLOAT32; - input.ref = 0; - input.shape = BHWC(1, 4, 4, 1); - - TensorRef output; - output.type = DataType::FLOAT32; - output.ref = 1; - output.shape = BHWC(1, 2, 2, 1); - - Pooling2DAttributes attr; - attr.kernel = HW(2, 2); - attr.padding.prepended = HW(0, 0); - attr.padding.appended = HW(0, 0); - attr.strides = HW(2, 2); - attr.type = PoolingType::AVERAGE; - - SingleOpModel model({ToString(OperationType::POOLING_2D), attr}, {input}, {output}); - XCTAssertTrue(model.PopulateTensor(0, {1, 1, 2, 2, 1, 1, 2, 2, 3, 3, 4, 4, 3, 3, 4, 4})); - auto status = model.Invoke(); - XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str()); - status = CompareVectors({1, 2, 3, 4}, model.GetOutput(0), 1e-6f); - XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str()); -} - - (void)testAveragePooling { auto status = AveragePoolingTest(&exec_env_); XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str()); diff --git a/tensorflow/lite/delegates/gpu/metal/selectors/BUILD b/tensorflow/lite/delegates/gpu/metal/selectors/BUILD index 29a772f5384..264d1b5c867 100644 --- a/tensorflow/lite/delegates/gpu/metal/selectors/BUILD +++ b/tensorflow/lite/delegates/gpu/metal/selectors/BUILD @@ -43,6 +43,7 @@ cc_library( "//tensorflow/lite/delegates/gpu/common/tasks:lstm", "//tensorflow/lite/delegates/gpu/common/tasks:max_unpooling", "//tensorflow/lite/delegates/gpu/common/tasks:padding", + "//tensorflow/lite/delegates/gpu/common/tasks:pooling", "//tensorflow/lite/delegates/gpu/common/tasks:prelu", "//tensorflow/lite/delegates/gpu/common/tasks:quantize_and_dequantize", "//tensorflow/lite/delegates/gpu/common/tasks:reduce", diff --git a/tensorflow/lite/delegates/gpu/metal/selectors/operation_selector.cc b/tensorflow/lite/delegates/gpu/metal/selectors/operation_selector.cc index e23f3056598..09f7c1bb5df 100644 --- a/tensorflow/lite/delegates/gpu/metal/selectors/operation_selector.cc +++ b/tensorflow/lite/delegates/gpu/metal/selectors/operation_selector.cc @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/tasks/lstm.h" #include "tensorflow/lite/delegates/gpu/common/tasks/max_unpooling.h" #include "tensorflow/lite/delegates/gpu/common/tasks/padding.h" +#include "tensorflow/lite/delegates/gpu/common/tasks/pooling.h" #include "tensorflow/lite/delegates/gpu/common/tasks/prelu.h" #include "tensorflow/lite/delegates/gpu/common/tasks/quantize_and_dequantize.h" #include "tensorflow/lite/delegates/gpu/common/tasks/reduce.h" @@ -49,7 +50,6 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/metal/kernels/conv.h" #include "tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv.h" #include "tensorflow/lite/delegates/gpu/metal/kernels/fully_connected.h" -#include "tensorflow/lite/delegates/gpu/metal/kernels/pooling.h" #include "tensorflow/lite/delegates/gpu/metal/kernels/transpose_conv.h" #include "tensorflow/lite/delegates/gpu/metal/kernels/winograd.h" #include "tensorflow/lite/delegates/gpu/metal/selectors/default_selector.h" @@ -125,6 +125,11 @@ void SelectPadding(const PadAttributes& attr, const OperationDef& op_def, *ptr = absl::make_unique(std::move(operation)); } +std::unique_ptr SelectPooling(const Pooling2DAttributes& attr, + const OperationDef& op_def) { + return absl::make_unique(CreatePooling(op_def, attr)); +} + std::unique_ptr SelectReduce(const std::set& axis_to_reduce, const BHWC& src_shape, OperationType op_type, @@ -419,25 +424,8 @@ absl::Status GPUOperationFromNode(const GpuInfo& gpu_info, case OperationType::POOLING_2D: { auto attr = absl::any_cast(node.operation.attributes); - auto pooling_op_def = op_def; - pooling_op_def.dst_tensors = {op_def.dst_tensors[0]}; - auto gpu_op = Pooling(op_def, attr, false); - gpu_subgraph->operations[0].task_desc = - absl::make_unique(std::move(gpu_op)); - gpu_subgraph->operations[0].input_ids = {static_cast(inputs[0]->id)}; - gpu_subgraph->operations[0].output_ids = { - static_cast(outputs[0]->id)}; - if (attr.type == PoolingType::MAX && attr.output_indices) { - gpu_subgraph->operations.push_back({}); - auto gpu_ind_op = Pooling(op_def, attr, true); - gpu_subgraph->operations[1].task_desc = - absl::make_unique(std::move(gpu_ind_op)); - gpu_subgraph->operations[1].input_ids = { - static_cast(inputs[0]->id)}; - gpu_subgraph->operations[1].output_ids = { - static_cast(outputs[1]->id)}; - } - break; + gpu_operation->operation = SelectPooling(attr, op_def); + return absl::OkStatus(); } case OperationType::PRELU: { auto attr = absl::any_cast(node.operation.attributes);