ConvTexture replaced with ConvPowerVR.
PiperOrigin-RevId: 336742538
Change-Id: I78fdaaa2d98d59ededfec90cffe1cb53747137f4
parent 1782a45957
commit b082980a84

tensorflow/lite/delegates/gpu/cl/kernels/BUILD

@@ -239,50 +239,6 @@ cc_test(
     ],
 )
 
-cc_library(
-    name = "conv_texture",
-    srcs = ["conv_texture.cc"],
-    hdrs = ["conv_texture.h"],
-    deps = [
-        ":gpu_operation",
-        ":util",
-        ":work_group_picking",
-        "//tensorflow/lite/delegates/gpu/cl:cl_command_queue",
-        "//tensorflow/lite/delegates/gpu/cl:cl_context",
-        "//tensorflow/lite/delegates/gpu/cl:linear_storage",
-        "//tensorflow/lite/delegates/gpu/cl:precision",
-        "//tensorflow/lite/delegates/gpu/cl:tensor",
-        "//tensorflow/lite/delegates/gpu/cl:tensor_type",
-        "//tensorflow/lite/delegates/gpu/cl:texture2d",
-        "//tensorflow/lite/delegates/gpu/cl:util",
-        "//tensorflow/lite/delegates/gpu/common:data_type",
-        "//tensorflow/lite/delegates/gpu/common:operations",
-        "//tensorflow/lite/delegates/gpu/common:shape",
-        "//tensorflow/lite/delegates/gpu/common:status",
-        "//tensorflow/lite/delegates/gpu/common:tensor",
-        "//tensorflow/lite/delegates/gpu/common:types",
-        "//tensorflow/lite/delegates/gpu/common:winograd_util",
-        "@com_google_absl//absl/strings",
-    ],
-)
-
-cc_test(
-    name = "conv_texture_test",
-    srcs = ["conv_texture_test.cc"],
-    linkstatic = True,
-    tags = tf_gpu_tests_tags() + [
-        "linux",
-        "local",
-    ],
-    deps = [
-        ":cl_test",
-        ":conv_texture",
-        "//tensorflow/lite/delegates/gpu/common:operations",
-        "//tensorflow/lite/delegates/gpu/common:status",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
 cc_library(
     name = "conv_weights_converter",
     srcs = ["conv_weights_converter.cc"],

@@ -1397,7 +1353,6 @@ test_suite(
         "conv_buffer_1x1_test",
         "conv_constants_test",
         "conv_powervr_test",
-        "conv_texture_test",
         "convolution_transposed_3x3_thin_test",
         "convolution_transposed_4x4_test",
         "convolution_transposed_test",

tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc

@@ -233,6 +233,16 @@ void ConvPowerVR::GenerateCode(const DeviceInfo& device_info) {
   if (conv_params_.IsPrivateMemBroadcast() && device_info.IsCL20OrHigher()) {
     compiler_options_.push_back(CompilerOptions::CL_2_0);
   }
+  bool kernel_is_trivial =
+      conv_params_.x_kernel_is_1 && conv_params_.y_kernel_is_1;
+  if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
+    kernel_is_trivial = kernel_is_trivial & conv_params_.z_kernel_is_1;
+  }
+  if (device_info.IsAdreno3xx() &&
+      definition_.precision == CalculationsPrecision::F16 &&
+      kernel_is_trivial) {
+    compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
+  }
 }
 
 absl::Status ConvPowerVR::BindArguments() {
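
Note: the lines added above reproduce, inside ConvPowerVR::GenerateCode, the UseFP16SIMD heuristic of the deleted ConvTexture kernel (visible in the removed conv_texture.cc further down): Adreno 3xx, F16 precision, and a kernel that is 1 in every spatial axis together enable the full-SIMD-line compiler option. A minimal standalone sketch of that predicate, using only names that appear in this diff:

    // Sketch, not part of the commit: the Adreno FP16 full-SIMD heuristic.
    bool UseFullSimdLine(const DeviceInfo& device_info,
                         CalculationsPrecision precision,
                         bool kernel_is_trivial) {
      return device_info.IsAdreno3xx() &&
             precision == CalculationsPrecision::F16 && kernel_is_trivial;
    }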

@@ -284,23 +294,34 @@ int3 ConvPowerVR::GetGridSize() const {
     if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
       grid_x *= task_size_z;
     }
-    wg.x = DivideRoundUp(grid_x, work_group_size_.x);
-    wg.y = DivideRoundUp(task_size_s, work_group_size_.y);
-    return int3(
-        wg[conv_params_.work_group_launch_order[0]] * work_group_size_.x,
-        wg[conv_params_.work_group_launch_order[1]] * work_group_size_.y, 1);
+    if (conv_params_.work_group_launch_order[0] == 0 &&
+        conv_params_.work_group_launch_order[1] == 1) {
+      return int3(grid_x, task_size_s, 1);
+    } else {
+      wg.x = DivideRoundUp(grid_x, work_group_size_.x);
+      wg.y = DivideRoundUp(task_size_s, work_group_size_.y);
+      return int3(
+          wg[conv_params_.work_group_launch_order[0]] * work_group_size_.x,
+          wg[conv_params_.work_group_launch_order[1]] * work_group_size_.y, 1);
+    }
   } else {
     int grid_y = task_size_y;
     if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
       grid_y *= task_size_z;
     }
-    wg.x = DivideRoundUp(task_size_x, work_group_size_.x);
-    wg.y = DivideRoundUp(grid_y, work_group_size_.y);
-    wg.z = DivideRoundUp(task_size_s, work_group_size_.z);
-    return int3(
-        wg[conv_params_.work_group_launch_order[0]] * work_group_size_.x,
-        wg[conv_params_.work_group_launch_order[1]] * work_group_size_.y,
-        wg[conv_params_.work_group_launch_order[2]] * work_group_size_.z);
+    if (conv_params_.work_group_launch_order[0] == 0 &&
+        conv_params_.work_group_launch_order[1] == 1 &&
+        conv_params_.work_group_launch_order[2] == 2) {
+      return int3(task_size_x, grid_y, task_size_s);
+    } else {
+      wg.x = DivideRoundUp(task_size_x, work_group_size_.x);
+      wg.y = DivideRoundUp(grid_y, work_group_size_.y);
+      wg.z = DivideRoundUp(task_size_s, work_group_size_.z);
+      return int3(
+          wg[conv_params_.work_group_launch_order[0]] * work_group_size_.x,
+          wg[conv_params_.work_group_launch_order[1]] * work_group_size_.y,
+          wg[conv_params_.work_group_launch_order[2]] * work_group_size_.z);
+    }
   }
 }
 
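
Note: the rewritten GetGridSize adds a fast path. When work_group_launch_order is the identity permutation, no reordering is needed, so the raw task counts are returned as the grid and rounding to work-group multiples can presumably happen in the shared dispatch code; only a permuted launch order still requires the explicit round-up-and-swizzle. A worked example of that slow path, assuming DivideRoundUp is ceil-division as in tensorflow/lite/delegates/gpu/common/util.h:

    // Sketch with example numbers, not part of the commit.
    // work_group_size_ = (8, 4, 1), launch order = (1, 0, 2),
    // task sizes = (100, 30, 5):
    //   wg   = (DivideRoundUp(100, 8), DivideRoundUp(30, 4), DivideRoundUp(5, 1))
    //        = (13, 8, 5)
    //   grid = (wg[1] * 8, wg[0] * 4, wg[2] * 1) = (64, 52, 5)
    int DivideRoundUp(int n, int divisor) { return (n + divisor - 1) / divisor; }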

@@ -1190,7 +1211,16 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
     conv_params.fixed_work_group_size = false;
     conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
   } else if (device_info.IsAdreno()) {
-    conv_params.block_size = int4(2, 2, 1, 1);
+    conv_params.block_size = int4(2, 2, 1, 2);
+    if (device_info.IsAdreno3xx()) {
+      if (definition.precision == CalculationsPrecision::F16) {
+        conv_params.block_size = int4(2, 2, 1, 2);
+      } else if (definition.precision == CalculationsPrecision::F32_F16) {
+        conv_params.block_size = int4(2, 1, 1, 2);
+      } else {  // F32
+        conv_params.block_size = int4(2, 2, 1, 1);
+      }
+    }
     work_group_size_ = int3(8, 2, 1);
     conv_params.work_group_launch_order = int3(0, 1, 2);
     conv_params.fixed_work_group_size = false;
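
Note: block_size is the per-work-item output block, int4(x, y, z, s) where s counts 4-channel output slices (this reading is inferred from the grid math above, not stated in the diff). The change doubles the default Adreno block to two output slices per thread and keeps the smaller blocks only on Adreno 3xx at F32/F32_F16 precision, presumably to limit register pressure. Illustration with example numbers, assuming the task sizes are derived by ceil-division as elsewhere in conv_powervr.cc:

    // With block_size = int4(2, 2, 1, 2), one work item computes
    // 2 * 2 * 1 * 2 = 8 FLT4 output values. For a 64x64 output with
    // 16 slices:
    //   task_size_x = DivideRoundUp(64, 2) = 32
    //   task_size_y = DivideRoundUp(64, 2) = 32
    //   task_size_s = DivideRoundUp(16, 2) = 8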

tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.cc (deleted)

@@ -1,461 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h"
-
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "absl/strings/substitute.h"
-#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
-#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
-#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
-#include "tensorflow/lite/delegates/gpu/cl/precision.h"
-#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
-
-namespace tflite {
-namespace gpu {
-namespace cl {
-namespace {
-bool UseFP16SIMD(const DeviceInfo& device_info, CalculationsPrecision precision,
-                 bool kernel1x1) {
-  if (!device_info.IsAdreno()) {
-    return false;
-  }
-  switch (precision) {
-    case CalculationsPrecision::F32:
-    case CalculationsPrecision::F32_F16:
-      return false;
-    case CalculationsPrecision::F16:
-      return device_info.IsAdreno3xx() && kernel1x1;
-  }
-}
-}  // namespace
-
-ConvTexture::ConvTexture(const OperationDef& definition,
-                         const Convolution2DAttributes& attr)
-    : GPUOperation(definition),
-      kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
-      stride_(attr.strides.w, attr.strides.h),
-      padding_(-attr.padding.prepended.w, -attr.padding.prepended.h),
-      dilation_(attr.dilations.w, attr.dilations.h),
-      different_weights_for_height_(false),
-      block_size_(2, 2, 2) {
-  work_group_size_ = int3(4, 4, 2);
-}
-
-ConvTexture::ConvTexture(const OperationDef& definition)
-    : GPUOperation(definition),
-      kernel_size_(1, 1),
-      stride_(1, 1),
-      padding_(0, 0),
-      dilation_(1, 1),
-      different_weights_for_height_(false),
-      block_size_(4, 1, 2) {
-  work_group_size_ = int3(16, 1, 2);
-}
-
-ConvTexture::ConvTexture(ConvTexture&& operation)
-    : GPUOperation(std::move(operation)),
-      kernel_size_(operation.kernel_size_),
-      stride_(operation.stride_),
-      padding_(operation.padding_),
-      dilation_(operation.dilation_),
-      different_weights_for_height_(operation.different_weights_for_height_),
-      block_size_(operation.block_size_) {}
-
-ConvTexture& ConvTexture::operator=(ConvTexture&& operation) {
-  if (this != &operation) {
-    std::swap(kernel_size_, operation.kernel_size_);
-    std::swap(stride_, operation.stride_);
-    std::swap(padding_, operation.padding_);
-    std::swap(dilation_, operation.dilation_);
-    std::swap(different_weights_for_height_,
-              operation.different_weights_for_height_);
-    std::swap(block_size_, operation.block_size_);
-    GPUOperation::operator=(std::move(operation));
-  }
-  return *this;
-}
-
-std::string ConvTexture::GenerateConvCode(const OperationDef& op_def,
-                                          const int3& block_size, bool is1x1,
-                                          bool adreno4xx_optimization,
-                                          bool stride_correction,
-                                          bool different_weights_for_height) {
-  auto src_desc = op_def.src_tensors[0];
-  src_desc.SetTextureAddressMode(TextureAddressMode::ZERO);
-  if (op_def.IsBatchSupported()) {
-    src_desc.SetStateVar("BatchedWidth", "true");
-  }
-  AddSrcTensor("src_tensor", src_desc);
-
-  auto dst_desc = op_def.dst_tensors[0];
-  if (op_def.IsBatchSupported()) {
-    dst_desc.SetStateVar("BatchedWidth", "true");
-  }
-  AddDstTensor("dst_tensor", dst_desc);
-
-  if (!is1x1) {
-    args_.AddInt("kernel_size_x");
-    args_.AddInt("kernel_size_y");
-    args_.AddInt("dilation_x");
-    args_.AddInt("dilation_y");
-  }
-  args_.AddInt("stride_x");
-  args_.AddInt("stride_y");
-  args_.AddInt("padding_x");
-  args_.AddInt("padding_y");
-
-  const auto src_tensor_type = op_def.src_tensors[0].storage_type;
-  const bool is_buffer = src_tensor_type == TensorStorageType::IMAGE_BUFFER ||
-                         src_tensor_type == TensorStorageType::BUFFER;
-
-  std::vector<std::string> xs(block_size.x);
-  for (int x = 0; x < block_size.x; ++x) {
-    xs[x] = std::to_string(x);
-  }
-
-  std::vector<std::string> ys(block_size.y);
-  for (int y = 0; y < block_size.y; ++y) {
-    ys[y] = std::to_string(y);
-  }
-
-  std::vector<std::string> zs(block_size.z);
-  for (int z = 0; z < block_size.z; ++z) {
-    zs[z] = std::to_string(z);
-  }
-
-  std::string c = GetCommonDefines(op_def.precision);
-  for (int z = 0; z < block_size.z; ++z) {
-    const std::string f0 = std::to_string(z * 4 + 0);
-    const std::string f1 = std::to_string(z * 4 + 1);
-    const std::string f2 = std::to_string(z * 4 + 2);
-    const std::string f3 = std::to_string(z * 4 + 3);
-    switch (op_def.precision) {
-      case CalculationsPrecision::F32:
-      case CalculationsPrecision::F16:
-        c += "#define CONV" + zs[z] + "(R, S) \\\n";
-        c += "R += S.x * f" + f0 + "; \\\n";
-        c += "R += S.y * f" + f1 + "; \\\n";
-        c += "R += S.z * f" + f2 + "; \\\n";
-        c += "R += S.w * f" + f3 + "; \n";
-        break;
-      case CalculationsPrecision::F32_F16:
-        c += "#define CONV" + zs[z] + "(R, S) \\\n";
-        c += "R += convert_float4(S.x * f" + f0 + " + S.y * f" + f1 +
-             " + S.z * f" + f2 + " + S.w * f" + f3 + ");\n";
-        break;
-    }
-  }
-
-  c += "__kernel void main_function(\n";
-  c += "$0) {\n";
-  c += "  int X = get_global_id(0) * " + std::to_string(block_size.x) + ";\n";
-  c += "  int Y = get_global_id(1) * " + std::to_string(block_size.y) + ";\n";
-  c += "  int Z = get_global_id(2) * " + std::to_string(block_size.z) + ";\n";
-  c += "  if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() "
-       "|| Z >= args.dst_tensor.Slices()) return;\n";
-  std::vector<std::string> s_x(block_size.x);
-  std::vector<std::string> s_y(block_size.y);
-  for (int x = 0; x < block_size.x; ++x) {
-    if (stride_correction) {
-      c += "  int xc" + xs[x] + " = " +
-           GetXStrideCorrected("X + " + xs[x], "args.src_tensor.Batch()",
-                               "args.stride_x", "args.padding_x") +
-           ";\n";
-    } else {
-      c += "  int xc" + xs[x] + " = (X +" + xs[x] +
-           ") * args.stride_x + args.padding_x;\n";
-    }
-    s_x[x] = is1x1 ? "xc" + xs[x] : "cx" + xs[x];
-  }
-  for (int y = 0; y < block_size.y; ++y) {
-    c += "  int yc" + ys[y] + " = (Y +" + ys[y] +
-         ") * args.stride_y + args.padding_y;\n";
-    s_y[y] = is1x1 ? "yc" + ys[y] : "cy" + ys[y];
-  }
-  for (int i = 0; i < block_size.x * block_size.y * block_size.z; ++i) {
-    c += "  ACCUM_FLT4 r" + std::to_string(i) +
-         " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
-  }
-  std::string f_y = is1x1 ? "s" : "filter_offset";
-  if (different_weights_for_height) {
-    f_y = "Y * args.src_tensor.Slices() + s";
-  }
-  if (!is1x1) {
-    for (int x = 0; x < block_size.x; ++x) {
-      c += "  int cx" + xs[x] + ";\n";
-    }
-    for (int y = 0; y < block_size.y; ++y) {
-      c += "  int cy" + ys[y] + ";\n";
-    }
-    c += "  int filter_offset = 0;\n";
-    c += "  for (int y = 0; y < args.kernel_size_y; ++y) {\n";
-    for (int y = 0; y < block_size.y; ++y) {
-      c += "  cy" + ys[y] + " = y * args.dilation_y + yc" + ys[y] + ";\n";
-    }
-    if (is_buffer) {
-      for (int y = 0; y < block_size.y; ++y) {
-        c += "  bool in_y" + ys[y] + " = cy" + ys[y] + " >= 0 && cy" + ys[y] +
-             " < args.src_tensor.Height();\n";
-        if (src_tensor_type == TensorStorageType::BUFFER) {
-          c += "  cy" + ys[y] + " = clamp(cy" + ys[y] +
-               ", 0, args.src_tensor.Height() - 1);\n";
-        }
-      }
-    }
-    c += "  for (int x = 0; x < args.kernel_size_x; ++x) {\n";
-    for (int x = 0; x < block_size.x; ++x) {
-      c += "  cx" + xs[x] + " = x * args.dilation_x + xc" + xs[x] + ";\n";
-    }
-    if (is_buffer) {
-      for (int x = 0; x < block_size.x; ++x) {
-        c += "  bool in_x" + xs[x] + " = cx" + xs[x] + " >= 0 && cx" + xs[x] +
-             " < args.src_tensor.Width();\n";
-        if (src_tensor_type == TensorStorageType::BUFFER) {
-          c += "  cx" + xs[x] + " = clamp(cx" + xs[x] +
-               ", 0, args.src_tensor.Width() - 1);\n";
-        }
-      }
-      for (int x = 0; x < block_size.x; ++x) {
-        for (int y = 0; y < block_size.y; ++y) {
-          const std::string id = std::to_string(y * block_size.x + x);
-          if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
-            c += absl::Substitute(
-                "  int addr_$0 = select(-1, cy$2 * args.src_tensor.Width() + "
-                "cx$1, (in_x$1 "
-                "&& "
-                "in_y$2));\n",
-                y * block_size.x + x, x, y);
-            c += absl::Substitute(
-                "  int dz_$0 = select(0, args.src_tensor.Width() * "
-                "args.src_tensor.Height(), (in_x$1 && "
-                "in_y$2));\n",
-                y * block_size.x + x, x, y);
-          } else {
-            c += absl::Substitute(
-                "  int addr_$0 = cy$2 * args.src_tensor.Width() + cx$1;\n",
-                y * block_size.x + x, x, y);
-          }
-        }
-      }
-      if (src_tensor_type == TensorStorageType::BUFFER) {
-        c += "  int dz = args.src_tensor.Width() * args.src_tensor.Height();\n";
-      }
-    }
-  } else if (is_buffer) {
-    for (int y = 0; y < block_size.y; ++y) {
-      c += "  bool in_y" + ys[y] + " = yc" + ys[y] + " >= 0 && yc" + ys[y] +
-           " < args.src_tensor.Height();\n";
-    }
-    for (int x = 0; x < block_size.x; ++x) {
-      c += "  bool in_x" + xs[x] + " = xc" + xs[x] + " >= 0 && xc" + xs[x] +
-           " < args.src_tensor.Width();\n";
-    }
-    for (int x = 0; x < block_size.x; ++x) {
-      for (int y = 0; y < block_size.y; ++y) {
-        const std::string id = std::to_string(y * block_size.x + x);
-        if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
-          c += absl::Substitute(
-              "  int addr_$0 = select(-1, yc$2 * args.src_tensor.Width() + "
-              "xc$1, (in_x$1 && "
-              "in_y$2));\n",
-              y * block_size.x + x, x, y);
-          c += absl::Substitute(
-              "  int dz_$0 = select(0, args.src_tensor.Width() * "
-              "args.src_tensor.Height(), (in_x$1 && "
-              "in_y$2));\n",
-              y * block_size.x + x, x, y);
-        } else {
-          c += absl::Substitute(
-              "  int addr_$0 = yc$2 * args.src_tensor.Width() + xc$1;\n",
-              y * block_size.x + x, x, y);
-        }
-      }
-    }
-    if (src_tensor_type == TensorStorageType::BUFFER) {
-      c += "  int dz = args.src_tensor.Width() * args.src_tensor.Height();\n";
-    }
-  }
-  c += "  for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n";
-  if (is_buffer) {
-    if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
-      for (int index = 0; index < block_size.x * block_size.y; ++index) {
-        const std::string id = std::to_string(index);
-        c +=
-            "    FLT4 src" + id + " = args.src_tensor.Read(addr_" + id + ");\n";
-      }
-    } else {
-      for (int x = 0; x < block_size.x; ++x) {
-        for (int y = 0; y < block_size.y; ++y) {
-          const std::string id = std::to_string(y * block_size.x + x);
-          c += "    FLT4 src" + id + " = args.src_tensor.Read(addr_" + id +
-               ") * (FLT)(in_x" + xs[x] + " && in_y" + ys[y] + "); addr_" + id +
-               " += dz;\n";
-        }
-      }
-    }
-  }
-  for (int z = 0; z < block_size.z; ++z) {
-    c += absl::Substitute(R"( FLT4 f$2 = args.weights0.Read($0, $1);
-FLT4 f$3 = args.weights1.Read($0, $1);
-FLT4 f$4 = args.weights2.Read($0, $1);
-FLT4 f$5 = args.weights3.Read($0, $1);
-)",
-                          "Z + " + zs[z], f_y, z * 4 + 0, z * 4 + 1, z * 4 + 2,
-                          z * 4 + 3);
-  }
-  if (!is_buffer) {
-    for (int x = 0; x < block_size.x; ++x) {
-      for (int y = 0; y < block_size.y; ++y) {
-        const std::string id = std::to_string(y * block_size.x + x);
-        c += "    FLT4 src" + id + " = args.src_tensor.Read(" + s_x[x] + ", " +
-             s_y[y] + ", s);\n";
-      }
-    }
-  }
-  for (int z = 0; z < block_size.z; ++z) {
-    for (int i = 0; i < block_size.x * block_size.y; ++i) {
-      c += "    CONV" + zs[z] + "(r" +
-           std::to_string(i + z * block_size.x * block_size.y) + ", src" +
-           std::to_string(i) + ");\n";
-    }
-  }
-  if (!is1x1) {
-    c += "    filter_offset++;\n";
-  }
-  if (is_buffer) {
-    if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
-      for (int index = 0; index < block_size.x * block_size.y; ++index) {
-        const std::string id = std::to_string(index);
-        c += "    addr_" + id + " += dz_" + id + ";\n";
-      }
-    }
-  }
-  c += "  }\n";  // args.src_tensor.Slices()
-  if (!is1x1) {
-    c += "  }\n";  // kernel_size_x
-    c += "  }\n";  // kernel_size_y
-  }
-  // when is1x1 && adreno4xx_optimization is true, xc0 == X and yc0 == Y
-  std::string dst_x = is1x1 && adreno4xx_optimization ? "xc0" : "X";
-  std::string dst_y = is1x1 && adreno4xx_optimization ? "yc0" : "Y";
-  for (int z = 0; z < block_size.z; ++z) {
-    c += "  if (Z < args.dst_tensor.Slices()) {\n";
-    c += "    FLT4 bias_val = args.biases.Read(Z);\n";
-    for (int y = 0; y < block_size.y; ++y) {
-      for (int x = 0; x < block_size.x; ++x) {
-        const std::string id =
-            std::to_string((z * block_size.y + y) * block_size.x + x);
-        c += "    {\n";
-        c += "      int xc = " + dst_x + " + " + xs[x] + ";\n";
-        c += "      int yc = " + dst_y + " + " + ys[y] + ";\n";
-        c += "      if (xc < args.dst_tensor.Width() && yc < "
-             "args.dst_tensor.Height()) {\n";
-        c += "        FLT4 res = TO_FLT4(r" + id + ") + bias_val;\n";
-        c += "        args.dst_tensor.Write(res, xc, yc, Z);\n";
-        c += "      }\n";
-        c += "    }\n";
-      }
-    }
-    c += "  }\n";
-    c += "  Z++;\n";
-  }
-  c += "}\n";
-  return c;
-}
-
-void ConvTexture::GenerateCode(const DeviceInfo& device_info) {
-  auto storage_type = definition_.GetPrimaryStorageType();
-  bool is1x1 = kernel_size_.x == 1 && kernel_size_.y == 1;
-  bool adreno4xx_optimization =
-      stride_.x == 1 && stride_.y == 1 && padding_.x == 0 && padding_.y == 0 &&
-      device_info.IsAdreno4xx() &&
-      storage_type == TensorStorageType::TEXTURE_ARRAY &&
-      definition_.precision == CalculationsPrecision::F16;
-  const bool stride_correction =
-      definition_.IsBatchSupported() && stride_.x != 1;
-  code_ =
-      GenerateConvCode(definition_, block_size_, is1x1, adreno4xx_optimization,
-                       stride_correction, different_weights_for_height_);
-
-  if (UseFP16SIMD(device_info, definition_.precision, is1x1)) {
-    compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
-  }
-}
-
-absl::Status ConvTexture::BindArguments() {
-  if (!(kernel_size_.x == 1 && kernel_size_.y == 1)) {
-    RETURN_IF_ERROR(args_.SetInt("kernel_size_x", kernel_size_.x));
-    RETURN_IF_ERROR(args_.SetInt("kernel_size_y", kernel_size_.y));
-    RETURN_IF_ERROR(args_.SetInt("dilation_x", dilation_.x * src_[0]->Batch()));
-    RETURN_IF_ERROR(args_.SetInt("dilation_y", dilation_.y));
-  }
-  RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
-  RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y));
-  RETURN_IF_ERROR(args_.SetInt("padding_x", padding_.x * src_[0]->Batch()));
-  RETURN_IF_ERROR(args_.SetInt("padding_y", padding_.y));
-  return absl::OkStatus();
-}
-
-int3 ConvTexture::GetGridSize() const {
-  const int grid_x =
-      DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), block_size_.x);
-  const int grid_y = DivideRoundUp(dst_[0]->Height(), block_size_.y);
-  const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.z);
-  return int3(grid_x, grid_y, grid_z);
-}
-
-void ConvTexture::GetPossibleKernelWorkGroups(
-    TuningType tuning_type, const DeviceInfo& device_info,
-    const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
-  GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_,
-                            work_groups);
-}
-
-ConvTexture CreateConvTexture(const DeviceInfo& device_info,
-                              const OperationDef& definition,
-                              const Convolution2DAttributes& attr) {
-  ConvTexture result(definition, attr);
-  result.GenerateCode(device_info);
-  result.UploadData(attr.weights, attr.bias);
-  return result;
-}
-
-ConvTexture CreateConvTexture(const DeviceInfo& device_info,
-                              const OperationDef& definition,
-                              const FullyConnectedAttributes& attr) {
-  ConvTexture result(definition);
-  result.GenerateCode(device_info);
-  result.UploadData(attr.weights, attr.bias);
-  return result;
-}
-
-ConvTexture CreateConvTextureWino4x4To6x6(const DeviceInfo& device_info,
-                                          const OperationDef& definition,
-                                          const Convolution2DAttributes& attr) {
-  ConvTexture result(definition);
-  result.different_weights_for_height_ = true;
-  result.block_size_ = {4, 1, 2};
-  result.GenerateCode(device_info);
-  result.UploadDataForWinograd4x4To6x6(attr.weights);
-  return result;
-}
-
-}  // namespace cl
-}  // namespace gpu
-}  // namespace tflite

tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h (deleted)

@@ -1,193 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_TEXTURE_H_
-#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_TEXTURE_H_
-
-#include <vector>
-
-#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
-#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
-#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
-#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
-#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
-#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
-#include "tensorflow/lite/delegates/gpu/cl/texture2d.h"
-#include "tensorflow/lite/delegates/gpu/cl/util.h"
-#include "tensorflow/lite/delegates/gpu/common/data_type.h"
-#include "tensorflow/lite/delegates/gpu/common/operations.h"
-#include "tensorflow/lite/delegates/gpu/common/shape.h"
-#include "tensorflow/lite/delegates/gpu/common/status.h"
-#include "tensorflow/lite/delegates/gpu/common/tensor.h"
-#include "tensorflow/lite/delegates/gpu/common/types.h"
-#include "tensorflow/lite/delegates/gpu/common/winograd_util.h"
-
-namespace tflite {
-namespace gpu {
-namespace cl {
-
-// This convolution process BLOCK_SIZE(XxYxZ) of FLT4 values per thread.
-class ConvTexture : public GPUOperation {
- public:
-  ConvTexture() = default;
-  void GetPossibleKernelWorkGroups(
-      TuningType tuning_type, const DeviceInfo& device_info,
-      const KernelInfo& kernel_info,
-      std::vector<int3>* work_groups) const override;
-  absl::Status BindArguments() override;
-  int3 GetGridSize() const override;
-
-  // Move only
-  ConvTexture(ConvTexture&& operation);
-  ConvTexture& operator=(ConvTexture&& operation);
-  ConvTexture(const ConvTexture&) = delete;
-  ConvTexture& operator=(const ConvTexture&) = delete;
-
- private:
-  friend ConvTexture CreateConvTexture(const DeviceInfo& device_info,
-                                       const OperationDef& definition,
-                                       const Convolution2DAttributes& attr);
-  friend ConvTexture CreateConvTexture(const DeviceInfo& device_info,
-                                       const OperationDef& definition,
-                                       const FullyConnectedAttributes& attr);
-
-  friend ConvTexture CreateConvTextureWino4x4To6x6(
-      const DeviceInfo& device_info, const OperationDef& definition,
-      const Convolution2DAttributes& attr);
-
-  ConvTexture(const OperationDef& definition,
-              const Convolution2DAttributes& attr);
-  explicit ConvTexture(const OperationDef& definition);
-  template <DataType T>
-  void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
-                  const tflite::gpu::Tensor<Linear, T>& biases);
-
-  template <DataType T>
-  void UploadDataForWinograd4x4To6x6(
-      const tflite::gpu::Tensor<OHWI, T>& weights);
-
-  template <DataType T>
-  void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
-
-  void GenerateCode(const DeviceInfo& device_info);
-
-  std::string GenerateConvCode(const OperationDef& op_def,
-                               const int3& block_size, bool is1x1,
-                               bool adreno4xx_optimization,
-                               bool stride_correction,
-                               bool different_weights_for_height);
-
-  int2 kernel_size_;
-  int2 stride_;
-  int2 padding_;
-  int2 dilation_;
-
-  // By default in 2d convolution we have the same weights for WH dims, but in
-  // some cases we need separate weights for H dimension and convolution kernel
-  // requires very small modifications to support it.
-  bool different_weights_for_height_;
-
-  int3 block_size_ = int3(2, 2, 2);
-};
-
-template <DataType T>
-void ConvTexture::UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
-                             const tflite::gpu::Tensor<Linear, T>& biases) {
-  UploadWeights(weights);
-
-  TensorLinearDescriptor desc;
-  desc.storage_type = LinearStorageType::TEXTURE_2D;
-  desc.element_type = definition_.GetDataType();
-  desc.UploadLinearData(biases);
-  args_.AddObject("biases",
-                  absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
-}
-
-template <DataType T>
-void ConvTexture::UploadDataForWinograd4x4To6x6(
-    const tflite::gpu::Tensor<OHWI, T>& weights) {
-  tflite::gpu::Tensor<OHWI, T> wino_weights;
-  RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
-  UploadWeights(wino_weights);
-
-  tflite::gpu::Tensor<Linear, DataType::FLOAT32> bias;
-  bias.shape = Linear(1);
-  bias.data = {0.0f};
-  TensorLinearDescriptor desc;
-  desc.storage_type = LinearStorageType::TEXTURE_2D;
-  desc.element_type = definition_.GetDataType();
-  desc.UploadLinearData(bias);
-  args_.AddObject("biases",
-                  absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
-}
-
-template <DataType T>
-void ConvTexture::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
-  int dst_depth = DivideRoundUp(weights.shape.o, 4);
-  dst_depth = AlignByN(dst_depth, block_size_.z);
-  const int src_depth = DivideRoundUp(weights.shape.i, 4);
-  const int kernel_x = weights.shape.w;
-  const int kernel_y = weights.shape.h;
-
-  const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
-  DataType data_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
-
-  const int elements_count = dst_depth * src_depth * kernel_x * kernel_y * 4;
-  const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4);
-
-  std::vector<uint8_t> data(float4_size * elements_count);
-
-  if (f32_weights) {
-    float4* ptr = reinterpret_cast<float4*>(data.data());
-    RearrangeWeightsToI4HWIOOGroupO4(weights, block_size_.z,
-                                     absl::MakeSpan(ptr, elements_count));
-  } else {
-    half4* ptr = reinterpret_cast<half4*>(data.data());
-    RearrangeWeightsToI4HWIOOGroupO4(weights, block_size_.z,
-                                     absl::MakeSpan(ptr, elements_count));
-  }
-
-  const int texture_width = dst_depth;
-  const int texture_height = src_depth * kernel_x * kernel_y;
-  const int sub_size = float4_size * texture_width * texture_height;
-  for (int i = 0; i < 4; ++i) {
-    Texture2DDescriptor desc;
-    desc.element_type = data_type;
-    desc.size = int2(texture_width, texture_height);
-    desc.data.resize(sub_size);
-    memcpy(desc.data.data(), data.data() + sub_size * i, sub_size);
-    const std::string name = "weights" + std::to_string(i);
-    args_.AddObject(name,
-                    absl::make_unique<Texture2DDescriptor>(std::move(desc)));
-  }
-}
-
-ConvTexture CreateConvTexture(const DeviceInfo& device_info,
-                              const OperationDef& definition,
-                              const Convolution2DAttributes& attr);
-
-ConvTexture CreateConvTexture(const DeviceInfo& device_info,
-                              const OperationDef& definition,
-                              const FullyConnectedAttributes& attr);
-
-ConvTexture CreateConvTextureWino4x4To6x6(const DeviceInfo& device_info,
-                                          const OperationDef& definition,
-                                          const Convolution2DAttributes& attr);
-
-}  // namespace cl
-}  // namespace gpu
-}  // namespace tflite
-
-#endif  // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_TEXTURE_H_
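
Note: for reference, the deleted ConvTexture::UploadWeights above splits the OHWI weights across four 2D textures (weights0..weights3), each dst_depth x (src_depth * kernel_x * kernel_y) texels. A worked size example following that code exactly:

    // weights.shape = OHWI(o=32, h=3, w=3, i=16), block_size_.z = 2:
    //   dst_depth = AlignByN(DivideRoundUp(32, 4), 2) = 8
    //   src_depth = DivideRoundUp(16, 4)              = 4
    //   each of the 4 textures: 8 x (4 * 3 * 3) = 8 x 36 texels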

tensorflow/lite/delegates/gpu/cl/kernels/conv_texture_test.cc (deleted)

@@ -1,107 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h"
-
-#include <vector>
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
-#include "tensorflow/lite/delegates/gpu/common/operations.h"
-#include "tensorflow/lite/delegates/gpu/common/status.h"
-
-using ::testing::FloatNear;
-using ::testing::Pointwise;
-
-namespace tflite {
-namespace gpu {
-namespace cl {
-namespace {
-
-TEST_F(OpenCLOperationTest, ConvTextureSimpleWeights) {
-  TensorFloat32 src_tensor;
-  src_tensor.shape = BHWC(1, 2, 2, 2);
-  src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
-
-  Convolution2DAttributes attr;
-  attr.padding.prepended = HW(0, 0);
-  attr.padding.appended = HW(1, 1);
-  attr.strides = HW(1, 1);
-  attr.dilations = HW(1, 1);
-  attr.weights.shape = OHWI(1, 2, 2, 2);
-  attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
-  attr.bias.shape = Linear(1);
-  attr.bias.data = {0.0f};
-
-  for (auto storage : env_.GetSupportedStorages()) {
-    for (auto precision : env_.GetSupportedPrecisions()) {
-      const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
-      OperationDef op_def;
-      op_def.precision = precision;
-      auto data_type = DeduceDataTypeFromPrecision(precision);
-      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
-      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
-      TensorFloat32 dst_tensor;
-      ConvTexture operation =
-          CreateConvTexture(creation_context_.GetDeviceInfo(), op_def, attr);
-      ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
-                                    BHWC(1, 2, 2, 1), &dst_tensor));
-      EXPECT_THAT(dst_tensor.data,
-                  Pointwise(FloatNear(eps), {28.0f, 18.0f, 22.0f, 13.0f}));
-    }
-  }
-}
-
-TEST_F(OpenCLOperationTest, ConvTexture) {
-  TensorFloat32 src_tensor;
-  src_tensor.shape = BHWC(1, 2, 2, 2);
-  src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
-
-  Convolution2DAttributes attr;
-  attr.padding.prepended = HW(0, 0);
-  attr.padding.appended = HW(1, 1);
-  attr.strides = HW(1, 1);
-  attr.dilations = HW(1, 1);
-  attr.weights.shape = OHWI(2, 2, 2, 2);
-  attr.weights.data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
-                       9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
-  attr.bias.shape = Linear(2);
-  attr.bias.data = {0.5f, -0.5f};
-
-  for (auto storage : env_.GetSupportedStorages()) {
-    for (auto precision : env_.GetSupportedPrecisions()) {
-      const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
-      OperationDef op_def;
-      op_def.precision = precision;
-      auto data_type = DeduceDataTypeFromPrecision(precision);
-      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
-      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
-      TensorFloat32 dst_tensor;
-      ConvTexture operation =
-          CreateConvTexture(creation_context_.GetDeviceInfo(), op_def, attr);
-      ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
-                                    BHWC(1, 2, 2, 2), &dst_tensor));
-      EXPECT_THAT(dst_tensor.data,
-                  Pointwise(FloatNear(eps), {168.5f, 391.5f, 80.5f, 223.5f,
-                                             60.5f, 235.5f, 20.5f, 123.5f}));
-    }
-  }
-}
-
-}  // namespace
-}  // namespace cl
-}  // namespace gpu
-}  // namespace tflite

tensorflow/lite/delegates/gpu/cl/selectors/BUILD

@@ -14,7 +14,6 @@ cc_library(
         "//tensorflow/lite/delegates/gpu/cl/kernels:conv_common",
         "//tensorflow/lite/delegates/gpu/cl/kernels:conv_constants",
         "//tensorflow/lite/delegates/gpu/cl/kernels:conv_powervr",
-        "//tensorflow/lite/delegates/gpu/cl/kernels:conv_texture",
        "//tensorflow/lite/delegates/gpu/cl/kernels:conv_weights_converter",
         "//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation",
         "//tensorflow/lite/delegates/gpu/cl/kernels:work_group_picking",

@@ -82,7 +81,6 @@ cc_library(
     deps = [
         "//tensorflow/lite/delegates/gpu/cl/kernels:conv_buffer_1x1",
         "//tensorflow/lite/delegates/gpu/cl/kernels:conv_powervr",
-        "//tensorflow/lite/delegates/gpu/cl/kernels:conv_texture",
         "//tensorflow/lite/delegates/gpu/cl/kernels:fully_connected",
         "//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation",
         "//tensorflow/lite/delegates/gpu/common:operations",

tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.cc

@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h"
-#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/conv_weights_converter.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
 #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"

@@ -38,8 +37,8 @@ std::unique_ptr<GPUOperation> SelectConvolutionAdreno(
     GPUOperation conv = CreateConvConstants(device_info, op_def, attr);
     return absl::make_unique<GPUOperation>(std::move(conv));
   } else {
-    ConvTexture conv = CreateConvTexture(device_info, op_def, attr);
-    return absl::make_unique<ConvTexture>(std::move(conv));
+    ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
+    return absl::make_unique<ConvPowerVR>(std::move(conv));
   }
 }
 
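
Note: every replacement call site in this diff passes an extra &dst_shape argument, so CreateConvPowerVR evidently accepts a destination-shape hint used when guessing ConvParams. A sketch of the assumed declaration (the default argument is a guess, not taken verbatim from conv_powervr.h):

    // Assumed signature, inferred from the call sites in this diff.
    ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
                                  const OperationDef& definition,
                                  const Convolution2DAttributes& attr,
                                  const BHWC* dst_shape = nullptr);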

@@ -47,8 +46,8 @@ std::unique_ptr<GPUOperation> SelectConvolutionWinogradAdreno(
     const Convolution2DAttributes& attr, const BHWC& dst_shape,
     const DeviceInfo& device_info, const OperationDef& op_def,
     ModelHints hints) {
-  ConvTexture conv = CreateConvTextureWino4x4To6x6(device_info, op_def, attr);
-  return absl::make_unique<ConvTexture>(std::move(conv));
+  ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
+  return absl::make_unique<ConvPowerVR>(std::move(conv));
 }
 
 std::unique_ptr<GPUOperation> SelectConvolutionDynamicWeightsAdreno(

tensorflow/lite/delegates/gpu/cl/selectors/fully_connected_selector.cc

@@ -18,7 +18,6 @@ limitations under the License.
 #include "absl/memory/memory.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h"
-#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.h"
 #include "tensorflow/lite/delegates/gpu/common/operations.h"
 #include "tensorflow/lite/delegates/gpu/common/status.h"

@@ -31,8 +30,9 @@ std::unique_ptr<GPUOperation> SelectFullyConnectedGeneric(
     const FullyConnectedAttributes& attr, const DeviceInfo& device_info,
     const OperationDef& op_def, int batch_size) {
   if (op_def.IsBatchSupported()) {
-    ConvTexture conv = CreateConvTexture(device_info, op_def, attr);
-    return absl::make_unique<ConvTexture>(std::move(conv));
+    BHWC dst_shape = BHWC(batch_size, 1, 1, attr.weights.shape.o);
+    ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
+    return absl::make_unique<ConvPowerVR>(std::move(conv));
   } else {
     FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);
     return absl::make_unique<FullyConnected>(std::move(fc));
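
Note: the dst_shape built here encodes the fully connected output as a 1x1 image: a batched fully connected layer is exactly a 1x1 convolution over a batch_size x 1 x 1 x input_channels tensor, which is why the batched path can now route to ConvPowerVR.

    // Shape hint for a fully connected layer with attr.weights.shape.o
    // outputs, as constructed above (BHWC order: batch, height, width,
    // channels).
    BHWC dst_shape = BHWC(batch_size, /*h=*/1, /*w=*/1, attr.weights.shape.o);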

@@ -43,8 +43,9 @@ std::unique_ptr<GPUOperation> SelectFullyConnectedAdreno(
     const FullyConnectedAttributes& attr, const DeviceInfo& device_info,
     const OperationDef& op_def, int batch_size) {
   if (op_def.IsBatchSupported()) {
-    ConvTexture conv = CreateConvTexture(device_info, op_def, attr);
-    return absl::make_unique<ConvTexture>(std::move(conv));
+    BHWC dst_shape = BHWC(batch_size, 1, 1, attr.weights.shape.o);
+    ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
+    return absl::make_unique<ConvPowerVR>(std::move(conv));
   } else {
     FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);
     return absl::make_unique<FullyConnected>(std::move(fc));

@@ -71,8 +72,10 @@ std::unique_ptr<GPUOperation> SelectFullyConnectedMali(
       ConvBuffer1x1 conv = CreateConvBuffer1x1(device_info, op_def, attr);
       return absl::make_unique<ConvBuffer1x1>(std::move(conv));
     } else {
-      ConvTexture conv = CreateConvTexture(device_info, op_def, attr);
-      return absl::make_unique<ConvTexture>(std::move(conv));
+      BHWC dst_shape = BHWC(batch_size, 1, 1, attr.weights.shape.o);
+      ConvPowerVR conv =
+          CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
+      return absl::make_unique<ConvPowerVR>(std::move(conv));
     }
   } else {
     FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);