Removed useless Status and CreationContext from convolution kernels.

PiperOrigin-RevId: 327363985
Change-Id: I216229b4cfb4f11416fc2832c6bd00a1793f9a8a
Raman Sarokin 2020-08-18 20:56:46 -07:00 committed by TensorFlower Gardener
parent 4c222cfdf9
commit b14150088d
32 changed files with 584 additions and 687 deletions
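
In short: the convolution factory functions no longer take a CreationContext and report failure through absl::Status with an out-parameter; they take a DeviceInfo and return the operation by value, and the UploadData/UploadWeights helpers stage data into descriptor objects instead of writing through a CLContext. A minimal before/after sketch using ConvPowerVR, assuming op_def, attr, and a creation_context exposing GetDeviceInfo() are set up as in the updated tests:

    // Before this change: Status-based creation via an out-parameter.
    //   ConvPowerVR operation;
    //   RETURN_IF_ERROR(
    //       CreateConvPowerVR(creation_context, op_def, attr, &operation));

    // After this change: DeviceInfo in, operation returned by value.
    ConvPowerVR operation =
        CreateConvPowerVR(creation_context.GetDeviceInfo(), op_def, attr);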

@ -244,6 +244,26 @@ DeviceInfo DeviceInfoFromDeviceID(cl_device_id id) {
info.max_work_group_size_x = max_work_group_sizes.x;
info.max_work_group_size_y = max_work_group_sizes.y;
info.max_work_group_size_z = max_work_group_sizes.z;
if (info.IsIntel()) {
if (info.SupportsExtension("cl_intel_required_subgroup_size")) {
size_t sub_groups_count;
cl_int status =
clGetDeviceInfo(id, 0x4108 /*CL_DEVICE_SUB_GROUP_SIZES_INTEL*/, 0,
nullptr, &sub_groups_count);
if (status == CL_SUCCESS) {
std::vector<size_t> sub_group_sizes(sub_groups_count);
status = clGetDeviceInfo(id, 0x4108 /*CL_DEVICE_SUB_GROUP_SIZES_INTEL*/,
sizeof(size_t) * sub_groups_count,
sub_group_sizes.data(), nullptr);
if (status == CL_SUCCESS) {
for (int i = 0; i < sub_groups_count; ++i) {
info.supported_subgroup_sizes.push_back(sub_group_sizes[i]);
}
}
}
}
}
return info;
}
@ -305,37 +325,10 @@ std::string CLDevice::GetPlatformVersion() const {
return GetPlatformInfo(platform_id_, CL_PLATFORM_VERSION);
}
bool CLDevice::IsCL20OrHigher() const {
return info_.cl_version != OpenCLVersion::CL_1_0 &&
info_.cl_version != OpenCLVersion::CL_1_1 &&
info_.cl_version != OpenCLVersion::CL_1_2;
}
bool CLDevice::IsCL20OrHigher() const { return info_.IsCL20OrHigher(); }
bool CLDevice::SupportsSubGroupWithSize(int sub_group_size) const {
if (IsIntel()) {
if (SupportsExtension("cl_intel_required_subgroup_size")) {
size_t sub_groups_count;
cl_int error =
clGetDeviceInfo(id_, 0x4108 /*CL_DEVICE_SUB_GROUP_SIZES_INTEL*/, 0,
nullptr, &sub_groups_count);
if (error != CL_SUCCESS) {
return false;
}
std::vector<size_t> sub_group_sizes(sub_groups_count);
error = clGetDeviceInfo(id_, 0x4108 /*CL_DEVICE_SUB_GROUP_SIZES_INTEL*/,
sizeof(size_t) * sub_groups_count,
sub_group_sizes.data(), nullptr);
if (error != CL_SUCCESS) {
return false;
}
for (int i = 0; i < sub_groups_count; ++i) {
if (sub_group_sizes[i] == sub_group_size) {
return true;
}
}
}
}
return false;
return info_.SupportsSubGroupWithSize(sub_group_size);
}
bool CLDevice::IsAdreno() const { return info_.IsAdreno(); }
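
The Intel subgroup-size query now happens once in DeviceInfoFromDeviceID() and is cached in supported_subgroup_sizes, so CLDevice::SupportsSubGroupWithSize() and the kernels only do a lookup, with no further clGetDeviceInfo calls. A hedged sketch of the resulting check, with device_info standing in for a populated DeviceInfo:

    // Pure lookup against the cached list; no OpenCL query at this point.
    if (device_info.SupportsSubGroupWithSize(16)) {
      // e.g. ConvPowerVR selects the PRIVATE_MEM_SIMD16_BROADCAST
      // weights-upload path on Intel (see the GuessBestParams hunk below).
    }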

@ -262,6 +262,21 @@ bool DeviceInfo::SupportsExtension(const std::string& extension) const {
return false;
}
bool DeviceInfo::IsCL20OrHigher() const {
return cl_version != OpenCLVersion::CL_1_0 &&
cl_version != OpenCLVersion::CL_1_1 &&
cl_version != OpenCLVersion::CL_1_2;
}
bool DeviceInfo::SupportsSubGroupWithSize(int sub_group_size) const {
for (auto subgroup_size : supported_subgroup_sizes) {
if (sub_group_size == subgroup_size) {
return true;
}
}
return false;
}
bool DeviceInfo::IsAdreno() const { return vendor == Vendor::kQualcomm; }
bool DeviceInfo::IsAdreno3xx() const {

@ -139,6 +139,8 @@ struct DeviceInfo {
bool SupportsOneLayerTextureArray() const;
bool SupportsExtension(const std::string& extension) const;
bool IsCL20OrHigher() const;
bool SupportsSubGroupWithSize(int sub_group_size) const;
std::vector<std::string> extensions;
bool supports_fp16;
@ -157,6 +159,7 @@ struct DeviceInfo {
int max_work_group_size_x;
int max_work_group_size_y;
int max_work_group_size_z;
std::vector<int> supported_subgroup_sizes;
// rtn is ROUND_TO_NEAREST
// with rtn precision is much better than with rtz (ROUND_TO_ZERO)

@ -118,6 +118,7 @@ cc_library(
"//tensorflow/lite/delegates/gpu/cl:precision",
"//tensorflow/lite/delegates/gpu/cl:tensor",
"//tensorflow/lite/delegates/gpu/cl:tensor_type",
"//tensorflow/lite/delegates/gpu/cl:texture2d",
"//tensorflow/lite/delegates/gpu/cl:util",
"//tensorflow/lite/delegates/gpu/common:data_type",
"//tensorflow/lite/delegates/gpu/common:operations",

@ -167,7 +167,8 @@ std::string GenerateConv(CalculationsPrecision precision,
} // namespace
Conv3D::Conv3D(const OperationDef& definition,
const Convolution3DAttributes& attr, const CLDevice& device)
const Convolution3DAttributes& attr,
const DeviceInfo& device_info)
: GPUOperation(definition),
stride_(attr.strides.w, attr.strides.h, attr.strides.d),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h,
@ -175,12 +176,12 @@ Conv3D::Conv3D(const OperationDef& definition,
kernel_size_(attr.weights.shape.w, attr.weights.shape.h,
attr.weights.shape.d),
dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d),
conv_params_(GuessBestParams(device, definition, attr)) {
conv_params_(GuessBestParams(device_info, definition, attr)) {
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
code_ = GenerateConv3D(definition_, stride_correction, conv_params_);
if (definition_.precision == CalculationsPrecision::F16 &&
device.IsPowerVR()) {
device_info.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
}
@ -725,7 +726,7 @@ std::string Conv3D::GenerateConv3D(const OperationDef& op_def,
return c;
}
Conv3D::ConvParams Conv3D::GuessBestParams(const CLDevice& device,
Conv3D::ConvParams Conv3D::GuessBestParams(const DeviceInfo& device_info,
const OperationDef& definition,
int src_slices, int dst_slices,
bool x_kernel_is_1,
@ -735,7 +736,7 @@ Conv3D::ConvParams Conv3D::GuessBestParams(const CLDevice& device,
conv_params.x_kernel_is_1 = x_kernel_is_1;
conv_params.y_kernel_is_1 = y_kernel_is_1;
conv_params.z_kernel_is_1 = z_kernel_is_1;
if (device.IsNvidia()) {
if (device_info.IsNvidia()) {
conv_params.block_size = int4(1, 1, 1, 4);
work_group_size_ = int3(8, 4, 1);
conv_params.work_group_launch_order = int3(2, 0, 1);
@ -754,7 +755,7 @@ Conv3D::ConvParams Conv3D::GuessBestParams(const CLDevice& device,
if (src_slices % 4 == 0 && conv_params.block_size.w <= 2) {
conv_params.src_depth_loop_size = 4;
}
} else if (device.IsPowerVR()) {
} else if (device_info.IsPowerVR()) {
conv_params.block_size = int4(1, 1, 1, 4);
work_group_size_ = int3(8, 4, 1);
conv_params.work_group_launch_order = int3(2, 0, 1);
@ -792,13 +793,13 @@ Conv3D::ConvParams Conv3D::GuessBestParams(const CLDevice& device,
conv_params.block_size.x = 2;
work_group_size_ = int3(4, 8, 1);
}
} else if (device.IsAdreno()) {
} else if (device_info.IsAdreno()) {
conv_params.block_size = int4(2, 2, 1, 2);
work_group_size_ = int3(8, 4, 1);
conv_params.work_group_launch_order = int3(0, 1, 2);
conv_params.src_depth_loop_size = 1;
conv_params.weights_upload_type = WeightsUploadType::TEXTURES_MEM;
} else if (device.IsMali()) {
} else if (device_info.IsMali()) {
conv_params.block_size = int4(1, 1, 1, 4);
work_group_size_ = int3(8, 4, 1);
conv_params.work_group_launch_order = int3(0, 1, 2);
@ -829,7 +830,7 @@ Conv3D::ConvParams Conv3D::GuessBestParams(const CLDevice& device,
}
Conv3D::ConvParams Conv3D::GuessBestParams(
const CLDevice& device, const OperationDef& definition,
const DeviceInfo& device_info, const OperationDef& definition,
const Convolution3DAttributes& attr) {
const int dst_slices = DivideRoundUp(attr.weights.shape.o, 4);
const int src_slices = DivideRoundUp(attr.weights.shape.i, 4);
@ -845,15 +846,16 @@ Conv3D::ConvParams Conv3D::GuessBestParams(
attr.dilations.d == 1 &&
attr.padding.prepended.d == 0 &&
attr.padding.appended.d == 0;
return GuessBestParams(device, definition, src_slices, dst_slices,
return GuessBestParams(device_info, definition, src_slices, dst_slices,
x_kernel_is_1, y_kernel_is_1, z_kernel_is_1);
}
absl::Status CreateConv3D(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution3DAttributes& attr, Conv3D* result) {
*result = Conv3D(definition, attr, *creation_context.device);
return result->UploadData(attr.weights, attr.bias, creation_context.context);
Conv3D CreateConv3D(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution3DAttributes& attr) {
Conv3D result(definition, attr, device_info);
result.UploadData(attr.weights, attr.bias);
return result;
}
} // namespace cl

@ -24,6 +24,7 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/texture2d.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
@ -74,35 +75,32 @@ class Conv3D : public GPUOperation {
};
Conv3D(const OperationDef& definition, const Convolution3DAttributes& attr,
const CLDevice& device);
const DeviceInfo& device_info);
template <DataType T>
absl::Status UploadData(const tflite::gpu::Tensor<OHWDI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context);
void UploadData(const tflite::gpu::Tensor<OHWDI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases);
template <DataType T>
absl::Status UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights,
CLContext* context);
void UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights);
template <DataType S, typename T>
void RearrangeWeightsData(const tflite::gpu::Tensor<OHWDI, S>& weights,
absl::Span<T> dst);
friend absl::Status CreateConv3D(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution3DAttributes& attr,
Conv3D* result);
friend Conv3D CreateConv3D(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution3DAttributes& attr);
friend std::string GenerateConv3D(const OperationDef& op_def,
bool stride_correction,
const ConvParams& conv_params,
Arguments* args);
ConvParams GuessBestParams(const CLDevice& device,
ConvParams GuessBestParams(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution3DAttributes& attr);
ConvParams GuessBestParams(const CLDevice& device,
ConvParams GuessBestParams(const DeviceInfo& device_info,
const OperationDef& definition, int src_slices,
int dst_slices, bool x_kernel_is_1,
bool y_kernel_is_1, bool z_kernel_is_1);
@ -118,10 +116,9 @@ class Conv3D : public GPUOperation {
};
template <DataType T>
absl::Status Conv3D::UploadData(const tflite::gpu::Tensor<OHWDI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context) {
RETURN_IF_ERROR(UploadWeights(weights, context));
void Conv3D::UploadData(const tflite::gpu::Tensor<OHWDI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases) {
UploadWeights(weights);
TensorLinearDescriptor desc;
desc.storage_type = conv_params_.AreWeightsBuffer()
? LinearStorageType::BUFFER
@ -130,12 +127,10 @@ absl::Status Conv3D::UploadData(const tflite::gpu::Tensor<OHWDI, T>& weights,
desc.UploadLinearData(biases);
args_.AddObject("biases",
absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
return absl::OkStatus();
}
template <DataType T>
absl::Status Conv3D::UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights,
CLContext* context) {
void Conv3D::UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights) {
const int block_size = conv_params_.block_size.w;
const int dst_slices =
AlignByN(DivideRoundUp(weights.shape.o, 4), block_size);
@ -204,8 +199,6 @@ absl::Status Conv3D::UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights,
args_.AddObject("weights3",
absl::make_unique<Texture2DDescriptor>(std::move(desc3)));
}
return absl::OkStatus();
}
template <DataType S, typename T>
@ -265,9 +258,9 @@ void Conv3D::RearrangeWeightsData(const tflite::gpu::Tensor<OHWDI, S>& weights,
}
}
absl::Status CreateConv3D(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution3DAttributes& attr, Conv3D* result);
Conv3D CreateConv3D(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution3DAttributes& attr);
} // namespace cl
} // namespace gpu

@ -81,19 +81,19 @@ std::string GetComputationPart(const int3& block_size, int element_size,
return c;
}
ConvBuffer1x1::ConvParams GetBestParams(const CLDevice& device,
ConvBuffer1x1::ConvParams GetBestParams(const DeviceInfo& device_info,
const OperationDef& definition,
const BHWC& shape, int src_depth,
int dst_depth) {
ConvBuffer1x1::ConvParams conv_params;
conv_params.element_size = 4;
conv_params.block_size = int3(1, 1, 1);
if (!device.IsMali()) {
if (!device_info.IsMali()) {
return conv_params;
}
bool can_use_flt8 = (shape.w * shape.b) % 2 == 0 &&
definition.precision != CalculationsPrecision::F32;
bool is_midgard = device.IsMali() && device.info_.mali_info.IsMidgard();
bool is_midgard = device_info.IsMali() && device_info.mali_info.IsMidgard();
if (is_midgard) {
if (can_use_flt8) {
conv_params.element_size = 8;
@ -106,7 +106,7 @@ ConvBuffer1x1::ConvParams GetBestParams(const CLDevice& device,
int task_size = shape.w * shape.b * shape.h * dst_depth;
int block_size = GetRecommendedBlockSizeForConv(
device.info_, definition.precision, task_size);
device_info, definition.precision, task_size);
if (!can_use_flt8 && block_size > 4) {
block_size = 4;
@ -134,14 +134,15 @@ ConvBuffer1x1::ConvParams GetBestParams(const CLDevice& device,
return conv_params;
}
ConvBuffer1x1::ConvParams GetBestParams(const CLDevice& device,
ConvBuffer1x1::ConvParams GetBestParams(const DeviceInfo& device_info,
const OperationDef& definition,
int src_depth, int dst_depth) {
ConvBuffer1x1::ConvParams conv_params;
conv_params.element_size = 4;
conv_params.block_size = int3(1, 1, 1);
if (device.IsMali() && definition.precision == CalculationsPrecision::F16 &&
device.info_.compute_units_count <= 4) {
if (device_info.IsMali() &&
definition.precision == CalculationsPrecision::F16 &&
device_info.compute_units_count <= 4) {
conv_params.block_size.x *= 2;
}
return conv_params;
@ -345,85 +346,80 @@ bool IsConvBuffer1x1Supported(const OperationDef& definition,
attr.padding.appended.w == 0 && attr.padding.appended.h == 0;
}
absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvBuffer1x1* result, const BHWC* shape) {
if (!IsConvBuffer1x1Supported(definition, attr)) {
return absl::InvalidArgumentError("ConvBuffer1x1 doesn't supported");
}
ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* shape) {
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
ConvBuffer1x1::ConvParams conv_params;
if (shape) {
conv_params = GetBestParams(*creation_context.device, definition, *shape,
src_depth, dst_depth);
conv_params =
GetBestParams(device_info, definition, *shape, src_depth, dst_depth);
} else {
conv_params = GetBestParams(*creation_context.device, definition, src_depth,
dst_depth);
conv_params = GetBestParams(device_info, definition, src_depth, dst_depth);
}
*result = ConvBuffer1x1(definition, conv_params);
return result->UploadData(attr.weights, attr.bias, creation_context.context);
ConvBuffer1x1 result(definition, conv_params);
result.UploadData(attr.weights, attr.bias);
return result;
}
absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvBuffer1x1* result, const BHWC* shape) {
ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
const BHWC* shape) {
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
ConvBuffer1x1::ConvParams conv_params;
if (shape) {
conv_params = GetBestParams(*creation_context.device, definition, *shape,
src_depth, dst_depth);
conv_params =
GetBestParams(device_info, definition, *shape, src_depth, dst_depth);
} else {
conv_params = GetBestParams(*creation_context.device, definition, src_depth,
dst_depth);
conv_params = GetBestParams(device_info, definition, src_depth, dst_depth);
}
conv_params.block_size.x *= conv_params.block_size.y;
conv_params.block_size.y = 1;
*result = ConvBuffer1x1(definition, conv_params);
return result->UploadData(attr.weights, attr.bias, creation_context.context);
ConvBuffer1x1 result(definition, conv_params);
result.UploadData(attr.weights, attr.bias);
return result;
}
absl::Status CreateConvBuffer1x1Wino4x4To6x6(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvBuffer1x1* result,
const BHWC* shape) {
ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(
const DeviceInfo& device_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC* shape) {
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
ConvBuffer1x1::ConvParams conv_params;
if (shape) {
conv_params = GetBestParams(*creation_context.device, definition, *shape,
src_depth, dst_depth);
conv_params =
GetBestParams(device_info, definition, *shape, src_depth, dst_depth);
} else {
conv_params = GetBestParams(*creation_context.device, definition, src_depth,
dst_depth);
conv_params = GetBestParams(device_info, definition, src_depth, dst_depth);
}
conv_params.block_size.x *= conv_params.block_size.y;
conv_params.block_size.y = 1;
conv_params.different_weights_for_height = true;
*result = ConvBuffer1x1(definition, conv_params);
return result->UploadDataForWinograd4x4To6x6(
attr.weights, *creation_context.device, creation_context.context);
ConvBuffer1x1 result(definition, conv_params);
result.UploadDataForWinograd4x4To6x6(attr.weights);
return result;
}
absl::Status CreateConvBuffer1x1DynamicWeights(
const CreationContext& creation_context, const OperationDef& definition,
ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(
const DeviceInfo& device_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC& weights_shape,
ConvBuffer1x1* result, const BHWC* dst_shape) {
const BHWC* dst_shape) {
const int dst_depth = DivideRoundUp(weights_shape.b, 4);
const int src_depth = DivideRoundUp(weights_shape.c, 4);
ConvBuffer1x1::ConvParams conv_params;
if (dst_shape) {
conv_params = GetBestParams(*creation_context.device, definition,
*dst_shape, src_depth, dst_depth);
} else {
conv_params = GetBestParams(*creation_context.device, definition, src_depth,
conv_params = GetBestParams(device_info, definition, *dst_shape, src_depth,
dst_depth);
} else {
conv_params = GetBestParams(device_info, definition, src_depth, dst_depth);
}
*result = ConvBuffer1x1(definition, conv_params);
return result->UploadBiases(attr.bias, creation_context.context);
ConvBuffer1x1 result(definition, conv_params);
result.UploadBiases(attr.bias);
return result;
}
} // namespace cl
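
Note that the rewritten CreateConvBuffer1x1 no longer rejects unsupported configurations itself (the old version returned InvalidArgumentError); the supportability check presumably moves to the caller. A sketch, assuming device_info, op_def, attr, and shape are provided by the caller:

    if (IsConvBuffer1x1Supported(op_def, attr)) {
      ConvBuffer1x1 conv =
          CreateConvBuffer1x1(device_info, op_def, attr, &shape);
      // ... hand the operation off for compilation and execution ...
    }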

@ -72,39 +72,34 @@ class ConvBuffer1x1 : public GPUOperation {
private:
ConvBuffer1x1(const OperationDef& definition, const ConvParams& conv_params);
friend absl::Status CreateConvBuffer1x1(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvBuffer1x1* result,
const BHWC* shape);
friend absl::Status CreateConvBuffer1x1(
const CreationContext& creation_context, const OperationDef& definition,
const FullyConnectedAttributes& attr, ConvBuffer1x1* result,
const BHWC* shape);
friend absl::Status CreateConvBuffer1x1Wino4x4To6x6(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvBuffer1x1* result,
const BHWC* shape);
friend absl::Status CreateConvBuffer1x1DynamicWeights(
const CreationContext& creation_context, const OperationDef& definition,
friend ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* shape);
friend ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
const BHWC* shape);
friend ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(
const DeviceInfo& device_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC* shape);
friend ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(
const DeviceInfo& device_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC& weights_shape,
ConvBuffer1x1* result, const BHWC* dst_shape);
const BHWC* dst_shape);
template <DataType T>
absl::Status UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context);
void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases);
template <DataType T>
absl::Status UploadDataForWinograd4x4To6x6(
const tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
CLContext* context);
void UploadDataForWinograd4x4To6x6(
const tflite::gpu::Tensor<OHWI, T>& weights);
template <DataType T>
absl::Status UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
template <DataType T>
absl::Status UploadBiases(const tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context);
void UploadBiases(const tflite::gpu::Tensor<Linear, T>& biases);
std::string GenerateConvBuffer1x1(
const OperationDef& op_def, const ConvBuffer1x1::ConvParams& conv_params,
@ -114,32 +109,26 @@ class ConvBuffer1x1 : public GPUOperation {
};
template <DataType T>
absl::Status ConvBuffer1x1::UploadData(
const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
RETURN_IF_ERROR(UploadWeights(weights, context));
RETURN_IF_ERROR(UploadBiases(biases, context));
return absl::OkStatus();
void ConvBuffer1x1::UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases) {
UploadWeights(weights);
UploadBiases(biases);
}
template <DataType T>
absl::Status ConvBuffer1x1::UploadDataForWinograd4x4To6x6(
const tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
CLContext* context) {
void ConvBuffer1x1::UploadDataForWinograd4x4To6x6(
const tflite::gpu::Tensor<OHWI, T>& weights) {
tflite::gpu::Tensor<OHWI, T> wino_weights;
RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
RETURN_IF_ERROR(UploadWeights(wino_weights, context));
UploadWeights(wino_weights);
tflite::gpu::Tensor<Linear, DataType::FLOAT32> bias;
bias.shape = Linear(weights.shape.o);
bias.data.resize(weights.shape.o, 0.0f);
RETURN_IF_ERROR(UploadBiases(bias, context));
return absl::OkStatus();
UploadBiases(bias);
}
template <DataType T>
absl::Status ConvBuffer1x1::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
void ConvBuffer1x1::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
@ -169,12 +158,10 @@ absl::Status ConvBuffer1x1::UploadWeights(
args_.AddObject("weights",
absl::make_unique<BufferDescriptor>(std::move(desc)));
return absl::OkStatus();
}
template <DataType T>
absl::Status ConvBuffer1x1::UploadBiases(
const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
void ConvBuffer1x1::UploadBiases(const tflite::gpu::Tensor<Linear, T>& biases) {
TensorLinearDescriptor desc;
desc.storage_type = LinearStorageType::BUFFER;
desc.element_type = definition_.GetDataType();
@ -182,7 +169,6 @@ absl::Status ConvBuffer1x1::UploadBiases(
desc.UploadLinearData(biases, depth);
args_.AddObject("biases",
absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
return absl::OkStatus();
}
bool IsConvBuffer1x1Supported(const OperationDef& definition,
@ -192,27 +178,24 @@ bool IsConvBuffer1x1Supported(const OperationDef& definition,
const BHWC& weights_shape,
const Convolution2DAttributes& attr);
absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvBuffer1x1* result,
const BHWC* shape = nullptr);
ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* shape = nullptr);
absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvBuffer1x1* result,
const BHWC* shape = nullptr);
ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
const BHWC* shape = nullptr);
absl::Status CreateConvBuffer1x1DynamicWeights(
const CreationContext& creation_context, const OperationDef& definition,
ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(
const DeviceInfo& device_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC& weights_shape,
ConvBuffer1x1* result, const BHWC* dst_shape = nullptr);
const BHWC* dst_shape = nullptr);
absl::Status CreateConvBuffer1x1Wino4x4To6x6(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvBuffer1x1* result,
const BHWC* shape = nullptr);
ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(
const DeviceInfo& device_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC* shape = nullptr);
} // namespace cl
} // namespace gpu

@ -56,9 +56,8 @@ TEST_F(OpenCLOperationTest, ConvBuffer1x1SimpleWeights) {
op_def.dst_tensors.push_back(
{data_type, TensorStorageType::BUFFER, Layout::HWC});
TensorFloat32 dst_tensor;
ConvBuffer1x1 operation;
ASSERT_OK(CreateConvBuffer1x1(creation_context_, op_def, attr, &operation,
&src_tensor.shape));
ConvBuffer1x1 operation = CreateConvBuffer1x1(
creation_context_.GetDeviceInfo(), op_def, attr, &src_tensor.shape);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -92,9 +91,8 @@ TEST_F(OpenCLOperationTest, ConvBuffer1x1) {
op_def.dst_tensors.push_back(
{data_type, TensorStorageType::BUFFER, Layout::HWC});
TensorFloat32 dst_tensor;
ConvBuffer1x1 operation;
ASSERT_OK(CreateConvBuffer1x1(creation_context_, op_def, attr, &operation,
&src_tensor.shape));
ConvBuffer1x1 operation = CreateConvBuffer1x1(
creation_context_.GetDeviceInfo(), op_def, attr, &src_tensor.shape);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 4), &dst_tensor));
EXPECT_THAT(dst_tensor.data,

@ -255,10 +255,11 @@ int3 ConvConstants::GetGridSize() const {
return int3(grid_x, grid_y, 1);
}
bool IsConvConstantsSupported(const CLDevice& device,
bool IsConvConstantsSupported(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr) {
if (device.IsAMD() && definition.precision != CalculationsPrecision::F32 &&
if (device_info.IsAMD() &&
definition.precision != CalculationsPrecision::F32 &&
definition.src_tensors[0].storage_type != TensorStorageType::BUFFER) {
// BUG, some AMD GPUs crash without it
return false;
@ -271,30 +272,25 @@ bool IsConvConstantsSupported(const CLDevice& device,
? sizeof(float)
: sizeof(half);
const int filters_buffer_size = filters_count * float_size;
const int kConstantMaxSize = GetOptimalMaxConstantSize(device.info_);
const int kConstantMaxSize = GetOptimalMaxConstantSize(device_info);
const int flt4_registers = DivideRoundUp(w_shape.o, 4);
return filters_buffer_size <= kConstantMaxSize && flt4_registers <= 8;
}
absl::Status CreateConvConstants(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvConstants* result) {
if (!IsConvConstantsSupported(*creation_context.device, definition, attr)) {
return absl::InvalidArgumentError("ConvConstants doesn't supported");
}
*result = ConvConstants(definition, attr, creation_context.device->info_);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
ConvConstants CreateConvConstants(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr) {
ConvConstants result(definition, attr, device_info);
result.UploadWeights(attr.weights);
TensorLinearDescriptor desc;
desc.storage_type = LinearStorageType::BUFFER;
desc.element_type = definition.GetDataType();
desc.memory_type = MemoryType::CONSTANT;
desc.UploadLinearData(attr.bias);
result->args_.AddObject(
result.args_.AddObject(
"biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
return absl::OkStatus();
return result;
}
} // namespace cl
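
The same pattern applies to ConvConstants: the factory no longer calls IsConvConstantsSupported, so the guard presumably belongs at the call site. A sketch with caller-provided names:

    if (IsConvConstantsSupported(device_info, op_def, attr)) {
      ConvConstants conv = CreateConvConstants(device_info, op_def, attr);
      // ...
    }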

@ -45,16 +45,15 @@ class ConvConstants : public GPUOperation {
ConvConstants& operator=(const ConvConstants&) = delete;
private:
friend absl::Status CreateConvConstants(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvConstants* result);
friend ConvConstants CreateConvConstants(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr);
ConvConstants(const OperationDef& definition,
const Convolution2DAttributes& attr,
const DeviceInfo& device_info);
template <DataType T>
absl::Status UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
template <DataType S, typename T>
void RearrangeWeightsData(const tflite::gpu::Tensor<OHWI, S>& weights,
@ -75,8 +74,7 @@ class ConvConstants : public GPUOperation {
};
template <DataType T>
absl::Status ConvConstants::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
void ConvConstants::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
@ -102,8 +100,6 @@ absl::Status ConvConstants::UploadWeights(
args_.AddObject("weigths",
absl::make_unique<BufferDescriptor>(std::move(desc)));
return absl::OkStatus();
}
template <DataType S, typename T>
@ -149,14 +145,13 @@ void ConvConstants::RearrangeWeightsData(
}
}
bool IsConvConstantsSupported(const CLDevice& device,
bool IsConvConstantsSupported(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr);
absl::Status CreateConvConstants(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvConstants* result);
ConvConstants CreateConvConstants(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr);
} // namespace cl
} // namespace gpu

@ -55,9 +55,8 @@ TEST_F(OpenCLOperationTest, ConvConstantsSimpleWeights) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvConstants operation;
ASSERT_OK(
CreateConvConstants(creation_context_, op_def, attr, &operation));
ConvConstants operation =
CreateConvConstants(creation_context_.GetDeviceInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -91,9 +90,8 @@ TEST_F(OpenCLOperationTest, ConvConstants) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvConstants operation;
ASSERT_OK(
CreateConvConstants(creation_context_, op_def, attr, &operation));
ConvConstants operation =
CreateConvConstants(creation_context_.GetDeviceInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,

@ -130,33 +130,33 @@ std::string GenerateBlockCoords(const int3& block_size,
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
const Convolution2DAttributes& attr,
const CLDevice& device, const BHWC* dst_shape)
const DeviceInfo& device_info, const BHWC* dst_shape)
: GPUOperation(definition),
stride_padding_(attr.strides.w, attr.strides.h, -attr.padding.prepended.w,
-attr.padding.prepended.h),
kernel_dilation_(attr.weights.shape.w, attr.weights.shape.h,
attr.dilations.w, attr.dilations.h),
conv_params_(GuessBestParams(device, definition, attr, dst_shape)) {}
conv_params_(GuessBestParams(device_info, definition, attr, dst_shape)) {}
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC& weights_shape, const CLDevice& device,
const BHWC* dst_shape)
const BHWC& weights_shape,
const DeviceInfo& device_info, const BHWC* dst_shape)
: GPUOperation(definition),
stride_padding_(attr.strides.w, attr.strides.h, -attr.padding.prepended.w,
-attr.padding.prepended.h),
kernel_dilation_(weights_shape.w, weights_shape.h, attr.dilations.w,
attr.dilations.h),
conv_params_(GuessBestParams(device, definition, attr, weights_shape,
conv_params_(GuessBestParams(device_info, definition, attr, weights_shape,
dst_shape)) {}
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
const FullyConnectedAttributes& attr,
const CLDevice& device, const BHWC* dst_shape)
const DeviceInfo& device_info, const BHWC* dst_shape)
: GPUOperation(definition),
stride_padding_(1, 1, 0, 0),
kernel_dilation_(1, 1, 1, 1),
conv_params_(GuessBestParams(device, definition, attr, dst_shape)) {}
conv_params_(GuessBestParams(device_info, definition, attr, dst_shape)) {}
ConvPowerVR::ConvPowerVR(const OperationDef& definition)
: GPUOperation(definition),
@ -687,8 +687,8 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
}
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
const CLDevice& device, const OperationDef& definition, int src_depth,
int dst_depth, bool x_kernel_is_1, bool y_kernel_is_1,
const DeviceInfo& device_info, const OperationDef& definition,
int src_depth, int dst_depth, bool x_kernel_is_1, bool y_kernel_is_1,
bool different_weights_for_height, const BHWC* dst_shape) {
ConvParams conv_params;
conv_params.linear_hw = false;
@ -697,7 +697,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
conv_params.x_kernel_is_1 = x_kernel_is_1;
conv_params.y_kernel_is_1 = y_kernel_is_1;
conv_params.different_weights_for_height = different_weights_for_height;
if (device.IsNvidia()) {
if (device_info.IsNvidia()) {
if (different_weights_for_height) {
work_group_size_ = int3(32, 1, 1);
conv_params.work_group_launch_order = int3(2, 0, 1);
@ -721,7 +721,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
if (dst_shape) {
int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth;
float task_size_per_cu =
static_cast<float>(task_size) / device.info_.compute_units_count;
static_cast<float>(task_size) / device_info.compute_units_count;
int block_size = conv_params.block_size.x * conv_params.block_size.y *
conv_params.block_size.z;
float threads_per_cu = task_size_per_cu / block_size;
@ -742,7 +742,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
if (src_depth % 4 == 0 && conv_params.block_size.z <= 2) {
conv_params.src_depth_loop_size = 4;
}
} else if (device.IsPowerVR()) {
} else if (device_info.IsPowerVR()) {
if (different_weights_for_height) {
work_group_size_ = int3(32, 1, 1);
conv_params.work_group_launch_order = int3(2, 0, 1);
@ -790,7 +790,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
}
conv_params.block_size.x = 2;
}
} else if (device.IsAMD()) {
} else if (device_info.IsAMD()) {
if (different_weights_for_height) {
work_group_size_ = int3(32, 1, 1);
conv_params.work_group_launch_order = int3(2, 0, 1);
@ -819,12 +819,12 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
if (src_depth % 2 == 0 && src_depth >= 16) {
conv_params.src_depth_loop_size = 2;
}
} else if (device.IsMali()) {
} else if (device_info.IsMali()) {
int block_size = 2;
if (dst_shape) {
int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth;
block_size = GetRecommendedBlockSizeForConv(
device.info_, definition.precision, task_size);
device_info, definition.precision, task_size);
}
if (!x_kernel_is_1 || !y_kernel_is_1) {
block_size = std::min(block_size, 4);
@ -847,7 +847,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
conv_params.block_size = int3(1, 1, 1);
}
conv_params.src_depth_loop_size = 1;
MaliInfo mali_info = device.info_.mali_info;
MaliInfo mali_info = device_info.mali_info;
if (src_depth % 2 == 0 && block_size <= 2 && !mali_info.IsMidgard()) {
conv_params.src_depth_loop_size = 2;
}
@ -859,14 +859,14 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
conv_params.work_group_launch_order = int3(0, 1, 2);
conv_params.fixed_work_group_size = false;
conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
} else if (device.IsAdreno()) {
} else if (device_info.IsAdreno()) {
conv_params.block_size = int3(2, 2, 1);
work_group_size_ = int3(8, 2, 1);
conv_params.work_group_launch_order = int3(0, 1, 2);
conv_params.fixed_work_group_size = false;
conv_params.src_depth_loop_size = 1;
conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
} else if (device.IsIntel()) {
} else if (device_info.IsIntel()) {
if (different_weights_for_height) {
work_group_size_ = int3(16, 1, 1);
conv_params.work_group_launch_order = int3(0, 1, 2);
@ -880,9 +880,10 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
conv_params.block_size = int3(1, 1, 4);
conv_params.src_depth_loop_size = 1;
if (definition.precision != CalculationsPrecision::F32_F16 &&
device.SupportsExtension("cl_khr_subgroups") &&
device.SupportsExtension("cl_intel_required_subgroup_size") &&
device.IsCL20OrHigher() && device.SupportsSubGroupWithSize(16)) {
device_info.SupportsExtension("cl_khr_subgroups") &&
device_info.SupportsExtension("cl_intel_required_subgroup_size") &&
device_info.IsCL20OrHigher() &&
device_info.SupportsSubGroupWithSize(16)) {
conv_params.weights_upload_type =
WeightsUploadType::PRIVATE_MEM_SIMD16_BROADCAST;
} else {
@ -927,7 +928,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
}
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
const CLDevice& device, const OperationDef& definition,
const DeviceInfo& device_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC* dst_shape) {
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
@ -939,12 +940,12 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
attr.dilations.h == 1 &&
attr.padding.prepended.h == 0 &&
attr.padding.appended.h == 0;
return GuessBestParams(device, definition, src_depth, dst_depth,
return GuessBestParams(device_info, definition, src_depth, dst_depth,
x_kernel_is_1, y_kernel_is_1, false, dst_shape);
}
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
const CLDevice& device, const OperationDef& definition,
const DeviceInfo& device_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC& weights_shape,
const BHWC* dst_shape) {
const int dst_depth = DivideRoundUp(weights_shape.b, 4);
@ -955,17 +956,18 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
const bool y_kernel_is_1 =
weights_shape.h == 1 && attr.strides.h == 1 && attr.dilations.h == 1 &&
attr.padding.prepended.h == 0 && attr.padding.appended.h == 0;
return GuessBestParams(device, definition, src_depth, dst_depth,
return GuessBestParams(device_info, definition, src_depth, dst_depth,
x_kernel_is_1, y_kernel_is_1, false, dst_shape);
}
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
const CLDevice& device, const OperationDef& definition,
const DeviceInfo& device_info, const OperationDef& definition,
const FullyConnectedAttributes& attr, const BHWC* dst_shape) {
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
ConvPowerVR::ConvParams params = GuessBestParams(
device, definition, src_depth, dst_depth, true, true, false, dst_shape);
ConvPowerVR::ConvParams params =
GuessBestParams(device_info, definition, src_depth, dst_depth, true, true,
false, dst_shape);
work_group_size_.x *= work_group_size_.y;
work_group_size_.y = 1;
params.block_size.x *= params.block_size.y;
@ -974,55 +976,59 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
}
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParamsWinograd(
const CLDevice& device, const OperationDef& definition,
const DeviceInfo& device_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC* dst_shape) {
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
ConvPowerVR::ConvParams params = GuessBestParams(
device, definition, src_depth, dst_depth, true, true, true, dst_shape);
ConvPowerVR::ConvParams params =
GuessBestParams(device_info, definition, src_depth, dst_depth, true, true,
true, dst_shape);
params.block_size.x *= params.block_size.y;
params.block_size.y = 1;
return params;
}
absl::Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvPowerVR* result, const BHWC* dst_shape) {
*result = ConvPowerVR(definition, attr, *creation_context.device, dst_shape);
result->GenerateCode(creation_context.device->info_);
return result->UploadData(attr.weights, attr.bias, creation_context.context);
ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* dst_shape) {
ConvPowerVR result(definition, attr, device_info, dst_shape);
result.GenerateCode(device_info);
result.UploadData(attr.weights, attr.bias);
return result;
}
absl::Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvPowerVR* result, const BHWC* dst_shape) {
*result = ConvPowerVR(definition, attr, *creation_context.device, dst_shape);
result->GenerateCode(creation_context.device->info_);
return result->UploadData(attr.weights, attr.bias, creation_context.context);
ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
const BHWC* dst_shape) {
ConvPowerVR result(definition, attr, device_info, dst_shape);
result.GenerateCode(device_info);
result.UploadData(attr.weights, attr.bias);
return result;
}
absl::Status CreateConvPowerVRDynamicWeights(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC& weights_shape,
ConvPowerVR* result, const BHWC* dst_shape) {
*result = ConvPowerVR(definition, attr, weights_shape,
*creation_context.device, dst_shape);
result->GenerateCode(creation_context.device->info_);
return result->UploadBias(attr.bias, creation_context.context);
ConvPowerVR CreateConvPowerVRDynamicWeights(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC& weights_shape,
const BHWC* dst_shape) {
ConvPowerVR result(definition, attr, weights_shape, device_info, dst_shape);
result.GenerateCode(device_info);
result.UploadBias(attr.bias);
return result;
}
absl::Status CreateConvPowerVRWino4x4To6x6(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvPowerVR* result,
const BHWC* dst_shape) {
*result = ConvPowerVR(definition);
result->conv_params_ = result->GuessBestParamsWinograd(
*creation_context.device, definition, attr, dst_shape);
result->GenerateCode(creation_context.device->info_);
return result->UploadDataForWinograd4x4To6x6(
attr.weights, *creation_context.device, creation_context.context);
ConvPowerVR CreateConvPowerVRWino4x4To6x6(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* dst_shape) {
ConvPowerVR result(definition);
result.conv_params_ =
result.GuessBestParamsWinograd(device_info, definition, attr, dst_shape);
result.GenerateCode(device_info);
result.UploadDataForWinograd4x4To6x6(attr.weights);
return result;
}
} // namespace cl

@ -128,75 +128,68 @@ class ConvPowerVR : public GPUOperation {
};
ConvPowerVR(const OperationDef& definition,
const Convolution2DAttributes& attr, const CLDevice& device,
const BHWC* dst_shape = nullptr);
const Convolution2DAttributes& attr,
const DeviceInfo& device_info, const BHWC* dst_shape = nullptr);
ConvPowerVR(const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC& weights_shape,
const CLDevice& device, const BHWC* dst_shape = nullptr);
const DeviceInfo& device_info, const BHWC* dst_shape = nullptr);
ConvPowerVR(const OperationDef& definition,
const FullyConnectedAttributes& attr, const CLDevice& device,
const BHWC* dst_shape = nullptr);
const FullyConnectedAttributes& attr,
const DeviceInfo& device_info, const BHWC* dst_shape = nullptr);
explicit ConvPowerVR(const OperationDef& definition);
void GenerateCode(const DeviceInfo& device_info);
template <DataType T>
absl::Status UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context);
void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases);
template <DataType T>
absl::Status UploadDataForWinograd4x4To6x6(
const tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
CLContext* context);
void UploadDataForWinograd4x4To6x6(
const tflite::gpu::Tensor<OHWI, T>& weights);
template <DataType T>
absl::Status UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
template <DataType T>
absl::Status UploadBias(const tflite::gpu::Tensor<Linear, T>& bias,
CLContext* context);
void UploadBias(const tflite::gpu::Tensor<Linear, T>& bias);
friend absl::Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvPowerVR* result,
const BHWC* dst_shape);
friend ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* dst_shape);
friend absl::Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvPowerVR* result,
const BHWC* dst_shape);
friend ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
const BHWC* dst_shape);
friend absl::Status CreateConvPowerVRDynamicWeights(
const CreationContext& creation_context, const OperationDef& definition,
friend ConvPowerVR CreateConvPowerVRDynamicWeights(
const DeviceInfo& device_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC& weights_shape,
ConvPowerVR* result, const BHWC* dst_shape);
friend absl::Status CreateConvPowerVRWino4x4To6x6(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvPowerVR* result,
const BHWC* dst_shape);
ConvParams GuessBestParams(const CLDevice& device,
friend ConvPowerVR CreateConvPowerVRWino4x4To6x6(
const DeviceInfo& device_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC* dst_shape);
ConvParams GuessBestParams(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* dst_shape = nullptr);
ConvParams GuessBestParams(const CLDevice& device,
ConvParams GuessBestParams(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC& weights_shape,
const BHWC* dst_shape = nullptr);
ConvParams GuessBestParams(const CLDevice& device,
ConvParams GuessBestParams(const DeviceInfo& device_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
const BHWC* dst_shape = nullptr);
ConvParams GuessBestParamsWinograd(const CLDevice& device,
ConvParams GuessBestParamsWinograd(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* dst_shape = nullptr);
ConvParams GuessBestParams(const CLDevice& device,
ConvParams GuessBestParams(const DeviceInfo& device_info,
const OperationDef& definition, int src_depth,
int dst_depth, bool x_kernel_is_1,
bool y_kernel_is_1,
@ -213,31 +206,26 @@ class ConvPowerVR : public GPUOperation {
};
template <DataType T>
absl::Status ConvPowerVR::UploadData(
const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
RETURN_IF_ERROR(UploadWeights(weights, context));
RETURN_IF_ERROR(UploadBias(biases, context));
return absl::OkStatus();
void ConvPowerVR::UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases) {
UploadWeights(weights);
UploadBias(biases);
}
template <DataType T>
absl::Status ConvPowerVR::UploadDataForWinograd4x4To6x6(
const tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
CLContext* context) {
void ConvPowerVR::UploadDataForWinograd4x4To6x6(
const tflite::gpu::Tensor<OHWI, T>& weights) {
tflite::gpu::Tensor<OHWI, T> wino_weights;
RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
RETURN_IF_ERROR(UploadWeights(wino_weights, context));
UploadWeights(wino_weights);
tflite::gpu::Tensor<Linear, DataType::FLOAT32> biases;
biases.shape = Linear(weights.shape.o);
biases.data.resize(weights.shape.o, 0.0f);
RETURN_IF_ERROR(UploadBias(biases, context));
return absl::OkStatus();
UploadBias(biases);
}
template <DataType T>
absl::Status ConvPowerVR::UploadBias(const tflite::gpu::Tensor<Linear, T>& bias,
CLContext* context) {
void ConvPowerVR::UploadBias(const tflite::gpu::Tensor<Linear, T>& bias) {
BufferDescriptor desc;
desc.element_type = conv_params_.weights_data_type;
desc.element_size = 4;
@ -264,12 +252,10 @@ absl::Status ConvPowerVR::UploadBias(const tflite::gpu::Tensor<Linear, T>& bias,
}
args_.AddObject("biases",
absl::make_unique<BufferDescriptor>(std::move(desc)));
return absl::OkStatus();
}
template <DataType T>
absl::Status ConvPowerVR::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
void ConvPowerVR::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
@ -301,30 +287,28 @@ absl::Status ConvPowerVR::UploadWeights(
}
args_.AddObject("weights",
absl::make_unique<BufferDescriptor>(std::move(desc)));
return absl::OkStatus();
}
absl::Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvPowerVR* result,
const BHWC* dst_shape = nullptr);
ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* dst_shape = nullptr);
absl::Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvPowerVR* result,
const BHWC* dst_shape = nullptr);
ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
const BHWC* dst_shape = nullptr);
absl::Status CreateConvPowerVRDynamicWeights(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC& weights_shape,
ConvPowerVR* result, const BHWC* dst_shape = nullptr);
ConvPowerVR CreateConvPowerVRDynamicWeights(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC& weights_shape,
const BHWC* dst_shape = nullptr);
absl::Status CreateConvPowerVRWino4x4To6x6(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvPowerVR* result,
const BHWC* dst_shape = nullptr);
ConvPowerVR CreateConvPowerVRWino4x4To6x6(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* dst_shape = nullptr);
} // namespace cl
} // namespace gpu

@ -57,8 +57,8 @@ TEST_F(OpenCLOperationTest, ConvPowerVR1x1SimpleWeights) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvPowerVR operation;
ASSERT_OK(CreateConvPowerVR(creation_context_, op_def, attr, &operation));
ConvPowerVR operation =
CreateConvPowerVR(creation_context_.GetDeviceInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -92,8 +92,8 @@ TEST_F(OpenCLOperationTest, ConvPowerVR1x1) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvPowerVR operation;
ASSERT_OK(CreateConvPowerVR(creation_context_, op_def, attr, &operation));
ConvPowerVR operation =
CreateConvPowerVR(creation_context_.GetDeviceInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -127,8 +127,8 @@ TEST_F(OpenCLOperationTest, ConvPowerVRSimpleWeights) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvPowerVR operation;
ASSERT_OK(CreateConvPowerVR(creation_context_, op_def, attr, &operation));
ConvPowerVR operation =
CreateConvPowerVR(creation_context_.GetDeviceInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -162,8 +162,8 @@ TEST_F(OpenCLOperationTest, ConvPowerVR) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvPowerVR operation;
ASSERT_OK(CreateConvPowerVR(creation_context_, op_def, attr, &operation));
ConvPowerVR operation =
CreateConvPowerVR(creation_context_.GetDeviceInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,

@ -427,33 +427,33 @@ void ConvTexture::GetPossibleKernelWorkGroups(
work_groups);
}
absl::Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvTexture* result) {
*result = ConvTexture(definition, attr);
result->GenerateCode(creation_context.device->info_);
return result->UploadData(attr.weights, attr.bias, creation_context.context);
ConvTexture CreateConvTexture(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr) {
ConvTexture result(definition, attr);
result.GenerateCode(device_info);
result.UploadData(attr.weights, attr.bias);
return result;
}
absl::Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvTexture* result) {
*result = ConvTexture(definition);
result->GenerateCode(creation_context.device->info_);
return result->UploadData(attr.weights, attr.bias, creation_context.context);
ConvTexture CreateConvTexture(const DeviceInfo& device_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr) {
ConvTexture result(definition);
result.GenerateCode(device_info);
result.UploadData(attr.weights, attr.bias);
return result;
}
absl::Status CreateConvTextureWino4x4To6x6(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvTexture* result) {
*result = ConvTexture(definition);
result->different_weights_for_height_ = true;
result->block_size_ = {4, 1, 2};
result->GenerateCode(creation_context.device->info_);
return result->UploadDataForWinograd4x4To6x6(
attr.weights, *creation_context.device, creation_context.context);
ConvTexture CreateConvTextureWino4x4To6x6(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr) {
ConvTexture result(definition);
result.different_weights_for_height_ = true;
result.block_size_ = {4, 1, 2};
result.GenerateCode(device_info);
result.UploadDataForWinograd4x4To6x6(attr.weights);
return result;
}
} // namespace cl

@ -56,35 +56,30 @@ class ConvTexture : public GPUOperation {
ConvTexture& operator=(const ConvTexture&) = delete;
private:
friend absl::Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvTexture* result);
friend absl::Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvTexture* result);
friend ConvTexture CreateConvTexture(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr);
friend ConvTexture CreateConvTexture(const DeviceInfo& device_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr);
friend absl::Status CreateConvTextureWino4x4To6x6(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvTexture* result);
friend ConvTexture CreateConvTextureWino4x4To6x6(
const DeviceInfo& device_info, const OperationDef& definition,
const Convolution2DAttributes& attr);
ConvTexture(const OperationDef& definition,
const Convolution2DAttributes& attr);
explicit ConvTexture(const OperationDef& definition);
template <DataType T>
absl::Status UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context);
void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases);
template <DataType T>
absl::Status UploadDataForWinograd4x4To6x6(
const tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
CLContext* context);
void UploadDataForWinograd4x4To6x6(
const tflite::gpu::Tensor<OHWI, T>& weights);
template <DataType T>
absl::Status UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
template <DataType S, typename T>
void RearrangeWeightsData(const tflite::gpu::Tensor<OHWI, S>& weights,
@ -113,10 +108,9 @@ class ConvTexture : public GPUOperation {
};
template <DataType T>
absl::Status ConvTexture::UploadData(
const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
RETURN_IF_ERROR(UploadWeights(weights, context));
void ConvTexture::UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases) {
UploadWeights(weights);
TensorLinearDescriptor desc;
desc.storage_type = LinearStorageType::TEXTURE_2D;
@ -124,16 +118,14 @@ absl::Status ConvTexture::UploadData(
desc.UploadLinearData(biases);
args_.AddObject("biases",
absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
return absl::OkStatus();
}
template <DataType T>
absl::Status ConvTexture::UploadDataForWinograd4x4To6x6(
const tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
CLContext* context) {
void ConvTexture::UploadDataForWinograd4x4To6x6(
const tflite::gpu::Tensor<OHWI, T>& weights) {
tflite::gpu::Tensor<OHWI, T> wino_weights;
RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
RETURN_IF_ERROR(UploadWeights(wino_weights, context));
UploadWeights(wino_weights);
tflite::gpu::Tensor<Linear, DataType::FLOAT32> bias;
bias.shape = Linear(1);
@ -144,12 +136,10 @@ absl::Status ConvTexture::UploadDataForWinograd4x4To6x6(
desc.UploadLinearData(bias);
args_.AddObject("biases",
absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
return absl::OkStatus();
}
template <DataType T>
absl::Status ConvTexture::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
void ConvTexture::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
int dst_depth = DivideRoundUp(weights.shape.o, 4);
dst_depth = AlignByN(dst_depth, block_size_.z);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
@ -213,7 +203,6 @@ absl::Status ConvTexture::UploadWeights(
absl::make_unique<Texture2DDescriptor>(std::move(desc2)));
args_.AddObject("weights3",
absl::make_unique<Texture2DDescriptor>(std::move(desc3)));
return absl::OkStatus();
}
template <DataType S, typename T>
@ -261,19 +250,17 @@ void ConvTexture::RearrangeWeightsData(
}
}
absl::Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvTexture* result);
ConvTexture CreateConvTexture(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr);
absl::Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvTexture* result);
ConvTexture CreateConvTexture(const DeviceInfo& device_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr);
absl::Status CreateConvTextureWino4x4To6x6(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvTexture* result);
ConvTexture CreateConvTextureWino4x4To6x6(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr);
} // namespace cl
} // namespace gpu

View File

@ -55,8 +55,8 @@ TEST_F(OpenCLOperationTest, ConvTextureSimpleWeights) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvTexture operation;
ASSERT_OK(CreateConvTexture(creation_context_, op_def, attr, &operation));
ConvTexture operation =
CreateConvTexture(creation_context_.GetDeviceInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -90,8 +90,8 @@ TEST_F(OpenCLOperationTest, ConvTexture) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvTexture operation;
ASSERT_OK(CreateConvTexture(creation_context_, op_def, attr, &operation));
ConvTexture operation =
CreateConvTexture(creation_context_.GetDeviceInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,

View File

@ -110,22 +110,20 @@ int3 FullyConnected::GetGridSize() const {
return int3(dst_[0]->Slices(), 1, 1);
}
absl::Status CreateFullyConnected(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
FullyConnected* result) {
*result = FullyConnected(definition, creation_context.device->info_);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
FullyConnected CreateFullyConnected(const DeviceInfo& device_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr) {
FullyConnected result(definition, device_info);
result.UploadWeights(attr.weights);
TensorLinearDescriptor desc;
desc.storage_type = LinearStorageType::TEXTURE_2D;
desc.element_type = definition.GetDataType();
desc.UploadLinearData(attr.bias);
result->args_.AddObject(
result.args_.AddObject(
"biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
return absl::OkStatus();
return result;
}
} // namespace cl

View File

@ -105,21 +105,20 @@ class FullyConnected : public GPUOperation {
private:
FullyConnected(const OperationDef& definition, const DeviceInfo& device_info);
friend absl::Status CreateFullyConnected(
const CreationContext& creation_context, const OperationDef& definition,
const FullyConnectedAttributes& attr, FullyConnected* result);
friend FullyConnected CreateFullyConnected(
const DeviceInfo& device_info, const OperationDef& definition,
const FullyConnectedAttributes& attr);
template <DataType T>
absl::Status UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
std::string GetFullyConnectedKernelCode(const OperationDef& op_def,
const int3& work_group_size);
};
template <DataType T>
absl::Status FullyConnected::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
void FullyConnected::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights) {
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
@ -144,13 +143,11 @@ absl::Status FullyConnected::UploadWeights(
args_.AddObject("weights",
absl::make_unique<BufferDescriptor>(std::move(desc)));
return absl::OkStatus();
}
absl::Status CreateFullyConnected(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
FullyConnected* result);
FullyConnected CreateFullyConnected(const DeviceInfo& device_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr);
} // namespace cl
} // namespace gpu

View File

@ -52,9 +52,8 @@ TEST_F(OpenCLOperationTest, FullyConnected) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
FullyConnected operation;
ASSERT_OK(
CreateFullyConnected(creation_context_, op_def, attr, &operation));
FullyConnected operation =
CreateFullyConnected(creation_context_.GetDeviceInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 1, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {14.5f, 37.5f}));

View File

@ -68,6 +68,8 @@ struct CreationContext {
CLContext* context;
CLCommandQueue* queue;
ProgramCache* cache;
const DeviceInfo& GetDeviceInfo() const { return device->info_; }
};
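The new GetDeviceInfo() accessor is the bridge for call sites that still hold a CreationContext but need to reach the DeviceInfo-only factories, as the updated tests do. A minimal sketch, assuming the usual conv_powervr.h header path; the helper name is illustrative:

#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h"  // assumed path

namespace tflite {
namespace gpu {
namespace cl {

// Hypothetical bridge: only device information participates in kernel
// construction, so the CLContext/queue/cache held by CreationContext are not
// touched here.
ConvPowerVR MakeConvForContext(const CreationContext& creation_context,
                               const OperationDef& op_def,
                               const Convolution2DAttributes& attr) {
  return CreateConvPowerVR(creation_context.GetDeviceInfo(), op_def, attr);
}

}  // namespace cl
}  // namespace gpu
}  // namespace tflite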
struct OperationDef {

View File

@ -234,7 +234,7 @@ std::string Winograd4x4To36::GetWinograd4x4To36Code(
return c;
}
absl::Status Winograd4x4To36::UploadBt(CLContext* context) {
void Winograd4x4To36::UploadBt() {
tflite::gpu::Tensor<Linear, DataType::FLOAT32> bt_aligned;
bt_aligned.shape = Linear(6 * 8);
bt_aligned.data.resize(6 * 8);
@ -253,7 +253,6 @@ absl::Status Winograd4x4To36::UploadBt(CLContext* context) {
desc.UploadLinearData(bt_aligned);
args_.AddObject("bt",
absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
return absl::OkStatus();
}
int3 Winograd4x4To36::SelectBestWorkGroup(const KernelInfo& kernel_info) const {
@ -298,13 +297,12 @@ void Winograd4x4To36::GetPossibleKernelWorkGroups(
}
}
absl::Status CreateWinograd4x4To36(const CreationContext& creation_context,
const OperationDef& definition,
const Padding2D& padding,
Winograd4x4To36* result) {
*result =
Winograd4x4To36(definition, padding, creation_context.device->info_);
return result->UploadBt(creation_context.context);
Winograd4x4To36 CreateWinograd4x4To36(const DeviceInfo& device_info,
const OperationDef& definition,
const Padding2D& padding) {
Winograd4x4To36 result(definition, padding, device_info);
result.UploadBt();
return result;
}
Winograd36To4x4::Winograd36To4x4(const OperationDef& definition,
@ -437,7 +435,7 @@ std::string Winograd36To4x4::GetWinograd36To4x4Code(
return c;
}
absl::Status Winograd36To4x4::UploadAt(CLContext* context) {
void Winograd36To4x4::UploadAt() {
tflite::gpu::Tensor<Linear, DataType::FLOAT32> at_aligned;
at_aligned.shape = Linear(4 * 8);
at_aligned.data.resize(4 * 8);
@ -456,7 +454,6 @@ absl::Status Winograd36To4x4::UploadAt(CLContext* context) {
desc.UploadLinearData(at_aligned);
args_.AddObject("at",
absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
return absl::OkStatus();
}
int3 Winograd36To4x4::SelectBestWorkGroup(const KernelInfo& kernel_info) const {
@ -496,18 +493,18 @@ void Winograd36To4x4::GetPossibleKernelWorkGroups(
}
}
absl::Status CreateWinograd36To4x4(
const CreationContext& creation_context, const OperationDef& definition,
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases,
Winograd36To4x4* result) {
*result = Winograd36To4x4(definition, creation_context.device->info_);
Winograd36To4x4 CreateWinograd36To4x4(
const DeviceInfo& device_info, const OperationDef& definition,
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases) {
Winograd36To4x4 result(definition, device_info);
TensorLinearDescriptor desc;
desc.storage_type = LinearStorageType::TEXTURE_2D;
desc.element_type = definition.GetDataType();
desc.UploadLinearData(biases);
result->args_.AddObject(
result.args_.AddObject(
"biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
return result->UploadAt(creation_context.context);
result.UploadAt();
return result;
}
} // namespace cl
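As a usage sketch, the helper below builds the pair of Winograd transforms that bracket a 3x3 convolution, including the bias vector the 36-to-4x4 stage expects; here the biases are assumed to be zero because the convolution between the two transforms carries the real bias. Helper name and include paths are assumptions; the factory signatures are the ones introduced above.

#include <memory>
#include <utility>

#include "absl/memory/memory.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/winograd.h"  // assumed path

namespace tflite {
namespace gpu {
namespace cl {

// Hypothetical helper. UploadBt()/UploadAt() now run inside the factories and
// only stage TensorLinearDescriptor objects in args_, so nothing here can fail.
void MakeWinogradTransforms(const DeviceInfo& device_info,
                            const OperationDef& up_def,
                            const OperationDef& down_def,
                            const Padding2D& padding, int output_channels,
                            std::unique_ptr<GPUOperation>* up,
                            std::unique_ptr<GPUOperation>* down) {
  Winograd4x4To36 wino_up = CreateWinograd4x4To36(device_info, up_def, padding);
  *up = absl::make_unique<Winograd4x4To36>(std::move(wino_up));

  // Zero biases: assumed here that the convolution stage adds the real bias.
  tflite::gpu::Tensor<Linear, DataType::FLOAT32> biases;
  biases.shape = Linear(output_channels);
  biases.data.resize(output_channels, 0.0f);

  Winograd36To4x4 wino_down =
      CreateWinograd36To4x4(device_info, down_def, biases);
  *down = absl::make_unique<Winograd36To4x4>(std::move(wino_down));
}

}  // namespace cl
}  // namespace gpu
}  // namespace tflite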

View File

@ -50,11 +50,11 @@ class Winograd4x4To36 : public GPUOperation {
Winograd4x4To36& operator=(const Winograd4x4To36&) = delete;
private:
friend absl::Status CreateWinograd4x4To36(
const CreationContext& creation_context, const OperationDef& definition,
const Padding2D& padding, Winograd4x4To36* result);
friend Winograd4x4To36 CreateWinograd4x4To36(const DeviceInfo& device_info,
const OperationDef& definition,
const Padding2D& padding);
absl::Status UploadBt(CLContext* context);
void UploadBt();
std::string GetWinograd4x4To36Code(const OperationDef& op_def);
@ -64,10 +64,9 @@ class Winograd4x4To36 : public GPUOperation {
Padding2D padding_;
};
absl::Status CreateWinograd4x4To36(const CreationContext& creation_context,
const OperationDef& definition,
const Padding2D& padding,
Winograd4x4To36* result);
Winograd4x4To36 CreateWinograd4x4To36(const DeviceInfo& device_info,
const OperationDef& definition,
const Padding2D& padding);
class Winograd36To4x4 : public GPUOperation {
public:
@ -88,12 +87,11 @@ class Winograd36To4x4 : public GPUOperation {
Winograd36To4x4& operator=(const Winograd36To4x4&) = delete;
private:
friend absl::Status CreateWinograd36To4x4(
const CreationContext& creation_context, const OperationDef& definition,
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases,
Winograd36To4x4* result);
friend Winograd36To4x4 CreateWinograd36To4x4(
const DeviceInfo& device_info, const OperationDef& definition,
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases);
absl::Status UploadAt(CLContext* context);
void UploadAt();
std::string GetWinograd36To4x4Code(const OperationDef& op_def);
@ -101,10 +99,9 @@ class Winograd36To4x4 : public GPUOperation {
int3 SelectBestWorkGroup(const KernelInfo& kernel_info) const;
};
absl::Status CreateWinograd36To4x4(
const CreationContext& creation_context, const OperationDef& definition,
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases,
Winograd36To4x4* result);
Winograd36To4x4 CreateWinograd36To4x4(
const DeviceInfo& device_info, const OperationDef& definition,
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases);
} // namespace cl
} // namespace gpu

View File

@ -93,9 +93,8 @@ TEST_F(OpenCLOperationTest, Winograd4x4To36) {
Padding2D padding;
padding.prepended = HW(1, 1);
padding.appended = HW(1, 1);
Winograd4x4To36 wino_up;
ASSERT_OK(
CreateWinograd4x4To36(creation_context_, op_def, padding, &wino_up));
Winograd4x4To36 wino_up = CreateWinograd4x4To36(
creation_context_.GetDeviceInfo(), op_def, padding);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &wino_up,
BHWC(1, 36, 1, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), dst_ref.data));
@ -162,9 +161,8 @@ TEST_F(OpenCLOperationTest, Winograd36To4x4) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
Winograd36To4x4 wino_down;
ASSERT_OK(
CreateWinograd36To4x4(creation_context_, op_def, biases, &wino_down));
Winograd36To4x4 wino_down = CreateWinograd36To4x4(
creation_context_.GetDeviceInfo(), op_def, biases);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &wino_down,
BHWC(1, 4, 4, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), dst_ref.data));

View File

@ -32,17 +32,15 @@ namespace {
absl::Status SelectConvolutionAdreno(const Convolution2DAttributes& attr,
const BHWC& dst_shape,
const CreationContext& creation_context,
const DeviceInfo& device_info,
const OperationDef& op_def,
ModelHints hints,
std::unique_ptr<GPUOperation>* ptr) {
if (IsConvConstantsSupported(*creation_context.device, op_def, attr)) {
ConvConstants conv;
RETURN_IF_ERROR(CreateConvConstants(creation_context, op_def, attr, &conv));
if (IsConvConstantsSupported(device_info, op_def, attr)) {
ConvConstants conv = CreateConvConstants(device_info, op_def, attr);
*ptr = absl::make_unique<ConvConstants>(std::move(conv));
} else {
ConvTexture conv;
RETURN_IF_ERROR(CreateConvTexture(creation_context, op_def, attr, &conv));
ConvTexture conv = CreateConvTexture(device_info, op_def, attr);
*ptr = absl::make_unique<ConvTexture>(std::move(conv));
}
return absl::OkStatus();
@ -50,23 +48,20 @@ absl::Status SelectConvolutionAdreno(const Convolution2DAttributes& attr,
absl::Status SelectConvolutionWinogradAdreno(
const Convolution2DAttributes& attr, const BHWC& dst_shape,
const CreationContext& creation_context, const OperationDef& op_def,
ModelHints hints, std::unique_ptr<GPUOperation>* ptr) {
ConvTexture conv;
RETURN_IF_ERROR(
CreateConvTextureWino4x4To6x6(creation_context, op_def, attr, &conv));
const DeviceInfo& device_info, const OperationDef& op_def, ModelHints hints,
std::unique_ptr<GPUOperation>* ptr) {
ConvTexture conv = CreateConvTextureWino4x4To6x6(device_info, op_def, attr);
*ptr = absl::make_unique<ConvTexture>(std::move(conv));
return absl::OkStatus();
}
absl::Status SelectConvolutionDynamicWeightsAdreno(
const Convolution2DAttributes& attr, const BHWC& weights_shape,
const BHWC& dst_shape, const CreationContext& creation_context,
const BHWC& dst_shape, const DeviceInfo& device_info,
const OperationDef& op_def, ModelHints hints,
std::unique_ptr<GPUOperation>* ptr, ConvWeightsDescription* weights_desc) {
ConvPowerVR conv;
RETURN_IF_ERROR(CreateConvPowerVRDynamicWeights(
creation_context, op_def, attr, weights_shape, &conv, &dst_shape));
ConvPowerVR conv = CreateConvPowerVRDynamicWeights(
device_info, op_def, attr, weights_shape, &dst_shape);
*weights_desc = conv.GetConvWeightsDescription();
*ptr = absl::make_unique<ConvPowerVR>(std::move(conv));
return absl::OkStatus();
@ -74,86 +69,77 @@ absl::Status SelectConvolutionDynamicWeightsAdreno(
absl::Status SelectConvolutionNVidia(const Convolution2DAttributes& attr,
const BHWC& dst_shape,
const CreationContext& creation_context,
const DeviceInfo& device_info,
const OperationDef& op_def,
std::unique_ptr<GPUOperation>* ptr) {
if (IsConvConstantsSupported(*creation_context.device, op_def, attr)) {
ConvConstants conv;
RETURN_IF_ERROR(CreateConvConstants(creation_context, op_def, attr, &conv));
if (IsConvConstantsSupported(device_info, op_def, attr)) {
ConvConstants conv = CreateConvConstants(device_info, op_def, attr);
*ptr = absl::make_unique<ConvConstants>(std::move(conv));
} else {
ConvPowerVR conv;
RETURN_IF_ERROR(
CreateConvPowerVR(creation_context, op_def, attr, &conv, &dst_shape));
ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
*ptr = absl::make_unique<ConvPowerVR>(std::move(conv));
}
return absl::OkStatus();
}
absl::Status SelectConvolutionPowerVR(const Convolution2DAttributes& attr,
const CreationContext& creation_context,
const DeviceInfo& device_info,
const OperationDef& op_def,
std::unique_ptr<GPUOperation>* ptr) {
ConvPowerVR conv;
RETURN_IF_ERROR(CreateConvPowerVR(creation_context, op_def, attr, &conv));
ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr);
*ptr = absl::make_unique<ConvPowerVR>(std::move(conv));
return absl::OkStatus();
}
absl::Status SelectConvolutionMali(const Convolution2DAttributes& attr,
const BHWC& dst_shape,
const CreationContext& creation_context,
const DeviceInfo& device_info,
const OperationDef& op_def,
std::unique_ptr<GPUOperation>* ptr) {
if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER &&
IsConvBuffer1x1Supported(op_def, attr)) {
ConvBuffer1x1 conv;
RETURN_IF_ERROR(
CreateConvBuffer1x1(creation_context, op_def, attr, &conv, &dst_shape));
ConvBuffer1x1 conv =
CreateConvBuffer1x1(device_info, op_def, attr, &dst_shape);
*ptr = absl::make_unique<ConvBuffer1x1>(std::move(conv));
} else {
ConvPowerVR conv;
RETURN_IF_ERROR(
CreateConvPowerVR(creation_context, op_def, attr, &conv, &dst_shape));
ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
*ptr = absl::make_unique<ConvPowerVR>(std::move(conv));
}
return absl::OkStatus();
}
absl::Status SelectConvolutionWinogradMali(
const Convolution2DAttributes& attr, const BHWC& dst_shape,
const CreationContext& creation_context, const OperationDef& op_def,
std::unique_ptr<GPUOperation>* ptr) {
absl::Status SelectConvolutionWinogradMali(const Convolution2DAttributes& attr,
const BHWC& dst_shape,
const DeviceInfo& device_info,
const OperationDef& op_def,
std::unique_ptr<GPUOperation>* ptr) {
if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER) {
ConvBuffer1x1 conv;
RETURN_IF_ERROR(CreateConvBuffer1x1Wino4x4To6x6(creation_context, op_def,
attr, &conv, &dst_shape));
ConvBuffer1x1 conv =
CreateConvBuffer1x1Wino4x4To6x6(device_info, op_def, attr, &dst_shape);
*ptr = absl::make_unique<ConvBuffer1x1>(std::move(conv));
} else {
ConvPowerVR conv;
RETURN_IF_ERROR(CreateConvPowerVRWino4x4To6x6(creation_context, op_def,
attr, &conv, &dst_shape));
ConvPowerVR conv =
CreateConvPowerVRWino4x4To6x6(device_info, op_def, attr, &dst_shape);
*ptr = absl::make_unique<ConvPowerVR>(std::move(conv));
}
return absl::OkStatus();
}
absl::Status SelectConvolutionDynamicWeightsMali(
const Convolution2DAttributes& attr, const BHWC& weights_shape,
const BHWC& dst_shape, const CreationContext& creation_context,
const BHWC& dst_shape, const DeviceInfo& device_info,
const OperationDef& op_def, ModelHints hints,
std::unique_ptr<GPUOperation>* ptr, ConvWeightsDescription* weights_desc) {
if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER &&
IsConvBuffer1x1Supported(op_def, weights_shape, attr)) {
ConvBuffer1x1 conv;
RETURN_IF_ERROR(CreateConvBuffer1x1DynamicWeights(
creation_context, op_def, attr, weights_shape, &conv, &dst_shape));
ConvBuffer1x1 conv = CreateConvBuffer1x1DynamicWeights(
device_info, op_def, attr, weights_shape, &dst_shape);
*weights_desc = conv.GetConvWeightsDescription();
*ptr = absl::make_unique<ConvBuffer1x1>(std::move(conv));
} else {
ConvPowerVR conv;
RETURN_IF_ERROR(CreateConvPowerVRDynamicWeights(
creation_context, op_def, attr, weights_shape, &conv, &dst_shape));
ConvPowerVR conv = CreateConvPowerVRDynamicWeights(
device_info, op_def, attr, weights_shape, &dst_shape);
*weights_desc = conv.GetConvWeightsDescription();
*ptr = absl::make_unique<ConvPowerVR>(std::move(conv));
}
@ -164,70 +150,65 @@ absl::Status SelectConvolutionDynamicWeightsMali(
absl::Status SelectConvolution(const Convolution2DAttributes& attr,
const BHWC& dst_shape,
const CreationContext& creation_context,
const DeviceInfo& device_info,
const OperationDef& op_def, ModelHints hints,
std::unique_ptr<GPUOperation>* ptr) {
const auto& device_info = creation_context.device->info_;
if (device_info.IsAdreno()) {
return SelectConvolutionAdreno(attr, dst_shape, creation_context, op_def,
hints, ptr);
return SelectConvolutionAdreno(attr, dst_shape, device_info, op_def, hints,
ptr);
} else if (device_info.IsPowerVR() || device_info.IsAMD() ||
device_info.IsIntel()) {
return SelectConvolutionPowerVR(attr, creation_context, op_def, ptr);
return SelectConvolutionPowerVR(attr, device_info, op_def, ptr);
} else if (device_info.IsNvidia()) {
return SelectConvolutionNVidia(attr, dst_shape, creation_context, op_def,
ptr);
return SelectConvolutionNVidia(attr, dst_shape, device_info, op_def, ptr);
} else if (device_info.IsMali()) {
return SelectConvolutionMali(attr, dst_shape, creation_context, op_def,
ptr);
return SelectConvolutionMali(attr, dst_shape, device_info, op_def, ptr);
} else {
return SelectConvolutionAdreno(attr, dst_shape, creation_context, op_def,
hints, ptr);
return SelectConvolutionAdreno(attr, dst_shape, device_info, op_def, hints,
ptr);
}
}
absl::Status SelectConvolutionForWinograd(
const Convolution2DAttributes& attr, const BHWC& dst_shape,
const CreationContext& creation_context, const OperationDef& op_def,
ModelHints hints, std::unique_ptr<GPUOperation>* ptr) {
const auto& device_info = creation_context.device->info_;
absl::Status SelectConvolutionForWinograd(const Convolution2DAttributes& attr,
const BHWC& dst_shape,
const DeviceInfo& device_info,
const OperationDef& op_def,
ModelHints hints,
std::unique_ptr<GPUOperation>* ptr) {
if (device_info.IsAdreno()) {
return SelectConvolutionWinogradAdreno(attr, dst_shape, creation_context,
op_def, hints, ptr);
return SelectConvolutionWinogradAdreno(attr, dst_shape, device_info, op_def,
hints, ptr);
} else if (device_info.IsPowerVR() || device_info.IsAMD() ||
device_info.IsNvidia() || device_info.IsIntel()) {
ConvPowerVR conv;
RETURN_IF_ERROR(CreateConvPowerVRWino4x4To6x6(creation_context, op_def,
attr, &conv, &dst_shape));
*ptr = absl::make_unique<ConvPowerVR>(std::move(conv));
return absl::OkStatus();
ConvPowerVR conv =
CreateConvPowerVRWino4x4To6x6(device_info, op_def, attr, &dst_shape);
*ptr = absl::make_unique<ConvPowerVR>(std::move(conv));
return absl::OkStatus();
} else if (device_info.IsMali()) {
return SelectConvolutionWinogradMali(attr, dst_shape, creation_context,
op_def, ptr);
return SelectConvolutionWinogradMali(attr, dst_shape, device_info, op_def,
ptr);
} else {
return SelectConvolutionWinogradAdreno(attr, dst_shape, creation_context,
op_def, hints, ptr);
return SelectConvolutionWinogradAdreno(attr, dst_shape, device_info, op_def,
hints, ptr);
}
}
absl::Status SelectConvolutionWithDynamicWeights(
const Convolution2DAttributes& attr, const BHWC& weights_shape,
const BHWC& dst_shape, const CreationContext& creation_context,
const BHWC& dst_shape, const DeviceInfo& device_info,
const OperationDef& op_def, ModelHints hints,
std::unique_ptr<GPUOperation>* ptr, ConvWeightsDescription* weights_desc) {
const auto& device_info = creation_context.device->info_;
if (device_info.IsAdreno()) {
return SelectConvolutionDynamicWeightsAdreno(attr, weights_shape, dst_shape,
creation_context, op_def,
hints, ptr, weights_desc);
device_info, op_def, hints,
ptr, weights_desc);
} else if (device_info.IsMali()) {
return SelectConvolutionDynamicWeightsMali(attr, weights_shape, dst_shape,
creation_context, op_def, hints,
ptr, weights_desc);
device_info, op_def, hints, ptr,
weights_desc);
} else {
ConvPowerVR conv;
RETURN_IF_ERROR(CreateConvPowerVRDynamicWeights(
creation_context, op_def, attr, weights_shape, &conv, &dst_shape));
ConvPowerVR conv = CreateConvPowerVRDynamicWeights(
device_info, op_def, attr, weights_shape, &dst_shape);
*weights_desc = conv.GetConvWeightsDescription();
*ptr = absl::make_unique<ConvPowerVR>(std::move(conv));
return absl::OkStatus();
@ -235,8 +216,7 @@ absl::Status SelectConvolutionWithDynamicWeights(
}
absl::Status SelectConverterToConvWeights(
const ConvWeightsDescription& weights_desc,
const CreationContext& creation_context, const OperationDef& op_def,
const ConvWeightsDescription& weights_desc, const OperationDef& op_def,
ModelHints hints, std::unique_ptr<GPUOperation>* ptr) {
ConverterToConvWeights converter =
ConverterToConvWeights(op_def, weights_desc);

View File

@ -31,24 +31,25 @@ namespace cl {
absl::Status SelectConvolution(const Convolution2DAttributes& attr,
const BHWC& dst_shape,
const CreationContext& creation_context,
const DeviceInfo& device_info,
const OperationDef& op_def, ModelHints hints,
std::unique_ptr<GPUOperation>* ptr);
absl::Status SelectConvolutionForWinograd(
const Convolution2DAttributes& attr, const BHWC& dst_shape,
const CreationContext& creation_context, const OperationDef& op_def,
ModelHints hints, std::unique_ptr<GPUOperation>* ptr);
absl::Status SelectConvolutionForWinograd(const Convolution2DAttributes& attr,
const BHWC& dst_shape,
const DeviceInfo& device_info,
const OperationDef& op_def,
ModelHints hints,
std::unique_ptr<GPUOperation>* ptr);
absl::Status SelectConvolutionWithDynamicWeights(
const Convolution2DAttributes& attr, const BHWC& weights_shape,
const BHWC& dst_shape, const CreationContext& creation_context,
const BHWC& dst_shape, const DeviceInfo& device_info,
const OperationDef& op_def, ModelHints hints,
std::unique_ptr<GPUOperation>* ptr, ConvWeightsDescription* weights_desc);
absl::Status SelectConverterToConvWeights(
const ConvWeightsDescription& weights_desc,
const CreationContext& creation_context, const OperationDef& op_def,
const ConvWeightsDescription& weights_desc, const OperationDef& op_def,
ModelHints hints, std::unique_ptr<GPUOperation>* ptr);
} // namespace cl
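For the dynamic-weights path, the selector hands back two things: the convolution itself and the ConvWeightsDescription naming the layout its weights must be rearranged into; the converter is then chosen for that layout. A hedged sketch of a caller follows (wrapper name and include path are assumptions; RETURN_IF_ERROR is the status macro already used in these files):

#include <memory>

#include "tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.h"  // assumed path

namespace tflite {
namespace gpu {
namespace cl {

// Hypothetical caller for convolutions whose weights arrive as a runtime
// tensor rather than constants.
absl::Status PickDynamicWeightsConvolution(
    const Convolution2DAttributes& attr, const BHWC& weights_shape,
    const BHWC& dst_shape, const DeviceInfo& device_info,
    const OperationDef& conv_def, const OperationDef& converter_def,
    ModelHints hints, std::unique_ptr<GPUOperation>* conv_op,
    std::unique_ptr<GPUOperation>* converter_op) {
  ConvWeightsDescription weights_desc;
  RETURN_IF_ERROR(SelectConvolutionWithDynamicWeights(
      attr, weights_shape, dst_shape, device_info, conv_def, hints, conv_op,
      &weights_desc));
  // The weights converter no longer needs any device information at all.
  return SelectConverterToConvWeights(weights_desc, converter_def, hints,
                                      converter_op);
}

}  // namespace cl
}  // namespace gpu
}  // namespace tflite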

View File

@ -27,97 +27,87 @@ namespace tflite {
namespace gpu {
namespace cl {
absl::Status SelectFullyConnectedGeneric(
const FullyConnectedAttributes& attr,
const CreationContext& creation_context, const OperationDef& op_def,
int batch_size, std::unique_ptr<GPUOperation>* ptr) {
absl::Status SelectFullyConnectedGeneric(const FullyConnectedAttributes& attr,
const DeviceInfo& device_info,
const OperationDef& op_def,
int batch_size,
std::unique_ptr<GPUOperation>* ptr) {
if (op_def.IsBatchSupported()) {
ConvTexture conv;
RETURN_IF_ERROR(CreateConvTexture(creation_context, op_def, attr, &conv));
ConvTexture conv = CreateConvTexture(device_info, op_def, attr);
*ptr = absl::make_unique<ConvTexture>(std::move(conv));
} else {
FullyConnected fc;
RETURN_IF_ERROR(CreateFullyConnected(creation_context, op_def, attr, &fc));
FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);
*ptr = absl::make_unique<FullyConnected>(std::move(fc));
}
return absl::OkStatus();
}
absl::Status SelectFullyConnectedAdreno(const FullyConnectedAttributes& attr,
const CreationContext& creation_context,
const DeviceInfo& device_info,
const OperationDef& op_def,
int batch_size,
std::unique_ptr<GPUOperation>* ptr) {
if (op_def.IsBatchSupported()) {
ConvTexture conv;
RETURN_IF_ERROR(CreateConvTexture(creation_context, op_def, attr, &conv));
ConvTexture conv = CreateConvTexture(device_info, op_def, attr);
*ptr = absl::make_unique<ConvTexture>(std::move(conv));
} else {
FullyConnected fc;
RETURN_IF_ERROR(CreateFullyConnected(creation_context, op_def, attr, &fc));
FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);
*ptr = absl::make_unique<FullyConnected>(std::move(fc));
}
return absl::OkStatus();
}
absl::Status SelectFullyConnectedPowerVR(
const FullyConnectedAttributes& attr,
const CreationContext& creation_context, const OperationDef& op_def,
int batch_size, std::unique_ptr<GPUOperation>* ptr) {
absl::Status SelectFullyConnectedPowerVR(const FullyConnectedAttributes& attr,
const DeviceInfo& device_info,
const OperationDef& op_def,
int batch_size,
std::unique_ptr<GPUOperation>* ptr) {
if (op_def.IsBatchSupported()) {
ConvPowerVR conv;
RETURN_IF_ERROR(CreateConvPowerVR(creation_context, op_def, attr, &conv));
ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr);
*ptr = absl::make_unique<ConvPowerVR>(std::move(conv));
} else {
FullyConnected fc;
RETURN_IF_ERROR(CreateFullyConnected(creation_context, op_def, attr, &fc));
FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);
*ptr = absl::make_unique<FullyConnected>(std::move(fc));
}
return absl::OkStatus();
}
absl::Status SelectFullyConnectedMali(const FullyConnectedAttributes& attr,
const CreationContext& creation_context,
const DeviceInfo& device_info,
const OperationDef& op_def,
int batch_size,
std::unique_ptr<GPUOperation>* ptr) {
if (op_def.IsBatchSupported()) {
if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER) {
ConvBuffer1x1 conv;
RETURN_IF_ERROR(
CreateConvBuffer1x1(creation_context, op_def, attr, &conv));
ConvBuffer1x1 conv = CreateConvBuffer1x1(device_info, op_def, attr);
*ptr = absl::make_unique<ConvBuffer1x1>(std::move(conv));
} else {
ConvTexture conv;
RETURN_IF_ERROR(CreateConvTexture(creation_context, op_def, attr, &conv));
ConvTexture conv = CreateConvTexture(device_info, op_def, attr);
*ptr = absl::make_unique<ConvTexture>(std::move(conv));
}
} else {
FullyConnected fc;
RETURN_IF_ERROR(CreateFullyConnected(creation_context, op_def, attr, &fc));
FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);
*ptr = absl::make_unique<FullyConnected>(std::move(fc));
}
return absl::OkStatus();
}
absl::Status SelectFullyConnected(const FullyConnectedAttributes& attr,
const CreationContext& creation_context,
const DeviceInfo& device_info,
const OperationDef& op_def, int batch_size,
std::unique_ptr<GPUOperation>* ptr) {
const auto& device_info = creation_context.device->info_;
if (device_info.IsAdreno()) {
return SelectFullyConnectedAdreno(attr, creation_context, op_def,
batch_size, ptr);
return SelectFullyConnectedAdreno(attr, device_info, op_def, batch_size,
ptr);
} else if (device_info.IsPowerVR() || device_info.IsAMD() ||
device_info.IsNvidia() || device_info.IsIntel()) {
return SelectFullyConnectedPowerVR(attr, creation_context, op_def,
batch_size, ptr);
return SelectFullyConnectedPowerVR(attr, device_info, op_def, batch_size,
ptr);
} else if (device_info.IsMali()) {
return SelectFullyConnectedMali(attr, creation_context, op_def, batch_size,
ptr);
return SelectFullyConnectedMali(attr, device_info, op_def, batch_size, ptr);
} else {
return SelectFullyConnectedGeneric(attr, creation_context, op_def,
batch_size, ptr);
return SelectFullyConnectedGeneric(attr, device_info, op_def, batch_size,
ptr);
}
}
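A sketch of calling the fully connected selector directly; only DeviceInfo is needed now, and the batch decision happens inside. Wrapper name and include path are assumptions.

#include <memory>

#include "tensorflow/lite/delegates/gpu/cl/selectors/fully_connected_selector.h"  // assumed path

namespace tflite {
namespace gpu {
namespace cl {

// Hypothetical caller. With a batched op_def the selector picks a convolution
// kernel (ConvBuffer1x1 on Mali buffer storage, ConvPowerVR on
// PowerVR/AMD/NVidia/Intel, ConvTexture otherwise); without batch it builds
// the dedicated FullyConnected kernel.
absl::Status PickFullyConnected(const FullyConnectedAttributes& attr,
                                const DeviceInfo& device_info,
                                const OperationDef& op_def, int batch_size,
                                std::unique_ptr<GPUOperation>* gpu_op) {
  return SelectFullyConnected(attr, device_info, op_def, batch_size, gpu_op);
}

}  // namespace cl
}  // namespace gpu
}  // namespace tflite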

View File

@ -27,7 +27,7 @@ namespace gpu {
namespace cl {
absl::Status SelectFullyConnected(const FullyConnectedAttributes& attr,
const CreationContext& creation_context,
const DeviceInfo& device_info,
const OperationDef& op_def, int batch_size,
std::unique_ptr<GPUOperation>* ptr);

View File

@ -39,7 +39,7 @@ namespace gpu {
namespace cl {
namespace {
bool IsSuitableForWinograd4x4To6x6(const Convolution2DAttributes& attr,
const CLDevice& device,
const DeviceInfo& device_info,
const BHWC& dst_shape) {
const int tiles_x = DivideRoundUp(dst_shape.w, 4);
const int tiles_y = DivideRoundUp(dst_shape.h, 4);
@ -49,23 +49,22 @@ bool IsSuitableForWinograd4x4To6x6(const Convolution2DAttributes& attr,
attr.weights.shape.w == 3 && attr.weights.shape.h == 3 &&
attr.dilations == HW(1, 1) && attr.strides == HW(1, 1);
// Mali among other devices has smaller SIMD line size
const int min_depth = device.IsMali() ? 16 : 32;
const int min_hw = device.IsMali() ? 32 : 128;
const int min_depth = device_info.IsMali() ? 16 : 32;
const int min_hw = device_info.IsMali() ? 32 : 128;
const bool recommended_channels =
dst_depth % 4 == 0 && src_depth >= min_depth && dst_depth >= min_depth;
const bool recommended_hw = tiles_x * tiles_y >= min_hw;
return suitable_attributes && recommended_channels && recommended_hw;
}
absl::Status WinogradFromNode(const CreationContext& creation_context,
absl::Status WinogradFromNode(const DeviceInfo& device_info,
const std::vector<Value*>& inputs,
const std::vector<Value*>& outputs,
const OperationDef& op_def, ModelHints hints,
const BHWC& input_shape, const BHWC& output_shape,
const Convolution2DAttributes& attr,
GPUOperationsSubgraph* gpu_subgraph) {
if (!IsSuitableForWinograd4x4To6x6(attr, *creation_context.device,
output_shape)) {
if (!IsSuitableForWinograd4x4To6x6(attr, device_info, output_shape)) {
return absl::UnimplementedError("No implementation for this case.");
}
@ -75,16 +74,14 @@ absl::Status WinogradFromNode(const CreationContext& creation_context,
const BHWC shape_1{input_shape.b, 36, tiles_x * tiles_y, output_shape.c};
TensorDescriptor td_0;
td_0.storage_type = SelectBestStorageType(
creation_context.device->info_, shape_0,
op_def.src_tensors[0].storage_type, op_def.src_tensors[0].data_type,
op_def.src_tensors[0].layout);
device_info, shape_0, op_def.src_tensors[0].storage_type,
op_def.src_tensors[0].data_type, op_def.src_tensors[0].layout);
td_0.data_type = op_def.src_tensors[0].data_type;
td_0.layout = op_def.src_tensors[0].layout;
TensorDescriptor td_1;
td_1.storage_type = SelectBestStorageType(
creation_context.device->info_, shape_1,
op_def.src_tensors[0].storage_type, op_def.src_tensors[0].data_type,
op_def.src_tensors[0].layout);
device_info, shape_1, op_def.src_tensors[0].storage_type,
op_def.src_tensors[0].data_type, op_def.src_tensors[0].layout);
td_1.data_type = op_def.src_tensors[0].data_type;
td_1.layout = op_def.src_tensors[0].layout;
gpu_subgraph->new_tensors = {{shape_0, td_0}, {shape_1, td_1}};
@ -96,8 +93,8 @@ absl::Status WinogradFromNode(const CreationContext& creation_context,
winograd_up_def.src_tensors.push_back(op_def.src_tensors[0]);
winograd_up_def.dst_tensors.push_back(td_0);
auto& winograd_up = gpu_subgraph->operations[0];
RETURN_IF_ERROR(SelectWinograd4x4To36(
creation_context, attr.padding, winograd_up_def, &winograd_up.operation));
winograd_up.operation =
SelectWinograd4x4To36(device_info, attr.padding, winograd_up_def);
winograd_up.input_ids = {static_cast<int>(inputs[0]->id)};
winograd_up.output_ids = {-1};
@ -109,7 +106,7 @@ absl::Status WinogradFromNode(const CreationContext& creation_context,
conv.input_ids = {-1};
conv.output_ids = {-2};
RETURN_IF_ERROR(SelectConvolutionForWinograd(
attr, input_shape, creation_context, conv_def, hints, &conv.operation));
attr, input_shape, device_info, conv_def, hints, &conv.operation));
OperationDef winograd_down_def;
winograd_down_def.precision = op_def.precision;
@ -123,8 +120,8 @@ absl::Status WinogradFromNode(const CreationContext& creation_context,
bias_copy.shape = Linear(attr.weights.shape.o);
bias_copy.data.resize(attr.weights.shape.o);
}
RETURN_IF_ERROR(SelectWinograd36To4x4(creation_context, winograd_down_def,
bias_copy, &winograd_down.operation));
winograd_down.operation =
SelectWinograd36To4x4(device_info, winograd_down_def, bias_copy);
return absl::OkStatus();
}
@ -183,13 +180,15 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
auto input_shape = inputs[0]->tensor.shape;
auto output_shape = outputs[0]->tensor.shape;
if (inputs.size() == 1) {
if (WinogradFromNode(creation_context, inputs, outputs, op_def, hints,
input_shape, output_shape, attr, gpu_subgraph)
if (WinogradFromNode(creation_context.GetDeviceInfo(), inputs, outputs,
op_def, hints, input_shape, output_shape, attr,
gpu_subgraph)
.ok()) {
return absl::OkStatus();
} else {
gpu_op = InitSingleOpSubgraph(inputs, outputs, gpu_subgraph);
return SelectConvolution(attr, output_shape, creation_context, op_def,
return SelectConvolution(attr, output_shape,
creation_context.GetDeviceInfo(), op_def,
hints, gpu_op);
}
} else {
@ -207,8 +206,8 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
conv_def.src_tensors[1] = weights_desc;
ConvWeightsDescription conv_weights_desc;
RETURN_IF_ERROR(SelectConvolutionWithDynamicWeights(
attr, weights_shape, output_shape, creation_context, conv_def,
hints, &conv_op.operation, &conv_weights_desc));
attr, weights_shape, output_shape, creation_context.GetDeviceInfo(),
conv_def, hints, &conv_op.operation, &conv_weights_desc));
int aligned_output =
AlignByN(weights_shape.b, conv_weights_desc.output_group_size * 4);
@ -225,9 +224,8 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
converter_op.input_ids = {static_cast<int>(inputs[1]->id)};
converter_op.output_ids = {-1};
return SelectConverterToConvWeights(conv_weights_desc, creation_context,
converter_def, hints,
&converter_op.operation);
return SelectConverterToConvWeights(conv_weights_desc, converter_def,
hints, &converter_op.operation);
}
}
case OperationType::CONVOLUTION_TRANSPOSED: {
@ -244,8 +242,8 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
case OperationType::FULLY_CONNECTED: {
auto attr =
absl::any_cast<FullyConnectedAttributes>(node.operation.attributes);
return SelectFullyConnected(attr, creation_context, op_def,
inputs[0]->tensor.shape.b, gpu_op);
return SelectFullyConnected(attr, creation_context.GetDeviceInfo(),
op_def, inputs[0]->tensor.shape.b, gpu_op);
}
case OperationType::LSTM: {
SelectLSTM(op_def, creation_context.device->info_, gpu_op);

View File

@ -179,26 +179,18 @@ void SelectTranspose(const TransposeAttributes& attr,
*ptr = absl::make_unique<GPUOperation>(std::move(operation));
}
absl::Status SelectWinograd4x4To36(const CreationContext& creation_context,
const Padding2D& padding,
const OperationDef& op_def,
std::unique_ptr<GPUOperation>* ptr) {
Winograd4x4To36 operation;
RETURN_IF_ERROR(
CreateWinograd4x4To36(creation_context, op_def, padding, &operation));
*ptr = absl::make_unique<Winograd4x4To36>(std::move(operation));
return absl::OkStatus();
std::unique_ptr<GPUOperation> SelectWinograd4x4To36(
const DeviceInfo& device_info, const Padding2D& padding,
const OperationDef& op_def) {
return absl::make_unique<Winograd4x4To36>(
CreateWinograd4x4To36(device_info, op_def, padding));
}
absl::Status SelectWinograd36To4x4(
const CreationContext& creation_context, const OperationDef& op_def,
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases,
std::unique_ptr<GPUOperation>* ptr) {
Winograd36To4x4 operation;
RETURN_IF_ERROR(
CreateWinograd36To4x4(creation_context, op_def, biases, &operation));
*ptr = absl::make_unique<Winograd36To4x4>(std::move(operation));
return absl::OkStatus();
std::unique_ptr<GPUOperation> SelectWinograd36To4x4(
const DeviceInfo& device_info, const OperationDef& op_def,
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases) {
return absl::make_unique<Winograd36To4x4>(
CreateWinograd36To4x4(device_info, op_def, biases));
}
void SelectQuantizeAndDequantize(const QuantizeAndDequantizeAttributes& attr,

View File

@ -85,15 +85,13 @@ void SelectTranspose(const TransposeAttributes& attr,
const OperationDef& op_def,
std::unique_ptr<GPUOperation>* ptr);
absl::Status SelectWinograd4x4To36(const CreationContext& creation_context,
const Padding2D& padding,
const OperationDef& op_def,
std::unique_ptr<GPUOperation>* ptr);
std::unique_ptr<GPUOperation> SelectWinograd4x4To36(
const DeviceInfo& device_info, const Padding2D& padding,
const OperationDef& op_def);
absl::Status SelectWinograd36To4x4(
const CreationContext& creation_context, const OperationDef& op_def,
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases,
std::unique_ptr<GPUOperation>* ptr);
std::unique_ptr<GPUOperation> SelectWinograd36To4x4(
const DeviceInfo& device_info, const OperationDef& op_def,
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases);
void SelectQuantizeAndDequantize(const QuantizeAndDequantizeAttributes& attr,
const CreationContext& creation_context,