Removed useless Status and CreationContext from convolution kernels.
PiperOrigin-RevId: 327363985 Change-Id: I216229b4cfb4f11416fc2832c6bd00a1793f9a8a
This commit is contained in:
parent
4c222cfdf9
commit
b14150088d
@ -244,6 +244,26 @@ DeviceInfo DeviceInfoFromDeviceID(cl_device_id id) {
|
||||
info.max_work_group_size_x = max_work_group_sizes.x;
|
||||
info.max_work_group_size_y = max_work_group_sizes.y;
|
||||
info.max_work_group_size_z = max_work_group_sizes.z;
|
||||
|
||||
// Intel only: record the sub-group sizes the device reports through the
// cl_intel_required_subgroup_size extension. On any query failure the list
// is simply left empty.
if (info.IsIntel() &&
    info.SupportsExtension("cl_intel_required_subgroup_size")) {
  // With a null output buffer, clGetDeviceInfo reports the size of the
  // result in BYTES (not the number of elements) through its last argument.
  size_t sub_groups_ret_size = 0;
  cl_int status =
      clGetDeviceInfo(id, 0x4108 /*CL_DEVICE_SUB_GROUP_SIZES_INTEL*/, 0,
                      nullptr, &sub_groups_ret_size);
  // The queried value is an array of size_t, so convert bytes to elements.
  const size_t sub_groups_count =
      status == CL_SUCCESS ? sub_groups_ret_size / sizeof(size_t) : 0;
  if (sub_groups_count > 0) {
    std::vector<size_t> sub_group_sizes(sub_groups_count);
    status = clGetDeviceInfo(id, 0x4108 /*CL_DEVICE_SUB_GROUP_SIZES_INTEL*/,
                             sizeof(size_t) * sub_groups_count,
                             sub_group_sizes.data(), nullptr);
    if (status == CL_SUCCESS) {
      info.supported_subgroup_sizes.reserve(
          info.supported_subgroup_sizes.size() + sub_groups_count);
      for (const size_t reported_size : sub_group_sizes) {
        // supported_subgroup_sizes holds int, so narrow explicitly.
        info.supported_subgroup_sizes.push_back(
            static_cast<int>(reported_size));
      }
    }
  }
}
|
||||
return info;
|
||||
}
|
||||
|
||||
@ -305,37 +325,10 @@ std::string CLDevice::GetPlatformVersion() const {
|
||||
return GetPlatformInfo(platform_id_, CL_PLATFORM_VERSION);
|
||||
}
|
||||
|
||||
bool CLDevice::IsCL20OrHigher() const {
|
||||
return info_.cl_version != OpenCLVersion::CL_1_0 &&
|
||||
info_.cl_version != OpenCLVersion::CL_1_1 &&
|
||||
info_.cl_version != OpenCLVersion::CL_1_2;
|
||||
}
|
||||
// Forwards the OpenCL version check to the cached DeviceInfo.
bool CLDevice::IsCL20OrHigher() const {
  return info_.IsCL20OrHigher();
}
|
||||
|
||||
bool CLDevice::SupportsSubGroupWithSize(int sub_group_size) const {
|
||||
if (IsIntel()) {
|
||||
if (SupportsExtension("cl_intel_required_subgroup_size")) {
|
||||
size_t sub_groups_count;
|
||||
cl_int error =
|
||||
clGetDeviceInfo(id_, 0x4108 /*CL_DEVICE_SUB_GROUP_SIZES_INTEL*/, 0,
|
||||
nullptr, &sub_groups_count);
|
||||
if (error != CL_SUCCESS) {
|
||||
return false;
|
||||
}
|
||||
std::vector<size_t> sub_group_sizes(sub_groups_count);
|
||||
error = clGetDeviceInfo(id_, 0x4108 /*CL_DEVICE_SUB_GROUP_SIZES_INTEL*/,
|
||||
sizeof(size_t) * sub_groups_count,
|
||||
sub_group_sizes.data(), nullptr);
|
||||
if (error != CL_SUCCESS) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < sub_groups_count; ++i) {
|
||||
if (sub_group_sizes[i] == sub_group_size) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
return info_.SupportsSubGroupWithSize(sub_group_size);
|
||||
}
|
||||
|
||||
// Forwards the Adreno check to the cached DeviceInfo.
bool CLDevice::IsAdreno() const {
  return info_.IsAdreno();
}
|
||||
|
@ -262,6 +262,21 @@ bool DeviceInfo::SupportsExtension(const std::string& extension) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool DeviceInfo::IsCL20OrHigher() const {
|
||||
return cl_version != OpenCLVersion::CL_1_0 &&
|
||||
cl_version != OpenCLVersion::CL_1_1 &&
|
||||
cl_version != OpenCLVersion::CL_1_2;
|
||||
}
|
||||
|
||||
bool DeviceInfo::SupportsSubGroupWithSize(int sub_group_size) const {
|
||||
for (auto subgroup_size : supported_subgroup_sizes) {
|
||||
if (sub_group_size == subgroup_size) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// A device is treated as Adreno when its vendor is Qualcomm.
bool DeviceInfo::IsAdreno() const {
  return vendor == Vendor::kQualcomm;
}
|
||||
|
||||
bool DeviceInfo::IsAdreno3xx() const {
|
||||
|
@ -139,6 +139,8 @@ struct DeviceInfo {
|
||||
bool SupportsOneLayerTextureArray() const;
|
||||
|
||||
bool SupportsExtension(const std::string& extension) const;
|
||||
bool IsCL20OrHigher() const;
|
||||
bool SupportsSubGroupWithSize(int sub_group_size) const;
|
||||
|
||||
std::vector<std::string> extensions;
|
||||
bool supports_fp16;
|
||||
@ -157,6 +159,7 @@ struct DeviceInfo {
|
||||
int max_work_group_size_x;
|
||||
int max_work_group_size_y;
|
||||
int max_work_group_size_z;
|
||||
std::vector<int> supported_subgroup_sizes;
|
||||
|
||||
// rtn is ROUND_TO_NEAREST
|
||||
// with rtn precision is much better than with rtz (ROUND_TO_ZERO)
|
||||
|
@ -118,6 +118,7 @@ cc_library(
|
||||
"//tensorflow/lite/delegates/gpu/cl:precision",
|
||||
"//tensorflow/lite/delegates/gpu/cl:tensor",
|
||||
"//tensorflow/lite/delegates/gpu/cl:tensor_type",
|
||||
"//tensorflow/lite/delegates/gpu/cl:texture2d",
|
||||
"//tensorflow/lite/delegates/gpu/cl:util",
|
||||
"//tensorflow/lite/delegates/gpu/common:data_type",
|
||||
"//tensorflow/lite/delegates/gpu/common:operations",
|
||||
|
@ -167,7 +167,8 @@ std::string GenerateConv(CalculationsPrecision precision,
|
||||
} // namespace
|
||||
|
||||
Conv3D::Conv3D(const OperationDef& definition,
|
||||
const Convolution3DAttributes& attr, const CLDevice& device)
|
||||
const Convolution3DAttributes& attr,
|
||||
const DeviceInfo& device_info)
|
||||
: GPUOperation(definition),
|
||||
stride_(attr.strides.w, attr.strides.h, attr.strides.d),
|
||||
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h,
|
||||
@ -175,12 +176,12 @@ Conv3D::Conv3D(const OperationDef& definition,
|
||||
kernel_size_(attr.weights.shape.w, attr.weights.shape.h,
|
||||
attr.weights.shape.d),
|
||||
dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d),
|
||||
conv_params_(GuessBestParams(device, definition, attr)) {
|
||||
conv_params_(GuessBestParams(device_info, definition, attr)) {
|
||||
const bool stride_correction =
|
||||
definition_.IsBatchSupported() && stride_.x != 1;
|
||||
code_ = GenerateConv3D(definition_, stride_correction, conv_params_);
|
||||
if (definition_.precision == CalculationsPrecision::F16 &&
|
||||
device.IsPowerVR()) {
|
||||
device_info.IsPowerVR()) {
|
||||
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
|
||||
}
|
||||
}
|
||||
@ -725,7 +726,7 @@ std::string Conv3D::GenerateConv3D(const OperationDef& op_def,
|
||||
return c;
|
||||
}
|
||||
|
||||
Conv3D::ConvParams Conv3D::GuessBestParams(const CLDevice& device,
|
||||
Conv3D::ConvParams Conv3D::GuessBestParams(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
int src_slices, int dst_slices,
|
||||
bool x_kernel_is_1,
|
||||
@ -735,7 +736,7 @@ Conv3D::ConvParams Conv3D::GuessBestParams(const CLDevice& device,
|
||||
conv_params.x_kernel_is_1 = x_kernel_is_1;
|
||||
conv_params.y_kernel_is_1 = y_kernel_is_1;
|
||||
conv_params.z_kernel_is_1 = z_kernel_is_1;
|
||||
if (device.IsNvidia()) {
|
||||
if (device_info.IsNvidia()) {
|
||||
conv_params.block_size = int4(1, 1, 1, 4);
|
||||
work_group_size_ = int3(8, 4, 1);
|
||||
conv_params.work_group_launch_order = int3(2, 0, 1);
|
||||
@ -754,7 +755,7 @@ Conv3D::ConvParams Conv3D::GuessBestParams(const CLDevice& device,
|
||||
if (src_slices % 4 == 0 && conv_params.block_size.w <= 2) {
|
||||
conv_params.src_depth_loop_size = 4;
|
||||
}
|
||||
} else if (device.IsPowerVR()) {
|
||||
} else if (device_info.IsPowerVR()) {
|
||||
conv_params.block_size = int4(1, 1, 1, 4);
|
||||
work_group_size_ = int3(8, 4, 1);
|
||||
conv_params.work_group_launch_order = int3(2, 0, 1);
|
||||
@ -792,13 +793,13 @@ Conv3D::ConvParams Conv3D::GuessBestParams(const CLDevice& device,
|
||||
conv_params.block_size.x = 2;
|
||||
work_group_size_ = int3(4, 8, 1);
|
||||
}
|
||||
} else if (device.IsAdreno()) {
|
||||
} else if (device_info.IsAdreno()) {
|
||||
conv_params.block_size = int4(2, 2, 1, 2);
|
||||
work_group_size_ = int3(8, 4, 1);
|
||||
conv_params.work_group_launch_order = int3(0, 1, 2);
|
||||
conv_params.src_depth_loop_size = 1;
|
||||
conv_params.weights_upload_type = WeightsUploadType::TEXTURES_MEM;
|
||||
} else if (device.IsMali()) {
|
||||
} else if (device_info.IsMali()) {
|
||||
conv_params.block_size = int4(1, 1, 1, 4);
|
||||
work_group_size_ = int3(8, 4, 1);
|
||||
conv_params.work_group_launch_order = int3(0, 1, 2);
|
||||
@ -829,7 +830,7 @@ Conv3D::ConvParams Conv3D::GuessBestParams(const CLDevice& device,
|
||||
}
|
||||
|
||||
Conv3D::ConvParams Conv3D::GuessBestParams(
|
||||
const CLDevice& device, const OperationDef& definition,
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const Convolution3DAttributes& attr) {
|
||||
const int dst_slices = DivideRoundUp(attr.weights.shape.o, 4);
|
||||
const int src_slices = DivideRoundUp(attr.weights.shape.i, 4);
|
||||
@ -845,15 +846,16 @@ Conv3D::ConvParams Conv3D::GuessBestParams(
|
||||
attr.dilations.d == 1 &&
|
||||
attr.padding.prepended.d == 0 &&
|
||||
attr.padding.appended.d == 0;
|
||||
return GuessBestParams(device, definition, src_slices, dst_slices,
|
||||
return GuessBestParams(device_info, definition, src_slices, dst_slices,
|
||||
x_kernel_is_1, y_kernel_is_1, z_kernel_is_1);
|
||||
}
|
||||
|
||||
absl::Status CreateConv3D(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const Convolution3DAttributes& attr, Conv3D* result) {
|
||||
*result = Conv3D(definition, attr, *creation_context.device);
|
||||
return result->UploadData(attr.weights, attr.bias, creation_context.context);
|
||||
// Constructs a Conv3D from the device info, operation definition and
// convolution attributes, passes attr.weights and attr.bias to UploadData,
// and returns the operation by value.
Conv3D CreateConv3D(const DeviceInfo& device_info,
                    const OperationDef& definition,
                    const Convolution3DAttributes& attr) {
  Conv3D conv(definition, attr, device_info);
  conv.UploadData(attr.weights, attr.bias);
  return conv;
}
|
||||
|
||||
} // namespace cl
|
||||
|
@ -24,6 +24,7 @@ limitations under the License.
|
||||
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
|
||||
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
|
||||
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
|
||||
#include "tensorflow/lite/delegates/gpu/cl/texture2d.h"
|
||||
#include "tensorflow/lite/delegates/gpu/cl/util.h"
|
||||
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
|
||||
#include "tensorflow/lite/delegates/gpu/common/operations.h"
|
||||
@ -74,35 +75,32 @@ class Conv3D : public GPUOperation {
|
||||
};
|
||||
|
||||
Conv3D(const OperationDef& definition, const Convolution3DAttributes& attr,
|
||||
const CLDevice& device);
|
||||
const DeviceInfo& device_info);
|
||||
|
||||
template <DataType T>
|
||||
absl::Status UploadData(const tflite::gpu::Tensor<OHWDI, T>& weights,
|
||||
const tflite::gpu::Tensor<Linear, T>& biases,
|
||||
CLContext* context);
|
||||
void UploadData(const tflite::gpu::Tensor<OHWDI, T>& weights,
|
||||
const tflite::gpu::Tensor<Linear, T>& biases);
|
||||
template <DataType T>
|
||||
absl::Status UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights,
|
||||
CLContext* context);
|
||||
void UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights);
|
||||
|
||||
template <DataType S, typename T>
|
||||
void RearrangeWeightsData(const tflite::gpu::Tensor<OHWDI, S>& weights,
|
||||
absl::Span<T> dst);
|
||||
|
||||
friend absl::Status CreateConv3D(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const Convolution3DAttributes& attr,
|
||||
Conv3D* result);
|
||||
friend Conv3D CreateConv3D(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution3DAttributes& attr);
|
||||
|
||||
friend std::string GenerateConv3D(const OperationDef& op_def,
|
||||
bool stride_correction,
|
||||
const ConvParams& conv_params,
|
||||
Arguments* args);
|
||||
|
||||
ConvParams GuessBestParams(const CLDevice& device,
|
||||
ConvParams GuessBestParams(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution3DAttributes& attr);
|
||||
|
||||
ConvParams GuessBestParams(const CLDevice& device,
|
||||
ConvParams GuessBestParams(const DeviceInfo& device_info,
|
||||
const OperationDef& definition, int src_slices,
|
||||
int dst_slices, bool x_kernel_is_1,
|
||||
bool y_kernel_is_1, bool z_kernel_is_1);
|
||||
@ -118,10 +116,9 @@ class Conv3D : public GPUOperation {
|
||||
};
|
||||
|
||||
template <DataType T>
|
||||
absl::Status Conv3D::UploadData(const tflite::gpu::Tensor<OHWDI, T>& weights,
|
||||
const tflite::gpu::Tensor<Linear, T>& biases,
|
||||
CLContext* context) {
|
||||
RETURN_IF_ERROR(UploadWeights(weights, context));
|
||||
void Conv3D::UploadData(const tflite::gpu::Tensor<OHWDI, T>& weights,
|
||||
const tflite::gpu::Tensor<Linear, T>& biases) {
|
||||
UploadWeights(weights);
|
||||
TensorLinearDescriptor desc;
|
||||
desc.storage_type = conv_params_.AreWeightsBuffer()
|
||||
? LinearStorageType::BUFFER
|
||||
@ -130,12 +127,10 @@ absl::Status Conv3D::UploadData(const tflite::gpu::Tensor<OHWDI, T>& weights,
|
||||
desc.UploadLinearData(biases);
|
||||
args_.AddObject("biases",
|
||||
absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
template <DataType T>
|
||||
absl::Status Conv3D::UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights,
|
||||
CLContext* context) {
|
||||
void Conv3D::UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights) {
|
||||
const int block_size = conv_params_.block_size.w;
|
||||
const int dst_slices =
|
||||
AlignByN(DivideRoundUp(weights.shape.o, 4), block_size);
|
||||
@ -204,8 +199,6 @@ absl::Status Conv3D::UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights,
|
||||
args_.AddObject("weights3",
|
||||
absl::make_unique<Texture2DDescriptor>(std::move(desc3)));
|
||||
}
|
||||
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
template <DataType S, typename T>
|
||||
@ -265,9 +258,9 @@ void Conv3D::RearrangeWeightsData(const tflite::gpu::Tensor<OHWDI, S>& weights,
|
||||
}
|
||||
}
|
||||
|
||||
absl::Status CreateConv3D(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const Convolution3DAttributes& attr, Conv3D* result);
|
||||
Conv3D CreateConv3D(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution3DAttributes& attr);
|
||||
|
||||
} // namespace cl
|
||||
} // namespace gpu
|
||||
|
@ -81,19 +81,19 @@ std::string GetComputationPart(const int3& block_size, int element_size,
|
||||
return c;
|
||||
}
|
||||
|
||||
ConvBuffer1x1::ConvParams GetBestParams(const CLDevice& device,
|
||||
ConvBuffer1x1::ConvParams GetBestParams(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const BHWC& shape, int src_depth,
|
||||
int dst_depth) {
|
||||
ConvBuffer1x1::ConvParams conv_params;
|
||||
conv_params.element_size = 4;
|
||||
conv_params.block_size = int3(1, 1, 1);
|
||||
if (!device.IsMali()) {
|
||||
if (!device_info.IsMali()) {
|
||||
return conv_params;
|
||||
}
|
||||
bool can_use_flt8 = (shape.w * shape.b) % 2 == 0 &&
|
||||
definition.precision != CalculationsPrecision::F32;
|
||||
bool is_midgard = device.IsMali() && device.info_.mali_info.IsMidgard();
|
||||
bool is_midgard = device_info.IsMali() && device_info.mali_info.IsMidgard();
|
||||
if (is_midgard) {
|
||||
if (can_use_flt8) {
|
||||
conv_params.element_size = 8;
|
||||
@ -106,7 +106,7 @@ ConvBuffer1x1::ConvParams GetBestParams(const CLDevice& device,
|
||||
|
||||
int task_size = shape.w * shape.b * shape.h * dst_depth;
|
||||
int block_size = GetRecommendedBlockSizeForConv(
|
||||
device.info_, definition.precision, task_size);
|
||||
device_info, definition.precision, task_size);
|
||||
|
||||
if (!can_use_flt8 && block_size > 4) {
|
||||
block_size = 4;
|
||||
@ -134,14 +134,15 @@ ConvBuffer1x1::ConvParams GetBestParams(const CLDevice& device,
|
||||
return conv_params;
|
||||
}
|
||||
|
||||
ConvBuffer1x1::ConvParams GetBestParams(const CLDevice& device,
|
||||
ConvBuffer1x1::ConvParams GetBestParams(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
int src_depth, int dst_depth) {
|
||||
ConvBuffer1x1::ConvParams conv_params;
|
||||
conv_params.element_size = 4;
|
||||
conv_params.block_size = int3(1, 1, 1);
|
||||
if (device.IsMali() && definition.precision == CalculationsPrecision::F16 &&
|
||||
device.info_.compute_units_count <= 4) {
|
||||
if (device_info.IsMali() &&
|
||||
definition.precision == CalculationsPrecision::F16 &&
|
||||
device_info.compute_units_count <= 4) {
|
||||
conv_params.block_size.x *= 2;
|
||||
}
|
||||
return conv_params;
|
||||
@ -345,85 +346,80 @@ bool IsConvBuffer1x1Supported(const OperationDef& definition,
|
||||
attr.padding.appended.w == 0 && attr.padding.appended.h == 0;
|
||||
}
|
||||
|
||||
absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
ConvBuffer1x1* result, const BHWC* shape) {
|
||||
if (!IsConvBuffer1x1Supported(definition, attr)) {
|
||||
return absl::InvalidArgumentError("ConvBuffer1x1 doesn't supported");
|
||||
}
|
||||
ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
const BHWC* shape) {
|
||||
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
|
||||
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
|
||||
ConvBuffer1x1::ConvParams conv_params;
|
||||
if (shape) {
|
||||
conv_params = GetBestParams(*creation_context.device, definition, *shape,
|
||||
src_depth, dst_depth);
|
||||
conv_params =
|
||||
GetBestParams(device_info, definition, *shape, src_depth, dst_depth);
|
||||
} else {
|
||||
conv_params = GetBestParams(*creation_context.device, definition, src_depth,
|
||||
dst_depth);
|
||||
conv_params = GetBestParams(device_info, definition, src_depth, dst_depth);
|
||||
}
|
||||
*result = ConvBuffer1x1(definition, conv_params);
|
||||
return result->UploadData(attr.weights, attr.bias, creation_context.context);
|
||||
ConvBuffer1x1 result(definition, conv_params);
|
||||
result.UploadData(attr.weights, attr.bias);
|
||||
return result;
|
||||
}
|
||||
|
||||
absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr,
|
||||
ConvBuffer1x1* result, const BHWC* shape) {
|
||||
ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr,
|
||||
const BHWC* shape) {
|
||||
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
|
||||
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
|
||||
ConvBuffer1x1::ConvParams conv_params;
|
||||
if (shape) {
|
||||
conv_params = GetBestParams(*creation_context.device, definition, *shape,
|
||||
src_depth, dst_depth);
|
||||
conv_params =
|
||||
GetBestParams(device_info, definition, *shape, src_depth, dst_depth);
|
||||
} else {
|
||||
conv_params = GetBestParams(*creation_context.device, definition, src_depth,
|
||||
dst_depth);
|
||||
conv_params = GetBestParams(device_info, definition, src_depth, dst_depth);
|
||||
}
|
||||
conv_params.block_size.x *= conv_params.block_size.y;
|
||||
conv_params.block_size.y = 1;
|
||||
*result = ConvBuffer1x1(definition, conv_params);
|
||||
return result->UploadData(attr.weights, attr.bias, creation_context.context);
|
||||
ConvBuffer1x1 result(definition, conv_params);
|
||||
result.UploadData(attr.weights, attr.bias);
|
||||
return result;
|
||||
}
|
||||
|
||||
absl::Status CreateConvBuffer1x1Wino4x4To6x6(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, ConvBuffer1x1* result,
|
||||
const BHWC* shape) {
|
||||
ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, const BHWC* shape) {
|
||||
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
|
||||
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
|
||||
ConvBuffer1x1::ConvParams conv_params;
|
||||
if (shape) {
|
||||
conv_params = GetBestParams(*creation_context.device, definition, *shape,
|
||||
src_depth, dst_depth);
|
||||
conv_params =
|
||||
GetBestParams(device_info, definition, *shape, src_depth, dst_depth);
|
||||
} else {
|
||||
conv_params = GetBestParams(*creation_context.device, definition, src_depth,
|
||||
dst_depth);
|
||||
conv_params = GetBestParams(device_info, definition, src_depth, dst_depth);
|
||||
}
|
||||
conv_params.block_size.x *= conv_params.block_size.y;
|
||||
conv_params.block_size.y = 1;
|
||||
conv_params.different_weights_for_height = true;
|
||||
*result = ConvBuffer1x1(definition, conv_params);
|
||||
return result->UploadDataForWinograd4x4To6x6(
|
||||
attr.weights, *creation_context.device, creation_context.context);
|
||||
ConvBuffer1x1 result(definition, conv_params);
|
||||
result.UploadDataForWinograd4x4To6x6(attr.weights);
|
||||
return result;
|
||||
}
|
||||
|
||||
absl::Status CreateConvBuffer1x1DynamicWeights(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, const BHWC& weights_shape,
|
||||
ConvBuffer1x1* result, const BHWC* dst_shape) {
|
||||
const BHWC* dst_shape) {
|
||||
const int dst_depth = DivideRoundUp(weights_shape.b, 4);
|
||||
const int src_depth = DivideRoundUp(weights_shape.c, 4);
|
||||
ConvBuffer1x1::ConvParams conv_params;
|
||||
if (dst_shape) {
|
||||
conv_params = GetBestParams(*creation_context.device, definition,
|
||||
*dst_shape, src_depth, dst_depth);
|
||||
} else {
|
||||
conv_params = GetBestParams(*creation_context.device, definition, src_depth,
|
||||
conv_params = GetBestParams(device_info, definition, *dst_shape, src_depth,
|
||||
dst_depth);
|
||||
} else {
|
||||
conv_params = GetBestParams(device_info, definition, src_depth, dst_depth);
|
||||
}
|
||||
*result = ConvBuffer1x1(definition, conv_params);
|
||||
return result->UploadBiases(attr.bias, creation_context.context);
|
||||
ConvBuffer1x1 result(definition, conv_params);
|
||||
result.UploadBiases(attr.bias);
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace cl
|
||||
|
@ -72,39 +72,34 @@ class ConvBuffer1x1 : public GPUOperation {
|
||||
|
||||
private:
|
||||
ConvBuffer1x1(const OperationDef& definition, const ConvParams& conv_params);
|
||||
friend absl::Status CreateConvBuffer1x1(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, ConvBuffer1x1* result,
|
||||
const BHWC* shape);
|
||||
friend absl::Status CreateConvBuffer1x1(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr, ConvBuffer1x1* result,
|
||||
const BHWC* shape);
|
||||
friend absl::Status CreateConvBuffer1x1Wino4x4To6x6(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, ConvBuffer1x1* result,
|
||||
const BHWC* shape);
|
||||
friend absl::Status CreateConvBuffer1x1DynamicWeights(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
friend ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
const BHWC* shape);
|
||||
friend ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr,
|
||||
const BHWC* shape);
|
||||
friend ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, const BHWC* shape);
|
||||
friend ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, const BHWC& weights_shape,
|
||||
ConvBuffer1x1* result, const BHWC* dst_shape);
|
||||
const BHWC* dst_shape);
|
||||
|
||||
template <DataType T>
|
||||
absl::Status UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
|
||||
const tflite::gpu::Tensor<Linear, T>& biases,
|
||||
CLContext* context);
|
||||
void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
|
||||
const tflite::gpu::Tensor<Linear, T>& biases);
|
||||
template <DataType T>
|
||||
absl::Status UploadDataForWinograd4x4To6x6(
|
||||
const tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
|
||||
CLContext* context);
|
||||
void UploadDataForWinograd4x4To6x6(
|
||||
const tflite::gpu::Tensor<OHWI, T>& weights);
|
||||
|
||||
template <DataType T>
|
||||
absl::Status UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
|
||||
CLContext* context);
|
||||
void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
|
||||
|
||||
template <DataType T>
|
||||
absl::Status UploadBiases(const tflite::gpu::Tensor<Linear, T>& biases,
|
||||
CLContext* context);
|
||||
void UploadBiases(const tflite::gpu::Tensor<Linear, T>& biases);
|
||||
|
||||
std::string GenerateConvBuffer1x1(
|
||||
const OperationDef& op_def, const ConvBuffer1x1::ConvParams& conv_params,
|
||||
@ -114,32 +109,26 @@ class ConvBuffer1x1 : public GPUOperation {
|
||||
};
|
||||
|
||||
template <DataType T>
|
||||
absl::Status ConvBuffer1x1::UploadData(
|
||||
const tflite::gpu::Tensor<OHWI, T>& weights,
|
||||
const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
|
||||
RETURN_IF_ERROR(UploadWeights(weights, context));
|
||||
RETURN_IF_ERROR(UploadBiases(biases, context));
|
||||
return absl::OkStatus();
|
||||
void ConvBuffer1x1::UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
|
||||
const tflite::gpu::Tensor<Linear, T>& biases) {
|
||||
UploadWeights(weights);
|
||||
UploadBiases(biases);
|
||||
}
|
||||
|
||||
template <DataType T>
|
||||
absl::Status ConvBuffer1x1::UploadDataForWinograd4x4To6x6(
|
||||
const tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
|
||||
CLContext* context) {
|
||||
void ConvBuffer1x1::UploadDataForWinograd4x4To6x6(
|
||||
const tflite::gpu::Tensor<OHWI, T>& weights) {
|
||||
tflite::gpu::Tensor<OHWI, T> wino_weights;
|
||||
RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
|
||||
RETURN_IF_ERROR(UploadWeights(wino_weights, context));
|
||||
UploadWeights(wino_weights);
|
||||
tflite::gpu::Tensor<Linear, DataType::FLOAT32> bias;
|
||||
bias.shape = Linear(weights.shape.o);
|
||||
bias.data.resize(weights.shape.o, 0.0f);
|
||||
RETURN_IF_ERROR(UploadBiases(bias, context));
|
||||
|
||||
return absl::OkStatus();
|
||||
UploadBiases(bias);
|
||||
}
|
||||
|
||||
template <DataType T>
|
||||
absl::Status ConvBuffer1x1::UploadWeights(
|
||||
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
|
||||
void ConvBuffer1x1::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
|
||||
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
|
||||
const int src_depth = DivideRoundUp(weights.shape.i, 4);
|
||||
|
||||
@ -169,12 +158,10 @@ absl::Status ConvBuffer1x1::UploadWeights(
|
||||
|
||||
args_.AddObject("weights",
|
||||
absl::make_unique<BufferDescriptor>(std::move(desc)));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
template <DataType T>
|
||||
absl::Status ConvBuffer1x1::UploadBiases(
|
||||
const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
|
||||
void ConvBuffer1x1::UploadBiases(const tflite::gpu::Tensor<Linear, T>& biases) {
|
||||
TensorLinearDescriptor desc;
|
||||
desc.storage_type = LinearStorageType::BUFFER;
|
||||
desc.element_type = definition_.GetDataType();
|
||||
@ -182,7 +169,6 @@ absl::Status ConvBuffer1x1::UploadBiases(
|
||||
desc.UploadLinearData(biases, depth);
|
||||
args_.AddObject("biases",
|
||||
absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
bool IsConvBuffer1x1Supported(const OperationDef& definition,
|
||||
@ -192,27 +178,24 @@ bool IsConvBuffer1x1Supported(const OperationDef& definition,
|
||||
const BHWC& weights_shape,
|
||||
const Convolution2DAttributes& attr);
|
||||
|
||||
absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
ConvBuffer1x1* result,
|
||||
const BHWC* shape = nullptr);
|
||||
ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
const BHWC* shape = nullptr);
|
||||
|
||||
absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr,
|
||||
ConvBuffer1x1* result,
|
||||
const BHWC* shape = nullptr);
|
||||
ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr,
|
||||
const BHWC* shape = nullptr);
|
||||
|
||||
absl::Status CreateConvBuffer1x1DynamicWeights(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, const BHWC& weights_shape,
|
||||
ConvBuffer1x1* result, const BHWC* dst_shape = nullptr);
|
||||
const BHWC* dst_shape = nullptr);
|
||||
|
||||
absl::Status CreateConvBuffer1x1Wino4x4To6x6(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, ConvBuffer1x1* result,
|
||||
const BHWC* shape = nullptr);
|
||||
ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, const BHWC* shape = nullptr);
|
||||
|
||||
} // namespace cl
|
||||
} // namespace gpu
|
||||
|
@ -56,9 +56,8 @@ TEST_F(OpenCLOperationTest, ConvBuffer1x1SimpleWeights) {
|
||||
op_def.dst_tensors.push_back(
|
||||
{data_type, TensorStorageType::BUFFER, Layout::HWC});
|
||||
TensorFloat32 dst_tensor;
|
||||
ConvBuffer1x1 operation;
|
||||
ASSERT_OK(CreateConvBuffer1x1(creation_context_, op_def, attr, &operation,
|
||||
&src_tensor.shape));
|
||||
ConvBuffer1x1 operation = CreateConvBuffer1x1(
|
||||
creation_context_.GetDeviceInfo(), op_def, attr, &src_tensor.shape);
|
||||
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
|
||||
BHWC(1, 2, 1, 2), &dst_tensor));
|
||||
EXPECT_THAT(dst_tensor.data,
|
||||
@ -92,9 +91,8 @@ TEST_F(OpenCLOperationTest, ConvBuffer1x1) {
|
||||
op_def.dst_tensors.push_back(
|
||||
{data_type, TensorStorageType::BUFFER, Layout::HWC});
|
||||
TensorFloat32 dst_tensor;
|
||||
ConvBuffer1x1 operation;
|
||||
ASSERT_OK(CreateConvBuffer1x1(creation_context_, op_def, attr, &operation,
|
||||
&src_tensor.shape));
|
||||
ConvBuffer1x1 operation = CreateConvBuffer1x1(
|
||||
creation_context_.GetDeviceInfo(), op_def, attr, &src_tensor.shape);
|
||||
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
|
||||
BHWC(1, 2, 1, 4), &dst_tensor));
|
||||
EXPECT_THAT(dst_tensor.data,
|
||||
|
@ -255,10 +255,11 @@ int3 ConvConstants::GetGridSize() const {
|
||||
return int3(grid_x, grid_y, 1);
|
||||
}
|
||||
|
||||
bool IsConvConstantsSupported(const CLDevice& device,
|
||||
bool IsConvConstantsSupported(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr) {
|
||||
if (device.IsAMD() && definition.precision != CalculationsPrecision::F32 &&
|
||||
if (device_info.IsAMD() &&
|
||||
definition.precision != CalculationsPrecision::F32 &&
|
||||
definition.src_tensors[0].storage_type != TensorStorageType::BUFFER) {
|
||||
// BUG, some AMD gpus crash without it
|
||||
return false;
|
||||
@ -271,30 +272,25 @@ bool IsConvConstantsSupported(const CLDevice& device,
|
||||
? sizeof(float)
|
||||
: sizeof(half);
|
||||
const int filters_buffer_size = filters_count * float_size;
|
||||
const int kConstantMaxSize = GetOptimalMaxConstantSize(device.info_);
|
||||
const int kConstantMaxSize = GetOptimalMaxConstantSize(device_info);
|
||||
const int flt4_registers = DivideRoundUp(w_shape.o, 4);
|
||||
return filters_buffer_size <= kConstantMaxSize && flt4_registers <= 8;
|
||||
}
|
||||
|
||||
absl::Status CreateConvConstants(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
ConvConstants* result) {
|
||||
if (!IsConvConstantsSupported(*creation_context.device, definition, attr)) {
|
||||
return absl::InvalidArgumentError("ConvConstants doesn't supported");
|
||||
}
|
||||
*result = ConvConstants(definition, attr, creation_context.device->info_);
|
||||
RETURN_IF_ERROR(
|
||||
result->UploadWeights(attr.weights, creation_context.context));
|
||||
ConvConstants CreateConvConstants(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr) {
|
||||
ConvConstants result(definition, attr, device_info);
|
||||
result.UploadWeights(attr.weights);
|
||||
|
||||
TensorLinearDescriptor desc;
|
||||
desc.storage_type = LinearStorageType::BUFFER;
|
||||
desc.element_type = definition.GetDataType();
|
||||
desc.memory_type = MemoryType::CONSTANT;
|
||||
desc.UploadLinearData(attr.bias);
|
||||
result->args_.AddObject(
|
||||
result.args_.AddObject(
|
||||
"biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
|
||||
return absl::OkStatus();
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace cl
|
||||
|
@ -45,16 +45,15 @@ class ConvConstants : public GPUOperation {
|
||||
ConvConstants& operator=(const ConvConstants&) = delete;
|
||||
|
||||
private:
|
||||
friend absl::Status CreateConvConstants(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, ConvConstants* result);
|
||||
friend ConvConstants CreateConvConstants(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr);
|
||||
ConvConstants(const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
const DeviceInfo& device_info);
|
||||
|
||||
template <DataType T>
|
||||
absl::Status UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
|
||||
CLContext* context);
|
||||
void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
|
||||
|
||||
template <DataType S, typename T>
|
||||
void RearrangeWeightsData(const tflite::gpu::Tensor<OHWI, S>& weights,
|
||||
@ -75,8 +74,7 @@ class ConvConstants : public GPUOperation {
|
||||
};
|
||||
|
||||
template <DataType T>
|
||||
absl::Status ConvConstants::UploadWeights(
|
||||
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
|
||||
void ConvConstants::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
|
||||
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
|
||||
const int kernel_x = weights.shape.w;
|
||||
const int kernel_y = weights.shape.h;
|
||||
@ -102,8 +100,6 @@ absl::Status ConvConstants::UploadWeights(
|
||||
|
||||
args_.AddObject("weigths",
|
||||
absl::make_unique<BufferDescriptor>(std::move(desc)));
|
||||
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
template <DataType S, typename T>
|
||||
@ -149,14 +145,13 @@ void ConvConstants::RearrangeWeightsData(
|
||||
}
|
||||
}
|
||||
|
||||
bool IsConvConstantsSupported(const CLDevice& device,
|
||||
bool IsConvConstantsSupported(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr);
|
||||
|
||||
absl::Status CreateConvConstants(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
ConvConstants* result);
|
||||
ConvConstants CreateConvConstants(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr);
|
||||
|
||||
} // namespace cl
|
||||
} // namespace gpu
|
||||
|
@ -55,9 +55,8 @@ TEST_F(OpenCLOperationTest, ConvConstantsSimpleWeights) {
|
||||
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
|
||||
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
|
||||
TensorFloat32 dst_tensor;
|
||||
ConvConstants operation;
|
||||
ASSERT_OK(
|
||||
CreateConvConstants(creation_context_, op_def, attr, &operation));
|
||||
ConvConstants operation =
|
||||
CreateConvConstants(creation_context_.GetDeviceInfo(), op_def, attr);
|
||||
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
|
||||
BHWC(1, 2, 2, 1), &dst_tensor));
|
||||
EXPECT_THAT(dst_tensor.data,
|
||||
@ -91,9 +90,8 @@ TEST_F(OpenCLOperationTest, ConvConstants) {
|
||||
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
|
||||
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
|
||||
TensorFloat32 dst_tensor;
|
||||
ConvConstants operation;
|
||||
ASSERT_OK(
|
||||
CreateConvConstants(creation_context_, op_def, attr, &operation));
|
||||
ConvConstants operation =
|
||||
CreateConvConstants(creation_context_.GetDeviceInfo(), op_def, attr);
|
||||
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
|
||||
BHWC(1, 2, 2, 2), &dst_tensor));
|
||||
EXPECT_THAT(dst_tensor.data,
|
||||
|
@ -130,33 +130,33 @@ std::string GenerateBlockCoords(const int3& block_size,
|
||||
|
||||
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
const CLDevice& device, const BHWC* dst_shape)
|
||||
const DeviceInfo& device_info, const BHWC* dst_shape)
|
||||
: GPUOperation(definition),
|
||||
stride_padding_(attr.strides.w, attr.strides.h, -attr.padding.prepended.w,
|
||||
-attr.padding.prepended.h),
|
||||
kernel_dilation_(attr.weights.shape.w, attr.weights.shape.h,
|
||||
attr.dilations.w, attr.dilations.h),
|
||||
conv_params_(GuessBestParams(device, definition, attr, dst_shape)) {}
|
||||
conv_params_(GuessBestParams(device_info, definition, attr, dst_shape)) {}
|
||||
|
||||
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
const BHWC& weights_shape, const CLDevice& device,
|
||||
const BHWC* dst_shape)
|
||||
const BHWC& weights_shape,
|
||||
const DeviceInfo& device_info, const BHWC* dst_shape)
|
||||
: GPUOperation(definition),
|
||||
stride_padding_(attr.strides.w, attr.strides.h, -attr.padding.prepended.w,
|
||||
-attr.padding.prepended.h),
|
||||
kernel_dilation_(weights_shape.w, weights_shape.h, attr.dilations.w,
|
||||
attr.dilations.h),
|
||||
conv_params_(GuessBestParams(device, definition, attr, weights_shape,
|
||||
conv_params_(GuessBestParams(device_info, definition, attr, weights_shape,
|
||||
dst_shape)) {}
|
||||
|
||||
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr,
|
||||
const CLDevice& device, const BHWC* dst_shape)
|
||||
const DeviceInfo& device_info, const BHWC* dst_shape)
|
||||
: GPUOperation(definition),
|
||||
stride_padding_(1, 1, 0, 0),
|
||||
kernel_dilation_(1, 1, 1, 1),
|
||||
conv_params_(GuessBestParams(device, definition, attr, dst_shape)) {}
|
||||
conv_params_(GuessBestParams(device_info, definition, attr, dst_shape)) {}
|
||||
|
||||
ConvPowerVR::ConvPowerVR(const OperationDef& definition)
|
||||
: GPUOperation(definition),
|
||||
@ -687,8 +687,8 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
|
||||
}
|
||||
|
||||
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
const CLDevice& device, const OperationDef& definition, int src_depth,
|
||||
int dst_depth, bool x_kernel_is_1, bool y_kernel_is_1,
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
int src_depth, int dst_depth, bool x_kernel_is_1, bool y_kernel_is_1,
|
||||
bool different_weights_for_height, const BHWC* dst_shape) {
|
||||
ConvParams conv_params;
|
||||
conv_params.linear_hw = false;
|
||||
@ -697,7 +697,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
conv_params.x_kernel_is_1 = x_kernel_is_1;
|
||||
conv_params.y_kernel_is_1 = y_kernel_is_1;
|
||||
conv_params.different_weights_for_height = different_weights_for_height;
|
||||
if (device.IsNvidia()) {
|
||||
if (device_info.IsNvidia()) {
|
||||
if (different_weights_for_height) {
|
||||
work_group_size_ = int3(32, 1, 1);
|
||||
conv_params.work_group_launch_order = int3(2, 0, 1);
|
||||
@ -721,7 +721,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
if (dst_shape) {
|
||||
int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth;
|
||||
float task_size_per_cu =
|
||||
static_cast<float>(task_size) / device.info_.compute_units_count;
|
||||
static_cast<float>(task_size) / device_info.compute_units_count;
|
||||
int block_size = conv_params.block_size.x * conv_params.block_size.y *
|
||||
conv_params.block_size.z;
|
||||
float threads_per_cu = task_size_per_cu / block_size;
|
||||
@ -742,7 +742,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
if (src_depth % 4 == 0 && conv_params.block_size.z <= 2) {
|
||||
conv_params.src_depth_loop_size = 4;
|
||||
}
|
||||
} else if (device.IsPowerVR()) {
|
||||
} else if (device_info.IsPowerVR()) {
|
||||
if (different_weights_for_height) {
|
||||
work_group_size_ = int3(32, 1, 1);
|
||||
conv_params.work_group_launch_order = int3(2, 0, 1);
|
||||
@ -790,7 +790,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
}
|
||||
conv_params.block_size.x = 2;
|
||||
}
|
||||
} else if (device.IsAMD()) {
|
||||
} else if (device_info.IsAMD()) {
|
||||
if (different_weights_for_height) {
|
||||
work_group_size_ = int3(32, 1, 1);
|
||||
conv_params.work_group_launch_order = int3(2, 0, 1);
|
||||
@ -819,12 +819,12 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
if (src_depth % 2 == 0 && src_depth >= 16) {
|
||||
conv_params.src_depth_loop_size = 2;
|
||||
}
|
||||
} else if (device.IsMali()) {
|
||||
} else if (device_info.IsMali()) {
|
||||
int block_size = 2;
|
||||
if (dst_shape) {
|
||||
int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth;
|
||||
block_size = GetRecommendedBlockSizeForConv(
|
||||
device.info_, definition.precision, task_size);
|
||||
device_info, definition.precision, task_size);
|
||||
}
|
||||
if (!x_kernel_is_1 || !y_kernel_is_1) {
|
||||
block_size = std::min(block_size, 4);
|
||||
@ -847,7 +847,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
conv_params.block_size = int3(1, 1, 1);
|
||||
}
|
||||
conv_params.src_depth_loop_size = 1;
|
||||
MaliInfo mali_info = device.info_.mali_info;
|
||||
MaliInfo mali_info = device_info.mali_info;
|
||||
if (src_depth % 2 == 0 && block_size <= 2 && !mali_info.IsMidgard()) {
|
||||
conv_params.src_depth_loop_size = 2;
|
||||
}
|
||||
@ -859,14 +859,14 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
conv_params.work_group_launch_order = int3(0, 1, 2);
|
||||
conv_params.fixed_work_group_size = false;
|
||||
conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
|
||||
} else if (device.IsAdreno()) {
|
||||
} else if (device_info.IsAdreno()) {
|
||||
conv_params.block_size = int3(2, 2, 1);
|
||||
work_group_size_ = int3(8, 2, 1);
|
||||
conv_params.work_group_launch_order = int3(0, 1, 2);
|
||||
conv_params.fixed_work_group_size = false;
|
||||
conv_params.src_depth_loop_size = 1;
|
||||
conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
|
||||
} else if (device.IsIntel()) {
|
||||
} else if (device_info.IsIntel()) {
|
||||
if (different_weights_for_height) {
|
||||
work_group_size_ = int3(16, 1, 1);
|
||||
conv_params.work_group_launch_order = int3(0, 1, 2);
|
||||
@ -880,9 +880,10 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
conv_params.block_size = int3(1, 1, 4);
|
||||
conv_params.src_depth_loop_size = 1;
|
||||
if (definition.precision != CalculationsPrecision::F32_F16 &&
|
||||
device.SupportsExtension("cl_khr_subgroups") &&
|
||||
device.SupportsExtension("cl_intel_required_subgroup_size") &&
|
||||
device.IsCL20OrHigher() && device.SupportsSubGroupWithSize(16)) {
|
||||
device_info.SupportsExtension("cl_khr_subgroups") &&
|
||||
device_info.SupportsExtension("cl_intel_required_subgroup_size") &&
|
||||
device_info.IsCL20OrHigher() &&
|
||||
device_info.SupportsSubGroupWithSize(16)) {
|
||||
conv_params.weights_upload_type =
|
||||
WeightsUploadType::PRIVATE_MEM_SIMD16_BROADCAST;
|
||||
} else {
|
||||
@ -927,7 +928,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
}
|
||||
|
||||
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
const CLDevice& device, const OperationDef& definition,
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, const BHWC* dst_shape) {
|
||||
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
|
||||
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
|
||||
@ -939,12 +940,12 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
attr.dilations.h == 1 &&
|
||||
attr.padding.prepended.h == 0 &&
|
||||
attr.padding.appended.h == 0;
|
||||
return GuessBestParams(device, definition, src_depth, dst_depth,
|
||||
return GuessBestParams(device_info, definition, src_depth, dst_depth,
|
||||
x_kernel_is_1, y_kernel_is_1, false, dst_shape);
|
||||
}
|
||||
|
||||
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
const CLDevice& device, const OperationDef& definition,
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, const BHWC& weights_shape,
|
||||
const BHWC* dst_shape) {
|
||||
const int dst_depth = DivideRoundUp(weights_shape.b, 4);
|
||||
@ -955,17 +956,18 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
const bool y_kernel_is_1 =
|
||||
weights_shape.h == 1 && attr.strides.h == 1 && attr.dilations.h == 1 &&
|
||||
attr.padding.prepended.h == 0 && attr.padding.appended.h == 0;
|
||||
return GuessBestParams(device, definition, src_depth, dst_depth,
|
||||
return GuessBestParams(device_info, definition, src_depth, dst_depth,
|
||||
x_kernel_is_1, y_kernel_is_1, false, dst_shape);
|
||||
}
|
||||
|
||||
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
const CLDevice& device, const OperationDef& definition,
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr, const BHWC* dst_shape) {
|
||||
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
|
||||
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
|
||||
ConvPowerVR::ConvParams params = GuessBestParams(
|
||||
device, definition, src_depth, dst_depth, true, true, false, dst_shape);
|
||||
ConvPowerVR::ConvParams params =
|
||||
GuessBestParams(device_info, definition, src_depth, dst_depth, true, true,
|
||||
false, dst_shape);
|
||||
work_group_size_.x *= work_group_size_.y;
|
||||
work_group_size_.y = 1;
|
||||
params.block_size.x *= params.block_size.y;
|
||||
@ -974,55 +976,59 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
}
|
||||
|
||||
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParamsWinograd(
|
||||
const CLDevice& device, const OperationDef& definition,
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, const BHWC* dst_shape) {
|
||||
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
|
||||
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
|
||||
ConvPowerVR::ConvParams params = GuessBestParams(
|
||||
device, definition, src_depth, dst_depth, true, true, true, dst_shape);
|
||||
ConvPowerVR::ConvParams params =
|
||||
GuessBestParams(device_info, definition, src_depth, dst_depth, true, true,
|
||||
true, dst_shape);
|
||||
params.block_size.x *= params.block_size.y;
|
||||
params.block_size.y = 1;
|
||||
return params;
|
||||
}
|
||||
|
||||
absl::Status CreateConvPowerVR(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
ConvPowerVR* result, const BHWC* dst_shape) {
|
||||
*result = ConvPowerVR(definition, attr, *creation_context.device, dst_shape);
|
||||
result->GenerateCode(creation_context.device->info_);
|
||||
return result->UploadData(attr.weights, attr.bias, creation_context.context);
|
||||
ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
const BHWC* dst_shape) {
|
||||
ConvPowerVR result(definition, attr, device_info, dst_shape);
|
||||
result.GenerateCode(device_info);
|
||||
result.UploadData(attr.weights, attr.bias);
|
||||
return result;
|
||||
}
|
||||
|
||||
absl::Status CreateConvPowerVR(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr,
|
||||
ConvPowerVR* result, const BHWC* dst_shape) {
|
||||
*result = ConvPowerVR(definition, attr, *creation_context.device, dst_shape);
|
||||
result->GenerateCode(creation_context.device->info_);
|
||||
return result->UploadData(attr.weights, attr.bias, creation_context.context);
|
||||
ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr,
|
||||
const BHWC* dst_shape) {
|
||||
ConvPowerVR result(definition, attr, device_info, dst_shape);
|
||||
result.GenerateCode(device_info);
|
||||
result.UploadData(attr.weights, attr.bias);
|
||||
return result;
|
||||
}
|
||||
|
||||
absl::Status CreateConvPowerVRDynamicWeights(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, const BHWC& weights_shape,
|
||||
ConvPowerVR* result, const BHWC* dst_shape) {
|
||||
*result = ConvPowerVR(definition, attr, weights_shape,
|
||||
*creation_context.device, dst_shape);
|
||||
result->GenerateCode(creation_context.device->info_);
|
||||
return result->UploadBias(attr.bias, creation_context.context);
|
||||
ConvPowerVR CreateConvPowerVRDynamicWeights(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
const BHWC& weights_shape,
|
||||
const BHWC* dst_shape) {
|
||||
ConvPowerVR result(definition, attr, weights_shape, device_info, dst_shape);
|
||||
result.GenerateCode(device_info);
|
||||
result.UploadBias(attr.bias);
|
||||
return result;
|
||||
}
|
||||
|
||||
absl::Status CreateConvPowerVRWino4x4To6x6(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, ConvPowerVR* result,
|
||||
const BHWC* dst_shape) {
|
||||
*result = ConvPowerVR(definition);
|
||||
result->conv_params_ = result->GuessBestParamsWinograd(
|
||||
*creation_context.device, definition, attr, dst_shape);
|
||||
result->GenerateCode(creation_context.device->info_);
|
||||
return result->UploadDataForWinograd4x4To6x6(
|
||||
attr.weights, *creation_context.device, creation_context.context);
|
||||
ConvPowerVR CreateConvPowerVRWino4x4To6x6(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
const BHWC* dst_shape) {
|
||||
ConvPowerVR result(definition);
|
||||
result.conv_params_ =
|
||||
result.GuessBestParamsWinograd(device_info, definition, attr, dst_shape);
|
||||
result.GenerateCode(device_info);
|
||||
result.UploadDataForWinograd4x4To6x6(attr.weights);
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace cl
|
||||
|
@ -128,75 +128,68 @@ class ConvPowerVR : public GPUOperation {
|
||||
};
|
||||
|
||||
ConvPowerVR(const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, const CLDevice& device,
|
||||
const BHWC* dst_shape = nullptr);
|
||||
const Convolution2DAttributes& attr,
|
||||
const DeviceInfo& device_info, const BHWC* dst_shape = nullptr);
|
||||
ConvPowerVR(const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, const BHWC& weights_shape,
|
||||
const CLDevice& device, const BHWC* dst_shape = nullptr);
|
||||
const DeviceInfo& device_info, const BHWC* dst_shape = nullptr);
|
||||
ConvPowerVR(const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr, const CLDevice& device,
|
||||
const BHWC* dst_shape = nullptr);
|
||||
const FullyConnectedAttributes& attr,
|
||||
const DeviceInfo& device_info, const BHWC* dst_shape = nullptr);
|
||||
explicit ConvPowerVR(const OperationDef& definition);
|
||||
|
||||
void GenerateCode(const DeviceInfo& device_info);
|
||||
|
||||
template <DataType T>
|
||||
absl::Status UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
|
||||
const tflite::gpu::Tensor<Linear, T>& biases,
|
||||
CLContext* context);
|
||||
void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
|
||||
const tflite::gpu::Tensor<Linear, T>& biases);
|
||||
template <DataType T>
|
||||
absl::Status UploadDataForWinograd4x4To6x6(
|
||||
const tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
|
||||
CLContext* context);
|
||||
void UploadDataForWinograd4x4To6x6(
|
||||
const tflite::gpu::Tensor<OHWI, T>& weights);
|
||||
|
||||
template <DataType T>
|
||||
absl::Status UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
|
||||
CLContext* context);
|
||||
void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
|
||||
|
||||
template <DataType T>
|
||||
absl::Status UploadBias(const tflite::gpu::Tensor<Linear, T>& bias,
|
||||
CLContext* context);
|
||||
void UploadBias(const tflite::gpu::Tensor<Linear, T>& bias);
|
||||
|
||||
friend absl::Status CreateConvPowerVR(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
ConvPowerVR* result,
|
||||
const BHWC* dst_shape);
|
||||
friend ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
const BHWC* dst_shape);
|
||||
|
||||
friend absl::Status CreateConvPowerVR(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr,
|
||||
ConvPowerVR* result,
|
||||
const BHWC* dst_shape);
|
||||
friend ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr,
|
||||
const BHWC* dst_shape);
|
||||
|
||||
friend absl::Status CreateConvPowerVRDynamicWeights(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
friend ConvPowerVR CreateConvPowerVRDynamicWeights(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, const BHWC& weights_shape,
|
||||
ConvPowerVR* result, const BHWC* dst_shape);
|
||||
|
||||
friend absl::Status CreateConvPowerVRWino4x4To6x6(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, ConvPowerVR* result,
|
||||
const BHWC* dst_shape);
|
||||
|
||||
ConvParams GuessBestParams(const CLDevice& device,
|
||||
friend ConvPowerVR CreateConvPowerVRWino4x4To6x6(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, const BHWC* dst_shape);
|
||||
|
||||
ConvParams GuessBestParams(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
const BHWC* dst_shape = nullptr);
|
||||
ConvParams GuessBestParams(const CLDevice& device,
|
||||
ConvParams GuessBestParams(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
const BHWC& weights_shape,
|
||||
const BHWC* dst_shape = nullptr);
|
||||
ConvParams GuessBestParams(const CLDevice& device,
|
||||
ConvParams GuessBestParams(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr,
|
||||
const BHWC* dst_shape = nullptr);
|
||||
ConvParams GuessBestParamsWinograd(const CLDevice& device,
|
||||
ConvParams GuessBestParamsWinograd(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
const BHWC* dst_shape = nullptr);
|
||||
ConvParams GuessBestParams(const CLDevice& device,
|
||||
ConvParams GuessBestParams(const DeviceInfo& device_info,
|
||||
const OperationDef& definition, int src_depth,
|
||||
int dst_depth, bool x_kernel_is_1,
|
||||
bool y_kernel_is_1,
|
||||
@ -213,31 +206,26 @@ class ConvPowerVR : public GPUOperation {
|
||||
};
|
||||
|
||||
template <DataType T>
|
||||
absl::Status ConvPowerVR::UploadData(
|
||||
const tflite::gpu::Tensor<OHWI, T>& weights,
|
||||
const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
|
||||
RETURN_IF_ERROR(UploadWeights(weights, context));
|
||||
RETURN_IF_ERROR(UploadBias(biases, context));
|
||||
return absl::OkStatus();
|
||||
void ConvPowerVR::UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
|
||||
const tflite::gpu::Tensor<Linear, T>& biases) {
|
||||
UploadWeights(weights);
|
||||
UploadBias(biases);
|
||||
}
|
||||
|
||||
template <DataType T>
|
||||
absl::Status ConvPowerVR::UploadDataForWinograd4x4To6x6(
|
||||
const tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
|
||||
CLContext* context) {
|
||||
void ConvPowerVR::UploadDataForWinograd4x4To6x6(
|
||||
const tflite::gpu::Tensor<OHWI, T>& weights) {
|
||||
tflite::gpu::Tensor<OHWI, T> wino_weights;
|
||||
RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
|
||||
RETURN_IF_ERROR(UploadWeights(wino_weights, context));
|
||||
UploadWeights(wino_weights);
|
||||
tflite::gpu::Tensor<Linear, DataType::FLOAT32> biases;
|
||||
biases.shape = Linear(weights.shape.o);
|
||||
biases.data.resize(weights.shape.o, 0.0f);
|
||||
RETURN_IF_ERROR(UploadBias(biases, context));
|
||||
return absl::OkStatus();
|
||||
UploadBias(biases);
|
||||
}
|
||||
|
||||
template <DataType T>
|
||||
absl::Status ConvPowerVR::UploadBias(const tflite::gpu::Tensor<Linear, T>& bias,
|
||||
CLContext* context) {
|
||||
void ConvPowerVR::UploadBias(const tflite::gpu::Tensor<Linear, T>& bias) {
|
||||
BufferDescriptor desc;
|
||||
desc.element_type = conv_params_.weights_data_type;
|
||||
desc.element_size = 4;
|
||||
@ -264,12 +252,10 @@ absl::Status ConvPowerVR::UploadBias(const tflite::gpu::Tensor<Linear, T>& bias,
|
||||
}
|
||||
args_.AddObject("biases",
|
||||
absl::make_unique<BufferDescriptor>(std::move(desc)));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
template <DataType T>
|
||||
absl::Status ConvPowerVR::UploadWeights(
|
||||
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
|
||||
void ConvPowerVR::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
|
||||
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
|
||||
const int src_depth = DivideRoundUp(weights.shape.i, 4);
|
||||
|
||||
@ -301,30 +287,28 @@ absl::Status ConvPowerVR::UploadWeights(
|
||||
}
|
||||
args_.AddObject("weights",
|
||||
absl::make_unique<BufferDescriptor>(std::move(desc)));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
absl::Status CreateConvPowerVR(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
ConvPowerVR* result,
|
||||
const BHWC* dst_shape = nullptr);
|
||||
ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
const BHWC* dst_shape = nullptr);
|
||||
|
||||
absl::Status CreateConvPowerVR(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr,
|
||||
ConvPowerVR* result,
|
||||
const BHWC* dst_shape = nullptr);
|
||||
ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr,
|
||||
const BHWC* dst_shape = nullptr);
|
||||
|
||||
absl::Status CreateConvPowerVRDynamicWeights(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, const BHWC& weights_shape,
|
||||
ConvPowerVR* result, const BHWC* dst_shape = nullptr);
|
||||
ConvPowerVR CreateConvPowerVRDynamicWeights(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
const BHWC& weights_shape,
|
||||
const BHWC* dst_shape = nullptr);
|
||||
|
||||
absl::Status CreateConvPowerVRWino4x4To6x6(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, ConvPowerVR* result,
|
||||
const BHWC* dst_shape = nullptr);
|
||||
ConvPowerVR CreateConvPowerVRWino4x4To6x6(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
const BHWC* dst_shape = nullptr);
|
||||
|
||||
} // namespace cl
|
||||
} // namespace gpu
|
||||
|
@ -57,8 +57,8 @@ TEST_F(OpenCLOperationTest, ConvPowerVR1x1SimpleWeights) {
|
||||
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
|
||||
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
|
||||
TensorFloat32 dst_tensor;
|
||||
ConvPowerVR operation;
|
||||
ASSERT_OK(CreateConvPowerVR(creation_context_, op_def, attr, &operation));
|
||||
ConvPowerVR operation =
|
||||
CreateConvPowerVR(creation_context_.GetDeviceInfo(), op_def, attr);
|
||||
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
|
||||
BHWC(1, 2, 2, 2), &dst_tensor));
|
||||
EXPECT_THAT(dst_tensor.data,
|
||||
@ -92,8 +92,8 @@ TEST_F(OpenCLOperationTest, ConvPowerVR1x1) {
|
||||
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
|
||||
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
|
||||
TensorFloat32 dst_tensor;
|
||||
ConvPowerVR operation;
|
||||
ASSERT_OK(CreateConvPowerVR(creation_context_, op_def, attr, &operation));
|
||||
ConvPowerVR operation =
|
||||
CreateConvPowerVR(creation_context_.GetDeviceInfo(), op_def, attr);
|
||||
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
|
||||
BHWC(1, 2, 2, 2), &dst_tensor));
|
||||
EXPECT_THAT(dst_tensor.data,
|
||||
@ -127,8 +127,8 @@ TEST_F(OpenCLOperationTest, ConvPowerVRSimpleWeights) {
|
||||
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
|
||||
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
|
||||
TensorFloat32 dst_tensor;
|
||||
ConvPowerVR operation;
|
||||
ASSERT_OK(CreateConvPowerVR(creation_context_, op_def, attr, &operation));
|
||||
ConvPowerVR operation =
|
||||
CreateConvPowerVR(creation_context_.GetDeviceInfo(), op_def, attr);
|
||||
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
|
||||
BHWC(1, 2, 2, 1), &dst_tensor));
|
||||
EXPECT_THAT(dst_tensor.data,
|
||||
@ -162,8 +162,8 @@ TEST_F(OpenCLOperationTest, ConvPowerVR) {
|
||||
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
|
||||
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
|
||||
TensorFloat32 dst_tensor;
|
||||
ConvPowerVR operation;
|
||||
ASSERT_OK(CreateConvPowerVR(creation_context_, op_def, attr, &operation));
|
||||
ConvPowerVR operation =
|
||||
CreateConvPowerVR(creation_context_.GetDeviceInfo(), op_def, attr);
|
||||
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
|
||||
BHWC(1, 2, 2, 2), &dst_tensor));
|
||||
EXPECT_THAT(dst_tensor.data,
|
||||
|
@ -427,33 +427,33 @@ void ConvTexture::GetPossibleKernelWorkGroups(
|
||||
work_groups);
|
||||
}
|
||||
|
||||
absl::Status CreateConvTexture(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
ConvTexture* result) {
|
||||
*result = ConvTexture(definition, attr);
|
||||
result->GenerateCode(creation_context.device->info_);
|
||||
return result->UploadData(attr.weights, attr.bias, creation_context.context);
|
||||
// Factory for a ConvTexture built from 2D-convolution attributes.
// Constructs the operation from `definition` and `attr`, calls
// GenerateCode(device_info), hands attr.weights / attr.bias to UploadData, and
// returns the operation by value. No status is reported to the caller.
ConvTexture CreateConvTexture(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr) {
|
||||
ConvTexture result(definition, attr);
|
||||
result.GenerateCode(device_info);
|
||||
result.UploadData(attr.weights, attr.bias);
|
||||
return result;
|
||||
}
|
||||
|
||||
absl::Status CreateConvTexture(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr,
|
||||
ConvTexture* result) {
|
||||
*result = ConvTexture(definition);
|
||||
result->GenerateCode(creation_context.device->info_);
|
||||
return result->UploadData(attr.weights, attr.bias, creation_context.context);
|
||||
// Factory for a ConvTexture built from fully-connected attributes.
// Uses the ConvTexture(definition) constructor (attr is not passed to it),
// calls GenerateCode(device_info), hands attr.weights / attr.bias to
// UploadData, and returns the operation by value.
ConvTexture CreateConvTexture(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr) {
|
||||
ConvTexture result(definition);
|
||||
result.GenerateCode(device_info);
|
||||
result.UploadData(attr.weights, attr.bias);
|
||||
return result;
|
||||
}
|
||||
|
||||
absl::Status CreateConvTextureWino4x4To6x6(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, ConvTexture* result) {
|
||||
*result = ConvTexture(definition);
|
||||
result->different_weights_for_height_ = true;
|
||||
result->block_size_ = {4, 1, 2};
|
||||
result->GenerateCode(creation_context.device->info_);
|
||||
return result->UploadDataForWinograd4x4To6x6(
|
||||
attr.weights, *creation_context.device, creation_context.context);
|
||||
// Factory for the Winograd 4x4->6x6 variant of ConvTexture.
// Sets different_weights_for_height_ = true and block_size_ = {4, 1, 2} before
// calling GenerateCode(device_info), then passes attr.weights to
// UploadDataForWinograd4x4To6x6 and returns the operation by value.
// NOTE(review): the two fields are assigned before GenerateCode, presumably
// because code generation reads them -- confirm before reordering.
// Only attr.weights is read here; attr.bias is not used by this function.
ConvTexture CreateConvTextureWino4x4To6x6(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr) {
|
||||
ConvTexture result(definition);
|
||||
result.different_weights_for_height_ = true;
|
||||
result.block_size_ = {4, 1, 2};
|
||||
result.GenerateCode(device_info);
|
||||
result.UploadDataForWinograd4x4To6x6(attr.weights);
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace cl
|
||||
|
@ -56,35 +56,30 @@ class ConvTexture : public GPUOperation {
|
||||
ConvTexture& operator=(const ConvTexture&) = delete;
|
||||
|
||||
private:
|
||||
friend absl::Status CreateConvTexture(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
ConvTexture* result);
|
||||
friend absl::Status CreateConvTexture(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr,
|
||||
ConvTexture* result);
|
||||
friend ConvTexture CreateConvTexture(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr);
|
||||
friend ConvTexture CreateConvTexture(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr);
|
||||
|
||||
friend absl::Status CreateConvTextureWino4x4To6x6(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, ConvTexture* result);
|
||||
friend ConvTexture CreateConvTextureWino4x4To6x6(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr);
|
||||
|
||||
ConvTexture(const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr);
|
||||
explicit ConvTexture(const OperationDef& definition);
|
||||
template <DataType T>
|
||||
absl::Status UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
|
||||
const tflite::gpu::Tensor<Linear, T>& biases,
|
||||
CLContext* context);
|
||||
void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
|
||||
const tflite::gpu::Tensor<Linear, T>& biases);
|
||||
|
||||
template <DataType T>
|
||||
absl::Status UploadDataForWinograd4x4To6x6(
|
||||
const tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
|
||||
CLContext* context);
|
||||
void UploadDataForWinograd4x4To6x6(
|
||||
const tflite::gpu::Tensor<OHWI, T>& weights);
|
||||
|
||||
template <DataType T>
|
||||
absl::Status UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
|
||||
CLContext* context);
|
||||
void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
|
||||
|
||||
template <DataType S, typename T>
|
||||
void RearrangeWeightsData(const tflite::gpu::Tensor<OHWI, S>& weights,
|
||||
@ -113,10 +108,9 @@ class ConvTexture : public GPUOperation {
|
||||
};
|
||||
|
||||
template <DataType T>
|
||||
absl::Status ConvTexture::UploadData(
|
||||
const tflite::gpu::Tensor<OHWI, T>& weights,
|
||||
const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
|
||||
RETURN_IF_ERROR(UploadWeights(weights, context));
|
||||
void ConvTexture::UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
|
||||
const tflite::gpu::Tensor<Linear, T>& biases) {
|
||||
UploadWeights(weights);
|
||||
|
||||
TensorLinearDescriptor desc;
|
||||
desc.storage_type = LinearStorageType::TEXTURE_2D;
|
||||
@ -124,16 +118,14 @@ absl::Status ConvTexture::UploadData(
|
||||
desc.UploadLinearData(biases);
|
||||
args_.AddObject("biases",
|
||||
absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
template <DataType T>
|
||||
absl::Status ConvTexture::UploadDataForWinograd4x4To6x6(
|
||||
const tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
|
||||
CLContext* context) {
|
||||
void ConvTexture::UploadDataForWinograd4x4To6x6(
|
||||
const tflite::gpu::Tensor<OHWI, T>& weights) {
|
||||
tflite::gpu::Tensor<OHWI, T> wino_weights;
|
||||
RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
|
||||
RETURN_IF_ERROR(UploadWeights(wino_weights, context));
|
||||
UploadWeights(wino_weights);
|
||||
|
||||
tflite::gpu::Tensor<Linear, DataType::FLOAT32> bias;
|
||||
bias.shape = Linear(1);
|
||||
@ -144,12 +136,10 @@ absl::Status ConvTexture::UploadDataForWinograd4x4To6x6(
|
||||
desc.UploadLinearData(bias);
|
||||
args_.AddObject("biases",
|
||||
absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
template <DataType T>
|
||||
absl::Status ConvTexture::UploadWeights(
|
||||
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
|
||||
void ConvTexture::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
|
||||
int dst_depth = DivideRoundUp(weights.shape.o, 4);
|
||||
dst_depth = AlignByN(dst_depth, block_size_.z);
|
||||
const int src_depth = DivideRoundUp(weights.shape.i, 4);
|
||||
@ -213,7 +203,6 @@ absl::Status ConvTexture::UploadWeights(
|
||||
absl::make_unique<Texture2DDescriptor>(std::move(desc2)));
|
||||
args_.AddObject("weights3",
|
||||
absl::make_unique<Texture2DDescriptor>(std::move(desc3)));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
template <DataType S, typename T>
|
||||
@ -261,19 +250,17 @@ void ConvTexture::RearrangeWeightsData(
|
||||
}
|
||||
}
|
||||
|
||||
absl::Status CreateConvTexture(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
ConvTexture* result);
|
||||
// Creates a ConvTexture for a 2D convolution described by `attr`, with kernel
// code generated for `device_info`. Returns the operation by value; there is
// no absl::Status result and no CreationContext parameter.
ConvTexture CreateConvTexture(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr);
|
||||
|
||||
absl::Status CreateConvTexture(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr,
|
||||
ConvTexture* result);
|
||||
// Overload that creates a ConvTexture from fully-connected attributes
// (weights and bias are taken from `attr`). Returns the operation by value.
ConvTexture CreateConvTexture(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr);
|
||||
|
||||
absl::Status CreateConvTextureWino4x4To6x6(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, ConvTexture* result);
|
||||
// Creates the Winograd 4x4->6x6 variant of ConvTexture from the weights in
// `attr`. Returns the operation by value.
ConvTexture CreateConvTextureWino4x4To6x6(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr);
|
||||
|
||||
} // namespace cl
|
||||
} // namespace gpu
|
||||
|
@ -55,8 +55,8 @@ TEST_F(OpenCLOperationTest, ConvTextureSimpleWeights) {
|
||||
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
|
||||
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
|
||||
TensorFloat32 dst_tensor;
|
||||
ConvTexture operation;
|
||||
ASSERT_OK(CreateConvTexture(creation_context_, op_def, attr, &operation));
|
||||
ConvTexture operation =
|
||||
CreateConvTexture(creation_context_.GetDeviceInfo(), op_def, attr);
|
||||
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
|
||||
BHWC(1, 2, 2, 1), &dst_tensor));
|
||||
EXPECT_THAT(dst_tensor.data,
|
||||
@ -90,8 +90,8 @@ TEST_F(OpenCLOperationTest, ConvTexture) {
|
||||
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
|
||||
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
|
||||
TensorFloat32 dst_tensor;
|
||||
ConvTexture operation;
|
||||
ASSERT_OK(CreateConvTexture(creation_context_, op_def, attr, &operation));
|
||||
ConvTexture operation =
|
||||
CreateConvTexture(creation_context_.GetDeviceInfo(), op_def, attr);
|
||||
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
|
||||
BHWC(1, 2, 2, 2), &dst_tensor));
|
||||
EXPECT_THAT(dst_tensor.data,
|
||||
|
@ -110,22 +110,20 @@ int3 FullyConnected::GetGridSize() const {
|
||||
return int3(dst_[0]->Slices(), 1, 1);
|
||||
}
|
||||
|
||||
absl::Status CreateFullyConnected(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr,
|
||||
FullyConnected* result) {
|
||||
*result = FullyConnected(definition, creation_context.device->info_);
|
||||
RETURN_IF_ERROR(
|
||||
result->UploadWeights(attr.weights, creation_context.context));
|
||||
FullyConnected CreateFullyConnected(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr) {
|
||||
FullyConnected result(definition, device_info);
|
||||
result.UploadWeights(attr.weights);
|
||||
|
||||
TensorLinearDescriptor desc;
|
||||
desc.storage_type = LinearStorageType::TEXTURE_2D;
|
||||
desc.element_type = definition.GetDataType();
|
||||
desc.UploadLinearData(attr.bias);
|
||||
result->args_.AddObject(
|
||||
result.args_.AddObject(
|
||||
"biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
|
||||
|
||||
return absl::OkStatus();
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace cl
|
||||
|
@ -105,21 +105,20 @@ class FullyConnected : public GPUOperation {
|
||||
|
||||
private:
|
||||
FullyConnected(const OperationDef& definition, const DeviceInfo& device_info);
|
||||
friend absl::Status CreateFullyConnected(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr, FullyConnected* result);
|
||||
friend FullyConnected CreateFullyConnected(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr);
|
||||
|
||||
template <DataType T>
|
||||
absl::Status UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
|
||||
CLContext* context);
|
||||
void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
|
||||
|
||||
std::string GetFullyConnectedKernelCode(const OperationDef& op_def,
|
||||
const int3& work_group_size);
|
||||
};
|
||||
|
||||
template <DataType T>
|
||||
absl::Status FullyConnected::UploadWeights(
|
||||
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
|
||||
void FullyConnected::UploadWeights(
|
||||
const tflite::gpu::Tensor<OHWI, T>& weights) {
|
||||
const int src_depth = DivideRoundUp(weights.shape.i, 4);
|
||||
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
|
||||
|
||||
@ -144,13 +143,11 @@ absl::Status FullyConnected::UploadWeights(
|
||||
|
||||
args_.AddObject("weights",
|
||||
absl::make_unique<BufferDescriptor>(std::move(desc)));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
absl::Status CreateFullyConnected(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr,
|
||||
FullyConnected* result);
|
||||
// Creates a FullyConnected operation for `definition` / `device_info`,
// uploads attr.weights, and registers attr.bias as the "biases" object
// (a TensorLinearDescriptor). Returns the operation by value.
FullyConnected CreateFullyConnected(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr);
|
||||
|
||||
} // namespace cl
|
||||
} // namespace gpu
|
||||
|
@ -52,9 +52,8 @@ TEST_F(OpenCLOperationTest, FullyConnected) {
|
||||
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
|
||||
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
|
||||
TensorFloat32 dst_tensor;
|
||||
FullyConnected operation;
|
||||
ASSERT_OK(
|
||||
CreateFullyConnected(creation_context_, op_def, attr, &operation));
|
||||
FullyConnected operation =
|
||||
CreateFullyConnected(creation_context_.GetDeviceInfo(), op_def, attr);
|
||||
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
|
||||
BHWC(1, 1, 1, 2), &dst_tensor));
|
||||
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {14.5f, 37.5f}));
|
||||
|
@ -68,6 +68,8 @@ struct CreationContext {
|
||||
CLContext* context;
|
||||
CLCommandQueue* queue;
|
||||
ProgramCache* cache;
|
||||
|
||||
// Returns a reference to the DeviceInfo stored on `device` (device->info_).
// Dereferences `device`, so `device` must be non-null when this is called.
const DeviceInfo& GetDeviceInfo() const { return device->info_; }
|
||||
};
|
||||
|
||||
struct OperationDef {
|
||||
|
@ -234,7 +234,7 @@ std::string Winograd4x4To36::GetWinograd4x4To36Code(
|
||||
return c;
|
||||
}
|
||||
|
||||
absl::Status Winograd4x4To36::UploadBt(CLContext* context) {
|
||||
void Winograd4x4To36::UploadBt() {
|
||||
tflite::gpu::Tensor<Linear, DataType::FLOAT32> bt_aligned;
|
||||
bt_aligned.shape = Linear(6 * 8);
|
||||
bt_aligned.data.resize(6 * 8);
|
||||
@ -253,7 +253,6 @@ absl::Status Winograd4x4To36::UploadBt(CLContext* context) {
|
||||
desc.UploadLinearData(bt_aligned);
|
||||
args_.AddObject("bt",
|
||||
absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
int3 Winograd4x4To36::SelectBestWorkGroup(const KernelInfo& kernel_info) const {
|
||||
@ -298,13 +297,12 @@ void Winograd4x4To36::GetPossibleKernelWorkGroups(
|
||||
}
|
||||
}
|
||||
|
||||
absl::Status CreateWinograd4x4To36(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const Padding2D& padding,
|
||||
Winograd4x4To36* result) {
|
||||
*result =
|
||||
Winograd4x4To36(definition, padding, creation_context.device->info_);
|
||||
return result->UploadBt(creation_context.context);
|
||||
// Factory for Winograd4x4To36: constructs the operation from `definition`,
// `padding` and `device_info`, calls UploadBt() on it, and returns it by
// value. No status is reported to the caller.
Winograd4x4To36 CreateWinograd4x4To36(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Padding2D& padding) {
|
||||
Winograd4x4To36 result(definition, padding, device_info);
|
||||
result.UploadBt();
|
||||
return result;
|
||||
}
|
||||
|
||||
Winograd36To4x4::Winograd36To4x4(const OperationDef& definition,
|
||||
@ -437,7 +435,7 @@ std::string Winograd36To4x4::GetWinograd36To4x4Code(
|
||||
return c;
|
||||
}
|
||||
|
||||
absl::Status Winograd36To4x4::UploadAt(CLContext* context) {
|
||||
void Winograd36To4x4::UploadAt() {
|
||||
tflite::gpu::Tensor<Linear, DataType::FLOAT32> at_aligned;
|
||||
at_aligned.shape = Linear(4 * 8);
|
||||
at_aligned.data.resize(4 * 8);
|
||||
@ -456,7 +454,6 @@ absl::Status Winograd36To4x4::UploadAt(CLContext* context) {
|
||||
desc.UploadLinearData(at_aligned);
|
||||
args_.AddObject("at",
|
||||
absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
int3 Winograd36To4x4::SelectBestWorkGroup(const KernelInfo& kernel_info) const {
|
||||
@ -496,18 +493,18 @@ void Winograd36To4x4::GetPossibleKernelWorkGroups(
|
||||
}
|
||||
}
|
||||
|
||||
absl::Status CreateWinograd36To4x4(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases,
|
||||
Winograd36To4x4* result) {
|
||||
*result = Winograd36To4x4(definition, creation_context.device->info_);
|
||||
Winograd36To4x4 CreateWinograd36To4x4(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases) {
|
||||
Winograd36To4x4 result(definition, device_info);
|
||||
TensorLinearDescriptor desc;
|
||||
desc.storage_type = LinearStorageType::TEXTURE_2D;
|
||||
desc.element_type = definition.GetDataType();
|
||||
desc.UploadLinearData(biases);
|
||||
result->args_.AddObject(
|
||||
result.args_.AddObject(
|
||||
"biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
|
||||
return result->UploadAt(creation_context.context);
|
||||
result.UploadAt();
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace cl
|
||||
|
@ -50,11 +50,11 @@ class Winograd4x4To36 : public GPUOperation {
|
||||
Winograd4x4To36& operator=(const Winograd4x4To36&) = delete;
|
||||
|
||||
private:
|
||||
friend absl::Status CreateWinograd4x4To36(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const Padding2D& padding, Winograd4x4To36* result);
|
||||
friend Winograd4x4To36 CreateWinograd4x4To36(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Padding2D& padding);
|
||||
|
||||
absl::Status UploadBt(CLContext* context);
|
||||
void UploadBt();
|
||||
|
||||
std::string GetWinograd4x4To36Code(const OperationDef& op_def);
|
||||
|
||||
@ -64,10 +64,9 @@ class Winograd4x4To36 : public GPUOperation {
|
||||
Padding2D padding_;
|
||||
};
|
||||
|
||||
absl::Status CreateWinograd4x4To36(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const Padding2D& padding,
|
||||
Winograd4x4To36* result);
|
||||
// Creates a Winograd4x4To36 operation for the given definition and padding,
// targeting `device_info`. Returns the operation by value.
Winograd4x4To36 CreateWinograd4x4To36(const DeviceInfo& device_info,
|
||||
const OperationDef& definition,
|
||||
const Padding2D& padding);
|
||||
|
||||
class Winograd36To4x4 : public GPUOperation {
|
||||
public:
|
||||
@ -88,12 +87,11 @@ class Winograd36To4x4 : public GPUOperation {
|
||||
Winograd36To4x4& operator=(const Winograd36To4x4&) = delete;
|
||||
|
||||
private:
|
||||
friend absl::Status CreateWinograd36To4x4(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases,
|
||||
Winograd36To4x4* result);
|
||||
friend Winograd36To4x4 CreateWinograd36To4x4(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases);
|
||||
|
||||
absl::Status UploadAt(CLContext* context);
|
||||
void UploadAt();
|
||||
|
||||
std::string GetWinograd36To4x4Code(const OperationDef& op_def);
|
||||
|
||||
@ -101,10 +99,9 @@ class Winograd36To4x4 : public GPUOperation {
|
||||
int3 SelectBestWorkGroup(const KernelInfo& kernel_info) const;
|
||||
};
|
||||
|
||||
absl::Status CreateWinograd36To4x4(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases,
|
||||
Winograd36To4x4* result);
|
||||
// Creates a Winograd36To4x4 operation for `definition` / `device_info`,
// registers `biases` as the "biases" object (a TensorLinearDescriptor whose
// element type is definition.GetDataType()), calls UploadAt(), and returns
// the operation by value.
Winograd36To4x4 CreateWinograd36To4x4(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases);
|
||||
|
||||
} // namespace cl
|
||||
} // namespace gpu
|
||||
|
@ -93,9 +93,8 @@ TEST_F(OpenCLOperationTest, Winograd4x4To36) {
|
||||
Padding2D padding;
|
||||
padding.prepended = HW(1, 1);
|
||||
padding.appended = HW(1, 1);
|
||||
Winograd4x4To36 wino_up;
|
||||
ASSERT_OK(
|
||||
CreateWinograd4x4To36(creation_context_, op_def, padding, &wino_up));
|
||||
Winograd4x4To36 wino_up = CreateWinograd4x4To36(
|
||||
creation_context_.GetDeviceInfo(), op_def, padding);
|
||||
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &wino_up,
|
||||
BHWC(1, 36, 1, 1), &dst_tensor));
|
||||
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), dst_ref.data));
|
||||
@ -162,9 +161,8 @@ TEST_F(OpenCLOperationTest, Winograd36To4x4) {
|
||||
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
|
||||
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
|
||||
TensorFloat32 dst_tensor;
|
||||
Winograd36To4x4 wino_down;
|
||||
ASSERT_OK(
|
||||
CreateWinograd36To4x4(creation_context_, op_def, biases, &wino_down));
|
||||
Winograd36To4x4 wino_down = CreateWinograd36To4x4(
|
||||
creation_context_.GetDeviceInfo(), op_def, biases);
|
||||
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &wino_down,
|
||||
BHWC(1, 4, 4, 1), &dst_tensor));
|
||||
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), dst_ref.data));
|
||||
|
@ -32,17 +32,15 @@ namespace {
|
||||
|
||||
absl::Status SelectConvolutionAdreno(const Convolution2DAttributes& attr,
|
||||
const BHWC& dst_shape,
|
||||
const CreationContext& creation_context,
|
||||
const DeviceInfo& device_info,
|
||||
const OperationDef& op_def,
|
||||
ModelHints hints,
|
||||
std::unique_ptr<GPUOperation>* ptr) {
|
||||
if (IsConvConstantsSupported(*creation_context.device, op_def, attr)) {
|
||||
ConvConstants conv;
|
||||
RETURN_IF_ERROR(CreateConvConstants(creation_context, op_def, attr, &conv));
|
||||
if (IsConvConstantsSupported(device_info, op_def, attr)) {
|
||||
ConvConstants conv = CreateConvConstants(device_info, op_def, attr);
|
||||
*ptr = absl::make_unique<ConvConstants>(std::move(conv));
|
||||
} else {
|
||||
ConvTexture conv;
|
||||
RETURN_IF_ERROR(CreateConvTexture(creation_context, op_def, attr, &conv));
|
||||
ConvTexture conv = CreateConvTexture(device_info, op_def, attr);
|
||||
*ptr = absl::make_unique<ConvTexture>(std::move(conv));
|
||||
}
|
||||
return absl::OkStatus();
|
||||
@ -50,23 +48,20 @@ absl::Status SelectConvolutionAdreno(const Convolution2DAttributes& attr,
|
||||
|
||||
absl::Status SelectConvolutionWinogradAdreno(
|
||||
const Convolution2DAttributes& attr, const BHWC& dst_shape,
|
||||
const CreationContext& creation_context, const OperationDef& op_def,
|
||||
ModelHints hints, std::unique_ptr<GPUOperation>* ptr) {
|
||||
ConvTexture conv;
|
||||
RETURN_IF_ERROR(
|
||||
CreateConvTextureWino4x4To6x6(creation_context, op_def, attr, &conv));
|
||||
const DeviceInfo& device_info, const OperationDef& op_def, ModelHints hints,
|
||||
std::unique_ptr<GPUOperation>* ptr) {
|
||||
ConvTexture conv = CreateConvTextureWino4x4To6x6(device_info, op_def, attr);
|
||||
*ptr = absl::make_unique<ConvTexture>(std::move(conv));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
absl::Status SelectConvolutionDynamicWeightsAdreno(
|
||||
const Convolution2DAttributes& attr, const BHWC& weights_shape,
|
||||
const BHWC& dst_shape, const CreationContext& creation_context,
|
||||
const BHWC& dst_shape, const DeviceInfo& device_info,
|
||||
const OperationDef& op_def, ModelHints hints,
|
||||
std::unique_ptr<GPUOperation>* ptr, ConvWeightsDescription* weights_desc) {
|
||||
ConvPowerVR conv;
|
||||
RETURN_IF_ERROR(CreateConvPowerVRDynamicWeights(
|
||||
creation_context, op_def, attr, weights_shape, &conv, &dst_shape));
|
||||
ConvPowerVR conv = CreateConvPowerVRDynamicWeights(
|
||||
device_info, op_def, attr, weights_shape, &dst_shape);
|
||||
*weights_desc = conv.GetConvWeightsDescription();
|
||||
*ptr = absl::make_unique<ConvPowerVR>(std::move(conv));
|
||||
return absl::OkStatus();
|
||||
@ -74,86 +69,77 @@ absl::Status SelectConvolutionDynamicWeightsAdreno(
|
||||
|
||||
absl::Status SelectConvolutionNVidia(const Convolution2DAttributes& attr,
|
||||
const BHWC& dst_shape,
|
||||
const CreationContext& creation_context,
|
||||
const DeviceInfo& device_info,
|
||||
const OperationDef& op_def,
|
||||
std::unique_ptr<GPUOperation>* ptr) {
|
||||
if (IsConvConstantsSupported(*creation_context.device, op_def, attr)) {
|
||||
ConvConstants conv;
|
||||
RETURN_IF_ERROR(CreateConvConstants(creation_context, op_def, attr, &conv));
|
||||
if (IsConvConstantsSupported(device_info, op_def, attr)) {
|
||||
ConvConstants conv = CreateConvConstants(device_info, op_def, attr);
|
||||
*ptr = absl::make_unique<ConvConstants>(std::move(conv));
|
||||
} else {
|
||||
ConvPowerVR conv;
|
||||
RETURN_IF_ERROR(
|
||||
CreateConvPowerVR(creation_context, op_def, attr, &conv, &dst_shape));
|
||||
ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
|
||||
*ptr = absl::make_unique<ConvPowerVR>(std::move(conv));
|
||||
}
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
absl::Status SelectConvolutionPowerVR(const Convolution2DAttributes& attr,
|
||||
const CreationContext& creation_context,
|
||||
const DeviceInfo& device_info,
|
||||
const OperationDef& op_def,
|
||||
std::unique_ptr<GPUOperation>* ptr) {
|
||||
ConvPowerVR conv;
|
||||
RETURN_IF_ERROR(CreateConvPowerVR(creation_context, op_def, attr, &conv));
|
||||
ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr);
|
||||
*ptr = absl::make_unique<ConvPowerVR>(std::move(conv));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
absl::Status SelectConvolutionMali(const Convolution2DAttributes& attr,
|
||||
const BHWC& dst_shape,
|
||||
const CreationContext& creation_context,
|
||||
const DeviceInfo& device_info,
|
||||
const OperationDef& op_def,
|
||||
std::unique_ptr<GPUOperation>* ptr) {
|
||||
if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER &&
|
||||
IsConvBuffer1x1Supported(op_def, attr)) {
|
||||
ConvBuffer1x1 conv;
|
||||
RETURN_IF_ERROR(
|
||||
CreateConvBuffer1x1(creation_context, op_def, attr, &conv, &dst_shape));
|
||||
ConvBuffer1x1 conv =
|
||||
CreateConvBuffer1x1(device_info, op_def, attr, &dst_shape);
|
||||
*ptr = absl::make_unique<ConvBuffer1x1>(std::move(conv));
|
||||
} else {
|
||||
ConvPowerVR conv;
|
||||
RETURN_IF_ERROR(
|
||||
CreateConvPowerVR(creation_context, op_def, attr, &conv, &dst_shape));
|
||||
ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
|
||||
*ptr = absl::make_unique<ConvPowerVR>(std::move(conv));
|
||||
}
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
absl::Status SelectConvolutionWinogradMali(
|
||||
const Convolution2DAttributes& attr, const BHWC& dst_shape,
|
||||
const CreationContext& creation_context, const OperationDef& op_def,
|
||||
std::unique_ptr<GPUOperation>* ptr) {
|
||||
absl::Status SelectConvolutionWinogradMali(const Convolution2DAttributes& attr,
|
||||
const BHWC& dst_shape,
|
||||
const DeviceInfo& device_info,
|
||||
const OperationDef& op_def,
|
||||
std::unique_ptr<GPUOperation>* ptr) {
|
||||
if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER) {
|
||||
ConvBuffer1x1 conv;
|
||||
RETURN_IF_ERROR(CreateConvBuffer1x1Wino4x4To6x6(creation_context, op_def,
|
||||
attr, &conv, &dst_shape));
|
||||
ConvBuffer1x1 conv =
|
||||
CreateConvBuffer1x1Wino4x4To6x6(device_info, op_def, attr, &dst_shape);
|
||||
*ptr = absl::make_unique<ConvBuffer1x1>(std::move(conv));
|
||||
} else {
|
||||
ConvPowerVR conv;
|
||||
RETURN_IF_ERROR(CreateConvPowerVRWino4x4To6x6(creation_context, op_def,
|
||||
attr, &conv, &dst_shape));
|
||||
ConvPowerVR conv =
|
||||
CreateConvPowerVRWino4x4To6x6(device_info, op_def, attr, &dst_shape);
|
||||
*ptr = absl::make_unique<ConvPowerVR>(std::move(conv));
|
||||
}
|
||||
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
absl::Status SelectConvolutionDynamicWeightsMali(
|
||||
const Convolution2DAttributes& attr, const BHWC& weights_shape,
|
||||
const BHWC& dst_shape, const CreationContext& creation_context,
|
||||
const BHWC& dst_shape, const DeviceInfo& device_info,
|
||||
const OperationDef& op_def, ModelHints hints,
|
||||
std::unique_ptr<GPUOperation>* ptr, ConvWeightsDescription* weights_desc) {
|
||||
if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER &&
|
||||
IsConvBuffer1x1Supported(op_def, weights_shape, attr)) {
|
||||
ConvBuffer1x1 conv;
|
||||
RETURN_IF_ERROR(CreateConvBuffer1x1DynamicWeights(
|
||||
creation_context, op_def, attr, weights_shape, &conv, &dst_shape));
|
||||
ConvBuffer1x1 conv = CreateConvBuffer1x1DynamicWeights(
|
||||
device_info, op_def, attr, weights_shape, &dst_shape);
|
||||
*weights_desc = conv.GetConvWeightsDescription();
|
||||
*ptr = absl::make_unique<ConvBuffer1x1>(std::move(conv));
|
||||
} else {
|
||||
ConvPowerVR conv;
|
||||
RETURN_IF_ERROR(CreateConvPowerVRDynamicWeights(
|
||||
creation_context, op_def, attr, weights_shape, &conv, &dst_shape));
|
||||
ConvPowerVR conv = CreateConvPowerVRDynamicWeights(
|
||||
device_info, op_def, attr, weights_shape, &dst_shape);
|
||||
*weights_desc = conv.GetConvWeightsDescription();
|
||||
*ptr = absl::make_unique<ConvPowerVR>(std::move(conv));
|
||||
}
|
||||
@ -164,70 +150,65 @@ absl::Status SelectConvolutionDynamicWeightsMali(
|
||||
|
||||
absl::Status SelectConvolution(const Convolution2DAttributes& attr,
|
||||
const BHWC& dst_shape,
|
||||
const CreationContext& creation_context,
|
||||
const DeviceInfo& device_info,
|
||||
const OperationDef& op_def, ModelHints hints,
|
||||
std::unique_ptr<GPUOperation>* ptr) {
|
||||
const auto& device_info = creation_context.device->info_;
|
||||
if (device_info.IsAdreno()) {
|
||||
return SelectConvolutionAdreno(attr, dst_shape, creation_context, op_def,
|
||||
hints, ptr);
|
||||
return SelectConvolutionAdreno(attr, dst_shape, device_info, op_def, hints,
|
||||
ptr);
|
||||
} else if (device_info.IsPowerVR() || device_info.IsAMD() ||
|
||||
device_info.IsIntel()) {
|
||||
return SelectConvolutionPowerVR(attr, creation_context, op_def, ptr);
|
||||
return SelectConvolutionPowerVR(attr, device_info, op_def, ptr);
|
||||
} else if (device_info.IsNvidia()) {
|
||||
return SelectConvolutionNVidia(attr, dst_shape, creation_context, op_def,
|
||||
ptr);
|
||||
return SelectConvolutionNVidia(attr, dst_shape, device_info, op_def, ptr);
|
||||
} else if (device_info.IsMali()) {
|
||||
return SelectConvolutionMali(attr, dst_shape, creation_context, op_def,
|
||||
ptr);
|
||||
return SelectConvolutionMali(attr, dst_shape, device_info, op_def, ptr);
|
||||
} else {
|
||||
return SelectConvolutionAdreno(attr, dst_shape, creation_context, op_def,
|
||||
hints, ptr);
|
||||
return SelectConvolutionAdreno(attr, dst_shape, device_info, op_def, hints,
|
||||
ptr);
|
||||
}
|
||||
}
|
||||
|
||||
absl::Status SelectConvolutionForWinograd(
|
||||
const Convolution2DAttributes& attr, const BHWC& dst_shape,
|
||||
const CreationContext& creation_context, const OperationDef& op_def,
|
||||
ModelHints hints, std::unique_ptr<GPUOperation>* ptr) {
|
||||
const auto& device_info = creation_context.device->info_;
|
||||
absl::Status SelectConvolutionForWinograd(const Convolution2DAttributes& attr,
|
||||
const BHWC& dst_shape,
|
||||
const DeviceInfo& device_info,
|
||||
const OperationDef& op_def,
|
||||
ModelHints hints,
|
||||
std::unique_ptr<GPUOperation>* ptr) {
|
||||
if (device_info.IsAdreno()) {
|
||||
return SelectConvolutionWinogradAdreno(attr, dst_shape, creation_context,
|
||||
op_def, hints, ptr);
|
||||
return SelectConvolutionWinogradAdreno(attr, dst_shape, device_info, op_def,
|
||||
hints, ptr);
|
||||
} else if (device_info.IsPowerVR() || device_info.IsAMD() ||
|
||||
device_info.IsNvidia() || device_info.IsIntel()) {
|
||||
ConvPowerVR conv;
|
||||
RETURN_IF_ERROR(CreateConvPowerVRWino4x4To6x6(creation_context, op_def,
|
||||
attr, &conv, &dst_shape));
|
||||
*ptr = absl::make_unique<ConvPowerVR>(std::move(conv));
|
||||
return absl::OkStatus();
|
||||
ConvPowerVR conv =
|
||||
CreateConvPowerVRWino4x4To6x6(device_info, op_def, attr, &dst_shape);
|
||||
*ptr = absl::make_unique<ConvPowerVR>(std::move(conv));
|
||||
return absl::OkStatus();
|
||||
} else if (device_info.IsMali()) {
|
||||
return SelectConvolutionWinogradMali(attr, dst_shape, creation_context,
|
||||
op_def, ptr);
|
||||
return SelectConvolutionWinogradMali(attr, dst_shape, device_info, op_def,
|
||||
ptr);
|
||||
} else {
|
||||
return SelectConvolutionWinogradAdreno(attr, dst_shape, creation_context,
|
||||
op_def, hints, ptr);
|
||||
return SelectConvolutionWinogradAdreno(attr, dst_shape, device_info, op_def,
|
||||
hints, ptr);
|
||||
}
|
||||
}
|
||||
|
||||
absl::Status SelectConvolutionWithDynamicWeights(
|
||||
const Convolution2DAttributes& attr, const BHWC& weights_shape,
|
||||
const BHWC& dst_shape, const CreationContext& creation_context,
|
||||
const BHWC& dst_shape, const DeviceInfo& device_info,
|
||||
const OperationDef& op_def, ModelHints hints,
|
||||
std::unique_ptr<GPUOperation>* ptr, ConvWeightsDescription* weights_desc) {
|
||||
const auto& device_info = creation_context.device->info_;
|
||||
if (device_info.IsAdreno()) {
|
||||
return SelectConvolutionDynamicWeightsAdreno(attr, weights_shape, dst_shape,
|
||||
creation_context, op_def,
|
||||
hints, ptr, weights_desc);
|
||||
device_info, op_def, hints,
|
||||
ptr, weights_desc);
|
||||
} else if (device_info.IsMali()) {
|
||||
return SelectConvolutionDynamicWeightsMali(attr, weights_shape, dst_shape,
|
||||
creation_context, op_def, hints,
|
||||
ptr, weights_desc);
|
||||
device_info, op_def, hints, ptr,
|
||||
weights_desc);
|
||||
} else {
|
||||
ConvPowerVR conv;
|
||||
RETURN_IF_ERROR(CreateConvPowerVRDynamicWeights(
|
||||
creation_context, op_def, attr, weights_shape, &conv, &dst_shape));
|
||||
ConvPowerVR conv = CreateConvPowerVRDynamicWeights(
|
||||
device_info, op_def, attr, weights_shape, &dst_shape);
|
||||
*weights_desc = conv.GetConvWeightsDescription();
|
||||
*ptr = absl::make_unique<ConvPowerVR>(std::move(conv));
|
||||
return absl::OkStatus();
|
||||
@ -235,8 +216,7 @@ absl::Status SelectConvolutionWithDynamicWeights(
|
||||
}
|
||||
|
||||
absl::Status SelectConverterToConvWeights(
|
||||
const ConvWeightsDescription& weights_desc,
|
||||
const CreationContext& creation_context, const OperationDef& op_def,
|
||||
const ConvWeightsDescription& weights_desc, const OperationDef& op_def,
|
||||
ModelHints hints, std::unique_ptr<GPUOperation>* ptr) {
|
||||
ConverterToConvWeights converter =
|
||||
ConverterToConvWeights(op_def, weights_desc);
|
||||
|
@ -31,24 +31,25 @@ namespace cl {
|
||||
|
||||
absl::Status SelectConvolution(const Convolution2DAttributes& attr,
|
||||
const BHWC& dst_shape,
|
||||
const CreationContext& creation_context,
|
||||
const DeviceInfo& device_info,
|
||||
const OperationDef& op_def, ModelHints hints,
|
||||
std::unique_ptr<GPUOperation>* ptr);
|
||||
|
||||
absl::Status SelectConvolutionForWinograd(
|
||||
const Convolution2DAttributes& attr, const BHWC& dst_shape,
|
||||
const CreationContext& creation_context, const OperationDef& op_def,
|
||||
ModelHints hints, std::unique_ptr<GPUOperation>* ptr);
|
||||
absl::Status SelectConvolutionForWinograd(const Convolution2DAttributes& attr,
|
||||
const BHWC& dst_shape,
|
||||
const DeviceInfo& device_info,
|
||||
const OperationDef& op_def,
|
||||
ModelHints hints,
|
||||
std::unique_ptr<GPUOperation>* ptr);
|
||||
|
||||
absl::Status SelectConvolutionWithDynamicWeights(
|
||||
const Convolution2DAttributes& attr, const BHWC& weights_shape,
|
||||
const BHWC& dst_shape, const CreationContext& creation_context,
|
||||
const BHWC& dst_shape, const DeviceInfo& device_info,
|
||||
const OperationDef& op_def, ModelHints hints,
|
||||
std::unique_ptr<GPUOperation>* ptr, ConvWeightsDescription* weights_desc);
|
||||
|
||||
absl::Status SelectConverterToConvWeights(
|
||||
const ConvWeightsDescription& weights_desc,
|
||||
const CreationContext& creation_context, const OperationDef& op_def,
|
||||
const ConvWeightsDescription& weights_desc, const OperationDef& op_def,
|
||||
ModelHints hints, std::unique_ptr<GPUOperation>* ptr);
|
||||
|
||||
} // namespace cl
|
||||
|
@ -27,97 +27,87 @@ namespace tflite {
|
||||
namespace gpu {
|
||||
namespace cl {
|
||||
|
||||
absl::Status SelectFullyConnectedGeneric(
|
||||
const FullyConnectedAttributes& attr,
|
||||
const CreationContext& creation_context, const OperationDef& op_def,
|
||||
int batch_size, std::unique_ptr<GPUOperation>* ptr) {
|
||||
absl::Status SelectFullyConnectedGeneric(const FullyConnectedAttributes& attr,
|
||||
const DeviceInfo& device_info,
|
||||
const OperationDef& op_def,
|
||||
int batch_size,
|
||||
std::unique_ptr<GPUOperation>* ptr) {
|
||||
if (op_def.IsBatchSupported()) {
|
||||
ConvTexture conv;
|
||||
RETURN_IF_ERROR(CreateConvTexture(creation_context, op_def, attr, &conv));
|
||||
ConvTexture conv = CreateConvTexture(device_info, op_def, attr);
|
||||
*ptr = absl::make_unique<ConvTexture>(std::move(conv));
|
||||
} else {
|
||||
FullyConnected fc;
|
||||
RETURN_IF_ERROR(CreateFullyConnected(creation_context, op_def, attr, &fc));
|
||||
FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);
|
||||
*ptr = absl::make_unique<FullyConnected>(std::move(fc));
|
||||
}
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
absl::Status SelectFullyConnectedAdreno(const FullyConnectedAttributes& attr,
|
||||
const CreationContext& creation_context,
|
||||
const DeviceInfo& device_info,
|
||||
const OperationDef& op_def,
|
||||
int batch_size,
|
||||
std::unique_ptr<GPUOperation>* ptr) {
|
||||
if (op_def.IsBatchSupported()) {
|
||||
ConvTexture conv;
|
||||
RETURN_IF_ERROR(CreateConvTexture(creation_context, op_def, attr, &conv));
|
||||
ConvTexture conv = CreateConvTexture(device_info, op_def, attr);
|
||||
*ptr = absl::make_unique<ConvTexture>(std::move(conv));
|
||||
} else {
|
||||
FullyConnected fc;
|
||||
RETURN_IF_ERROR(CreateFullyConnected(creation_context, op_def, attr, &fc));
|
||||
FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);
|
||||
*ptr = absl::make_unique<FullyConnected>(std::move(fc));
|
||||
}
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
absl::Status SelectFullyConnectedPowerVR(
|
||||
const FullyConnectedAttributes& attr,
|
||||
const CreationContext& creation_context, const OperationDef& op_def,
|
||||
int batch_size, std::unique_ptr<GPUOperation>* ptr) {
|
||||
absl::Status SelectFullyConnectedPowerVR(const FullyConnectedAttributes& attr,
|
||||
const DeviceInfo& device_info,
|
||||
const OperationDef& op_def,
|
||||
int batch_size,
|
||||
std::unique_ptr<GPUOperation>* ptr) {
|
||||
if (op_def.IsBatchSupported()) {
|
||||
ConvPowerVR conv;
|
||||
RETURN_IF_ERROR(CreateConvPowerVR(creation_context, op_def, attr, &conv));
|
||||
ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr);
|
||||
*ptr = absl::make_unique<ConvPowerVR>(std::move(conv));
|
||||
} else {
|
||||
FullyConnected fc;
|
||||
RETURN_IF_ERROR(CreateFullyConnected(creation_context, op_def, attr, &fc));
|
||||
FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);
|
||||
*ptr = absl::make_unique<FullyConnected>(std::move(fc));
|
||||
}
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
absl::Status SelectFullyConnectedMali(const FullyConnectedAttributes& attr,
|
||||
const CreationContext& creation_context,
|
||||
const DeviceInfo& device_info,
|
||||
const OperationDef& op_def,
|
||||
int batch_size,
|
||||
std::unique_ptr<GPUOperation>* ptr) {
|
||||
if (op_def.IsBatchSupported()) {
|
||||
if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER) {
|
||||
ConvBuffer1x1 conv;
|
||||
RETURN_IF_ERROR(
|
||||
CreateConvBuffer1x1(creation_context, op_def, attr, &conv));
|
||||
ConvBuffer1x1 conv = CreateConvBuffer1x1(device_info, op_def, attr);
|
||||
*ptr = absl::make_unique<ConvBuffer1x1>(std::move(conv));
|
||||
} else {
|
||||
ConvTexture conv;
|
||||
RETURN_IF_ERROR(CreateConvTexture(creation_context, op_def, attr, &conv));
|
||||
ConvTexture conv = CreateConvTexture(device_info, op_def, attr);
|
||||
*ptr = absl::make_unique<ConvTexture>(std::move(conv));
|
||||
}
|
||||
} else {
|
||||
FullyConnected fc;
|
||||
RETURN_IF_ERROR(CreateFullyConnected(creation_context, op_def, attr, &fc));
|
||||
FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);
|
||||
*ptr = absl::make_unique<FullyConnected>(std::move(fc));
|
||||
}
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
absl::Status SelectFullyConnected(const FullyConnectedAttributes& attr,
|
||||
const CreationContext& creation_context,
|
||||
const DeviceInfo& device_info,
|
||||
const OperationDef& op_def, int batch_size,
|
||||
std::unique_ptr<GPUOperation>* ptr) {
|
||||
const auto& device_info = creation_context.device->info_;
|
||||
if (device_info.IsAdreno()) {
|
||||
return SelectFullyConnectedAdreno(attr, creation_context, op_def,
|
||||
batch_size, ptr);
|
||||
return SelectFullyConnectedAdreno(attr, device_info, op_def, batch_size,
|
||||
ptr);
|
||||
} else if (device_info.IsPowerVR() || device_info.IsAMD() ||
|
||||
device_info.IsNvidia() || device_info.IsIntel()) {
|
||||
return SelectFullyConnectedPowerVR(attr, creation_context, op_def,
|
||||
batch_size, ptr);
|
||||
return SelectFullyConnectedPowerVR(attr, device_info, op_def, batch_size,
|
||||
ptr);
|
||||
} else if (device_info.IsMali()) {
|
||||
return SelectFullyConnectedMali(attr, creation_context, op_def, batch_size,
|
||||
ptr);
|
||||
return SelectFullyConnectedMali(attr, device_info, op_def, batch_size, ptr);
|
||||
} else {
|
||||
return SelectFullyConnectedGeneric(attr, creation_context, op_def,
|
||||
batch_size, ptr);
|
||||
return SelectFullyConnectedGeneric(attr, device_info, op_def, batch_size,
|
||||
ptr);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -27,7 +27,7 @@ namespace gpu {
|
||||
namespace cl {
|
||||
|
||||
absl::Status SelectFullyConnected(const FullyConnectedAttributes& attr,
|
||||
const CreationContext& creation_context,
|
||||
const DeviceInfo& device_info,
|
||||
const OperationDef& op_def, int batch_size,
|
||||
std::unique_ptr<GPUOperation>* ptr);
|
||||
|
||||
|
@ -39,7 +39,7 @@ namespace gpu {
|
||||
namespace cl {
|
||||
namespace {
|
||||
bool IsSuitableForWinograd4x4To6x6(const Convolution2DAttributes& attr,
|
||||
const CLDevice& device,
|
||||
const DeviceInfo& device_info,
|
||||
const BHWC& dst_shape) {
|
||||
const int tiles_x = DivideRoundUp(dst_shape.w, 4);
|
||||
const int tiles_y = DivideRoundUp(dst_shape.h, 4);
|
||||
@ -49,23 +49,22 @@ bool IsSuitableForWinograd4x4To6x6(const Convolution2DAttributes& attr,
|
||||
attr.weights.shape.w == 3 && attr.weights.shape.h == 3 &&
|
||||
attr.dilations == HW(1, 1) && attr.strides == HW(1, 1);
|
||||
// Mali among other devices has smaller SIMD line size
|
||||
const int min_depth = device.IsMali() ? 16 : 32;
|
||||
const int min_hw = device.IsMali() ? 32 : 128;
|
||||
const int min_depth = device_info.IsMali() ? 16 : 32;
|
||||
const int min_hw = device_info.IsMali() ? 32 : 128;
|
||||
const bool recommended_channels =
|
||||
dst_depth % 4 == 0 && src_depth >= min_depth && dst_depth >= min_depth;
|
||||
const bool recommended_hw = tiles_x * tiles_y >= min_hw;
|
||||
return suitable_attributes && recommended_channels && recommended_hw;
|
||||
}
|
||||
|
||||
absl::Status WinogradFromNode(const CreationContext& creation_context,
|
||||
absl::Status WinogradFromNode(const DeviceInfo& device_info,
|
||||
const std::vector<Value*>& inputs,
|
||||
const std::vector<Value*>& outputs,
|
||||
const OperationDef& op_def, ModelHints hints,
|
||||
const BHWC& input_shape, const BHWC& output_shape,
|
||||
const Convolution2DAttributes& attr,
|
||||
GPUOperationsSubgraph* gpu_subgraph) {
|
||||
if (!IsSuitableForWinograd4x4To6x6(attr, *creation_context.device,
|
||||
output_shape)) {
|
||||
if (!IsSuitableForWinograd4x4To6x6(attr, device_info, output_shape)) {
|
||||
return absl::UnimplementedError("No implementation for this case.");
|
||||
}
|
||||
|
||||
@ -75,16 +74,14 @@ absl::Status WinogradFromNode(const CreationContext& creation_context,
|
||||
const BHWC shape_1{input_shape.b, 36, tiles_x * tiles_y, output_shape.c};
|
||||
TensorDescriptor td_0;
|
||||
td_0.storage_type = SelectBestStorageType(
|
||||
creation_context.device->info_, shape_0,
|
||||
op_def.src_tensors[0].storage_type, op_def.src_tensors[0].data_type,
|
||||
op_def.src_tensors[0].layout);
|
||||
device_info, shape_0, op_def.src_tensors[0].storage_type,
|
||||
op_def.src_tensors[0].data_type, op_def.src_tensors[0].layout);
|
||||
td_0.data_type = op_def.src_tensors[0].data_type;
|
||||
td_0.layout = op_def.src_tensors[0].layout;
|
||||
TensorDescriptor td_1;
|
||||
td_1.storage_type = SelectBestStorageType(
|
||||
creation_context.device->info_, shape_1,
|
||||
op_def.src_tensors[0].storage_type, op_def.src_tensors[0].data_type,
|
||||
op_def.src_tensors[0].layout);
|
||||
device_info, shape_1, op_def.src_tensors[0].storage_type,
|
||||
op_def.src_tensors[0].data_type, op_def.src_tensors[0].layout);
|
||||
td_1.data_type = op_def.src_tensors[0].data_type;
|
||||
td_1.layout = op_def.src_tensors[0].layout;
|
||||
gpu_subgraph->new_tensors = {{shape_0, td_0}, {shape_1, td_1}};
|
||||
@ -96,8 +93,8 @@ absl::Status WinogradFromNode(const CreationContext& creation_context,
|
||||
winograd_up_def.src_tensors.push_back(op_def.src_tensors[0]);
|
||||
winograd_up_def.dst_tensors.push_back(td_0);
|
||||
auto& winograd_up = gpu_subgraph->operations[0];
|
||||
RETURN_IF_ERROR(SelectWinograd4x4To36(
|
||||
creation_context, attr.padding, winograd_up_def, &winograd_up.operation));
|
||||
winograd_up.operation =
|
||||
SelectWinograd4x4To36(device_info, attr.padding, winograd_up_def);
|
||||
winograd_up.input_ids = {static_cast<int>(inputs[0]->id)};
|
||||
winograd_up.output_ids = {-1};
|
||||
|
||||
@ -109,7 +106,7 @@ absl::Status WinogradFromNode(const CreationContext& creation_context,
|
||||
conv.input_ids = {-1};
|
||||
conv.output_ids = {-2};
|
||||
RETURN_IF_ERROR(SelectConvolutionForWinograd(
|
||||
attr, input_shape, creation_context, conv_def, hints, &conv.operation));
|
||||
attr, input_shape, device_info, conv_def, hints, &conv.operation));
|
||||
|
||||
OperationDef winograd_down_def;
|
||||
winograd_down_def.precision = op_def.precision;
|
||||
@ -123,8 +120,8 @@ absl::Status WinogradFromNode(const CreationContext& creation_context,
|
||||
bias_copy.shape = Linear(attr.weights.shape.o);
|
||||
bias_copy.data.resize(attr.weights.shape.o);
|
||||
}
|
||||
RETURN_IF_ERROR(SelectWinograd36To4x4(creation_context, winograd_down_def,
|
||||
bias_copy, &winograd_down.operation));
|
||||
winograd_down.operation =
|
||||
SelectWinograd36To4x4(device_info, winograd_down_def, bias_copy);
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
@ -183,13 +180,15 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
|
||||
auto input_shape = inputs[0]->tensor.shape;
|
||||
auto output_shape = outputs[0]->tensor.shape;
|
||||
if (inputs.size() == 1) {
|
||||
if (WinogradFromNode(creation_context, inputs, outputs, op_def, hints,
|
||||
input_shape, output_shape, attr, gpu_subgraph)
|
||||
if (WinogradFromNode(creation_context.GetDeviceInfo(), inputs, outputs,
|
||||
op_def, hints, input_shape, output_shape, attr,
|
||||
gpu_subgraph)
|
||||
.ok()) {
|
||||
return absl::OkStatus();
|
||||
} else {
|
||||
gpu_op = InitSingleOpSubgraph(inputs, outputs, gpu_subgraph);
|
||||
return SelectConvolution(attr, output_shape, creation_context, op_def,
|
||||
return SelectConvolution(attr, output_shape,
|
||||
creation_context.GetDeviceInfo(), op_def,
|
||||
hints, gpu_op);
|
||||
}
|
||||
} else {
|
||||
@ -207,8 +206,8 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
|
||||
conv_def.src_tensors[1] = weights_desc;
|
||||
ConvWeightsDescription conv_weights_desc;
|
||||
RETURN_IF_ERROR(SelectConvolutionWithDynamicWeights(
|
||||
attr, weights_shape, output_shape, creation_context, conv_def,
|
||||
hints, &conv_op.operation, &conv_weights_desc));
|
||||
attr, weights_shape, output_shape, creation_context.GetDeviceInfo(),
|
||||
conv_def, hints, &conv_op.operation, &conv_weights_desc));
|
||||
|
||||
int aligned_output =
|
||||
AlignByN(weights_shape.b, conv_weights_desc.output_group_size * 4);
|
||||
@ -225,9 +224,8 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
|
||||
|
||||
converter_op.input_ids = {static_cast<int>(inputs[1]->id)};
|
||||
converter_op.output_ids = {-1};
|
||||
return SelectConverterToConvWeights(conv_weights_desc, creation_context,
|
||||
converter_def, hints,
|
||||
&converter_op.operation);
|
||||
return SelectConverterToConvWeights(conv_weights_desc, converter_def,
|
||||
hints, &converter_op.operation);
|
||||
}
|
||||
}
|
||||
case OperationType::CONVOLUTION_TRANSPOSED: {
|
||||
@ -244,8 +242,8 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
|
||||
case OperationType::FULLY_CONNECTED: {
|
||||
auto attr =
|
||||
absl::any_cast<FullyConnectedAttributes>(node.operation.attributes);
|
||||
return SelectFullyConnected(attr, creation_context, op_def,
|
||||
inputs[0]->tensor.shape.b, gpu_op);
|
||||
return SelectFullyConnected(attr, creation_context.GetDeviceInfo(),
|
||||
op_def, inputs[0]->tensor.shape.b, gpu_op);
|
||||
}
|
||||
case OperationType::LSTM: {
|
||||
SelectLSTM(op_def, creation_context.device->info_, gpu_op);
|
||||
|
@ -179,26 +179,18 @@ void SelectTranspose(const TransposeAttributes& attr,
|
||||
*ptr = absl::make_unique<GPUOperation>(std::move(operation));
|
||||
}
|
||||
|
||||
absl::Status SelectWinograd4x4To36(const CreationContext& creation_context,
|
||||
const Padding2D& padding,
|
||||
const OperationDef& op_def,
|
||||
std::unique_ptr<GPUOperation>* ptr) {
|
||||
Winograd4x4To36 operation;
|
||||
RETURN_IF_ERROR(
|
||||
CreateWinograd4x4To36(creation_context, op_def, padding, &operation));
|
||||
*ptr = absl::make_unique<Winograd4x4To36>(std::move(operation));
|
||||
return absl::OkStatus();
|
||||
std::unique_ptr<GPUOperation> SelectWinograd4x4To36(
|
||||
const DeviceInfo& device_info, const Padding2D& padding,
|
||||
const OperationDef& op_def) {
|
||||
return absl::make_unique<Winograd4x4To36>(
|
||||
CreateWinograd4x4To36(device_info, op_def, padding));
|
||||
}
|
||||
|
||||
absl::Status SelectWinograd36To4x4(
|
||||
const CreationContext& creation_context, const OperationDef& op_def,
|
||||
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases,
|
||||
std::unique_ptr<GPUOperation>* ptr) {
|
||||
Winograd36To4x4 operation;
|
||||
RETURN_IF_ERROR(
|
||||
CreateWinograd36To4x4(creation_context, op_def, biases, &operation));
|
||||
*ptr = absl::make_unique<Winograd36To4x4>(std::move(operation));
|
||||
return absl::OkStatus();
|
||||
std::unique_ptr<GPUOperation> SelectWinograd36To4x4(
|
||||
const DeviceInfo& device_info, const OperationDef& op_def,
|
||||
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases) {
|
||||
return absl::make_unique<Winograd36To4x4>(
|
||||
CreateWinograd36To4x4(device_info, op_def, biases));
|
||||
}
|
||||
|
||||
void SelectQuantizeAndDequantize(const QuantizeAndDequantizeAttributes& attr,
|
||||
|
@ -85,15 +85,13 @@ void SelectTranspose(const TransposeAttributes& attr,
|
||||
const OperationDef& op_def,
|
||||
std::unique_ptr<GPUOperation>* ptr);
|
||||
|
||||
absl::Status SelectWinograd4x4To36(const CreationContext& creation_context,
|
||||
const Padding2D& padding,
|
||||
const OperationDef& op_def,
|
||||
std::unique_ptr<GPUOperation>* ptr);
|
||||
std::unique_ptr<GPUOperation> SelectWinograd4x4To36(
|
||||
const DeviceInfo& device_info, const Padding2D& padding,
|
||||
const OperationDef& op_def);
|
||||
|
||||
absl::Status SelectWinograd36To4x4(
|
||||
const CreationContext& creation_context, const OperationDef& op_def,
|
||||
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases,
|
||||
std::unique_ptr<GPUOperation>* ptr);
|
||||
std::unique_ptr<GPUOperation> SelectWinograd36To4x4(
|
||||
const DeviceInfo& device_info, const OperationDef& op_def,
|
||||
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases);
|
||||
|
||||
void SelectQuantizeAndDequantize(const QuantizeAndDequantizeAttributes& attr,
|
||||
const CreationContext& creation_context,
|
||||
|
Loading…
Reference in New Issue
Block a user