ConvConstants converted to generic GPUOperation.

PiperOrigin-RevId: 328431894
Change-Id: I464d830b55ab27cdd47761c2432f731e009fda15
Raman Sarokin 2020-08-25 16:56:47 -07:00 committed by TensorFlower Gardener
parent 776e040ae0
commit 0634d08af1
4 changed files with 96 additions and 172 deletions
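In short, the dedicated ConvConstants subclass goes away: its BindArguments()/GetGridSize() overrides and cached attribute fields are replaced by plain int args plus a declarative tensor_to_grid_ mapping, its member helpers become the free functions UploadWeightsForConvConstants / RearrangeWeightsForConvConstants, and CreateConvConstants now returns a fully configured generic GPUOperation. A minimal caller-side sketch of the resulting API follows; the wrapper MakeConvIfConstant is hypothetical and added only for illustration, while the types and factory signatures are taken from the diff below (the usual conv_constants / selector includes are assumed):

// Hypothetical wrapper, for illustration only; it mirrors the updated selector code below.
std::unique_ptr<GPUOperation> MakeConvIfConstant(
    const DeviceInfo& device_info, const OperationDef& op_def,
    const Convolution2DAttributes& attr) {
  if (!IsConvConstantsSupported(device_info, op_def, attr)) return nullptr;
  // The factory now returns a plain GPUOperation by value...
  GPUOperation conv = CreateConvConstants(device_info, op_def, attr);
  // ...so callers move it into a generic GPUOperation, not a ConvConstants.
  return absl::make_unique<GPUOperation>(std::move(conv));
}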

@@ -45,84 +45,29 @@ int GetOptimalMaxConstantSize(const DeviceInfo& info) {
return GetAdrenoOptimalMaxConstantSize(info.adreno_info.gpu_version);
}
}
} // namespace
ConvConstants::ConvConstants(const OperationDef& definition,
const Convolution2DAttributes& attr,
const DeviceInfo& device_info)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
stride_(attr.strides.w, attr.strides.h),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h),
dilation_(attr.dilations.w, attr.dilations.h),
src_channels_(attr.weights.shape.i),
dst_channels_(attr.weights.shape.o) {
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
code_ =
GenerateConvolutionConstantCode(definition_, kernel_size_, src_channels_,
dst_channels_, stride_correction);
if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsAdreno3xx()) {
compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
if (definition_.precision != CalculationsPrecision::F32 &&
device_info.IsPowerVR()) {
// BUG: some PowerVRs (GE8320) produce incorrect results without it
compiler_options_.push_back(CompilerOptions::CL_OPT_DISABLE);
}
}
ConvConstants::ConvConstants(ConvConstants&& kernel)
: GPUOperation(std::move(kernel)),
kernel_size_(kernel.kernel_size_),
stride_(kernel.stride_),
padding_(kernel.padding_),
dilation_(kernel.dilation_),
src_channels_(kernel.src_channels_),
dst_channels_(kernel.dst_channels_) {}
ConvConstants& ConvConstants::operator=(ConvConstants&& kernel) {
if (this != &kernel) {
std::swap(kernel_size_, kernel.kernel_size_);
std::swap(stride_, kernel.stride_);
std::swap(padding_, kernel.padding_);
std::swap(dilation_, kernel.dilation_);
std::swap(src_channels_, kernel.src_channels_);
std::swap(dst_channels_, kernel.dst_channels_);
GPUOperation::operator=(std::move(kernel));
}
return *this;
}
std::string ConvConstants::GenerateConvolutionConstantCode(
const OperationDef& op_def, const int2& kernel_size, int src_channels,
int dst_channels, bool stride_correction) {
std::string GenerateConvolutionConstantCode(const OperationDef& op_def,
const OHWI& weights_shape,
bool stride_correction,
GPUOperation* op) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(TextureAddressMode::ZERO);
if (op_def.IsBatchSupported()) {
src_desc.SetStateVar("BatchedWidth", "true");
}
AddSrcTensor("src_tensor", src_desc);
op->AddSrcTensor("src_tensor", src_desc);
auto dst_desc = op_def.dst_tensors[0];
if (op_def.IsBatchSupported()) {
dst_desc.SetStateVar("BatchedWidth", "true");
}
AddDstTensor("dst_tensor", dst_desc);
args_.AddInt("stride_x");
args_.AddInt("stride_y");
args_.AddInt("padding_x");
args_.AddInt("padding_y");
args_.AddInt("dilation_x");
args_.AddInt("dilation_y");
op->AddDstTensor("dst_tensor", dst_desc);
std::string c = GetCommonDefines(op_def.precision);
const int out_z = DivideRoundUp(dst_channels, 4);
const int out_z = DivideRoundUp(weights_shape.o, 4);
const std::string kOutZ = std::to_string(out_z);
const int src_depth = DivideRoundUp(src_channels, 4);
const int src_depth = DivideRoundUp(weights_shape.i, 4);
const auto src_tensor_type = op_def.src_tensors[0].storage_type;
const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER ||
@@ -176,11 +121,16 @@ std::string ConvConstants::GenerateConvolutionConstantCode(
"return;\n";
if (stride_correction) {
c += " int start_x = " +
GetXStrideCorrected("X", "args.src_tensor.Batch()", "args.stride_x",
"args.padding_x") +
GetXStrideCorrectedV2("X", "args.src_tensor.Batch()", "args.stride_x",
"args.padding_x") +
";\n";
} else {
c += " int start_x = X * args.stride_x + args.padding_x;\n";
if (op_def.IsBatchSupported()) {
c += " int start_x = X * args.stride_x + args.padding_x * "
"args.src_tensor.Batch();\n";
} else {
c += " int start_x = X * args.stride_x + args.padding_x;\n";
}
}
c += " int start_y = Y * args.stride_y + args.padding_y;\n";
c += " ACCUM_FLT4 r[" + kOutZ + "];\n";
@@ -189,22 +139,25 @@ std::string ConvConstants::GenerateConvolutionConstantCode(
c += " }\n";
int filters_counter = 0;
for (int s = 0; s < src_depth; ++s) {
const int ch_count = std::min(4, src_channels - s * 4);
const int ch_count = std::min(4, weights_shape.i - s * 4);
const std::string s_conv = "CONV" + std::to_string(ch_count);
const std::string s_count = ch_count == 1 ? "" : std::to_string(ch_count);
const std::string s_type = absl::StrCat("FLT", s_count);
const std::string s_postfix = postfixes[ch_count - 1];
for (int ky = 0; ky < kernel_size.y; ++ky) {
const std::string dilation_x =
op_def.IsBatchSupported() ? "args.dilation_x * args.src_tensor.Batch()"
: "args.dilation_x";
for (int ky = 0; ky < weights_shape.h; ++ky) {
std::string s_y = absl::StrCat("(start_y + ", ky, " * args.dilation_y)");
if (manual_clamp) {
c += " {\n";
c += " bool y_out = " + s_y + " < 0 || " + s_y +
" >= args.src_tensor.Height();\n";
}
for (int kx = 0; kx < kernel_size.x; ++kx) {
for (int kx = 0; kx < weights_shape.w; ++kx) {
c += " {\n";
std::string s_x =
absl::StrCat("(start_x + ", kx, " * args.dilation_x)");
absl::StrCat("(start_x + ", kx, " * " + dilation_x + ")");
if (manual_clamp) {
c += " bool x_out = " + s_x + "< 0 || " + s_x +
">= args.src_tensor.Width();\n";
@@ -240,20 +193,7 @@ std::string ConvConstants::GenerateConvolutionConstantCode(
return c;
}
absl::Status ConvConstants::BindArguments() {
RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y));
RETURN_IF_ERROR(args_.SetInt("padding_x", padding_.x * src_[0]->Batch()));
RETURN_IF_ERROR(args_.SetInt("padding_y", padding_.y));
RETURN_IF_ERROR(args_.SetInt("dilation_x", dilation_.x * src_[0]->Batch()));
return args_.SetInt("dilation_y", dilation_.y);
}
int3 ConvConstants::GetGridSize() const {
const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
const int grid_y = dst_[0]->Height();
return int3(grid_x, grid_y, 1);
}
} // namespace
bool IsConvConstantsSupported(const DeviceInfo& device_info,
const OperationDef& definition,
@@ -277,20 +217,41 @@ bool IsConvConstantsSupported(const DeviceInfo& device_info,
return filters_buffer_size <= kConstantMaxSize && flt4_registers <= 8;
}
ConvConstants CreateConvConstants(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr) {
ConvConstants result(definition, attr, device_info);
result.UploadWeights(attr.weights);
GPUOperation CreateConvConstants(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr) {
GPUOperation op(definition);
UploadWeightsForConvConstants(attr.weights, definition.precision, &op);
op.args_.AddInt("stride_x", attr.strides.w);
op.args_.AddInt("stride_y", attr.strides.h);
op.args_.AddInt("padding_x", -attr.padding.prepended.w);
op.args_.AddInt("padding_y", -attr.padding.prepended.h);
op.args_.AddInt("dilation_x", attr.dilations.w);
op.args_.AddInt("dilation_y", attr.dilations.h);
op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_ZIs1;
const bool stride_correction =
definition.IsBatchSupported() && attr.strides.w != 1;
op.code_ = GenerateConvolutionConstantCode(definition, attr.weights.shape,
stride_correction, &op);
if (definition.precision == CalculationsPrecision::F16 &&
device_info.IsAdreno3xx()) {
op.compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
if (definition.precision != CalculationsPrecision::F32 &&
device_info.IsPowerVR()) {
// BUG: some PowerVRs (GE8320) produce incorrect results without it
op.compiler_options_.push_back(CompilerOptions::CL_OPT_DISABLE);
}
TensorLinearDescriptor desc;
desc.storage_type = LinearStorageType::BUFFER;
desc.element_type = definition.GetDataType();
desc.memory_type = MemoryType::CONSTANT;
desc.UploadLinearData(attr.bias);
result.args_.AddObject(
op.args_.AddObject(
"biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
return result;
return op;
}
} // namespace cl

@@ -32,78 +32,8 @@ namespace tflite {
namespace gpu {
namespace cl {
class ConvConstants : public GPUOperation {
public:
ConvConstants() = default;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
// Move only
ConvConstants(ConvConstants&& kernel);
ConvConstants& operator=(ConvConstants&& kernel);
ConvConstants(const ConvConstants&) = delete;
ConvConstants& operator=(const ConvConstants&) = delete;
private:
friend ConvConstants CreateConvConstants(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr);
ConvConstants(const OperationDef& definition,
const Convolution2DAttributes& attr,
const DeviceInfo& device_info);
template <DataType T>
void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
template <DataType S, typename T>
void RearrangeWeightsData(const tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst);
std::string GenerateConvolutionConstantCode(const OperationDef& op_def,
const int2& kernel_size,
int src_channels,
int dst_channels,
bool stride_correction);
int2 kernel_size_;
int2 stride_;
int2 padding_;
int2 dilation_;
int src_channels_;
int dst_channels_;
};
template <DataType T>
void ConvConstants::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
const int float_size = f32_weights ? 4 : 2;
const int float_count = src_channels_ * dst_depth * 4 * kernel_x * kernel_y;
BufferDescriptor desc;
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.element_size = 4;
desc.memory_type = MemoryType::CONSTANT;
desc.size = float_size * float_count;
desc.data.resize(desc.size);
if (f32_weights) {
float4* ptr = reinterpret_cast<float4*>(desc.data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, float_count / 4));
} else {
half4* ptr = reinterpret_cast<half4*>(desc.data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, float_count / 4));
}
args_.AddObject("weigths",
absl::make_unique<BufferDescriptor>(std::move(desc)));
}
template <DataType S, typename T>
void ConvConstants::RearrangeWeightsData(
void RearrangeWeightsForConvConstants(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
@@ -115,7 +45,7 @@ void ConvConstants::RearrangeWeightsData(
for (int y = 0; y < kernel_y; ++y) {
for (int x = 0; x < kernel_x; ++x) {
for (int d = 0; d < dst_depth; ++d) {
const int channels_count = std::min(4, src_channels_ - s * 4);
const int channels_count = std::min(4, weights.shape.i - s * 4);
T filters[4];
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < channels_count; ++j) {
@@ -145,13 +75,46 @@ void ConvConstants::RearrangeWeightsData(
}
}
template <DataType T>
void UploadWeightsForConvConstants(const tflite::gpu::Tensor<OHWI, T>& weights,
CalculationsPrecision precision,
GPUOperation* op) {
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
const bool f32_weights = precision == CalculationsPrecision::F32;
const int float_size = f32_weights ? 4 : 2;
const int float_count = weights.shape.i * dst_depth * 4 * kernel_x * kernel_y;
BufferDescriptor desc;
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.element_size = 4;
desc.memory_type = MemoryType::CONSTANT;
desc.size = float_size * float_count;
desc.data.resize(desc.size);
if (f32_weights) {
float4* ptr = reinterpret_cast<float4*>(desc.data.data());
RearrangeWeightsForConvConstants(weights,
absl::MakeSpan(ptr, float_count / 4));
} else {
half4* ptr = reinterpret_cast<half4*>(desc.data.data());
RearrangeWeightsForConvConstants(weights,
absl::MakeSpan(ptr, float_count / 4));
}
op->args_.AddObject("weigths",
absl::make_unique<BufferDescriptor>(std::move(desc)));
}
bool IsConvConstantsSupported(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr);
ConvConstants CreateConvConstants(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr);
GPUOperation CreateConvConstants(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr);
} // namespace cl
} // namespace gpu

@@ -55,7 +55,7 @@ TEST_F(OpenCLOperationTest, ConvConstantsSimpleWeights) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvConstants operation =
GPUOperation operation =
CreateConvConstants(creation_context_.GetDeviceInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 1), &dst_tensor));
@@ -90,7 +90,7 @@ TEST_F(OpenCLOperationTest, ConvConstants) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvConstants operation =
GPUOperation operation =
CreateConvConstants(creation_context_.GetDeviceInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));

@@ -35,8 +35,8 @@ std::unique_ptr<GPUOperation> SelectConvolutionAdreno(
const DeviceInfo& device_info, const OperationDef& op_def,
ModelHints hints) {
if (IsConvConstantsSupported(device_info, op_def, attr)) {
ConvConstants conv = CreateConvConstants(device_info, op_def, attr);
return absl::make_unique<ConvConstants>(std::move(conv));
GPUOperation conv = CreateConvConstants(device_info, op_def, attr);
return absl::make_unique<GPUOperation>(std::move(conv));
} else {
ConvTexture conv = CreateConvTexture(device_info, op_def, attr);
return absl::make_unique<ConvTexture>(std::move(conv));
@@ -66,8 +66,8 @@ std::unique_ptr<GPUOperation> SelectConvolutionNVidia(
const Convolution2DAttributes& attr, const BHWC& dst_shape,
const DeviceInfo& device_info, const OperationDef& op_def) {
if (IsConvConstantsSupported(device_info, op_def, attr)) {
ConvConstants conv = CreateConvConstants(device_info, op_def, attr);
return absl::make_unique<ConvConstants>(std::move(conv));
GPUOperation conv = CreateConvConstants(device_info, op_def, attr);
return absl::make_unique<GPUOperation>(std::move(conv));
} else {
ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
return absl::make_unique<ConvPowerVR>(std::move(conv));