Improved performance of ConvolutionTransposed4x4 for NVidia.

PiperOrigin-RevId: 287952384
Change-Id: I253f0c75e41404325c3a5a15d84634d0b7dcc806
This commit is contained in:
A. Unique TensorFlower 2020-01-03 00:10:19 -08:00 committed by TensorFlower Gardener
parent 651b04bc52
commit 1f0db2b01c
3 changed files with 93 additions and 18 deletions

View File

@ -448,6 +448,7 @@ cc_library(
deps = [
":gpu_operation",
":util",
":work_group_picking",
"//tensorflow/lite/delegates/gpu/cl:buffer",
"//tensorflow/lite/delegates/gpu/cl:linear_storage",
"//tensorflow/lite/delegates/gpu/cl:precision",

View File

@ -20,6 +20,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
@ -30,7 +31,8 @@ namespace {
std::string GenerateConvolutionTransposedCode(
const OperationDef& op_def,
const std::vector<ElementwiseOperation*>& linked_operations) {
const std::vector<ElementwiseOperation*>& linked_operations,
ConvolutionTransposed4x4::WeightsUploadType weights_upload_type) {
std::string c = GetCommonDefines(op_def.precision);
TensorCodeGenerator src_tensor("src_data",
@ -44,6 +46,12 @@ std::string GenerateConvolutionTransposedCode(
const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER ||
src_tensor_type == TensorStorageType::IMAGE_BUFFER;
const bool need_local_mem =
weights_upload_type ==
ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_BY_THREADS ||
weights_upload_type ==
ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_ASYNC;
switch (op_def.precision) {
case CalculationsPrecision::F32:
case CalculationsPrecision::F16:
@ -62,7 +70,9 @@ std::string GenerateConvolutionTransposedCode(
}
const std::string pixel_stride = op_def.batch_support ? "dst_size.w" : "1";
c += "__attribute__((reqd_work_group_size(8, 4, 1)))\n";
if (need_local_mem) { // we use fixed workgroup size when use local mem
c += "__attribute__((reqd_work_group_size(8, 4, 1)))\n";
}
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
c += " __global FLT4* filters,\n";
@ -81,12 +91,28 @@ std::string GenerateConvolutionTransposedCode(
c += " int X = get_global_id(0);\n";
c += " int Y = get_global_id(1);\n";
c += " int Z = get_global_id(2);\n";
if (!need_local_mem) {
if (op_def.batch_support) {
c += " if (X0 * 2 * dst_size.w > dst_size.x || Y * 2 > dst_size.y || Z "
">= "
"dst_size.z) return;\n";
} else {
c += " if (X * 2 > dst_size.x || Y * 2 > dst_size.y || Z >= dst_size.z) "
"return;\n";
}
}
c += " ACCUM_FLT4 r0 = (ACCUM_FLT4)(0.0f);\n";
c += " ACCUM_FLT4 r1 = (ACCUM_FLT4)(0.0f);\n";
c += " ACCUM_FLT4 r2 = (ACCUM_FLT4)(0.0f);\n";
c += " ACCUM_FLT4 r3 = (ACCUM_FLT4)(0.0f);\n";
c += " int f_offset = Z * filter_offset;\n";
c += " __local FLT4 weights_cache[64];\n";
if (need_local_mem) {
c += " __local FLT4 weights_cache[64];\n";
}
if (weights_upload_type ==
ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
c += " int local_id = (int)(get_local_id(1) * 8 + get_local_id(0));\n";
}
if (manual_clamp) {
const std::string prev_x = "X - " + pixel_stride;
c += " bool in_x0 = " + prev_x + " >= 0 && " + prev_x + " < src_size.x;\n";
@ -140,14 +166,30 @@ std::string GenerateConvolutionTransposedCode(
}
};
c += " for (int s = 0; s < src_size.z; ++s) {\n";
c += " barrier(CLK_LOCAL_MEM_FENCE);\n";
c += " async_work_group_copy(weights_cache, filters + f_offset, 64, 0);\n";
if (need_local_mem) {
c += " barrier(CLK_LOCAL_MEM_FENCE);\n";
}
if (weights_upload_type ==
ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_ASYNC) {
c += " async_work_group_copy(weights_cache, filters + f_offset, 64, "
"0);\n";
} else if (weights_upload_type ==
ConvolutionTransposed4x4::WeightsUploadType::
LOCAL_MEM_BY_THREADS) {
c += " weights_cache[local_id] = filters[f_offset + local_id];\n";
c += " weights_cache[local_id + 32] = filters[f_offset + local_id + "
"32];\n";
} else { // GLOBAL_MEM
c += " __global FLT4* weights_cache = filters + f_offset;\n";
}
c += " FLT4 src0 = " + read_src(0, 0) + ";\n";
c += " FLT4 src1 = " + read_src(1, 0) + ";\n";
c += " FLT4 src2 = " + read_src(0, 1) + ";\n";
c += " FLT4 src3 = " + read_src(1, 1) + ";\n";
c += " f_offset += 64;\n";
c += " barrier(CLK_LOCAL_MEM_FENCE);\n";
if (need_local_mem) {
c += " barrier(CLK_LOCAL_MEM_FENCE);\n";
}
c += " CONV(r0, src0, 0);\n";
c += " CONV(r1, src0, 4);\n";
c += " CONV(r2, src0, 8);\n";
@ -166,12 +208,15 @@ std::string GenerateConvolutionTransposedCode(
c += " CONV(r3, src3, 60);\n";
c += " }\n";
c += "\n";
if (op_def.batch_support) {
c += " if (X0 * 2 * dst_size.w > dst_size.x || Y * 2 > dst_size.y || Z >= "
"dst_size.z) return;\n";
} else {
c += " if (X * 2 > dst_size.x || Y * 2 > dst_size.y || Z >= dst_size.z) "
"return;\n";
if (need_local_mem) {
if (op_def.batch_support) {
c += " if (X0 * 2 * dst_size.w > dst_size.x || Y * 2 > dst_size.y || Z "
">= "
"dst_size.z) return;\n";
} else {
c += " if (X * 2 > dst_size.x || Y * 2 > dst_size.y || Z >= dst_size.z) "
"return;\n";
}
}
if (op_def.batch_support) {
c += " X = X0 * 2 * dst_size.w + B - dst_size.w;\n";
@ -214,13 +259,22 @@ std::string GenerateConvolutionTransposedCode(
} // namespace
ConvolutionTransposed4x4::ConvolutionTransposed4x4(
const OperationDef& definition)
: GPUOperation(definition) {}
const OperationDef& definition, const CLDevice& device)
: GPUOperation(definition) {
if (device.IsPowerVR()) {
weights_upload_type_ = WeightsUploadType::LOCAL_MEM_ASYNC;
} else if (device.IsNvidia()) {
weights_upload_type_ = WeightsUploadType::LOCAL_MEM_BY_THREADS;
} else {
weights_upload_type_ = WeightsUploadType::GLOBAL_MEM;
}
}
ConvolutionTransposed4x4::ConvolutionTransposed4x4(
ConvolutionTransposed4x4&& operation)
: GPUOperation(std::move(operation)),
weights_(std::move(operation.weights_)),
weights_upload_type_(operation.weights_upload_type_),
biases_(std::move(operation.biases_)),
kernel_(std::move(operation.kernel_)),
work_group_size_(operation.work_group_size_) {}
@ -229,6 +283,7 @@ ConvolutionTransposed4x4& ConvolutionTransposed4x4::operator=(
ConvolutionTransposed4x4&& operation) {
if (this != &operation) {
weights_ = std::move(operation.weights_);
std::swap(weights_upload_type_, operation.weights_upload_type_);
biases_ = std::move(operation.biases_);
kernel_ = std::move(operation.kernel_);
std::swap(work_group_size_, operation.work_group_size_);
@ -239,8 +294,8 @@ ConvolutionTransposed4x4& ConvolutionTransposed4x4::operator=(
Status ConvolutionTransposed4x4::Compile(
const CreationContext& creation_context) {
const auto code =
GenerateConvolutionTransposedCode(definition_, linked_operations_);
const auto code = GenerateConvolutionTransposedCode(
definition_, linked_operations_, weights_upload_type_);
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
@ -277,6 +332,16 @@ int3 ConvolutionTransposed4x4::GetGridSize() const {
return int3(grid_x, grid_y, grid_z);
}
Status ConvolutionTransposed4x4::Tune(const TuningParameters& params) {
if (weights_upload_type_ == WeightsUploadType::LOCAL_MEM_ASYNC ||
weights_upload_type_ == WeightsUploadType::LOCAL_MEM_BY_THREADS) {
return OkStatus();
}
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroupConv(params, kernel_, GetGridSize(),
&work_group_size_);
}
Status ConvolutionTransposed4x4::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
@ -299,7 +364,7 @@ Status CreateConvolutionTransposed4x4(
return InvalidArgumentError(
"ConvolutionTransposed4x4 doesn't support this attributes");
}
*result = ConvolutionTransposed4x4(definition);
*result = ConvolutionTransposed4x4(definition, *creation_context.device);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
LinearStorageCreateInfo create_info;

View File

@ -38,6 +38,7 @@ class ConvolutionTransposed4x4 : public GPUOperation {
public:
ConvolutionTransposed4x4() = default;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
@ -46,8 +47,15 @@ class ConvolutionTransposed4x4 : public GPUOperation {
ConvolutionTransposed4x4(const ConvolutionTransposed4x4&) = delete;
ConvolutionTransposed4x4& operator=(const ConvolutionTransposed4x4&) = delete;
enum class WeightsUploadType {
LOCAL_MEM_ASYNC,
LOCAL_MEM_BY_THREADS,
GLOBAL_MEM,
};
private:
explicit ConvolutionTransposed4x4(const OperationDef& definition);
ConvolutionTransposed4x4(const OperationDef& definition,
const CLDevice& device);
friend Status CreateConvolutionTransposed4x4(
const CreationContext& creation_context, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
@ -64,6 +72,7 @@ class ConvolutionTransposed4x4 : public GPUOperation {
int3 GetGridSize() const;
Buffer weights_;
WeightsUploadType weights_upload_type_;
LinearStorage biases_;
CLKernel kernel_;