Improved performance of ConvolutionTransposed4x4 for NVidia.
PiperOrigin-RevId: 287952384 Change-Id: I253f0c75e41404325c3a5a15d84634d0b7dcc806
This commit is contained in:
parent
651b04bc52
commit
1f0db2b01c
@ -448,6 +448,7 @@ cc_library(
|
|||||||
deps = [
|
deps = [
|
||||||
":gpu_operation",
|
":gpu_operation",
|
||||||
":util",
|
":util",
|
||||||
|
":work_group_picking",
|
||||||
"//tensorflow/lite/delegates/gpu/cl:buffer",
|
"//tensorflow/lite/delegates/gpu/cl:buffer",
|
||||||
"//tensorflow/lite/delegates/gpu/cl:linear_storage",
|
"//tensorflow/lite/delegates/gpu/cl:linear_storage",
|
||||||
"//tensorflow/lite/delegates/gpu/cl:precision",
|
"//tensorflow/lite/delegates/gpu/cl:precision",
|
||||||
|
@ -20,6 +20,7 @@ limitations under the License.
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
|
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
|
||||||
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
|
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
|
||||||
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
|
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
|
||||||
|
|
||||||
@ -30,7 +31,8 @@ namespace {
|
|||||||
|
|
||||||
std::string GenerateConvolutionTransposedCode(
|
std::string GenerateConvolutionTransposedCode(
|
||||||
const OperationDef& op_def,
|
const OperationDef& op_def,
|
||||||
const std::vector<ElementwiseOperation*>& linked_operations) {
|
const std::vector<ElementwiseOperation*>& linked_operations,
|
||||||
|
ConvolutionTransposed4x4::WeightsUploadType weights_upload_type) {
|
||||||
std::string c = GetCommonDefines(op_def.precision);
|
std::string c = GetCommonDefines(op_def.precision);
|
||||||
|
|
||||||
TensorCodeGenerator src_tensor("src_data",
|
TensorCodeGenerator src_tensor("src_data",
|
||||||
@ -44,6 +46,12 @@ std::string GenerateConvolutionTransposedCode(
|
|||||||
const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER ||
|
const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER ||
|
||||||
src_tensor_type == TensorStorageType::IMAGE_BUFFER;
|
src_tensor_type == TensorStorageType::IMAGE_BUFFER;
|
||||||
|
|
||||||
|
const bool need_local_mem =
|
||||||
|
weights_upload_type ==
|
||||||
|
ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_BY_THREADS ||
|
||||||
|
weights_upload_type ==
|
||||||
|
ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_ASYNC;
|
||||||
|
|
||||||
switch (op_def.precision) {
|
switch (op_def.precision) {
|
||||||
case CalculationsPrecision::F32:
|
case CalculationsPrecision::F32:
|
||||||
case CalculationsPrecision::F16:
|
case CalculationsPrecision::F16:
|
||||||
@ -62,7 +70,9 @@ std::string GenerateConvolutionTransposedCode(
|
|||||||
}
|
}
|
||||||
|
|
||||||
const std::string pixel_stride = op_def.batch_support ? "dst_size.w" : "1";
|
const std::string pixel_stride = op_def.batch_support ? "dst_size.w" : "1";
|
||||||
c += "__attribute__((reqd_work_group_size(8, 4, 1)))\n";
|
if (need_local_mem) { // we use fixed workgroup size when use local mem
|
||||||
|
c += "__attribute__((reqd_work_group_size(8, 4, 1)))\n";
|
||||||
|
}
|
||||||
c += "__kernel void main_function(\n";
|
c += "__kernel void main_function(\n";
|
||||||
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
|
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
|
||||||
c += " __global FLT4* filters,\n";
|
c += " __global FLT4* filters,\n";
|
||||||
@ -81,12 +91,28 @@ std::string GenerateConvolutionTransposedCode(
|
|||||||
c += " int X = get_global_id(0);\n";
|
c += " int X = get_global_id(0);\n";
|
||||||
c += " int Y = get_global_id(1);\n";
|
c += " int Y = get_global_id(1);\n";
|
||||||
c += " int Z = get_global_id(2);\n";
|
c += " int Z = get_global_id(2);\n";
|
||||||
|
if (!need_local_mem) {
|
||||||
|
if (op_def.batch_support) {
|
||||||
|
c += " if (X0 * 2 * dst_size.w > dst_size.x || Y * 2 > dst_size.y || Z "
|
||||||
|
">= "
|
||||||
|
"dst_size.z) return;\n";
|
||||||
|
} else {
|
||||||
|
c += " if (X * 2 > dst_size.x || Y * 2 > dst_size.y || Z >= dst_size.z) "
|
||||||
|
"return;\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
c += " ACCUM_FLT4 r0 = (ACCUM_FLT4)(0.0f);\n";
|
c += " ACCUM_FLT4 r0 = (ACCUM_FLT4)(0.0f);\n";
|
||||||
c += " ACCUM_FLT4 r1 = (ACCUM_FLT4)(0.0f);\n";
|
c += " ACCUM_FLT4 r1 = (ACCUM_FLT4)(0.0f);\n";
|
||||||
c += " ACCUM_FLT4 r2 = (ACCUM_FLT4)(0.0f);\n";
|
c += " ACCUM_FLT4 r2 = (ACCUM_FLT4)(0.0f);\n";
|
||||||
c += " ACCUM_FLT4 r3 = (ACCUM_FLT4)(0.0f);\n";
|
c += " ACCUM_FLT4 r3 = (ACCUM_FLT4)(0.0f);\n";
|
||||||
c += " int f_offset = Z * filter_offset;\n";
|
c += " int f_offset = Z * filter_offset;\n";
|
||||||
c += " __local FLT4 weights_cache[64];\n";
|
if (need_local_mem) {
|
||||||
|
c += " __local FLT4 weights_cache[64];\n";
|
||||||
|
}
|
||||||
|
if (weights_upload_type ==
|
||||||
|
ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
|
||||||
|
c += " int local_id = (int)(get_local_id(1) * 8 + get_local_id(0));\n";
|
||||||
|
}
|
||||||
if (manual_clamp) {
|
if (manual_clamp) {
|
||||||
const std::string prev_x = "X - " + pixel_stride;
|
const std::string prev_x = "X - " + pixel_stride;
|
||||||
c += " bool in_x0 = " + prev_x + " >= 0 && " + prev_x + " < src_size.x;\n";
|
c += " bool in_x0 = " + prev_x + " >= 0 && " + prev_x + " < src_size.x;\n";
|
||||||
@ -140,14 +166,30 @@ std::string GenerateConvolutionTransposedCode(
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
c += " for (int s = 0; s < src_size.z; ++s) {\n";
|
c += " for (int s = 0; s < src_size.z; ++s) {\n";
|
||||||
c += " barrier(CLK_LOCAL_MEM_FENCE);\n";
|
if (need_local_mem) {
|
||||||
c += " async_work_group_copy(weights_cache, filters + f_offset, 64, 0);\n";
|
c += " barrier(CLK_LOCAL_MEM_FENCE);\n";
|
||||||
|
}
|
||||||
|
if (weights_upload_type ==
|
||||||
|
ConvolutionTransposed4x4::WeightsUploadType::LOCAL_MEM_ASYNC) {
|
||||||
|
c += " async_work_group_copy(weights_cache, filters + f_offset, 64, "
|
||||||
|
"0);\n";
|
||||||
|
} else if (weights_upload_type ==
|
||||||
|
ConvolutionTransposed4x4::WeightsUploadType::
|
||||||
|
LOCAL_MEM_BY_THREADS) {
|
||||||
|
c += " weights_cache[local_id] = filters[f_offset + local_id];\n";
|
||||||
|
c += " weights_cache[local_id + 32] = filters[f_offset + local_id + "
|
||||||
|
"32];\n";
|
||||||
|
} else { // GLOBAL_MEM
|
||||||
|
c += " __global FLT4* weights_cache = filters + f_offset;\n";
|
||||||
|
}
|
||||||
c += " FLT4 src0 = " + read_src(0, 0) + ";\n";
|
c += " FLT4 src0 = " + read_src(0, 0) + ";\n";
|
||||||
c += " FLT4 src1 = " + read_src(1, 0) + ";\n";
|
c += " FLT4 src1 = " + read_src(1, 0) + ";\n";
|
||||||
c += " FLT4 src2 = " + read_src(0, 1) + ";\n";
|
c += " FLT4 src2 = " + read_src(0, 1) + ";\n";
|
||||||
c += " FLT4 src3 = " + read_src(1, 1) + ";\n";
|
c += " FLT4 src3 = " + read_src(1, 1) + ";\n";
|
||||||
c += " f_offset += 64;\n";
|
c += " f_offset += 64;\n";
|
||||||
c += " barrier(CLK_LOCAL_MEM_FENCE);\n";
|
if (need_local_mem) {
|
||||||
|
c += " barrier(CLK_LOCAL_MEM_FENCE);\n";
|
||||||
|
}
|
||||||
c += " CONV(r0, src0, 0);\n";
|
c += " CONV(r0, src0, 0);\n";
|
||||||
c += " CONV(r1, src0, 4);\n";
|
c += " CONV(r1, src0, 4);\n";
|
||||||
c += " CONV(r2, src0, 8);\n";
|
c += " CONV(r2, src0, 8);\n";
|
||||||
@ -166,12 +208,15 @@ std::string GenerateConvolutionTransposedCode(
|
|||||||
c += " CONV(r3, src3, 60);\n";
|
c += " CONV(r3, src3, 60);\n";
|
||||||
c += " }\n";
|
c += " }\n";
|
||||||
c += "\n";
|
c += "\n";
|
||||||
if (op_def.batch_support) {
|
if (need_local_mem) {
|
||||||
c += " if (X0 * 2 * dst_size.w > dst_size.x || Y * 2 > dst_size.y || Z >= "
|
if (op_def.batch_support) {
|
||||||
"dst_size.z) return;\n";
|
c += " if (X0 * 2 * dst_size.w > dst_size.x || Y * 2 > dst_size.y || Z "
|
||||||
} else {
|
">= "
|
||||||
c += " if (X * 2 > dst_size.x || Y * 2 > dst_size.y || Z >= dst_size.z) "
|
"dst_size.z) return;\n";
|
||||||
"return;\n";
|
} else {
|
||||||
|
c += " if (X * 2 > dst_size.x || Y * 2 > dst_size.y || Z >= dst_size.z) "
|
||||||
|
"return;\n";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (op_def.batch_support) {
|
if (op_def.batch_support) {
|
||||||
c += " X = X0 * 2 * dst_size.w + B - dst_size.w;\n";
|
c += " X = X0 * 2 * dst_size.w + B - dst_size.w;\n";
|
||||||
@ -214,13 +259,22 @@ std::string GenerateConvolutionTransposedCode(
|
|||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
ConvolutionTransposed4x4::ConvolutionTransposed4x4(
|
ConvolutionTransposed4x4::ConvolutionTransposed4x4(
|
||||||
const OperationDef& definition)
|
const OperationDef& definition, const CLDevice& device)
|
||||||
: GPUOperation(definition) {}
|
: GPUOperation(definition) {
|
||||||
|
if (device.IsPowerVR()) {
|
||||||
|
weights_upload_type_ = WeightsUploadType::LOCAL_MEM_ASYNC;
|
||||||
|
} else if (device.IsNvidia()) {
|
||||||
|
weights_upload_type_ = WeightsUploadType::LOCAL_MEM_BY_THREADS;
|
||||||
|
} else {
|
||||||
|
weights_upload_type_ = WeightsUploadType::GLOBAL_MEM;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
ConvolutionTransposed4x4::ConvolutionTransposed4x4(
|
ConvolutionTransposed4x4::ConvolutionTransposed4x4(
|
||||||
ConvolutionTransposed4x4&& operation)
|
ConvolutionTransposed4x4&& operation)
|
||||||
: GPUOperation(std::move(operation)),
|
: GPUOperation(std::move(operation)),
|
||||||
weights_(std::move(operation.weights_)),
|
weights_(std::move(operation.weights_)),
|
||||||
|
weights_upload_type_(operation.weights_upload_type_),
|
||||||
biases_(std::move(operation.biases_)),
|
biases_(std::move(operation.biases_)),
|
||||||
kernel_(std::move(operation.kernel_)),
|
kernel_(std::move(operation.kernel_)),
|
||||||
work_group_size_(operation.work_group_size_) {}
|
work_group_size_(operation.work_group_size_) {}
|
||||||
@ -229,6 +283,7 @@ ConvolutionTransposed4x4& ConvolutionTransposed4x4::operator=(
|
|||||||
ConvolutionTransposed4x4&& operation) {
|
ConvolutionTransposed4x4&& operation) {
|
||||||
if (this != &operation) {
|
if (this != &operation) {
|
||||||
weights_ = std::move(operation.weights_);
|
weights_ = std::move(operation.weights_);
|
||||||
|
std::swap(weights_upload_type_, operation.weights_upload_type_);
|
||||||
biases_ = std::move(operation.biases_);
|
biases_ = std::move(operation.biases_);
|
||||||
kernel_ = std::move(operation.kernel_);
|
kernel_ = std::move(operation.kernel_);
|
||||||
std::swap(work_group_size_, operation.work_group_size_);
|
std::swap(work_group_size_, operation.work_group_size_);
|
||||||
@ -239,8 +294,8 @@ ConvolutionTransposed4x4& ConvolutionTransposed4x4::operator=(
|
|||||||
|
|
||||||
Status ConvolutionTransposed4x4::Compile(
|
Status ConvolutionTransposed4x4::Compile(
|
||||||
const CreationContext& creation_context) {
|
const CreationContext& creation_context) {
|
||||||
const auto code =
|
const auto code = GenerateConvolutionTransposedCode(
|
||||||
GenerateConvolutionTransposedCode(definition_, linked_operations_);
|
definition_, linked_operations_, weights_upload_type_);
|
||||||
|
|
||||||
std::vector<CompilerOptions> options;
|
std::vector<CompilerOptions> options;
|
||||||
if (definition_.precision == CalculationsPrecision::F16 &&
|
if (definition_.precision == CalculationsPrecision::F16 &&
|
||||||
@ -277,6 +332,16 @@ int3 ConvolutionTransposed4x4::GetGridSize() const {
|
|||||||
return int3(grid_x, grid_y, grid_z);
|
return int3(grid_x, grid_y, grid_z);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Status ConvolutionTransposed4x4::Tune(const TuningParameters& params) {
|
||||||
|
if (weights_upload_type_ == WeightsUploadType::LOCAL_MEM_ASYNC ||
|
||||||
|
weights_upload_type_ == WeightsUploadType::LOCAL_MEM_BY_THREADS) {
|
||||||
|
return OkStatus();
|
||||||
|
}
|
||||||
|
RETURN_IF_ERROR(BindArguments());
|
||||||
|
return GetBestWorkGroupConv(params, kernel_, GetGridSize(),
|
||||||
|
&work_group_size_);
|
||||||
|
}
|
||||||
|
|
||||||
Status ConvolutionTransposed4x4::AddToQueue(CLCommandQueue* queue) {
|
Status ConvolutionTransposed4x4::AddToQueue(CLCommandQueue* queue) {
|
||||||
RETURN_IF_ERROR(BindArguments());
|
RETURN_IF_ERROR(BindArguments());
|
||||||
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
|
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
|
||||||
@ -299,7 +364,7 @@ Status CreateConvolutionTransposed4x4(
|
|||||||
return InvalidArgumentError(
|
return InvalidArgumentError(
|
||||||
"ConvolutionTransposed4x4 doesn't support this attributes");
|
"ConvolutionTransposed4x4 doesn't support this attributes");
|
||||||
}
|
}
|
||||||
*result = ConvolutionTransposed4x4(definition);
|
*result = ConvolutionTransposed4x4(definition, *creation_context.device);
|
||||||
RETURN_IF_ERROR(
|
RETURN_IF_ERROR(
|
||||||
result->UploadWeights(attr.weights, creation_context.context));
|
result->UploadWeights(attr.weights, creation_context.context));
|
||||||
LinearStorageCreateInfo create_info;
|
LinearStorageCreateInfo create_info;
|
||||||
|
@ -38,6 +38,7 @@ class ConvolutionTransposed4x4 : public GPUOperation {
|
|||||||
public:
|
public:
|
||||||
ConvolutionTransposed4x4() = default;
|
ConvolutionTransposed4x4() = default;
|
||||||
Status AddToQueue(CLCommandQueue* queue) override;
|
Status AddToQueue(CLCommandQueue* queue) override;
|
||||||
|
Status Tune(const TuningParameters& params) override;
|
||||||
Status Compile(const CreationContext& creation_context) override;
|
Status Compile(const CreationContext& creation_context) override;
|
||||||
|
|
||||||
// Move only
|
// Move only
|
||||||
@ -46,8 +47,15 @@ class ConvolutionTransposed4x4 : public GPUOperation {
|
|||||||
ConvolutionTransposed4x4(const ConvolutionTransposed4x4&) = delete;
|
ConvolutionTransposed4x4(const ConvolutionTransposed4x4&) = delete;
|
||||||
ConvolutionTransposed4x4& operator=(const ConvolutionTransposed4x4&) = delete;
|
ConvolutionTransposed4x4& operator=(const ConvolutionTransposed4x4&) = delete;
|
||||||
|
|
||||||
|
enum class WeightsUploadType {
|
||||||
|
LOCAL_MEM_ASYNC,
|
||||||
|
LOCAL_MEM_BY_THREADS,
|
||||||
|
GLOBAL_MEM,
|
||||||
|
};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
explicit ConvolutionTransposed4x4(const OperationDef& definition);
|
ConvolutionTransposed4x4(const OperationDef& definition,
|
||||||
|
const CLDevice& device);
|
||||||
friend Status CreateConvolutionTransposed4x4(
|
friend Status CreateConvolutionTransposed4x4(
|
||||||
const CreationContext& creation_context, const OperationDef& definition,
|
const CreationContext& creation_context, const OperationDef& definition,
|
||||||
const ConvolutionTransposedAttributes& attr,
|
const ConvolutionTransposedAttributes& attr,
|
||||||
@ -64,6 +72,7 @@ class ConvolutionTransposed4x4 : public GPUOperation {
|
|||||||
int3 GetGridSize() const;
|
int3 GetGridSize() const;
|
||||||
|
|
||||||
Buffer weights_;
|
Buffer weights_;
|
||||||
|
WeightsUploadType weights_upload_type_;
|
||||||
LinearStorage biases_;
|
LinearStorage biases_;
|
||||||
|
|
||||||
CLKernel kernel_;
|
CLKernel kernel_;
|
||||||
|
Loading…
Reference in New Issue
Block a user