Common function for kernel code generation.

PiperOrigin-RevId: 272715088
This commit is contained in:
A. Unique TensorFlower 2019-10-03 12:21:35 -07:00 committed by TensorFlower Gardener
parent e9b1a42dfb
commit e6d3b90e28
3 changed files with 41 additions and 34 deletions

View File

@ -30,17 +30,17 @@ namespace tflite {
namespace gpu {
namespace cl {
namespace {
bool NeedStrideCorrection(const OperationDef& op_def, const int2& stride) {
return op_def.batch_support && stride.x != 1;
}
std::string GenerateConvCode(
const OperationDef& op_def, const int3& block_size, bool is1x1,
bool adreno4xx_optimization, const int2& stride, const CLDevice& device,
bool adreno4xx_optimization, bool stride_correction, const CLDevice& device,
const std::vector<ElementwiseOperation*>& linked_operations) {
std::string c = GetCommonDefines(op_def.precision);
TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
TensorCodeGenerator src_tensor("src_data",
{"src_size.x", "src_size.y", "src_size.z"},
op_def.src_tensors[0]);
TensorCodeGenerator dst_tensor("dst_data",
{"dst_size.x", "dst_size.y", "dst_size.z"},
op_def.dst_tensors[0]);
const bool is_image_buffer =
op_def.src_tensors[0].storage_type == TensorStorageType::IMAGE_BUFFER;
@ -97,25 +97,21 @@ std::string GenerateConvCode(
c += " int2 kernel_size, \n";
c += " int2 dilation, \n";
}
if (NeedStrideCorrection(op_def, stride)) {
c += " int BATCH_SIZE, \n";
}
c += " int2 stride, \n";
c += " int2 padding \n";
c += ") {\n";
c += " int X = get_global_id(0) * " + std::to_string(block_size.x) + ";\n";
c += " int Y = get_global_id(1) * " + std::to_string(block_size.y) + ";\n";
c += " int Z = get_global_id(2) * " + std::to_string(block_size.z) + ";\n";
c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.w) return;\n";
c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;\n";
std::vector<std::string> s_x(block_size.x);
std::vector<std::string> s_y(block_size.y);
for (int x = 0; x < block_size.x; ++x) {
if (NeedStrideCorrection(op_def, stride)) {
// TODO(sorokin) check perf and optimize with floor() if needed
c += " int p" + xs[x] + " = (X + " + xs[x] + ") / BATCH_SIZE;\n";
c += " int b" + xs[x] + " = (X + " + xs[x] + ") % BATCH_SIZE;\n";
c += " int xc" + xs[x] + " = p" + xs[x] +
" * BATCH_SIZE * stride.x + b" + xs[x] + " + padding.x;\n";
if (stride_correction) {
c += " int xc" + xs[x] + " = " +
GetXStrideCorrected("X + " + xs[x], "src_size.w", "stride.x",
"padding.x") +
";\n";
} else {
c += " int xc" + xs[x] + " = (X +" + xs[x] +
") * stride.x + padding.x;\n";
@ -196,7 +192,7 @@ std::string GenerateConvCode(
}
}
}
c += " for (int s = 0; s < src_size.w; ++s) {\n";
c += " for (int s = 0; s < src_size.z; ++s) {\n";
if (is_image_buffer) {
for (int index = 0; index < block_size.x * block_size.y; ++index) {
const std::string id = std::to_string(index);
@ -238,7 +234,7 @@ std::string GenerateConvCode(
c += " addr_" + id + " += dz_" + id + ";\n";
}
}
c += " }\n"; // src_size.w
c += " }\n"; // src_size.z
if (!is1x1) {
c += " }\n"; // kernel_size.x
c += " }\n"; // kernel_size.y
@ -247,7 +243,7 @@ std::string GenerateConvCode(
std::string dst_x = is1x1 && adreno4xx_optimization ? "xc0" : "X";
std::string dst_y = is1x1 && adreno4xx_optimization ? "yc0" : "Y";
for (int z = 0; z < block_size.z; ++z) {
c += " if (Z < dst_size.w) {\n";
c += " if (Z < dst_size.z) {\n";
c += " FLT4 bias_val = READ_IMAGE(biases, smp_none, (int2)(Z, 0));\n";
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
@ -338,9 +334,10 @@ Status ConvTexture::Compile(const CreationContext& creation_context) {
creation_context.device->IsAdreno4xx() &&
storage_type == TensorStorageType::TEXTURE_ARRAY &&
definition_.precision == CalculationsPrecision::F16;
std::string code =
GenerateConvCode(definition_, block_size_, is1x1, adreno4xx_optimization,
stride_, *creation_context.device, linked_operations_);
const bool stride_correction = definition_.batch_support && stride_.x != 1;
const std::string code = GenerateConvCode(
definition_, block_size_, is1x1, adreno4xx_optimization,
stride_correction, *creation_context.device, linked_operations_);
std::vector<CompilerOptions> options;
if (UseFP16SIMD(*creation_context.device, definition_.precision, is1x1)) {
options.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
@ -360,22 +357,13 @@ Status ConvTexture::BindArguments() {
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
const int4 src_size =
int4(src_[0]->Width() * src_[0]->Batch(), src_[0]->Height(),
src_[0]->Channels(), src_[0]->Depth());
const int4 dst_size =
int4(dst_[0]->Width() * dst_[0]->Batch(), dst_[0]->Height(),
dst_[0]->Channels(), dst_[0]->Depth());
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_size));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_size));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHDB()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHDB()));
if (!(kernel_size_.x == 1 && kernel_size_.y == 1)) {
RETURN_IF_ERROR(kernel_.SetBytesAuto(kernel_size_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(
int2(dilation_.x * src_[0]->Batch(), dilation_.y)));
}
if (NeedStrideCorrection(definition_, stride_)) {
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->Batch()));
}
RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_));
RETURN_IF_ERROR(
kernel_.SetBytesAuto(int2(padding_.x * src_[0]->Batch(), padding_.y)));

View File

@ -348,6 +348,18 @@ std::string TensorCodeGenerator::Write(
}
}
std::string GetXStrideCorrected(const std::string& src_x,
const std::string& batch_size,
const std::string& stride_x,
const std::string& padding_x) {
// TODO(sorokin) check perf and optimize with floor() if needed
// int p0 = src_x / batch_size;\n";
// int b0 = src_x % batch_size;\n";
// return p0 * stride_x * batch_size + b0 + padding_x;\n";
return absl::Substitute("((($0) / $1) * $2 * $1 + (($0) % $1) + $3)", src_x,
batch_size, stride_x, padding_x);
}
TextureAddressMode GetFastestZeroMode(const CLDevice& device) {
return device.IsAdreno3xx() ? TextureAddressMode::DONT_CARE
: TextureAddressMode::ZERO;

View File

@ -131,6 +131,13 @@ class TensorCodeGenerator {
TensorDescriptor descriptor_;
};
// Calculates correct X coordinate when stride != 1 and batch != 1 for
// DHWBC4, HDWBC4, HWBC layouts
std::string GetXStrideCorrected(const std::string& src_x,
const std::string& batch_size,
const std::string& stride_x,
const std::string& padding_x);
template <DataType S, typename T>
void RearrangeWeightsToOHWI4I4O(const ::tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst) {