Simplified signatures of CreateKernel... methods.
PiperOrigin-RevId: 269652728
Parent:  a15068b1a6
Commit:  e72f5dd506
Changed files in tensorflow/lite/delegates/gpu/cl/kernels:
  conv_buffer.cc
  conv_buffer_1x1.cc
  conv_constants.cc
  conv_powervr.cc
  conv_powervr.h
  conv_texture.cc
  convolution_transposed.cc
  convolution_transposed_3x3_thin.cc
  convolution_transposed_thin.cc
  depth_wise_conv.cc
  depth_wise_conv_3x3.cc
  fully_connected_texture.cc
  max_unpooling.cc
  padding.cc
  reshape.cc
  reshapex4.cc
  softmax.cc
  softmax1x1.cc
  strided_slice.cc
  upsample.cc
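Every hunk below makes the same change: each per-kernel code generator used to take a source TensorDescriptor, a destination TensorDescriptor, and a CalculationsPrecision as three separate parameters, and now takes the single OperationDef that already carries all three. A minimal sketch of the pattern follows; the struct bodies are illustrative stand-ins, not the actual TFLite GPU definitions, and only the three fields the diff relies on are shown:

    #include <string>
    #include <vector>

    // Stand-in types; the real ones live under tensorflow/lite/delegates/gpu.
    enum class CalculationsPrecision { F32, F32_F16, F16 };
    struct TensorDescriptor { /* storage type, data layout, ... */ };

    // OperationDef bundles what used to travel as three loose parameters.
    struct OperationDef {
      CalculationsPrecision precision;
      std::vector<TensorDescriptor> src_tensors;
      std::vector<TensorDescriptor> dst_tensors;
    };

    // Old signature shape: pieces of one definition passed separately.
    std::string GenerateKernelCode(const TensorDescriptor& src_descriptor,
                                   const TensorDescriptor& dst_descriptor,
                                   CalculationsPrecision precision);

    // New signature shape: one argument. Generators read op_def.precision,
    // op_def.src_tensors[0], and op_def.dst_tensors[0] (or src_tensors[1]
    // for two-input ops, as in max_unpooling.cc below).
    std::string GenerateKernelCode(const OperationDef& op_def);

Call sites shrink accordingly: each Compile() now forwards definition_ instead of unpacking definition_.src_tensors[0], definition_.dst_tensors[0], and definition_.precision by hand.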
tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer.cc
@@ -29,15 +29,13 @@ namespace cl {
 namespace {
 
 std::string GenerateConvBuffer(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-    int x_elements, int y_elements,
+    const OperationDef& op_def, int x_elements, int y_elements,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  std::string c = GetCommonDefines(precision);
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  std::string c = GetCommonDefines(op_def.precision);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
-  switch (precision) {
+  switch (op_def.precision) {
     case CalculationsPrecision::F32:
     case CalculationsPrecision::F16:
       c += "#define CONV(R, S) \\\n";
@@ -53,7 +51,7 @@ std::string GenerateConvBuffer(
       break;
   }
 
-  switch (precision) {
+  switch (op_def.precision) {
     case CalculationsPrecision::F32:
       c += "#define FLT16 float16\n";
       break;
@@ -206,9 +204,8 @@ ConvBuffer& ConvBuffer::operator=(ConvBuffer&& operation) {
 }
 
 Status ConvBuffer::Compile(const CreationContext& creation_context) {
-  std::string code = GenerateConvBuffer(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, x_elements_, y_elements_, linked_operations_);
+  std::string code = GenerateConvBuffer(definition_, x_elements_, y_elements_,
+                                        linked_operations_);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,
       *creation_context.device, &kernel_);
tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.cc
@@ -85,15 +85,14 @@ std::string GetShiftFromElementSize(int element_size) {
 }
 
 std::string GenerateConvBuffer1x1(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-    int x_elements, int y_elements, int element_size,
+    const OperationDef& op_def, int x_elements, int y_elements,
+    int element_size,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  std::string c = GetCommonDefines(precision);
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  std::string c = GetCommonDefines(op_def.precision);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
-  switch (precision) {
+  switch (op_def.precision) {
     case CalculationsPrecision::F32:
       c += "#define FLT8 float8\n";
       c += "#define FLT16 float16\n";
@@ -154,7 +153,8 @@ std::string GenerateConvBuffer1x1(
     }
   }
   c += " FLT16 f0 = temp[0];\n";
-  c += GetComputationPart(x_elements, y_elements, element_size, precision);
+  c += GetComputationPart(x_elements, y_elements, element_size,
+                          op_def.precision);
   for (int i = 0; i < x_elements * y_elements; ++i) {
     std::string i_s = std::to_string(i);
     c += " src_addr_" + i_s + " += src_size.z;\n";
@@ -232,16 +232,12 @@ ConvBuffer1x1& ConvBuffer1x1::operator=(ConvBuffer1x1&& operation) {
 
 Status ConvBuffer1x1::Compile(const CreationContext& creation_context) {
   std::string code_flt4 = GenerateConvBuffer1x1(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, flt4_x_count_, flt4_y_count_, 1,
-      linked_operations_);
+      definition_, flt4_x_count_, flt4_y_count_, 1, linked_operations_);
   RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
       code_flt4, "main_function", *creation_context.context,
       *creation_context.device, &kernel_flt4_));
   std::string code_flt8 = GenerateConvBuffer1x1(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, flt8_x_count_, flt8_y_count_, 2,
-      linked_operations_);
+      definition_, flt8_x_count_, flt8_y_count_, 2, linked_operations_);
   RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
       code_flt8, "main_function", *creation_context.context,
       *creation_context.device, &kernel_flt8_));
tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.cc
@@ -28,25 +28,23 @@ namespace cl {
 namespace {
 
 std::string GenerateConvolutionConstantCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-    const int2& kernel_size, const int2& dilation, int src_channels,
-    int dst_channels, const CLDevice& device,
+    const OperationDef& op_def, const int2& kernel_size, const int2& dilation,
+    int src_channels, int dst_channels, const CLDevice& device,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
-  std::string c = GetCommonDefines(precision);
+  std::string c = GetCommonDefines(op_def.precision);
 
   const int out_z = IntegralDivideRoundUp(dst_channels, 4);
   const std::string kOutZ = std::to_string(out_z);
   const int src_depth = IntegralDivideRoundUp(src_channels, 4);
 
-  const bool manual_clamp =
-      src_descriptor.storage_type == TensorStorageType::BUFFER ||
-      src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER;
+  const auto src_tensor_type = op_def.src_tensors[0].storage_type;
+  const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER ||
+                            src_tensor_type == TensorStorageType::IMAGE_BUFFER;
 
-  switch (precision) {
+  switch (op_def.precision) {
     case CalculationsPrecision::F32:
     case CalculationsPrecision::F16:
       c += "#define CONV4(R, SRC, F, i) \\\n";
@@ -213,9 +211,8 @@ ConvConstants& ConvConstants::operator=(ConvConstants&& kernel) {
 
 Status ConvConstants::Compile(const CreationContext& creation_context) {
   const auto code = GenerateConvolutionConstantCode(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, kernel_size_, dilation_, src_channels_,
-      dst_channels_, *creation_context.device, linked_operations_);
+      definition_, kernel_size_, dilation_, src_channels_, dst_channels_,
+      *creation_context.device, linked_operations_);
   std::vector<CompilerOptions> options;
   if (definition_.precision == CalculationsPrecision::F16 &&
       creation_context.device->IsAdreno3xx()) {
tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc
@@ -62,9 +62,8 @@ ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
 }
 
 Status ConvPowerVR::Compile(const CreationContext& creation_context) {
-  const std::string code = GenerateConvPowerVR1x1(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, conv_params_, linked_operations_);
+  const std::string code =
+      GenerateConvPowerVR1x1(definition_, conv_params_, linked_operations_);
   std::vector<CompilerOptions> options;
   if (definition_.precision == CalculationsPrecision::F16 &&
       creation_context.device->IsPowerVR()) {
@@ -117,18 +116,16 @@ Status ConvPowerVR::AddToQueue(CLCommandQueue* queue) {
 }
 
 std::string GenerateConvPowerVR1x1(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-    const ConvPowerVR::ConvParams& conv_params,
+    const OperationDef& op_def, const ConvPowerVR::ConvParams& conv_params,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  std::string c = GetCommonDefines(precision);
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  std::string c = GetCommonDefines(op_def.precision);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
   const bool is1x1 = conv_params.x_kernel_is_1 && conv_params.y_kernel_is_1;
-  const bool buffer_type =
-      src_descriptor.storage_type == TensorStorageType::BUFFER ||
-      src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER;
+  const auto src_tensor_type = op_def.src_tensors[0].storage_type;
+  const bool buffer_type = src_tensor_type == TensorStorageType::BUFFER ||
+                           src_tensor_type == TensorStorageType::IMAGE_BUFFER;
   const bool manual_clamp = buffer_type && !is1x1;
 
   c += "#define SIMD_BARRIER " +
@@ -248,7 +245,7 @@ std::string GenerateConvPowerVR1x1(
   for (int y = 0; y < block_size.y; ++y) {
     for (int x = 0; x < block_size.x; ++x) {
       const std::string id = std::to_string(y) + std::to_string(x);
-      if (precision == CalculationsPrecision::F32_F16) {
+      if (op_def.precision == CalculationsPrecision::F32_F16) {
         c += " ACCUM_FLT4 src" + id + ";\n";
       } else {
         c += " FLT4 src" + id + ";\n";
@@ -265,8 +262,8 @@ std::string GenerateConvPowerVR1x1(
               ? ""
               : " * (FLT)(mx" + std::to_string(x) +
                     " && my" + std::to_string(y) + ")";
-      if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
-        if (precision == CalculationsPrecision::F32_F16) {
+      if (src_tensor_type == TensorStorageType::BUFFER) {
+        if (op_def.precision == CalculationsPrecision::F32_F16) {
          c += " src" + id + " = convert_float4(src_data[src_a_" + id +
               "]" + multiplier + ");\n";
        } else {
@@ -274,8 +271,8 @@ std::string GenerateConvPowerVR1x1(
               multiplier + ";\n";
        }
      }
-      if (src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) {
-        if (precision == CalculationsPrecision::F32_F16) {
+      if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
+        if (op_def.precision == CalculationsPrecision::F32_F16) {
          c += " src" + id + " = " +
               src_tensor.ReadAsFloat3D("src_a_" + id) + multiplier + ";\n";
        } else {
@@ -290,7 +287,7 @@ std::string GenerateConvPowerVR1x1(
           is1x1 ? "X + " + std::to_string(x) : "xck" + std::to_string(x);
       const std::string yc =
           is1x1 ? "Y + " + std::to_string(y) : "yck" + std::to_string(y);
-      if (precision == CalculationsPrecision::F32_F16) {
+      if (op_def.precision == CalculationsPrecision::F32_F16) {
        c += " src" + id + " = " +
             src_tensor.ReadAsFloat3D(xc, yc, "s") + ";\n";
       } else {
tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h
@@ -76,9 +76,7 @@ class ConvPowerVR : public GPUOperation {
                             ConvPowerVR* result);
 
   friend std::string GenerateConvPowerVR1x1(
-      const TensorDescriptor& src_descriptor,
-      const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-      const ConvParams& conv_params,
+      const OperationDef& op_def, const ConvParams& conv_params,
       const std::vector<ElementwiseOperation*>& linked_operations);
 
   friend ConvParams GuessBestParams(const CLDevice& device,
tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.cc
@@ -30,15 +30,17 @@ namespace cl {
 namespace {
 
 std::string GenerateConvCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-    bool is1x1, bool adreno4xx_optimization, const CLDevice& device,
+    const OperationDef& op_def, bool is1x1, bool adreno4xx_optimization,
+    const CLDevice& device,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  std::string c = GetCommonDefines(precision);
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  std::string c = GetCommonDefines(op_def.precision);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
-  switch (precision) {
+  const bool is_image_buffer =
+      op_def.src_tensors[0].storage_type == TensorStorageType::IMAGE_BUFFER;
+
+  switch (op_def.precision) {
     case CalculationsPrecision::F32:
     case CalculationsPrecision::F16:
       c += "#define CONV1(R, S) \\\n";
@@ -102,14 +104,14 @@ std::string GenerateConvCode(
   c += " for (int y = 0; y < kernel_size.y; ++y) {\n";
   c += " c0.y = y * dilation.y + yc0;\n";
   c += " c1.y = y * dilation.y + yc1;\n";
-  if (src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) {
+  if (is_image_buffer) {
     c += " bool in_y0 = c0.y >= 0 && c0.y < src_size.y;\n";
     c += " bool in_y1 = c1.y >= 0 && c1.y < src_size.y;\n";
   }
   c += " for (int x = 0; x < kernel_size.x; ++x) {\n";
   c += " c0.x = x * dilation.x + xc0;\n";
   c += " c1.x = x * dilation.x + xc1;\n";
-  if (src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) {
+  if (is_image_buffer) {
     c += " bool in_x0 = c0.x >= 0 && c0.x < src_size.x;\n";
     c += " bool in_x1 = c1.x >= 0 && c1.x < src_size.x;\n";
     c += " int addr_0 = select(-1, c0.y * src_size.x + c0.x, (in_x0 && "
@@ -129,7 +131,7 @@ std::string GenerateConvCode(
     c += " int dz_3 = select(0, src_size.x * src_size.y, (in_x1 && "
          "in_y1));\n";
   }
-  } else if (src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) {
+  } else if (is_image_buffer) {
     c += " bool in_x0 = xc0 >= 0 && xc0 < src_size.x;\n";
     c += " bool in_x1 = xc1 >= 0 && xc1 < src_size.x;\n";
     c += " bool in_y0 = yc0 >= 0 && yc0 < src_size.y;\n";
@@ -148,7 +150,7 @@ std::string GenerateConvCode(
   c += " int dz_3 = select(0, src_size.x * src_size.y, (in_x1 && in_y1));\n";
   }
   c += " for (int s = 0; s < src_size.w; ++s) {\n";
-  if (src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) {
+  if (is_image_buffer) {
     c += " FLT4 src0 = " + src_tensor.Read3D("addr_0") + ";\n";
     c += " FLT4 src1 = " + src_tensor.Read3D("addr_1") + ";\n";
     c += " FLT4 src2 = " + src_tensor.Read3D("addr_2") + ";\n";
@@ -164,7 +166,7 @@ std::string GenerateConvCode(
   c += " FLT4 f5 = READ_IMAGE(filters1, smp_none, " + fc1 + ");\n";
   c += " FLT4 f6 = READ_IMAGE(filters2, smp_none, " + fc1 + ");\n";
   c += " FLT4 f7 = READ_IMAGE(filters3, smp_none, " + fc1 + ");\n";
-  if (src_descriptor.storage_type != TensorStorageType::IMAGE_BUFFER) {
+  if (!is_image_buffer) {
     const auto mode = GetFastestZeroMode(device);
     c += " FLT4 src0 = " + src_tensor.Read3D(s_x0, s_y0, "s", mode) + ";\n";
     c += " FLT4 src1 = " + src_tensor.Read3D(s_x1, s_y0, "s", mode) + ";\n";
@@ -182,7 +184,7 @@ std::string GenerateConvCode(
   if (!is1x1) {
     c += " filter_offset++;\n";
   }
-  if (src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) {
+  if (is_image_buffer) {
    c += " addr_0 += dz_0;\n";
    c += " addr_1 += dz_1;\n";
    c += " addr_2 += dz_2;\n";
@@ -296,8 +298,7 @@ Status ConvTexture::Compile(const CreationContext& creation_context) {
       storage_type == TensorStorageType::TEXTURE_ARRAY &&
       definition_.precision == CalculationsPrecision::F16;
   std::string code =
-      GenerateConvCode(definition_.src_tensors[0], definition_.dst_tensors[0],
-                       definition_.precision, is1x1, adreno4xx_optimization,
+      GenerateConvCode(definition_, is1x1, adreno4xx_optimization,
                        *creation_context.device, linked_operations_);
   std::vector<CompilerOptions> options;
   if (UseFP16SIMD(*creation_context.device, definition_.precision, is1x1)) {
tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.cc
@@ -28,18 +28,18 @@ namespace cl {
 namespace {
 
 std::string GenerateConvolutionTransposedCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-    const LinearStorage& biases, const CLDevice& device,
+    const OperationDef& op_def, const LinearStorage& biases,
+    const CLDevice& device,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
-  std::string c = GetCommonDefines(precision);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
+  const auto src_tensor_type = op_def.src_tensors[0].storage_type;
+  std::string c = GetCommonDefines(op_def.precision);
 
-  switch (precision) {
+  switch (op_def.precision) {
     case CalculationsPrecision::F32:
     case CalculationsPrecision::F16:
-      if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+      if (src_tensor_type == TensorStorageType::BUFFER) {
         c += "#define CONV(R, S) \\\n";
         c += "R += S.x * f0.s0123; \\\n";
         c += "R += S.y * f0.s4567; \\\n";
@@ -54,7 +54,7 @@ std::string GenerateConvolutionTransposedCode(
       }
       break;
     case CalculationsPrecision::F32_F16:
-      if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+      if (src_tensor_type == TensorStorageType::BUFFER) {
         c += "#define CONV(R, S) \\\n";
         c += "R += convert_float4(S.x * f0.s0123 + S.y * f0.s4567 + S.z * "
              "f0.s89ab + S.w * f0.scdef);\n";
@@ -66,7 +66,7 @@ std::string GenerateConvolutionTransposedCode(
       break;
   }
 
-  switch (precision) {
+  switch (op_def.precision) {
     case CalculationsPrecision::F32:
       c += "#define FLT16 float16\n";
       break;
@@ -78,7 +78,7 @@ std::string GenerateConvolutionTransposedCode(
 
   c += "__kernel void main_function(\n";
   c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
-  if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+  if (src_tensor_type == TensorStorageType::BUFFER) {
     c += " __global FLT16* filters, \n";
     c += " __global FLT4* biases";
   } else {
@@ -99,7 +99,7 @@ std::string GenerateConvolutionTransposedCode(
   c += " int Y = get_global_id(1);\n";
   c += " int Z = get_global_id(2);\n";
   c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.w) return;\n";
-  if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+  if (src_tensor_type == TensorStorageType::BUFFER) {
     c += " int f_base = Z * src_size.w * kernel_size.x * kernel_size.y;\n";
   }
   c += " int2 offset = (int2)(X, Y) + padding - k_offset;\n";
@@ -126,7 +126,7 @@ std::string GenerateConvolutionTransposedCode(
   c += " bool out_x = s_x < 0 || s_x >= src_size.x;\n";
   c += " int kernel_index = index_y * kernel_size.x + index_x;\n";
   c += " if (inside_kernel && !(out_x || out_y)) {\n";
-  if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+  if (src_tensor_type == TensorStorageType::BUFFER) {
    c += " int f_offset = f_base + kernel_index * src_size.w;\n";
  } else {
    c += " int x_c = kernel_index * src_size.w * 4;\n";
@@ -135,7 +135,7 @@ std::string GenerateConvolutionTransposedCode(
   c += " FLT4 src =" +
        src_tensor.Read3D("s_x", "s_y", "l", TextureAddressMode::DONT_CARE) +
        ";\n";
-  if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+  if (src_tensor_type == TensorStorageType::BUFFER) {
    c += " FLT16 f0 = filters[f_offset]; f_offset++;\n";
  } else {
    c += " FLT4 f[4];\n";
@@ -217,9 +217,7 @@ ConvolutionTransposed& ConvolutionTransposed::operator=(
 
 Status ConvolutionTransposed::Compile(const CreationContext& creation_context) {
   const auto code = GenerateConvolutionTransposedCode(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, biases_, *creation_context.device,
-      linked_operations_);
+      definition_, biases_, *creation_context.device, linked_operations_);
 
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,
tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.cc
@@ -28,17 +28,16 @@ namespace cl {
 namespace {
 
 std::string GenerateConvolutionTransposedCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-    const LinearStorage& biases, int src_depth, int dst_depth,
-    const CLDevice& device,
+    const OperationDef& op_def, const LinearStorage& biases, int src_depth,
+    int dst_depth, const CLDevice& device,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
+  const auto src_tensor_type = op_def.src_tensors[0].storage_type;
 
-  std::string c = GetCommonDefines(precision);
+  std::string c = GetCommonDefines(op_def.precision);
 
-  switch (precision) {
+  switch (op_def.precision) {
     case CalculationsPrecision::F32:
     case CalculationsPrecision::F16:
       c += "#define CONV(R, SRC, F, i) \\\n";
@@ -78,7 +77,7 @@ std::string GenerateConvolutionTransposedCode(
   for (int s = 0; s < src_depth; ++s) {
     const std::string z = std::to_string(s);
     c += " {\n";
-    if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+    if (src_tensor_type == TensorStorageType::BUFFER) {
       c += " bool x_in = X + 1 < src_size.x;\n";
       c += " bool y_in = Y + 1 < src_size.y;\n";
       c += " FLT4 src0 = " + src_tensor.Read3D("X", "Y", z) + ";\n";
@@ -94,7 +93,7 @@ std::string GenerateConvolutionTransposedCode(
       c += " if (x_in && y_in) {\n";
       c += " src3 = " + src_tensor.Read3D("X + 1", "Y + 1", z) + ";\n";
       c += " }\n";
-    } else if (src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) {
+    } else if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
      c += " " + src_tensor.GetAddress("c0", "X", "Y", z) + ";\n";
      c += " " + src_tensor.GetAddress("c1", "X + 1", "Y", z) + ";\n";
      c += " " + src_tensor.GetAddress("c2", "X", "Y + 1", z) + ";\n";
@@ -197,8 +196,7 @@ ConvolutionTransposed3x3Thin& ConvolutionTransposed3x3Thin::operator=(
 Status ConvolutionTransposed3x3Thin::Compile(
     const CreationContext& creation_context) {
   const auto code = GenerateConvolutionTransposedCode(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, biases_, IntegralDivideRoundUp(src_channels_, 4),
+      definition_, biases_, IntegralDivideRoundUp(src_channels_, 4),
       IntegralDivideRoundUp(dst_channels_, 4), *creation_context.device,
       linked_operations_);
 
tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_thin.cc
@@ -29,15 +29,13 @@ namespace cl {
 namespace {
 
 std::string GenerateConvolutionTransposedCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-    int src_depth, int dst_channels, const int2& kernel_size,
-    const CLDevice& device,
+    const OperationDef& op_def, int src_depth, int dst_channels,
+    const int2& kernel_size, const CLDevice& device,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
-  std::string c = GetCommonDefines(precision);
+  std::string c = GetCommonDefines(op_def.precision);
   const std::string channel_x = dst_channels == 1 ? "" : ".x";
   const std::vector<std::string> postfix = {channel_x, ".y", ".z", ".w"};
   const std::vector<std::string> channel = {".x", ".y", ".z", ".w"};
@@ -47,7 +45,7 @@ std::string GenerateConvolutionTransposedCode(
 
   std::string accum_type;
 
-  switch (precision) {
+  switch (op_def.precision) {
     case CalculationsPrecision::F32:
     case CalculationsPrecision::F32_F16:
       accum_type = "float" + type_postfix;
@@ -80,7 +78,8 @@ std::string GenerateConvolutionTransposedCode(
       std::string r_s =
          " r[" + std::to_string(y) + "][" + std::to_string(x) + "]";
       const std::string to_accum =
-          precision == CalculationsPrecision::F32_F16 ? "convert_float" : "";
+          op_def.precision == CalculationsPrecision::F32_F16 ? "convert_float"
+                                                             : "";
       for (int d = 0; d < dst_channels; ++d) {
         c += r_s + postfix[d] + " = " + to_accum + "(dot(src, filters[" +
              std::to_string(index) + "]));\n";
@@ -90,7 +89,7 @@ std::string GenerateConvolutionTransposedCode(
   }
   c += " }\n";
   for (int i = 1; i < src_depth; ++i) {
-    if (precision != CalculationsPrecision::F32_F16) {
+    if (op_def.precision != CalculationsPrecision::F32_F16) {
       c += " if (X < src_size.x + " + std::to_string(i + 1) + ") {\n";
     } else {
       c += " {\n";
@@ -116,7 +115,7 @@ std::string GenerateConvolutionTransposedCode(
   c += " Y *= " + std::to_string(kernel_size.x) + ";\n";
   for (int y = 0; y < kernel_size.y; ++y) {
     for (int x = 0; x < kernel_size.x; ++x) {
-      if (precision != CalculationsPrecision::F32_F16) {
+      if (op_def.precision != CalculationsPrecision::F32_F16) {
        c += " if (X + " + std::to_string(x) + " < dst_size.x && ";
        c += "Y + " + std::to_string(y) + " < dst_size.y) {\n";
      } else {
@@ -185,10 +184,8 @@ ConvolutionTransposedThin& ConvolutionTransposedThin::operator=(
 Status ConvolutionTransposedThin::Compile(
     const CreationContext& creation_context) {
   const auto code = GenerateConvolutionTransposedCode(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, IntegralDivideRoundUp(src_channels_, 4),
-      dst_channels_, kernel_size_, *creation_context.device,
-      linked_operations_);
+      definition_, IntegralDivideRoundUp(src_channels_, 4), dst_channels_,
+      kernel_size_, *creation_context.device, linked_operations_);
 
   std::vector<CompilerOptions> options;
   if (definition_.precision == CalculationsPrecision::F16 &&
tensorflow/lite/delegates/gpu/cl/kernels/depth_wise_conv.cc
@@ -73,23 +73,22 @@ std::string GetSrcValue(const TensorCodeGenerator& src_tensor,
 }
 
 std::string GenerateDepthWiseConvolutionCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-    const LinearStorage& biases, int channel_multiplier,
+    const OperationDef& op_def, const LinearStorage& biases,
+    int channel_multiplier,
     const std::vector<ElementwiseOperation*>& linked_operations,
     const CLDevice& device) {
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
+  const auto src_tensor_type = op_def.src_tensors[0].storage_type;
 
-  std::string c = GetCommonDefines(precision);
+  std::string c = GetCommonDefines(op_def.precision);
 
-  const bool manual_clamp =
-      src_descriptor.storage_type == TensorStorageType::BUFFER ||
-      src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER;
+  const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER ||
+                            src_tensor_type == TensorStorageType::IMAGE_BUFFER;
 
   c += "__kernel void main_function(\n";
   c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
-  if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+  if (src_tensor_type == TensorStorageType::BUFFER) {
     c += " __global FLT4* filters, \n";
   } else {
     c += " __read_only image2d_t filters, \n";
@@ -114,7 +113,7 @@ std::string GenerateDepthWiseConvolutionCode(
   c += " ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
   c += " int x_offseted = X * stride.x - padding.x;\n";
   c += " int y_offseted = Y * stride.y - padding.y;\n";
-  if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+  if (src_tensor_type == TensorStorageType::BUFFER) {
    c += " int fx_c = Z * kernel_size.x * kernel_size.y;\n";
  } else {
    c += " int fx_c = 0;\n";
@@ -128,7 +127,7 @@ std::string GenerateDepthWiseConvolutionCode(
   c += " int x_c = x_offseted + kx * dilation.x;\n";
   c += " bool outside_x = x_c < 0 || x_c >= src_size.x;\n";
   c += " if (!outside_x && !outside_y) {\n";
-  if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+  if (src_tensor_type == TensorStorageType::BUFFER) {
    c += " FLT4 f = filters[fx_c];\n";
  } else {
    c += " FLT4 f = READ_IMAGE(filters, smp_none, (int2)(fx_c, Z));\n";
@@ -210,8 +209,7 @@ DepthWiseConvolution& DepthWiseConvolution::operator=(
 
 Status DepthWiseConvolution::Compile(const CreationContext& creation_context) {
   const auto code = GenerateDepthWiseConvolutionCode(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, biases_, channel_multiplier_, linked_operations_,
+      definition_, biases_, channel_multiplier_, linked_operations_,
       *creation_context.device);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,
tensorflow/lite/delegates/gpu/cl/kernels/depth_wise_conv_3x3.cc
@@ -29,19 +29,18 @@ namespace cl {
 namespace {
 
 std::string GenerateDepthWiseConvCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
+    const OperationDef& op_def,
     const std::vector<ElementwiseOperation*>& linked_operations,
     const CLDevice& device, bool weights_are_buffer, bool local_mem_uploads) {
-  std::string c = GetCommonDefines(precision);
-  TensorCodeGenerator src_tensor("src_data", "dst_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  std::string c = GetCommonDefines(op_def.precision);
+  TensorCodeGenerator src_tensor("src_data", "dst_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
+  const auto src_tensor_type = op_def.src_tensors[0].storage_type;
 
   const auto mode = GetFastestZeroMode(device);
 
-  const bool manual_clamp =
-      src_descriptor.storage_type == TensorStorageType::BUFFER ||
-      src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER;
+  const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER ||
+                            src_tensor_type == TensorStorageType::IMAGE_BUFFER;
 
   if (local_mem_uploads) {
     c += "__attribute__((reqd_work_group_size(8, 4, 1)))\n";
@@ -119,7 +118,7 @@ std::string GenerateDepthWiseConvCode(
   c += " y1 = clamp(y1, 0, dst_size.y - 1);\n";
   c += " y2 = clamp(y2, 0, dst_size.y - 1);\n";
   c += " y3 = clamp(y3, 0, dst_size.y - 1);\n";
-  if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+  if (src_tensor_type == TensorStorageType::BUFFER) {
     c += " __global FLT4* src_loc = src_data + Z * dst_size.x * "
          "dst_size.y;\n";
   }
@@ -145,7 +144,7 @@ std::string GenerateDepthWiseConvCode(
     bias = "f[9]";
   }
   auto read_4x_line = [&](int y) {
-    if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+    if (src_tensor_type == TensorStorageType::BUFFER) {
      const std::string y_in = "y" + std::to_string(y) + "_in";
      c += " s0 = src_loc[" + yc[y] + " * dst_size.x + " + xc[0] +
           "] * (FLT)(x0_in && " + y_in + ");\n";
@@ -155,7 +154,7 @@ std::string GenerateDepthWiseConvCode(
          "] * (FLT)(x2_in && " + y_in + ");\n";
     c += " s3 = src_loc[" + yc[y] + " * dst_size.x + " + xc[3] +
          "] * (FLT)(x3_in && " + y_in + ");\n";
-    } else if (src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) {
+    } else if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
      const std::string y_in = "y" + std::to_string(y) + "_in";
      c += " s0 = " + src_tensor.Read3D(xc[0], yc[y], "Z", mode) +
           " * (FLT)(x0_in && " + y_in + ");\n";
@@ -296,8 +295,7 @@ DepthWiseConv3x3& DepthWiseConv3x3::operator=(DepthWiseConv3x3&& operation) {
 
 Status DepthWiseConv3x3::Compile(const CreationContext& creation_context) {
   std::string code = GenerateDepthWiseConvCode(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, linked_operations_, *creation_context.device,
+      definition_, linked_operations_, *creation_context.device,
      weights_are_buffer_, local_mem_uploads_);
   std::vector<CompilerOptions> options;
   if (definition_.precision == CalculationsPrecision::F16 &&
tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_texture.cc
@@ -32,16 +32,15 @@ namespace {
 // optimized shaders
 
 std::string GetFullyConnectedKernelCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
+    const OperationDef& op_def,
     const std::vector<ElementwiseOperation*>& linked_operations,
     const int3& work_group_size) {
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
-  std::string c = GetCommonDefines(precision);
+  std::string c = GetCommonDefines(op_def.precision);
 
-  switch (precision) {
+  switch (op_def.precision) {
     case CalculationsPrecision::F32:
       c += "#define READ_IMAGE read_imagef\n";
       break;
@@ -70,7 +69,7 @@ std::string GetFullyConnectedKernelCode(
   c += " for (int i = 0; i < src_depth_x4; ++i, c += 4, c2 += 8) {\n";
   c += " FLT4 v = " +
        src_tensor.Read3D("0", "0", "c", TextureAddressMode::DONT_CARE) + ";\n";
-  if (precision != CalculationsPrecision::F32) {
+  if (op_def.precision != CalculationsPrecision::F32) {
     c += " half8 m0 = as_half8(read_imagef(filters, smp_none, (int2)(gid, "
          "c2+0)));\n";
     c += " half8 m1 = as_half8(read_imagef(filters, smp_none, (int2)(gid, "
@@ -143,8 +142,7 @@ Status FullyConnectedTexture::Compile(const CreationContext& creation_context) {
     work_group_size_ = {wg_width, wg_height, 1};
     wg_width /= 2;
   const auto code = GetFullyConnectedKernelCode(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, linked_operations_, work_group_size_);
+      definition_, linked_operations_, work_group_size_);
   RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,
       *creation_context.device, &kernel_));
tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.cc
@@ -26,19 +26,16 @@ namespace cl {
 namespace {
 
 std::string GetMaxUnoolingKernelCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& src_ind_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-    const CLDevice& device,
+    const OperationDef& op_def, const CLDevice& device,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src("src_data", "src_size", src_descriptor);
+  TensorCodeGenerator src("src_data", "src_size", op_def.src_tensors[0]);
   TensorCodeGenerator src_ind("src_data_indices", "src_size",
-                              src_ind_descriptor);
-  TensorCodeGenerator dst("dst_data", "dst_size", dst_descriptor);
+                              op_def.src_tensors[1]);
+  TensorCodeGenerator dst("dst_data", "dst_size", op_def.dst_tensors[0]);
 
   const auto address_mode = GetFastestZeroMode(device);
 
-  std::string code = GetCommonDefines(precision);
+  std::string code = GetCommonDefines(op_def.precision);
 
   code += "__kernel void main_function(\n";
   code += src.GetDeclaration(AccessType::READ) + ",\n";
@@ -59,7 +56,7 @@ std::string GetMaxUnoolingKernelCode(
   code += " int src_x = (X + padding.x) / stride.x;\n";
   code += " int src_y = (Y + padding.y) / stride.y;\n";
   code += " " + src.GetAddress("src_adr", "src_x", "src_y", "Z") + "\n";
-  if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+  if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER) {
    code += " bool outside = src_x < 0 || src_y < 0 ||";
    code += " src_x >= src_size.x || src_y >= src_size.y;\n";
    code += " FLT4 src = (FLT4)(0.0f);\n";
@@ -123,9 +120,7 @@ MaxUnpooling& MaxUnpooling::operator=(MaxUnpooling&& kernel) {
 
 Status MaxUnpooling::Compile(const CreationContext& creation_context) {
   const auto code = GetMaxUnoolingKernelCode(
-      definition_.src_tensors[0], definition_.src_tensors[1],
-      definition_.dst_tensors[0], definition_.precision,
-      *creation_context.device, linked_operations_);
+      definition_, *creation_context.device, linked_operations_);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,
       *creation_context.device, &kernel_);
tensorflow/lite/delegates/gpu/cl/kernels/padding.cc
@@ -26,13 +26,12 @@ namespace cl {
 namespace {
 
 std::string GetPaddingCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
+    const OperationDef& op_def,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
-  std::string code = GetCommonDefines(precision);
+  std::string code = GetCommonDefines(op_def.precision);
   const std::string channels[] = {".x", ".y", ".z", ".w"};
 
   code += "__kernel void main_function(\n";
@@ -109,9 +108,7 @@ void Padding::SetPrepended(const int3& prepended) {
 }
 
 Status Padding::Compile(const CreationContext& creation_context) {
-  const auto code =
-      GetPaddingCode(definition_.src_tensors[0], definition_.dst_tensors[0],
-                     definition_.precision, linked_operations_);
+  const auto code = GetPaddingCode(definition_, linked_operations_);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,
       *creation_context.device, &kernel_);
tensorflow/lite/delegates/gpu/cl/kernels/reshape.cc
@@ -26,13 +26,12 @@ namespace cl {
 namespace {
 
 std::string GetReshapeCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
+    const OperationDef& op_def,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
-  std::string c = GetCommonDefines(precision);
+  std::string c = GetCommonDefines(op_def.precision);
   c += "__kernel void main_function(\n";
   c += src_tensor.GetDeclaration(AccessType::READ);
   c += GetArgsDeclaration(linked_operations);
@@ -93,9 +92,7 @@ Reshape& Reshape::operator=(Reshape&& operation) {
 }
 
 Status Reshape::Compile(const CreationContext& creation_context) {
-  const auto code =
-      GetReshapeCode(definition_.src_tensors[0], definition_.dst_tensors[0],
-                     definition_.precision, linked_operations_);
+  const auto code = GetReshapeCode(definition_, linked_operations_);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,
       *creation_context.device, &kernel_);
tensorflow/lite/delegates/gpu/cl/kernels/reshapex4.cc
@@ -26,13 +26,12 @@ namespace cl {
 namespace {
 
 std::string GetReshapeCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
+    const OperationDef& op_def,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
-  std::string c = GetCommonDefines(precision);
+  std::string c = GetCommonDefines(op_def.precision);
   c += "__kernel void main_function(\n";
   c += src_tensor.GetDeclaration(AccessType::READ);
   c += GetArgsDeclaration(linked_operations);
@@ -78,9 +77,7 @@ Reshapex4& Reshapex4::operator=(Reshapex4&& operation) {
 }
 
 Status Reshapex4::Compile(const CreationContext& creation_context) {
-  const auto code =
-      GetReshapeCode(definition_.src_tensors[0], definition_.dst_tensors[0],
-                     definition_.precision, linked_operations_);
+  const auto code = GetReshapeCode(definition_, linked_operations_);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,
       *creation_context.device, &kernel_);
tensorflow/lite/delegates/gpu/cl/kernels/softmax.cc
@@ -27,13 +27,12 @@ namespace cl {
 namespace {
 
 std::string GetSoftmaxKernelCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
+    const OperationDef& op_def,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor("src_data", "size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "size", op_def.dst_tensors[0]);
 
-  std::string code = GetCommonDefines(precision);
+  std::string code = GetCommonDefines(op_def.precision);
   code += "__kernel void main_function(\n";
   code += src_tensor.GetDeclaration(AccessType::READ);
   code += GetArgsDeclaration(linked_operations);
@@ -92,9 +91,7 @@ Softmax& Softmax::operator=(Softmax&& kernel) {
 }
 
 Status Softmax::Compile(const CreationContext& creation_context) {
-  const auto code = GetSoftmaxKernelCode(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, linked_operations_);
+  const auto code = GetSoftmaxKernelCode(definition_, linked_operations_);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,
       *creation_context.device, &kernel_);
tensorflow/lite/delegates/gpu/cl/kernels/softmax1x1.cc
@@ -26,13 +26,14 @@ namespace cl {
 namespace {
 
 std::string GetSoftmaxKernelCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
+    const OperationDef& op_def,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor("src_data", "tensor_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "tensor_size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "tensor_size",
+                                 op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "tensor_size",
+                                 op_def.dst_tensors[0]);
 
-  std::string code = GetCommonDefines(precision);
+  std::string code = GetCommonDefines(op_def.precision);
   code += "__kernel void main_function(\n";
   code += src_tensor.GetDeclaration(AccessType::READ);
   code += GetArgsDeclaration(linked_operations);
@@ -110,9 +111,7 @@ Softmax1x1& Softmax1x1::operator=(Softmax1x1&& kernel) {
 }
 
 Status Softmax1x1::Compile(const CreationContext& creation_context) {
-  const auto code = GetSoftmaxKernelCode(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, linked_operations_);
+  const auto code = GetSoftmaxKernelCode(definition_, linked_operations_);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,
       *creation_context.device, &kernel_);
tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.cc
@@ -26,14 +26,12 @@ namespace cl {
 namespace {
 
 std::string GetStridedSliceCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-    bool alignedx4,
+    const OperationDef& op_def, bool alignedx4,
    const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
-  std::string c = GetCommonDefines(precision);
+  std::string c = GetCommonDefines(op_def.precision);
   c += "__kernel void main_function(\n";
   c += src_tensor.GetDeclaration(AccessType::READ);
   c += GetArgsDeclaration(linked_operations);
@@ -144,9 +142,8 @@ StridedSlice& StridedSlice::operator=(StridedSlice&& operation) {
 }
 
 Status StridedSlice::Compile(const CreationContext& creation_context) {
-  const auto code = GetStridedSliceCode(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, Is4Alighed(attributes_), linked_operations_);
+  const auto code = GetStridedSliceCode(definition_, Is4Alighed(attributes_),
+                                        linked_operations_);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,
       *creation_context.device, &kernel_);
tensorflow/lite/delegates/gpu/cl/kernels/upsample.cc
@@ -25,13 +25,12 @@ namespace cl {
 namespace {
 
 std::string GetUpsampleCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
+    const OperationDef& op_def,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
-  std::string c = GetCommonDefines(precision);
+  std::string c = GetCommonDefines(op_def.precision);
   c += "__kernel void main_function(\n";
   c += src_tensor.GetDeclaration(AccessType::READ);
   c += GetArgsDeclaration(linked_operations);
@@ -96,9 +95,7 @@ Upsample& Upsample::operator=(Upsample&& operation) {
 }
 
 Status Upsample::Compile(const CreationContext& creation_context) {
-  const auto code =
-      GetUpsampleCode(definition_.src_tensors[0], definition_.dst_tensors[0],
-                      definition_.precision, linked_operations_);
+  const auto code = GetUpsampleCode(definition_, linked_operations_);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,
       *creation_context.device, &kernel_);