Simplified signatures of CreateKernel... methods.

PiperOrigin-RevId: 269652728
commit e72f5dd506
parent a15068b1a6
Author: A. Unique TensorFlower
Date:   2019-09-17 14:17:07 -07:00
Commit: TensorFlower Gardener
20 changed files with 173 additions and 224 deletions
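Across all twenty files the change is the same: every kernel-code generator and CreateKernel... helper that used to take separate TensorDescriptor and CalculationsPrecision parameters now takes the operation's whole OperationDef, and callers forward their stored definition_ unchanged. As a reading aid, here is a minimal sketch of the shape of OperationDef as these diffs use it; it is inferred from usage in this commit, not copied from the actual declaration, so member order and completeness are assumptions:

// Sketch inferred from usage below: generators read op_def.precision,
// index op_def.src_tensors / op_def.dst_tensors, and branch on each
// descriptor's storage_type (e.g. BUFFER vs. IMAGE_BUFFER).
struct OperationDef {
  CalculationsPrecision precision;
  std::vector<TensorDescriptor> src_tensors;  // inputs; multi-input kernels
                                              // index src_tensors[1] and up
  std::vector<TensorDescriptor> dst_tensors;  // outputs
};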


@@ -29,15 +29,13 @@ namespace cl {
 namespace {
 std::string GenerateConvBuffer(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-    int x_elements, int y_elements,
+    const OperationDef& op_def, int x_elements, int y_elements,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  std::string c = GetCommonDefines(precision);
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  std::string c = GetCommonDefines(op_def.precision);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
-  switch (precision) {
+  switch (op_def.precision) {
     case CalculationsPrecision::F32:
     case CalculationsPrecision::F16:
       c += "#define CONV(R, S) \\\n";
@@ -53,7 +51,7 @@ std::string GenerateConvBuffer(
       break;
   }
-  switch (precision) {
+  switch (op_def.precision) {
     case CalculationsPrecision::F32:
       c += "#define FLT16 float16\n";
       break;
@@ -206,9 +204,8 @@ ConvBuffer& ConvBuffer::operator=(ConvBuffer&& operation) {
 }
 Status ConvBuffer::Compile(const CreationContext& creation_context) {
-  std::string code = GenerateConvBuffer(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, x_elements_, y_elements_, linked_operations_);
+  std::string code = GenerateConvBuffer(definition_, x_elements_, y_elements_,
+                                        linked_operations_);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,
       *creation_context.device, &kernel_);


@@ -85,15 +85,14 @@ std::string GetShiftFromElementSize(int element_size) {
 }
 std::string GenerateConvBuffer1x1(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-    int x_elements, int y_elements, int element_size,
+    const OperationDef& op_def, int x_elements, int y_elements,
+    int element_size,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  std::string c = GetCommonDefines(precision);
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  std::string c = GetCommonDefines(op_def.precision);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
-  switch (precision) {
+  switch (op_def.precision) {
     case CalculationsPrecision::F32:
       c += "#define FLT8 float8\n";
       c += "#define FLT16 float16\n";
@@ -154,7 +153,8 @@ std::string GenerateConvBuffer1x1(
     }
   }
   c += " FLT16 f0 = temp[0];\n";
-  c += GetComputationPart(x_elements, y_elements, element_size, precision);
+  c += GetComputationPart(x_elements, y_elements, element_size,
+                          op_def.precision);
   for (int i = 0; i < x_elements * y_elements; ++i) {
     std::string i_s = std::to_string(i);
     c += " src_addr_" + i_s + " += src_size.z;\n";
@@ -232,16 +232,12 @@ ConvBuffer1x1& ConvBuffer1x1::operator=(ConvBuffer1x1&& operation) {
 Status ConvBuffer1x1::Compile(const CreationContext& creation_context) {
   std::string code_flt4 = GenerateConvBuffer1x1(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, flt4_x_count_, flt4_y_count_, 1,
-      linked_operations_);
+      definition_, flt4_x_count_, flt4_y_count_, 1, linked_operations_);
   RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
       code_flt4, "main_function", *creation_context.context,
       *creation_context.device, &kernel_flt4_));
   std::string code_flt8 = GenerateConvBuffer1x1(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, flt8_x_count_, flt8_y_count_, 2,
-      linked_operations_);
+      definition_, flt8_x_count_, flt8_y_count_, 2, linked_operations_);
   RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
       code_flt8, "main_function", *creation_context.context,
       *creation_context.device, &kernel_flt8_));


@@ -28,25 +28,23 @@ namespace cl {
 namespace {
 std::string GenerateConvolutionConstantCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-    const int2& kernel_size, const int2& dilation, int src_channels,
-    int dst_channels, const CLDevice& device,
+    const OperationDef& op_def, const int2& kernel_size, const int2& dilation,
+    int src_channels, int dst_channels, const CLDevice& device,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
-  std::string c = GetCommonDefines(precision);
+  std::string c = GetCommonDefines(op_def.precision);
   const int out_z = IntegralDivideRoundUp(dst_channels, 4);
   const std::string kOutZ = std::to_string(out_z);
   const int src_depth = IntegralDivideRoundUp(src_channels, 4);
-  const bool manual_clamp =
-      src_descriptor.storage_type == TensorStorageType::BUFFER ||
-      src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER;
+  const auto src_tensor_type = op_def.src_tensors[0].storage_type;
+  const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER ||
+                            src_tensor_type == TensorStorageType::IMAGE_BUFFER;
 
-  switch (precision) {
+  switch (op_def.precision) {
     case CalculationsPrecision::F32:
     case CalculationsPrecision::F16:
       c += "#define CONV4(R, SRC, F, i) \\\n";
@@ -213,9 +211,8 @@ ConvConstants& ConvConstants::operator=(ConvConstants&& kernel) {
 Status ConvConstants::Compile(const CreationContext& creation_context) {
   const auto code = GenerateConvolutionConstantCode(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, kernel_size_, dilation_, src_channels_,
-      dst_channels_, *creation_context.device, linked_operations_);
+      definition_, kernel_size_, dilation_, src_channels_, dst_channels_,
+      *creation_context.device, linked_operations_);
   std::vector<CompilerOptions> options;
   if (definition_.precision == CalculationsPrecision::F16 &&
       creation_context.device->IsAdreno3xx()) {


@@ -62,9 +62,8 @@ ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
 }
 Status ConvPowerVR::Compile(const CreationContext& creation_context) {
-  const std::string code = GenerateConvPowerVR1x1(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, conv_params_, linked_operations_);
+  const std::string code =
+      GenerateConvPowerVR1x1(definition_, conv_params_, linked_operations_);
   std::vector<CompilerOptions> options;
   if (definition_.precision == CalculationsPrecision::F16 &&
       creation_context.device->IsPowerVR()) {
@@ -117,18 +116,16 @@ Status ConvPowerVR::AddToQueue(CLCommandQueue* queue) {
 }
 std::string GenerateConvPowerVR1x1(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-    const ConvPowerVR::ConvParams& conv_params,
+    const OperationDef& op_def, const ConvPowerVR::ConvParams& conv_params,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  std::string c = GetCommonDefines(precision);
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  std::string c = GetCommonDefines(op_def.precision);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
   const bool is1x1 = conv_params.x_kernel_is_1 && conv_params.y_kernel_is_1;
-  const bool buffer_type =
-      src_descriptor.storage_type == TensorStorageType::BUFFER ||
-      src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER;
+  const auto src_tensor_type = op_def.src_tensors[0].storage_type;
+  const bool buffer_type = src_tensor_type == TensorStorageType::BUFFER ||
+                           src_tensor_type == TensorStorageType::IMAGE_BUFFER;
   const bool manual_clamp = buffer_type && !is1x1;
 
   c += "#define SIMD_BARRIER " +
@@ -248,7 +245,7 @@ std::string GenerateConvPowerVR1x1(
   for (int y = 0; y < block_size.y; ++y) {
     for (int x = 0; x < block_size.x; ++x) {
       const std::string id = std::to_string(y) + std::to_string(x);
-      if (precision == CalculationsPrecision::F32_F16) {
+      if (op_def.precision == CalculationsPrecision::F32_F16) {
        c += " ACCUM_FLT4 src" + id + ";\n";
      } else {
        c += " FLT4 src" + id + ";\n";
@@ -265,8 +262,8 @@ std::string GenerateConvPowerVR1x1(
                ? ""
                : " * (FLT)(mx" + std::to_string(x) +
                      " && my" + std::to_string(y) + ")";
-      if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
-        if (precision == CalculationsPrecision::F32_F16) {
+      if (src_tensor_type == TensorStorageType::BUFFER) {
+        if (op_def.precision == CalculationsPrecision::F32_F16) {
          c += " src" + id + " = convert_float4(src_data[src_a_" + id +
               "]" + multiplier + ");\n";
        } else {
@@ -274,8 +271,8 @@ std::string GenerateConvPowerVR1x1(
              multiplier + ";\n";
        }
      }
-      if (src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) {
-        if (precision == CalculationsPrecision::F32_F16) {
+      if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
+        if (op_def.precision == CalculationsPrecision::F32_F16) {
          c += " src" + id + " = " +
               src_tensor.ReadAsFloat3D("src_a_" + id) + multiplier + ";\n";
        } else {
@@ -290,7 +287,7 @@ std::string GenerateConvPowerVR1x1(
          is1x1 ? "X + " + std::to_string(x) : "xck" + std::to_string(x);
      const std::string yc =
          is1x1 ? "Y + " + std::to_string(y) : "yck" + std::to_string(y);
-      if (precision == CalculationsPrecision::F32_F16) {
+      if (op_def.precision == CalculationsPrecision::F32_F16) {
        c += " src" + id + " = " +
             src_tensor.ReadAsFloat3D(xc, yc, "s") + ";\n";
      } else {


@@ -76,9 +76,7 @@ class ConvPowerVR : public GPUOperation {
                            ConvPowerVR* result);
   friend std::string GenerateConvPowerVR1x1(
-      const TensorDescriptor& src_descriptor,
-      const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-      const ConvParams& conv_params,
+      const OperationDef& op_def, const ConvParams& conv_params,
       const std::vector<ElementwiseOperation*>& linked_operations);
   friend ConvParams GuessBestParams(const CLDevice& device,


@@ -30,15 +30,17 @@ namespace cl {
 namespace {
 std::string GenerateConvCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-    bool is1x1, bool adreno4xx_optimization, const CLDevice& device,
+    const OperationDef& op_def, bool is1x1, bool adreno4xx_optimization,
+    const CLDevice& device,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  std::string c = GetCommonDefines(precision);
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  std::string c = GetCommonDefines(op_def.precision);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
-  switch (precision) {
+  const bool is_image_buffer =
+      op_def.src_tensors[0].storage_type == TensorStorageType::IMAGE_BUFFER;
+  switch (op_def.precision) {
     case CalculationsPrecision::F32:
     case CalculationsPrecision::F16:
       c += "#define CONV1(R, S) \\\n";
@@ -102,14 +104,14 @@ std::string GenerateConvCode(
   c += " for (int y = 0; y < kernel_size.y; ++y) {\n";
   c += " c0.y = y * dilation.y + yc0;\n";
   c += " c1.y = y * dilation.y + yc1;\n";
-  if (src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) {
+  if (is_image_buffer) {
    c += " bool in_y0 = c0.y >= 0 && c0.y < src_size.y;\n";
    c += " bool in_y1 = c1.y >= 0 && c1.y < src_size.y;\n";
   }
   c += " for (int x = 0; x < kernel_size.x; ++x) {\n";
   c += " c0.x = x * dilation.x + xc0;\n";
   c += " c1.x = x * dilation.x + xc1;\n";
-  if (src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) {
+  if (is_image_buffer) {
    c += " bool in_x0 = c0.x >= 0 && c0.x < src_size.x;\n";
    c += " bool in_x1 = c1.x >= 0 && c1.x < src_size.x;\n";
    c += " int addr_0 = select(-1, c0.y * src_size.x + c0.x, (in_x0 && "
@@ -129,7 +131,7 @@ std::string GenerateConvCode(
    c += " int dz_3 = select(0, src_size.x * src_size.y, (in_x1 && "
         "in_y1));\n";
   }
-  } else if (src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) {
+  } else if (is_image_buffer) {
    c += " bool in_x0 = xc0 >= 0 && xc0 < src_size.x;\n";
    c += " bool in_x1 = xc1 >= 0 && xc1 < src_size.x;\n";
    c += " bool in_y0 = yc0 >= 0 && yc0 < src_size.y;\n";
@@ -148,7 +150,7 @@ std::string GenerateConvCode(
   c += " int dz_3 = select(0, src_size.x * src_size.y, (in_x1 && in_y1));\n";
   }
   c += " for (int s = 0; s < src_size.w; ++s) {\n";
-  if (src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) {
+  if (is_image_buffer) {
    c += " FLT4 src0 = " + src_tensor.Read3D("addr_0") + ";\n";
    c += " FLT4 src1 = " + src_tensor.Read3D("addr_1") + ";\n";
    c += " FLT4 src2 = " + src_tensor.Read3D("addr_2") + ";\n";
@@ -164,7 +166,7 @@ std::string GenerateConvCode(
   c += " FLT4 f5 = READ_IMAGE(filters1, smp_none, " + fc1 + ");\n";
   c += " FLT4 f6 = READ_IMAGE(filters2, smp_none, " + fc1 + ");\n";
   c += " FLT4 f7 = READ_IMAGE(filters3, smp_none, " + fc1 + ");\n";
-  if (src_descriptor.storage_type != TensorStorageType::IMAGE_BUFFER) {
+  if (!is_image_buffer) {
    const auto mode = GetFastestZeroMode(device);
    c += " FLT4 src0 = " + src_tensor.Read3D(s_x0, s_y0, "s", mode) + ";\n";
    c += " FLT4 src1 = " + src_tensor.Read3D(s_x1, s_y0, "s", mode) + ";\n";
@@ -182,7 +184,7 @@ std::string GenerateConvCode(
   if (!is1x1) {
     c += " filter_offset++;\n";
   }
-  if (src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) {
+  if (is_image_buffer) {
    c += " addr_0 += dz_0;\n";
    c += " addr_1 += dz_1;\n";
    c += " addr_2 += dz_2;\n";
@@ -296,8 +298,7 @@ Status ConvTexture::Compile(const CreationContext& creation_context) {
       storage_type == TensorStorageType::TEXTURE_ARRAY &&
       definition_.precision == CalculationsPrecision::F16;
   std::string code =
-      GenerateConvCode(definition_.src_tensors[0], definition_.dst_tensors[0],
-                       definition_.precision, is1x1, adreno4xx_optimization,
+      GenerateConvCode(definition_, is1x1, adreno4xx_optimization,
                        *creation_context.device, linked_operations_);
   std::vector<CompilerOptions> options;
   if (UseFP16SIMD(*creation_context.device, definition_.precision, is1x1)) {


@@ -28,18 +28,18 @@ namespace cl {
 namespace {
 std::string GenerateConvolutionTransposedCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-    const LinearStorage& biases, const CLDevice& device,
+    const OperationDef& op_def, const LinearStorage& biases,
+    const CLDevice& device,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
-  std::string c = GetCommonDefines(precision);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
+  const auto src_tensor_type = op_def.src_tensors[0].storage_type;
+  std::string c = GetCommonDefines(op_def.precision);
 
-  switch (precision) {
+  switch (op_def.precision) {
     case CalculationsPrecision::F32:
     case CalculationsPrecision::F16:
-      if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+      if (src_tensor_type == TensorStorageType::BUFFER) {
         c += "#define CONV(R, S) \\\n";
         c += "R += S.x * f0.s0123; \\\n";
         c += "R += S.y * f0.s4567; \\\n";
@@ -54,7 +54,7 @@ std::string GenerateConvolutionTransposedCode(
       }
       break;
     case CalculationsPrecision::F32_F16:
-      if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+      if (src_tensor_type == TensorStorageType::BUFFER) {
         c += "#define CONV(R, S) \\\n";
         c += "R += convert_float4(S.x * f0.s0123 + S.y * f0.s4567 + S.z * "
              "f0.s89ab + S.w * f0.scdef);\n";
@@ -66,7 +66,7 @@ std::string GenerateConvolutionTransposedCode(
       break;
   }
-  switch (precision) {
+  switch (op_def.precision) {
     case CalculationsPrecision::F32:
       c += "#define FLT16 float16\n";
       break;
@@ -78,7 +78,7 @@ std::string GenerateConvolutionTransposedCode(
   c += "__kernel void main_function(\n";
   c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
-  if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+  if (src_tensor_type == TensorStorageType::BUFFER) {
    c += " __global FLT16* filters, \n";
    c += " __global FLT4* biases";
   } else {
@@ -99,7 +99,7 @@ std::string GenerateConvolutionTransposedCode(
   c += " int Y = get_global_id(1);\n";
   c += " int Z = get_global_id(2);\n";
   c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.w) return;\n";
-  if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+  if (src_tensor_type == TensorStorageType::BUFFER) {
    c += " int f_base = Z * src_size.w * kernel_size.x * kernel_size.y;\n";
   }
   c += " int2 offset = (int2)(X, Y) + padding - k_offset;\n";
@@ -126,7 +126,7 @@ std::string GenerateConvolutionTransposedCode(
   c += " bool out_x = s_x < 0 || s_x >= src_size.x;\n";
   c += " int kernel_index = index_y * kernel_size.x + index_x;\n";
   c += " if (inside_kernel && !(out_x || out_y)) {\n";
-  if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+  if (src_tensor_type == TensorStorageType::BUFFER) {
    c += " int f_offset = f_base + kernel_index * src_size.w;\n";
   } else {
    c += " int x_c = kernel_index * src_size.w * 4;\n";
@@ -135,7 +135,7 @@ std::string GenerateConvolutionTransposedCode(
   c += " FLT4 src =" +
        src_tensor.Read3D("s_x", "s_y", "l", TextureAddressMode::DONT_CARE) +
        ";\n";
-  if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+  if (src_tensor_type == TensorStorageType::BUFFER) {
    c += " FLT16 f0 = filters[f_offset]; f_offset++;\n";
   } else {
    c += " FLT4 f[4];\n";
@@ -217,9 +217,7 @@ ConvolutionTransposed& ConvolutionTransposed::operator=(
 Status ConvolutionTransposed::Compile(const CreationContext& creation_context) {
   const auto code = GenerateConvolutionTransposedCode(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, biases_, *creation_context.device,
-      linked_operations_);
+      definition_, biases_, *creation_context.device, linked_operations_);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,


@@ -28,17 +28,16 @@ namespace cl {
 namespace {
 std::string GenerateConvolutionTransposedCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-    const LinearStorage& biases, int src_depth, int dst_depth,
-    const CLDevice& device,
+    const OperationDef& op_def, const LinearStorage& biases, int src_depth,
+    int dst_depth, const CLDevice& device,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
+  const auto src_tensor_type = op_def.src_tensors[0].storage_type;
 
-  std::string c = GetCommonDefines(precision);
+  std::string c = GetCommonDefines(op_def.precision);
 
-  switch (precision) {
+  switch (op_def.precision) {
     case CalculationsPrecision::F32:
     case CalculationsPrecision::F16:
       c += "#define CONV(R, SRC, F, i) \\\n";
@@ -78,7 +77,7 @@ std::string GenerateConvolutionTransposedCode(
   for (int s = 0; s < src_depth; ++s) {
     const std::string z = std::to_string(s);
     c += " {\n";
-    if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+    if (src_tensor_type == TensorStorageType::BUFFER) {
      c += " bool x_in = X + 1 < src_size.x;\n";
      c += " bool y_in = Y + 1 < src_size.y;\n";
      c += " FLT4 src0 = " + src_tensor.Read3D("X", "Y", z) + ";\n";
@@ -94,7 +93,7 @@ std::string GenerateConvolutionTransposedCode(
      c += " if (x_in && y_in) {\n";
      c += " src3 = " + src_tensor.Read3D("X + 1", "Y + 1", z) + ";\n";
      c += " }\n";
-    } else if (src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) {
+    } else if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
      c += " " + src_tensor.GetAddress("c0", "X", "Y", z) + ";\n";
      c += " " + src_tensor.GetAddress("c1", "X + 1", "Y", z) + ";\n";
      c += " " + src_tensor.GetAddress("c2", "X", "Y + 1", z) + ";\n";
@@ -197,8 +196,7 @@ ConvolutionTransposed3x3Thin& ConvolutionTransposed3x3Thin::operator=(
 Status ConvolutionTransposed3x3Thin::Compile(
     const CreationContext& creation_context) {
   const auto code = GenerateConvolutionTransposedCode(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, biases_, IntegralDivideRoundUp(src_channels_, 4),
+      definition_, biases_, IntegralDivideRoundUp(src_channels_, 4),
       IntegralDivideRoundUp(dst_channels_, 4), *creation_context.device,
       linked_operations_);


@@ -29,15 +29,13 @@ namespace cl {
 namespace {
 std::string GenerateConvolutionTransposedCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-    int src_depth, int dst_channels, const int2& kernel_size,
-    const CLDevice& device,
+    const OperationDef& op_def, int src_depth, int dst_channels,
+    const int2& kernel_size, const CLDevice& device,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
-  std::string c = GetCommonDefines(precision);
+  std::string c = GetCommonDefines(op_def.precision);
   const std::string channel_x = dst_channels == 1 ? "" : ".x";
   const std::vector<std::string> postfix = {channel_x, ".y", ".z", ".w"};
   const std::vector<std::string> channel = {".x", ".y", ".z", ".w"};
@@ -47,7 +45,7 @@ std::string GenerateConvolutionTransposedCode(
   std::string accum_type;
-  switch (precision) {
+  switch (op_def.precision) {
     case CalculationsPrecision::F32:
     case CalculationsPrecision::F32_F16:
       accum_type = "float" + type_postfix;
@@ -80,7 +78,8 @@ std::string GenerateConvolutionTransposedCode(
       std::string r_s =
           " r[" + std::to_string(y) + "][" + std::to_string(x) + "]";
       const std::string to_accum =
-          precision == CalculationsPrecision::F32_F16 ? "convert_float" : "";
+          op_def.precision == CalculationsPrecision::F32_F16 ? "convert_float"
+                                                             : "";
       for (int d = 0; d < dst_channels; ++d) {
         c += r_s + postfix[d] + " = " + to_accum + "(dot(src, filters[" +
              std::to_string(index) + "]));\n";
@@ -90,7 +89,7 @@ std::string GenerateConvolutionTransposedCode(
   }
   c += " }\n";
   for (int i = 1; i < src_depth; ++i) {
-    if (precision != CalculationsPrecision::F32_F16) {
+    if (op_def.precision != CalculationsPrecision::F32_F16) {
       c += " if (X < src_size.x + " + std::to_string(i + 1) + ") {\n";
     } else {
       c += " {\n";
@@ -116,7 +115,7 @@ std::string GenerateConvolutionTransposedCode(
   c += " Y *= " + std::to_string(kernel_size.x) + ";\n";
   for (int y = 0; y < kernel_size.y; ++y) {
     for (int x = 0; x < kernel_size.x; ++x) {
-      if (precision != CalculationsPrecision::F32_F16) {
+      if (op_def.precision != CalculationsPrecision::F32_F16) {
        c += " if (X + " + std::to_string(x) + " < dst_size.x && ";
        c += "Y + " + std::to_string(y) + " < dst_size.y) {\n";
      } else {
@@ -185,10 +184,8 @@ ConvolutionTransposedThin& ConvolutionTransposedThin::operator=(
 Status ConvolutionTransposedThin::Compile(
     const CreationContext& creation_context) {
   const auto code = GenerateConvolutionTransposedCode(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, IntegralDivideRoundUp(src_channels_, 4),
-      dst_channels_, kernel_size_, *creation_context.device,
-      linked_operations_);
+      definition_, IntegralDivideRoundUp(src_channels_, 4), dst_channels_,
+      kernel_size_, *creation_context.device, linked_operations_);
   std::vector<CompilerOptions> options;
   if (definition_.precision == CalculationsPrecision::F16 &&


@@ -73,23 +73,22 @@ std::string GetSrcValue(const TensorCodeGenerator& src_tensor,
 }
 std::string GenerateDepthWiseConvolutionCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-    const LinearStorage& biases, int channel_multiplier,
+    const OperationDef& op_def, const LinearStorage& biases,
+    int channel_multiplier,
     const std::vector<ElementwiseOperation*>& linked_operations,
     const CLDevice& device) {
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
+  const auto src_tensor_type = op_def.src_tensors[0].storage_type;
 
-  std::string c = GetCommonDefines(precision);
+  std::string c = GetCommonDefines(op_def.precision);
 
-  const bool manual_clamp =
-      src_descriptor.storage_type == TensorStorageType::BUFFER ||
-      src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER;
+  const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER ||
+                            src_tensor_type == TensorStorageType::IMAGE_BUFFER;
   c += "__kernel void main_function(\n";
   c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
-  if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+  if (src_tensor_type == TensorStorageType::BUFFER) {
    c += " __global FLT4* filters, \n";
   } else {
    c += " __read_only image2d_t filters, \n";
@@ -114,7 +113,7 @@ std::string GenerateDepthWiseConvolutionCode(
   c += " ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
   c += " int x_offseted = X * stride.x - padding.x;\n";
   c += " int y_offseted = Y * stride.y - padding.y;\n";
-  if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+  if (src_tensor_type == TensorStorageType::BUFFER) {
    c += " int fx_c = Z * kernel_size.x * kernel_size.y;\n";
   } else {
    c += " int fx_c = 0;\n";
@@ -128,7 +127,7 @@ std::string GenerateDepthWiseConvolutionCode(
   c += " int x_c = x_offseted + kx * dilation.x;\n";
   c += " bool outside_x = x_c < 0 || x_c >= src_size.x;\n";
   c += " if (!outside_x && !outside_y) {\n";
-  if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+  if (src_tensor_type == TensorStorageType::BUFFER) {
    c += " FLT4 f = filters[fx_c];\n";
   } else {
    c += " FLT4 f = READ_IMAGE(filters, smp_none, (int2)(fx_c, Z));\n";
@@ -210,8 +209,7 @@ DepthWiseConvolution& DepthWiseConvolution::operator=(
 Status DepthWiseConvolution::Compile(const CreationContext& creation_context) {
   const auto code = GenerateDepthWiseConvolutionCode(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, biases_, channel_multiplier_, linked_operations_,
+      definition_, biases_, channel_multiplier_, linked_operations_,
       *creation_context.device);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,


@@ -29,19 +29,18 @@ namespace cl {
 namespace {
 std::string GenerateDepthWiseConvCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
+    const OperationDef& op_def,
     const std::vector<ElementwiseOperation*>& linked_operations,
     const CLDevice& device, bool weights_are_buffer, bool local_mem_uploads) {
-  std::string c = GetCommonDefines(precision);
-  TensorCodeGenerator src_tensor("src_data", "dst_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  std::string c = GetCommonDefines(op_def.precision);
+  TensorCodeGenerator src_tensor("src_data", "dst_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
+  const auto src_tensor_type = op_def.src_tensors[0].storage_type;
   const auto mode = GetFastestZeroMode(device);
-  const bool manual_clamp =
-      src_descriptor.storage_type == TensorStorageType::BUFFER ||
-      src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER;
+  const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER ||
+                            src_tensor_type == TensorStorageType::IMAGE_BUFFER;
 
   if (local_mem_uploads) {
     c += "__attribute__((reqd_work_group_size(8, 4, 1)))\n";
@@ -119,7 +118,7 @@ std::string GenerateDepthWiseConvCode(
   c += " y1 = clamp(y1, 0, dst_size.y - 1);\n";
   c += " y2 = clamp(y2, 0, dst_size.y - 1);\n";
   c += " y3 = clamp(y3, 0, dst_size.y - 1);\n";
-  if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+  if (src_tensor_type == TensorStorageType::BUFFER) {
    c += " __global FLT4* src_loc = src_data + Z * dst_size.x * "
         "dst_size.y;\n";
   }
@@ -145,7 +144,7 @@ std::string GenerateDepthWiseConvCode(
     bias = "f[9]";
   }
   auto read_4x_line = [&](int y) {
-    if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+    if (src_tensor_type == TensorStorageType::BUFFER) {
      const std::string y_in = "y" + std::to_string(y) + "_in";
      c += " s0 = src_loc[" + yc[y] + " * dst_size.x + " + xc[0] +
           "] * (FLT)(x0_in && " + y_in + ");\n";
@@ -155,7 +154,7 @@ std::string GenerateDepthWiseConvCode(
          "] * (FLT)(x2_in && " + y_in + ");\n";
      c += " s3 = src_loc[" + yc[y] + " * dst_size.x + " + xc[3] +
           "] * (FLT)(x3_in && " + y_in + ");\n";
-    } else if (src_descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) {
+    } else if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
      const std::string y_in = "y" + std::to_string(y) + "_in";
      c += " s0 = " + src_tensor.Read3D(xc[0], yc[y], "Z", mode) +
           " * (FLT)(x0_in && " + y_in + ");\n";
@@ -296,8 +295,7 @@ DepthWiseConv3x3& DepthWiseConv3x3::operator=(DepthWiseConv3x3&& operation) {
 Status DepthWiseConv3x3::Compile(const CreationContext& creation_context) {
   std::string code = GenerateDepthWiseConvCode(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, linked_operations_, *creation_context.device,
+      definition_, linked_operations_, *creation_context.device,
       weights_are_buffer_, local_mem_uploads_);
   std::vector<CompilerOptions> options;
   if (definition_.precision == CalculationsPrecision::F16 &&


@@ -32,16 +32,15 @@ namespace {
 // optimized shaders
 std::string GetFullyConnectedKernelCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
+    const OperationDef& op_def,
     const std::vector<ElementwiseOperation*>& linked_operations,
     const int3& work_group_size) {
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
-  std::string c = GetCommonDefines(precision);
+  std::string c = GetCommonDefines(op_def.precision);
 
-  switch (precision) {
+  switch (op_def.precision) {
     case CalculationsPrecision::F32:
       c += "#define READ_IMAGE read_imagef\n";
       break;
@@ -70,7 +69,7 @@ std::string GetFullyConnectedKernelCode(
   c += " for (int i = 0; i < src_depth_x4; ++i, c += 4, c2 += 8) {\n";
   c += " FLT4 v = " +
        src_tensor.Read3D("0", "0", "c", TextureAddressMode::DONT_CARE) + ";\n";
-  if (precision != CalculationsPrecision::F32) {
+  if (op_def.precision != CalculationsPrecision::F32) {
    c += " half8 m0 = as_half8(read_imagef(filters, smp_none, (int2)(gid, "
         "c2+0)));\n";
    c += " half8 m1 = as_half8(read_imagef(filters, smp_none, (int2)(gid, "
@@ -143,8 +142,7 @@ Status FullyConnectedTexture::Compile(const CreationContext& creation_context) {
     work_group_size_ = {wg_width, wg_height, 1};
     wg_width /= 2;
   const auto code = GetFullyConnectedKernelCode(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, linked_operations_, work_group_size_);
+      definition_, linked_operations_, work_group_size_);
   RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,
       *creation_context.device, &kernel_));


@@ -26,19 +26,16 @@ namespace cl {
 namespace {
 std::string GetMaxUnoolingKernelCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& src_ind_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-    const CLDevice& device,
+    const OperationDef& op_def, const CLDevice& device,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src("src_data", "src_size", src_descriptor);
+  TensorCodeGenerator src("src_data", "src_size", op_def.src_tensors[0]);
   TensorCodeGenerator src_ind("src_data_indices", "src_size",
-                              src_ind_descriptor);
-  TensorCodeGenerator dst("dst_data", "dst_size", dst_descriptor);
+                              op_def.src_tensors[1]);
+  TensorCodeGenerator dst("dst_data", "dst_size", op_def.dst_tensors[0]);
   const auto address_mode = GetFastestZeroMode(device);
-  std::string code = GetCommonDefines(precision);
+  std::string code = GetCommonDefines(op_def.precision);
 
   code += "__kernel void main_function(\n";
   code += src.GetDeclaration(AccessType::READ) + ",\n";
@@ -59,7 +56,7 @@ std::string GetMaxUnoolingKernelCode(
   code += " int src_x = (X + padding.x) / stride.x;\n";
   code += " int src_y = (Y + padding.y) / stride.y;\n";
   code += " " + src.GetAddress("src_adr", "src_x", "src_y", "Z") + "\n";
-  if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
+  if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER) {
    code += " bool outside = src_x < 0 || src_y < 0 ||";
    code += " src_x >= src_size.x || src_y >= src_size.y;\n";
    code += " FLT4 src = (FLT4)(0.0f);\n";
@@ -123,9 +120,7 @@ MaxUnpooling& MaxUnpooling::operator=(MaxUnpooling&& kernel) {
 Status MaxUnpooling::Compile(const CreationContext& creation_context) {
   const auto code = GetMaxUnoolingKernelCode(
-      definition_.src_tensors[0], definition_.src_tensors[1],
-      definition_.dst_tensors[0], definition_.precision,
-      *creation_context.device, linked_operations_);
+      definition_, *creation_context.device, linked_operations_);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,
       *creation_context.device, &kernel_);
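MaxUnpooling above is the only two-input kernel in this commit: its indices tensor moves from a dedicated src_ind_descriptor parameter into op_def.src_tensors[1]. A sketch of how an N-input generator looks under the new convention; the function and tensor names here are hypothetical, not code from this commit:

// Hypothetical two-input generator: extra inputs are indexed out of
// op_def.src_tensors instead of arriving as separate parameters.
std::string GenerateTwoInputKernelCode(
    const OperationDef& op_def,
    const std::vector<ElementwiseOperation*>& linked_operations) {
  TensorCodeGenerator src_a("src_data", "src_size", op_def.src_tensors[0]);
  TensorCodeGenerator src_b("src_data_b", "src_size", op_def.src_tensors[1]);
  TensorCodeGenerator dst("dst_data", "dst_size", op_def.dst_tensors[0]);
  std::string code = GetCommonDefines(op_def.precision);
  // ... emit the OpenCL kernel body exactly as the generators above do ...
  return code;
}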


@@ -26,13 +26,12 @@ namespace cl {
 namespace {
 std::string GetPaddingCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
+    const OperationDef& op_def,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
-  std::string code = GetCommonDefines(precision);
+  std::string code = GetCommonDefines(op_def.precision);
   const std::string channels[] = {".x", ".y", ".z", ".w"};
 
   code += "__kernel void main_function(\n";
@@ -109,9 +108,7 @@ void Padding::SetPrepended(const int3& prepended) {
 }
 Status Padding::Compile(const CreationContext& creation_context) {
-  const auto code =
-      GetPaddingCode(definition_.src_tensors[0], definition_.dst_tensors[0],
-                     definition_.precision, linked_operations_);
+  const auto code = GetPaddingCode(definition_, linked_operations_);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,
       *creation_context.device, &kernel_);


@@ -26,13 +26,12 @@ namespace cl {
 namespace {
 std::string GetReshapeCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
+    const OperationDef& op_def,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
-  std::string c = GetCommonDefines(precision);
+  std::string c = GetCommonDefines(op_def.precision);
   c += "__kernel void main_function(\n";
   c += src_tensor.GetDeclaration(AccessType::READ);
   c += GetArgsDeclaration(linked_operations);
@@ -93,9 +92,7 @@ Reshape& Reshape::operator=(Reshape&& operation) {
 }
 Status Reshape::Compile(const CreationContext& creation_context) {
-  const auto code =
-      GetReshapeCode(definition_.src_tensors[0], definition_.dst_tensors[0],
-                     definition_.precision, linked_operations_);
+  const auto code = GetReshapeCode(definition_, linked_operations_);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,
       *creation_context.device, &kernel_);


@@ -26,13 +26,12 @@ namespace cl {
 namespace {
 std::string GetReshapeCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
+    const OperationDef& op_def,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
-  std::string c = GetCommonDefines(precision);
+  std::string c = GetCommonDefines(op_def.precision);
   c += "__kernel void main_function(\n";
   c += src_tensor.GetDeclaration(AccessType::READ);
   c += GetArgsDeclaration(linked_operations);
@@ -78,9 +77,7 @@ Reshapex4& Reshapex4::operator=(Reshapex4&& operation) {
 }
 Status Reshapex4::Compile(const CreationContext& creation_context) {
-  const auto code =
-      GetReshapeCode(definition_.src_tensors[0], definition_.dst_tensors[0],
-                     definition_.precision, linked_operations_);
+  const auto code = GetReshapeCode(definition_, linked_operations_);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,
       *creation_context.device, &kernel_);


@@ -27,13 +27,12 @@ namespace cl {
 namespace {
 std::string GetSoftmaxKernelCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
+    const OperationDef& op_def,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor("src_data", "size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "size", op_def.dst_tensors[0]);
 
-  std::string code = GetCommonDefines(precision);
+  std::string code = GetCommonDefines(op_def.precision);
   code += "__kernel void main_function(\n";
   code += src_tensor.GetDeclaration(AccessType::READ);
   code += GetArgsDeclaration(linked_operations);
@@ -92,9 +91,7 @@ Softmax& Softmax::operator=(Softmax&& kernel) {
 }
 Status Softmax::Compile(const CreationContext& creation_context) {
-  const auto code = GetSoftmaxKernelCode(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, linked_operations_);
+  const auto code = GetSoftmaxKernelCode(definition_, linked_operations_);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,
       *creation_context.device, &kernel_);


@@ -26,13 +26,14 @@ namespace cl {
 namespace {
 std::string GetSoftmaxKernelCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
+    const OperationDef& op_def,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor("src_data", "tensor_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "tensor_size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "tensor_size",
+                                 op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "tensor_size",
+                                 op_def.dst_tensors[0]);
 
-  std::string code = GetCommonDefines(precision);
+  std::string code = GetCommonDefines(op_def.precision);
   code += "__kernel void main_function(\n";
   code += src_tensor.GetDeclaration(AccessType::READ);
   code += GetArgsDeclaration(linked_operations);
@@ -110,9 +111,7 @@ Softmax1x1& Softmax1x1::operator=(Softmax1x1&& kernel) {
 }
 Status Softmax1x1::Compile(const CreationContext& creation_context) {
-  const auto code = GetSoftmaxKernelCode(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, linked_operations_);
+  const auto code = GetSoftmaxKernelCode(definition_, linked_operations_);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,
       *creation_context.device, &kernel_);


@@ -26,14 +26,12 @@ namespace cl {
 namespace {
 std::string GetStridedSliceCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
-    bool alignedx4,
+    const OperationDef& op_def, bool alignedx4,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
-  std::string c = GetCommonDefines(precision);
+  std::string c = GetCommonDefines(op_def.precision);
   c += "__kernel void main_function(\n";
   c += src_tensor.GetDeclaration(AccessType::READ);
   c += GetArgsDeclaration(linked_operations);
@@ -144,9 +142,8 @@ StridedSlice& StridedSlice::operator=(StridedSlice&& operation) {
 }
 Status StridedSlice::Compile(const CreationContext& creation_context) {
-  const auto code = GetStridedSliceCode(
-      definition_.src_tensors[0], definition_.dst_tensors[0],
-      definition_.precision, Is4Alighed(attributes_), linked_operations_);
+  const auto code = GetStridedSliceCode(definition_, Is4Alighed(attributes_),
+                                        linked_operations_);
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", *creation_context.context,
       *creation_context.device, &kernel_);


@@ -25,13 +25,12 @@ namespace cl {
 namespace {
 std::string GetUpsampleCode(
-    const TensorDescriptor& src_descriptor,
-    const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
+    const OperationDef& op_def,
     const std::vector<ElementwiseOperation*>& linked_operations) {
-  TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
-  TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
+  TensorCodeGenerator src_tensor("src_data", "src_size", op_def.src_tensors[0]);
+  TensorCodeGenerator dst_tensor("dst_data", "dst_size", op_def.dst_tensors[0]);
 
-  std::string c = GetCommonDefines(precision);
+  std::string c = GetCommonDefines(op_def.precision);
   c += "__kernel void main_function(\n";
   c += src_tensor.GetDeclaration(AccessType::READ);
   c += GetArgsDeclaration(linked_operations);
@@ -96,9 +95,7 @@ Upsample& Upsample::operator=(Upsample&& operation) {
 }
 Status Upsample::Compile(const CreationContext& creation_context) {
-  const auto code =
-      GetUpsampleCode(definition_.src_tensors[0], definition_.dst_tensors[0],
-                      definition_.precision, linked_operations_);
+  const auto code = GetUpsampleCode(definition_, linked_operations_);
  return creation_context.cache->GetOrCreateCLKernel(
      code, "main_function", *creation_context.context,
      *creation_context.device, &kernel_);