From 3aecbb9fb163d72618524c98b5633ca521514387 Mon Sep 17 00:00:00 2001
From: Raman Sarokin
Date: Wed, 19 Feb 2020 16:30:48 -0800
Subject: [PATCH] Supported F32_F16 precision in Winograd transformations.

PiperOrigin-RevId: 296084853
Change-Id: If7f1715d84eae34159cf403d1ad208f9d1aa7305
---
 .../lite/delegates/gpu/cl/kernels/util.cc     |  66 +++++++++
 .../lite/delegates/gpu/cl/kernels/util.h      |  25 ++++
 .../lite/delegates/gpu/cl/kernels/winograd.cc | 134 ++++++++++++------
 3 files changed, 182 insertions(+), 43 deletions(-)

diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/util.cc b/tensorflow/lite/delegates/gpu/cl/kernels/util.cc
index 9b46c91b921..0943816f2d7 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/util.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/util.cc
@@ -16,10 +16,12 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
 
 #include <cmath>
+#include <string>
 #include <vector>
 
 #include "absl/strings/str_cat.h"
 #include "absl/strings/substitute.h"
+#include "tensorflow/lite/delegates/gpu/cl/precision.h"
 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
 
 namespace tflite {
@@ -225,6 +227,37 @@ std::string TensorCodeGenerator::ReadAsFloatWHDSB(
       address_mode);
 }
 
+std::string TensorCodeGenerator::ReadAsTypeWHS(
+    DataType type, const std::string& x, const std::string& y,
+    const std::string& s, TextureAddressMode address_mode) const {
+  return ReadAsType(type, GetGlobalAddressNoDeclarationWHS(x, y, s),
+                    address_mode);
+}
+
+std::string TensorCodeGenerator::ReadAsTypeWHSB(
+    DataType type, const std::string& x, const std::string& y,
+    const std::string& s, const std::string& b,
+    TextureAddressMode address_mode) const {
+  return ReadAsType(type, GetGlobalAddressNoDeclarationWHSB(x, y, s, b),
+                    address_mode);
+}
+
+std::string TensorCodeGenerator::ReadAsTypeWHDS(
+    DataType type, const std::string& x, const std::string& y,
+    const std::string& z, const std::string& s,
+    TextureAddressMode address_mode) const {
+  return ReadAsType(type, GetGlobalAddressNoDeclarationWHDS(x, y, z, s),
+                    address_mode);
+}
+
+std::string TensorCodeGenerator::ReadAsTypeWHDSB(
+    DataType type, const std::string& x, const std::string& y,
+    const std::string& z, const std::string& s, const std::string& b,
+    TextureAddressMode address_mode) const {
+  return ReadAsType(type, GetGlobalAddressNoDeclarationWHDSB(x, y, z, s, b),
+                    address_mode);
+}
+
 std::string TensorCodeGenerator::GetAddressWHS(const std::string& var_name,
                                                const std::string& x,
                                                const std::string& y,
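The ReadAsTypeWHS/WHSB/WHDS/WHDSB wrappers above mirror the existing
ReadAsFloat* family; the suffix letters name the coordinates used to build the
address (Width, Height, Depth, Slice, Batch). A minimal usage sketch, assuming
an already-constructed TensorCodeGenerator src_tensor over an fp16 BUFFER
tensor named "src_data" (the names here are illustrative, not from the patch):

  // Emit a read of element (X, Y, S) converted to fp32 at the read site.
  const std::string read =
      src_tensor.ReadAsTypeWHS(DataType::FLOAT32, "X", "Y", "S");
  // Per the BUFFER branch of ReadAsType (added below), the emitted OpenCL
  // takes the form:
  //   convert_float4(src_data[linear_address])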
"convert_half4" : "convert_float4"; + return absl::StrCat(conversion, "(", reading, ")"); + } + } + case TensorStorageType::TEXTURE_2D: + case TensorStorageType::TEXTURE_3D: + case TensorStorageType::SINGLE_TEXTURE_2D: + case TensorStorageType::TEXTURE_ARRAY: + return absl::StrCat( + read_as, "(", tensor_name_, + ", " + TextureAddressModeToString(address_mode) + ", ", + global_address, ")"); + case TensorStorageType::IMAGE_BUFFER: + return absl::StrCat(read_as, "(", tensor_name_, ", ", global_address, + ")"); + case TensorStorageType::UNKNOWN: + return ""; + } +} + std::string TensorCodeGenerator::Write( const std::string& var_name, const std::string& global_address) const { switch (descriptor_.storage_type) { diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/util.h b/tensorflow/lite/delegates/gpu/cl/kernels/util.h index 14ad9ec0bc3..02d5df6c442 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/util.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/util.h @@ -138,6 +138,28 @@ class TensorCodeGenerator { const std::string& s, const std::string& b, TextureAddressMode address_mode = TextureAddressMode::DONT_CARE) const; + // Optimization for textures, so as in opencl we can use read_imagef for any + // texture type. + std::string ReadAsTypeWHS( + DataType type, const std::string& x, const std::string& y, + const std::string& s, + TextureAddressMode address_mode = TextureAddressMode::DONT_CARE) const; + + std::string ReadAsTypeWHSB( + DataType type, const std::string& x, const std::string& y, + const std::string& s, const std::string& b, + TextureAddressMode address_mode = TextureAddressMode::DONT_CARE) const; + + std::string ReadAsTypeWHDS( + DataType type, const std::string& x, const std::string& y, + const std::string& z, const std::string& s, + TextureAddressMode address_mode = TextureAddressMode::DONT_CARE) const; + + std::string ReadAsTypeWHDSB( + DataType type, const std::string& x, const std::string& y, + const std::string& z, const std::string& s, const std::string& b, + TextureAddressMode address_mode = TextureAddressMode::DONT_CARE) const; + std::string WriteWHS(const std::string& var_name, const std::string& x, const std::string& y, const std::string& s) const; @@ -161,6 +183,9 @@ class TensorCodeGenerator { std::string ReadAsFloat( const std::string& global_address, TextureAddressMode address_mode = TextureAddressMode::DONT_CARE) const; + std::string ReadAsType( + DataType type, const std::string& global_address, + TextureAddressMode address_mode = TextureAddressMode::DONT_CARE) const; std::string Write(const std::string& var_name, const std::string& global_address) const; diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc index cfc172055ab..868cca55882 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc @@ -21,6 +21,8 @@ limitations under the License. 
#include "absl/strings/str_format.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h" +#include "tensorflow/lite/delegates/gpu/cl/precision.h" +#include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" @@ -49,8 +51,22 @@ std::string GetWinograd4x4To36Code( src_tensor_type == TensorStorageType::IMAGE_BUFFER; const bool is_buffer = src_tensor_type == TensorStorageType::BUFFER; + switch (op_def.precision) { + case CalculationsPrecision::F32: + case CalculationsPrecision::F32_F16: + c += "#define ACCUM_FLT float\n"; + break; + case CalculationsPrecision::F16: + c += "#define ACCUM_FLT half\n"; + break; + } + + const DataType accum_type = op_def.precision == CalculationsPrecision::F16 + ? DataType::FLOAT16 + : DataType::FLOAT32; + auto bt_mat = BtMatrixForWinograd4x4To6x6(); - c += "constant FLT Bt[36] = {\n"; + c += "constant ACCUM_FLT Bt[36] = {\n"; for (int y = 0; y < 6; ++y) { c += "\t"; for (int x = 0; x < 6; ++x) { @@ -79,10 +95,12 @@ std::string GetWinograd4x4To36Code( c += " }\n"; c += " int tile_x = (DST_X % tiles_x) * 4;\n"; c += " int tile_y = (DST_X / tiles_x) * 4;\n"; - c += " FLT4 I0, I1, I2, I3, I4, I5;\n"; - c += " FLT bt_ar[6];\n"; - c += " FLT4 t0 = " + bt_arr.ReadLinearFLT4("DST_Y * 2 + 0") + ";\n"; - c += " FLT4 t1 = " + bt_arr.ReadLinearFLT4("DST_Y * 2 + 1") + ";\n"; + c += " ACCUM_FLT4 I0, I1, I2, I3, I4, I5;\n"; + c += " ACCUM_FLT bt_ar[6];\n"; + c += " ACCUM_FLT4 t0 = TO_ACCUM_TYPE(" + + bt_arr.ReadLinearFLT4("DST_Y * 2 + 0") + ");\n"; + c += " ACCUM_FLT4 t1 = TO_ACCUM_TYPE(" + + bt_arr.ReadLinearFLT4("DST_Y * 2 + 1") + ");\n"; c += " DST_Y *= 6;\n"; c += " bt_ar[0] = t0.x;\n"; c += " bt_ar[1] = t0.y;\n"; @@ -92,15 +110,17 @@ std::string GetWinograd4x4To36Code( c += " bt_ar[5] = t1.y;\n"; auto read_src = [&](const std::string& src, const std::string& xs) { if (is_image_buffer) { - c += " FLT4 " + src + " = " + - src_tensor.Read("src_a_" + xs + " + offset") + ";\n"; + c += " ACCUM_FLT4 " + src + " = " + + src_tensor.ReadAsType(accum_type, "src_a_" + xs + " + offset") + + ";\n"; } else if (is_buffer) { - c += " FLT4 " + src + " = " + - src_tensor.Read("src_a_" + xs + " + offset") + " * m" + xs + "_x;\n"; + c += " ACCUM_FLT4 " + src + " = " + + src_tensor.ReadAsType(accum_type, "src_a_" + xs + " + offset") + + " * m" + xs + "_x;\n"; } else { - c += " FLT4 " + src + " = " + - src_tensor.ReadWHSB("tile_x + padding.x + " + xs, "yc", "DST_Z", - batch_id) + + c += " ACCUM_FLT4 " + src + " = " + + src_tensor.ReadAsTypeWHSB(accum_type, "tile_x + padding.x + " + xs, + "yc", "DST_Z", batch_id) + ";\n"; } }; @@ -108,8 +128,8 @@ std::string GetWinograd4x4To36Code( for (int x = 0; x < 6; ++x) { const std::string xs = std::to_string(x); c += " int xc" + xs + " = tile_x + padding.x + " + xs + ";\n"; - c += " FLT m" + xs + "_x = (FLT)(xc" + xs + " >= 0 && xc" + xs + - " < src_size.x);\n"; + c += " ACCUM_FLT m" + xs + "_x = (ACCUM_FLT)(xc" + xs + " >= 0 && xc" + + xs + " < src_size.x);\n"; c += " bool inx" + xs + " = (xc" + xs + " >= 0 && xc" + xs + " < src_size.x);\n"; c += " xc" + xs + " = clamp(xc" + xs + ", 0, src_size.x - 1);\n"; @@ -126,9 +146,9 @@ std::string GetWinograd4x4To36Code( if (is_buffer || is_image_buffer) { c += " bool iny = (yc >= 0 && yc < src_size.y);\n"; c += " int offset = select(0, yc * src_size.x, iny);\n"; - c += " FLT bt = bt_ar[0] * (FLT)(iny);\n"; + c += " ACCUM_FLT 
+    c += "  ACCUM_FLT bt = bt_ar[0] * (ACCUM_FLT)(iny);\n";
   } else {
-    c += "  FLT bt = bt_ar[0];\n";
+    c += "  ACCUM_FLT bt = bt_ar[0];\n";
   }
   for (int x = 0; x < 6; ++x) {
     const std::string xs = std::to_string(x);
@@ -144,9 +164,9 @@ std::string GetWinograd4x4To36Code(
     if (is_buffer || is_image_buffer) {
       c += "    bool iny = (yc >= 0 && yc < src_size.y);\n";
       c += "    int offset = select(0, yc * src_size.x, iny);\n";
-      c += "    FLT bt = bt_ar[" + ys + "] * (FLT)(iny);\n";
+      c += "    ACCUM_FLT bt = bt_ar[" + ys + "] * (ACCUM_FLT)(iny);\n";
     } else {
-      c += "    FLT bt = bt_ar[" + ys + "];\n";
+      c += "    ACCUM_FLT bt = bt_ar[" + ys + "];\n";
     }
     for (int x = 0; x < 6; ++x) {
       const std::string xs = std::to_string(x);
@@ -158,42 +178,50 @@ std::string GetWinograd4x4To36Code(
   }
   const LinkingContext context{"r0", "DST_X", "DST_Y", "DST_Z"};
   c += "  {\n";
-  c += "    FLT4 r0 = I0 + Bt[2] * I2 + Bt[4] * I4;\n";
+  c += "    FLT4 r0 = TO_FLT4(I0 + Bt[2] * I2 + Bt[4] * I4);\n";
   c += PostProcess(linked_operations, context);
   c += "  " + dst_tensor.WriteWHSB("r0", "DST_X", "DST_Y", "DST_Z", batch_id);
   c += "  DST_Y++;\n";
   c += "  }\n";
   c += "  {\n";
-  c += "    FLT4 r0 = Bt[7] * I1 + Bt[8] * I2 + Bt[9] * I3 + Bt[10] * I4;\n";
+  c += "    FLT4 r0 = TO_FLT4(Bt[7] * I1 + Bt[8] * I2 + Bt[9] * I3 + Bt[10] * "
+       "I4);\n";
   c += PostProcess(linked_operations, context);
   c += "  " + dst_tensor.WriteWHSB("r0", "DST_X", "DST_Y", "DST_Z", batch_id);
   c += "  DST_Y++;\n";
   c += "  }\n";
   c += "  {\n";
-  c += "    FLT4 r0 = Bt[13] * I1 + Bt[14] * I2 + Bt[15] * I3 + Bt[16] * I4;\n";
+  c += "    FLT4 r0 = TO_FLT4(Bt[13] * I1 + Bt[14] * I2 + Bt[15] * I3 + Bt[16] "
+       "* "
+       "I4);\n";
   c += PostProcess(linked_operations, context);
   c += "  " + dst_tensor.WriteWHSB("r0", "DST_X", "DST_Y", "DST_Z", batch_id);
   c += "  DST_Y++;\n";
   c += "  }\n";
   c += "  {\n";
-  c += "    FLT4 r0 = Bt[19] * I1 + Bt[20] * I2 + Bt[21] * I3 + Bt[22] * I4;\n";
+  c += "    FLT4 r0 = TO_FLT4(Bt[19] * I1 + Bt[20] * I2 + Bt[21] * I3 + Bt[22] "
+       "* "
+       "I4);\n";
   c += PostProcess(linked_operations, context);
   c += "  " + dst_tensor.WriteWHSB("r0", "DST_X", "DST_Y", "DST_Z", batch_id);
   c += "  DST_Y++;\n";
   c += "  }\n";
   c += "  {\n";
-  c += "    FLT4 r0 = Bt[25] * I1 + Bt[26] * I2 + Bt[27] * I3 + Bt[28] * I4;\n";
+  c += "    FLT4 r0 = TO_FLT4(Bt[25] * I1 + Bt[26] * I2 + Bt[27] * I3 + Bt[28] "
+       "* "
+       "I4);\n";
   c += PostProcess(linked_operations, context);
   c += "  " + dst_tensor.WriteWHSB("r0", "DST_X", "DST_Y", "DST_Z", batch_id);
   c += "  DST_Y++;\n";
   c += "  }\n";
   c += "  {\n";
-  c += "    FLT4 r0 = Bt[31] * I1 + Bt[33] * I3 + I5;\n";
+  c += "    FLT4 r0 = TO_FLT4(Bt[31] * I1 + Bt[33] * I3 + I5);\n";
   c += PostProcess(linked_operations, context);
   c += "  " + dst_tensor.WriteWHSB("r0", "DST_X", "DST_Y", "DST_Z", batch_id);
   c += "  DST_Y++;\n";
   c += "  }\n";
   c += "}\n";
+  // std::cout << c << std::endl;
   return c;
 }
 
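For reference, each work item in GetWinograd4x4To36Code computes one row r of
the 6x6 input transform V = Bt d B of Winograd F(4x4, 3x3): the y-loop
accumulates that row of Bt d into I0..I5, and the six r0 blocks then apply the
rows of Bt again on the other side, with the known zero entries of Bt skipped
(output 0 touches only I0, I2, I4; output 5 only I1, I3, I5). In LaTeX:

  I_x = \sum_{y=0}^{5} (B^T)_{r,y} \, d_{y,x}, \qquad x = 0, \dots, 5
  V_j = \sum_{x=0}^{5} (B^T)_{j,x} \, I_x, \qquad
  \text{e.g. } V_0 = I_0 + Bt_2 I_2 + Bt_4 I_4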
+                                  ? DataType::FLOAT16
+                                  : DataType::FLOAT32;
+
   auto at_mat = AtMatrixForWinograd4x4To6x6();
-  c += "constant FLT At[24] = {\n";
+  c += "constant ACCUM_FLT At[24] = {\n";
   for (int y = 0; y < 4; ++y) {
     c += "\t";
     for (int x = 0; x < 6; ++x) {
@@ -243,10 +285,12 @@ std::string GetWinograd36To4x4Code(
       "dst_size.z) {\n";
   c += "    return; \n";
   c += "  }\n";
-  c += "  FLT4 I0, I1, I2, I3, I4, I5;\n";
-  c += "  FLT at_ar[6];\n";
-  c += "  FLT4 t00 = " + at_arr.ReadLinearFLT4("DST_Y * 2 + 0") + ";\n";
-  c += "  FLT4 t01 = " + at_arr.ReadLinearFLT4("DST_Y * 2 + 1") + ";\n";
+  c += "  ACCUM_FLT4 I0, I1, I2, I3, I4, I5;\n";
+  c += "  ACCUM_FLT at_ar[6];\n";
+  c += "  ACCUM_FLT4 t00 = TO_ACCUM_TYPE(" +
+       at_arr.ReadLinearFLT4("DST_Y * 2 + 0") + ");\n";
+  c += "  ACCUM_FLT4 t01 = TO_ACCUM_TYPE(" +
+       at_arr.ReadLinearFLT4("DST_Y * 2 + 1") + ");\n";
   c += "  at_ar[0] = t00.x;\n";
   c += "  at_ar[1] = t00.y;\n";
   c += "  at_ar[2] = t00.z;\n";
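In the output transform that follows, the cross terms t0..t3 stay in
ACCUM_FLT4 and each output row is converted to storage precision exactly once,
after which the bias (read as FLT4) is added. Under F32_F16 the emitted OpenCL
for the first row is expected to look roughly like this (a sketch as a C++
string constant, matching how this file carries generated code; the "..." bias
read depends on the surrounding codegen and is left elided):

  const char* kExpectedRow0 =
      "  ACCUM_FLT4 t0 = I1 + I2;\n"
      "  ACCUM_FLT4 t1 = I3 + I4;\n"
      "  FLT4 bias_val = ...;\n"                      // read at storage precision
      "  FLT4 r0 = TO_FLT4(I0 + t0 + t1) + bias_val;\n";  // one convert per row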
bias_val;\n"; + c += " FLT4 r0 = TO_FLT4(t2 * At[19] + t3 * At[21] + I5) + bias_val;\n"; c += PostProcess(linked_operations, context); c += " " + dst_tensor.WriteWHSB("r0", "tile_x", "tile_y", "DST_Z", batch_id);