GpuInfo extended, added Adreno info.

Making it similar to OpenCL.

PiperOrigin-RevId: 341140554
Change-Id: I46b4476f28046fd4e16f1cc1d6d918032c860446
This commit is contained in:
Raman Sarokin 2020-11-06 16:34:52 -08:00 committed by TensorFlower Gardener
parent f903439d8f
commit 430dbcc5de
13 changed files with 348 additions and 152 deletions

View File

@ -15,6 +15,7 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/common/gpu_info.h"
#include <map>
#include <string>
#include "absl/strings/ascii.h"
@ -23,79 +24,217 @@ namespace tflite {
namespace gpu {
namespace {
GpuType GetGpuType(const std::string& renderer) {
GpuVendor GetGpuVendor(const std::string& renderer) {
if (renderer.find("mali") != renderer.npos) {
return GpuType::MALI;
return GpuVendor::kMali;
}
if (renderer.find("adreno") != renderer.npos) {
return GpuType::ADRENO;
return GpuVendor::kQualcomm;
}
if (renderer.find("powervr") != renderer.npos) {
return GpuType::POWERVR;
return GpuVendor::kPowerVR;
}
if (renderer.find("intel") != renderer.npos) {
return GpuType::INTEL;
return GpuVendor::kIntel;
}
if (renderer.find("nvidia") != renderer.npos) {
return GpuType::NVIDIA;
return GpuVendor::kNvidia;
}
return GpuType::UNKNOWN;
return GpuVendor::kUnknown;
}
GpuModel GetGpuModel(const std::string& renderer) {
auto found_model = [&](std::string model) -> bool {
return renderer.find(model) != renderer.npos;
// Maps a device/renderer string to a concrete Adreno model by searching for
// the model-number substring (e.g. "640" in "adreno (tm) 640").
// Returns AdrenoGpu::kUnknown when no known model number is found.
AdrenoGpu GetAdrenoGpuVersion(const std::string& device_name) {
  const std::map<std::string, AdrenoGpu> kMapping = {
      // Adreno 6xx series
      {"685", AdrenoGpu::kAdreno685},
      {"680", AdrenoGpu::kAdreno680},
      {"675", AdrenoGpu::kAdreno675},
      {"650", AdrenoGpu::kAdreno650},
      {"640", AdrenoGpu::kAdreno640},
      {"630", AdrenoGpu::kAdreno630},
      {"620", AdrenoGpu::kAdreno620},
      // BUG FIX: the key for kAdreno618 was "616", which both shadowed the
      // real "616" entry (std::map ignores duplicate keys in an initializer
      // list) and made Adreno 618 undetectable.
      {"618", AdrenoGpu::kAdreno618},
      {"616", AdrenoGpu::kAdreno616},
      {"615", AdrenoGpu::kAdreno615},
      {"612", AdrenoGpu::kAdreno612},
      {"610", AdrenoGpu::kAdreno610},
      {"605", AdrenoGpu::kAdreno605},
      // Adreno 5xx series
      {"540", AdrenoGpu::kAdreno540},
      {"530", AdrenoGpu::kAdreno530},
      {"512", AdrenoGpu::kAdreno512},
      {"510", AdrenoGpu::kAdreno510},
      {"509", AdrenoGpu::kAdreno509},
      {"508", AdrenoGpu::kAdreno508},
      {"506", AdrenoGpu::kAdreno506},
      {"505", AdrenoGpu::kAdreno505},
      {"504", AdrenoGpu::kAdreno504},
      // Adreno 4xx series
      {"430", AdrenoGpu::kAdreno430},
      {"420", AdrenoGpu::kAdreno420},
      {"418", AdrenoGpu::kAdreno418},
      {"405", AdrenoGpu::kAdreno405},
      // Adreno 3xx series
      {"330", AdrenoGpu::kAdreno330},
      {"320", AdrenoGpu::kAdreno320},
      {"308", AdrenoGpu::kAdreno308},
      {"306", AdrenoGpu::kAdreno306},
      {"305", AdrenoGpu::kAdreno305},
      {"304", AdrenoGpu::kAdreno304},
      // Adreno 2xx series
      {"225", AdrenoGpu::kAdreno225},
      {"220", AdrenoGpu::kAdreno220},
      {"205", AdrenoGpu::kAdreno205},
      {"203", AdrenoGpu::kAdreno203},
      {"200", AdrenoGpu::kAdreno200},
      // Adreno 1xx series
      {"130", AdrenoGpu::kAdreno130},
      {"120", AdrenoGpu::kAdreno120},
  };
  // Iteration order is the map's key order (lexicographic); each key is a
  // distinct 3-digit model number, so the first substring hit is the match.
  for (const auto& v : kMapping) {
    if (device_name.find(v.first) != std::string::npos) {
      return v.second;
    }
  }
  return AdrenoGpu::kUnknown;
}
} // namespace
void GetGpuModelAndType(const std::string& renderer, GpuModel* gpu_model,
GpuType* gpu_type) {
std::string lowered = renderer;
absl::AsciiStrToLower(&lowered);
*gpu_type = GetGpuType(lowered);
*gpu_model =
*gpu_type == GpuType::ADRENO ? GetGpuModel(lowered) : GpuModel::UNKNOWN;
// Parses the concrete Adreno model out of the device/version string
// (substring search for the model number; unknown strings yield kUnknown).
AdrenoInfo::AdrenoInfo(const std::string& device_version)
: adreno_gpu(GetAdrenoGpuVersion(device_version)) {}
// True for any GPU of the Adreno 1xx family.
bool AdrenoInfo::IsAdreno1xx() const {
  switch (adreno_gpu) {
    case AdrenoGpu::kAdreno120:
    case AdrenoGpu::kAdreno130:
      return true;
    default:
      return false;
  }
}
// True for any GPU of the Adreno 2xx family.
bool AdrenoInfo::IsAdreno2xx() const {
  switch (adreno_gpu) {
    case AdrenoGpu::kAdreno200:
    case AdrenoGpu::kAdreno203:
    case AdrenoGpu::kAdreno205:
    case AdrenoGpu::kAdreno220:
    case AdrenoGpu::kAdreno225:
      return true;
    default:
      return false;
  }
}
// True for any GPU of the Adreno 3xx family.
bool AdrenoInfo::IsAdreno3xx() const {
  switch (adreno_gpu) {
    case AdrenoGpu::kAdreno304:
    case AdrenoGpu::kAdreno305:
    case AdrenoGpu::kAdreno306:
    case AdrenoGpu::kAdreno308:
    case AdrenoGpu::kAdreno320:
    case AdrenoGpu::kAdreno330:
      return true;
    default:
      return false;
  }
}
// True for any GPU of the Adreno 4xx family.
bool AdrenoInfo::IsAdreno4xx() const {
  switch (adreno_gpu) {
    case AdrenoGpu::kAdreno405:
    case AdrenoGpu::kAdreno418:
    case AdrenoGpu::kAdreno420:
    case AdrenoGpu::kAdreno430:
      return true;
    default:
      return false;
  }
}
// True for any GPU of the Adreno 5xx family.
bool AdrenoInfo::IsAdreno5xx() const {
  switch (adreno_gpu) {
    case AdrenoGpu::kAdreno504:
    case AdrenoGpu::kAdreno505:
    case AdrenoGpu::kAdreno506:
    case AdrenoGpu::kAdreno508:
    case AdrenoGpu::kAdreno509:
    case AdrenoGpu::kAdreno510:
    case AdrenoGpu::kAdreno512:
    case AdrenoGpu::kAdreno530:
    case AdrenoGpu::kAdreno540:
      return true;
    default:
      return false;
  }
}
// True for any GPU of the Adreno 6xx family.
bool AdrenoInfo::IsAdreno6xx() const {
  switch (adreno_gpu) {
    case AdrenoGpu::kAdreno605:
    case AdrenoGpu::kAdreno610:
    case AdrenoGpu::kAdreno612:
    case AdrenoGpu::kAdreno615:
    case AdrenoGpu::kAdreno616:
    case AdrenoGpu::kAdreno618:
    case AdrenoGpu::kAdreno620:
    case AdrenoGpu::kAdreno630:
    case AdrenoGpu::kAdreno640:
    case AdrenoGpu::kAdreno650:
    case AdrenoGpu::kAdreno675:
    case AdrenoGpu::kAdreno680:
    case AdrenoGpu::kAdreno685:
      return true;
    default:
      return false;
  }
}
// 6xx is the newest series this code knows about, so "6xx or higher"
// currently reduces to IsAdreno6xx(); extend here when 7xx support lands.
bool AdrenoInfo::IsAdreno6xxOrHigher() const { return IsAdreno6xx(); }
// Hardware cap on in-flight waves per compute unit. Known only for the 6xx
// series (values obtained with Snapdragon Profiler); other series return 1.
int AdrenoInfo::GetMaximumWavesCount() const {
  if (!IsAdreno6xx()) {
    // All other versions are not supported.
    return 1;
  }
  return adreno_gpu == AdrenoGpu::kAdreno640 ? 30 : 16;
}
// Returns the register file size per CU (Compute Unit) in bytes. Values are
// known only for the 6xx series; all other series return 1.
int AdrenoInfo::GetRegisterMemorySizePerComputeUnit() const {
  if (!IsAdreno6xx()) {
    // All other versions are not supported.
    return 1;
  }
  switch (adreno_gpu) {
    case AdrenoGpu::kAdreno640:
      return 128 * 144 * 16;
    case AdrenoGpu::kAdreno650:
      return 128 * 64 * 16;
    default:
      return 128 * 96 * 16;
  }
}
// Returns the maximum number of waves that fit, given the per-thread register
// footprint, capped by the hardware wave limit.
// NOTE(review): "tread" in the parameter name is a pre-existing typo for
// "thread"; kept unchanged for interface compatibility.
int AdrenoInfo::GetMaximumWavesCount(int register_footprint_per_tread,
                                     bool full_wave) const {
  const int register_usage_per_wave =
      GetWaveSize(full_wave) * register_footprint_per_tread;
  if (register_usage_per_wave <= 0) {
    // Guard against integer division by zero (UB in the previous version when
    // the footprint was 0): no register pressure means only the hardware cap
    // applies.
    return GetMaximumWavesCount();
  }
  const int possible_waves_count =
      GetRegisterMemorySizePerComputeUnit() / register_usage_per_wave;
  return std::min(possible_waves_count, GetMaximumWavesCount());
}
// Threads per wave; `full_wave` selects the wide execution mode. Only 4xx and
// newer families are characterized; older series return 1.
int AdrenoInfo::GetWaveSize(bool full_wave) const {
  if (IsAdreno6xx()) {
    return full_wave ? 128 : 64;
  }
  if (IsAdreno5xx() || IsAdreno4xx()) {
    return full_wave ? 64 : 32;
  }
  // All other versions are not supported.
  return 1;
}
// Fills `gpu_info` from a renderer/device description string: always sets the
// vendor, and additionally parses Adreno details for Qualcomm GPUs.
void GetGpuInfoFromDeviceDescription(const std::string& gpu_description,
                                     GpuInfo* gpu_info) {
  // Vendor matching is substring-based, so normalize the case first.
  std::string lower_case_description = gpu_description;
  absl::AsciiStrToLower(&lower_case_description);
  gpu_info->vendor = GetGpuVendor(lower_case_description);
  if (gpu_info->IsAdreno()) {
    gpu_info->adreno_info = AdrenoInfo(lower_case_description);
  }
}
// Vendor predicates: convenience wrappers over the `vendor` field so call
// sites don't compare against GpuVendor enumerators directly.
bool GpuInfo::IsAdreno() const { return vendor == GpuVendor::kQualcomm; }
bool GpuInfo::IsApple() const { return vendor == GpuVendor::kApple; }
bool GpuInfo::IsMali() const { return vendor == GpuVendor::kMali; }
bool GpuInfo::IsPowerVR() const { return vendor == GpuVendor::kPowerVR; }
bool GpuInfo::IsNvidia() const { return vendor == GpuVendor::kNvidia; }
bool GpuInfo::IsAMD() const { return vendor == GpuVendor::kAMD; }
bool GpuInfo::IsIntel() const { return vendor == GpuVendor::kIntel; }
} // namespace gpu
} // namespace tflite

View File

@ -23,63 +23,113 @@ namespace tflite {
namespace gpu {
// The VendorID returned by the GPU driver.
// GPU vendor, as detected from the renderer/device description string.
// This span contained the removed pre-refactor `GpuType` enum interleaved
// with its replacement; only the new enum is kept.
enum class GpuVendor {
  kApple,
  kQualcomm,
  kMali,
  kPowerVR,
  kNvidia,
  kAMD,
  kIntel,
  kUnknown
};
// Concrete Adreno GPU models, newest series first. This span contained the
// removed pre-refactor `GpuModel` enum interleaved with its replacement;
// only the new enum is kept.
enum class AdrenoGpu {
  // Adreno 6xx series
  kAdreno685,
  kAdreno680,
  kAdreno675,
  kAdreno650,
  kAdreno640,
  kAdreno630,
  kAdreno620,
  kAdreno618,
  kAdreno616,
  kAdreno615,
  kAdreno612,
  kAdreno610,
  kAdreno605,
  // Adreno 5xx series
  kAdreno540,
  kAdreno530,
  kAdreno512,
  kAdreno510,
  kAdreno509,
  kAdreno508,
  kAdreno506,
  kAdreno505,
  kAdreno504,
  // Adreno 4xx series
  kAdreno430,
  kAdreno420,
  kAdreno418,
  kAdreno405,
  // Adreno 3xx series
  kAdreno330,
  kAdreno320,
  kAdreno308,
  kAdreno306,
  kAdreno305,
  kAdreno304,
  // Adreno 2xx series
  kAdreno225,
  kAdreno220,
  kAdreno205,
  kAdreno203,
  kAdreno200,
  // Adreno 1xx series
  kAdreno130,
  kAdreno120,
  kUnknown
};
// Adreno-specific GPU capabilities, populated only when the vendor is
// Qualcomm (see GetGpuInfoFromDeviceDescription).
struct AdrenoInfo {
AdrenoInfo() = default;
// Parses the Adreno model number out of the device/version string.
// NOTE(review): callers appear to pass a lower-cased string — confirm; the
// model-number match itself is digit-only and case-insensitive.
explicit AdrenoInfo(const std::string& device_version);
// Detected model; kUnknown when no model number was recognized.
AdrenoGpu adreno_gpu;
// Family predicates (one per Adreno series).
bool IsAdreno1xx() const;
bool IsAdreno2xx() const;
bool IsAdreno3xx() const;
bool IsAdreno4xx() const;
bool IsAdreno5xx() const;
bool IsAdreno6xx() const;
bool IsAdreno6xxOrHigher() const;
// This function returns some not very documented physical parameter of
// Adreno6xx GPU.
// We obtained it using Snapdragon Profiler.
int GetMaximumWavesCount() const;
// returns amount of register memory per CU(Compute Unit) in bytes.
int GetRegisterMemorySizePerComputeUnit() const;
// returns maximum possible amount of waves based on register usage.
// NOTE(review): "tread" is a pre-existing typo for "thread", kept for
// interface compatibility.
int GetMaximumWavesCount(int register_footprint_per_tread,
bool full_wave = true) const;
// Threads per wave; `full_wave` selects the wide execution mode.
int GetWaveSize(bool full_wave) const;
// Not supported on some Adreno devices with specific driver version.
// b/131099086
bool support_one_layer_texture_array = true;
};
struct GpuInfo {
GpuType type = GpuType::UNKNOWN;
bool IsAdreno() const;
bool IsApple() const;
bool IsMali() const;
bool IsPowerVR() const;
bool IsNvidia() const;
bool IsAMD() const;
bool IsIntel() const;
GpuVendor vendor = GpuVendor::kUnknown;
std::string renderer_name;
std::string vendor_name;
std::string version;
GpuModel gpu_model;
int major_version = -1;
int minor_version = -1;
std::vector<std::string> extensions;
@ -90,6 +140,8 @@ struct GpuInfo {
int max_texture_size = 0;
int max_image_units = 0;
int max_array_texture_layers = 0;
AdrenoInfo adreno_info;
};
inline bool IsOpenGl31OrAbove(const GpuInfo& gpu_info) {
@ -97,9 +149,10 @@ inline bool IsOpenGl31OrAbove(const GpuInfo& gpu_info) {
gpu_info.major_version > 3;
}
// Analyzes `renderer` and returns matching `GpuType` and `GpuModel`.
void GetGpuModelAndType(const std::string& renderer, GpuModel* gpu_model,
GpuType* gpu_type);
// Currently it initializes vendor and AdrenoInfo if
// vendor is kQualcomm
void GetGpuInfoFromDeviceDescription(const std::string& gpu_description,
GpuInfo* gpu_info);
} // namespace gpu
} // namespace tflite

View File

@ -86,12 +86,12 @@ class AdrenoCommandQueue : public DefaultCommandQueue {
} // namespace
std::unique_ptr<CommandQueue> NewCommandQueue(const GpuInfo& gpu_info) {
if (gpu_info.type == GpuType::ADRENO) {
if (gpu_info.IsAdreno()) {
int flush_every_n = 1;
// On Adreno 630 and Adreno 505 there is up to 2x performance boost when
// glFlush happens not so often.
if (gpu_info.gpu_model == GpuModel::ADRENO630 ||
gpu_info.gpu_model == GpuModel::ADRENO505) {
if (gpu_info.adreno_info.adreno_gpu == AdrenoGpu::kAdreno630 ||
gpu_info.adreno_info.adreno_gpu == AdrenoGpu::kAdreno505) {
flush_every_n = 10;
}
return absl::make_unique<AdrenoCommandQueue>(flush_every_n);

View File

@ -65,21 +65,19 @@ bool ExceedsMaxSize(const Object& object, const GpuInfo& gpu_info) {
}
ObjectType ChooseFastestObjectType(const GpuInfo& gpu_info) {
return gpu_info.type == GpuType::ADRENO ? ObjectType::TEXTURE
: ObjectType::BUFFER;
return gpu_info.IsAdreno() ? ObjectType::TEXTURE : ObjectType::BUFFER;
}
ObjectType ChooseFastestRefObjectType(const GpuInfo& gpu_info,
const CompilationOptions& options) {
if (gpu_info.type != GpuType::ADRENO) {
if (!gpu_info.IsAdreno()) {
return ObjectType::BUFFER;
}
switch (gpu_info.gpu_model) {
case GpuModel::ADRENO630:
return ObjectType::TEXTURE;
default:
return options.allow_precision_loss ? ObjectType::TEXTURE
: ObjectType::BUFFER;
if (gpu_info.adreno_info.adreno_gpu == AdrenoGpu::kAdreno630) {
return ObjectType::TEXTURE;
} else {
return options.allow_precision_loss ? ObjectType::TEXTURE
: ObjectType::BUFFER;
}
}

View File

@ -31,13 +31,13 @@ namespace gl {
ShaderCodegen::ShaderCodegen(const CompilationOptions& options,
const GpuInfo& gpu_info)
: options_(options), gpu_type_(gpu_info.type) {}
: options_(options), gpu_type_(gpu_info.vendor) {}
absl::Status ShaderCodegen::Build(CompiledNodeAttributes attr,
ShaderCode* shader_code) const {
VariableAccessor variable_accessor(options_.inline_parameters,
options_.vulkan_support);
ObjectAccessor object_accessor(gpu_type_ == GpuType::MALI,
ObjectAccessor object_accessor(gpu_type_ == GpuVendor::kMali,
options_.sampler_textures, &variable_accessor);
const auto add_object = [&](const std::string& name, Object&& object) {

View File

@ -44,7 +44,7 @@ class ShaderCodegen {
private:
const CompilationOptions options_;
const GpuType gpu_type_;
const GpuVendor gpu_type_;
};
} // namespace gl

View File

@ -89,7 +89,7 @@ absl::Status EglEnvironment::Init() {
}
}
if (gpu_info_.type == GpuType::UNKNOWN) {
if (gpu_info_.vendor == GpuVendor::kUnknown) {
RETURN_IF_ERROR(RequestGpuInfo(&gpu_info_));
}
// TODO(akulik): when do we need ForceSyncTurning?
@ -110,7 +110,7 @@ absl::Status EglEnvironment::InitSurfacelessContext() {
// PowerVR support EGL_KHR_surfaceless_context, but glFenceSync crashes on
// PowerVR when it is surface-less.
RETURN_IF_ERROR(RequestGpuInfo(&gpu_info_));
if (gpu_info_.type == GpuType::POWERVR) {
if (gpu_info_.IsPowerVR()) {
return absl::UnavailableError(
"Surface-less context is not properly supported on powervr.");
}

View File

@ -134,7 +134,7 @@ class Convolution : public NodeShader {
/*workload=*/uint3(),
/*workgroup=*/
GetIdealWorkgroupIfPossible(
ctx.gpu_info->gpu_model, OperationType::CONVOLUTION_2D,
*ctx.gpu_info, OperationType::CONVOLUTION_2D,
HW(weights.h, weights.w), attr.strides, uint3(0, 0, 0),
OHWI(weights.o, ctx.input_shapes[0][1], ctx.input_shapes[0][2],
ctx.input_shapes[0][3])),
@ -149,8 +149,7 @@ class Convolution : public NodeShader {
int SelectMultiplier(int32_t input_width,
const NodeShader::GenerationContext& ctx) {
std::vector<int> multipliers = {4, 2};
if (!ctx.compiler_options.allow_precision_loss &&
ctx.gpu_info->type == GpuType::MALI) {
if (!ctx.compiler_options.allow_precision_loss && ctx.gpu_info->IsMali()) {
multipliers = {2};
}
for (int i : multipliers) {
@ -234,7 +233,7 @@ class Convolution1x1 : public NodeShader {
auto dst_depth = DivideRoundUp(ctx.output_shapes[0][3], 4);
uint3 workgroup = uint3(16, 16, 1);
if (ctx.gpu_info->type == GpuType::ADRENO) {
if (ctx.gpu_info->IsAdreno()) {
if (dst_depth >= 2) {
workgroup = uint3(8, 8, 2);
}
@ -276,7 +275,7 @@ class Convolution1x1 : public NodeShader {
DivideRoundUp(ctx.output_shapes[0][3], 4)),
/*workgroup=*/
GetIdealWorkgroupIfPossible(
ctx.gpu_info->gpu_model, OperationType::CONVOLUTION_2D,
*ctx.gpu_info, OperationType::CONVOLUTION_2D,
HW(attr.weights.shape.h, attr.weights.shape.w), attr.strides,
workgroup,
OHWI(attr.weights.shape.o, ctx.input_shapes[0][1],

View File

@ -141,7 +141,7 @@ class DepthwiseConvolution : public NodeShader {
/*workload=*/uint3(),
/*workgroup=*/
GetIdealWorkgroupIfPossible(
ctx.gpu_info->gpu_model, OperationType::DEPTHWISE_CONVOLUTION,
*ctx.gpu_info, OperationType::DEPTHWISE_CONVOLUTION,
HW(attr.weights.shape.h, attr.weights.shape.w), attr.strides,
OHWI(attr.weights.shape.o, ctx.input_shapes[0][1],
ctx.input_shapes[0][2], ctx.input_shapes[0][3])),

View File

@ -34,7 +34,7 @@ absl::Status RequestGpuInfo(GpuInfo* gpu_info) {
const GLubyte* renderer_name = glGetString(GL_RENDERER);
if (renderer_name) {
info.renderer_name = reinterpret_cast<const char*>(renderer_name);
GetGpuModelAndType(info.renderer_name, &info.gpu_model, &info.type);
GetGpuInfoFromDeviceDescription(info.renderer_name, &info);
}
const GLubyte* vendor_name = glGetString(GL_VENDOR);

View File

@ -81,7 +81,7 @@ class WorkgroupsCalculatorForMali : public WorkgroupsCalculator {
std::unique_ptr<WorkgroupsCalculator> NewDefaultWorkgroupsCalculator(
const GpuInfo& gpu_info) {
if (gpu_info.type == GpuType::MALI) {
if (gpu_info.IsMali()) {
return absl::make_unique<WorkgroupsCalculatorForMali>(gpu_info);
} else {
return absl::make_unique<DefaultWorkgroupsCalculator>(gpu_info);

View File

@ -137,40 +137,45 @@ std::vector<IdealByType>* kIdealByTypeAdreno418Ptr = kIdealByTypeAdreno508Ptr;
std::vector<IdealByType>* kIdealByTypeAdreno405Ptr = kIdealByTypeAdreno508Ptr;
// Put all ideal workgroups from the list together.
const std::map<GpuModel, IdealWorkgroups>* kIdealWorkgroupsInfoPtr =
new std::map<GpuModel, IdealWorkgroups>{
{GpuModel::ADRENO630,
const std::map<AdrenoGpu, IdealWorkgroups>* kIdealAdrenoWorkgroupsInfoPtr =
new std::map<AdrenoGpu, IdealWorkgroups>{
{AdrenoGpu::kAdreno630,
{*kIdealByTypeAdreno630Ptr, *kIdealByCaseAdreno630Ptr}},
{GpuModel::ADRENO540, {*kIdealByTypeAdreno540Ptr, {}}},
{GpuModel::ADRENO510,
{AdrenoGpu::kAdreno540, {*kIdealByTypeAdreno540Ptr, {}}},
{AdrenoGpu::kAdreno510,
{*kIdealByTypeAdreno510Ptr, *kIdealByCaseAdreno510Ptr}},
{GpuModel::ADRENO509, {*kIdealByTypeAdreno509Ptr, {}}},
{GpuModel::ADRENO508, {*kIdealByTypeAdreno508Ptr, {}}},
{GpuModel::ADRENO506, {*kIdealByTypeAdreno506Ptr, {}}},
{GpuModel::ADRENO505, {*kIdealByTypeAdreno505Ptr, {}}},
{GpuModel::ADRENO418, {*kIdealByTypeAdreno418Ptr, {}}},
{GpuModel::ADRENO405, {*kIdealByTypeAdreno405Ptr, {}}},
{AdrenoGpu::kAdreno509, {*kIdealByTypeAdreno509Ptr, {}}},
{AdrenoGpu::kAdreno508, {*kIdealByTypeAdreno508Ptr, {}}},
{AdrenoGpu::kAdreno506, {*kIdealByTypeAdreno506Ptr, {}}},
{AdrenoGpu::kAdreno505, {*kIdealByTypeAdreno505Ptr, {}}},
{AdrenoGpu::kAdreno418, {*kIdealByTypeAdreno418Ptr, {}}},
{AdrenoGpu::kAdreno405, {*kIdealByTypeAdreno405Ptr, {}}},
};
} // namespace
uint3 GetIdealWorkgroupIfPossible(GpuModel gpu_model, OperationType op_type,
HW kernel, HW strides, uint3 default_wg,
OHWI workload) {
uint3 GetIdealWorkgroupIfPossible(const GpuInfo& gpu_info,
OperationType op_type, HW kernel, HW strides,
uint3 default_wg, OHWI workload) {
// Research showed that ideal workgroup approach doesn't work well with
// convolutions, which have small amount of output channels or output
// height/width dimensions
if (workload.o < 32 || workload.h <= 5 || workload.w <= 5) return default_wg;
if (!gpu_info.IsAdreno()) {
return default_wg;
}
auto adreno_gpu_version = gpu_info.adreno_info.adreno_gpu;
// If GPU was investigated
if (!kIdealWorkgroupsInfoPtr->count(gpu_model)) {
if (!kIdealAdrenoWorkgroupsInfoPtr->count(adreno_gpu_version)) {
return default_wg;
}
// Try to find the ideal workgroup by the specific operation case, cause they
// are expected to be better tuned than default "by type" cases
for (const auto& specific_case :
kIdealWorkgroupsInfoPtr->at(gpu_model).by_case) {
kIdealAdrenoWorkgroupsInfoPtr->at(adreno_gpu_version).by_case) {
if (specific_case.ParamsAccepted(op_type, kernel, strides)) {
return specific_case.ideal_workgroup;
}
@ -178,7 +183,7 @@ uint3 GetIdealWorkgroupIfPossible(GpuModel gpu_model, OperationType op_type,
// Try to find the ideal workgroup by the operation type
for (const auto& default_case :
kIdealWorkgroupsInfoPtr->at(gpu_model).by_type) {
kIdealAdrenoWorkgroupsInfoPtr->at(adreno_gpu_version).by_type) {
if (default_case.ParamsAccepted(op_type)) {
return default_case.ideal_workgroup;
}
@ -189,9 +194,10 @@ uint3 GetIdealWorkgroupIfPossible(GpuModel gpu_model, OperationType op_type,
return default_wg;
}
uint3 GetIdealWorkgroupIfPossible(GpuModel gpu_model, OperationType op_type,
HW kernel, HW strides, OHWI workload) {
return GetIdealWorkgroupIfPossible(gpu_model, op_type, kernel, strides,
uint3 GetIdealWorkgroupIfPossible(const GpuInfo& gpu_info,
OperationType op_type, HW kernel, HW strides,
OHWI workload) {
return GetIdealWorkgroupIfPossible(gpu_info, op_type, kernel, strides,
kEmptyWorkgroupSize, workload);
}

View File

@ -28,15 +28,16 @@ namespace gl {
// Picks up the ideal workgroup size for the given convolution case.
// Ideal workgroup gives top 10% of the possible performance for the given case.
// They are received after the workgroup performance research (b/117291356).
uint3 GetIdealWorkgroupIfPossible(GpuModel gpu_model, OperationType op_type,
HW kernel, HW strides, OHWI workload);
uint3 GetIdealWorkgroupIfPossible(const GpuInfo& gpu_info,
OperationType op_type, HW kernel, HW strides,
OHWI workload);
// Does the same as the function above. Use this one if your operation can
// suggest some reasonable workgroup size. It's expected to give better
// performance than the default workgroup calculator.
uint3 GetIdealWorkgroupIfPossible(GpuModel gpu_model, OperationType op_type,
HW kernel, HW strides, uint3 default_wg,
OHWI workload);
uint3 GetIdealWorkgroupIfPossible(const GpuInfo& gpu_info,
OperationType op_type, HW kernel, HW strides,
uint3 default_wg, OHWI workload);
} // namespace gl
} // namespace gpu