DeviceInfo renamed to GpuInfo.

PiperOrigin-RevId: 341692138
Change-Id: I129db695d3cabaa423059822ecaf75111e98c878
Raman Sarokin 2020-11-10 14:00:34 -08:00 committed by TensorFlower Gardener
parent 916f71f7f6
commit 83a51f49fa
95 changed files with 693 additions and 731 deletions
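
The rename is mechanical: the struct DeviceInfo becomes GpuInfo, and every parameter spelled device_info becomes gpu_info, with no behavioral change. A minimal before/after sketch, using a signature taken from the first hunk below:

// Before this commit:
std::string GetDefaultSamplers(const DeviceInfo& device_info);
// After this commit:
std::string GetDefaultSamplers(const GpuInfo& gpu_info);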

View File

@ -136,12 +136,12 @@ std::string GetImageModifier(AccessType access) {
}
}
std::string GetDefaultSamplers(const DeviceInfo& device_info) {
std::string GetDefaultSamplers(const GpuInfo& gpu_info) {
std::string result;
result +=
"__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | "
"CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n";
if (device_info.IsAdreno() && device_info.adreno_info.IsAdreno3xx()) {
if (gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx()) {
// Unfortunately, CLK_ADDRESS_CLAMP is very slow on Adreno3xx and
// we can observe huge register overhead when compared to other modes.
@ -209,7 +209,7 @@ absl::Status CreateCLObject(GPUObjectDescriptor* desc, CLContext* context,
constexpr char CLArguments::kArgsPrefix[];
absl::Status CLArguments::Init(
const DeviceInfo& device_info,
const GpuInfo& gpu_info,
const std::map<std::string, std::string>& linkables, CLContext* context,
Arguments* args, std::string* code) {
RETURN_IF_ERROR(AllocateObjects(*args, context));
@ -217,22 +217,22 @@ absl::Status CLArguments::Init(
RETURN_IF_ERROR(ResolveSelectorsPass(*args, linkables, code));
object_refs_ = std::move(args->object_refs_);
args->GetActiveArguments(kArgsPrefix, *code);
const bool use_f32_for_halfs = device_info.IsPowerVR();
const bool use_f32_for_halfs = gpu_info.IsPowerVR();
CopyArguments(*args, use_f32_for_halfs);
RETURN_IF_ERROR(SetObjectsResources(*args));
RenameArgumentsInCode(code);
ResolveArgsPass(code);
*code = absl::Substitute(*code, GetListOfArgs());
*code = GetDefaultSamplers(device_info) + *code;
*code = GetDefaultSamplers(gpu_info) + *code;
return absl::OkStatus();
}
absl::Status CLArguments::Init(const DeviceInfo& device_info, Arguments* args,
absl::Status CLArguments::Init(const GpuInfo& gpu_info, Arguments* args,
CLContext* context) {
RETURN_IF_ERROR(AllocateObjects(*args, context));
RETURN_IF_ERROR(AddObjectArgs(args));
object_refs_ = std::move(args->object_refs_);
const bool use_f32_for_halfs = device_info.IsPowerVR();
const bool use_f32_for_halfs = gpu_info.IsPowerVR();
CopyArguments(*args, use_f32_for_halfs);
RETURN_IF_ERROR(SetObjectsResources(*args));
return absl::OkStatus();
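
For orientation, a minimal sketch of driving the simpler Init overload above; args and context stand for an Arguments and a CLContext* assumed to be set up elsewhere (hypothetical names):

// Hypothetical call site for CLArguments::Init(const GpuInfo&, Arguments*, CLContext*).
GpuInfo gpu_info;  // e.g. filled in via GpuInfoFromDeviceID(), see the device hunks below
CLArguments cl_args;
absl::Status status = cl_args.Init(gpu_info, &args, context);
if (!status.ok()) {
  return status;  // propagate, mirroring RETURN_IF_ERROR above
}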

View File

@ -34,10 +34,10 @@ class CLArguments : public ArgumentsBinder {
public:
CLArguments() = default;
absl::Status Init(const DeviceInfo& device_info,
absl::Status Init(const GpuInfo& gpu_info,
const std::map<std::string, std::string>& linkables,
CLContext* context, Arguments* args, std::string* code);
absl::Status Init(const DeviceInfo& device_info, Arguments* args,
absl::Status Init(const GpuInfo& gpu_info, Arguments* args,
CLContext* context);
// Temporary, will be resolved later

View File

@ -44,8 +44,8 @@ __kernel void main_function($0) {
})";
CLArguments cl_args;
DeviceInfo device_info;
ASSERT_OK(cl_args.Init(device_info, {}, nullptr, &args, &sample_code));
GpuInfo gpu_info;
ASSERT_OK(cl_args.Init(gpu_info, {}, nullptr, &args, &sample_code));
EXPECT_TRUE(absl::StrContains(sample_code, "value = weights_buffer[id];"));
EXPECT_TRUE(
absl::StrContains(sample_code, "__global float4* weights_buffer"));
@ -66,9 +66,8 @@ TEST(CLArgumentsTest, TestNoSelector) {
}
)";
CLArguments cl_args;
DeviceInfo device_info;
EXPECT_FALSE(
cl_args.Init(device_info, {}, nullptr, &args, &sample_code).ok());
GpuInfo gpu_info;
EXPECT_FALSE(cl_args.Init(gpu_info, {}, nullptr, &args, &sample_code).ok());
}
} // namespace cl

View File

@ -216,19 +216,19 @@ ProfilingInfo ProfilingCommandQueue::GetProfilingInfo() const {
}
absl::Status ProfilingCommandQueue::GetBestWorkGroupIndex(
const CLKernel& kernel, const DeviceInfo& device_info,
const CLKernel& kernel, const GpuInfo& gpu_info,
const std::vector<int3>& work_groups_count,
const std::vector<int3>& work_group_sizes, int* index) {
// Some Adreno 3xx GPUs can report wrong numbers for some events
const bool possible_bug_with_events =
device_info.IsAdreno() && device_info.adreno_info.IsAdreno3xx();
gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx();
events_.resize(work_group_sizes.size());
for (int i = 0; i < work_group_sizes.size(); ++i) {
RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count[i],
work_group_sizes[i], &events_[i]));
// reduces the rate of a memory leak on Mali for some kernels
if (device_info.IsMali() && i % 8 == 7) {
if (gpu_info.IsMali() && i % 8 == 7) {
events_[i - 7].Wait();
}
if (possible_bug_with_events) {
@ -240,7 +240,7 @@ absl::Status ProfilingCommandQueue::GetBestWorkGroupIndex(
RETURN_IF_ERROR(WaitForCompletion());
// To release memory of some kernel pool on Mali.
if (device_info.IsMali()) {
if (gpu_info.IsMali()) {
RETURN_IF_ERROR(kernel.ReInit());
}
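
A sketch of the renamed tuning entry point in use; profiling_queue, kernel, gpu_info, and the candidate vectors are assumed to exist already (hypothetical names):

// Profiles every candidate work group size and reports the fastest one.
int best_index = -1;
RETURN_IF_ERROR(profiling_queue.GetBestWorkGroupIndex(
    kernel, gpu_info, work_groups_count, work_group_sizes, &best_index));
const int3 best_work_group = work_group_sizes[best_index];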

View File

@ -116,7 +116,7 @@ class ProfilingCommandQueue : public CLCommandQueue {
// Writes the index of the fastest work group among work_group_sizes
absl::Status GetBestWorkGroupIndex(const CLKernel& kernel,
const DeviceInfo& device_info,
const GpuInfo& gpu_info,
const std::vector<int3>& work_groups_count,
const std::vector<int3>& work_group_sizes,
int* index);

View File

@ -50,7 +50,7 @@ bool IsEqualToImageFormat(cl_image_format image_format, DataType data_type,
image_format.image_channel_order == ToChannelOrder(num_channels);
}
void AddSupportedImageFormats(cl_context context, DeviceInfo* info) {
void AddSupportedImageFormats(cl_context context, GpuInfo* info) {
auto supported_formats =
GetSupportedImage2DFormats(context, CL_MEM_READ_WRITE);
for (auto format : supported_formats) {

View File

@ -156,8 +156,8 @@ bool IsGPUVersionInRange(int gpu_version, int min_version, int max_version) {
}
} // namespace
DeviceInfo DeviceInfoFromDeviceID(cl_device_id id) {
DeviceInfo info;
GpuInfo GpuInfoFromDeviceID(cl_device_id id) {
GpuInfo info;
const auto device_name = GetDeviceInfo<std::string>(id, CL_DEVICE_NAME);
const auto vendor_name = GetDeviceInfo<std::string>(id, CL_DEVICE_VENDOR);
const auto opencl_c_version =
@ -267,7 +267,7 @@ DeviceInfo DeviceInfoFromDeviceID(cl_device_id id) {
}
CLDevice::CLDevice(cl_device_id id, cl_platform_id platform_id)
: info_(DeviceInfoFromDeviceID(id)), id_(id), platform_id_(platform_id) {}
: info_(GpuInfoFromDeviceID(id)), id_(id), platform_id_(platform_id) {}
CLDevice::CLDevice(const CLDevice& device)
: info_(device.info_), id_(device.id_), platform_id_(device.platform_id_) {}

View File

@ -67,10 +67,10 @@ class CLDevice {
// To track bug on some Adreno. b/131099086
void DisableOneLayerTextureArray();
const DeviceInfo& GetInfo() const { return info_; }
const GpuInfo& GetInfo() const { return info_; }
// We update device info during context creation, since supported texture
// formats can be requested only from a context.
mutable DeviceInfo info_;
mutable GpuInfo info_;
private:
cl_device_id id_ = nullptr;
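
A sketch of reading the renamed accessor, assuming device is a constructed CLDevice:

const GpuInfo& gpu_info = device.GetInfo();
if (gpu_info.SupportsImageBuffer()) {
  // image-buffer-backed tensors are an option on this device
}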

View File

@ -296,15 +296,15 @@ bool MaliInfo::IsValhall() const {
gpu_version == MaliGPU::G68 || gpu_version == MaliGPU::G78;
}
bool DeviceInfo::SupportsTextureArray() const {
bool GpuInfo::SupportsTextureArray() const {
return cl_version >= OpenCLVersion::CL_1_2;
}
bool DeviceInfo::SupportsImageBuffer() const {
bool GpuInfo::SupportsImageBuffer() const {
return cl_version >= OpenCLVersion::CL_1_2;
}
bool DeviceInfo::SupportsImage3D() const {
bool GpuInfo::SupportsImage3D() const {
if (IsMali() && mali_info.IsMidgard()) {
// On Mali T880 read_imageh doesn't compile with image3d_t
return false;
@ -312,7 +312,7 @@ bool DeviceInfo::SupportsImage3D() const {
return supports_image3d_writes;
}
bool DeviceInfo::SupportsFloatImage2D(DataType data_type, int channels) const {
bool GpuInfo::SupportsFloatImage2D(DataType data_type, int channels) const {
if (channels == 1) {
return data_type == DataType::FLOAT32 ? supports_r_f32_tex2d
: supports_r_f16_tex2d;
@ -330,7 +330,7 @@ bool DeviceInfo::SupportsFloatImage2D(DataType data_type, int channels) const {
}
}
bool DeviceInfo::SupportsExtension(const std::string& extension) const {
bool GpuInfo::SupportsExtension(const std::string& extension) const {
for (const auto& ext : extensions) {
if (ext == extension) {
return true;
@ -339,13 +339,13 @@ bool DeviceInfo::SupportsExtension(const std::string& extension) const {
return false;
}
bool DeviceInfo::IsCL20OrHigher() const {
bool GpuInfo::IsCL20OrHigher() const {
return cl_version != OpenCLVersion::CL_1_0 &&
cl_version != OpenCLVersion::CL_1_1 &&
cl_version != OpenCLVersion::CL_1_2;
}
bool DeviceInfo::SupportsSubGroupWithSize(int sub_group_size) const {
bool GpuInfo::SupportsSubGroupWithSize(int sub_group_size) const {
for (auto subgroup_size : supported_subgroup_sizes) {
if (sub_group_size == subgroup_size) {
return true;
@ -354,19 +354,19 @@ bool DeviceInfo::SupportsSubGroupWithSize(int sub_group_size) const {
return false;
}
bool DeviceInfo::IsAdreno() const { return gpu_vendor == GpuVendor::kQualcomm; }
bool GpuInfo::IsAdreno() const { return gpu_vendor == GpuVendor::kQualcomm; }
bool DeviceInfo::IsApple() const { return gpu_vendor == GpuVendor::kApple; }
bool GpuInfo::IsApple() const { return gpu_vendor == GpuVendor::kApple; }
bool DeviceInfo::IsMali() const { return gpu_vendor == GpuVendor::kMali; }
bool GpuInfo::IsMali() const { return gpu_vendor == GpuVendor::kMali; }
bool DeviceInfo::IsPowerVR() const { return gpu_vendor == GpuVendor::kPowerVR; }
bool GpuInfo::IsPowerVR() const { return gpu_vendor == GpuVendor::kPowerVR; }
bool DeviceInfo::IsNvidia() const { return gpu_vendor == GpuVendor::kNvidia; }
bool GpuInfo::IsNvidia() const { return gpu_vendor == GpuVendor::kNvidia; }
bool DeviceInfo::IsAMD() const { return gpu_vendor == GpuVendor::kAMD; }
bool GpuInfo::IsAMD() const { return gpu_vendor == GpuVendor::kAMD; }
bool DeviceInfo::IsIntel() const { return gpu_vendor == GpuVendor::kIntel; }
bool GpuInfo::IsIntel() const { return gpu_vendor == GpuVendor::kIntel; }
} // namespace cl
} // namespace gpu
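
The renamed predicates keep vendor workarounds reading uniformly; a sketch of the recurring pattern, assuming device_id is an existing cl_device_id:

GpuInfo gpu_info = GpuInfoFromDeviceID(device_id);
if (gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx()) {
  // Adreno 3xx specific path, e.g. the sampler workaround above
} else if (gpu_info.IsMali() && gpu_info.mali_info.IsMidgard()) {
  // Mali Midgard specific path
}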

View File

@ -176,8 +176,8 @@ struct MaliInfo {
bool IsValhall() const;
};
struct DeviceInfo {
DeviceInfo() = default;
struct GpuInfo {
GpuInfo() = default;
bool IsAdreno() const;
bool IsApple() const;

View File

@ -173,7 +173,7 @@ bool Environment::IsSupported(TensorStorageType storage_type) const {
return false;
}
TensorStorageType GetFastestStorageType(const DeviceInfo& gpu_info) {
TensorStorageType GetFastestStorageType(const GpuInfo& gpu_info) {
if (gpu_info.IsAdreno()) {
if (gpu_info.adreno_info.IsAdreno6xxOrHigher()) {
return TensorStorageType::TEXTURE_ARRAY;
@ -203,7 +203,7 @@ TensorStorageType GetFastestStorageType(const DeviceInfo& gpu_info) {
}
TensorStorageType GetStorageTypeWithMinimalMemoryConsumption(
const DeviceInfo& gpu_info) {
const GpuInfo& gpu_info) {
if (gpu_info.IsAdreno()) {
if (gpu_info.adreno_info.IsAdreno3xx() ||
gpu_info.adreno_info.IsAdreno4xx()) {
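
A sketch of consuming these two selectors; low_memory_mode is a hypothetical flag, not part of this change:

// Prefer speed by default; fall back to the low-memory choice on request.
TensorStorageType storage = low_memory_mode
    ? GetStorageTypeWithMinimalMemoryConsumption(gpu_info)
    : GetFastestStorageType(gpu_info);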

View File

@ -75,9 +75,9 @@ class Environment {
ProgramCache program_cache_;
};
TensorStorageType GetFastestStorageType(const DeviceInfo& gpu_info);
TensorStorageType GetFastestStorageType(const GpuInfo& gpu_info);
TensorStorageType GetStorageTypeWithMinimalMemoryConsumption(
const DeviceInfo& gpu_info);
const GpuInfo& gpu_info);
absl::Status CreateEnvironment(Environment* result);

View File

@ -160,7 +160,7 @@ absl::Status InferenceContext::InitFromGraph(
creation_context.queue = env->queue();
creation_context.cache = env->program_cache();
ReserveGraphTensors(create_info, creation_context.GetDeviceInfo(), graph);
ReserveGraphTensors(create_info, creation_context.GetGpuInfo(), graph);
precision_ = create_info.precision;
storage_type_ = create_info.storage_type;
if (env->device().IsMali()) {
@ -174,7 +174,7 @@ absl::Status InferenceContext::InitFromGraph(
need_flush_ = true;
}
CopyInAndOutIds(graph);
RETURN_IF_ERROR(ConvertOperations(creation_context.GetDeviceInfo(), graph,
RETURN_IF_ERROR(ConvertOperations(creation_context.GetGpuInfo(), graph,
create_info.hints));
RETURN_IF_ERROR(Merge());
RETURN_IF_ERROR(AllocateMemory(creation_context.context));
@ -284,7 +284,7 @@ void InferenceContext::CopyInAndOutIds(const GraphFloat32& graph) {
}
void InferenceContext::ReserveGraphTensors(
const CreateInferenceInfo& create_info, const DeviceInfo& device_info,
const CreateInferenceInfo& create_info, const GpuInfo& gpu_info,
const GraphFloat32& graph) {
ValueId max_id = 0;
auto tensors = graph.values();
@ -296,14 +296,14 @@ void InferenceContext::ReserveGraphTensors(
if (graph.IsGraphInput(t->id) || graph.IsGraphOutput(t->id)) {
if (shape.c < 4 &&
CanCreateTensorWithShape(
device_info, shape,
gpu_info, shape,
TensorDescriptor{data_type, TensorStorageType::SINGLE_TEXTURE_2D,
layout})) {
storage_type = TensorStorageType::SINGLE_TEXTURE_2D;
}
}
storage_type = SelectBestStorageType(device_info, shape, storage_type,
data_type, layout);
storage_type =
SelectBestStorageType(gpu_info, shape, storage_type, data_type, layout);
tensor_reserver_.Add(
t->id, {shape, TensorDescriptor{data_type, storage_type, layout}});
max_id = std::max(max_id, t->id);
@ -311,7 +311,7 @@ void InferenceContext::ReserveGraphTensors(
tensor_reserver_.SetNext(max_id + 1);
}
absl::Status InferenceContext::ConvertOperations(const DeviceInfo& device_info,
absl::Status InferenceContext::ConvertOperations(const GpuInfo& gpu_info,
const GraphFloat32& graph,
ModelHints hints) {
std::map<ValueId, TensorDescriptor> tensor_descriptors;
@ -335,7 +335,7 @@ absl::Status InferenceContext::ConvertOperations(const DeviceInfo& device_info,
std::string op_name = node.operation.type + " " + std::to_string(node.id);
GPUOperationsSubgraph gpu_subgraph;
if (hints.Check(ModelHints::kAllowSpecialKernels) &&
GPUSubgraphFromGraph(device_info, precision_, graph, node.id,
GPUSubgraphFromGraph(gpu_info, precision_, graph, node.id,
tensor_descriptors, &consumed_nodes, &gpu_subgraph,
&op_name)
.ok()) {
@ -375,7 +375,7 @@ absl::Status InferenceContext::ConvertOperations(const DeviceInfo& device_info,
op_def.dst_tensors.push_back(
tensor_reserver_.Get(outputs[j]->id).descriptor);
}
RETURN_IF_ERROR(GPUOperationFromNode(device_info, op_def, hints, inputs,
RETURN_IF_ERROR(GPUOperationFromNode(gpu_info, op_def, hints, inputs,
outputs, node, &gpu_subgraph));
}
absl::flat_hash_map<int, ValueId> mapping_to_global_ids;

View File

@ -111,12 +111,11 @@ class InferenceContext {
InferenceContext* inference);
void CopyInAndOutIds(const GraphFloat32& graph);
absl::Status ConvertOperations(const DeviceInfo& device_info,
absl::Status ConvertOperations(const GpuInfo& gpu_info,
const GraphFloat32& graph, ModelHints hints);
void CreateLinks();
void ReserveGraphTensors(const CreateInferenceInfo& create_info,
const DeviceInfo& device_info,
const GraphFloat32& graph);
const GpuInfo& gpu_info, const GraphFloat32& graph);
absl::Status Merge();
absl::Status AllocateMemory(CLContext* context);

View File

@ -48,6 +48,7 @@ cc_library(
":gpu_operation",
"//tensorflow/lite/delegates/gpu/cl:environment",
"//tensorflow/lite/delegates/gpu/cl:opencl_wrapper",
"//tensorflow/lite/delegates/gpu/cl:tensor",
"//tensorflow/lite/delegates/gpu/common:shape",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:tensor",

View File

@ -15,6 +15,7 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {

View File

@ -124,7 +124,7 @@ std::string GetConcatKernelCode(const OperationDef& op_def,
GPUOperation CreateConcatZ(const OperationDef& definition,
const std::vector<int>& channels,
const DeviceInfo& device_info) {
const GpuInfo& gpu_info) {
GPUOperation op(definition);
for (int i = 0; i < definition.src_tensors.size(); ++i) {
const std::string name = "src_tensor_" + std::to_string(i);
@ -140,14 +140,13 @@ GPUOperation CreateConcatZ(const OperationDef& definition,
}
op.AddDstTensor("dst_tensor", dst_desc);
op.code_ = GetConcatKernelCode(definition, channels);
if (device_info.IsPowerVR() &&
if (gpu_info.IsPowerVR() &&
definition.precision == CalculationsPrecision::F32 &&
!IsAllChannelsX4(channels)) {
// BUG, some PowerVRs (GE8320) produce incorrect results without it
op.compiler_options_.push_back(CompilerOptions::CL_OPT_DISABLE);
}
if (device_info.IsAMD() &&
definition.precision != CalculationsPrecision::F32 &&
if (gpu_info.IsAMD() && definition.precision != CalculationsPrecision::F32 &&
definition.src_tensors[0].storage_type != TensorStorageType::BUFFER &&
!IsAllChannelsX4(channels)) {
// BUG, some AMD GPUs crash without it
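
A sketch of creating the op with the renamed parameter; op_def and gpu_info are assumed in scope, and the channel counts are hypothetical:

// One entry per source tensor, giving its channel count.
std::vector<int> channels = {8, 16, 8};
GPUOperation concat = CreateConcatZ(op_def, channels, gpu_info);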

View File

@ -31,7 +31,7 @@ namespace cl {
GPUOperation CreateConcatZ(const OperationDef& definition,
const std::vector<int>& channels,
const DeviceInfo& device_info);
const GpuInfo& gpu_info);
} // namespace cl
} // namespace gpu

View File

@ -79,19 +79,19 @@ std::string GetComputationPart(const int3& block_size, int element_size,
return c;
}
ConvBuffer1x1::ConvParams GetBestParams(const DeviceInfo& device_info,
ConvBuffer1x1::ConvParams GetBestParams(const GpuInfo& gpu_info,
const OperationDef& definition,
const BHWC& shape, int src_depth,
int dst_depth) {
ConvBuffer1x1::ConvParams conv_params;
conv_params.element_size = 4;
conv_params.block_size = int3(1, 1, 1);
if (!device_info.IsMali()) {
if (!gpu_info.IsMali()) {
return conv_params;
}
bool can_use_flt8 = (shape.w * shape.b) % 2 == 0 &&
definition.precision != CalculationsPrecision::F32;
bool is_midgard = device_info.IsMali() && device_info.mali_info.IsMidgard();
bool is_midgard = gpu_info.IsMali() && gpu_info.mali_info.IsMidgard();
if (is_midgard) {
if (can_use_flt8) {
conv_params.element_size = 8;
@ -103,8 +103,8 @@ ConvBuffer1x1::ConvParams GetBestParams(const DeviceInfo& device_info,
}
int task_size = shape.w * shape.b * shape.h * dst_depth;
int block_size = GetRecommendedBlockSizeForConv(
device_info, definition.precision, task_size);
int block_size =
GetRecommendedBlockSizeForConv(gpu_info, definition.precision, task_size);
if (!can_use_flt8 && block_size > 4) {
block_size = 4;
@ -132,15 +132,14 @@ ConvBuffer1x1::ConvParams GetBestParams(const DeviceInfo& device_info,
return conv_params;
}
ConvBuffer1x1::ConvParams GetBestParams(const DeviceInfo& device_info,
ConvBuffer1x1::ConvParams GetBestParams(const GpuInfo& gpu_info,
const OperationDef& definition,
int src_depth, int dst_depth) {
ConvBuffer1x1::ConvParams conv_params;
conv_params.element_size = 4;
conv_params.block_size = int3(1, 1, 1);
if (device_info.IsMali() &&
definition.precision == CalculationsPrecision::F16 &&
device_info.compute_units_count <= 4) {
if (gpu_info.IsMali() && definition.precision == CalculationsPrecision::F16 &&
gpu_info.compute_units_count <= 4) {
conv_params.block_size.x *= 2;
}
return conv_params;
@ -315,9 +314,9 @@ int3 ConvBuffer1x1::GetGridSize() const {
}
void ConvBuffer1x1::GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_,
GetPossibleWorkGroupsConv(tuning_type, gpu_info, kernel_info, grid_size_,
work_groups);
}
@ -344,7 +343,7 @@ bool IsConvBuffer1x1Supported(const OperationDef& definition,
attr.padding.appended.w == 0 && attr.padding.appended.h == 0;
}
ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* shape) {
@ -353,16 +352,16 @@ ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
ConvBuffer1x1::ConvParams conv_params;
if (shape) {
conv_params =
GetBestParams(device_info, definition, *shape, src_depth, dst_depth);
GetBestParams(gpu_info, definition, *shape, src_depth, dst_depth);
} else {
conv_params = GetBestParams(device_info, definition, src_depth, dst_depth);
conv_params = GetBestParams(gpu_info, definition, src_depth, dst_depth);
}
ConvBuffer1x1 result(definition, conv_params);
result.UploadData(attr.weights, attr.bias);
return result;
}
ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
const BHWC* shape) {
@ -371,9 +370,9 @@ ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
ConvBuffer1x1::ConvParams conv_params;
if (shape) {
conv_params =
GetBestParams(device_info, definition, *shape, src_depth, dst_depth);
GetBestParams(gpu_info, definition, *shape, src_depth, dst_depth);
} else {
conv_params = GetBestParams(device_info, definition, src_depth, dst_depth);
conv_params = GetBestParams(gpu_info, definition, src_depth, dst_depth);
}
conv_params.block_size.x *= conv_params.block_size.y;
conv_params.block_size.y = 1;
@ -383,16 +382,16 @@ ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
}
ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC* shape) {
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
ConvBuffer1x1::ConvParams conv_params;
if (shape) {
conv_params =
GetBestParams(device_info, definition, *shape, src_depth, dst_depth);
GetBestParams(gpu_info, definition, *shape, src_depth, dst_depth);
} else {
conv_params = GetBestParams(device_info, definition, src_depth, dst_depth);
conv_params = GetBestParams(gpu_info, definition, src_depth, dst_depth);
}
conv_params.block_size.x *= conv_params.block_size.y;
conv_params.block_size.y = 1;
@ -403,17 +402,17 @@ ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(
}
ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC& weights_shape,
const BHWC* dst_shape) {
const int dst_depth = DivideRoundUp(weights_shape.b, 4);
const int src_depth = DivideRoundUp(weights_shape.c, 4);
ConvBuffer1x1::ConvParams conv_params;
if (dst_shape) {
conv_params = GetBestParams(device_info, definition, *dst_shape, src_depth,
dst_depth);
conv_params =
GetBestParams(gpu_info, definition, *dst_shape, src_depth, dst_depth);
} else {
conv_params = GetBestParams(device_info, definition, src_depth, dst_depth);
conv_params = GetBestParams(gpu_info, definition, src_depth, dst_depth);
}
ConvBuffer1x1 result(definition, conv_params);
result.UploadBiases(attr.bias);
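
Call sites follow the updated tests further below; a sketch of both factory forms, where shape is a hypothetical BHWC describing the workload:

// With a shape, GetBestParams can tune block sizes for the real workload.
ConvBuffer1x1 conv = CreateConvBuffer1x1(gpu_info, op_def, attr, &shape);
// Without one, the conservative defaults of the shapeless overload apply.
ConvBuffer1x1 conv_default = CreateConvBuffer1x1(gpu_info, op_def, attr);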

View File

@ -47,7 +47,7 @@ class ConvBuffer1x1 : public GPUOperation {
ConvBuffer1x1& operator=(const ConvBuffer1x1&) = delete;
void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override;
int3 GetGridSize() const override;
@ -71,19 +71,19 @@ class ConvBuffer1x1 : public GPUOperation {
private:
ConvBuffer1x1(const OperationDef& definition, const ConvParams& conv_params);
friend ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
friend ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* shape);
friend ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
friend ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
const BHWC* shape);
friend ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC* shape);
friend ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC& weights_shape,
const BHWC* dst_shape);
@ -177,23 +177,23 @@ bool IsConvBuffer1x1Supported(const OperationDef& definition,
const BHWC& weights_shape,
const Convolution2DAttributes& attr);
ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* shape = nullptr);
ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
const BHWC* shape = nullptr);
ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC& weights_shape,
const BHWC* dst_shape = nullptr);
ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC* shape = nullptr);
} // namespace cl

View File

@ -57,7 +57,7 @@ TEST_F(OpenCLOperationTest, ConvBuffer1x1SimpleWeights) {
{data_type, TensorStorageType::BUFFER, Layout::HWC});
TensorFloat32 dst_tensor;
ConvBuffer1x1 operation = CreateConvBuffer1x1(
creation_context_.GetDeviceInfo(), op_def, attr, &src_tensor.shape);
creation_context_.GetGpuInfo(), op_def, attr, &src_tensor.shape);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -92,7 +92,7 @@ TEST_F(OpenCLOperationTest, ConvBuffer1x1) {
{data_type, TensorStorageType::BUFFER, Layout::HWC});
TensorFloat32 dst_tensor;
ConvBuffer1x1 operation = CreateConvBuffer1x1(
creation_context_.GetDeviceInfo(), op_def, attr, &src_tensor.shape);
creation_context_.GetGpuInfo(), op_def, attr, &src_tensor.shape);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 4), &dst_tensor));
EXPECT_THAT(dst_tensor.data,

View File

@ -36,7 +36,7 @@ int GetAdrenoOptimalMaxConstantSize(const AdrenoInfo& adreno_info) {
}
}
int GetOptimalMaxConstantSize(const DeviceInfo& info) {
int GetOptimalMaxConstantSize(const GpuInfo& info) {
if (!info.IsAdreno()) {
// In general we do not expect this kernel to be used on non-Adreno GPUs,
// since it is tuned for __constant memory, which gives a big benefit on Adreno
@ -237,11 +237,10 @@ bool IsDotConvBetter(int src_channels, int dst_channels) {
} // namespace
bool IsConvConstantsSupported(const DeviceInfo& device_info,
bool IsConvConstantsSupported(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr) {
if (device_info.IsAMD() &&
definition.precision != CalculationsPrecision::F32 &&
if (gpu_info.IsAMD() && definition.precision != CalculationsPrecision::F32 &&
definition.src_tensors[0].storage_type != TensorStorageType::BUFFER) {
// BUG, some AMD GPUs crash without it
return false;
@ -259,12 +258,12 @@ bool IsConvConstantsSupported(const DeviceInfo& device_info,
? sizeof(float)
: sizeof(half);
const int filters_buffer_size = filters_count * float_size;
const int kConstantMaxSize = GetOptimalMaxConstantSize(device_info);
const int kConstantMaxSize = GetOptimalMaxConstantSize(gpu_info);
const int flt4_registers = DivideRoundUp(w_shape.o, 4);
return filters_buffer_size <= kConstantMaxSize && flt4_registers <= 8;
}
GPUOperation CreateConvConstants(const DeviceInfo& device_info,
GPUOperation CreateConvConstants(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr) {
const bool use_dot_conv =
@ -286,11 +285,11 @@ GPUOperation CreateConvConstants(const DeviceInfo& device_info,
op.code_ = GenerateConvolutionConstantCode(
definition, attr.weights.shape, stride_correction, use_dot_conv, &op);
if (definition.precision == CalculationsPrecision::F16 &&
device_info.IsAdreno() && device_info.adreno_info.IsAdreno3xx()) {
gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx()) {
op.compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
if (definition.precision != CalculationsPrecision::F32 &&
device_info.IsPowerVR()) {
gpu_info.IsPowerVR()) {
// BUG, some PowerVRs (GE8320) produce incorrect results without it
op.compiler_options_.push_back(CompilerOptions::CL_OPT_DISABLE);
}
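
A sketch of the natural pairing of the two functions above; gpu_info, op_def, and attr are assumed in scope:

if (IsConvConstantsSupported(gpu_info, op_def, attr)) {
  GPUOperation conv = CreateConvConstants(gpu_info, op_def, attr);
  // hand conv off to the runtime as usual
}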

View File

@ -152,11 +152,11 @@ void UploadWeightsForConvConstants(const tflite::gpu::Tensor<OHWI, T>& weights,
absl::make_unique<BufferDescriptor>(std::move(desc)));
}
bool IsConvConstantsSupported(const DeviceInfo& device_info,
bool IsConvConstantsSupported(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr);
GPUOperation CreateConvConstants(const DeviceInfo& device_info,
GPUOperation CreateConvConstants(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr);

View File

@ -56,7 +56,7 @@ TEST_F(OpenCLOperationTest, ConvConstantsSimpleWeights) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreateConvConstants(creation_context_.GetDeviceInfo(), op_def, attr);
CreateConvConstants(creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -91,7 +91,7 @@ TEST_F(OpenCLOperationTest, ConvConstants) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreateConvConstants(creation_context_.GetDeviceInfo(), op_def, attr);
CreateConvConstants(creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,

View File

@ -150,35 +150,35 @@ std::string GenerateBlockCoords(const int4& block_size,
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
const Convolution2DAttributes& attr,
const DeviceInfo& device_info, const BHWC* dst_shape)
const GpuInfo& gpu_info, const BHWC* dst_shape)
: GPUOperation(definition),
stride_(attr.strides.w, attr.strides.h, 1, 1),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h, 1, 1),
dilation_(attr.dilations.w, attr.dilations.h, 1, 1),
conv_params_(GuessBestParams(device_info, definition, attr, dst_shape)) {}
conv_params_(GuessBestParams(gpu_info, definition, attr, dst_shape)) {}
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC& weights_shape,
const DeviceInfo& device_info, const BHWC* dst_shape)
const BHWC& weights_shape, const GpuInfo& gpu_info,
const BHWC* dst_shape)
: GPUOperation(definition),
stride_(attr.strides.w, attr.strides.h, 1, 1),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0),
kernel_size_(weights_shape.w, weights_shape.h, 1, 1),
dilation_(attr.dilations.w, attr.dilations.h, 1, 1),
conv_params_(GuessBestParams(device_info, definition, attr, weights_shape,
conv_params_(GuessBestParams(gpu_info, definition, attr, weights_shape,
dst_shape)) {}
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
const FullyConnectedAttributes& attr,
const DeviceInfo& device_info, const BHWC* dst_shape)
const GpuInfo& gpu_info, const BHWC* dst_shape)
: GPUOperation(definition),
stride_(1, 1, 1, 1),
padding_(0, 0, 0, 0),
kernel_size_(1, 1, 1, 1),
dilation_(1, 1, 1, 1),
conv_params_(GuessBestParams(device_info, definition, attr, dst_shape)) {}
conv_params_(GuessBestParams(gpu_info, definition, attr, dst_shape)) {}
ConvPowerVR::ConvPowerVR(const OperationDef& definition)
: GPUOperation(definition),
@ -197,7 +197,7 @@ ConvPowerVR::ConvPowerVR(ConvPowerVR&& operation)
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
const Convolution3DAttributes& attr,
const DeviceInfo& device_info, const BHWDC* dst_shape)
const GpuInfo& gpu_info, const BHWDC* dst_shape)
: GPUOperation(definition),
stride_(attr.strides.w, attr.strides.h, attr.strides.d, 1),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h,
@ -205,7 +205,7 @@ ConvPowerVR::ConvPowerVR(const OperationDef& definition,
kernel_size_(attr.weights.shape.w, attr.weights.shape.h,
attr.weights.shape.d, 1),
dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d, 1),
conv_params_(GuessBestParams(device_info, definition, attr, dst_shape)) {}
conv_params_(GuessBestParams(gpu_info, definition, attr, dst_shape)) {}
ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
if (this != &operation) {
@ -219,19 +219,18 @@ ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
return *this;
}
void ConvPowerVR::GenerateCode(const DeviceInfo& device_info) {
void ConvPowerVR::GenerateCode(const GpuInfo& gpu_info) {
if (conv_params_.linear_spatial) {
grid_dimension_ = 2;
}
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
code_ =
GenerateConv(device_info, definition_, stride_correction, conv_params_);
code_ = GenerateConv(gpu_info, definition_, stride_correction, conv_params_);
if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsPowerVR()) {
gpu_info.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
if (conv_params_.IsPrivateMemBroadcast() && device_info.IsCL20OrHigher()) {
if (conv_params_.IsPrivateMemBroadcast() && gpu_info.IsCL20OrHigher()) {
compiler_options_.push_back(CompilerOptions::CL_2_0);
}
bool kernel_is_trivial =
@ -239,7 +238,7 @@ void ConvPowerVR::GenerateCode(const DeviceInfo& device_info) {
if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
kernel_is_trivial = kernel_is_trivial & conv_params_.z_kernel_is_1;
}
if (device_info.IsAdreno() && device_info.adreno_info.IsAdreno3xx() &&
if (gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx() &&
definition_.precision == CalculationsPrecision::F16 &&
kernel_is_trivial) {
compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
@ -306,7 +305,7 @@ int3 ConvPowerVR::GetGridSize() const {
}
void ConvPowerVR::GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
if (conv_params_.weights_upload_type ==
WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP ||
@ -316,11 +315,11 @@ void ConvPowerVR::GetPossibleKernelWorkGroups(
work_groups->push_back(work_group_size_);
return;
}
GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_,
GetPossibleWorkGroupsConv(tuning_type, gpu_info, kernel_info, grid_size_,
work_groups);
}
std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
std::string ConvPowerVR::GenerateConv(const GpuInfo& gpu_info,
const OperationDef& op_def,
bool stride_correction,
const ConvParams& conv_params) {
@ -446,9 +445,9 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
std::string c = GetCommonDefines(op_def.precision);
if (use_simd_broadcast) {
if (device_info.cl_version == OpenCLVersion::CL_2_0) {
if (gpu_info.cl_version == OpenCLVersion::CL_2_0) {
c += "#pragma OPENCL EXTENSION cl_khr_subgroups : enable\n";
} else if (device_info.SupportsExtension("cl_intel_subgroups")) {
} else if (gpu_info.SupportsExtension("cl_intel_subgroups")) {
c += "#pragma OPENCL EXTENSION cl_intel_subgroups : enable\n";
}
}
@ -459,7 +458,7 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
std::to_string(work_group_size_.y) + ", " +
std::to_string(work_group_size_.z) + ")))\n";
}
if (use_simd_broadcast && device_info.IsIntel()) {
if (use_simd_broadcast && gpu_info.IsIntel()) {
c += "__attribute__((intel_reqd_sub_group_size(" +
std::to_string(simd_size) + ")))\n";
}
@ -714,7 +713,7 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
}
}
};
const bool conditional_read = device_info.IsMali();
const bool conditional_read = gpu_info.IsMali();
auto read_src = [&]() {
const std::string cl_type = ToCLDataType(conv_params.weights_data_type);
for (int z = 0; z < block_size.z; ++z) {
@ -1012,8 +1011,8 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
}
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
const DeviceInfo& device_info, const OperationDef& definition,
int src_depth, int dst_depth, bool x_kernel_is_1, bool y_kernel_is_1,
const GpuInfo& gpu_info, const OperationDef& definition, int src_depth,
int dst_depth, bool x_kernel_is_1, bool y_kernel_is_1,
bool different_weights_for_height, const BHWC* dst_shape) {
ConvParams conv_params;
conv_params.linear_spatial = false;
@ -1022,7 +1021,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
conv_params.x_kernel_is_1 = x_kernel_is_1;
conv_params.y_kernel_is_1 = y_kernel_is_1;
conv_params.different_weights_for_height = different_weights_for_height;
if (device_info.IsNvidia()) {
if (gpu_info.IsNvidia()) {
if (different_weights_for_height) {
work_group_size_ = int3(32, 1, 1);
work_group_launch_order_ = int3(2, 0, 1);
@ -1046,7 +1045,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
if (dst_shape) {
int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth;
float task_size_per_cu =
static_cast<float>(task_size) / device_info.compute_units_count;
static_cast<float>(task_size) / gpu_info.compute_units_count;
int block_size = conv_params.block_size.x * conv_params.block_size.y *
conv_params.block_size.w;
float threads_per_cu = task_size_per_cu / block_size;
@ -1067,7 +1066,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) {
conv_params.src_depth_loop_size = 4;
}
} else if (device_info.IsPowerVR()) {
} else if (gpu_info.IsPowerVR()) {
if (different_weights_for_height) {
work_group_size_ = int3(32, 1, 1);
work_group_launch_order_ = int3(2, 0, 1);
@ -1115,7 +1114,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
}
conv_params.block_size.x = 2;
}
} else if (device_info.IsAMD()) {
} else if (gpu_info.IsAMD()) {
if (different_weights_for_height) {
work_group_size_ = int3(32, 1, 1);
work_group_launch_order_ = int3(2, 0, 1);
@ -1144,12 +1143,12 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
if (src_depth % 2 == 0 && src_depth >= 16) {
conv_params.src_depth_loop_size = 2;
}
} else if (device_info.IsMali()) {
} else if (gpu_info.IsMali()) {
int block_size = 2;
if (dst_shape) {
int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth;
block_size = GetRecommendedBlockSizeForConv(
device_info, definition.precision, task_size);
gpu_info, definition.precision, task_size);
}
if (!x_kernel_is_1 || !y_kernel_is_1) {
block_size = std::min(block_size, 4);
@ -1172,7 +1171,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
conv_params.block_size = int4(1, 1, 1, 1);
}
conv_params.src_depth_loop_size = 1;
MaliInfo mali_info = device_info.mali_info;
MaliInfo mali_info = gpu_info.mali_info;
if (src_depth % 2 == 0 && block_size <= 2 && !mali_info.IsMidgard()) {
conv_params.src_depth_loop_size = 2;
}
@ -1184,9 +1183,9 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
work_group_launch_order_ = int3(0, 1, 2);
conv_params.fixed_work_group_size = false;
conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
} else if (device_info.IsAdreno()) {
} else if (gpu_info.IsAdreno()) {
conv_params.block_size = int4(2, 2, 1, 2);
if (device_info.adreno_info.IsAdreno3xx()) {
if (gpu_info.adreno_info.IsAdreno3xx()) {
if (definition.precision == CalculationsPrecision::F16) {
conv_params.block_size = int4(2, 2, 1, 2);
} else if (definition.precision == CalculationsPrecision::F32_F16) {
@ -1205,7 +1204,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
} else {
conv_params.weights_upload_type = WeightsUploadType::TEXTURES_MEM_X4;
}
} else if (device_info.IsIntel()) {
} else if (gpu_info.IsIntel()) {
if (different_weights_for_height) {
work_group_size_ = int3(16, 1, 1);
work_group_launch_order_ = int3(0, 1, 2);
@ -1220,12 +1219,12 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
conv_params.src_depth_loop_size = 1;
int sub_group_size = 16;
const bool supports_subgroups =
device_info.SupportsExtension("cl_khr_subgroups") ||
device_info.SupportsExtension("cl_intel_subgroups");
gpu_info.SupportsExtension("cl_khr_subgroups") ||
gpu_info.SupportsExtension("cl_intel_subgroups");
if (definition.precision != CalculationsPrecision::F32_F16 &&
supports_subgroups &&
device_info.SupportsExtension("cl_intel_required_subgroup_size") &&
device_info.SupportsSubGroupWithSize(sub_group_size)) {
gpu_info.SupportsExtension("cl_intel_required_subgroup_size") &&
gpu_info.SupportsSubGroupWithSize(sub_group_size)) {
conv_params.weights_upload_type =
WeightsUploadType::PRIVATE_MEM_SIMD_BROADCAST;
conv_params.simd_size = sub_group_size;
@ -1271,7 +1270,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
}
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC* dst_shape) {
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
@ -1283,12 +1282,12 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
attr.dilations.h == 1 &&
attr.padding.prepended.h == 0 &&
attr.padding.appended.h == 0;
return GuessBestParams(device_info, definition, src_depth, dst_depth,
return GuessBestParams(gpu_info, definition, src_depth, dst_depth,
x_kernel_is_1, y_kernel_is_1, false, dst_shape);
}
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const Convolution3DAttributes& attr, const BHWDC* dst_shape) {
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
@ -1312,10 +1311,10 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
shape.h = dst_shape->h * dst_shape->d;
shape.w = dst_shape->w;
shape.c = dst_shape->c;
result = GuessBestParams(device_info, definition, src_depth, dst_depth,
result = GuessBestParams(gpu_info, definition, src_depth, dst_depth,
x_kernel_is_1, y_kernel_is_1, false, &shape);
} else {
result = GuessBestParams(device_info, definition, src_depth, dst_depth,
result = GuessBestParams(gpu_info, definition, src_depth, dst_depth,
x_kernel_is_1, y_kernel_is_1, false, nullptr);
}
result.z_kernel_is_1 = z_kernel_is_1;
@ -1323,7 +1322,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
}
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC& weights_shape,
const BHWC* dst_shape) {
const int dst_depth = DivideRoundUp(weights_shape.b, 4);
@ -1334,18 +1333,17 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
const bool y_kernel_is_1 =
weights_shape.h == 1 && attr.strides.h == 1 && attr.dilations.h == 1 &&
attr.padding.prepended.h == 0 && attr.padding.appended.h == 0;
return GuessBestParams(device_info, definition, src_depth, dst_depth,
return GuessBestParams(gpu_info, definition, src_depth, dst_depth,
x_kernel_is_1, y_kernel_is_1, false, dst_shape);
}
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const FullyConnectedAttributes& attr, const BHWC* dst_shape) {
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
ConvPowerVR::ConvParams params =
GuessBestParams(device_info, definition, src_depth, dst_depth, true, true,
false, dst_shape);
ConvPowerVR::ConvParams params = GuessBestParams(
gpu_info, definition, src_depth, dst_depth, true, true, false, dst_shape);
work_group_size_.x *= work_group_size_.y;
work_group_size_.y = 1;
params.block_size.x *= params.block_size.y;
@ -1354,67 +1352,66 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
}
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParamsWinograd(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC* dst_shape) {
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
ConvPowerVR::ConvParams params =
GuessBestParams(device_info, definition, src_depth, dst_depth, true, true,
true, dst_shape);
ConvPowerVR::ConvParams params = GuessBestParams(
gpu_info, definition, src_depth, dst_depth, true, true, true, dst_shape);
params.block_size.x *= params.block_size.y;
params.block_size.y = 1;
return params;
}
ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* dst_shape) {
ConvPowerVR result(definition, attr, device_info, dst_shape);
result.GenerateCode(device_info);
ConvPowerVR result(definition, attr, gpu_info, dst_shape);
result.GenerateCode(gpu_info);
result.UploadData(attr.weights, attr.bias);
return result;
}
ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
const BHWC* dst_shape) {
ConvPowerVR result(definition, attr, device_info, dst_shape);
result.GenerateCode(device_info);
ConvPowerVR result(definition, attr, gpu_info, dst_shape);
result.GenerateCode(gpu_info);
result.UploadData(attr.weights, attr.bias);
return result;
}
ConvPowerVR CreateConvPowerVRDynamicWeights(const DeviceInfo& device_info,
ConvPowerVR CreateConvPowerVRDynamicWeights(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC& weights_shape,
const BHWC* dst_shape) {
ConvPowerVR result(definition, attr, weights_shape, device_info, dst_shape);
result.GenerateCode(device_info);
ConvPowerVR result(definition, attr, weights_shape, gpu_info, dst_shape);
result.GenerateCode(gpu_info);
result.UploadBias(attr.bias);
return result;
}
ConvPowerVR CreateConvPowerVRWino4x4To6x6(const DeviceInfo& device_info,
ConvPowerVR CreateConvPowerVRWino4x4To6x6(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* dst_shape) {
ConvPowerVR result(definition);
result.conv_params_ =
result.GuessBestParamsWinograd(device_info, definition, attr, dst_shape);
result.GenerateCode(device_info);
result.GuessBestParamsWinograd(gpu_info, definition, attr, dst_shape);
result.GenerateCode(gpu_info);
result.UploadDataForWinograd4x4To6x6(attr.weights);
return result;
}
ConvPowerVR CreateConvPowerVR3D(const DeviceInfo& device_info,
ConvPowerVR CreateConvPowerVR3D(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution3DAttributes& attr,
const BHWDC* dst_shape) {
ConvPowerVR result(definition, attr, device_info, dst_shape);
result.GenerateCode(device_info);
ConvPowerVR result(definition, attr, gpu_info, dst_shape);
result.GenerateCode(gpu_info);
result.UploadWeights(attr.weights);
result.UploadBias(attr.bias);
return result;
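
All of the ConvPowerVR factories now take GpuInfo first; a sketch of the 2D path as exercised by the tests below, with dst_shape a hypothetical BHWC (nullptr is also accepted):

ConvPowerVR conv = CreateConvPowerVR(gpu_info, op_def, attr, &dst_shape);
// Winograd-transformed variant, same calling convention:
ConvPowerVR conv_wino = CreateConvPowerVRWino4x4To6x6(gpu_info, op_def, attr, &dst_shape);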

View File

@ -44,7 +44,7 @@ class ConvPowerVR : public GPUOperation {
public:
ConvPowerVR() = default;
void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override;
absl::Status BindArguments(ArgumentsBinder* args) override;
@ -106,20 +106,20 @@ class ConvPowerVR : public GPUOperation {
};
ConvPowerVR(const OperationDef& definition,
const Convolution2DAttributes& attr,
const DeviceInfo& device_info, const BHWC* dst_shape = nullptr);
const Convolution2DAttributes& attr, const GpuInfo& gpu_info,
const BHWC* dst_shape = nullptr);
ConvPowerVR(const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC& weights_shape,
const DeviceInfo& device_info, const BHWC* dst_shape = nullptr);
const GpuInfo& gpu_info, const BHWC* dst_shape = nullptr);
ConvPowerVR(const OperationDef& definition,
const FullyConnectedAttributes& attr,
const DeviceInfo& device_info, const BHWC* dst_shape = nullptr);
const FullyConnectedAttributes& attr, const GpuInfo& gpu_info,
const BHWC* dst_shape = nullptr);
explicit ConvPowerVR(const OperationDef& definition);
ConvPowerVR(const OperationDef& definition,
const Convolution3DAttributes& attr,
const DeviceInfo& device_info, const BHWDC* dst_shape = nullptr);
const Convolution3DAttributes& attr, const GpuInfo& gpu_info,
const BHWDC* dst_shape = nullptr);
void GenerateCode(const DeviceInfo& device_info);
void GenerateCode(const GpuInfo& gpu_info);
template <DataType T>
void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
@ -137,60 +137,60 @@ class ConvPowerVR : public GPUOperation {
template <DataType T>
void UploadBias(const tflite::gpu::Tensor<Linear, T>& bias);
friend ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
friend ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* dst_shape);
friend ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
friend ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
const BHWC* dst_shape);
friend ConvPowerVR CreateConvPowerVRDynamicWeights(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC& weights_shape,
const BHWC* dst_shape);
friend ConvPowerVR CreateConvPowerVRWino4x4To6x6(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC* dst_shape);
friend ConvPowerVR CreateConvPowerVR3D(const DeviceInfo& device_info,
friend ConvPowerVR CreateConvPowerVR3D(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution3DAttributes& attr,
const BHWDC* dst_shape);
ConvParams GuessBestParams(const DeviceInfo& device_info,
ConvParams GuessBestParams(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* dst_shape = nullptr);
ConvParams GuessBestParams(const DeviceInfo& device_info,
ConvParams GuessBestParams(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC& weights_shape,
const BHWC* dst_shape = nullptr);
ConvParams GuessBestParams(const DeviceInfo& device_info,
ConvParams GuessBestParams(const GpuInfo& gpu_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
const BHWC* dst_shape = nullptr);
ConvParams GuessBestParamsWinograd(const DeviceInfo& device_info,
ConvParams GuessBestParamsWinograd(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* dst_shape = nullptr);
ConvParams GuessBestParams(const DeviceInfo& device_info,
ConvParams GuessBestParams(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution3DAttributes& attr,
const BHWDC* dst_shape = nullptr);
ConvParams GuessBestParams(const DeviceInfo& device_info,
ConvParams GuessBestParams(const GpuInfo& gpu_info,
const OperationDef& definition, int src_depth,
int dst_depth, bool x_kernel_is_1,
bool y_kernel_is_1,
bool different_weights_for_height,
const BHWC* dst_shape = nullptr);
std::string GenerateConv(const DeviceInfo& device_info,
const OperationDef& op_def, bool stride_correction,
std::string GenerateConv(const GpuInfo& gpu_info, const OperationDef& op_def,
bool stride_correction,
const ConvParams& conv_params);
int4 stride_;
@ -372,28 +372,28 @@ void ConvPowerVR::UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights) {
}
}
ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* dst_shape = nullptr);
ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
const BHWC* dst_shape = nullptr);
ConvPowerVR CreateConvPowerVRDynamicWeights(const DeviceInfo& device_info,
ConvPowerVR CreateConvPowerVRDynamicWeights(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC& weights_shape,
const BHWC* dst_shape = nullptr);
ConvPowerVR CreateConvPowerVRWino4x4To6x6(const DeviceInfo& device_info,
ConvPowerVR CreateConvPowerVRWino4x4To6x6(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* dst_shape = nullptr);
ConvPowerVR CreateConvPowerVR3D(const DeviceInfo& device_info,
ConvPowerVR CreateConvPowerVR3D(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution3DAttributes& attr,
const BHWDC* dst_shape = nullptr);

View File

@ -56,7 +56,7 @@ TEST_F(OpenCLOperationTest, ConvPowerVR1x1SimpleWeights) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvPowerVR operation =
CreateConvPowerVR(creation_context_.GetDeviceInfo(), op_def, attr);
CreateConvPowerVR(creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -91,7 +91,7 @@ TEST_F(OpenCLOperationTest, ConvPowerVR1x1) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvPowerVR operation =
CreateConvPowerVR(creation_context_.GetDeviceInfo(), op_def, attr);
CreateConvPowerVR(creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -126,7 +126,7 @@ TEST_F(OpenCLOperationTest, ConvPowerVRSimpleWeights) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvPowerVR operation =
CreateConvPowerVR(creation_context_.GetDeviceInfo(), op_def, attr);
CreateConvPowerVR(creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -161,7 +161,7 @@ TEST_F(OpenCLOperationTest, ConvPowerVR) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvPowerVR operation =
CreateConvPowerVR(creation_context_.GetDeviceInfo(), op_def, attr);
CreateConvPowerVR(creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,

View File

@ -31,14 +31,14 @@ namespace cl {
ConvolutionTransposed::ConvolutionTransposed(
const OperationDef& definition, const ConvolutionTransposedAttributes& attr,
const DeviceInfo& device_info)
const GpuInfo& gpu_info)
: GPUOperation(definition),
stride_(attr.stride.w, attr.stride.h, 1, 1),
block_size_(2, 2, 1, 2) {
const bool weights_are_buffer = device_info.IsMali();
const bool weights_are_buffer = gpu_info.IsMali();
const bool is_f16 = definition.precision == CalculationsPrecision::F16;
if (device_info.IsMali()) {
if (device_info.mali_info.IsMidgard()) {
if (gpu_info.IsMali()) {
if (gpu_info.mali_info.IsMidgard()) {
block_size_ = is_f16 ? int4(2, 1, 1, 2) : int4(2, 1, 1, 1);
} else {
block_size_ = is_f16 ? int4(2, 2, 1, 2) : int4(2, 2, 1, 1);
@ -46,7 +46,7 @@ ConvolutionTransposed::ConvolutionTransposed(
}
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
if (dst_depth == 1 || dst_depth == 3) {
if (!device_info.IsMali()) {
if (!gpu_info.IsMali()) {
block_size_.y *= block_size_.w;
}
block_size_.w = 1;
@ -58,22 +58,21 @@ ConvolutionTransposed::ConvolutionTransposed(
args_.AddInt("padding_y", attr.padding.prepended.h);
args_.AddInt("kernel_size_x", attr.weights.shape.w);
args_.AddInt("kernel_size_y", attr.weights.shape.h);
code_ = GenerateConvolutionTransposedCode(definition_, device_info,
code_ = GenerateConvolutionTransposedCode(definition_, gpu_info,
weights_are_buffer, block_size_);
UploadWeights(attr.weights, weights_are_buffer);
}
ConvolutionTransposed::ConvolutionTransposed(
const OperationDef& definition,
const ConvolutionTransposed3DAttributes& attr,
const DeviceInfo& device_info)
const ConvolutionTransposed3DAttributes& attr, const GpuInfo& gpu_info)
: GPUOperation(definition),
stride_(attr.stride.w, attr.stride.h, attr.stride.d, 1),
block_size_(2, 2, 1, 2) {
const bool weights_are_buffer = device_info.IsMali();
const bool weights_are_buffer = gpu_info.IsMali();
const bool is_f16 = definition.precision == CalculationsPrecision::F16;
if (device_info.IsMali()) {
if (device_info.mali_info.IsMidgard()) {
if (gpu_info.IsMali()) {
if (gpu_info.mali_info.IsMidgard()) {
block_size_ = is_f16 ? int4(2, 1, 1, 2) : int4(2, 1, 1, 1);
} else {
block_size_ = is_f16 ? int4(2, 2, 1, 2) : int4(2, 2, 1, 1);
@ -81,7 +80,7 @@ ConvolutionTransposed::ConvolutionTransposed(
}
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
if (dst_depth == 1 || dst_depth == 3) {
if (!device_info.IsMali()) {
if (!gpu_info.IsMali()) {
block_size_.y *= block_size_.w;
}
block_size_.w = 1;
@ -97,7 +96,7 @@ ConvolutionTransposed::ConvolutionTransposed(
args_.AddInt("kernel_size_y", attr.weights.shape.h);
args_.AddInt("kernel_size_z", attr.weights.shape.d);
args_.AddInt("grid_size_y");
code_ = GenerateConvolutionTransposedCode(definition_, device_info,
code_ = GenerateConvolutionTransposedCode(definition_, gpu_info,
weights_are_buffer, block_size_);
UploadWeights(attr.weights, weights_are_buffer);
}
@ -118,7 +117,7 @@ ConvolutionTransposed& ConvolutionTransposed::operator=(
}
std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
const OperationDef& op_def, const DeviceInfo& device_info,
const OperationDef& op_def, const GpuInfo& gpu_info,
bool weights_are_buffer, const int4& block_size) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetAddressMode(AddressMode::kZero);
@ -398,7 +397,7 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
c += " int x_c = kernel_index * args.src_tensor.Slices();\n";
}
c += " for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n";
const bool conditional_read = device_info.IsMali();
const bool conditional_read = gpu_info.IsMali();
for (int z = 0; z < block_size.z; ++z) {
const std::string zind = std::to_string(z);
for (int y = 0; y < block_size.y; ++y) {
@ -536,16 +535,16 @@ int3 ConvolutionTransposed::GetGridSize() const {
}
void ConvolutionTransposed::GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_,
GetPossibleWorkGroupsConv(tuning_type, gpu_info, kernel_info, grid_size_,
work_groups);
}
ConvolutionTransposed CreateConvolutionTransposed(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr) {
ConvolutionTransposed result(definition, attr, device_info);
ConvolutionTransposed result(definition, attr, gpu_info);
TensorLinearDescriptor desc;
desc.storage_type =
@ -558,9 +557,9 @@ ConvolutionTransposed CreateConvolutionTransposed(
}
ConvolutionTransposed CreateConvolutionTransposed3D(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposed3DAttributes& attr) {
ConvolutionTransposed result(definition, attr, device_info);
ConvolutionTransposed result(definition, attr, gpu_info);
TensorLinearDescriptor desc;
desc.storage_type =
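The constructor above folds several heuristics into block_size_. A standalone sketch of just that logic, with a stand-in Int4 so it compiles outside the tflite/gpu tree (the behavior restates the diff; the types are assumptions):

// Mali Midgard gets a narrower block, F16 doubles the depth (w) dimension
// on Mali, and a dst_depth of 1 or 3 folds the w-block into y on non-Mali.
struct Int4 { int x, y, z, w; };
Int4 PickTransposedBlockSize(bool is_mali, bool is_midgard, bool is_f16,
                             int dst_depth) {
  Int4 block = {2, 2, 1, 2};  // default for non-Mali GPUs
  if (is_mali) {
    block = is_midgard ? Int4{2, 1, 1, is_f16 ? 2 : 1}
                       : Int4{2, 2, 1, is_f16 ? 2 : 1};
  }
  if (dst_depth == 1 || dst_depth == 3) {
    if (!is_mali) block.y *= block.w;
    block.w = 1;
  }
  return block;
}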

View File

@ -41,7 +41,7 @@ class ConvolutionTransposed : public GPUOperation {
public:
ConvolutionTransposed() = default;
void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override;
absl::Status BindArguments(ArgumentsBinder* args) override;
@ -55,17 +55,17 @@ class ConvolutionTransposed : public GPUOperation {
private:
friend ConvolutionTransposed CreateConvolutionTransposed(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
friend ConvolutionTransposed CreateConvolutionTransposed3D(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposed3DAttributes& attr);
ConvolutionTransposed(const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
const DeviceInfo& device_info);
const GpuInfo& gpu_info);
ConvolutionTransposed(const OperationDef& definition,
const ConvolutionTransposed3DAttributes& attr,
const DeviceInfo& device_info);
const GpuInfo& gpu_info);
template <DataType T>
void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
@ -76,7 +76,7 @@ class ConvolutionTransposed : public GPUOperation {
bool weights_are_buffer);
std::string GenerateConvolutionTransposedCode(const OperationDef& op_def,
const DeviceInfo& device_info,
const GpuInfo& gpu_info,
bool weights_are_buffer,
const int4& block_size);
int4 stride_;
@ -206,11 +206,11 @@ void ConvolutionTransposed::UploadWeights(
}
ConvolutionTransposed CreateConvolutionTransposed(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
ConvolutionTransposed CreateConvolutionTransposed3D(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposed3DAttributes& attr);
} // namespace cl

View File

@ -26,15 +26,15 @@ namespace tflite {
namespace gpu {
namespace cl {
ConvolutionTransposed3x3::ConvolutionTransposed3x3(
const OperationDef& definition, const DeviceInfo& device_info, int2 padding)
const OperationDef& definition, const GpuInfo& gpu_info, int2 padding)
: GPUOperation(definition), padding_(padding) {
work_group_size_ = int3(8, 4, 1);
work_group_launch_order_ = int3(2, 0, 1);
if (device_info.IsPowerVR()) {
if (gpu_info.IsPowerVR()) {
weights_upload_type_ = WeightsUploadType::LOCAL_MEM_ASYNC;
} else if (device_info.IsNvidia() || device_info.IsIntel()) {
} else if (gpu_info.IsNvidia() || gpu_info.IsIntel()) {
weights_upload_type_ = WeightsUploadType::LOCAL_MEM_BY_THREADS;
} else if (device_info.IsAMD()) {
} else if (gpu_info.IsAMD()) {
weights_upload_type_ = WeightsUploadType::CONSTANT_MEM;
} else {
weights_upload_type_ = WeightsUploadType::GLOBAL_MEM;
@ -42,7 +42,7 @@ ConvolutionTransposed3x3::ConvolutionTransposed3x3(
code_ = GenerateConvolutionTransposedCode(definition_, weights_upload_type_,
padding_, work_group_launch_order_);
if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsPowerVR()) {
gpu_info.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
}
@ -332,14 +332,14 @@ absl::Status ConvolutionTransposed3x3::BindArguments(ArgumentsBinder* args) {
}
void ConvolutionTransposed3x3::GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
if (weights_upload_type_ == WeightsUploadType::LOCAL_MEM_ASYNC ||
weights_upload_type_ == WeightsUploadType::LOCAL_MEM_BY_THREADS) {
work_groups->push_back(work_group_size_);
return;
}
GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_,
GetPossibleWorkGroupsConv(tuning_type, gpu_info, kernel_info, grid_size_,
work_groups);
}
@ -358,10 +358,10 @@ bool IsConvolutionTransposed3x3Supported(
}
ConvolutionTransposed3x3 CreateConvolutionTransposed3x3(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr) {
const int2 padding = int2(attr.padding.prepended.w, attr.padding.prepended.h);
ConvolutionTransposed3x3 result(definition, device_info, padding);
ConvolutionTransposed3x3 result(definition, gpu_info, padding);
result.UploadWeights(attr.weights);
TensorLinearDescriptor desc;
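The vendor dispatch above is the same pattern the 4x4 variant uses later in this commit. A hedged restatement as a free function (enum values copied from the diff; the Gpu template parameter stands in for tflite::gpu's GpuInfo):

enum class WeightsUploadType {
  LOCAL_MEM_ASYNC,       // PowerVR: async copy into local memory
  LOCAL_MEM_BY_THREADS,  // Nvidia / Intel: threads cooperatively load
  CONSTANT_MEM,          // AMD: constant memory
  GLOBAL_MEM             // everyone else
};
template <typename Gpu>
WeightsUploadType PickWeightsUpload(const Gpu& gpu_info) {
  if (gpu_info.IsPowerVR()) return WeightsUploadType::LOCAL_MEM_ASYNC;
  if (gpu_info.IsNvidia() || gpu_info.IsIntel())
    return WeightsUploadType::LOCAL_MEM_BY_THREADS;
  if (gpu_info.IsAMD()) return WeightsUploadType::CONSTANT_MEM;
  return WeightsUploadType::GLOBAL_MEM;
}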

View File

@ -38,7 +38,7 @@ class ConvolutionTransposed3x3 : public GPUOperation {
public:
ConvolutionTransposed3x3() = default;
void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override;
absl::Status BindArguments(ArgumentsBinder* args) override;
@ -59,9 +59,9 @@ class ConvolutionTransposed3x3 : public GPUOperation {
private:
ConvolutionTransposed3x3(const OperationDef& definition,
const DeviceInfo& device_info, int2 padding);
const GpuInfo& gpu_info, int2 padding);
friend ConvolutionTransposed3x3 CreateConvolutionTransposed3x3(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
template <DataType T>
void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
@ -174,7 +174,7 @@ bool IsConvolutionTransposed3x3Supported(
const ConvolutionTransposedAttributes& attr);
ConvolutionTransposed3x3 CreateConvolutionTransposed3x3(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
} // namespace cl

View File

@ -55,7 +55,7 @@ TEST_F(OpenCLOperationTest, ConvolutionTransposed3x3) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvolutionTransposed3x3 operation = CreateConvolutionTransposed3x3(
creation_context_.GetDeviceInfo(), op_def, attr);
creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,

View File

@ -198,7 +198,7 @@ bool IsConvolutionTransposed3x3ThinSupported(
}
ConvolutionTransposed3x3Thin CreateConvolutionTransposed3x3Thin(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr) {
ConvolutionTransposed3x3Thin result(definition, attr);
result.UploadData(attr.weights, attr.bias);

View File

@ -49,7 +49,7 @@ class ConvolutionTransposed3x3Thin : public GPUOperation {
private:
friend ConvolutionTransposed3x3Thin CreateConvolutionTransposed3x3Thin(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
explicit ConvolutionTransposed3x3Thin(
const OperationDef& definition,
@ -160,7 +160,7 @@ bool IsConvolutionTransposed3x3ThinSupported(
const ConvolutionTransposedAttributes& attr);
ConvolutionTransposed3x3Thin CreateConvolutionTransposed3x3Thin(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
} // namespace cl

View File

@ -55,7 +55,7 @@ TEST_F(OpenCLOperationTest, ConvolutionTransposed3x3ThinSimpleWeights) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvolutionTransposed3x3Thin operation =
CreateConvolutionTransposed3x3Thin(creation_context_.GetDeviceInfo(),
CreateConvolutionTransposed3x3Thin(creation_context_.GetGpuInfo(),
op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
@ -91,7 +91,7 @@ TEST_F(OpenCLOperationTest, ConvolutionTransposed3x3Thin) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvolutionTransposed3x3Thin operation =
CreateConvolutionTransposed3x3Thin(creation_context_.GetDeviceInfo(),
CreateConvolutionTransposed3x3Thin(creation_context_.GetGpuInfo(),
op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));

View File

@ -26,16 +26,16 @@ namespace tflite {
namespace gpu {
namespace cl {
ConvolutionTransposed4x4::ConvolutionTransposed4x4(
const OperationDef& definition, const DeviceInfo& device_info,
const OperationDef& definition, const GpuInfo& gpu_info,
const ConvolutionTransposedAttributes& attr)
: GPUOperation(definition) {
work_group_size_ = int3(8, 4, 1);
WeightsUploadType weights_upload_type = WeightsUploadType::GLOBAL_MEM;
if (device_info.IsPowerVR()) {
if (gpu_info.IsPowerVR()) {
weights_upload_type = WeightsUploadType::LOCAL_MEM_ASYNC;
} else if (device_info.IsNvidia() || device_info.IsIntel()) {
} else if (gpu_info.IsNvidia() || gpu_info.IsIntel()) {
weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
} else if (device_info.IsAMD()) {
} else if (gpu_info.IsAMD()) {
weights_upload_type = WeightsUploadType::CONSTANT_MEM;
} else {
weights_upload_type = WeightsUploadType::GLOBAL_MEM;
@ -44,7 +44,7 @@ ConvolutionTransposed4x4::ConvolutionTransposed4x4(
code_ = GenerateConvolutionTransposedCode(definition_, weights_upload_type);
UploadWeights(attr.weights, weights_upload_type);
if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsPowerVR()) {
gpu_info.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
}
@ -332,9 +332,9 @@ bool IsConvolutionTransposed4x4Supported(
}
ConvolutionTransposed4x4 CreateConvolutionTransposed4x4(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr) {
ConvolutionTransposed4x4 result(definition, device_info, attr);
ConvolutionTransposed4x4 result(definition, gpu_info, attr);
TensorLinearDescriptor desc;
desc.storage_type = LinearStorageType::TEXTURE_2D;

View File

@ -38,7 +38,7 @@ class ConvolutionTransposed4x4 : public GPUOperation {
public:
ConvolutionTransposed4x4() = default;
void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override {
work_groups->push_back(work_group_size_);
@ -61,10 +61,10 @@ class ConvolutionTransposed4x4 : public GPUOperation {
private:
ConvolutionTransposed4x4(const OperationDef& definition,
const DeviceInfo& device_info,
const GpuInfo& gpu_info,
const ConvolutionTransposedAttributes& attr);
friend ConvolutionTransposed4x4 CreateConvolutionTransposed4x4(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
template <DataType T>
void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
@ -161,7 +161,7 @@ bool IsConvolutionTransposed4x4Supported(
const ConvolutionTransposedAttributes& attr);
ConvolutionTransposed4x4 CreateConvolutionTransposed4x4(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
} // namespace cl

View File

@ -56,7 +56,7 @@ TEST_F(OpenCLOperationTest, ConvolutionTransposed4x4) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvolutionTransposed4x4 operation = CreateConvolutionTransposed4x4(
creation_context_.GetDeviceInfo(), op_def, attr);
creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,

View File

@ -56,7 +56,7 @@ TEST_F(OpenCLOperationTest, ConvolutionTransposedSimpleWeights) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvolutionTransposed operation = CreateConvolutionTransposed(
creation_context_.GetDeviceInfo(), op_def, attr);
creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 2), &dst_tensor));
EXPECT_THAT(
@ -94,7 +94,7 @@ TEST_F(OpenCLOperationTest, ConvolutionTransposed) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvolutionTransposed operation = CreateConvolutionTransposed(
creation_context_.GetDeviceInfo(), op_def, attr);
creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
EXPECT_THAT(

View File

@ -28,13 +28,13 @@ namespace cl {
ConvolutionTransposedThin::ConvolutionTransposedThin(
const OperationDef& definition, const ConvolutionTransposedAttributes& attr,
const DeviceInfo& device_info)
const GpuInfo& gpu_info)
: GPUOperation(definition) {
code_ = GenerateConvolutionTransposedCode(
definition_, DivideRoundUp(attr.weights.shape.i, 4), attr.weights.shape.o,
int2(attr.weights.shape.w, attr.weights.shape.h));
if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsAdreno() && device_info.adreno_info.IsAdreno3xx()) {
gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx()) {
compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
}
@ -166,9 +166,9 @@ bool IsConvolutionTransposedThinSupported(
}
ConvolutionTransposedThin CreateConvolutionTransposedThin(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr) {
ConvolutionTransposedThin result(definition, attr, device_info);
ConvolutionTransposedThin result(definition, attr, gpu_info);
result.UploadData(attr.weights, attr.bias);
return result;
}

View File

@ -48,11 +48,11 @@ class ConvolutionTransposedThin : public GPUOperation {
private:
friend ConvolutionTransposedThin CreateConvolutionTransposedThin(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
ConvolutionTransposedThin(const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
const DeviceInfo& device_info);
const GpuInfo& gpu_info);
template <DataType T>
void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases);
@ -141,7 +141,7 @@ bool IsConvolutionTransposedThinSupported(
const ConvolutionTransposedAttributes& attr);
ConvolutionTransposedThin CreateConvolutionTransposedThin(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
} // namespace cl

View File

@ -56,7 +56,7 @@ TEST_F(OpenCLOperationTest, ConvolutionTransposedThinSimpleWeights) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvolutionTransposedThin operation = CreateConvolutionTransposedThin(
creation_context_.GetDeviceInfo(), op_def, attr);
creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 2), &dst_tensor));
EXPECT_THAT(
@ -94,7 +94,7 @@ TEST_F(OpenCLOperationTest, ConvolutionTransposedThin) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvolutionTransposedThin operation = CreateConvolutionTransposedThin(
creation_context_.GetDeviceInfo(), op_def, attr);
creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
EXPECT_THAT(

View File

@ -235,9 +235,9 @@ std::string GenerateDepthwiseConvolutionCode(
} // namespace
GPUOperation CreateDepthwiseConvolution2D(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr) {
bool weights_are_buffer = device_info.IsMali();
bool weights_are_buffer = gpu_info.IsMali();
GPUOperation op(definition);
op.args_.AddInt("kernel_size_x", attr.weights.shape.w);
op.args_.AddInt("stride_x", attr.strides.w);
@ -270,7 +270,7 @@ GPUOperation CreateDepthwiseConvolution2D(
}
GPUOperation CreateDepthwiseConvolution2DDynamicWeights(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr) {
GPUOperation op(definition);
op.args_.AddInt("stride_x", attr.strides.w);
@ -286,8 +286,8 @@ GPUOperation CreateDepthwiseConvolution2DDynamicWeights(
op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;
TensorLinearDescriptor desc;
desc.storage_type = device_info.IsMali() ? LinearStorageType::BUFFER
: LinearStorageType::TEXTURE_2D;
desc.storage_type = gpu_info.IsMali() ? LinearStorageType::BUFFER
: LinearStorageType::TEXTURE_2D;
desc.element_type = definition.GetDataType();
desc.UploadLinearData(attr.bias);
op.args_.AddObject(
@ -296,9 +296,9 @@ GPUOperation CreateDepthwiseConvolution2DDynamicWeights(
}
GPUOperation CreateDepthwiseConvolution3D(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const DepthwiseConvolution3DAttributes& attr) {
bool weights_are_buffer = device_info.IsMali();
bool weights_are_buffer = gpu_info.IsMali();
GPUOperation op(definition);
op.args_.AddInt("kernel_size_x", attr.weights.shape.w);
op.args_.AddInt("stride_x", attr.strides.w);
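Both depthwise factories above key a single layout decision off gpu_info.IsMali(): weights (and the bias in the dynamic-weights variant) live in buffers on Mali and in textures elsewhere. A compact sketch of that choice (stand-in enum; the real types are in tflite/gpu):

enum class LinearStorageType { BUFFER, TEXTURE_2D };
template <typename Gpu>
LinearStorageType PickBiasStorage(const Gpu& gpu_info) {
  // Matches the desc.storage_type choice in the hunk above.
  return gpu_info.IsMali() ? LinearStorageType::BUFFER
                           : LinearStorageType::TEXTURE_2D;
}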

View File

@ -183,15 +183,15 @@ void UploadWeightsForDWConv3D(const tflite::gpu::Tensor<OHWDI, T>& weights,
}
GPUOperation CreateDepthwiseConvolution2D(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr);
GPUOperation CreateDepthwiseConvolution2DDynamicWeights(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr);
GPUOperation CreateDepthwiseConvolution3D(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const DepthwiseConvolution3DAttributes& attr);
} // namespace cl

View File

@ -29,15 +29,14 @@ namespace cl {
DepthwiseConv3x3::DepthwiseConv3x3(const OperationDef& definition,
bool weights_are_buffer,
bool local_mem_uploads,
const DeviceInfo& device_info)
: GPUOperation(definition),
local_mem_uploads_(local_mem_uploads) {
const GpuInfo& gpu_info)
: GPUOperation(definition), local_mem_uploads_(local_mem_uploads) {
work_group_size_ = int3(8, 4, 1);
code_ = GenerateDepthwiseConvCode(definition_, weights_are_buffer,
local_mem_uploads_);
if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsPowerVR()) {
gpu_info.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
}
@ -293,12 +292,12 @@ int3 DepthwiseConv3x3::GetGridSize() const {
}
void DepthwiseConv3x3::GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
if (local_mem_uploads_) {
work_groups->push_back(work_group_size_);
} else {
GetPossibleWorkGroups(tuning_type, device_info, kernel_info, grid_size_,
GetPossibleWorkGroups(tuning_type, gpu_info, kernel_info, grid_size_,
work_groups);
}
}
@ -313,12 +312,12 @@ bool IsDepthwiseConv3x3Supported(const DepthwiseConvolution2DAttributes& attr) {
}
DepthwiseConv3x3 CreateDepthwiseConv3x3(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr) {
bool weights_are_buffer = device_info.IsPowerVR() || device_info.IsMali();
bool local_mem_uploads = weights_are_buffer && device_info.IsPowerVR();
bool weights_are_buffer = gpu_info.IsPowerVR() || gpu_info.IsMali();
bool local_mem_uploads = weights_are_buffer && gpu_info.IsPowerVR();
DepthwiseConv3x3 result(definition, weights_are_buffer, local_mem_uploads,
device_info);
gpu_info);
result.UploadWeightsAndBiases(attr.weights, attr.bias, weights_are_buffer);
return result;
}
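CreateDepthwiseConv3x3 above derives two booleans from the GPU vendor. A small truth-table sketch (names mirror the diff; Gpu is a stand-in type):

struct DwConv3x3Config {
  bool weights_are_buffer;  // PowerVR or Mali
  bool local_mem_uploads;   // only PowerVR, and only with buffer weights
};
template <typename Gpu>
DwConv3x3Config PickDwConv3x3Config(const Gpu& gpu_info) {
  const bool buf = gpu_info.IsPowerVR() || gpu_info.IsMali();
  return {buf, buf && gpu_info.IsPowerVR()};
}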

View File

@ -39,7 +39,7 @@ class DepthwiseConv3x3 : public GPUOperation {
public:
DepthwiseConv3x3() = default;
void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override;
int3 GetGridSize() const override;
@ -53,14 +53,14 @@ class DepthwiseConv3x3 : public GPUOperation {
private:
explicit DepthwiseConv3x3(const OperationDef& definition,
bool weights_are_buffer, bool local_mem_uploads,
const DeviceInfo& device_info);
const GpuInfo& gpu_info);
template <DataType T>
void UploadWeightsAndBiases(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases,
bool weights_are_buffer);
friend DepthwiseConv3x3 CreateDepthwiseConv3x3(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr);
template <DataType S, typename T>
@ -151,7 +151,7 @@ void DepthwiseConv3x3::RearrangeWeightsAndBiasesData(
bool IsDepthwiseConv3x3Supported(const DepthwiseConvolution2DAttributes& attr);
DepthwiseConv3x3 CreateDepthwiseConv3x3(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr);
} // namespace cl

View File

@ -56,8 +56,8 @@ TEST_F(OpenCLOperationTest, DepthwiseConv3x3SimpleWeights) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
DepthwiseConv3x3 operation = CreateDepthwiseConv3x3(
creation_context_.GetDeviceInfo(), op_def, attr);
DepthwiseConv3x3 operation =
CreateDepthwiseConv3x3(creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -92,8 +92,8 @@ TEST_F(OpenCLOperationTest, DepthwiseConv3x3) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
DepthwiseConv3x3 operation = CreateDepthwiseConv3x3(
creation_context_.GetDeviceInfo(), op_def, attr);
DepthwiseConv3x3 operation =
CreateDepthwiseConv3x3(creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,

View File

@ -56,7 +56,7 @@ TEST_F(OpenCLOperationTest, DepthwiseConvSimpleWeights) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation = CreateDepthwiseConvolution2D(
creation_context_.GetDeviceInfo(), op_def, attr);
creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -91,7 +91,7 @@ TEST_F(OpenCLOperationTest, DepthwiseConvNoMultiplier) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation = CreateDepthwiseConvolution2D(
creation_context_.GetDeviceInfo(), op_def, attr);
creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -127,7 +127,7 @@ TEST_F(OpenCLOperationTest, DepthwiseConvMultiplier2) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation = CreateDepthwiseConvolution2D(
creation_context_.GetDeviceInfo(), op_def, attr);
creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 4), &dst_tensor));
EXPECT_THAT(

View File

@ -197,14 +197,14 @@ GPUOperation CreateElementwiseOneRuntimeOneScalar(
// Creates a simple two-input operation (the first input is a runtime tensor
// and the second is a constant linear tensor), for example sub, div, etc.
GPUOperation CreateElementwiseTwoInput(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const OperationType& op_type,
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& constant_tensor,
bool swap_inputs) {
const BHWC shape = BHWC(1, 1, 1, constant_tensor.shape.v);
TensorStorageType storage_type = SelectBestStorageType(
device_info, shape, definition.GetPrimaryStorageType(),
definition.GetDataType(), Layout::HWC);
TensorStorageType storage_type =
SelectBestStorageType(gpu_info, shape, definition.GetPrimaryStorageType(),
definition.GetDataType(), Layout::HWC);
TensorDescriptor desc{definition.GetDataType(), storage_type, Layout::HWC};
desc.UploadData(constant_tensor);
@ -228,15 +228,15 @@ GPUOperation CreateElementwiseTwoInput(
// Creates a simple two-input operation (the first input is a runtime tensor
// and the second is a constant HWC tensor), for example sub, div, etc.
GPUOperation CreateElementwiseTwoInput(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const OperationType& op_type,
const tflite::gpu::Tensor<HWC, DataType::FLOAT32>& constant_tensor,
bool swap_inputs) {
const BHWC shape = BHWC(1, constant_tensor.shape.h, constant_tensor.shape.w,
constant_tensor.shape.c);
TensorStorageType storage_type = SelectBestStorageType(
device_info, shape, definition.GetPrimaryStorageType(),
definition.GetDataType(), Layout::HWC);
TensorStorageType storage_type =
SelectBestStorageType(gpu_info, shape, definition.GetPrimaryStorageType(),
definition.GetDataType(), Layout::HWC);
TensorDescriptor desc{definition.GetDataType(), storage_type, Layout::HWC};
desc.UploadData(constant_tensor);
@ -270,7 +270,7 @@ GPUOperation CreateElementwiseOneInput(const OperationDef& definition,
return op;
}
GPUOperation CreateElementwise(const DeviceInfo& device_info,
GPUOperation CreateElementwise(const GpuInfo& gpu_info,
const OperationDef& definition,
const OperationType& op_type,
const ElementwiseAttributes& attr) {
@ -284,12 +284,11 @@ GPUOperation CreateElementwise(const DeviceInfo& device_info,
return CreateElementwiseOneRuntimeOneScalar(definition, op_type, *scalar,
attr.runtime_tensor_is_second);
} else if (linear_tensor) {
return CreateElementwiseTwoInput(device_info, definition, op_type,
return CreateElementwiseTwoInput(gpu_info, definition, op_type,
*linear_tensor,
attr.runtime_tensor_is_second);
} else if (hwc_tensor) {
return CreateElementwiseTwoInput(device_info, definition, op_type,
*hwc_tensor,
return CreateElementwiseTwoInput(gpu_info, definition, op_type, *hwc_tensor,
attr.runtime_tensor_is_second);
} else {
return GPUOperation(definition);
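CreateElementwise above dispatches on which kind of second operand the attributes carry. A self-contained sketch of that shape using std::variant in place of the real attribute fields (the actual code uses absl::get_if on ElementwiseAttributes, as the diff shows):

#include <variant>

struct Scalar {};        // one runtime tensor + one scalar
struct LinearTensor {};  // constant Linear tensor, uploaded to best storage
struct HwcTensor {};     // constant HWC tensor, uploaded to best storage
using SecondOperand =
    std::variant<std::monostate, Scalar, LinearTensor, HwcTensor>;

enum class ElementwiseKind { kOneScalar, kTwoInputLinear, kTwoInputHwc, kNoOp };

ElementwiseKind Classify(const SecondOperand& operand) {
  if (std::holds_alternative<Scalar>(operand))
    return ElementwiseKind::kOneScalar;
  if (std::holds_alternative<LinearTensor>(operand))
    return ElementwiseKind::kTwoInputLinear;
  if (std::holds_alternative<HwcTensor>(operand))
    return ElementwiseKind::kTwoInputHwc;
  return ElementwiseKind::kNoOp;  // falls back to plain GPUOperation(definition)
}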

View File

@ -33,7 +33,7 @@ GPUOperation CreateElementwiseOneInput(const OperationDef& definition,
// Creates a simple two-input operation (the first input is a runtime tensor
// and the second is a constant scalar or a linear/HWC tensor), for example
// sub, div, etc.
GPUOperation CreateElementwise(const DeviceInfo& device_info,
GPUOperation CreateElementwise(const GpuInfo& gpu_info,
const OperationDef& definition,
const OperationType& op_type,
const ElementwiseAttributes& attr);

View File

@ -570,9 +570,8 @@ TEST_F(OpenCLOperationTest, MaximumWithScalar) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreateElementwise(creation_context_.GetDeviceInfo(), op_def,
OperationType::MAXIMUM, attr);
GPUOperation operation = CreateElementwise(
creation_context_.GetGpuInfo(), op_def, OperationType::MAXIMUM, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
BHWC(1, 4, 1, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -601,9 +600,8 @@ TEST_F(OpenCLOperationTest, MaximumWithConstantLinearTensor) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreateElementwise(creation_context_.GetDeviceInfo(), op_def,
OperationType::MAXIMUM, attr);
GPUOperation operation = CreateElementwise(
creation_context_.GetGpuInfo(), op_def, OperationType::MAXIMUM, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -632,9 +630,8 @@ TEST_F(OpenCLOperationTest, MaximumWithConstantHWCTensor) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreateElementwise(creation_context_.GetDeviceInfo(), op_def,
OperationType::MAXIMUM, attr);
GPUOperation operation = CreateElementwise(
creation_context_.GetGpuInfo(), op_def, OperationType::MAXIMUM, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -662,9 +659,8 @@ TEST_F(OpenCLOperationTest, MaximumWithConstantHWCTensorBroadcastChannels) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreateElementwise(creation_context_.GetDeviceInfo(), op_def,
OperationType::MAXIMUM, attr);
GPUOperation operation = CreateElementwise(
creation_context_.GetGpuInfo(), op_def, OperationType::MAXIMUM, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -718,9 +714,8 @@ TEST_F(OpenCLOperationTest, MinimumWithScalar) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreateElementwise(creation_context_.GetDeviceInfo(), op_def,
OperationType::MINIMUM, attr);
GPUOperation operation = CreateElementwise(
creation_context_.GetGpuInfo(), op_def, OperationType::MINIMUM, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
BHWC(1, 4, 1, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -832,7 +827,7 @@ TEST_F(OpenCLOperationTest, SubWithScalarAtFirstPosition) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation = CreateElementwise(
creation_context_.GetDeviceInfo(), op_def, OperationType::SUB, attr);
creation_context_.GetGpuInfo(), op_def, OperationType::SUB, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
BHWC(1, 4, 1, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -887,7 +882,7 @@ TEST_F(OpenCLOperationTest, LessEqual) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreateElementwise(creation_context_.GetDeviceInfo(), op_def,
CreateElementwise(creation_context_.GetGpuInfo(), op_def,
OperationType::LESS_EQUAL, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
@ -914,9 +909,8 @@ TEST_F(OpenCLOperationTest, Greater) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreateElementwise(creation_context_.GetDeviceInfo(), op_def,
OperationType::GREATER, attr);
GPUOperation operation = CreateElementwise(
creation_context_.GetGpuInfo(), op_def, OperationType::GREATER, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -943,7 +937,7 @@ TEST_F(OpenCLOperationTest, GreaterEqual) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreateElementwise(creation_context_.GetDeviceInfo(), op_def,
CreateElementwise(creation_context_.GetGpuInfo(), op_def,
OperationType::GREATER_EQUAL, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
@ -970,9 +964,8 @@ TEST_F(OpenCLOperationTest, Equal) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreateElementwise(creation_context_.GetDeviceInfo(), op_def,
OperationType::EQUAL, attr);
GPUOperation operation = CreateElementwise(
creation_context_.GetGpuInfo(), op_def, OperationType::EQUAL, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -999,7 +992,7 @@ TEST_F(OpenCLOperationTest, NotEqual) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreateElementwise(creation_context_.GetDeviceInfo(), op_def,
CreateElementwise(creation_context_.GetGpuInfo(), op_def,
OperationType::NOT_EQUAL, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));

View File

@ -32,32 +32,32 @@ namespace tflite {
namespace gpu {
namespace cl {
namespace {
bool UseBufferForWeights(const DeviceInfo& device_info) {
return device_info.IsAdreno() || device_info.IsAMD() || device_info.IsMali();
bool UseBufferForWeights(const GpuInfo& gpu_info) {
return gpu_info.IsAdreno() || gpu_info.IsAMD() || gpu_info.IsMali();
}
} // namespace
FullyConnected::FullyConnected(const OperationDef& definition,
const DeviceInfo& device_info)
const GpuInfo& gpu_info)
: GPUOperation(definition) {
if (device_info.IsAdreno()) {
if (device_info.adreno_info.IsAdreno3xx()) {
if (gpu_info.IsAdreno()) {
if (gpu_info.adreno_info.IsAdreno3xx()) {
work_group_size_ = int3(16, 4, 1);
} else if (device_info.adreno_info.IsAdreno4xx()) {
} else if (gpu_info.adreno_info.IsAdreno4xx()) {
work_group_size_ = int3(32, 4, 1);
} else {
work_group_size_ = int3(32, 4, 1);
}
} else if (device_info.IsIntel()) {
} else if (gpu_info.IsIntel()) {
work_group_size_ = int3(8, 4, 1);
} else if (device_info.IsNvidia()) {
} else if (gpu_info.IsNvidia()) {
work_group_size_ = int3(8, 4, 1);
} else if (device_info.IsPowerVR()) {
} else if (gpu_info.IsPowerVR()) {
work_group_size_ = int3(8, 4, 1);
} else {
work_group_size_ = int3(16, 4, 1);
}
code_ = GetFullyConnectedKernelCode(definition_, device_info);
code_ = GetFullyConnectedKernelCode(definition_, gpu_info);
}
FullyConnected::FullyConnected(FullyConnected&& kernel)
@ -77,11 +77,11 @@ FullyConnected& FullyConnected::operator=(FullyConnected&& kernel) {
// optimized shaders
std::string FullyConnected::GetFullyConnectedKernelCode(
const OperationDef& op_def, const DeviceInfo& device_info) {
const OperationDef& op_def, const GpuInfo& gpu_info) {
AddSrcTensor("src_tensor", op_def.src_tensors[0]);
AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
const bool weights_are_buffer = UseBufferForWeights(device_info);
const bool weights_are_buffer = UseBufferForWeights(gpu_info);
std::string c = GetCommonDefines(op_def.precision);
switch (op_def.precision) {
@ -150,11 +150,11 @@ int3 FullyConnected::GetGridSize() const {
return int3(dst_[0]->Slices(), 1, 1);
}
FullyConnected CreateFullyConnected(const DeviceInfo& device_info,
FullyConnected CreateFullyConnected(const GpuInfo& gpu_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr) {
FullyConnected result(definition, device_info);
result.UploadWeights(attr.weights, UseBufferForWeights(device_info));
FullyConnected result(definition, gpu_info);
result.UploadWeights(attr.weights, UseBufferForWeights(gpu_info));
TensorLinearDescriptor desc;
desc.storage_type = LinearStorageType::TEXTURE_2D;
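FullyConnected's constructor above (like FCFCAdd later in this commit) picks a work-group size purely from the vendor. A hedged restatement (Int3 is a stand-in for the tflite/gpu type):

struct Int3 { int x, y, z; };
template <typename Gpu>
Int3 PickFullyConnectedWorkGroup(const Gpu& gpu_info) {
  if (gpu_info.IsAdreno()) {
    // 16x4 on Adreno 3xx, 32x4 on 4xx and newer.
    return gpu_info.adreno_info.IsAdreno3xx() ? Int3{16, 4, 1}
                                              : Int3{32, 4, 1};
  }
  if (gpu_info.IsIntel() || gpu_info.IsNvidia() || gpu_info.IsPowerVR()) {
    return Int3{8, 4, 1};
  }
  return Int3{16, 4, 1};  // default
}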

View File

@ -121,7 +121,7 @@ class FullyConnected : public GPUOperation {
public:
FullyConnected() = default;
void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override {
work_groups->push_back(work_group_size_);
@ -135,9 +135,9 @@ class FullyConnected : public GPUOperation {
FullyConnected& operator=(const FullyConnected&) = delete;
private:
FullyConnected(const OperationDef& definition, const DeviceInfo& device_info);
FullyConnected(const OperationDef& definition, const GpuInfo& gpu_info);
friend FullyConnected CreateFullyConnected(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const FullyConnectedAttributes& attr);
template <DataType T>
@ -145,7 +145,7 @@ class FullyConnected : public GPUOperation {
bool weights_are_buffer);
std::string GetFullyConnectedKernelCode(const OperationDef& op_def,
const DeviceInfo& device_info);
const GpuInfo& gpu_info);
};
template <DataType T>
@ -195,7 +195,7 @@ void FullyConnected::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
}
}
FullyConnected CreateFullyConnected(const DeviceInfo& device_info,
FullyConnected CreateFullyConnected(const GpuInfo& gpu_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr);

View File

@ -58,7 +58,7 @@ TEST_F(OpenCLOperationTest, FullyConnected) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
FullyConnected operation =
CreateFullyConnected(creation_context_.GetDeviceInfo(), op_def, attr);
CreateFullyConnected(creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 1, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {14.5f, 37.5f}))
@ -102,7 +102,7 @@ TEST_F(OpenCLOperationTest, FullyConnectedLarge) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
FullyConnected operation =
CreateFullyConnected(creation_context_.GetDeviceInfo(), op_def, attr);
CreateFullyConnected(creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 1, 1, 12), &dst_tensor));
EXPECT_THAT(
@ -151,7 +151,7 @@ TEST_F(OpenCLOperationTest, FullyConnectedExtraLarge) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
FullyConnected operation =
CreateFullyConnected(creation_context_.GetDeviceInfo(), op_def, attr);
CreateFullyConnected(creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 1, 1, kOutputSize), &dst_tensor));
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), expected))

View File

@ -236,7 +236,7 @@ absl::Status GPUOperation::UpdateParams() {
return absl::OkStatus();
}
absl::Status GPUOperation::AssembleCode(const DeviceInfo& device_info,
absl::Status GPUOperation::AssembleCode(const GpuInfo& gpu_info,
CLContext* context) {
if (elementwise_) {
auto src_desc =
@ -258,14 +258,13 @@ absl::Status GPUOperation::AssembleCode(const DeviceInfo& device_info,
elementwise_code_ = "{\n" + code_ + "\n}\n" + elementwise_code_;
code_ = GetElementWiseCode(definition_, check_src_channels_size_);
}
return cl_args_.Init(device_info,
{{dst_tensors_names_[0], elementwise_code_}}, context,
&args_, &code_);
return cl_args_.Init(gpu_info, {{dst_tensors_names_[0], elementwise_code_}},
context, &args_, &code_);
}
absl::Status GPUOperation::Compile(const CreationContext& creation_context) {
RETURN_IF_ERROR(
AssembleCode(creation_context.GetDeviceInfo(), creation_context.context));
AssembleCode(creation_context.GetGpuInfo(), creation_context.context));
RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
code_, "main_function", compiler_options_, *creation_context.context,
*creation_context.device, &kernel_));
@ -274,7 +273,7 @@ absl::Status GPUOperation::Compile(const CreationContext& creation_context) {
absl::Status GPUOperation::CompileDeserialized(
const CreationContext& creation_context) {
RETURN_IF_ERROR(cl_args_.Init(creation_context.GetDeviceInfo(), &args_,
RETURN_IF_ERROR(cl_args_.Init(creation_context.GetGpuInfo(), &args_,
creation_context.context));
return creation_context.cache->GetOrCreateCLKernel(
code_, "main_function", compiler_options_, *creation_context.context,
@ -282,9 +281,9 @@ absl::Status GPUOperation::CompileDeserialized(
}
void GPUOperation::GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
GetPossibleWorkGroups(tuning_type, device_info, kernel_info, grid_size_,
GetPossibleWorkGroups(tuning_type, gpu_info, kernel_info, grid_size_,
work_groups);
}
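The elementwise branch of AssembleCode above wraps the op body in braces, prepends it to any previously linked code, and hands the result to CLArguments keyed by the first destination tensor. A string-level sketch of just that step (the helper name is hypothetical):

#include <map>
#include <string>

std::map<std::string, std::string> BuildLinkables(
    const std::string& dst_name, const std::string& op_body,
    const std::string& linked_tail) {
  // Mirrors: elementwise_code_ = "{\n" + code_ + "\n}\n" + elementwise_code_;
  return {{dst_name, "{\n" + op_body + "\n}\n" + linked_tail}};
}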

View File

@ -74,7 +74,7 @@ struct CreationContext {
CLCommandQueue* queue;
ProgramCache* cache;
const DeviceInfo& GetDeviceInfo() const { return device->info_; }
const GpuInfo& GetGpuInfo() const { return device->info_; }
};
struct OperationDef {
@ -126,18 +126,18 @@ class GPUOperation {
}
virtual void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info, std::vector<int3>* work_groups) const;
absl::Status Tune(const TuningParameters& params);
absl::Status AssembleCode(const DeviceInfo& device_info, CLContext* context);
absl::Status AssembleCode(const GpuInfo& gpu_info, CLContext* context);
absl::Status Compile(const CreationContext& creation_context);
absl::Status CompileDeserialized(const CreationContext& creation_context);
virtual absl::Status PostCompileCheck(const DeviceInfo& device_info,
virtual absl::Status PostCompileCheck(const GpuInfo& gpu_info,
const KernelInfo& kernel_info) {
return absl::OkStatus();
}

View File

@ -24,8 +24,7 @@ namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::string GetLSTMCode(const OperationDef& op_def,
const DeviceInfo& device_info) {
std::string GetLSTMCode(const OperationDef& op_def, const GpuInfo& gpu_info) {
std::string c = GetCommonDefines(op_def.precision);
c += "__kernel void main_function(\n";
c += "$0) {\n";
@ -39,8 +38,7 @@ std::string GetLSTMCode(const OperationDef& op_def,
c += " FLT4 r1 = args.intermediate.Read(0, 0, Z + state_stride, B);\n";
c += " FLT4 r2 = args.intermediate.Read(0, 0, Z + state_stride * 2, B);\n";
c += " FLT4 r3 = args.intermediate.Read(0, 0, Z + state_stride * 3, B);\n";
if (op_def.precision != CalculationsPrecision::F32 &&
device_info.IsAdreno()) {
if (op_def.precision != CalculationsPrecision::F32 && gpu_info.IsAdreno()) {
c += " FLT4 input_gate;\n";
c += " FLT4 new_input;\n";
c += " FLT4 forget_gate;\n";
@ -88,13 +86,13 @@ std::string GetLSTMCode(const OperationDef& op_def,
} // namespace
GPUOperation CreateLSTM(const OperationDef& definition,
const DeviceInfo& device_info) {
const GpuInfo& gpu_info) {
GPUOperation op(definition);
op.AddSrcTensor("intermediate", definition.src_tensors[0]);
op.AddSrcTensor("prev_state", definition.src_tensors[1]);
op.AddDstTensor("new_state", definition.dst_tensors[0]);
op.AddDstTensor("activation", definition.dst_tensors[1]);
op.code_ = GetLSTMCode(definition, device_info);
op.code_ = GetLSTMCode(definition, gpu_info);
op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;
return op;
}

View File

@ -26,7 +26,7 @@ namespace gpu {
namespace cl {
GPUOperation CreateLSTM(const OperationDef& definition,
const DeviceInfo& device_info);
const GpuInfo& gpu_info);
} // namespace cl
} // namespace gpu

View File

@ -26,19 +26,19 @@ namespace tflite {
namespace gpu {
namespace cl {
Mean::Mean(const OperationDef& definition, const DeviceInfo& device_info)
Mean::Mean(const OperationDef& definition, const GpuInfo& gpu_info)
: GPUOperation(definition) {
// Work-group size constraints:
// (x * y) % 4 == 0
// z == 1
work_group_size_ = int3(16, 16, 1);
if (device_info.IsAdreno()) {
if (device_info.adreno_info.IsAdreno3xx()) {
if (gpu_info.IsAdreno()) {
if (gpu_info.adreno_info.IsAdreno3xx()) {
work_group_size_ = int3(16, 8, 1);
}
}
if (device_info.IsMali()) {
const MaliInfo& mali_info = device_info.mali_info;
if (gpu_info.IsMali()) {
const MaliInfo& mali_info = gpu_info.mali_info;
if (mali_info.IsMaliT6xx() || mali_info.IsMaliT7xx() ||
mali_info.IsMaliT8xx()) {
work_group_size_ = int3(8, 4, 1);
@ -135,8 +135,8 @@ int3 Mean::GetGridSize() const {
return int3(grid_x, grid_y, grid_z);
}
Mean CreateMean(const OperationDef& definition, const DeviceInfo& device_info) {
return Mean(definition, device_info);
Mean CreateMean(const OperationDef& definition, const GpuInfo& gpu_info) {
return Mean(definition, gpu_info);
}
} // namespace cl
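The comment in Mean's constructor pins two invariants on the work group, and every size chosen above respects them. A tiny checker that makes the contract explicit (hypothetical helper):

bool IsValidMeanWorkGroup(int x, int y, int z) {
  return z == 1 && (x * y) % 4 == 0;
}
// Holds for all sizes picked above: 16x16x1 (default), 16x8x1 (Adreno 3xx),
// and 8x4x1 (Mali T6xx/T7xx/T8xx).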

View File

@ -28,10 +28,10 @@ namespace cl {
class Mean : public GPUOperation {
public:
Mean() = default;
Mean(const OperationDef& definition, const DeviceInfo& device_info);
Mean(const OperationDef& definition, const GpuInfo& gpu_info);
void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override {
work_groups->push_back(work_group_size_);
@ -50,7 +50,7 @@ class Mean : public GPUOperation {
const int3& work_group_size);
};
Mean CreateMean(const OperationDef& definition, const DeviceInfo& device_info);
Mean CreateMean(const OperationDef& definition, const GpuInfo& gpu_info);
} // namespace cl
} // namespace gpu

View File

@ -86,7 +86,7 @@ float4 filter_outside_tensor(float4 x, int num_channels, int slice) {
} // namespace
MeanStdDevNormalization::MeanStdDevNormalization(const OperationDef& definition,
const DeviceInfo& device_info,
const GpuInfo& gpu_info,
const int tensor_slices)
: GPUOperation(definition) {
// The kernel code does not inherently need a fixed size, but in order to not
@ -95,15 +95,15 @@ MeanStdDevNormalization::MeanStdDevNormalization(const OperationDef& definition,
// For now, fix workgroup size to the biggest supported by the device, but not
// larger than the number of tensor slices.
int desired_work_group_size =
std::min(tensor_slices, device_info.max_work_group_size_x);
if (device_info.IsMali()) {
std::min(tensor_slices, gpu_info.max_work_group_size_x);
if (gpu_info.IsMali()) {
// Don't use more than 64 work items per work group on ARM Mali. Mali
// implements local memory on top of global memory, so larger work groups
// carry a severe performance penalty.
desired_work_group_size = 64;
}
if (device_info.IsAdreno()) {
AdrenoInfo info = device_info.adreno_info;
if (gpu_info.IsAdreno()) {
AdrenoInfo info = gpu_info.adreno_info;
if (info.IsAdreno3xx()) {
if (info.adreno_gpu == AdrenoGpu::kAdreno320 ||
info.adreno_gpu == AdrenoGpu::kAdreno330) {
@ -126,7 +126,7 @@ MeanStdDevNormalization::MeanStdDevNormalization(const OperationDef& definition,
}
}
}
if (device_info.IsPowerVR()) {
if (gpu_info.IsPowerVR()) {
desired_work_group_size = 64;
}
while (desired_work_group_size >= tensor_slices * 2) {
@ -136,9 +136,9 @@ MeanStdDevNormalization::MeanStdDevNormalization(const OperationDef& definition,
work_group_size_.y = 1; // Required
work_group_size_.z = 1; // Required
code_ = GetNormalizationCode();
if (device_info.cl_version >= OpenCLVersion::CL_3_0) {
if (gpu_info.cl_version >= OpenCLVersion::CL_3_0) {
compiler_options_.push_back(CompilerOptions::CL_3_0);
} else if (device_info.cl_version >= OpenCLVersion::CL_2_0) {
} else if (gpu_info.cl_version >= OpenCLVersion::CL_2_0) {
compiler_options_.push_back(CompilerOptions::CL_2_0);
}
}
@ -205,9 +205,9 @@ int3 MeanStdDevNormalization::GetGridSize() const {
}
MeanStdDevNormalization CreateMeanStdDevNormalization(
const OperationDef& definition, const DeviceInfo& device_info,
const OperationDef& definition, const GpuInfo& gpu_info,
const int tensor_slices) {
return MeanStdDevNormalization(definition, device_info, tensor_slices);
return MeanStdDevNormalization(definition, gpu_info, tensor_slices);
}
} // namespace cl
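MeanStdDevNormalization clamps the vendor-specific ceiling so the work group does not dwarf the slice count; the loop body is elided by the hunk above, but presumably halves the size each iteration. A standalone restatement under that assumption:

int ClampWorkGroupX(int desired_work_group_size, int tensor_slices) {
  // Shrink until the work group is under twice the number of slices, so
  // most work items have real work. (Assumes the elided loop body halves.)
  while (desired_work_group_size >= tensor_slices * 2) {
    desired_work_group_size /= 2;
  }
  return desired_work_group_size;
}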

View File

@ -30,11 +30,11 @@ namespace cl {
class MeanStdDevNormalization : public GPUOperation {
public:
explicit MeanStdDevNormalization(const OperationDef& definition,
const DeviceInfo& device_info,
const GpuInfo& gpu_info,
const int tensor_slices);
void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override {
work_groups->push_back(work_group_size_);
@ -53,7 +53,7 @@ class MeanStdDevNormalization : public GPUOperation {
};
MeanStdDevNormalization CreateMeanStdDevNormalization(
const OperationDef& definition, const DeviceInfo& device_info,
const OperationDef& definition, const GpuInfo& gpu_info,
const int tensor_slices);
} // namespace cl

View File

@ -25,7 +25,7 @@ namespace tflite {
namespace gpu {
namespace cl {
GPUOperation CreatePReLU(const DeviceInfo& device_info,
GPUOperation CreatePReLU(const GpuInfo& gpu_info,
const OperationDef& definition,
const PReLUAttributes& attr) {
GPUOperation result(definition);
@ -51,7 +51,7 @@ GPUOperation CreatePReLU(const DeviceInfo& device_info,
const BHWC shape =
BHWC(1, alpha_hwc->shape.h, alpha_hwc->shape.w, alpha_hwc->shape.c);
TensorStorageType storage_type = SelectBestStorageType(
device_info, shape, definition.GetPrimaryStorageType(),
gpu_info, shape, definition.GetPrimaryStorageType(),
definition.GetDataType(), Layout::HWC);
TensorDescriptor desc{definition.GetDataType(), storage_type, Layout::HWC};
desc.UploadData(*alpha_hwc);

View File

@ -31,7 +31,7 @@ namespace tflite {
namespace gpu {
namespace cl {
GPUOperation CreatePReLU(const DeviceInfo& device_info,
GPUOperation CreatePReLU(const GpuInfo& gpu_info,
const OperationDef& definition,
const PReLUAttributes& attr);

View File

@ -53,7 +53,7 @@ TEST_F(OpenCLOperationTest, PReLUAlpha) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreatePReLU(creation_context_.GetDeviceInfo(), op_def, attr);
CreatePReLU(creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -84,7 +84,7 @@ TEST_F(OpenCLOperationTest, PReLUAlphaClip) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreatePReLU(creation_context_.GetDeviceInfo(), op_def, attr);
CreatePReLU(creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -115,7 +115,7 @@ TEST_F(OpenCLOperationTest, PReLUHWCAlpha) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreatePReLU(creation_context_.GetDeviceInfo(), op_def, attr);
CreatePReLU(creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,

View File

@ -38,7 +38,7 @@ TEST_F(OpenCLOperationTest, ReduceSumChannels) {
src_tensor.shape = BHWC(1, 2, 1, 5);
src_tensor.data = {1.1, 2.1, 0.7, 0.3, 1.2, 3.1, 4.1, 0.0, 1.0, 4.4};
ReduceAttributes attr;
attr.axis = Axis::CHANNELS;
attr.dims = {Axis::CHANNELS};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
@ -63,7 +63,7 @@ TEST_F(OpenCLOperationTest, ReduceProductChannels) {
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {1.1, 2.0, 3.1, 4.0};
ReduceAttributes attr;
attr.axis = Axis::CHANNELS;
attr.dims = {Axis::CHANNELS};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
@ -89,7 +89,7 @@ TEST_F(OpenCLOperationTest, ReduceMaxChannels) {
src_tensor.data = {1.1, 2.0, -0.3, -100.0, 32.6, 1.1,
-3.1, -4.0, -5.0, -7.0, -2.0, -100.0};
ReduceAttributes attr;
attr.axis = Axis::CHANNELS;
attr.dims = {Axis::CHANNELS};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
@ -115,7 +115,7 @@ TEST_F(OpenCLOperationTest, ReduceMinChannels) {
src_tensor.data = {1.1, 2.0, -0.3, -100.0, 32.6, 1.1,
-3.1, -4.0, -5.0, -7.0, -2.0, 100.0};
ReduceAttributes attr;
attr.axis = Axis::CHANNELS;
attr.dims = {Axis::CHANNELS};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
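The reduce tests above track an attribute change riding along with the rename: the single `axis` field became a set-valued `dims`. A hedged before/after (field names are from the diff; the container type is an assumption):

#include <set>
enum class Axis { CHANNELS, HEIGHT, WIDTH };
struct ReduceAttributes {
  std::set<Axis> dims;  // was: Axis axis; now supports multi-axis reduction
};

void Example() {
  ReduceAttributes attr;
  attr.dims = {Axis::CHANNELS};  // new form; old form was attr.axis = Axis::CHANNELS;
}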

View File

@ -29,7 +29,7 @@ class Softmax1x1 : public GPUOperation {
Softmax1x1() = default;
explicit Softmax1x1(const OperationDef& definition);
void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override {
work_groups->push_back(work_group_size_);

View File

@ -32,31 +32,31 @@ namespace tflite {
namespace gpu {
namespace cl {
namespace {
bool UseBufferForWeights(const DeviceInfo& device_info) {
return device_info.IsAdreno() || device_info.IsAMD() || device_info.IsMali();
bool UseBufferForWeights(const GpuInfo& gpu_info) {
return gpu_info.IsAdreno() || gpu_info.IsAMD() || gpu_info.IsMali();
}
} // namespace
FCFCAdd::FCFCAdd(const OperationDef& definition, const DeviceInfo& device_info)
FCFCAdd::FCFCAdd(const OperationDef& definition, const GpuInfo& gpu_info)
: GPUOperation(definition) {
if (device_info.IsAdreno()) {
if (device_info.adreno_info.IsAdreno3xx()) {
if (gpu_info.IsAdreno()) {
if (gpu_info.adreno_info.IsAdreno3xx()) {
work_group_size_ = int3(16, 4, 1);
} else if (device_info.adreno_info.IsAdreno4xx()) {
} else if (gpu_info.adreno_info.IsAdreno4xx()) {
work_group_size_ = int3(32, 4, 1);
} else {
work_group_size_ = int3(32, 4, 1);
}
} else if (device_info.IsIntel()) {
} else if (gpu_info.IsIntel()) {
work_group_size_ = int3(8, 4, 1);
} else if (device_info.IsNvidia()) {
} else if (gpu_info.IsNvidia()) {
work_group_size_ = int3(8, 4, 1);
} else if (device_info.IsPowerVR()) {
} else if (gpu_info.IsPowerVR()) {
work_group_size_ = int3(8, 4, 1);
} else {
work_group_size_ = int3(16, 4, 1);
}
code_ = GetFCFCAddKernelCode(definition_, device_info);
code_ = GetFCFCAddKernelCode(definition_, gpu_info);
}
FCFCAdd::FCFCAdd(FCFCAdd&& kernel) : GPUOperation(std::move(kernel)) {}
@ -75,12 +75,12 @@ FCFCAdd& FCFCAdd::operator=(FCFCAdd&& kernel) {
// optimized shaders
std::string FCFCAdd::GetFCFCAddKernelCode(const OperationDef& op_def,
const DeviceInfo& device_info) {
const GpuInfo& gpu_info) {
AddSrcTensor("src_tensor_0", op_def.src_tensors[0]);
AddSrcTensor("src_tensor_1", op_def.src_tensors[1]);
AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
const bool weights_are_buffer = UseBufferForWeights(device_info);
const bool weights_are_buffer = UseBufferForWeights(gpu_info);
std::string c = GetCommonDefines(op_def.precision);
switch (op_def.precision) {
@ -172,15 +172,14 @@ std::string FCFCAdd::GetFCFCAddKernelCode(const OperationDef& op_def,
int3 FCFCAdd::GetGridSize() const { return int3(dst_[0]->Slices(), 1, 1); }
FCFCAdd CreateFCFCAdd(const DeviceInfo& device_info,
const OperationDef& definition,
FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info, const OperationDef& definition,
const FullyConnectedAttributes& attr0,
const FullyConnectedAttributes& attr1) {
FCFCAdd result(definition, device_info);
FCFCAdd result(definition, gpu_info);
result.UploadWeights(attr0.weights, "weights0",
UseBufferForWeights(device_info));
UseBufferForWeights(gpu_info));
result.UploadWeights(attr1.weights, "weights1",
UseBufferForWeights(device_info));
UseBufferForWeights(gpu_info));
TensorLinearDescriptor desc0;
desc0.storage_type = LinearStorageType::TEXTURE_2D;

View File
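
The FCFCAdd constructor shows the typical consumer of the renamed type: a chain of vendor predicates that picks a work group size. A compilable sketch of that dispatch follows, with a reduced GpuInfo stub; the vendor enum and the exact sizes are illustrative, not the tuned defaults.

#include <cstdio>

struct int3 { int x, y, z; };

// Reduced stand-in: the real GpuInfo derives these predicates from the
// OpenCL vendor/device strings.
struct GpuInfo {
  enum class Vendor { kAdreno, kIntel, kNvidia, kPowerVR, kOther } vendor;
  bool IsAdreno() const { return vendor == Vendor::kAdreno; }
  bool IsIntel() const { return vendor == Vendor::kIntel; }
  bool IsNvidia() const { return vendor == Vendor::kNvidia; }
  bool IsPowerVR() const { return vendor == Vendor::kPowerVR; }
};

int3 PickWorkGroupSize(const GpuInfo& gpu_info) {
  if (gpu_info.IsAdreno()) return {32, 4, 1};
  if (gpu_info.IsIntel() || gpu_info.IsNvidia() || gpu_info.IsPowerVR())
    return {8, 4, 1};
  return {16, 4, 1};  // conservative default for unknown vendors
}

int main() {
  GpuInfo gpu_info{GpuInfo::Vendor::kNvidia};
  int3 wg = PickWorkGroupSize(gpu_info);
  std::printf("work group: %dx%dx%d\n", wg.x, wg.y, wg.z);
  return 0;
}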

@ -97,7 +97,7 @@ class FCFCAdd : public GPUOperation {
public:
FCFCAdd() = default;
void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override {
work_groups->push_back(work_group_size_);
@ -111,8 +111,8 @@ class FCFCAdd : public GPUOperation {
FCFCAdd& operator=(const FCFCAdd&) = delete;
private:
FCFCAdd(const OperationDef& definition, const DeviceInfo& device_info);
friend FCFCAdd CreateFCFCAdd(const DeviceInfo& device_info,
FCFCAdd(const OperationDef& definition, const GpuInfo& gpu_info);
friend FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr0,
const FullyConnectedAttributes& attr1);
@ -122,7 +122,7 @@ class FCFCAdd : public GPUOperation {
const std::string& name, bool weights_are_buffer);
std::string GetFCFCAddKernelCode(const OperationDef& op_def,
const DeviceInfo& device_info);
const GpuInfo& gpu_info);
};
template <DataType T>
@ -175,8 +175,7 @@ void FCFCAdd::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
}
}
FCFCAdd CreateFCFCAdd(const DeviceInfo& device_info,
const OperationDef& definition,
FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info, const OperationDef& definition,
const FullyConnectedAttributes& attr0,
const FullyConnectedAttributes& attr1);

View File

@ -27,7 +27,7 @@ enum class TuningType { EXHAUSTIVE, FAST };
struct TuningParameters {
ProfilingCommandQueue* queue;
const DeviceInfo* info;
const GpuInfo* info;
TuningType tuning_type = TuningType::EXHAUSTIVE;
};

View File

@ -114,19 +114,19 @@ int3 GetFirstSuitableWorkGroup(const std::vector<int3>& wgs, int max_wg_size) {
return {1, 1, 1};
}
int GetRecommendedBlockSizeForConv(const DeviceInfo& device_info,
int GetRecommendedBlockSizeForConv(const GpuInfo& gpu_info,
CalculationsPrecision precision,
int task_size) {
const float task_size_per_cu =
task_size / static_cast<float>(device_info.compute_units_count);
task_size / static_cast<float>(gpu_info.compute_units_count);
int block_size = 1;
float threshold_1 = FLT_MAX;
float threshold_2 = FLT_MAX;
float threshold_4 = FLT_MAX;
if (!device_info.IsMali()) {
if (!gpu_info.IsMali()) {
return 1;
}
MaliInfo mali_info = device_info.mali_info;
MaliInfo mali_info = gpu_info.mali_info;
switch (precision) {
case CalculationsPrecision::F16:
if (mali_info.IsBifrostGen1()) {

View File
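
GetRecommendedBlockSizeForConv divides the task size by the number of compute units and compares the result against per-architecture thresholds. A self-contained sketch of that shape of heuristic; the threshold values below are placeholders, not the tuned Mali numbers from the real code.

#include <cstdio>

struct GpuInfo {
  int compute_units_count = 4;
  bool is_mali = true;
};

// Placeholder thresholds; the real code keys them off MaliInfo generations
// and the calculation precision.
int RecommendedBlockSize(const GpuInfo& gpu_info, int task_size) {
  if (!gpu_info.is_mali) return 1;
  const float per_cu =
      task_size / static_cast<float>(gpu_info.compute_units_count);
  const float threshold_2 = 256.0f;   // assumed
  const float threshold_4 = 1024.0f;  // assumed
  int block_size = 1;
  if (per_cu >= threshold_2) block_size = 2;
  if (per_cu >= threshold_4) block_size = 4;
  return block_size;
}

int main() {
  GpuInfo gpu_info;
  std::printf("block size for 8192 FLT4 elements: %d\n",
              RecommendedBlockSize(gpu_info, 8192));
  return 0;
}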

@ -210,7 +210,7 @@ float4 GetMaskForLastPlane(int channels);
int3 GetFirstSuitableWorkGroup(const std::vector<int3>& wgs, int max_wg_size);
// task_size is the number of processed FLT4 elements.
int GetRecommendedBlockSizeForConv(const DeviceInfo& device,
int GetRecommendedBlockSizeForConv(const GpuInfo& gpu_info,
CalculationsPrecision precision,
int task_size);

View File

@ -33,15 +33,15 @@ namespace cl {
Winograd4x4To36::Winograd4x4To36(const OperationDef& definition,
const Padding2D& padding,
const DeviceInfo& device_info)
const GpuInfo& gpu_info)
: GPUOperation(definition), padding_(padding) {
work_group_size_ = int3(32, 1, 1);
code_ = GetWinograd4x4To36Code(definition_);
if (device_info.IsAdreno()) {
if (gpu_info.IsAdreno()) {
compiler_options_.push_back(CompilerOptions::ADRENO_MORE_WAVES);
}
if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsPowerVR()) {
gpu_info.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
}
@ -282,11 +282,11 @@ int3 Winograd4x4To36::GetGridSize() const {
}
void Winograd4x4To36::GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
switch (tuning_type) {
case TuningType::EXHAUSTIVE:
GetPossibleWorkGroups(tuning_type, device_info, kernel_info, grid_size_,
GetPossibleWorkGroups(tuning_type, gpu_info, kernel_info, grid_size_,
work_groups);
return;
case TuningType::FAST:
@ -296,20 +296,20 @@ void Winograd4x4To36::GetPossibleKernelWorkGroups(
}
}
Winograd4x4To36 CreateWinograd4x4To36(const DeviceInfo& device_info,
Winograd4x4To36 CreateWinograd4x4To36(const GpuInfo& gpu_info,
const OperationDef& definition,
const Padding2D& padding) {
Winograd4x4To36 result(definition, padding, device_info);
Winograd4x4To36 result(definition, padding, gpu_info);
result.UploadBt();
return result;
}
Winograd36To4x4::Winograd36To4x4(const OperationDef& definition,
const DeviceInfo& device_info)
const GpuInfo& gpu_info)
: GPUOperation(definition) {
work_group_size_ = int3(32, 1, 1);
if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsPowerVR()) {
gpu_info.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
code_ = GetWinograd36To4x4Code(definition_);
@ -478,11 +478,11 @@ int3 Winograd36To4x4::GetGridSize() const {
}
void Winograd36To4x4::GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
switch (tuning_type) {
case TuningType::EXHAUSTIVE:
GetPossibleWorkGroups(tuning_type, device_info, kernel_info, grid_size_,
GetPossibleWorkGroups(tuning_type, gpu_info, kernel_info, grid_size_,
work_groups);
return;
case TuningType::FAST:
@ -493,9 +493,9 @@ void Winograd36To4x4::GetPossibleKernelWorkGroups(
}
Winograd36To4x4 CreateWinograd36To4x4(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases) {
Winograd36To4x4 result(definition, device_info);
Winograd36To4x4 result(definition, gpu_info);
TensorLinearDescriptor desc;
desc.storage_type = LinearStorageType::TEXTURE_2D;
desc.element_type = definition.GetDataType();

View File
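
Both Winograd kernels gate compiler options on the device: an Adreno-specific wave hint, plus FP16 relaxation on PowerVR at F16 precision. A sketch of that option-gathering step; the enum names mirror the diff, while the surrounding types are stubs.

#include <cstdio>
#include <vector>

enum class CalculationsPrecision { F32, F16 };
enum class CompilerOptions { ADRENO_MORE_WAVES, POWERVR_FP16 };

struct GpuInfo {
  bool is_adreno = false;
  bool is_powervr = false;
  bool IsAdreno() const { return is_adreno; }
  bool IsPowerVR() const { return is_powervr; }
};

std::vector<CompilerOptions> GatherOptions(const GpuInfo& gpu_info,
                                           CalculationsPrecision precision) {
  std::vector<CompilerOptions> opts;
  if (gpu_info.IsAdreno()) opts.push_back(CompilerOptions::ADRENO_MORE_WAVES);
  if (precision == CalculationsPrecision::F16 && gpu_info.IsPowerVR())
    opts.push_back(CompilerOptions::POWERVR_FP16);
  return opts;
}

int main() {
  GpuInfo gpu_info;
  gpu_info.is_powervr = true;
  auto opts = GatherOptions(gpu_info, CalculationsPrecision::F16);
  std::printf("compiler options: %zu\n", opts.size());
  return 0;
}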

@ -34,11 +34,11 @@ class Winograd4x4To36 : public GPUOperation {
public:
Winograd4x4To36() = default;
Winograd4x4To36(const OperationDef& definition, const Padding2D& padding,
const DeviceInfo& device_info);
const GpuInfo& gpu_info);
absl::Status BindArguments(ArgumentsBinder* args) override;
int3 GetGridSize() const override;
void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override;
@ -49,7 +49,7 @@ class Winograd4x4To36 : public GPUOperation {
Winograd4x4To36& operator=(const Winograd4x4To36&) = delete;
private:
friend Winograd4x4To36 CreateWinograd4x4To36(const DeviceInfo& device_info,
friend Winograd4x4To36 CreateWinograd4x4To36(const GpuInfo& gpu_info,
const OperationDef& definition,
const Padding2D& padding);
@ -63,19 +63,18 @@ class Winograd4x4To36 : public GPUOperation {
Padding2D padding_;
};
Winograd4x4To36 CreateWinograd4x4To36(const DeviceInfo& device_info,
Winograd4x4To36 CreateWinograd4x4To36(const GpuInfo& gpu_info,
const OperationDef& definition,
const Padding2D& padding);
class Winograd36To4x4 : public GPUOperation {
public:
Winograd36To4x4() = default;
Winograd36To4x4(const OperationDef& definition,
const DeviceInfo& device_info);
Winograd36To4x4(const OperationDef& definition, const GpuInfo& gpu_info);
absl::Status BindArguments(ArgumentsBinder* args) override;
int3 GetGridSize() const override;
void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override;
@ -87,7 +86,7 @@ class Winograd36To4x4 : public GPUOperation {
private:
friend Winograd36To4x4 CreateWinograd36To4x4(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases);
void UploadAt();
@ -99,7 +98,7 @@ class Winograd36To4x4 : public GPUOperation {
};
Winograd36To4x4 CreateWinograd36To4x4(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases);
} // namespace cl

View File

@ -92,7 +92,7 @@ TEST_F(OpenCLOperationTest, Winograd4x4To36) {
padding.prepended = HW(1, 1);
padding.appended = HW(1, 1);
Winograd4x4To36 wino_up = CreateWinograd4x4To36(
creation_context_.GetDeviceInfo(), op_def, padding);
creation_context_.GetGpuInfo(), op_def, padding);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &wino_up,
BHWC(1, 36, 1, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), dst_ref.data));
@ -159,8 +159,8 @@ TEST_F(OpenCLOperationTest, Winograd36To4x4) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
Winograd36To4x4 wino_down = CreateWinograd36To4x4(
creation_context_.GetDeviceInfo(), op_def, biases);
Winograd36To4x4 wino_down =
CreateWinograd36To4x4(creation_context_.GetGpuInfo(), op_def, biases);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &wino_down,
BHWC(1, 4, 4, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), dst_ref.data));

View File

@ -35,7 +35,7 @@ std::vector<int2> Get2DWorkgroupsEqualTo128() {
std::vector<int3> GenerateWorkGroupSizesXYMultipleOf(
int multiplier, int3 grid, const KernelInfo& kernel_info,
const DeviceInfo& device_info, WorkGroupSizeAlignment z_alignment) {
const GpuInfo& gpu_info, WorkGroupSizeAlignment z_alignment) {
std::vector<int3> work_groups;
work_groups.reserve(32);
@ -52,9 +52,9 @@ std::vector<int3> GenerateWorkGroupSizesXYMultipleOf(
if (work_group_size_xy * z > kernel_info.max_work_group_size) {
continue;
}
if (x <= device_info.max_work_group_size_x &&
y <= device_info.max_work_group_size_y &&
z <= device_info.max_work_group_size_z) {
if (x <= gpu_info.max_work_group_size_x &&
y <= gpu_info.max_work_group_size_y &&
z <= gpu_info.max_work_group_size_z) {
work_groups.push_back({x, y, z});
}
}
@ -65,7 +65,7 @@ std::vector<int3> GenerateWorkGroupSizesXYMultipleOf(
std::vector<int3> GenerateWorkGroupSizesXMultipleOf(
int multiplier, int3 grid, const KernelInfo& kernel_info,
const DeviceInfo& device_info, WorkGroupSizeAlignment z_alignment) {
const GpuInfo& gpu_info, WorkGroupSizeAlignment z_alignment) {
std::vector<int3> work_groups;
work_groups.reserve(32);
@ -78,9 +78,9 @@ std::vector<int3> GenerateWorkGroupSizesXMultipleOf(
x += multiplier) {
for (auto y : possible_y_sizes) {
for (auto z : possible_z_sizes) {
if (x <= device_info.max_work_group_size_x &&
y <= device_info.max_work_group_size_y &&
z <= device_info.max_work_group_size_z &&
if (x <= gpu_info.max_work_group_size_x &&
y <= gpu_info.max_work_group_size_y &&
z <= gpu_info.max_work_group_size_z &&
x * y * z <= kernel_info.max_work_group_size) {
work_groups.push_back({x, y, z});
}
@ -90,13 +90,13 @@ std::vector<int3> GenerateWorkGroupSizesXMultipleOf(
return work_groups;
}
void GetWorkGroupsAlignedToGrid(const DeviceInfo& device_info,
void GetWorkGroupsAlignedToGrid(const GpuInfo& gpu_info,
const KernelInfo& kernel_info, const int3& grid,
std::vector<int3>* work_groups) {
int3 max_wg_size;
max_wg_size.x = device_info.max_work_group_size_x;
max_wg_size.y = device_info.max_work_group_size_y;
max_wg_size.z = device_info.max_work_group_size_z;
max_wg_size.x = gpu_info.max_work_group_size_x;
max_wg_size.y = gpu_info.max_work_group_size_y;
max_wg_size.z = gpu_info.max_work_group_size_z;
GenerateWorkGroupSizesAlignedToGrid(
grid, max_wg_size, kernel_info.max_work_group_size, work_groups);
}
@ -214,24 +214,22 @@ int3 GetWorkGroupConv(const int3& grid, int max_size, int max_z_size) {
return int3(wg_x, wg_y, wg_z);
}
void GetPossibleWorkGroupsXYMultipleOf(int multiplier,
const DeviceInfo& device_info,
void GetPossibleWorkGroupsXYMultipleOf(int multiplier, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
const int3& grid,
WorkGroupSizeAlignment z_alignment,
std::vector<int3>* work_groups) {
*work_groups = GenerateWorkGroupSizesXYMultipleOf(
multiplier, grid, kernel_info, device_info, z_alignment);
multiplier, grid, kernel_info, gpu_info, z_alignment);
}
void GetPossibleWorkGroupsXMultipleOf(int multiplier,
const DeviceInfo& device_info,
void GetPossibleWorkGroupsXMultipleOf(int multiplier, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
const int3& grid,
WorkGroupSizeAlignment z_alignment,
std::vector<int3>* work_groups) {
*work_groups = GenerateWorkGroupSizesXMultipleOf(
multiplier, grid, kernel_info, device_info, z_alignment);
multiplier, grid, kernel_info, gpu_info, z_alignment);
}
bool XY128RequiresMoreWorkGroupsThenXY128Linear(int width, int height) {
@ -250,8 +248,7 @@ bool XY128RequiresMoreWorkGroupsThenXY128Linear(int width, int height) {
return !have_equal_work_groups;
}
void GetPossibleWorkGroups(TuningType tuning_type,
const DeviceInfo& device_info,
void GetPossibleWorkGroups(TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info, const int3& grid,
std::vector<int3>* work_groups) {
switch (tuning_type) {
@ -260,7 +257,7 @@ void GetPossibleWorkGroups(TuningType tuning_type,
GetWorkGroup(grid, kernel_info.max_work_group_size));
return;
case TuningType::EXHAUSTIVE: {
GetWorkGroupsAlignedToGrid(device_info, kernel_info, grid, work_groups);
GetWorkGroupsAlignedToGrid(gpu_info, kernel_info, grid, work_groups);
return;
}
default:
@ -269,23 +266,22 @@ void GetPossibleWorkGroups(TuningType tuning_type,
}
}
void GetPossibleWorkGroupsConv(TuningType tuning_type,
const DeviceInfo& device_info,
void GetPossibleWorkGroupsConv(TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info, const int3& grid,
std::vector<int3>* work_groups) {
switch (tuning_type) {
case TuningType::FAST: {
int max_z_size = 16;
if (device_info.IsAdreno()) {
max_z_size = device_info.adreno_info.IsAdreno3xx() ? 16 : 64;
if (gpu_info.IsAdreno()) {
max_z_size = gpu_info.adreno_info.IsAdreno3xx() ? 16 : 64;
}
max_z_size = std::min(max_z_size, device_info.max_work_group_size_z);
max_z_size = std::min(max_z_size, gpu_info.max_work_group_size_z);
work_groups->push_back(
GetWorkGroupConv(grid, kernel_info.max_work_group_size, max_z_size));
return;
}
case TuningType::EXHAUSTIVE: {
GetWorkGroupsAlignedToGrid(device_info, kernel_info, grid, work_groups);
GetWorkGroupsAlignedToGrid(gpu_info, kernel_info, grid, work_groups);
return;
}
default:

View File
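
The generators above enumerate candidate work group sizes and keep only those within both the kernel's total limit and the device's per-dimension limits. A condensed, runnable version of the X-multiple-of generator, with example limits and a fixed y/z candidate set standing in for the real possible-size computation:

#include <cstdio>
#include <vector>

struct int3 { int x, y, z; };

struct GpuInfo {
  int max_work_group_size_x = 256;
  int max_work_group_size_y = 256;
  int max_work_group_size_z = 64;
};

struct KernelInfo {
  int max_work_group_size = 256;  // example device/kernel limit
};

std::vector<int3> WorkGroupsXMultipleOf(int multiplier, const GpuInfo& gpu_info,
                                        const KernelInfo& kernel_info) {
  std::vector<int3> work_groups;
  const int ys[] = {1, 2, 4};
  const int zs[] = {1, 2, 4};
  for (int x = multiplier; x <= 128; x += multiplier) {
    for (int y : ys) {
      for (int z : zs) {
        // Same double filter as the diff: per-dimension device limits plus
        // the kernel's total work group size.
        if (x <= gpu_info.max_work_group_size_x &&
            y <= gpu_info.max_work_group_size_y &&
            z <= gpu_info.max_work_group_size_z &&
            x * y * z <= kernel_info.max_work_group_size) {
          work_groups.push_back({x, y, z});
        }
      }
    }
  }
  return work_groups;
}

int main() {
  GpuInfo gpu_info;
  KernelInfo kernel_info;
  auto wgs = WorkGroupsXMultipleOf(32, gpu_info, kernel_info);
  std::printf("%zu candidate work groups\n", wgs.size());
  return 0;
}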

@ -29,15 +29,13 @@ namespace gpu {
namespace cl {
// multiplier can be power of two only
void GetPossibleWorkGroupsXYMultipleOf(int multiplier,
const DeviceInfo& device_info,
void GetPossibleWorkGroupsXYMultipleOf(int multiplier, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
const int3& grid,
WorkGroupSizeAlignment z_alignment,
std::vector<int3>* work_groups);
void GetPossibleWorkGroupsXMultipleOf(int multiplier,
const DeviceInfo& device_info,
void GetPossibleWorkGroupsXMultipleOf(int multiplier, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
const int3& grid,
WorkGroupSizeAlignment z_alignment,
@ -50,13 +48,11 @@ int3 GetWorkGroupXY128Conv(const int3& grid);
bool XY128RequiresMoreWorkGroupsThenXY128Linear(int width, int height);
void GetPossibleWorkGroups(TuningType tuning_type,
const DeviceInfo& device_info,
void GetPossibleWorkGroups(TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info, const int3& grid,
std::vector<int3>* work_groups);
void GetPossibleWorkGroupsConv(TuningType tuning_type,
const DeviceInfo& device_info,
void GetPossibleWorkGroupsConv(TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info, const int3& grid,
std::vector<int3>* work_groups);

View File

@ -31,98 +31,98 @@ namespace {
std::unique_ptr<GPUOperation> SelectConvolutionAdreno(
const Convolution2DAttributes& attr, const BHWC& dst_shape,
const DeviceInfo& device_info, const OperationDef& op_def,
const GpuInfo& gpu_info, const OperationDef& op_def,
ModelHints hints) {
if (IsConvConstantsSupported(device_info, op_def, attr)) {
GPUOperation conv = CreateConvConstants(device_info, op_def, attr);
if (IsConvConstantsSupported(gpu_info, op_def, attr)) {
GPUOperation conv = CreateConvConstants(gpu_info, op_def, attr);
return absl::make_unique<GPUOperation>(std::move(conv));
} else {
ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
ConvPowerVR conv = CreateConvPowerVR(gpu_info, op_def, attr, &dst_shape);
return absl::make_unique<ConvPowerVR>(std::move(conv));
}
}
std::unique_ptr<GPUOperation> SelectConvolutionWinogradAdreno(
const Convolution2DAttributes& attr, const BHWC& dst_shape,
const DeviceInfo& device_info, const OperationDef& op_def,
const GpuInfo& gpu_info, const OperationDef& op_def,
ModelHints hints) {
ConvPowerVR conv =
CreateConvPowerVRWino4x4To6x6(device_info, op_def, attr, &dst_shape);
CreateConvPowerVRWino4x4To6x6(gpu_info, op_def, attr, &dst_shape);
return absl::make_unique<ConvPowerVR>(std::move(conv));
}
std::unique_ptr<GPUOperation> SelectConvolutionDynamicWeightsAdreno(
const Convolution2DAttributes& attr, const BHWC& weights_shape,
const BHWC& dst_shape, const DeviceInfo& device_info,
const BHWC& dst_shape, const GpuInfo& gpu_info,
const OperationDef& op_def, ModelHints hints,
ConvWeightsDescription* weights_desc) {
ConvPowerVR conv = CreateConvPowerVRDynamicWeights(
device_info, op_def, attr, weights_shape, &dst_shape);
gpu_info, op_def, attr, weights_shape, &dst_shape);
*weights_desc = conv.GetConvWeightsDescription();
return absl::make_unique<ConvPowerVR>(std::move(conv));
}
std::unique_ptr<GPUOperation> SelectConvolutionNVidia(
const Convolution2DAttributes& attr, const BHWC& dst_shape,
const DeviceInfo& device_info, const OperationDef& op_def) {
if (IsConvConstantsSupported(device_info, op_def, attr)) {
GPUOperation conv = CreateConvConstants(device_info, op_def, attr);
const GpuInfo& gpu_info, const OperationDef& op_def) {
if (IsConvConstantsSupported(gpu_info, op_def, attr)) {
GPUOperation conv = CreateConvConstants(gpu_info, op_def, attr);
return absl::make_unique<GPUOperation>(std::move(conv));
} else {
ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
ConvPowerVR conv = CreateConvPowerVR(gpu_info, op_def, attr, &dst_shape);
return absl::make_unique<ConvPowerVR>(std::move(conv));
}
}
std::unique_ptr<GPUOperation> SelectConvolutionPowerVR(
const Convolution2DAttributes& attr, const DeviceInfo& device_info,
const Convolution2DAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def) {
ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr);
ConvPowerVR conv = CreateConvPowerVR(gpu_info, op_def, attr);
return absl::make_unique<ConvPowerVR>(std::move(conv));
}
std::unique_ptr<GPUOperation> SelectConvolutionMali(
const Convolution2DAttributes& attr, const BHWC& dst_shape,
const DeviceInfo& device_info, const OperationDef& op_def) {
const GpuInfo& gpu_info, const OperationDef& op_def) {
if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER &&
IsConvBuffer1x1Supported(op_def, attr)) {
ConvBuffer1x1 conv =
CreateConvBuffer1x1(device_info, op_def, attr, &dst_shape);
CreateConvBuffer1x1(gpu_info, op_def, attr, &dst_shape);
return absl::make_unique<ConvBuffer1x1>(std::move(conv));
} else {
ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
ConvPowerVR conv = CreateConvPowerVR(gpu_info, op_def, attr, &dst_shape);
return absl::make_unique<ConvPowerVR>(std::move(conv));
}
}
std::unique_ptr<GPUOperation> SelectConvolutionWinogradMali(
const Convolution2DAttributes& attr, const BHWC& dst_shape,
const DeviceInfo& device_info, const OperationDef& op_def) {
const GpuInfo& gpu_info, const OperationDef& op_def) {
if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER) {
ConvBuffer1x1 conv =
CreateConvBuffer1x1Wino4x4To6x6(device_info, op_def, attr, &dst_shape);
CreateConvBuffer1x1Wino4x4To6x6(gpu_info, op_def, attr, &dst_shape);
return absl::make_unique<ConvBuffer1x1>(std::move(conv));
} else {
ConvPowerVR conv =
CreateConvPowerVRWino4x4To6x6(device_info, op_def, attr, &dst_shape);
CreateConvPowerVRWino4x4To6x6(gpu_info, op_def, attr, &dst_shape);
return absl::make_unique<ConvPowerVR>(std::move(conv));
}
}
std::unique_ptr<GPUOperation> SelectConvolutionDynamicWeightsMali(
const Convolution2DAttributes& attr, const BHWC& weights_shape,
const BHWC& dst_shape, const DeviceInfo& device_info,
const BHWC& dst_shape, const GpuInfo& gpu_info,
const OperationDef& op_def, ModelHints hints,
ConvWeightsDescription* weights_desc) {
if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER &&
IsConvBuffer1x1Supported(op_def, weights_shape, attr)) {
ConvBuffer1x1 conv = CreateConvBuffer1x1DynamicWeights(
device_info, op_def, attr, weights_shape, &dst_shape);
gpu_info, op_def, attr, weights_shape, &dst_shape);
*weights_desc = conv.GetConvWeightsDescription();
return absl::make_unique<ConvBuffer1x1>(std::move(conv));
} else {
ConvPowerVR conv = CreateConvPowerVRDynamicWeights(
device_info, op_def, attr, weights_shape, &dst_shape);
gpu_info, op_def, attr, weights_shape, &dst_shape);
*weights_desc = conv.GetConvWeightsDescription();
return absl::make_unique<ConvPowerVR>(std::move(conv));
}
@ -132,58 +132,58 @@ std::unique_ptr<GPUOperation> SelectConvolutionDynamicWeightsMali(
std::unique_ptr<GPUOperation> SelectConvolution(
const Convolution2DAttributes& attr, const BHWC& dst_shape,
const DeviceInfo& device_info, const OperationDef& op_def,
const GpuInfo& gpu_info, const OperationDef& op_def,
ModelHints hints) {
if (device_info.IsAdreno()) {
return SelectConvolutionAdreno(attr, dst_shape, device_info, op_def, hints);
} else if (device_info.IsPowerVR() || device_info.IsAMD() ||
device_info.IsIntel()) {
return SelectConvolutionPowerVR(attr, device_info, op_def);
} else if (device_info.IsNvidia()) {
return SelectConvolutionNVidia(attr, dst_shape, device_info, op_def);
} else if (device_info.IsMali()) {
return SelectConvolutionMali(attr, dst_shape, device_info, op_def);
if (gpu_info.IsAdreno()) {
return SelectConvolutionAdreno(attr, dst_shape, gpu_info, op_def, hints);
} else if (gpu_info.IsPowerVR() || gpu_info.IsAMD() ||
gpu_info.IsIntel()) {
return SelectConvolutionPowerVR(attr, gpu_info, op_def);
} else if (gpu_info.IsNvidia()) {
return SelectConvolutionNVidia(attr, dst_shape, gpu_info, op_def);
} else if (gpu_info.IsMali()) {
return SelectConvolutionMali(attr, dst_shape, gpu_info, op_def);
} else {
return SelectConvolutionAdreno(attr, dst_shape, device_info, op_def, hints);
return SelectConvolutionAdreno(attr, dst_shape, gpu_info, op_def, hints);
}
}
std::unique_ptr<GPUOperation> SelectConvolutionForWinograd(
const Convolution2DAttributes& attr, const BHWC& dst_shape,
const DeviceInfo& device_info, const OperationDef& op_def,
const GpuInfo& gpu_info, const OperationDef& op_def,
ModelHints hints) {
if (device_info.IsAdreno()) {
return SelectConvolutionWinogradAdreno(attr, dst_shape, device_info, op_def,
if (gpu_info.IsAdreno()) {
return SelectConvolutionWinogradAdreno(attr, dst_shape, gpu_info, op_def,
hints);
} else if (device_info.IsPowerVR() || device_info.IsAMD() ||
device_info.IsNvidia() || device_info.IsIntel()) {
} else if (gpu_info.IsPowerVR() || gpu_info.IsAMD() ||
gpu_info.IsNvidia() || gpu_info.IsIntel()) {
ConvPowerVR conv =
CreateConvPowerVRWino4x4To6x6(device_info, op_def, attr, &dst_shape);
CreateConvPowerVRWino4x4To6x6(gpu_info, op_def, attr, &dst_shape);
return absl::make_unique<ConvPowerVR>(std::move(conv));
} else if (device_info.IsMali()) {
return SelectConvolutionWinogradMali(attr, dst_shape, device_info, op_def);
} else if (gpu_info.IsMali()) {
return SelectConvolutionWinogradMali(attr, dst_shape, gpu_info, op_def);
} else {
return SelectConvolutionWinogradAdreno(attr, dst_shape, device_info, op_def,
return SelectConvolutionWinogradAdreno(attr, dst_shape, gpu_info, op_def,
hints);
}
}
std::unique_ptr<GPUOperation> SelectConvolutionWithDynamicWeights(
const Convolution2DAttributes& attr, const BHWC& weights_shape,
const BHWC& dst_shape, const DeviceInfo& device_info,
const BHWC& dst_shape, const GpuInfo& gpu_info,
const OperationDef& op_def, ModelHints hints,
ConvWeightsDescription* weights_desc) {
if (device_info.IsAdreno()) {
if (gpu_info.IsAdreno()) {
return SelectConvolutionDynamicWeightsAdreno(attr, weights_shape, dst_shape,
device_info, op_def, hints,
gpu_info, op_def, hints,
weights_desc);
} else if (device_info.IsMali()) {
} else if (gpu_info.IsMali()) {
return SelectConvolutionDynamicWeightsMali(attr, weights_shape, dst_shape,
device_info, op_def, hints,
gpu_info, op_def, hints,
weights_desc);
} else {
ConvPowerVR conv = CreateConvPowerVRDynamicWeights(
device_info, op_def, attr, weights_shape, &dst_shape);
gpu_info, op_def, attr, weights_shape, &dst_shape);
*weights_desc = conv.GetConvWeightsDescription();
return absl::make_unique<ConvPowerVR>(std::move(conv));
}

View File
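
SelectConvolution is a straight vendor dispatch: every branch forwards the same GpuInfo, and unknown vendors fall back to the Adreno path. A skeletal version with stub operation types; the real selectors also consult shapes, storage types, and hints.

#include <cstdio>
#include <memory>

struct GpuInfo {
  enum class Vendor { kAdreno, kMali, kNvidia, kOther } vendor;
  bool IsAdreno() const { return vendor == Vendor::kAdreno; }
  bool IsMali() const { return vendor == Vendor::kMali; }
  bool IsNvidia() const { return vendor == Vendor::kNvidia; }
};

struct GPUOperation {
  virtual ~GPUOperation() = default;
  virtual const char* name() const { return "generic"; }
};
struct ConvAdreno : GPUOperation {
  const char* name() const override { return "adreno"; }
};
struct ConvMali : GPUOperation {
  const char* name() const override { return "mali"; }
};

std::unique_ptr<GPUOperation> SelectConvolution(const GpuInfo& gpu_info) {
  if (gpu_info.IsAdreno()) return std::make_unique<ConvAdreno>();
  if (gpu_info.IsMali()) return std::make_unique<ConvMali>();
  return std::make_unique<ConvAdreno>();  // fallback mirrors the diff
}

int main() {
  GpuInfo gpu_info{GpuInfo::Vendor::kMali};
  std::printf("selected: %s\n", SelectConvolution(gpu_info)->name());
  return 0;
}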

@ -31,19 +31,16 @@ namespace cl {
std::unique_ptr<GPUOperation> SelectConvolution(
const Convolution2DAttributes& attr, const BHWC& dst_shape,
const DeviceInfo& device_info, const OperationDef& op_def,
ModelHints hints);
const GpuInfo& gpu_info, const OperationDef& op_def, ModelHints hints);
std::unique_ptr<GPUOperation> SelectConvolutionForWinograd(
const Convolution2DAttributes& attr, const BHWC& dst_shape,
const DeviceInfo& device_info, const OperationDef& op_def,
ModelHints hints);
const GpuInfo& gpu_info, const OperationDef& op_def, ModelHints hints);
std::unique_ptr<GPUOperation> SelectConvolutionWithDynamicWeights(
const Convolution2DAttributes& attr, const BHWC& weights_shape,
const BHWC& dst_shape, const DeviceInfo& device_info,
const OperationDef& op_def, ModelHints hints,
ConvWeightsDescription* weights_desc);
const BHWC& dst_shape, const GpuInfo& gpu_info, const OperationDef& op_def,
ModelHints hints, ConvWeightsDescription* weights_desc);
std::unique_ptr<GPUOperation> SelectConverterToConvWeights(
const ConvWeightsDescription& weights_desc, const OperationDef& op_def,

View File

@ -29,70 +29,70 @@ namespace cl {
namespace {
std::unique_ptr<GPUOperation> SelectConvolutionTransposedAdreno(
const ConvolutionTransposedAttributes& attr, const DeviceInfo& device_info,
const ConvolutionTransposedAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def) {
if (IsConvolutionTransposedThinSupported(attr)) {
ConvolutionTransposedThin conv =
CreateConvolutionTransposedThin(device_info, op_def, attr);
CreateConvolutionTransposedThin(gpu_info, op_def, attr);
return absl::make_unique<ConvolutionTransposedThin>(std::move(conv));
} else if (IsConvolutionTransposed3x3ThinSupported(attr)) {
ConvolutionTransposed3x3Thin conv =
CreateConvolutionTransposed3x3Thin(device_info, op_def, attr);
CreateConvolutionTransposed3x3Thin(gpu_info, op_def, attr);
return absl::make_unique<ConvolutionTransposed3x3Thin>(std::move(conv));
} else {
ConvolutionTransposed conv =
CreateConvolutionTransposed(device_info, op_def, attr);
CreateConvolutionTransposed(gpu_info, op_def, attr);
return absl::make_unique<ConvolutionTransposed>(std::move(conv));
}
}
std::unique_ptr<GPUOperation> SelectConvolutionTransposedPowerVR(
const ConvolutionTransposedAttributes& attr, const DeviceInfo& device_info,
const ConvolutionTransposedAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def) {
if (IsConvolutionTransposedThinSupported(attr)) {
ConvolutionTransposedThin conv =
CreateConvolutionTransposedThin(device_info, op_def, attr);
CreateConvolutionTransposedThin(gpu_info, op_def, attr);
return absl::make_unique<ConvolutionTransposedThin>(std::move(conv));
} else if (IsConvolutionTransposed3x3ThinSupported(attr)) {
ConvolutionTransposed3x3Thin conv =
CreateConvolutionTransposed3x3Thin(device_info, op_def, attr);
CreateConvolutionTransposed3x3Thin(gpu_info, op_def, attr);
return absl::make_unique<ConvolutionTransposed3x3Thin>(std::move(conv));
} else if (IsConvolutionTransposed3x3Supported(op_def, attr)) {
ConvolutionTransposed3x3 conv =
CreateConvolutionTransposed3x3(device_info, op_def, attr);
CreateConvolutionTransposed3x3(gpu_info, op_def, attr);
return absl::make_unique<ConvolutionTransposed3x3>(std::move(conv));
} else if (IsConvolutionTransposed4x4Supported(op_def, attr)) {
ConvolutionTransposed4x4 conv =
CreateConvolutionTransposed4x4(device_info, op_def, attr);
CreateConvolutionTransposed4x4(gpu_info, op_def, attr);
return absl::make_unique<ConvolutionTransposed4x4>(std::move(conv));
} else {
ConvolutionTransposed conv =
CreateConvolutionTransposed(device_info, op_def, attr);
CreateConvolutionTransposed(gpu_info, op_def, attr);
return absl::make_unique<ConvolutionTransposed>(std::move(conv));
}
}
std::unique_ptr<GPUOperation> SelectConvolutionTransposedMali(
const ConvolutionTransposedAttributes& attr, const DeviceInfo& device_info,
const ConvolutionTransposedAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def) {
ConvolutionTransposed conv =
CreateConvolutionTransposed(device_info, op_def, attr);
CreateConvolutionTransposed(gpu_info, op_def, attr);
return absl::make_unique<ConvolutionTransposed>(std::move(conv));
}
} // namespace
std::unique_ptr<GPUOperation> SelectConvolutionTransposed(
const ConvolutionTransposedAttributes& attr, const DeviceInfo& device_info,
const ConvolutionTransposedAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def) {
if (device_info.IsAdreno()) {
return SelectConvolutionTransposedAdreno(attr, device_info, op_def);
} else if (device_info.IsPowerVR() || device_info.IsAMD() ||
device_info.IsNvidia() || device_info.IsIntel()) {
return SelectConvolutionTransposedPowerVR(attr, device_info, op_def);
} else if (device_info.IsMali()) {
return SelectConvolutionTransposedMali(attr, device_info, op_def);
if (gpu_info.IsAdreno()) {
return SelectConvolutionTransposedAdreno(attr, gpu_info, op_def);
} else if (gpu_info.IsPowerVR() || gpu_info.IsAMD() ||
gpu_info.IsNvidia() || gpu_info.IsIntel()) {
return SelectConvolutionTransposedPowerVR(attr, gpu_info, op_def);
} else if (gpu_info.IsMali()) {
return SelectConvolutionTransposedMali(attr, gpu_info, op_def);
} else {
return SelectConvolutionTransposedAdreno(attr, device_info, op_def);
return SelectConvolutionTransposedAdreno(attr, gpu_info, op_def);
}
}

View File

@ -27,7 +27,7 @@ namespace gpu {
namespace cl {
std::unique_ptr<GPUOperation> SelectConvolutionTransposed(
const ConvolutionTransposedAttributes& attr, const DeviceInfo& device_info,
const ConvolutionTransposedAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def);
} // namespace cl

View File

@ -27,9 +27,8 @@ namespace tflite {
namespace gpu {
namespace cl {
absl::Status SelectDefault(const DeviceInfo& device_info,
const OperationDef& op_def, ModelHints hints,
const std::vector<Value*>& inputs,
absl::Status SelectDefault(const GpuInfo& gpu_info, const OperationDef& op_def,
ModelHints hints, const std::vector<Value*>& inputs,
const std::vector<Value*>& outputs, const Node& node,
GPUOperationsSubgraph* gpu_subgraph) {
return absl::UnimplementedError(

View File

@ -30,9 +30,8 @@ namespace tflite {
namespace gpu {
namespace cl {
absl::Status SelectDefault(const DeviceInfo& device_info,
const OperationDef& op_def, ModelHints hints,
const std::vector<Value*>& inputs,
absl::Status SelectDefault(const GpuInfo& gpu_info, const OperationDef& op_def,
ModelHints hints, const std::vector<Value*>& inputs,
const std::vector<Value*>& outputs, const Node& node,
GPUOperationsSubgraph* gpu_subgraph);

View File

@ -26,58 +26,58 @@ namespace cl {
namespace {
std::unique_ptr<GPUOperation> SelectDWConvolutionAdreno(
const DepthwiseConvolution2DAttributes& attr, const DeviceInfo& device_info,
const DepthwiseConvolution2DAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def) {
if (IsDepthwiseConv3x3Supported(attr)) {
return absl::make_unique<DepthwiseConv3x3>(
CreateDepthwiseConv3x3(device_info, op_def, attr));
CreateDepthwiseConv3x3(gpu_info, op_def, attr));
} else {
return absl::make_unique<GPUOperation>(
CreateDepthwiseConvolution2D(device_info, op_def, attr));
CreateDepthwiseConvolution2D(gpu_info, op_def, attr));
}
}
std::unique_ptr<GPUOperation> SelectDWConvolutionPowerVR(
const DepthwiseConvolution2DAttributes& attr, const DeviceInfo& device_info,
const DepthwiseConvolution2DAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def) {
if (IsDepthwiseConv3x3Supported(attr)) {
return absl::make_unique<DepthwiseConv3x3>(
CreateDepthwiseConv3x3(device_info, op_def, attr));
CreateDepthwiseConv3x3(gpu_info, op_def, attr));
} else {
return absl::make_unique<GPUOperation>(
CreateDepthwiseConvolution2D(device_info, op_def, attr));
CreateDepthwiseConvolution2D(gpu_info, op_def, attr));
}
}
std::unique_ptr<GPUOperation> SelectDWConvolutionMali(
const DepthwiseConvolution2DAttributes& attr, const DeviceInfo& device_info,
const DepthwiseConvolution2DAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def) {
const auto storage_type = op_def.src_tensors[0].storage_type;
bool buffer_type = storage_type == TensorStorageType::BUFFER ||
storage_type == TensorStorageType::IMAGE_BUFFER;
const MaliInfo mali_info = device_info.mali_info;
const MaliInfo mali_info = gpu_info.mali_info;
if (IsDepthwiseConv3x3Supported(attr) && !mali_info.IsMidgard() &&
!buffer_type && op_def.precision != CalculationsPrecision::F32) {
return absl::make_unique<DepthwiseConv3x3>(
CreateDepthwiseConv3x3(device_info, op_def, attr));
CreateDepthwiseConv3x3(gpu_info, op_def, attr));
} else {
return absl::make_unique<GPUOperation>(
CreateDepthwiseConvolution2D(device_info, op_def, attr));
CreateDepthwiseConvolution2D(gpu_info, op_def, attr));
}
}
} // namespace
std::unique_ptr<GPUOperation> SelectDWConvolution(
const DepthwiseConvolution2DAttributes& attr, const DeviceInfo& device_info,
const DepthwiseConvolution2DAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def) {
if (device_info.IsAdreno()) {
return SelectDWConvolutionAdreno(attr, device_info, op_def);
} else if (device_info.IsPowerVR()) {
return SelectDWConvolutionPowerVR(attr, device_info, op_def);
} else if (device_info.IsMali()) {
return SelectDWConvolutionMali(attr, device_info, op_def);
if (gpu_info.IsAdreno()) {
return SelectDWConvolutionAdreno(attr, gpu_info, op_def);
} else if (gpu_info.IsPowerVR()) {
return SelectDWConvolutionPowerVR(attr, gpu_info, op_def);
} else if (gpu_info.IsMali()) {
return SelectDWConvolutionMali(attr, gpu_info, op_def);
} else {
return SelectDWConvolutionAdreno(attr, device_info, op_def);
return SelectDWConvolutionAdreno(attr, gpu_info, op_def);
}
}

View File

@ -27,7 +27,7 @@ namespace gpu {
namespace cl {
std::unique_ptr<GPUOperation> SelectDWConvolution(
const DepthwiseConvolution2DAttributes& attr, const DeviceInfo& device_info,
const DepthwiseConvolution2DAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def);
} // namespace cl

View File

@ -27,74 +27,74 @@ namespace gpu {
namespace cl {
std::unique_ptr<GPUOperation> SelectFullyConnectedGeneric(
const FullyConnectedAttributes& attr, const DeviceInfo& device_info,
const FullyConnectedAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def, int batch_size) {
if (op_def.IsBatchSupported()) {
BHWC dst_shape = BHWC(batch_size, 1, 1, attr.weights.shape.o);
ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
ConvPowerVR conv = CreateConvPowerVR(gpu_info, op_def, attr, &dst_shape);
return absl::make_unique<ConvPowerVR>(std::move(conv));
} else {
FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);
FullyConnected fc = CreateFullyConnected(gpu_info, op_def, attr);
return absl::make_unique<FullyConnected>(std::move(fc));
}
}
std::unique_ptr<GPUOperation> SelectFullyConnectedAdreno(
const FullyConnectedAttributes& attr, const DeviceInfo& device_info,
const FullyConnectedAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def, int batch_size) {
if (op_def.IsBatchSupported()) {
BHWC dst_shape = BHWC(batch_size, 1, 1, attr.weights.shape.o);
ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
ConvPowerVR conv = CreateConvPowerVR(gpu_info, op_def, attr, &dst_shape);
return absl::make_unique<ConvPowerVR>(std::move(conv));
} else {
FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);
FullyConnected fc = CreateFullyConnected(gpu_info, op_def, attr);
return absl::make_unique<FullyConnected>(std::move(fc));
}
}
std::unique_ptr<GPUOperation> SelectFullyConnectedPowerVR(
const FullyConnectedAttributes& attr, const DeviceInfo& device_info,
const FullyConnectedAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def, int batch_size) {
if (op_def.IsBatchSupported()) {
ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr);
ConvPowerVR conv = CreateConvPowerVR(gpu_info, op_def, attr);
return absl::make_unique<ConvPowerVR>(std::move(conv));
} else {
FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);
FullyConnected fc = CreateFullyConnected(gpu_info, op_def, attr);
return absl::make_unique<FullyConnected>(std::move(fc));
}
}
std::unique_ptr<GPUOperation> SelectFullyConnectedMali(
const FullyConnectedAttributes& attr, const DeviceInfo& device_info,
const FullyConnectedAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def, int batch_size) {
if (op_def.IsBatchSupported()) {
if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER) {
ConvBuffer1x1 conv = CreateConvBuffer1x1(device_info, op_def, attr);
ConvBuffer1x1 conv = CreateConvBuffer1x1(gpu_info, op_def, attr);
return absl::make_unique<ConvBuffer1x1>(std::move(conv));
} else {
BHWC dst_shape = BHWC(batch_size, 1, 1, attr.weights.shape.o);
ConvPowerVR conv =
CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
CreateConvPowerVR(gpu_info, op_def, attr, &dst_shape);
return absl::make_unique<ConvPowerVR>(std::move(conv));
}
} else {
FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);
FullyConnected fc = CreateFullyConnected(gpu_info, op_def, attr);
return absl::make_unique<FullyConnected>(std::move(fc));
}
}
std::unique_ptr<GPUOperation> SelectFullyConnected(
const FullyConnectedAttributes& attr, const DeviceInfo& device_info,
const FullyConnectedAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def, int batch_size) {
if (device_info.IsAdreno()) {
return SelectFullyConnectedAdreno(attr, device_info, op_def, batch_size);
} else if (device_info.IsPowerVR() || device_info.IsAMD() ||
device_info.IsNvidia() || device_info.IsIntel()) {
return SelectFullyConnectedPowerVR(attr, device_info, op_def, batch_size);
} else if (device_info.IsMali()) {
return SelectFullyConnectedMali(attr, device_info, op_def, batch_size);
if (gpu_info.IsAdreno()) {
return SelectFullyConnectedAdreno(attr, gpu_info, op_def, batch_size);
} else if (gpu_info.IsPowerVR() || gpu_info.IsAMD() ||
gpu_info.IsNvidia() || gpu_info.IsIntel()) {
return SelectFullyConnectedPowerVR(attr, gpu_info, op_def, batch_size);
} else if (gpu_info.IsMali()) {
return SelectFullyConnectedMali(attr, gpu_info, op_def, batch_size);
} else {
return SelectFullyConnectedGeneric(attr, device_info, op_def, batch_size);
return SelectFullyConnectedGeneric(attr, gpu_info, op_def, batch_size);
}
}

View File
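
The fully connected selectors all share one rule: when batching is supported, reshape the problem into a 1x1 convolution sized by the batch; otherwise use the dedicated FC kernel. A reduced sketch of that decision under stated assumptions (stand-in types, and the batch check simplified to a boolean plus size):

#include <cstdio>
#include <memory>

struct GPUOperation {
  virtual ~GPUOperation() = default;
  virtual const char* name() const = 0;
};
struct ConvPowerVR : GPUOperation {
  const char* name() const override { return "conv_1x1"; }
};
struct FullyConnected : GPUOperation {
  const char* name() const override { return "fully_connected"; }
};

std::unique_ptr<GPUOperation> SelectFullyConnected(bool batch_supported,
                                                   int batch_size) {
  if (batch_supported && batch_size > 1) {
    // Batched FC maps well onto a 1x1 convolution over a Bx1x1xC tensor.
    return std::make_unique<ConvPowerVR>();
  }
  return std::make_unique<FullyConnected>();
}

int main() {
  std::printf("batch=8: %s\n", SelectFullyConnected(true, 8)->name());
  std::printf("batch=1: %s\n", SelectFullyConnected(false, 1)->name());
  return 0;
}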

@ -27,7 +27,7 @@ namespace gpu {
namespace cl {
std::unique_ptr<GPUOperation> SelectFullyConnected(
const FullyConnectedAttributes& attr, const DeviceInfo& device_info,
const FullyConnectedAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def, int batch_size);
} // namespace cl

View File

@ -41,7 +41,7 @@ namespace gpu {
namespace cl {
namespace {
bool IsSuitableForWinograd4x4To6x6(const Convolution2DAttributes& attr,
const DeviceInfo& device_info,
const GpuInfo& gpu_info,
const BHWC& dst_shape) {
const int tiles_x = DivideRoundUp(dst_shape.w, 4);
const int tiles_y = DivideRoundUp(dst_shape.h, 4);
@ -51,22 +51,22 @@ bool IsSuitableForWinograd4x4To6x6(const Convolution2DAttributes& attr,
attr.weights.shape.w == 3 && attr.weights.shape.h == 3 &&
attr.dilations == HW(1, 1) && attr.strides == HW(1, 1);
// Mali, among other devices, has a smaller SIMD line size
const int min_depth = device_info.IsMali() ? 16 : 32;
const int min_hw = device_info.IsMali() ? 32 : 128;
const int min_depth = gpu_info.IsMali() ? 16 : 32;
const int min_hw = gpu_info.IsMali() ? 32 : 128;
const bool recommended_channels =
dst_depth % 4 == 0 && src_depth >= min_depth && dst_depth >= min_depth;
const bool recommended_hw = tiles_x * tiles_y >= min_hw;
return suitable_attributes && recommended_channels && recommended_hw;
}
absl::Status WinogradFromNode(const DeviceInfo& device_info,
absl::Status WinogradFromNode(const GpuInfo& gpu_info,
const std::vector<Value*>& inputs,
const std::vector<Value*>& outputs,
const OperationDef& op_def, ModelHints hints,
const BHWC& input_shape, const BHWC& output_shape,
const Convolution2DAttributes& attr,
GPUOperationsSubgraph* gpu_subgraph) {
if (!IsSuitableForWinograd4x4To6x6(attr, device_info, output_shape)) {
if (!IsSuitableForWinograd4x4To6x6(attr, gpu_info, output_shape)) {
return absl::UnimplementedError("No implementation for this case.");
}
@ -76,13 +76,13 @@ absl::Status WinogradFromNode(const DeviceInfo& device_info,
const BHWC shape_1{input_shape.b, 36, tiles_x * tiles_y, output_shape.c};
TensorDescriptor td_0;
td_0.storage_type = SelectBestStorageType(
device_info, shape_0, op_def.src_tensors[0].storage_type,
gpu_info, shape_0, op_def.src_tensors[0].storage_type,
op_def.src_tensors[0].data_type, op_def.src_tensors[0].layout);
td_0.data_type = op_def.src_tensors[0].data_type;
td_0.layout = op_def.src_tensors[0].layout;
TensorDescriptor td_1;
td_1.storage_type = SelectBestStorageType(
device_info, shape_1, op_def.src_tensors[0].storage_type,
gpu_info, shape_1, op_def.src_tensors[0].storage_type,
op_def.src_tensors[0].data_type, op_def.src_tensors[0].layout);
td_1.data_type = op_def.src_tensors[0].data_type;
td_1.layout = op_def.src_tensors[0].layout;
@ -96,7 +96,7 @@ absl::Status WinogradFromNode(const DeviceInfo& device_info,
winograd_up_def.dst_tensors.push_back(td_0);
auto& winograd_up = gpu_subgraph->operations[0];
winograd_up.operation =
SelectWinograd4x4To36(device_info, attr.padding, winograd_up_def);
SelectWinograd4x4To36(gpu_info, attr.padding, winograd_up_def);
winograd_up.input_ids = {static_cast<int>(inputs[0]->id)};
winograd_up.output_ids = {-1};
@ -107,7 +107,7 @@ absl::Status WinogradFromNode(const DeviceInfo& device_info,
auto& conv = gpu_subgraph->operations[1];
conv.input_ids = {-1};
conv.output_ids = {-2};
conv.operation = SelectConvolutionForWinograd(attr, input_shape, device_info,
conv.operation = SelectConvolutionForWinograd(attr, input_shape, gpu_info,
conv_def, hints);
OperationDef winograd_down_def;
@ -123,13 +123,13 @@ absl::Status WinogradFromNode(const DeviceInfo& device_info,
bias_copy.data.resize(attr.weights.shape.o);
}
winograd_down.operation =
SelectWinograd36To4x4(device_info, winograd_down_def, bias_copy);
SelectWinograd36To4x4(gpu_info, winograd_down_def, bias_copy);
return absl::OkStatus();
}
} // namespace
absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
absl::Status GPUOperationFromNode(const GpuInfo& gpu_info,
const OperationDef& op_def, ModelHints hints,
const std::vector<Value*>& inputs,
const std::vector<Value*>& outputs,
@ -159,7 +159,7 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
auto attr =
absl::any_cast<ElementwiseAttributes>(node.operation.attributes);
GPUOperation operation =
CreateElementwise(device_info, op_def, op_type, attr);
CreateElementwise(gpu_info, op_def, op_type, attr);
*gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
return absl::OkStatus();
}
@ -191,7 +191,7 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
op_def.src_tensors[1].storage_type,
Layout::BHWC};
transposed_desc.storage_type = SelectBestStorageType(
device_info, weights_shape, transposed_desc.storage_type,
gpu_info, weights_shape, transposed_desc.storage_type,
transposed_desc.data_type, transposed_desc.layout);
TensorDescriptor weights_desc = {op_def.src_tensors[1].data_type,
TensorStorageType::BUFFER, Layout::BHWC};
@ -206,7 +206,7 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
conv_def.src_tensors[1] = weights_desc;
ConvWeightsDescription conv_weights_desc;
conv_op.operation = SelectConvolutionWithDynamicWeights(
attr, weights_shape, dst_shape, device_info, conv_def, hints,
attr, weights_shape, dst_shape, gpu_info, conv_def, hints,
&conv_weights_desc);
int aligned_output =
@ -246,7 +246,7 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
for (int i = 0; i < inputs.size(); ++i) {
channels[i] = inputs[i]->tensor.shape.c;
}
return SelectConcat(attr, channels, op_def, device_info, gpu_op);
return SelectConcat(attr, channels, op_def, gpu_info, gpu_op);
}
case OperationType::CONVOLUTION_2D: {
auto attr =
@ -254,14 +254,14 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
auto input_shape = inputs[0]->tensor.shape;
auto output_shape = outputs[0]->tensor.shape;
if (inputs.size() == 1) {
if (WinogradFromNode(device_info, inputs, outputs, op_def, hints,
if (WinogradFromNode(gpu_info, inputs, outputs, op_def, hints,
input_shape, output_shape, attr, gpu_subgraph)
.ok()) {
return absl::OkStatus();
} else {
gpu_op = InitSingleOpSubgraph(inputs, outputs, gpu_subgraph);
*gpu_op =
SelectConvolution(attr, output_shape, device_info, op_def, hints);
SelectConvolution(attr, output_shape, gpu_info, op_def, hints);
return absl::OkStatus();
}
} else {
@ -283,7 +283,7 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
conv_def.src_tensors[1] = weights_desc;
ConvWeightsDescription conv_weights_desc;
conv_op.operation = SelectConvolutionWithDynamicWeights(
attr, weights_shape, output_shape, device_info, conv_def, hints,
attr, weights_shape, output_shape, gpu_info, conv_def, hints,
&conv_weights_desc);
int aligned_output =
@ -309,33 +309,33 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
case OperationType::CONVOLUTION_TRANSPOSED: {
auto attr = absl::any_cast<ConvolutionTransposedAttributes>(
node.operation.attributes);
*gpu_op = SelectConvolutionTransposed(attr, device_info, op_def);
*gpu_op = SelectConvolutionTransposed(attr, gpu_info, op_def);
return absl::OkStatus();
}
case OperationType::DEPTHWISE_CONVOLUTION: {
auto attr = absl::any_cast<DepthwiseConvolution2DAttributes>(
node.operation.attributes);
if (inputs.size() == 1) {
*gpu_op = SelectDWConvolution(attr, device_info, op_def);
*gpu_op = SelectDWConvolution(attr, gpu_info, op_def);
} else {
if (inputs[1]->tensor.shape.b != 1) {
return absl::UnimplementedError(
"No support of depthwise runtime weights with channel multiplier "
"!= 1");
}
*gpu_op = SelectDWConvolutionDynamicWeights(attr, device_info, op_def);
*gpu_op = SelectDWConvolutionDynamicWeights(attr, gpu_info, op_def);
}
return absl::OkStatus();
}
case OperationType::FULLY_CONNECTED: {
auto attr =
absl::any_cast<FullyConnectedAttributes>(node.operation.attributes);
*gpu_op = SelectFullyConnected(attr, device_info, op_def,
*gpu_op = SelectFullyConnected(attr, gpu_info, op_def,
inputs[0]->tensor.shape.b);
return absl::OkStatus();
}
case OperationType::LSTM: {
*gpu_op = SelectLSTM(op_def, device_info);
*gpu_op = SelectLSTM(op_def, gpu_info);
return absl::OkStatus();
}
case OperationType::MAX_UNPOOLING_2D: {
@ -346,11 +346,11 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
}
case OperationType::MEAN: {
auto attr = absl::any_cast<MeanAttributes>(node.operation.attributes);
return SelectMean(attr, op_def, device_info, gpu_op);
return SelectMean(attr, op_def, gpu_info, gpu_op);
}
case OperationType::MEAN_STDDEV_NORMALIZATION: {
MeanStdDevNormalization operation = CreateMeanStdDevNormalization(
op_def, device_info, (inputs[0]->tensor.shape.c + 3) / 4);
op_def, gpu_info, (inputs[0]->tensor.shape.c + 3) / 4);
*gpu_op =
absl::make_unique<MeanStdDevNormalization>(std::move(operation));
return absl::OkStatus();
@ -368,7 +368,7 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
}
case OperationType::PRELU: {
auto attr = absl::any_cast<PReLUAttributes>(node.operation.attributes);
*gpu_op = SelectPReLU(attr, device_info, op_def);
*gpu_op = SelectPReLU(attr, gpu_info, op_def);
return absl::OkStatus();
}
case OperationType::QUANTIZE_AND_DEQUANTIZE: {
@ -453,7 +453,7 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
auto attr =
absl::any_cast<ElementwiseAttributes>(node.operation.attributes);
GPUOperation operation =
CreateElementwise(device_info, op_def, op_type, attr);
CreateElementwise(gpu_info, op_def, op_type, attr);
*gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
return absl::OkStatus();
}
@ -474,7 +474,7 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
return absl::OkStatus();
}
default:
return SelectDefault(device_info, op_def, hints, inputs, outputs, node,
return SelectDefault(gpu_info, op_def, hints, inputs, outputs, node,
gpu_subgraph);
}
}

View File
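
IsSuitableForWinograd4x4To6x6 combines structural requirements (3x3 kernel, unit stride and dilation) with size heuristics: DivideRoundUp tile counts and Mali-lowered channel/tile minimums. A standalone rendering of that check; the thresholds are copied from the diff, while the shape arguments are flattened into plain ints.

#include <cstdio>

int DivideRoundUp(int n, int d) { return (n + d - 1) / d; }

struct GpuInfo {
  bool is_mali = false;
  bool IsMali() const { return is_mali; }
};

bool SuitableForWinograd4x4To6x6(const GpuInfo& gpu_info, int dst_w, int dst_h,
                                 int src_channels, int dst_channels,
                                 int kernel_w, int kernel_h) {
  const int tiles_x = DivideRoundUp(dst_w, 4);
  const int tiles_y = DivideRoundUp(dst_h, 4);
  const int src_depth = DivideRoundUp(src_channels, 4);
  const int dst_depth = DivideRoundUp(dst_channels, 4);
  const bool suitable_attributes = kernel_w == 3 && kernel_h == 3;
  // Mali, among other devices, has a smaller SIMD line size, so the
  // profitability thresholds are lower there (values from the diff).
  const int min_depth = gpu_info.IsMali() ? 16 : 32;
  const int min_hw = gpu_info.IsMali() ? 32 : 128;
  const bool recommended_channels =
      dst_depth % 4 == 0 && src_depth >= min_depth && dst_depth >= min_depth;
  const bool recommended_hw = tiles_x * tiles_y >= min_hw;
  return suitable_attributes && recommended_channels && recommended_hw;
}

int main() {
  GpuInfo gpu_info;
  std::printf("224x224, 128->128ch, 3x3: %d\n",
              SuitableForWinograd4x4To6x6(gpu_info, 224, 224, 128, 128, 3, 3));
  return 0;
}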

@ -29,7 +29,7 @@ namespace tflite {
namespace gpu {
namespace cl {
absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
absl::Status GPUOperationFromNode(const GpuInfo& gpu_info,
const OperationDef& op_def, ModelHints hints,
const std::vector<Value*>& inputs,
const std::vector<Value*>& outputs,

View File

@ -47,8 +47,8 @@ namespace gpu {
namespace cl {
std::unique_ptr<GPUOperation> SelectLSTM(const OperationDef& op_def,
const DeviceInfo& device_info) {
return absl::make_unique<GPUOperation>(CreateLSTM(op_def, device_info));
const GpuInfo& gpu_info) {
return absl::make_unique<GPUOperation>(CreateLSTM(op_def, gpu_info));
}
std::unique_ptr<GPUOperation> SelectReLU(const ReLUAttributes& attr,
@ -57,10 +57,9 @@ std::unique_ptr<GPUOperation> SelectReLU(const ReLUAttributes& attr,
}
std::unique_ptr<GPUOperation> SelectPReLU(const PReLUAttributes& attr,
const DeviceInfo& device_info,
const GpuInfo& gpu_info,
const OperationDef& op_def) {
return absl::make_unique<GPUOperation>(
CreatePReLU(device_info, op_def, attr));
return absl::make_unique<GPUOperation>(CreatePReLU(gpu_info, op_def, attr));
}
std::unique_ptr<GPUOperation> SelectPooling(const Pooling2DAttributes& attr,
@ -89,12 +88,11 @@ absl::Status SelectResize(const Resize2DAttributes& attr,
absl::Status SelectConcat(const ConcatAttributes& attr,
const std::vector<int>& channels,
const OperationDef& op_def,
const DeviceInfo& device_info,
const OperationDef& op_def, const GpuInfo& gpu_info,
std::unique_ptr<GPUOperation>* ptr) {
switch (attr.axis) {
case Axis::CHANNELS: {
GPUOperation operation = CreateConcatZ(op_def, channels, device_info);
GPUOperation operation = CreateConcatZ(op_def, channels, gpu_info);
*ptr = absl::make_unique<GPUOperation>(std::move(operation));
return absl::OkStatus();
}
@ -112,10 +110,10 @@ absl::Status SelectConcat(const ConcatAttributes& attr,
}
std::unique_ptr<GPUOperation> SelectDWConvolutionDynamicWeights(
const DepthwiseConvolution2DAttributes& attr, const DeviceInfo& device_info,
const DepthwiseConvolution2DAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def) {
return absl::make_unique<GPUOperation>(
CreateDepthwiseConvolution2DDynamicWeights(device_info, op_def, attr));
CreateDepthwiseConvolution2DDynamicWeights(gpu_info, op_def, attr));
}
void SelectReshape(int src_channels, int dst_channels,
@ -150,12 +148,12 @@ void SelectStridedSlice(const SliceAttributes& attr, const OperationDef& op_def,
}
absl::Status SelectMean(const MeanAttributes& attr, const OperationDef& op_def,
const DeviceInfo& device_info,
const GpuInfo& gpu_info,
std::unique_ptr<GPUOperation>* ptr) {
if (attr.dims != std::set<Axis>({Axis::HEIGHT, Axis::WIDTH})) {
return absl::UnimplementedError("Mean operation supports only HW plane");
}
Mean operation = CreateMean(op_def, device_info);
Mean operation = CreateMean(op_def, gpu_info);
*ptr = absl::make_unique<Mean>(std::move(operation));
return absl::OkStatus();
}
@ -179,17 +177,17 @@ void SelectTranspose(const TransposeAttributes& attr,
}
std::unique_ptr<GPUOperation> SelectWinograd4x4To36(
const DeviceInfo& device_info, const Padding2D& padding,
const GpuInfo& gpu_info, const Padding2D& padding,
const OperationDef& op_def) {
return absl::make_unique<Winograd4x4To36>(
CreateWinograd4x4To36(device_info, op_def, padding));
CreateWinograd4x4To36(gpu_info, op_def, padding));
}
std::unique_ptr<GPUOperation> SelectWinograd36To4x4(
const DeviceInfo& device_info, const OperationDef& op_def,
const GpuInfo& gpu_info, const OperationDef& op_def,
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases) {
return absl::make_unique<Winograd36To4x4>(
CreateWinograd36To4x4(device_info, op_def, biases));
CreateWinograd36To4x4(gpu_info, op_def, biases));
}
std::unique_ptr<GPUOperation> SelectQuantizeAndDequantize(

View File
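
SelectMean is a good example of the guard style used throughout these selectors: validate the attribute against the single supported configuration and return UnimplementedError otherwise. A sketch of the guard with a plain status type standing in for absl::Status:

#include <cstdio>
#include <set>
#include <string>

enum class Axis { HEIGHT, WIDTH, CHANNELS };

struct Status {
  bool ok;
  std::string message;
};

// Mirrors the diff's guard: the Mean kernel only reduces over the HW plane.
Status SelectMean(const std::set<Axis>& dims) {
  if (dims != std::set<Axis>({Axis::HEIGHT, Axis::WIDTH})) {
    return {false, "Mean operation supports only HW plane"};
  }
  return {true, ""};
}

int main() {
  Status s = SelectMean({Axis::CHANNELS});
  std::printf("ok=%d msg=%s\n", s.ok, s.message.c_str());
  return 0;
}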

@ -29,13 +29,13 @@ namespace gpu {
namespace cl {
std::unique_ptr<GPUOperation> SelectLSTM(const OperationDef& op_def,
const DeviceInfo& device_info);
const GpuInfo& gpu_info);
std::unique_ptr<GPUOperation> SelectReLU(const ReLUAttributes& attr,
const OperationDef& op_def);
std::unique_ptr<GPUOperation> SelectPReLU(const PReLUAttributes& attr,
const DeviceInfo& device_info,
const GpuInfo& gpu_info,
const OperationDef& op_def);
std::unique_ptr<GPUOperation> SelectPooling(const Pooling2DAttributes& attr,
@ -53,12 +53,11 @@ absl::Status SelectResize(const Resize2DAttributes& attr,
absl::Status SelectConcat(const ConcatAttributes& attr,
const std::vector<int>& channels,
const OperationDef& op_def,
const DeviceInfo& device_info,
const OperationDef& op_def, const GpuInfo& gpu_info,
std::unique_ptr<GPUOperation>* ptr);
std::unique_ptr<GPUOperation> SelectDWConvolutionDynamicWeights(
const DepthwiseConvolution2DAttributes& attr, const DeviceInfo& device_info,
const DepthwiseConvolution2DAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def);
void SelectReshape(int src_channels, int dst_channels,
@ -72,7 +71,7 @@ void SelectStridedSlice(const SliceAttributes& attr, const OperationDef& op_def,
std::unique_ptr<GPUOperation>* ptr);
absl::Status SelectMean(const MeanAttributes& attr, const OperationDef& op_def,
const DeviceInfo& device_info,
const GpuInfo& gpu_info,
std::unique_ptr<GPUOperation>* ptr);
void SelectSoftmax(const BHWC& shape, const OperationDef& op_def,
@ -86,12 +85,12 @@ void SelectTranspose(const TransposeAttributes& attr,
const OperationDef& op_def,
std::unique_ptr<GPUOperation>* ptr);
std::unique_ptr<GPUOperation> SelectWinograd4x4To36(
const DeviceInfo& device_info, const Padding2D& padding,
const OperationDef& op_def);
std::unique_ptr<GPUOperation> SelectWinograd4x4To36(const GpuInfo& gpu_info,
const Padding2D& padding,
const OperationDef& op_def);
std::unique_ptr<GPUOperation> SelectWinograd36To4x4(
const DeviceInfo& device_info, const OperationDef& op_def,
const GpuInfo& gpu_info, const OperationDef& op_def,
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases);
std::unique_ptr<GPUOperation> SelectQuantizeAndDequantize(

View File

@ -89,7 +89,7 @@ absl::Status TryDepthwiseConvPlus1x1Conv(
// fully connected + fully connected + add
absl::Status TryFCFCAdd(
const DeviceInfo& device_info, CalculationsPrecision precision,
const GpuInfo& gpu_info, CalculationsPrecision precision,
const GraphFloat32& graph, NodeId first_node_id,
const std::map<ValueId, TensorDescriptor>& tensor_descriptors,
std::set<NodeId>* consumed_nodes, GPUOperationsSubgraph* gpu_subgraph) {
@ -160,7 +160,7 @@ absl::Status TryFCFCAdd(
}
std::unique_ptr<GPUOperation>* gpu_op =
InitSingleOpSubgraph(fc0_inputs, add_outputs, gpu_subgraph);
FCFCAdd fc = CreateFCFCAdd(device_info, op_def, fc0_attr, fc1_attr);
FCFCAdd fc = CreateFCFCAdd(gpu_info, op_def, fc0_attr, fc1_attr);
*gpu_op = absl::make_unique<FCFCAdd>(std::move(fc));
consumed_nodes->insert(fc0_node->id);
consumed_nodes->insert(fc1_node->id);
@ -170,12 +170,12 @@ absl::Status TryFCFCAdd(
} // namespace
absl::Status GPUSubgraphFromGraph(
const DeviceInfo& device_info, CalculationsPrecision precision,
const GpuInfo& gpu_info, CalculationsPrecision precision,
const GraphFloat32& graph, NodeId first_node_id,
const std::map<ValueId, TensorDescriptor>& tensor_descriptors,
std::set<NodeId>* consumed_nodes, GPUOperationsSubgraph* gpu_subgraph,
std::string* name) {
if ((device_info.IsAdreno() || device_info.IsNvidia()) &&
if ((gpu_info.IsAdreno() || gpu_info.IsNvidia()) &&
TryDepthwiseConvPlus1x1Conv(precision, graph, first_node_id,
tensor_descriptors, consumed_nodes,
gpu_subgraph)
@ -183,9 +183,9 @@ absl::Status GPUSubgraphFromGraph(
*name = "depthwise_conv_plus_1x1_conv";
return absl::OkStatus();
}
if ((device_info.IsIntel() || device_info.IsNvidia()) &&
TryFCFCAdd(device_info, precision, graph, first_node_id,
tensor_descriptors, consumed_nodes, gpu_subgraph)
if ((gpu_info.IsIntel() || gpu_info.IsNvidia()) &&
TryFCFCAdd(gpu_info, precision, graph, first_node_id, tensor_descriptors,
consumed_nodes, gpu_subgraph)
.ok()) {
*name = "fully_connected_x2_and_add";
return absl::OkStatus();
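
GPUSubgraphFromGraph gates each fused pattern on the vendor reported by GpuInfo: the depthwise+1x1 fusion is only attempted on Adreno and Nvidia, the FC+FC+Add fusion only on Intel and Nvidia. A hedged sketch of a hypothetical driver (TryFuseAt is an invented name; all inputs are assumed to be prepared by the caller):

// Hypothetical wrapper; RETURN_IF_ERROR comes from the delegate's status helpers.
absl::Status TryFuseAt(const GpuInfo& gpu_info,
                       CalculationsPrecision precision,
                       const GraphFloat32& graph, NodeId node_id,
                       const std::map<ValueId, TensorDescriptor>& descriptors,
                       std::set<NodeId>* consumed,
                       GPUOperationsSubgraph* subgraph) {
  std::string fusion_name;
  RETURN_IF_ERROR(GPUSubgraphFromGraph(gpu_info, precision, graph, node_id,
                                       descriptors, consumed, subgraph,
                                       &fusion_name));
  // On success fusion_name is "depthwise_conv_plus_1x1_conv" (Adreno/Nvidia)
  // or "fully_connected_x2_and_add" (Intel/Nvidia), per the gating above.
  return absl::OkStatus();
}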

View File

@ -31,7 +31,7 @@ namespace gpu {
namespace cl {
absl::Status GPUSubgraphFromGraph(
const DeviceInfo& device_info, CalculationsPrecision precision,
const GpuInfo& gpu_info, CalculationsPrecision precision,
const GraphFloat32& graph, NodeId first_node_id,
const std::map<ValueId, TensorDescriptor>& tensor_descriptors,
std::set<NodeId>* consumed_nodes, GPUOperationsSubgraph* gpu_subgraph,

View File

@ -24,7 +24,7 @@ namespace tflite {
namespace gpu {
namespace cl {
bool CanCreateTensorWithShape(const DeviceInfo& device_info, const BHWDC& shape,
bool CanCreateTensorWithShape(const GpuInfo& gpu_info, const BHWDC& shape,
const TensorDescriptor& descriptor) {
const int slices = DivideRoundUp(shape.c, 4);
switch (descriptor.storage_type) {
@ -33,61 +33,61 @@ bool CanCreateTensorWithShape(const DeviceInfo& device_info, const BHWDC& shape,
4 * (descriptor.data_type == DataType::FLOAT32 ? 4 : 2);
const int buffer_size =
shape.b * shape.w * shape.h * shape.d * slices * flt4_size;
return buffer_size <= device_info.buffer_max_size;
return buffer_size <= gpu_info.buffer_max_size;
}
case TensorStorageType::IMAGE_BUFFER:
return shape.b * shape.w * shape.h * shape.d * slices <=
device_info.image_buffer_max_size;
gpu_info.image_buffer_max_size;
case TensorStorageType::TEXTURE_3D:
if (device_info.cl_version < OpenCLVersion::CL_1_2 && slices == 1) {
if (gpu_info.cl_version < OpenCLVersion::CL_1_2 && slices == 1) {
// clCreateImage3D (used in CL 1.0/1.1) cannot create an image with
// depth = 1, per the specification.
return false;
}
return shape.w * shape.b <= device_info.image3d_max_width &&
shape.h <= device_info.image3d_max_height &&
slices * shape.d <= device_info.image3d_max_depth;
return shape.w * shape.b <= gpu_info.image3d_max_width &&
shape.h <= gpu_info.image3d_max_height &&
slices * shape.d <= gpu_info.image3d_max_depth;
case TensorStorageType::TEXTURE_ARRAY:
// Bug on some Adreno. b/131099086
if (slices == 1 && device_info.IsAdreno() &&
!device_info.adreno_info.support_one_layer_texture_array) {
if (slices == 1 && gpu_info.IsAdreno() &&
!gpu_info.adreno_info.support_one_layer_texture_array) {
return false;
}
return shape.w * shape.b <= device_info.image2d_max_width &&
shape.h <= device_info.image2d_max_height &&
slices * shape.d <= device_info.image_array_max_layers;
return shape.w * shape.b <= gpu_info.image2d_max_width &&
shape.h <= gpu_info.image2d_max_height &&
slices * shape.d <= gpu_info.image_array_max_layers;
case TensorStorageType::TEXTURE_2D:
return shape.w * shape.b * shape.d <= device_info.image2d_max_width &&
shape.h * slices <= device_info.image2d_max_height;
return shape.w * shape.b * shape.d <= gpu_info.image2d_max_width &&
shape.h * slices <= gpu_info.image2d_max_height;
case TensorStorageType::SINGLE_TEXTURE_2D:
return shape.c <= 4 &&
device_info.SupportsFloatImage2D(descriptor.data_type, shape.c) &&
shape.w * shape.b * shape.d <= device_info.image2d_max_width &&
shape.h <= device_info.image2d_max_height;
gpu_info.SupportsFloatImage2D(descriptor.data_type, shape.c) &&
shape.w * shape.b * shape.d <= gpu_info.image2d_max_width &&
shape.h <= gpu_info.image2d_max_height;
default:
return false;
}
}
bool CanCreateTensorWithShape(const DeviceInfo& device_info, const BHWC& shape,
bool CanCreateTensorWithShape(const GpuInfo& gpu_info, const BHWC& shape,
const TensorDescriptor& descriptor) {
const BHWDC shape5D(shape.b, shape.h, shape.w, 1, shape.c);
return CanCreateTensorWithShape(device_info, shape5D, descriptor);
return CanCreateTensorWithShape(gpu_info, shape5D, descriptor);
}
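
The BUFFER branch above is plain arithmetic, so a worked example makes the limit concrete (the tensor shape is hypothetical):

// A 1x224x224x32 FLOAT32 tensor, checked against the BUFFER branch above.
const int slices = DivideRoundUp(32, 4);  // ceil(32 / 4) = 8
const int flt4_size = 4 * 4;              // FLOAT32: 4 channels x 4 bytes = 16
const int buffer_size =
    /*b*/ 1 * /*w*/ 224 * /*h*/ 224 * /*d*/ 1 * slices * flt4_size;  // 6422528
// Creatable as a BUFFER iff buffer_size <= gpu_info.buffer_max_size,
// which typically mirrors the device's CL_DEVICE_MAX_MEM_ALLOC_SIZE.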
TensorStorageType SelectBestStorageType(const DeviceInfo& device_info,
TensorStorageType SelectBestStorageType(const GpuInfo& gpu_info,
const BHWC& shape,
const TensorStorageType& desired,
const DataType& data_type,
const Layout& layout) {
if (CanCreateTensorWithShape(device_info, shape,
if (CanCreateTensorWithShape(gpu_info, shape,
TensorDescriptor{data_type, desired, layout})) {
return desired;
}
auto GetBestTypeAfterTextureArray = [&]() {
if (device_info.SupportsImageBuffer() &&
if (gpu_info.SupportsImageBuffer() &&
CanCreateTensorWithShape(
device_info, shape,
gpu_info, shape,
TensorDescriptor{data_type, TensorStorageType::IMAGE_BUFFER,
layout})) {
return TensorStorageType::IMAGE_BUFFER;
@ -96,9 +96,9 @@ TensorStorageType SelectBestStorageType(const DeviceInfo& device_info,
}
};
auto GetBestTypeAfterTexture2D = [&]() {
if (device_info.SupportsTextureArray() &&
if (gpu_info.SupportsTextureArray() &&
CanCreateTensorWithShape(
device_info, shape,
gpu_info, shape,
TensorDescriptor{data_type, TensorStorageType::TEXTURE_ARRAY,
layout})) {
return TensorStorageType::TEXTURE_ARRAY;
@ -108,7 +108,7 @@ TensorStorageType SelectBestStorageType(const DeviceInfo& device_info,
};
auto GetBestTypeAfterTexture3D = [&]() {
if (CanCreateTensorWithShape(
device_info, shape,
gpu_info, shape,
TensorDescriptor{data_type, TensorStorageType::TEXTURE_2D,
layout})) {
return TensorStorageType::TEXTURE_2D;
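
SelectBestStorageType degrades from the desired type toward more universal ones, re-running CanCreateTensorWithShape at each step. A sketch of the observable behavior (shape and numbers hypothetical):

// Ask for TEXTURE_2D and accept whatever the device limits force.
const BHWC shape(1, 1080, 1920, 128);  // slices = 32
TensorStorageType storage = SelectBestStorageType(
    gpu_info, shape, /*desired=*/TensorStorageType::TEXTURE_2D,
    DataType::FLOAT16, Layout::HWC);
// TEXTURE_2D needs h * slices = 1080 * 32 = 34560 rows, which exceeds
// image2d_max_height on most devices, so the result degrades toward
// TEXTURE_ARRAY, then IMAGE_BUFFER, then BUFFER, whichever first fits.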

View File

@ -25,13 +25,13 @@ namespace tflite {
namespace gpu {
namespace cl {
bool CanCreateTensorWithShape(const DeviceInfo& device_info, const BHWDC& shape,
bool CanCreateTensorWithShape(const GpuInfo& gpu_info, const BHWDC& shape,
const TensorDescriptor& descriptor);
bool CanCreateTensorWithShape(const DeviceInfo& device_info, const BHWC& shape,
bool CanCreateTensorWithShape(const GpuInfo& gpu_info, const BHWC& shape,
const TensorDescriptor& descriptor);
TensorStorageType SelectBestStorageType(const DeviceInfo& device_info,
TensorStorageType SelectBestStorageType(const GpuInfo& gpu_info,
const BHWC& shape,
const TensorStorageType& desired,
const DataType& data_type,