DeviceInfo renamed to GpuInfo.
PiperOrigin-RevId: 341692138 Change-Id: I129db695d3cabaa423059822ecaf75111e98c878
Parent: 916f71f7f6
Commit: 83a51f49fa
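This is a mechanical API rename across the OpenCL delegate: the struct `DeviceInfo` becomes `GpuInfo`, `device_info` parameters become `gpu_info`, and `DeviceInfoFromDeviceID` / `GetDeviceInfo()` become `GpuInfoFromDeviceID` / `GetGpuInfo()`. A minimal sketch of what a caller sees after this commit; the wrapper function and the header path are assumptions for illustration, only the renamed identifiers come from the diff below:

```cpp
#include "tensorflow/lite/delegates/gpu/cl/device_info.h"  // header path assumed

namespace tflite {
namespace gpu {
namespace cl {

// Hypothetical caller used only to illustrate the rename.
bool IsAdreno3xx(const GpuInfo& gpu_info) {  // was: const DeviceInfo&
  // Same helpers as before the rename, just on the renamed type.
  return gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx();
}

}  // namespace cl
}  // namespace gpu
}  // namespace tflite
```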
@@ -136,12 +136,12 @@ std::string GetImageModifier(AccessType access) {
}
}

- std::string GetDefaultSamplers(const DeviceInfo& device_info) {
+ std::string GetDefaultSamplers(const GpuInfo& gpu_info) {
std::string result;
result +=
"__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | "
"CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n";
- if (device_info.IsAdreno() && device_info.adreno_info.IsAdreno3xx()) {
+ if (gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx()) {
// Unfortunately, CLK_ADDRESS_CLAMP is very slow on Adreno3xx and
// we can observe huge register overhead when compared to other modes.

@@ -209,7 +209,7 @@ absl::Status CreateCLObject(GPUObjectDescriptor* desc, CLContext* context,
constexpr char CLArguments::kArgsPrefix[];

absl::Status CLArguments::Init(
- const DeviceInfo& device_info,
+ const GpuInfo& gpu_info,
const std::map<std::string, std::string>& linkables, CLContext* context,
Arguments* args, std::string* code) {
RETURN_IF_ERROR(AllocateObjects(*args, context));
@@ -217,22 +217,22 @@ absl::Status CLArguments::Init(
RETURN_IF_ERROR(ResolveSelectorsPass(*args, linkables, code));
object_refs_ = std::move(args->object_refs_);
args->GetActiveArguments(kArgsPrefix, *code);
- const bool use_f32_for_halfs = device_info.IsPowerVR();
+ const bool use_f32_for_halfs = gpu_info.IsPowerVR();
CopyArguments(*args, use_f32_for_halfs);
RETURN_IF_ERROR(SetObjectsResources(*args));
RenameArgumentsInCode(code);
ResolveArgsPass(code);
*code = absl::Substitute(*code, GetListOfArgs());
- *code = GetDefaultSamplers(device_info) + *code;
+ *code = GetDefaultSamplers(gpu_info) + *code;
return absl::OkStatus();
}

- absl::Status CLArguments::Init(const DeviceInfo& device_info, Arguments* args,
+ absl::Status CLArguments::Init(const GpuInfo& gpu_info, Arguments* args,
CLContext* context) {
RETURN_IF_ERROR(AllocateObjects(*args, context));
RETURN_IF_ERROR(AddObjectArgs(args));
object_refs_ = std::move(args->object_refs_);
- const bool use_f32_for_halfs = device_info.IsPowerVR();
+ const bool use_f32_for_halfs = gpu_info.IsPowerVR();
CopyArguments(*args, use_f32_for_halfs);
RETURN_IF_ERROR(SetObjectsResources(*args));
return absl::OkStatus();
@@ -34,10 +34,10 @@ class CLArguments : public ArgumentsBinder {
public:
CLArguments() = default;

- absl::Status Init(const DeviceInfo& device_info,
+ absl::Status Init(const GpuInfo& gpu_info,
const std::map<std::string, std::string>& linkables,
CLContext* context, Arguments* args, std::string* code);
- absl::Status Init(const DeviceInfo& device_info, Arguments* args,
+ absl::Status Init(const GpuInfo& gpu_info, Arguments* args,
CLContext* context);

// Temporary, will be resolved later
@@ -44,8 +44,8 @@ __kernel void main_function($0) {
})";

CLArguments cl_args;
- DeviceInfo device_info;
- ASSERT_OK(cl_args.Init(device_info, {}, nullptr, &args, &sample_code));
+ GpuInfo gpu_info;
+ ASSERT_OK(cl_args.Init(gpu_info, {}, nullptr, &args, &sample_code));
EXPECT_TRUE(absl::StrContains(sample_code, "value = weights_buffer[id];"));
EXPECT_TRUE(
absl::StrContains(sample_code, "__global float4* weights_buffer"));
@@ -66,9 +66,8 @@ TEST(CLArgumentsTest, TestNoSelector) {
}
)";
CLArguments cl_args;
- DeviceInfo device_info;
- EXPECT_FALSE(
- cl_args.Init(device_info, {}, nullptr, &args, &sample_code).ok());
+ GpuInfo gpu_info;
+ EXPECT_FALSE(cl_args.Init(gpu_info, {}, nullptr, &args, &sample_code).ok());
}

}  // namespace cl
@@ -216,19 +216,19 @@ ProfilingInfo ProfilingCommandQueue::GetProfilingInfo() const {
}

absl::Status ProfilingCommandQueue::GetBestWorkGroupIndex(
- const CLKernel& kernel, const DeviceInfo& device_info,
+ const CLKernel& kernel, const GpuInfo& gpu_info,
const std::vector<int3>& work_groups_count,
const std::vector<int3>& work_group_sizes, int* index) {
// Some Adreno 3xx can have wrong numbers for some events
const bool possible_bug_with_events =
- device_info.IsAdreno() && device_info.adreno_info.IsAdreno3xx();
+ gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx();
events_.resize(work_group_sizes.size());
for (int i = 0; i < work_group_sizes.size(); ++i) {
RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count[i],
work_group_sizes[i], &events_[i]));

// reducing the speed of memory leak on Mali for some kernels
- if (device_info.IsMali() && i % 8 == 7) {
+ if (gpu_info.IsMali() && i % 8 == 7) {
events_[i - 7].Wait();
}
if (possible_bug_with_events) {
@@ -240,7 +240,7 @@ absl::Status ProfilingCommandQueue::GetBestWorkGroupIndex(
RETURN_IF_ERROR(WaitForCompletion());

// To release memory of some kernel pool on Mali.
- if (device_info.IsMali()) {
+ if (gpu_info.IsMali()) {
RETURN_IF_ERROR(kernel.ReInit());
}

@@ -116,7 +116,7 @@ class ProfilingCommandQueue : public CLCommandQueue {

// will write index for fastest work_group among work_group_sizes
absl::Status GetBestWorkGroupIndex(const CLKernel& kernel,
- const DeviceInfo& device_info,
+ const GpuInfo& gpu_info,
const std::vector<int3>& work_groups_count,
const std::vector<int3>& work_group_sizes,
int* index);
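For reference, a sketch of how the retuned declaration above is consumed. The helper below is illustrative rather than part of the commit, but the `GetBestWorkGroupIndex` signature and the meaning of `index` follow the header comment ("will write index for fastest work_group"):

```cpp
// Illustrative only: pick the fastest work-group size for a compiled kernel.
absl::Status PickBestWorkGroup(ProfilingCommandQueue* queue,
                               const CLKernel& kernel, const GpuInfo& gpu_info,
                               const std::vector<int3>& work_groups_count,
                               const std::vector<int3>& work_group_sizes,
                               int3* best_work_group) {
  int index = 0;
  RETURN_IF_ERROR(queue->GetBestWorkGroupIndex(
      kernel, gpu_info, work_groups_count, work_group_sizes, &index));
  // index refers to the fastest entry of work_group_sizes.
  *best_work_group = work_group_sizes[index];
  return absl::OkStatus();
}
```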
@@ -50,7 +50,7 @@ bool IsEqualToImageFormat(cl_image_format image_format, DataType data_type,
image_format.image_channel_order == ToChannelOrder(num_channels);
}

- void AddSupportedImageFormats(cl_context context, DeviceInfo* info) {
+ void AddSupportedImageFormats(cl_context context, GpuInfo* info) {
auto supported_formats =
GetSupportedImage2DFormats(context, CL_MEM_READ_WRITE);
for (auto format : supported_formats) {
@@ -156,8 +156,8 @@ bool IsGPUVersionInRange(int gpu_version, int min_version, int max_version) {
}
}  // namespace

- DeviceInfo DeviceInfoFromDeviceID(cl_device_id id) {
- DeviceInfo info;
+ GpuInfo GpuInfoFromDeviceID(cl_device_id id) {
+ GpuInfo info;
const auto device_name = GetDeviceInfo<std::string>(id, CL_DEVICE_NAME);
const auto vendor_name = GetDeviceInfo<std::string>(id, CL_DEVICE_VENDOR);
const auto opencl_c_version =
@@ -267,7 +267,7 @@ DeviceInfo DeviceInfoFromDeviceID(cl_device_id id) {
}

CLDevice::CLDevice(cl_device_id id, cl_platform_id platform_id)
- : info_(DeviceInfoFromDeviceID(id)), id_(id), platform_id_(platform_id) {}
+ : info_(GpuInfoFromDeviceID(id)), id_(id), platform_id_(platform_id) {}

CLDevice::CLDevice(const CLDevice& device)
: info_(device.info_), id_(device.id_), platform_id_(device.platform_id_) {}
@@ -67,10 +67,10 @@ class CLDevice {
// To track bug on some Adreno. b/131099086
void DisableOneLayerTextureArray();

- const DeviceInfo& GetInfo() const { return info_; }
+ const GpuInfo& GetInfo() const { return info_; }
// We update device info during context creation, so as supported texture
// formats can be requested from context only.
- mutable DeviceInfo info_;
+ mutable GpuInfo info_;

private:
cl_device_id id_ = nullptr;
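A small sketch of the call-site effect of the header change above: `CLDevice::GetInfo()` now hands back a `GpuInfo`. The helper function itself is hypothetical; only `GetInfo()`, `GpuInfo`, and `SupportsImage3D()` appear in this diff:

```cpp
// Hypothetical helper showing the renamed return type of CLDevice::GetInfo().
bool DeviceCanUseImage3D(const CLDevice& device) {
  const GpuInfo& gpu_info = device.GetInfo();  // was: const DeviceInfo&
  // SupportsImage3D() already rejects Mali Midgard internally (see device_info.cc).
  return gpu_info.SupportsImage3D();
}
```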
@@ -296,15 +296,15 @@ bool MaliInfo::IsValhall() const {
gpu_version == MaliGPU::G68 || gpu_version == MaliGPU::G78;
}

- bool DeviceInfo::SupportsTextureArray() const {
+ bool GpuInfo::SupportsTextureArray() const {
return cl_version >= OpenCLVersion::CL_1_2;
}

- bool DeviceInfo::SupportsImageBuffer() const {
+ bool GpuInfo::SupportsImageBuffer() const {
return cl_version >= OpenCLVersion::CL_1_2;
}

- bool DeviceInfo::SupportsImage3D() const {
+ bool GpuInfo::SupportsImage3D() const {
if (IsMali() && mali_info.IsMidgard()) {
// On Mali T880 read_imageh doesn't compile with image3d_t
return false;
@@ -312,7 +312,7 @@ bool DeviceInfo::SupportsImage3D() const {
return supports_image3d_writes;
}

- bool DeviceInfo::SupportsFloatImage2D(DataType data_type, int channels) const {
+ bool GpuInfo::SupportsFloatImage2D(DataType data_type, int channels) const {
if (channels == 1) {
return data_type == DataType::FLOAT32 ? supports_r_f32_tex2d
: supports_r_f16_tex2d;
@@ -330,7 +330,7 @@ bool DeviceInfo::SupportsFloatImage2D(DataType data_type, int channels) const {
}
}

- bool DeviceInfo::SupportsExtension(const std::string& extension) const {
+ bool GpuInfo::SupportsExtension(const std::string& extension) const {
for (const auto& ext : extensions) {
if (ext == extension) {
return true;
@@ -339,13 +339,13 @@ bool DeviceInfo::SupportsExtension(const std::string& extension) const {
return false;
}

- bool DeviceInfo::IsCL20OrHigher() const {
+ bool GpuInfo::IsCL20OrHigher() const {
return cl_version != OpenCLVersion::CL_1_0 &&
cl_version != OpenCLVersion::CL_1_1 &&
cl_version != OpenCLVersion::CL_1_2;
}

- bool DeviceInfo::SupportsSubGroupWithSize(int sub_group_size) const {
+ bool GpuInfo::SupportsSubGroupWithSize(int sub_group_size) const {
for (auto subgroup_size : supported_subgroup_sizes) {
if (sub_group_size == subgroup_size) {
return true;
@@ -354,19 +354,19 @@ bool DeviceInfo::SupportsSubGroupWithSize(int sub_group_size) const {
return false;
}

- bool DeviceInfo::IsAdreno() const { return gpu_vendor == GpuVendor::kQualcomm; }
+ bool GpuInfo::IsAdreno() const { return gpu_vendor == GpuVendor::kQualcomm; }

- bool DeviceInfo::IsApple() const { return gpu_vendor == GpuVendor::kApple; }
+ bool GpuInfo::IsApple() const { return gpu_vendor == GpuVendor::kApple; }

- bool DeviceInfo::IsMali() const { return gpu_vendor == GpuVendor::kMali; }
+ bool GpuInfo::IsMali() const { return gpu_vendor == GpuVendor::kMali; }

- bool DeviceInfo::IsPowerVR() const { return gpu_vendor == GpuVendor::kPowerVR; }
+ bool GpuInfo::IsPowerVR() const { return gpu_vendor == GpuVendor::kPowerVR; }

- bool DeviceInfo::IsNvidia() const { return gpu_vendor == GpuVendor::kNvidia; }
+ bool GpuInfo::IsNvidia() const { return gpu_vendor == GpuVendor::kNvidia; }

- bool DeviceInfo::IsAMD() const { return gpu_vendor == GpuVendor::kAMD; }
+ bool GpuInfo::IsAMD() const { return gpu_vendor == GpuVendor::kAMD; }

- bool DeviceInfo::IsIntel() const { return gpu_vendor == GpuVendor::kIntel; }
+ bool GpuInfo::IsIntel() const { return gpu_vendor == GpuVendor::kIntel; }

}  // namespace cl
}  // namespace gpu
@@ -176,8 +176,8 @@ struct MaliInfo {
bool IsValhall() const;
};

- struct DeviceInfo {
- DeviceInfo() = default;
+ struct GpuInfo {
+ GpuInfo() = default;

bool IsAdreno() const;
bool IsApple() const;
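The renamed struct keeps the same capability surface; a hedged sketch of a typical query, using only members that appear elsewhere in this diff (`IsIntel`, `SupportsExtension`, `SupportsSubGroupWithSize`). The probe function is not part of the commit:

```cpp
// Illustrative capability probe against the renamed GpuInfo struct.
bool CanUseIntelSubgroup16(const GpuInfo& gpu_info) {
  return gpu_info.IsIntel() &&
         gpu_info.SupportsExtension("cl_intel_required_subgroup_size") &&
         gpu_info.SupportsSubGroupWithSize(16);
}
```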
@@ -173,7 +173,7 @@ bool Environment::IsSupported(TensorStorageType storage_type) const {
return false;
}

- TensorStorageType GetFastestStorageType(const DeviceInfo& gpu_info) {
+ TensorStorageType GetFastestStorageType(const GpuInfo& gpu_info) {
if (gpu_info.IsAdreno()) {
if (gpu_info.adreno_info.IsAdreno6xxOrHigher()) {
return TensorStorageType::TEXTURE_ARRAY;
@@ -203,7 +203,7 @@ TensorStorageType GetFastestStorageType(const DeviceInfo& gpu_info) {
}

TensorStorageType GetStorageTypeWithMinimalMemoryConsumption(
- const DeviceInfo& gpu_info) {
+ const GpuInfo& gpu_info) {
if (gpu_info.IsAdreno()) {
if (gpu_info.adreno_info.IsAdreno3xx() ||
gpu_info.adreno_info.IsAdreno4xx()) {
@@ -75,9 +75,9 @@ class Environment {
ProgramCache program_cache_;
};

- TensorStorageType GetFastestStorageType(const DeviceInfo& gpu_info);
+ TensorStorageType GetFastestStorageType(const GpuInfo& gpu_info);
TensorStorageType GetStorageTypeWithMinimalMemoryConsumption(
- const DeviceInfo& gpu_info);
+ const GpuInfo& gpu_info);

absl::Status CreateEnvironment(Environment* result);

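Both storage-type helpers declared above now take `GpuInfo`. A minimal usage sketch; the `prefer_low_memory` flag and the wrapper are assumptions for the example, not part of the API:

```cpp
TensorStorageType ChooseStorageType(const GpuInfo& gpu_info,
                                    bool prefer_low_memory) {
  return prefer_low_memory
             ? GetStorageTypeWithMinimalMemoryConsumption(gpu_info)
             : GetFastestStorageType(gpu_info);
}
```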
@@ -160,7 +160,7 @@ absl::Status InferenceContext::InitFromGraph(
creation_context.queue = env->queue();
creation_context.cache = env->program_cache();

- ReserveGraphTensors(create_info, creation_context.GetDeviceInfo(), graph);
+ ReserveGraphTensors(create_info, creation_context.GetGpuInfo(), graph);
precision_ = create_info.precision;
storage_type_ = create_info.storage_type;
if (env->device().IsMali()) {
@@ -174,7 +174,7 @@ absl::Status InferenceContext::InitFromGraph(
need_flush_ = true;
}
CopyInAndOutIds(graph);
- RETURN_IF_ERROR(ConvertOperations(creation_context.GetDeviceInfo(), graph,
+ RETURN_IF_ERROR(ConvertOperations(creation_context.GetGpuInfo(), graph,
create_info.hints));
RETURN_IF_ERROR(Merge());
RETURN_IF_ERROR(AllocateMemory(creation_context.context));
@@ -284,7 +284,7 @@ void InferenceContext::CopyInAndOutIds(const GraphFloat32& graph) {
}

void InferenceContext::ReserveGraphTensors(
- const CreateInferenceInfo& create_info, const DeviceInfo& device_info,
+ const CreateInferenceInfo& create_info, const GpuInfo& gpu_info,
const GraphFloat32& graph) {
ValueId max_id = 0;
auto tensors = graph.values();
@@ -296,14 +296,14 @@ void InferenceContext::ReserveGraphTensors(
if (graph.IsGraphInput(t->id) || graph.IsGraphOutput(t->id)) {
if (shape.c < 4 &&
CanCreateTensorWithShape(
- device_info, shape,
+ gpu_info, shape,
TensorDescriptor{data_type, TensorStorageType::SINGLE_TEXTURE_2D,
layout})) {
storage_type = TensorStorageType::SINGLE_TEXTURE_2D;
}
}
- storage_type = SelectBestStorageType(device_info, shape, storage_type,
- data_type, layout);
+ storage_type =
+ SelectBestStorageType(gpu_info, shape, storage_type, data_type, layout);
tensor_reserver_.Add(
t->id, {shape, TensorDescriptor{data_type, storage_type, layout}});
max_id = std::max(max_id, t->id);
@@ -311,7 +311,7 @@ void InferenceContext::ReserveGraphTensors(
tensor_reserver_.SetNext(max_id + 1);
}

- absl::Status InferenceContext::ConvertOperations(const DeviceInfo& device_info,
+ absl::Status InferenceContext::ConvertOperations(const GpuInfo& gpu_info,
const GraphFloat32& graph,
ModelHints hints) {
std::map<ValueId, TensorDescriptor> tensor_descriptors;
@@ -335,7 +335,7 @@ absl::Status InferenceContext::ConvertOperations(const DeviceInfo& device_info,
std::string op_name = node.operation.type + " " + std::to_string(node.id);
GPUOperationsSubgraph gpu_subgraph;
if (hints.Check(ModelHints::kAllowSpecialKernels) &&
- GPUSubgraphFromGraph(device_info, precision_, graph, node.id,
+ GPUSubgraphFromGraph(gpu_info, precision_, graph, node.id,
tensor_descriptors, &consumed_nodes, &gpu_subgraph,
&op_name)
.ok()) {
@@ -375,7 +375,7 @@ absl::Status InferenceContext::ConvertOperations(const DeviceInfo& device_info,
op_def.dst_tensors.push_back(
tensor_reserver_.Get(outputs[j]->id).descriptor);
}
- RETURN_IF_ERROR(GPUOperationFromNode(device_info, op_def, hints, inputs,
+ RETURN_IF_ERROR(GPUOperationFromNode(gpu_info, op_def, hints, inputs,
outputs, node, &gpu_subgraph));
}
absl::flat_hash_map<int, ValueId> mapping_to_global_ids;

@@ -111,12 +111,11 @@ class InferenceContext {
InferenceContext* inference);

void CopyInAndOutIds(const GraphFloat32& graph);
- absl::Status ConvertOperations(const DeviceInfo& device_info,
+ absl::Status ConvertOperations(const GpuInfo& gpu_info,
const GraphFloat32& graph, ModelHints hints);
void CreateLinks();
void ReserveGraphTensors(const CreateInferenceInfo& create_info,
- const DeviceInfo& device_info,
- const GraphFloat32& graph);
+ const GpuInfo& gpu_info, const GraphFloat32& graph);
absl::Status Merge();
absl::Status AllocateMemory(CLContext* context);

@@ -48,6 +48,7 @@ cc_library(
":gpu_operation",
"//tensorflow/lite/delegates/gpu/cl:environment",
"//tensorflow/lite/delegates/gpu/cl:opencl_wrapper",
"//tensorflow/lite/delegates/gpu/cl:tensor",
"//tensorflow/lite/delegates/gpu/common:shape",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:tensor",

@@ -15,6 +15,7 @@ limitations under the License.

#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"

#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"

namespace tflite {
@@ -124,7 +124,7 @@ std::string GetConcatKernelCode(const OperationDef& op_def,

GPUOperation CreateConcatZ(const OperationDef& definition,
const std::vector<int>& channels,
- const DeviceInfo& device_info) {
+ const GpuInfo& gpu_info) {
GPUOperation op(definition);
for (int i = 0; i < definition.src_tensors.size(); ++i) {
const std::string name = "src_tensor_" + std::to_string(i);
@@ -140,14 +140,13 @@ GPUOperation CreateConcatZ(const OperationDef& definition,
}
op.AddDstTensor("dst_tensor", dst_desc);
op.code_ = GetConcatKernelCode(definition, channels);
- if (device_info.IsPowerVR() &&
+ if (gpu_info.IsPowerVR() &&
definition.precision == CalculationsPrecision::F32 &&
!IsAllChannelsX4(channels)) {
// BUG, some PowerVRs (GE8320) produce incorrect result without it
op.compiler_options_.push_back(CompilerOptions::CL_OPT_DISABLE);
}
- if (device_info.IsAMD() &&
- definition.precision != CalculationsPrecision::F32 &&
+ if (gpu_info.IsAMD() && definition.precision != CalculationsPrecision::F32 &&
definition.src_tensors[0].storage_type != TensorStorageType::BUFFER &&
!IsAllChannelsX4(channels)) {
// BUG, some AMD gpus crash without it

@@ -31,7 +31,7 @@ namespace cl {

GPUOperation CreateConcatZ(const OperationDef& definition,
const std::vector<int>& channels,
- const DeviceInfo& device_info);
+ const GpuInfo& gpu_info);

}  // namespace cl
}  // namespace gpu
@@ -79,19 +79,19 @@ std::string GetComputationPart(const int3& block_size, int element_size,
return c;
}

- ConvBuffer1x1::ConvParams GetBestParams(const DeviceInfo& device_info,
+ ConvBuffer1x1::ConvParams GetBestParams(const GpuInfo& gpu_info,
const OperationDef& definition,
const BHWC& shape, int src_depth,
int dst_depth) {
ConvBuffer1x1::ConvParams conv_params;
conv_params.element_size = 4;
conv_params.block_size = int3(1, 1, 1);
- if (!device_info.IsMali()) {
+ if (!gpu_info.IsMali()) {
return conv_params;
}
bool can_use_flt8 = (shape.w * shape.b) % 2 == 0 &&
definition.precision != CalculationsPrecision::F32;
- bool is_midgard = device_info.IsMali() && device_info.mali_info.IsMidgard();
+ bool is_midgard = gpu_info.IsMali() && gpu_info.mali_info.IsMidgard();
if (is_midgard) {
if (can_use_flt8) {
conv_params.element_size = 8;
@@ -103,8 +103,8 @@ ConvBuffer1x1::ConvParams GetBestParams(const DeviceInfo& device_info,
}

int task_size = shape.w * shape.b * shape.h * dst_depth;
- int block_size = GetRecommendedBlockSizeForConv(
- device_info, definition.precision, task_size);
+ int block_size =
+ GetRecommendedBlockSizeForConv(gpu_info, definition.precision, task_size);

if (!can_use_flt8 && block_size > 4) {
block_size = 4;
@@ -132,15 +132,14 @@ ConvBuffer1x1::ConvParams GetBestParams(const DeviceInfo& device_info,
return conv_params;
}

- ConvBuffer1x1::ConvParams GetBestParams(const DeviceInfo& device_info,
+ ConvBuffer1x1::ConvParams GetBestParams(const GpuInfo& gpu_info,
const OperationDef& definition,
int src_depth, int dst_depth) {
ConvBuffer1x1::ConvParams conv_params;
conv_params.element_size = 4;
conv_params.block_size = int3(1, 1, 1);
- if (device_info.IsMali() &&
- definition.precision == CalculationsPrecision::F16 &&
- device_info.compute_units_count <= 4) {
+ if (gpu_info.IsMali() && definition.precision == CalculationsPrecision::F16 &&
+ gpu_info.compute_units_count <= 4) {
conv_params.block_size.x *= 2;
}
return conv_params;
@@ -315,9 +314,9 @@ int3 ConvBuffer1x1::GetGridSize() const {
}

void ConvBuffer1x1::GetPossibleKernelWorkGroups(
- TuningType tuning_type, const DeviceInfo& device_info,
+ TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
- GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_,
+ GetPossibleWorkGroupsConv(tuning_type, gpu_info, kernel_info, grid_size_,
work_groups);
}

@@ -344,7 +343,7 @@ bool IsConvBuffer1x1Supported(const OperationDef& definition,
attr.padding.appended.w == 0 && attr.padding.appended.h == 0;
}

- ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
+ ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* shape) {
@@ -353,16 +352,16 @@ ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
ConvBuffer1x1::ConvParams conv_params;
if (shape) {
conv_params =
- GetBestParams(device_info, definition, *shape, src_depth, dst_depth);
+ GetBestParams(gpu_info, definition, *shape, src_depth, dst_depth);
} else {
- conv_params = GetBestParams(device_info, definition, src_depth, dst_depth);
+ conv_params = GetBestParams(gpu_info, definition, src_depth, dst_depth);
}
ConvBuffer1x1 result(definition, conv_params);
result.UploadData(attr.weights, attr.bias);
return result;
}

- ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
+ ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
const BHWC* shape) {
@@ -371,9 +370,9 @@ ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
ConvBuffer1x1::ConvParams conv_params;
if (shape) {
conv_params =
- GetBestParams(device_info, definition, *shape, src_depth, dst_depth);
+ GetBestParams(gpu_info, definition, *shape, src_depth, dst_depth);
} else {
- conv_params = GetBestParams(device_info, definition, src_depth, dst_depth);
+ conv_params = GetBestParams(gpu_info, definition, src_depth, dst_depth);
}
conv_params.block_size.x *= conv_params.block_size.y;
conv_params.block_size.y = 1;
@@ -383,16 +382,16 @@ ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
}

ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(
- const DeviceInfo& device_info, const OperationDef& definition,
+ const GpuInfo& gpu_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC* shape) {
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
ConvBuffer1x1::ConvParams conv_params;
if (shape) {
conv_params =
- GetBestParams(device_info, definition, *shape, src_depth, dst_depth);
+ GetBestParams(gpu_info, definition, *shape, src_depth, dst_depth);
} else {
- conv_params = GetBestParams(device_info, definition, src_depth, dst_depth);
+ conv_params = GetBestParams(gpu_info, definition, src_depth, dst_depth);
}
conv_params.block_size.x *= conv_params.block_size.y;
conv_params.block_size.y = 1;
@@ -403,17 +402,17 @@ ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(
}

ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(
- const DeviceInfo& device_info, const OperationDef& definition,
+ const GpuInfo& gpu_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC& weights_shape,
const BHWC* dst_shape) {
const int dst_depth = DivideRoundUp(weights_shape.b, 4);
const int src_depth = DivideRoundUp(weights_shape.c, 4);
ConvBuffer1x1::ConvParams conv_params;
if (dst_shape) {
- conv_params = GetBestParams(device_info, definition, *dst_shape, src_depth,
- dst_depth);
+ conv_params =
+ GetBestParams(gpu_info, definition, *dst_shape, src_depth, dst_depth);
} else {
- conv_params = GetBestParams(device_info, definition, src_depth, dst_depth);
+ conv_params = GetBestParams(gpu_info, definition, src_depth, dst_depth);
}
ConvBuffer1x1 result(definition, conv_params);
result.UploadBiases(attr.bias);
@@ -47,7 +47,7 @@ class ConvBuffer1x1 : public GPUOperation {
ConvBuffer1x1& operator=(const ConvBuffer1x1&) = delete;

void GetPossibleKernelWorkGroups(
- TuningType tuning_type, const DeviceInfo& device_info,
+ TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override;
int3 GetGridSize() const override;
@@ -71,19 +71,19 @@ class ConvBuffer1x1 : public GPUOperation {

private:
ConvBuffer1x1(const OperationDef& definition, const ConvParams& conv_params);
- friend ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
+ friend ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* shape);
- friend ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
+ friend ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
const BHWC* shape);
friend ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(
- const DeviceInfo& device_info, const OperationDef& definition,
+ const GpuInfo& gpu_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC* shape);
friend ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(
- const DeviceInfo& device_info, const OperationDef& definition,
+ const GpuInfo& gpu_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC& weights_shape,
const BHWC* dst_shape);

@@ -177,23 +177,23 @@ bool IsConvBuffer1x1Supported(const OperationDef& definition,
const BHWC& weights_shape,
const Convolution2DAttributes& attr);

- ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
+ ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* shape = nullptr);

- ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo& device_info,
+ ConvBuffer1x1 CreateConvBuffer1x1(const GpuInfo& gpu_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
const BHWC* shape = nullptr);

ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(
- const DeviceInfo& device_info, const OperationDef& definition,
+ const GpuInfo& gpu_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC& weights_shape,
const BHWC* dst_shape = nullptr);

ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(
- const DeviceInfo& device_info, const OperationDef& definition,
+ const GpuInfo& gpu_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC* shape = nullptr);

}  // namespace cl
@ -57,7 +57,7 @@ TEST_F(OpenCLOperationTest, ConvBuffer1x1SimpleWeights) {
|
||||
{data_type, TensorStorageType::BUFFER, Layout::HWC});
|
||||
TensorFloat32 dst_tensor;
|
||||
ConvBuffer1x1 operation = CreateConvBuffer1x1(
|
||||
creation_context_.GetDeviceInfo(), op_def, attr, &src_tensor.shape);
|
||||
creation_context_.GetGpuInfo(), op_def, attr, &src_tensor.shape);
|
||||
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
|
||||
BHWC(1, 2, 1, 2), &dst_tensor));
|
||||
EXPECT_THAT(dst_tensor.data,
|
||||
@ -92,7 +92,7 @@ TEST_F(OpenCLOperationTest, ConvBuffer1x1) {
|
||||
{data_type, TensorStorageType::BUFFER, Layout::HWC});
|
||||
TensorFloat32 dst_tensor;
|
||||
ConvBuffer1x1 operation = CreateConvBuffer1x1(
|
||||
creation_context_.GetDeviceInfo(), op_def, attr, &src_tensor.shape);
|
||||
creation_context_.GetGpuInfo(), op_def, attr, &src_tensor.shape);
|
||||
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
|
||||
BHWC(1, 2, 1, 4), &dst_tensor));
|
||||
EXPECT_THAT(dst_tensor.data,
|
||||
|
@ -36,7 +36,7 @@ int GetAdrenoOptimalMaxConstantSize(const AdrenoInfo& adreno_info) {
|
||||
}
|
||||
}
|
||||
|
||||
int GetOptimalMaxConstantSize(const DeviceInfo& info) {
|
||||
int GetOptimalMaxConstantSize(const GpuInfo& info) {
|
||||
if (!info.IsAdreno()) {
|
||||
// In general we do not expect that this kernel will be used with non Adreno
|
||||
// so as it tuned for __constant memory that have big profit on Adreno
|
||||
@ -237,11 +237,10 @@ bool IsDotConvBetter(int src_channels, int dst_channels) {
|
||||
|
||||
} // namespace
|
||||
|
||||
bool IsConvConstantsSupported(const DeviceInfo& device_info,
|
||||
bool IsConvConstantsSupported(const GpuInfo& gpu_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr) {
|
||||
if (device_info.IsAMD() &&
|
||||
definition.precision != CalculationsPrecision::F32 &&
|
||||
if (gpu_info.IsAMD() && definition.precision != CalculationsPrecision::F32 &&
|
||||
definition.src_tensors[0].storage_type != TensorStorageType::BUFFER) {
|
||||
// BUG, some AMD gpus crashe without it
|
||||
return false;
|
||||
@ -259,12 +258,12 @@ bool IsConvConstantsSupported(const DeviceInfo& device_info,
|
||||
? sizeof(float)
|
||||
: sizeof(half);
|
||||
const int filters_buffer_size = filters_count * float_size;
|
||||
const int kConstantMaxSize = GetOptimalMaxConstantSize(device_info);
|
||||
const int kConstantMaxSize = GetOptimalMaxConstantSize(gpu_info);
|
||||
const int flt4_registers = DivideRoundUp(w_shape.o, 4);
|
||||
return filters_buffer_size <= kConstantMaxSize && flt4_registers <= 8;
|
||||
}
|
||||
|
||||
GPUOperation CreateConvConstants(const DeviceInfo& device_info,
|
||||
GPUOperation CreateConvConstants(const GpuInfo& gpu_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr) {
|
||||
const bool use_dot_conv =
|
||||
@ -286,11 +285,11 @@ GPUOperation CreateConvConstants(const DeviceInfo& device_info,
|
||||
op.code_ = GenerateConvolutionConstantCode(
|
||||
definition, attr.weights.shape, stride_correction, use_dot_conv, &op);
|
||||
if (definition.precision == CalculationsPrecision::F16 &&
|
||||
device_info.IsAdreno() && device_info.adreno_info.IsAdreno3xx()) {
|
||||
gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx()) {
|
||||
op.compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
|
||||
}
|
||||
if (definition.precision != CalculationsPrecision::F32 &&
|
||||
device_info.IsPowerVR()) {
|
||||
gpu_info.IsPowerVR()) {
|
||||
// BUG, some PowerVRs (GE8320) produce incorrect result without it
|
||||
op.compiler_options_.push_back(CompilerOptions::CL_OPT_DISABLE);
|
||||
}
|
||||
|
@ -152,11 +152,11 @@ void UploadWeightsForConvConstants(const tflite::gpu::Tensor<OHWI, T>& weights,
|
||||
absl::make_unique<BufferDescriptor>(std::move(desc)));
|
||||
}
|
||||
|
||||
bool IsConvConstantsSupported(const DeviceInfo& device_info,
|
||||
bool IsConvConstantsSupported(const GpuInfo& gpu_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr);
|
||||
|
||||
GPUOperation CreateConvConstants(const DeviceInfo& device_info,
|
||||
GPUOperation CreateConvConstants(const GpuInfo& gpu_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr);
|
||||
|
||||
|
@ -56,7 +56,7 @@ TEST_F(OpenCLOperationTest, ConvConstantsSimpleWeights) {
|
||||
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
|
||||
TensorFloat32 dst_tensor;
|
||||
GPUOperation operation =
|
||||
CreateConvConstants(creation_context_.GetDeviceInfo(), op_def, attr);
|
||||
CreateConvConstants(creation_context_.GetGpuInfo(), op_def, attr);
|
||||
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
|
||||
BHWC(1, 2, 2, 1), &dst_tensor));
|
||||
EXPECT_THAT(dst_tensor.data,
|
||||
@ -91,7 +91,7 @@ TEST_F(OpenCLOperationTest, ConvConstants) {
|
||||
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
|
||||
TensorFloat32 dst_tensor;
|
||||
GPUOperation operation =
|
||||
CreateConvConstants(creation_context_.GetDeviceInfo(), op_def, attr);
|
||||
CreateConvConstants(creation_context_.GetGpuInfo(), op_def, attr);
|
||||
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
|
||||
BHWC(1, 2, 2, 2), &dst_tensor));
|
||||
EXPECT_THAT(dst_tensor.data,
|
||||
|
@ -150,35 +150,35 @@ std::string GenerateBlockCoords(const int4& block_size,
|
||||
|
||||
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
const DeviceInfo& device_info, const BHWC* dst_shape)
|
||||
const GpuInfo& gpu_info, const BHWC* dst_shape)
|
||||
: GPUOperation(definition),
|
||||
stride_(attr.strides.w, attr.strides.h, 1, 1),
|
||||
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0),
|
||||
kernel_size_(attr.weights.shape.w, attr.weights.shape.h, 1, 1),
|
||||
dilation_(attr.dilations.w, attr.dilations.h, 1, 1),
|
||||
conv_params_(GuessBestParams(device_info, definition, attr, dst_shape)) {}
|
||||
conv_params_(GuessBestParams(gpu_info, definition, attr, dst_shape)) {}
|
||||
|
||||
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
const BHWC& weights_shape,
|
||||
const DeviceInfo& device_info, const BHWC* dst_shape)
|
||||
const BHWC& weights_shape, const GpuInfo& gpu_info,
|
||||
const BHWC* dst_shape)
|
||||
: GPUOperation(definition),
|
||||
stride_(attr.strides.w, attr.strides.h, 1, 1),
|
||||
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0),
|
||||
kernel_size_(weights_shape.w, weights_shape.h, 1, 1),
|
||||
dilation_(attr.dilations.w, attr.dilations.h, 1, 1),
|
||||
conv_params_(GuessBestParams(device_info, definition, attr, weights_shape,
|
||||
conv_params_(GuessBestParams(gpu_info, definition, attr, weights_shape,
|
||||
dst_shape)) {}
|
||||
|
||||
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr,
|
||||
const DeviceInfo& device_info, const BHWC* dst_shape)
|
||||
const GpuInfo& gpu_info, const BHWC* dst_shape)
|
||||
: GPUOperation(definition),
|
||||
stride_(1, 1, 1, 1),
|
||||
padding_(0, 0, 0, 0),
|
||||
kernel_size_(1, 1, 1, 1),
|
||||
dilation_(1, 1, 1, 1),
|
||||
conv_params_(GuessBestParams(device_info, definition, attr, dst_shape)) {}
|
||||
conv_params_(GuessBestParams(gpu_info, definition, attr, dst_shape)) {}
|
||||
|
||||
ConvPowerVR::ConvPowerVR(const OperationDef& definition)
|
||||
: GPUOperation(definition),
|
||||
@ -197,7 +197,7 @@ ConvPowerVR::ConvPowerVR(ConvPowerVR&& operation)
|
||||
|
||||
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
|
||||
const Convolution3DAttributes& attr,
|
||||
const DeviceInfo& device_info, const BHWDC* dst_shape)
|
||||
const GpuInfo& gpu_info, const BHWDC* dst_shape)
|
||||
: GPUOperation(definition),
|
||||
stride_(attr.strides.w, attr.strides.h, attr.strides.d, 1),
|
||||
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h,
|
||||
@ -205,7 +205,7 @@ ConvPowerVR::ConvPowerVR(const OperationDef& definition,
|
||||
kernel_size_(attr.weights.shape.w, attr.weights.shape.h,
|
||||
attr.weights.shape.d, 1),
|
||||
dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d, 1),
|
||||
conv_params_(GuessBestParams(device_info, definition, attr, dst_shape)) {}
|
||||
conv_params_(GuessBestParams(gpu_info, definition, attr, dst_shape)) {}
|
||||
|
||||
ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
|
||||
if (this != &operation) {
|
||||
@ -219,19 +219,18 @@ ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
void ConvPowerVR::GenerateCode(const DeviceInfo& device_info) {
|
||||
void ConvPowerVR::GenerateCode(const GpuInfo& gpu_info) {
|
||||
if (conv_params_.linear_spatial) {
|
||||
grid_dimension_ = 2;
|
||||
}
|
||||
const bool stride_correction =
|
||||
definition_.IsBatchSupported() && stride_.x != 1;
|
||||
code_ =
|
||||
GenerateConv(device_info, definition_, stride_correction, conv_params_);
|
||||
code_ = GenerateConv(gpu_info, definition_, stride_correction, conv_params_);
|
||||
if (definition_.precision == CalculationsPrecision::F16 &&
|
||||
device_info.IsPowerVR()) {
|
||||
gpu_info.IsPowerVR()) {
|
||||
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
|
||||
}
|
||||
if (conv_params_.IsPrivateMemBroadcast() && device_info.IsCL20OrHigher()) {
|
||||
if (conv_params_.IsPrivateMemBroadcast() && gpu_info.IsCL20OrHigher()) {
|
||||
compiler_options_.push_back(CompilerOptions::CL_2_0);
|
||||
}
|
||||
bool kernel_is_trivial =
|
||||
@ -239,7 +238,7 @@ void ConvPowerVR::GenerateCode(const DeviceInfo& device_info) {
|
||||
if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
|
||||
kernel_is_trivial = kernel_is_trivial & conv_params_.z_kernel_is_1;
|
||||
}
|
||||
if (device_info.IsAdreno() && device_info.adreno_info.IsAdreno3xx() &&
|
||||
if (gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx() &&
|
||||
definition_.precision == CalculationsPrecision::F16 &&
|
||||
kernel_is_trivial) {
|
||||
compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
|
||||
@ -306,7 +305,7 @@ int3 ConvPowerVR::GetGridSize() const {
|
||||
}
|
||||
|
||||
void ConvPowerVR::GetPossibleKernelWorkGroups(
|
||||
TuningType tuning_type, const DeviceInfo& device_info,
|
||||
TuningType tuning_type, const GpuInfo& gpu_info,
|
||||
const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
|
||||
if (conv_params_.weights_upload_type ==
|
||||
WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP ||
|
||||
@ -316,11 +315,11 @@ void ConvPowerVR::GetPossibleKernelWorkGroups(
|
||||
work_groups->push_back(work_group_size_);
|
||||
return;
|
||||
}
|
||||
GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_,
|
||||
GetPossibleWorkGroupsConv(tuning_type, gpu_info, kernel_info, grid_size_,
|
||||
work_groups);
|
||||
}
|
||||
|
||||
std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
|
||||
std::string ConvPowerVR::GenerateConv(const GpuInfo& gpu_info,
|
||||
const OperationDef& op_def,
|
||||
bool stride_correction,
|
||||
const ConvParams& conv_params) {
|
||||
@ -446,9 +445,9 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
|
||||
|
||||
std::string c = GetCommonDefines(op_def.precision);
|
||||
if (use_simd_broadcast) {
|
||||
if (device_info.cl_version == OpenCLVersion::CL_2_0) {
|
||||
if (gpu_info.cl_version == OpenCLVersion::CL_2_0) {
|
||||
c += "#pragma OPENCL EXTENSION cl_khr_subgroups : enable\n";
|
||||
} else if (device_info.SupportsExtension("cl_intel_subgroups")) {
|
||||
} else if (gpu_info.SupportsExtension("cl_intel_subgroups")) {
|
||||
c += "#pragma OPENCL EXTENSION cl_intel_subgroups : enable\n";
|
||||
}
|
||||
}
|
||||
@ -459,7 +458,7 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
|
||||
std::to_string(work_group_size_.y) + ", " +
|
||||
std::to_string(work_group_size_.z) + ")))\n";
|
||||
}
|
||||
if (use_simd_broadcast && device_info.IsIntel()) {
|
||||
if (use_simd_broadcast && gpu_info.IsIntel()) {
|
||||
c += "__attribute__((intel_reqd_sub_group_size(" +
|
||||
std::to_string(simd_size) + ")))\n";
|
||||
}
|
||||
@ -714,7 +713,7 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
|
||||
}
|
||||
}
|
||||
};
|
||||
const bool conditional_read = device_info.IsMali();
|
||||
const bool conditional_read = gpu_info.IsMali();
|
||||
auto read_src = [&]() {
|
||||
const std::string cl_type = ToCLDataType(conv_params.weights_data_type);
|
||||
for (int z = 0; z < block_size.z; ++z) {
|
||||
@ -1012,8 +1011,8 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
|
||||
}
|
||||
|
||||
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
int src_depth, int dst_depth, bool x_kernel_is_1, bool y_kernel_is_1,
|
||||
const GpuInfo& gpu_info, const OperationDef& definition, int src_depth,
|
||||
int dst_depth, bool x_kernel_is_1, bool y_kernel_is_1,
|
||||
bool different_weights_for_height, const BHWC* dst_shape) {
|
||||
ConvParams conv_params;
|
||||
conv_params.linear_spatial = false;
|
||||
@ -1022,7 +1021,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
conv_params.x_kernel_is_1 = x_kernel_is_1;
|
||||
conv_params.y_kernel_is_1 = y_kernel_is_1;
|
||||
conv_params.different_weights_for_height = different_weights_for_height;
|
||||
if (device_info.IsNvidia()) {
|
||||
if (gpu_info.IsNvidia()) {
|
||||
if (different_weights_for_height) {
|
||||
work_group_size_ = int3(32, 1, 1);
|
||||
work_group_launch_order_ = int3(2, 0, 1);
|
||||
@ -1046,7 +1045,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
if (dst_shape) {
|
||||
int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth;
|
||||
float task_size_per_cu =
|
||||
static_cast<float>(task_size) / device_info.compute_units_count;
|
||||
static_cast<float>(task_size) / gpu_info.compute_units_count;
|
||||
int block_size = conv_params.block_size.x * conv_params.block_size.y *
|
||||
conv_params.block_size.w;
|
||||
float threads_per_cu = task_size_per_cu / block_size;
|
||||
@ -1067,7 +1066,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) {
|
||||
conv_params.src_depth_loop_size = 4;
|
||||
}
|
||||
} else if (device_info.IsPowerVR()) {
|
||||
} else if (gpu_info.IsPowerVR()) {
|
||||
if (different_weights_for_height) {
|
||||
work_group_size_ = int3(32, 1, 1);
|
||||
work_group_launch_order_ = int3(2, 0, 1);
|
||||
@ -1115,7 +1114,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
}
|
||||
conv_params.block_size.x = 2;
|
||||
}
|
||||
} else if (device_info.IsAMD()) {
|
||||
} else if (gpu_info.IsAMD()) {
|
||||
if (different_weights_for_height) {
|
||||
work_group_size_ = int3(32, 1, 1);
|
||||
work_group_launch_order_ = int3(2, 0, 1);
|
||||
@ -1144,12 +1143,12 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
if (src_depth % 2 == 0 && src_depth >= 16) {
|
||||
conv_params.src_depth_loop_size = 2;
|
||||
}
|
||||
} else if (device_info.IsMali()) {
|
||||
} else if (gpu_info.IsMali()) {
|
||||
int block_size = 2;
|
||||
if (dst_shape) {
|
||||
int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth;
|
||||
block_size = GetRecommendedBlockSizeForConv(
|
||||
device_info, definition.precision, task_size);
|
||||
gpu_info, definition.precision, task_size);
|
||||
}
|
||||
if (!x_kernel_is_1 || !y_kernel_is_1) {
|
||||
block_size = std::min(block_size, 4);
|
||||
@ -1172,7 +1171,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
conv_params.block_size = int4(1, 1, 1, 1);
|
||||
}
|
||||
conv_params.src_depth_loop_size = 1;
|
||||
MaliInfo mali_info = device_info.mali_info;
|
||||
MaliInfo mali_info = gpu_info.mali_info;
|
||||
if (src_depth % 2 == 0 && block_size <= 2 && !mali_info.IsMidgard()) {
|
||||
conv_params.src_depth_loop_size = 2;
|
||||
}
|
||||
@ -1184,9 +1183,9 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
work_group_launch_order_ = int3(0, 1, 2);
|
||||
conv_params.fixed_work_group_size = false;
|
||||
conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
|
||||
} else if (device_info.IsAdreno()) {
|
||||
} else if (gpu_info.IsAdreno()) {
|
||||
conv_params.block_size = int4(2, 2, 1, 2);
|
||||
if (device_info.adreno_info.IsAdreno3xx()) {
|
||||
if (gpu_info.adreno_info.IsAdreno3xx()) {
|
||||
if (definition.precision == CalculationsPrecision::F16) {
|
||||
conv_params.block_size = int4(2, 2, 1, 2);
|
||||
} else if (definition.precision == CalculationsPrecision::F32_F16) {
|
||||
@ -1205,7 +1204,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
} else {
|
||||
conv_params.weights_upload_type = WeightsUploadType::TEXTURES_MEM_X4;
|
||||
}
|
||||
} else if (device_info.IsIntel()) {
|
||||
} else if (gpu_info.IsIntel()) {
|
||||
if (different_weights_for_height) {
|
||||
work_group_size_ = int3(16, 1, 1);
|
||||
work_group_launch_order_ = int3(0, 1, 2);
|
||||
@ -1220,12 +1219,12 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
conv_params.src_depth_loop_size = 1;
|
||||
int sub_group_size = 16;
|
||||
const bool supports_subgroups =
|
||||
device_info.SupportsExtension("cl_khr_subgroups") ||
|
||||
device_info.SupportsExtension("cl_intel_subgroups");
|
||||
gpu_info.SupportsExtension("cl_khr_subgroups") ||
|
||||
gpu_info.SupportsExtension("cl_intel_subgroups");
|
||||
if (definition.precision != CalculationsPrecision::F32_F16 &&
|
||||
supports_subgroups &&
|
||||
device_info.SupportsExtension("cl_intel_required_subgroup_size") &&
|
||||
device_info.SupportsSubGroupWithSize(sub_group_size)) {
|
||||
gpu_info.SupportsExtension("cl_intel_required_subgroup_size") &&
|
||||
gpu_info.SupportsSubGroupWithSize(sub_group_size)) {
|
||||
conv_params.weights_upload_type =
|
||||
WeightsUploadType::PRIVATE_MEM_SIMD_BROADCAST;
|
||||
conv_params.simd_size = sub_group_size;
|
||||
@ -1271,7 +1270,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
}
|
||||
|
||||
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const GpuInfo& gpu_info, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, const BHWC* dst_shape) {
|
||||
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
|
||||
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
|
||||
@ -1283,12 +1282,12 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
attr.dilations.h == 1 &&
|
||||
attr.padding.prepended.h == 0 &&
|
||||
attr.padding.appended.h == 0;
|
||||
return GuessBestParams(device_info, definition, src_depth, dst_depth,
|
||||
return GuessBestParams(gpu_info, definition, src_depth, dst_depth,
|
||||
x_kernel_is_1, y_kernel_is_1, false, dst_shape);
|
||||
}
|
||||
|
||||
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const GpuInfo& gpu_info, const OperationDef& definition,
|
||||
const Convolution3DAttributes& attr, const BHWDC* dst_shape) {
|
||||
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
|
||||
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
|
||||
@ -1312,10 +1311,10 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
shape.h = dst_shape->h * dst_shape->d;
|
||||
shape.w = dst_shape->w;
|
||||
shape.c = dst_shape->c;
|
||||
result = GuessBestParams(device_info, definition, src_depth, dst_depth,
|
||||
result = GuessBestParams(gpu_info, definition, src_depth, dst_depth,
|
||||
x_kernel_is_1, y_kernel_is_1, false, &shape);
|
||||
} else {
|
||||
result = GuessBestParams(device_info, definition, src_depth, dst_depth,
|
||||
result = GuessBestParams(gpu_info, definition, src_depth, dst_depth,
|
||||
x_kernel_is_1, y_kernel_is_1, false, nullptr);
|
||||
}
|
||||
result.z_kernel_is_1 = z_kernel_is_1;
|
||||
@ -1323,7 +1322,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
}
|
||||
|
||||
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const GpuInfo& gpu_info, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, const BHWC& weights_shape,
|
||||
const BHWC* dst_shape) {
|
||||
const int dst_depth = DivideRoundUp(weights_shape.b, 4);
|
||||
@ -1334,18 +1333,17 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
const bool y_kernel_is_1 =
|
||||
weights_shape.h == 1 && attr.strides.h == 1 && attr.dilations.h == 1 &&
|
||||
attr.padding.prepended.h == 0 && attr.padding.appended.h == 0;
|
||||
return GuessBestParams(device_info, definition, src_depth, dst_depth,
|
||||
return GuessBestParams(gpu_info, definition, src_depth, dst_depth,
|
||||
x_kernel_is_1, y_kernel_is_1, false, dst_shape);
|
||||
}
|
||||
|
||||
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const GpuInfo& gpu_info, const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr, const BHWC* dst_shape) {
|
||||
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
|
||||
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
|
||||
ConvPowerVR::ConvParams params =
|
||||
GuessBestParams(device_info, definition, src_depth, dst_depth, true, true,
|
||||
false, dst_shape);
|
||||
ConvPowerVR::ConvParams params = GuessBestParams(
|
||||
gpu_info, definition, src_depth, dst_depth, true, true, false, dst_shape);
|
||||
work_group_size_.x *= work_group_size_.y;
|
||||
work_group_size_.y = 1;
|
||||
params.block_size.x *= params.block_size.y;
|
||||
@ -1354,67 +1352,66 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
||||
}
|
||||
|
||||
ConvPowerVR::ConvParams ConvPowerVR::GuessBestParamsWinograd(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const GpuInfo& gpu_info, const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr, const BHWC* dst_shape) {
|
||||
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
|
||||
const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
|
||||
ConvPowerVR::ConvParams params =
|
||||
GuessBestParams(device_info, definition, src_depth, dst_depth, true, true,
|
||||
true, dst_shape);
|
||||
ConvPowerVR::ConvParams params = GuessBestParams(
|
||||
gpu_info, definition, src_depth, dst_depth, true, true, true, dst_shape);
|
||||
params.block_size.x *= params.block_size.y;
|
||||
params.block_size.y = 1;
|
||||
return params;
|
||||
}
|
||||
|
||||
ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
|
||||
ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
const BHWC* dst_shape) {
|
||||
ConvPowerVR result(definition, attr, device_info, dst_shape);
|
||||
result.GenerateCode(device_info);
|
||||
ConvPowerVR result(definition, attr, gpu_info, dst_shape);
|
||||
result.GenerateCode(gpu_info);
|
||||
result.UploadData(attr.weights, attr.bias);
|
||||
return result;
|
||||
}
|
||||
|
||||
ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
|
||||
ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
|
||||
const OperationDef& definition,
|
||||
const FullyConnectedAttributes& attr,
|
||||
const BHWC* dst_shape) {
|
||||
ConvPowerVR result(definition, attr, device_info, dst_shape);
|
||||
result.GenerateCode(device_info);
|
||||
ConvPowerVR result(definition, attr, gpu_info, dst_shape);
|
||||
result.GenerateCode(gpu_info);
|
||||
result.UploadData(attr.weights, attr.bias);
|
||||
return result;
|
||||
}
|
||||
|
||||
ConvPowerVR CreateConvPowerVRDynamicWeights(const DeviceInfo& device_info,
|
||||
ConvPowerVR CreateConvPowerVRDynamicWeights(const GpuInfo& gpu_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
const BHWC& weights_shape,
|
||||
const BHWC* dst_shape) {
|
||||
ConvPowerVR result(definition, attr, weights_shape, device_info, dst_shape);
|
||||
result.GenerateCode(device_info);
|
||||
ConvPowerVR result(definition, attr, weights_shape, gpu_info, dst_shape);
|
||||
result.GenerateCode(gpu_info);
|
||||
result.UploadBias(attr.bias);
|
||||
return result;
|
||||
}
|
||||
|
||||
ConvPowerVR CreateConvPowerVRWino4x4To6x6(const DeviceInfo& device_info,
|
||||
ConvPowerVR CreateConvPowerVRWino4x4To6x6(const GpuInfo& gpu_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution2DAttributes& attr,
|
||||
const BHWC* dst_shape) {
|
||||
ConvPowerVR result(definition);
|
||||
result.conv_params_ =
|
||||
result.GuessBestParamsWinograd(device_info, definition, attr, dst_shape);
|
||||
result.GenerateCode(device_info);
|
||||
result.GuessBestParamsWinograd(gpu_info, definition, attr, dst_shape);
|
||||
result.GenerateCode(gpu_info);
|
||||
result.UploadDataForWinograd4x4To6x6(attr.weights);
|
||||
return result;
|
||||
}
|
||||
|
||||
ConvPowerVR CreateConvPowerVR3D(const DeviceInfo& device_info,
|
||||
ConvPowerVR CreateConvPowerVR3D(const GpuInfo& gpu_info,
|
||||
const OperationDef& definition,
|
||||
const Convolution3DAttributes& attr,
|
||||
const BHWDC* dst_shape) {
|
||||
ConvPowerVR result(definition, attr, device_info, dst_shape);
|
||||
result.GenerateCode(device_info);
|
||||
ConvPowerVR result(definition, attr, gpu_info, dst_shape);
|
||||
result.GenerateCode(gpu_info);
|
||||
result.UploadWeights(attr.weights);
|
||||
result.UploadBias(attr.bias);
|
||||
return result;
|
||||
|
@ -44,7 +44,7 @@ class ConvPowerVR : public GPUOperation {
public:
ConvPowerVR() = default;
void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override;
absl::Status BindArguments(ArgumentsBinder* args) override;
@ -106,20 +106,20 @@ class ConvPowerVR : public GPUOperation {
};

ConvPowerVR(const OperationDef& definition,
const Convolution2DAttributes& attr,
const DeviceInfo& device_info, const BHWC* dst_shape = nullptr);
const Convolution2DAttributes& attr, const GpuInfo& gpu_info,
const BHWC* dst_shape = nullptr);
ConvPowerVR(const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC& weights_shape,
const DeviceInfo& device_info, const BHWC* dst_shape = nullptr);
const GpuInfo& gpu_info, const BHWC* dst_shape = nullptr);
ConvPowerVR(const OperationDef& definition,
const FullyConnectedAttributes& attr,
const DeviceInfo& device_info, const BHWC* dst_shape = nullptr);
const FullyConnectedAttributes& attr, const GpuInfo& gpu_info,
const BHWC* dst_shape = nullptr);
explicit ConvPowerVR(const OperationDef& definition);
ConvPowerVR(const OperationDef& definition,
const Convolution3DAttributes& attr,
const DeviceInfo& device_info, const BHWDC* dst_shape = nullptr);
const Convolution3DAttributes& attr, const GpuInfo& gpu_info,
const BHWDC* dst_shape = nullptr);

void GenerateCode(const DeviceInfo& device_info);
void GenerateCode(const GpuInfo& gpu_info);

template <DataType T>
void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
@ -137,60 +137,60 @@ class ConvPowerVR : public GPUOperation {
template <DataType T>
void UploadBias(const tflite::gpu::Tensor<Linear, T>& bias);

friend ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
friend ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* dst_shape);

friend ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
friend ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
const BHWC* dst_shape);

friend ConvPowerVR CreateConvPowerVRDynamicWeights(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC& weights_shape,
const BHWC* dst_shape);

friend ConvPowerVR CreateConvPowerVRWino4x4To6x6(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC* dst_shape);

friend ConvPowerVR CreateConvPowerVR3D(const DeviceInfo& device_info,
friend ConvPowerVR CreateConvPowerVR3D(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution3DAttributes& attr,
const BHWDC* dst_shape);

ConvParams GuessBestParams(const DeviceInfo& device_info,
ConvParams GuessBestParams(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* dst_shape = nullptr);
ConvParams GuessBestParams(const DeviceInfo& device_info,
ConvParams GuessBestParams(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC& weights_shape,
const BHWC* dst_shape = nullptr);
ConvParams GuessBestParams(const DeviceInfo& device_info,
ConvParams GuessBestParams(const GpuInfo& gpu_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
const BHWC* dst_shape = nullptr);
ConvParams GuessBestParamsWinograd(const DeviceInfo& device_info,
ConvParams GuessBestParamsWinograd(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* dst_shape = nullptr);
ConvParams GuessBestParams(const DeviceInfo& device_info,
ConvParams GuessBestParams(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution3DAttributes& attr,
const BHWDC* dst_shape = nullptr);
ConvParams GuessBestParams(const DeviceInfo& device_info,
ConvParams GuessBestParams(const GpuInfo& gpu_info,
const OperationDef& definition, int src_depth,
int dst_depth, bool x_kernel_is_1,
bool y_kernel_is_1,
bool different_weights_for_height,
const BHWC* dst_shape = nullptr);

std::string GenerateConv(const DeviceInfo& device_info,
const OperationDef& op_def, bool stride_correction,
std::string GenerateConv(const GpuInfo& gpu_info, const OperationDef& op_def,
bool stride_correction,
const ConvParams& conv_params);

int4 stride_;
@ -372,28 +372,28 @@ void ConvPowerVR::UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights) {
}
}

ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* dst_shape = nullptr);

ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info,
ConvPowerVR CreateConvPowerVR(const GpuInfo& gpu_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
const BHWC* dst_shape = nullptr);

ConvPowerVR CreateConvPowerVRDynamicWeights(const DeviceInfo& device_info,
ConvPowerVR CreateConvPowerVRDynamicWeights(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC& weights_shape,
const BHWC* dst_shape = nullptr);

ConvPowerVR CreateConvPowerVRWino4x4To6x6(const DeviceInfo& device_info,
ConvPowerVR CreateConvPowerVRWino4x4To6x6(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution2DAttributes& attr,
const BHWC* dst_shape = nullptr);

ConvPowerVR CreateConvPowerVR3D(const DeviceInfo& device_info,
ConvPowerVR CreateConvPowerVR3D(const GpuInfo& gpu_info,
const OperationDef& definition,
const Convolution3DAttributes& attr,
const BHWDC* dst_shape = nullptr);

@ -56,7 +56,7 @@ TEST_F(OpenCLOperationTest, ConvPowerVR1x1SimpleWeights) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvPowerVR operation =
CreateConvPowerVR(creation_context_.GetDeviceInfo(), op_def, attr);
CreateConvPowerVR(creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -91,7 +91,7 @@ TEST_F(OpenCLOperationTest, ConvPowerVR1x1) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvPowerVR operation =
CreateConvPowerVR(creation_context_.GetDeviceInfo(), op_def, attr);
CreateConvPowerVR(creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -126,7 +126,7 @@ TEST_F(OpenCLOperationTest, ConvPowerVRSimpleWeights) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvPowerVR operation =
CreateConvPowerVR(creation_context_.GetDeviceInfo(), op_def, attr);
CreateConvPowerVR(creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -161,7 +161,7 @@ TEST_F(OpenCLOperationTest, ConvPowerVR) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvPowerVR operation =
CreateConvPowerVR(creation_context_.GetDeviceInfo(), op_def, attr);
CreateConvPowerVR(creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,

@ -31,14 +31,14 @@ namespace cl {

ConvolutionTransposed::ConvolutionTransposed(
const OperationDef& definition, const ConvolutionTransposedAttributes& attr,
const DeviceInfo& device_info)
const GpuInfo& gpu_info)
: GPUOperation(definition),
stride_(attr.stride.w, attr.stride.h, 1, 1),
block_size_(2, 2, 1, 2) {
const bool weights_are_buffer = device_info.IsMali();
const bool weights_are_buffer = gpu_info.IsMali();
const bool is_f16 = definition.precision == CalculationsPrecision::F16;
if (device_info.IsMali()) {
if (device_info.mali_info.IsMidgard()) {
if (gpu_info.IsMali()) {
if (gpu_info.mali_info.IsMidgard()) {
block_size_ = is_f16 ? int4(2, 1, 1, 2) : int4(2, 1, 1, 1);
} else {
block_size_ = is_f16 ? int4(2, 2, 1, 2) : int4(2, 2, 1, 1);
@ -46,7 +46,7 @@ ConvolutionTransposed::ConvolutionTransposed(
}
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
if (dst_depth == 1 || dst_depth == 3) {
if (!device_info.IsMali()) {
if (!gpu_info.IsMali()) {
block_size_.y *= block_size_.w;
}
block_size_.w = 1;
@ -58,22 +58,21 @@ ConvolutionTransposed::ConvolutionTransposed(
args_.AddInt("padding_y", attr.padding.prepended.h);
args_.AddInt("kernel_size_x", attr.weights.shape.w);
args_.AddInt("kernel_size_y", attr.weights.shape.h);
code_ = GenerateConvolutionTransposedCode(definition_, device_info,
code_ = GenerateConvolutionTransposedCode(definition_, gpu_info,
weights_are_buffer, block_size_);
UploadWeights(attr.weights, weights_are_buffer);
}

ConvolutionTransposed::ConvolutionTransposed(
const OperationDef& definition,
const ConvolutionTransposed3DAttributes& attr,
const DeviceInfo& device_info)
const ConvolutionTransposed3DAttributes& attr, const GpuInfo& gpu_info)
: GPUOperation(definition),
stride_(attr.stride.w, attr.stride.h, attr.stride.d, 1),
block_size_(2, 2, 1, 2) {
const bool weights_are_buffer = device_info.IsMali();
const bool weights_are_buffer = gpu_info.IsMali();
const bool is_f16 = definition.precision == CalculationsPrecision::F16;
if (device_info.IsMali()) {
if (device_info.mali_info.IsMidgard()) {
if (gpu_info.IsMali()) {
if (gpu_info.mali_info.IsMidgard()) {
block_size_ = is_f16 ? int4(2, 1, 1, 2) : int4(2, 1, 1, 1);
} else {
block_size_ = is_f16 ? int4(2, 2, 1, 2) : int4(2, 2, 1, 1);
@ -81,7 +80,7 @@ ConvolutionTransposed::ConvolutionTransposed(
}
const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
if (dst_depth == 1 || dst_depth == 3) {
if (!device_info.IsMali()) {
if (!gpu_info.IsMali()) {
block_size_.y *= block_size_.w;
}
block_size_.w = 1;
@ -97,7 +96,7 @@ ConvolutionTransposed::ConvolutionTransposed(
args_.AddInt("kernel_size_y", attr.weights.shape.h);
args_.AddInt("kernel_size_z", attr.weights.shape.d);
args_.AddInt("grid_size_y");
code_ = GenerateConvolutionTransposedCode(definition_, device_info,
code_ = GenerateConvolutionTransposedCode(definition_, gpu_info,
weights_are_buffer, block_size_);
UploadWeights(attr.weights, weights_are_buffer);
}
@ -118,7 +117,7 @@ ConvolutionTransposed& ConvolutionTransposed::operator=(
}

std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
const OperationDef& op_def, const DeviceInfo& device_info,
const OperationDef& op_def, const GpuInfo& gpu_info,
bool weights_are_buffer, const int4& block_size) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetAddressMode(AddressMode::kZero);
@ -398,7 +397,7 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
c += " int x_c = kernel_index * args.src_tensor.Slices();\n";
}
c += " for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n";
const bool conditional_read = device_info.IsMali();
const bool conditional_read = gpu_info.IsMali();
for (int z = 0; z < block_size.z; ++z) {
const std::string zind = std::to_string(z);
for (int y = 0; y < block_size.y; ++y) {
@ -536,16 +535,16 @@ int3 ConvolutionTransposed::GetGridSize() const {
}

void ConvolutionTransposed::GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_,
GetPossibleWorkGroupsConv(tuning_type, gpu_info, kernel_info, grid_size_,
work_groups);
}

ConvolutionTransposed CreateConvolutionTransposed(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr) {
ConvolutionTransposed result(definition, attr, device_info);
ConvolutionTransposed result(definition, attr, gpu_info);

TensorLinearDescriptor desc;
desc.storage_type =
@ -558,9 +557,9 @@ ConvolutionTransposed CreateConvolutionTransposed(
}

ConvolutionTransposed CreateConvolutionTransposed3D(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposed3DAttributes& attr) {
ConvolutionTransposed result(definition, attr, device_info);
ConvolutionTransposed result(definition, attr, gpu_info);

TensorLinearDescriptor desc;
desc.storage_type =

@ -41,7 +41,7 @@ class ConvolutionTransposed : public GPUOperation {
public:
ConvolutionTransposed() = default;
void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override;
absl::Status BindArguments(ArgumentsBinder* args) override;
@ -55,17 +55,17 @@ class ConvolutionTransposed : public GPUOperation {

private:
friend ConvolutionTransposed CreateConvolutionTransposed(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
friend ConvolutionTransposed CreateConvolutionTransposed3D(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposed3DAttributes& attr);
ConvolutionTransposed(const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
const DeviceInfo& device_info);
const GpuInfo& gpu_info);
ConvolutionTransposed(const OperationDef& definition,
const ConvolutionTransposed3DAttributes& attr,
const DeviceInfo& device_info);
const GpuInfo& gpu_info);

template <DataType T>
void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
@ -76,7 +76,7 @@ class ConvolutionTransposed : public GPUOperation {
bool weights_are_buffer);

std::string GenerateConvolutionTransposedCode(const OperationDef& op_def,
const DeviceInfo& device_info,
const GpuInfo& gpu_info,
bool weights_are_buffer,
const int4& block_size);
int4 stride_;
@ -206,11 +206,11 @@ void ConvolutionTransposed::UploadWeights(
}

ConvolutionTransposed CreateConvolutionTransposed(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);

ConvolutionTransposed CreateConvolutionTransposed3D(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposed3DAttributes& attr);

} // namespace cl

@ -26,15 +26,15 @@ namespace tflite {
namespace gpu {
namespace cl {
ConvolutionTransposed3x3::ConvolutionTransposed3x3(
const OperationDef& definition, const DeviceInfo& device_info, int2 padding)
const OperationDef& definition, const GpuInfo& gpu_info, int2 padding)
: GPUOperation(definition), padding_(padding) {
work_group_size_ = int3(8, 4, 1);
work_group_launch_order_ = int3(2, 0, 1);
if (device_info.IsPowerVR()) {
if (gpu_info.IsPowerVR()) {
weights_upload_type_ = WeightsUploadType::LOCAL_MEM_ASYNC;
} else if (device_info.IsNvidia() || device_info.IsIntel()) {
} else if (gpu_info.IsNvidia() || gpu_info.IsIntel()) {
weights_upload_type_ = WeightsUploadType::LOCAL_MEM_BY_THREADS;
} else if (device_info.IsAMD()) {
} else if (gpu_info.IsAMD()) {
weights_upload_type_ = WeightsUploadType::CONSTANT_MEM;
} else {
weights_upload_type_ = WeightsUploadType::GLOBAL_MEM;
@ -42,7 +42,7 @@ ConvolutionTransposed3x3::ConvolutionTransposed3x3(
code_ = GenerateConvolutionTransposedCode(definition_, weights_upload_type_,
padding_, work_group_launch_order_);
if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsPowerVR()) {
gpu_info.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
}
@ -332,14 +332,14 @@ absl::Status ConvolutionTransposed3x3::BindArguments(ArgumentsBinder* args) {
}

void ConvolutionTransposed3x3::GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
if (weights_upload_type_ == WeightsUploadType::LOCAL_MEM_ASYNC ||
weights_upload_type_ == WeightsUploadType::LOCAL_MEM_BY_THREADS) {
work_groups->push_back(work_group_size_);
return;
}
GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_,
GetPossibleWorkGroupsConv(tuning_type, gpu_info, kernel_info, grid_size_,
work_groups);
}

@ -358,10 +358,10 @@ bool IsConvolutionTransposed3x3Supported(
}

ConvolutionTransposed3x3 CreateConvolutionTransposed3x3(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr) {
const int2 padding = int2(attr.padding.prepended.w, attr.padding.prepended.h);
ConvolutionTransposed3x3 result(definition, device_info, padding);
ConvolutionTransposed3x3 result(definition, gpu_info, padding);
result.UploadWeights(attr.weights);

TensorLinearDescriptor desc;

@ -38,7 +38,7 @@ class ConvolutionTransposed3x3 : public GPUOperation {
public:
ConvolutionTransposed3x3() = default;
void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override;
absl::Status BindArguments(ArgumentsBinder* args) override;
@ -59,9 +59,9 @@ class ConvolutionTransposed3x3 : public GPUOperation {

private:
ConvolutionTransposed3x3(const OperationDef& definition,
const DeviceInfo& device_info, int2 padding);
const GpuInfo& gpu_info, int2 padding);
friend ConvolutionTransposed3x3 CreateConvolutionTransposed3x3(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
template <DataType T>
void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
@ -174,7 +174,7 @@ bool IsConvolutionTransposed3x3Supported(
const ConvolutionTransposedAttributes& attr);

ConvolutionTransposed3x3 CreateConvolutionTransposed3x3(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);

} // namespace cl

@ -55,7 +55,7 @@ TEST_F(OpenCLOperationTest, ConvolutionTransposed3x3) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvolutionTransposed3x3 operation = CreateConvolutionTransposed3x3(
creation_context_.GetDeviceInfo(), op_def, attr);
creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,

@ -198,7 +198,7 @@ bool IsConvolutionTransposed3x3ThinSupported(
}

ConvolutionTransposed3x3Thin CreateConvolutionTransposed3x3Thin(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr) {
ConvolutionTransposed3x3Thin result(definition, attr);
result.UploadData(attr.weights, attr.bias);

@ -49,7 +49,7 @@ class ConvolutionTransposed3x3Thin : public GPUOperation {

private:
friend ConvolutionTransposed3x3Thin CreateConvolutionTransposed3x3Thin(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
explicit ConvolutionTransposed3x3Thin(
const OperationDef& definition,
@ -160,7 +160,7 @@ bool IsConvolutionTransposed3x3ThinSupported(
const ConvolutionTransposedAttributes& attr);

ConvolutionTransposed3x3Thin CreateConvolutionTransposed3x3Thin(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);

} // namespace cl

@ -55,7 +55,7 @@ TEST_F(OpenCLOperationTest, ConvolutionTransposed3x3ThinSimpleWeights) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvolutionTransposed3x3Thin operation =
CreateConvolutionTransposed3x3Thin(creation_context_.GetDeviceInfo(),
CreateConvolutionTransposed3x3Thin(creation_context_.GetGpuInfo(),
op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
@ -91,7 +91,7 @@ TEST_F(OpenCLOperationTest, ConvolutionTransposed3x3Thin) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvolutionTransposed3x3Thin operation =
CreateConvolutionTransposed3x3Thin(creation_context_.GetDeviceInfo(),
CreateConvolutionTransposed3x3Thin(creation_context_.GetGpuInfo(),
op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));

@ -26,16 +26,16 @@ namespace tflite {
namespace gpu {
namespace cl {
ConvolutionTransposed4x4::ConvolutionTransposed4x4(
const OperationDef& definition, const DeviceInfo& device_info,
const OperationDef& definition, const GpuInfo& gpu_info,
const ConvolutionTransposedAttributes& attr)
: GPUOperation(definition) {
work_group_size_ = int3(8, 4, 1);
WeightsUploadType weights_upload_type = WeightsUploadType::GLOBAL_MEM;
if (device_info.IsPowerVR()) {
if (gpu_info.IsPowerVR()) {
weights_upload_type = WeightsUploadType::LOCAL_MEM_ASYNC;
} else if (device_info.IsNvidia() || device_info.IsIntel()) {
} else if (gpu_info.IsNvidia() || gpu_info.IsIntel()) {
weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
} else if (device_info.IsAMD()) {
} else if (gpu_info.IsAMD()) {
weights_upload_type = WeightsUploadType::CONSTANT_MEM;
} else {
weights_upload_type = WeightsUploadType::GLOBAL_MEM;
@ -44,7 +44,7 @@ ConvolutionTransposed4x4::ConvolutionTransposed4x4(
code_ = GenerateConvolutionTransposedCode(definition_, weights_upload_type);
UploadWeights(attr.weights, weights_upload_type);
if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsPowerVR()) {
gpu_info.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
}
@ -332,9 +332,9 @@ bool IsConvolutionTransposed4x4Supported(
}

ConvolutionTransposed4x4 CreateConvolutionTransposed4x4(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr) {
ConvolutionTransposed4x4 result(definition, device_info, attr);
ConvolutionTransposed4x4 result(definition, gpu_info, attr);

TensorLinearDescriptor desc;
desc.storage_type = LinearStorageType::TEXTURE_2D;

@ -38,7 +38,7 @@ class ConvolutionTransposed4x4 : public GPUOperation {
public:
ConvolutionTransposed4x4() = default;
void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override {
work_groups->push_back(work_group_size_);
@ -61,10 +61,10 @@ class ConvolutionTransposed4x4 : public GPUOperation {

private:
ConvolutionTransposed4x4(const OperationDef& definition,
const DeviceInfo& device_info,
const GpuInfo& gpu_info,
const ConvolutionTransposedAttributes& attr);
friend ConvolutionTransposed4x4 CreateConvolutionTransposed4x4(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
template <DataType T>
void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
@ -161,7 +161,7 @@ bool IsConvolutionTransposed4x4Supported(
const ConvolutionTransposedAttributes& attr);

ConvolutionTransposed4x4 CreateConvolutionTransposed4x4(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);

} // namespace cl

@ -56,7 +56,7 @@ TEST_F(OpenCLOperationTest, ConvolutionTransposed4x4) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvolutionTransposed4x4 operation = CreateConvolutionTransposed4x4(
creation_context_.GetDeviceInfo(), op_def, attr);
creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,

@ -56,7 +56,7 @@ TEST_F(OpenCLOperationTest, ConvolutionTransposedSimpleWeights) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvolutionTransposed operation = CreateConvolutionTransposed(
creation_context_.GetDeviceInfo(), op_def, attr);
creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 2), &dst_tensor));
EXPECT_THAT(
@ -94,7 +94,7 @@ TEST_F(OpenCLOperationTest, ConvolutionTransposed) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvolutionTransposed operation = CreateConvolutionTransposed(
creation_context_.GetDeviceInfo(), op_def, attr);
creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
EXPECT_THAT(

@ -28,13 +28,13 @@ namespace cl {

ConvolutionTransposedThin::ConvolutionTransposedThin(
const OperationDef& definition, const ConvolutionTransposedAttributes& attr,
const DeviceInfo& device_info)
const GpuInfo& gpu_info)
: GPUOperation(definition) {
code_ = GenerateConvolutionTransposedCode(
definition_, DivideRoundUp(attr.weights.shape.i, 4), attr.weights.shape.o,
int2(attr.weights.shape.w, attr.weights.shape.h));
if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsAdreno() && device_info.adreno_info.IsAdreno3xx()) {
gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx()) {
compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
}
@ -166,9 +166,9 @@ bool IsConvolutionTransposedThinSupported(
}

ConvolutionTransposedThin CreateConvolutionTransposedThin(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr) {
ConvolutionTransposedThin result(definition, attr, device_info);
ConvolutionTransposedThin result(definition, attr, gpu_info);
result.UploadData(attr.weights, attr.bias);
return result;
}

@ -48,11 +48,11 @@ class ConvolutionTransposedThin : public GPUOperation {

private:
friend ConvolutionTransposedThin CreateConvolutionTransposedThin(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
ConvolutionTransposedThin(const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
const DeviceInfo& device_info);
const GpuInfo& gpu_info);
template <DataType T>
void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases);
@ -141,7 +141,7 @@ bool IsConvolutionTransposedThinSupported(
const ConvolutionTransposedAttributes& attr);

ConvolutionTransposedThin CreateConvolutionTransposedThin(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);

} // namespace cl

@ -56,7 +56,7 @@ TEST_F(OpenCLOperationTest, ConvolutionTransposedThinSimpleWeights) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvolutionTransposedThin operation = CreateConvolutionTransposedThin(
creation_context_.GetDeviceInfo(), op_def, attr);
creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 2), &dst_tensor));
EXPECT_THAT(
@ -94,7 +94,7 @@ TEST_F(OpenCLOperationTest, ConvolutionTransposedThin) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvolutionTransposedThin operation = CreateConvolutionTransposedThin(
creation_context_.GetDeviceInfo(), op_def, attr);
creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
EXPECT_THAT(

@ -235,9 +235,9 @@ std::string GenerateDepthwiseConvolutionCode(
} // namespace

GPUOperation CreateDepthwiseConvolution2D(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr) {
bool weights_are_buffer = device_info.IsMali();
bool weights_are_buffer = gpu_info.IsMali();
GPUOperation op(definition);
op.args_.AddInt("kernel_size_x", attr.weights.shape.w);
op.args_.AddInt("stride_x", attr.strides.w);
@ -270,7 +270,7 @@ GPUOperation CreateDepthwiseConvolution2D(
}

GPUOperation CreateDepthwiseConvolution2DDynamicWeights(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr) {
GPUOperation op(definition);
op.args_.AddInt("stride_x", attr.strides.w);
@ -286,8 +286,8 @@ GPUOperation CreateDepthwiseConvolution2DDynamicWeights(
op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;

TensorLinearDescriptor desc;
desc.storage_type = device_info.IsMali() ? LinearStorageType::BUFFER
: LinearStorageType::TEXTURE_2D;
desc.storage_type = gpu_info.IsMali() ? LinearStorageType::BUFFER
: LinearStorageType::TEXTURE_2D;
desc.element_type = definition.GetDataType();
desc.UploadLinearData(attr.bias);
op.args_.AddObject(
@ -296,9 +296,9 @@ GPUOperation CreateDepthwiseConvolution2DDynamicWeights(
}

GPUOperation CreateDepthwiseConvolution3D(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const DepthwiseConvolution3DAttributes& attr) {
bool weights_are_buffer = device_info.IsMali();
bool weights_are_buffer = gpu_info.IsMali();
GPUOperation op(definition);
op.args_.AddInt("kernel_size_x", attr.weights.shape.w);
op.args_.AddInt("stride_x", attr.strides.w);

@ -183,15 +183,15 @@ void UploadWeightsForDWConv3D(const tflite::gpu::Tensor<OHWDI, T>& weights,
}

GPUOperation CreateDepthwiseConvolution2D(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr);

GPUOperation CreateDepthwiseConvolution2DDynamicWeights(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr);

GPUOperation CreateDepthwiseConvolution3D(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const DepthwiseConvolution3DAttributes& attr);

} // namespace cl

@ -29,15 +29,14 @@ namespace cl {
DepthwiseConv3x3::DepthwiseConv3x3(const OperationDef& definition,
bool weights_are_buffer,
bool local_mem_uploads,
const DeviceInfo& device_info)
: GPUOperation(definition),
local_mem_uploads_(local_mem_uploads) {
const GpuInfo& gpu_info)
: GPUOperation(definition), local_mem_uploads_(local_mem_uploads) {
work_group_size_ = int3(8, 4, 1);
code_ = GenerateDepthwiseConvCode(definition_, weights_are_buffer,
local_mem_uploads_);

if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsPowerVR()) {
gpu_info.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
}
@ -293,12 +292,12 @@ int3 DepthwiseConv3x3::GetGridSize() const {
}

void DepthwiseConv3x3::GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
if (local_mem_uploads_) {
work_groups->push_back(work_group_size_);
} else {
GetPossibleWorkGroups(tuning_type, device_info, kernel_info, grid_size_,
GetPossibleWorkGroups(tuning_type, gpu_info, kernel_info, grid_size_,
work_groups);
}
}
@ -313,12 +312,12 @@ bool IsDepthwiseConv3x3Supported(const DepthwiseConvolution2DAttributes& attr) {
}

DepthwiseConv3x3 CreateDepthwiseConv3x3(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr) {
bool weights_are_buffer = device_info.IsPowerVR() || device_info.IsMali();
bool local_mem_uploads = weights_are_buffer && device_info.IsPowerVR();
bool weights_are_buffer = gpu_info.IsPowerVR() || gpu_info.IsMali();
bool local_mem_uploads = weights_are_buffer && gpu_info.IsPowerVR();
DepthwiseConv3x3 result(definition, weights_are_buffer, local_mem_uploads,
device_info);
gpu_info);
result.UploadWeightsAndBiases(attr.weights, attr.bias, weights_are_buffer);
return result;
}

@ -39,7 +39,7 @@ class DepthwiseConv3x3 : public GPUOperation {
public:
DepthwiseConv3x3() = default;
void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override;
int3 GetGridSize() const override;
@ -53,14 +53,14 @@ class DepthwiseConv3x3 : public GPUOperation {
private:
explicit DepthwiseConv3x3(const OperationDef& definition,
bool weights_are_buffer, bool local_mem_uploads,
const DeviceInfo& device_info);
const GpuInfo& gpu_info);
template <DataType T>
void UploadWeightsAndBiases(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases,
bool weights_are_buffer);

friend DepthwiseConv3x3 CreateDepthwiseConv3x3(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr);

template <DataType S, typename T>
@ -151,7 +151,7 @@ void DepthwiseConv3x3::RearrangeWeightsAndBiasesData(
bool IsDepthwiseConv3x3Supported(const DepthwiseConvolution2DAttributes& attr);

DepthwiseConv3x3 CreateDepthwiseConv3x3(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr);

} // namespace cl

@ -56,8 +56,8 @@ TEST_F(OpenCLOperationTest, DepthwiseConv3x3SimpleWeights) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
DepthwiseConv3x3 operation = CreateDepthwiseConv3x3(
creation_context_.GetDeviceInfo(), op_def, attr);
DepthwiseConv3x3 operation =
CreateDepthwiseConv3x3(creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -92,8 +92,8 @@ TEST_F(OpenCLOperationTest, DepthwiseConv3x3) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
DepthwiseConv3x3 operation = CreateDepthwiseConv3x3(
creation_context_.GetDeviceInfo(), op_def, attr);
DepthwiseConv3x3 operation =
CreateDepthwiseConv3x3(creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,

@ -56,7 +56,7 @@ TEST_F(OpenCLOperationTest, DepthwiseConvSimpleWeights) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation = CreateDepthwiseConvolution2D(
creation_context_.GetDeviceInfo(), op_def, attr);
creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -91,7 +91,7 @@ TEST_F(OpenCLOperationTest, DepthwiseConvNoMultiplier) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation = CreateDepthwiseConvolution2D(
creation_context_.GetDeviceInfo(), op_def, attr);
creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -127,7 +127,7 @@ TEST_F(OpenCLOperationTest, DepthwiseConvMultiplier2) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation = CreateDepthwiseConvolution2D(
creation_context_.GetDeviceInfo(), op_def, attr);
creation_context_.GetGpuInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 4), &dst_tensor));
EXPECT_THAT(

@ -197,14 +197,14 @@ GPUOperation CreateElementwiseOneRuntimeOneScalar(
// Creates simple two input(first input is runtime tensor and second input is
// constant linear tensor) operation, for example sub, div and etc.
GPUOperation CreateElementwiseTwoInput(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const OperationType& op_type,
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& constant_tensor,
bool swap_inputs) {
const BHWC shape = BHWC(1, 1, 1, constant_tensor.shape.v);
TensorStorageType storage_type = SelectBestStorageType(
device_info, shape, definition.GetPrimaryStorageType(),
definition.GetDataType(), Layout::HWC);
TensorStorageType storage_type =
SelectBestStorageType(gpu_info, shape, definition.GetPrimaryStorageType(),
definition.GetDataType(), Layout::HWC);
TensorDescriptor desc{definition.GetDataType(), storage_type, Layout::HWC};
desc.UploadData(constant_tensor);

@ -228,15 +228,15 @@ GPUOperation CreateElementwiseTwoInput(
// Creates simple two input(first input is runtime tensor and second input is
// constant HWC tensor) operation, for example sub, div and etc.
GPUOperation CreateElementwiseTwoInput(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const OperationType& op_type,
const tflite::gpu::Tensor<HWC, DataType::FLOAT32>& constant_tensor,
bool swap_inputs) {
const BHWC shape = BHWC(1, constant_tensor.shape.h, constant_tensor.shape.w,
constant_tensor.shape.c);
TensorStorageType storage_type = SelectBestStorageType(
device_info, shape, definition.GetPrimaryStorageType(),
definition.GetDataType(), Layout::HWC);
TensorStorageType storage_type =
SelectBestStorageType(gpu_info, shape, definition.GetPrimaryStorageType(),
definition.GetDataType(), Layout::HWC);
TensorDescriptor desc{definition.GetDataType(), storage_type, Layout::HWC};
desc.UploadData(constant_tensor);

@ -270,7 +270,7 @@ GPUOperation CreateElementwiseOneInput(const OperationDef& definition,
return op;
}

GPUOperation CreateElementwise(const DeviceInfo& device_info,
GPUOperation CreateElementwise(const GpuInfo& gpu_info,
const OperationDef& definition,
const OperationType& op_type,
const ElementwiseAttributes& attr) {
@ -284,12 +284,11 @@ GPUOperation CreateElementwise(const DeviceInfo& device_info,
return CreateElementwiseOneRuntimeOneScalar(definition, op_type, *scalar,
attr.runtime_tensor_is_second);
} else if (linear_tensor) {
return CreateElementwiseTwoInput(device_info, definition, op_type,
return CreateElementwiseTwoInput(gpu_info, definition, op_type,
*linear_tensor,
attr.runtime_tensor_is_second);
} else if (hwc_tensor) {
return CreateElementwiseTwoInput(device_info, definition, op_type,
*hwc_tensor,
return CreateElementwiseTwoInput(gpu_info, definition, op_type, *hwc_tensor,
attr.runtime_tensor_is_second);
} else {
return GPUOperation(definition);

@ -33,7 +33,7 @@ GPUOperation CreateElementwiseOneInput(const OperationDef& definition,

// Creates simple two input(first input is runtime tensor and second input is
// constant or linear/hwc tensor) operation, for example sub, div and etc.
GPUOperation CreateElementwise(const DeviceInfo& device_info,
GPUOperation CreateElementwise(const GpuInfo& gpu_info,
const OperationDef& definition,
const OperationType& op_type,
const ElementwiseAttributes& attr);

@ -570,9 +570,8 @@ TEST_F(OpenCLOperationTest, MaximumWithScalar) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreateElementwise(creation_context_.GetDeviceInfo(), op_def,
OperationType::MAXIMUM, attr);
GPUOperation operation = CreateElementwise(
creation_context_.GetGpuInfo(), op_def, OperationType::MAXIMUM, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
BHWC(1, 4, 1, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -601,9 +600,8 @@ TEST_F(OpenCLOperationTest, MaximumWithConstantLinearTensor) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreateElementwise(creation_context_.GetDeviceInfo(), op_def,
OperationType::MAXIMUM, attr);
GPUOperation operation = CreateElementwise(
creation_context_.GetGpuInfo(), op_def, OperationType::MAXIMUM, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -632,9 +630,8 @@ TEST_F(OpenCLOperationTest, MaximumWithConstantHWCTensor) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreateElementwise(creation_context_.GetDeviceInfo(), op_def,
OperationType::MAXIMUM, attr);
GPUOperation operation = CreateElementwise(
creation_context_.GetGpuInfo(), op_def, OperationType::MAXIMUM, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -662,9 +659,8 @@ TEST_F(OpenCLOperationTest, MaximumWithConstantHWCTensorBroadcastChannels) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreateElementwise(creation_context_.GetDeviceInfo(), op_def,
OperationType::MAXIMUM, attr);
GPUOperation operation = CreateElementwise(
creation_context_.GetGpuInfo(), op_def, OperationType::MAXIMUM, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -718,9 +714,8 @@ TEST_F(OpenCLOperationTest, MinimumWithScalar) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreateElementwise(creation_context_.GetDeviceInfo(), op_def,
OperationType::MINIMUM, attr);
GPUOperation operation = CreateElementwise(
creation_context_.GetGpuInfo(), op_def, OperationType::MINIMUM, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
BHWC(1, 4, 1, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -832,7 +827,7 @@ TEST_F(OpenCLOperationTest, SubWithScalarAtFirstPosition) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation = CreateElementwise(
creation_context_.GetDeviceInfo(), op_def, OperationType::SUB, attr);
creation_context_.GetGpuInfo(), op_def, OperationType::SUB, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
BHWC(1, 4, 1, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -887,7 +882,7 @@ TEST_F(OpenCLOperationTest, LessEqual) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreateElementwise(creation_context_.GetDeviceInfo(), op_def,
CreateElementwise(creation_context_.GetGpuInfo(), op_def,
OperationType::LESS_EQUAL, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
@ -914,9 +909,8 @@ TEST_F(OpenCLOperationTest, Greater) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreateElementwise(creation_context_.GetDeviceInfo(), op_def,
OperationType::GREATER, attr);
GPUOperation operation = CreateElementwise(
creation_context_.GetGpuInfo(), op_def, OperationType::GREATER, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -943,7 +937,7 @@ TEST_F(OpenCLOperationTest, GreaterEqual) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreateElementwise(creation_context_.GetDeviceInfo(), op_def,
CreateElementwise(creation_context_.GetGpuInfo(), op_def,
OperationType::GREATER_EQUAL, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
@ -970,9 +964,8 @@ TEST_F(OpenCLOperationTest, Equal) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreateElementwise(creation_context_.GetDeviceInfo(), op_def,
OperationType::EQUAL, attr);
GPUOperation operation = CreateElementwise(
creation_context_.GetGpuInfo(), op_def, OperationType::EQUAL, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
@ -999,7 +992,7 @@ TEST_F(OpenCLOperationTest, NotEqual) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
GPUOperation operation =
CreateElementwise(creation_context_.GetDeviceInfo(), op_def,
CreateElementwise(creation_context_.GetGpuInfo(), op_def,
OperationType::NOT_EQUAL, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));

@ -32,32 +32,32 @@ namespace tflite {
namespace gpu {
namespace cl {
namespace {
bool UseBufferForWeights(const DeviceInfo& device_info) {
return device_info.IsAdreno() || device_info.IsAMD() || device_info.IsMali();
bool UseBufferForWeights(const GpuInfo& gpu_info) {
return gpu_info.IsAdreno() || gpu_info.IsAMD() || gpu_info.IsMali();
}
} // namespace

FullyConnected::FullyConnected(const OperationDef& definition,
const DeviceInfo& device_info)
const GpuInfo& gpu_info)
: GPUOperation(definition) {
if (device_info.IsAdreno()) {
if (device_info.adreno_info.IsAdreno3xx()) {
if (gpu_info.IsAdreno()) {
if (gpu_info.adreno_info.IsAdreno3xx()) {
work_group_size_ = int3(16, 4, 1);
} else if (device_info.adreno_info.IsAdreno4xx()) {
} else if (gpu_info.adreno_info.IsAdreno4xx()) {
work_group_size_ = int3(32, 4, 1);
} else {
work_group_size_ = int3(32, 4, 1);
}
} else if (device_info.IsIntel()) {
} else if (gpu_info.IsIntel()) {
work_group_size_ = int3(8, 4, 1);
} else if (device_info.IsNvidia()) {
} else if (gpu_info.IsNvidia()) {
work_group_size_ = int3(8, 4, 1);
} else if (device_info.IsPowerVR()) {
} else if (gpu_info.IsPowerVR()) {
work_group_size_ = int3(8, 4, 1);
} else {
work_group_size_ = int3(16, 4, 1);
}
code_ = GetFullyConnectedKernelCode(definition_, device_info);
code_ = GetFullyConnectedKernelCode(definition_, gpu_info);
}

FullyConnected::FullyConnected(FullyConnected&& kernel)
@ -77,11 +77,11 @@ FullyConnected& FullyConnected::operator=(FullyConnected&& kernel) {
// optimized shaders

std::string FullyConnected::GetFullyConnectedKernelCode(
const OperationDef& op_def, const DeviceInfo& device_info) {
const OperationDef& op_def, const GpuInfo& gpu_info) {
AddSrcTensor("src_tensor", op_def.src_tensors[0]);
AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

const bool weights_are_buffer = UseBufferForWeights(device_info);
const bool weights_are_buffer = UseBufferForWeights(gpu_info);

std::string c = GetCommonDefines(op_def.precision);
switch (op_def.precision) {
@ -150,11 +150,11 @@ int3 FullyConnected::GetGridSize() const {
return int3(dst_[0]->Slices(), 1, 1);
}

FullyConnected CreateFullyConnected(const DeviceInfo& device_info,
FullyConnected CreateFullyConnected(const GpuInfo& gpu_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr) {
FullyConnected result(definition, device_info);
result.UploadWeights(attr.weights, UseBufferForWeights(device_info));
FullyConnected result(definition, gpu_info);
result.UploadWeights(attr.weights, UseBufferForWeights(gpu_info));

TensorLinearDescriptor desc;
desc.storage_type = LinearStorageType::TEXTURE_2D;

@ -121,7 +121,7 @@ class FullyConnected : public GPUOperation {
public:
FullyConnected() = default;
void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
TuningType tuning_type, const GpuInfo& gpu_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override {
work_groups->push_back(work_group_size_);
@ -135,9 +135,9 @@ class FullyConnected : public GPUOperation {
FullyConnected& operator=(const FullyConnected&) = delete;

private:
FullyConnected(const OperationDef& definition, const DeviceInfo& device_info);
FullyConnected(const OperationDef& definition, const GpuInfo& gpu_info);
friend FullyConnected CreateFullyConnected(
const DeviceInfo& device_info, const OperationDef& definition,
const GpuInfo& gpu_info, const OperationDef& definition,
const FullyConnectedAttributes& attr);

template <DataType T>
@ -145,7 +145,7 @@ class FullyConnected : public GPUOperation {
bool weights_are_buffer);

std::string GetFullyConnectedKernelCode(const OperationDef& op_def,
const DeviceInfo& device_info);
const GpuInfo& gpu_info);
};

template <DataType T>
@ -195,7 +195,7 @@ void FullyConnected::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
}
}

FullyConnected CreateFullyConnected(const DeviceInfo& device_info,
FullyConnected CreateFullyConnected(const GpuInfo& gpu_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr);

@ -58,7 +58,7 @@ TEST_F(OpenCLOperationTest, FullyConnected) {
      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
      TensorFloat32 dst_tensor;
      FullyConnected operation =
          CreateFullyConnected(creation_context_.GetDeviceInfo(), op_def, attr);
          CreateFullyConnected(creation_context_.GetGpuInfo(), op_def, attr);
      ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                    BHWC(1, 1, 1, 2), &dst_tensor));
      EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {14.5f, 37.5f}))
@ -102,7 +102,7 @@ TEST_F(OpenCLOperationTest, FullyConnectedLarge) {
      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
      TensorFloat32 dst_tensor;
      FullyConnected operation =
          CreateFullyConnected(creation_context_.GetDeviceInfo(), op_def, attr);
          CreateFullyConnected(creation_context_.GetGpuInfo(), op_def, attr);
      ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                    BHWC(1, 1, 1, 12), &dst_tensor));
      EXPECT_THAT(
@ -151,7 +151,7 @@ TEST_F(OpenCLOperationTest, FullyConnectedExtraLarge) {
      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
      TensorFloat32 dst_tensor;
      FullyConnected operation =
          CreateFullyConnected(creation_context_.GetDeviceInfo(), op_def, attr);
          CreateFullyConnected(creation_context_.GetGpuInfo(), op_def, attr);
      ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                    BHWC(1, 1, 1, kOutputSize), &dst_tensor));
      EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), expected))

@ -236,7 +236,7 @@ absl::Status GPUOperation::UpdateParams() {
  return absl::OkStatus();
}

absl::Status GPUOperation::AssembleCode(const DeviceInfo& device_info,
absl::Status GPUOperation::AssembleCode(const GpuInfo& gpu_info,
                                        CLContext* context) {
  if (elementwise_) {
    auto src_desc =
@ -258,14 +258,13 @@ absl::Status GPUOperation::AssembleCode(const DeviceInfo& device_info,
    elementwise_code_ = "{\n" + code_ + "\n}\n" + elementwise_code_;
    code_ = GetElementWiseCode(definition_, check_src_channels_size_);
  }
  return cl_args_.Init(device_info,
                       {{dst_tensors_names_[0], elementwise_code_}}, context,
                       &args_, &code_);
  return cl_args_.Init(gpu_info, {{dst_tensors_names_[0], elementwise_code_}},
                       context, &args_, &code_);
}

absl::Status GPUOperation::Compile(const CreationContext& creation_context) {
  RETURN_IF_ERROR(
      AssembleCode(creation_context.GetDeviceInfo(), creation_context.context));
      AssembleCode(creation_context.GetGpuInfo(), creation_context.context));
  RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
      code_, "main_function", compiler_options_, *creation_context.context,
      *creation_context.device, &kernel_));
@ -274,7 +273,7 @@ absl::Status GPUOperation::Compile(const CreationContext& creation_context) {

absl::Status GPUOperation::CompileDeserialized(
    const CreationContext& creation_context) {
  RETURN_IF_ERROR(cl_args_.Init(creation_context.GetDeviceInfo(), &args_,
  RETURN_IF_ERROR(cl_args_.Init(creation_context.GetGpuInfo(), &args_,
                                creation_context.context));
  return creation_context.cache->GetOrCreateCLKernel(
      code_, "main_function", compiler_options_, *creation_context.context,
@ -282,9 +281,9 @@ absl::Status GPUOperation::CompileDeserialized(
}

void GPUOperation::GetPossibleKernelWorkGroups(
    TuningType tuning_type, const DeviceInfo& device_info,
    TuningType tuning_type, const GpuInfo& gpu_info,
    const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
  GetPossibleWorkGroups(tuning_type, device_info, kernel_info, grid_size_,
  GetPossibleWorkGroups(tuning_type, gpu_info, kernel_info, grid_size_,
                        work_groups);
}

@ -74,7 +74,7 @@ struct CreationContext {
  CLCommandQueue* queue;
  ProgramCache* cache;

  const DeviceInfo& GetDeviceInfo() const { return device->info_; }
  const GpuInfo& GetGpuInfo() const { return device->info_; }
};

struct OperationDef {
@ -126,18 +126,18 @@ class GPUOperation {
  }

  virtual void GetPossibleKernelWorkGroups(
      TuningType tuning_type, const DeviceInfo& device_info,
      TuningType tuning_type, const GpuInfo& gpu_info,
      const KernelInfo& kernel_info, std::vector<int3>* work_groups) const;

  absl::Status Tune(const TuningParameters& params);

  absl::Status AssembleCode(const DeviceInfo& device_info, CLContext* context);
  absl::Status AssembleCode(const GpuInfo& gpu_info, CLContext* context);

  absl::Status Compile(const CreationContext& creation_context);

  absl::Status CompileDeserialized(const CreationContext& creation_context);

  virtual absl::Status PostCompileCheck(const DeviceInfo& device_info,
  virtual absl::Status PostCompileCheck(const GpuInfo& gpu_info,
                                        const KernelInfo& kernel_info) {
    return absl::OkStatus();
  }

@ -24,8 +24,7 @@ namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::string GetLSTMCode(const OperationDef& op_def,
                        const DeviceInfo& device_info) {
std::string GetLSTMCode(const OperationDef& op_def, const GpuInfo& gpu_info) {
  std::string c = GetCommonDefines(op_def.precision);
  c += "__kernel void main_function(\n";
  c += "$0) {\n";
@ -39,8 +38,7 @@ std::string GetLSTMCode(const OperationDef& op_def,
  c += " FLT4 r1 = args.intermediate.Read(0, 0, Z + state_stride, B);\n";
  c += " FLT4 r2 = args.intermediate.Read(0, 0, Z + state_stride * 2, B);\n";
  c += " FLT4 r3 = args.intermediate.Read(0, 0, Z + state_stride * 3, B);\n";
  if (op_def.precision != CalculationsPrecision::F32 &&
      device_info.IsAdreno()) {
  if (op_def.precision != CalculationsPrecision::F32 && gpu_info.IsAdreno()) {
    c += " FLT4 input_gate;\n";
    c += " FLT4 new_input;\n";
    c += " FLT4 forget_gate;\n";
@ -88,13 +86,13 @@ std::string GetLSTMCode(const OperationDef& op_def,
}  // namespace

GPUOperation CreateLSTM(const OperationDef& definition,
                        const DeviceInfo& device_info) {
                        const GpuInfo& gpu_info) {
  GPUOperation op(definition);
  op.AddSrcTensor("intermediate", definition.src_tensors[0]);
  op.AddSrcTensor("prev_state", definition.src_tensors[1]);
  op.AddDstTensor("new_state", definition.dst_tensors[0]);
  op.AddDstTensor("activation", definition.dst_tensors[1]);
  op.code_ = GetLSTMCode(definition, device_info);
  op.code_ = GetLSTMCode(definition, gpu_info);
  op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;
  return op;
}

@ -26,7 +26,7 @@ namespace gpu {
namespace cl {

GPUOperation CreateLSTM(const OperationDef& definition,
                        const DeviceInfo& device_info);
                        const GpuInfo& gpu_info);

}  // namespace cl
}  // namespace gpu

@ -26,19 +26,19 @@ namespace tflite {
namespace gpu {
namespace cl {

Mean::Mean(const OperationDef& definition, const DeviceInfo& device_info)
Mean::Mean(const OperationDef& definition, const GpuInfo& gpu_info)
    : GPUOperation(definition) {
  // for workgroup size:
  // must be: (x * y) % 4 = 0;
  // must be: z = 1;
  work_group_size_ = int3(16, 16, 1);
  if (device_info.IsAdreno()) {
    if (device_info.adreno_info.IsAdreno3xx()) {
  if (gpu_info.IsAdreno()) {
    if (gpu_info.adreno_info.IsAdreno3xx()) {
      work_group_size_ = int3(16, 8, 1);
    }
  }
  if (device_info.IsMali()) {
    const MaliInfo& mali_info = device_info.mali_info;
  if (gpu_info.IsMali()) {
    const MaliInfo& mali_info = gpu_info.mali_info;
    if (mali_info.IsMaliT6xx() || mali_info.IsMaliT7xx() ||
        mali_info.IsMaliT8xx()) {
      work_group_size_ = int3(8, 4, 1);
@ -135,8 +135,8 @@ int3 Mean::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

Mean CreateMean(const OperationDef& definition, const DeviceInfo& device_info) {
  return Mean(definition, device_info);
Mean CreateMean(const OperationDef& definition, const GpuInfo& gpu_info) {
  return Mean(definition, gpu_info);
}

}  // namespace cl

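The comment in the Mean constructor above documents two constraints on the work group: (x * y) must be a multiple of 4 and z must be 1. A tiny standalone check makes the rule explicit; IsValidMeanWorkGroup is a hypothetical helper used only for illustration, not part of the real code.

#include <cassert>

// Hypothetical helper expressing the constraints documented in the Mean
// constructor: (x * y) % 4 == 0 and z == 1.
bool IsValidMeanWorkGroup(int x, int y, int z) {
  return (x * y) % 4 == 0 && z == 1;
}

int main() {
  assert(IsValidMeanWorkGroup(16, 16, 1));  // default size
  assert(IsValidMeanWorkGroup(16, 8, 1));   // Adreno3xx size
  assert(IsValidMeanWorkGroup(8, 4, 1));    // older Mali size
  assert(!IsValidMeanWorkGroup(3, 3, 1));   // 9 is not a multiple of 4
  return 0;
}
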
@ -28,10 +28,10 @@ namespace cl {
class Mean : public GPUOperation {
 public:
  Mean() = default;
  Mean(const OperationDef& definition, const DeviceInfo& device_info);
  Mean(const OperationDef& definition, const GpuInfo& gpu_info);

  void GetPossibleKernelWorkGroups(
      TuningType tuning_type, const DeviceInfo& device_info,
      TuningType tuning_type, const GpuInfo& gpu_info,
      const KernelInfo& kernel_info,
      std::vector<int3>* work_groups) const override {
    work_groups->push_back(work_group_size_);
@ -50,7 +50,7 @@ class Mean : public GPUOperation {
                      const int3& work_group_size);
};

Mean CreateMean(const OperationDef& definition, const DeviceInfo& device_info);
Mean CreateMean(const OperationDef& definition, const GpuInfo& gpu_info);

}  // namespace cl
}  // namespace gpu

@ -86,7 +86,7 @@ float4 filter_outside_tensor(float4 x, int num_channels, int slice) {
}  // namespace

MeanStdDevNormalization::MeanStdDevNormalization(const OperationDef& definition,
                                                 const DeviceInfo& device_info,
                                                 const GpuInfo& gpu_info,
                                                 const int tensor_slices)
    : GPUOperation(definition) {
  // The kernel code does not inherently need a fixed size, but in order to not
@ -95,15 +95,15 @@ MeanStdDevNormalization::MeanStdDevNormalization(const OperationDef& definition,
  // For now, fix workgroup size to the biggest supported by the device, but not
  // larger than the number of tensor slices.
  int desired_work_group_size =
      std::min(tensor_slices, device_info.max_work_group_size_x);
  if (device_info.IsMali()) {
      std::min(tensor_slices, gpu_info.max_work_group_size_x);
  if (gpu_info.IsMali()) {
    // Don't use more than 64 work items per work group on ARM Mali. They
    // implement local memory using the global memory, larger workgroups have
    // severe performance penalty.
    desired_work_group_size = 64;
  }
  if (device_info.IsAdreno()) {
    AdrenoInfo info = device_info.adreno_info;
  if (gpu_info.IsAdreno()) {
    AdrenoInfo info = gpu_info.adreno_info;
    if (info.IsAdreno3xx()) {
      if (info.adreno_gpu == AdrenoGpu::kAdreno320 ||
          info.adreno_gpu == AdrenoGpu::kAdreno330) {
@ -126,7 +126,7 @@ MeanStdDevNormalization::MeanStdDevNormalization(const OperationDef& definition,
      }
    }
  }
  if (device_info.IsPowerVR()) {
  if (gpu_info.IsPowerVR()) {
    desired_work_group_size = 64;
  }
  while (desired_work_group_size >= tensor_slices * 2) {
@ -136,9 +136,9 @@ MeanStdDevNormalization::MeanStdDevNormalization(const OperationDef& definition,
  work_group_size_.y = 1;  // Required
  work_group_size_.z = 1;  // Required
  code_ = GetNormalizationCode();
  if (device_info.cl_version >= OpenCLVersion::CL_3_0) {
  if (gpu_info.cl_version >= OpenCLVersion::CL_3_0) {
    compiler_options_.push_back(CompilerOptions::CL_3_0);
  } else if (device_info.cl_version >= OpenCLVersion::CL_2_0) {
  } else if (gpu_info.cl_version >= OpenCLVersion::CL_2_0) {
    compiler_options_.push_back(CompilerOptions::CL_2_0);
  }
}
@ -205,9 +205,9 @@ int3 MeanStdDevNormalization::GetGridSize() const {
}

MeanStdDevNormalization CreateMeanStdDevNormalization(
    const OperationDef& definition, const DeviceInfo& device_info,
    const OperationDef& definition, const GpuInfo& gpu_info,
    const int tensor_slices) {
  return MeanStdDevNormalization(definition, device_info, tensor_slices);
  return MeanStdDevNormalization(definition, gpu_info, tensor_slices);
}

}  // namespace cl

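The MeanStdDevNormalization constructor above caps the desired work-group width at the device's X limit (overridden to 64 on Mali and PowerVR) and then halves it until it no longer reaches twice the number of tensor slices. A minimal sketch of that sizing step, assuming tensor_slices >= 1 and ignoring the Adreno-specific adjustments that the hunk truncates:

#include <algorithm>
#include <cstdio>

// Sketch of the work-group sizing above. Assumes tensor_slices >= 1.
int DesiredWorkGroupSize(int tensor_slices, int max_wg_size_x,
                         bool is_mali_or_powervr) {
  int size = std::min(tensor_slices, max_wg_size_x);
  if (is_mali_or_powervr) {
    size = 64;  // Larger groups are slow on Mali; PowerVR also prefers 64.
  }
  // Halve until the group is below ~2x the number of slices.
  while (size >= tensor_slices * 2) {
    size /= 2;
  }
  return size;
}

int main() {
  std::printf("%d\n", DesiredWorkGroupSize(/*tensor_slices=*/10,
                                           /*max_wg_size_x=*/256,
                                           /*is_mali_or_powervr=*/true));  // 16
  return 0;
}
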
@ -30,11 +30,11 @@ namespace cl {
class MeanStdDevNormalization : public GPUOperation {
 public:
  explicit MeanStdDevNormalization(const OperationDef& definition,
                                   const DeviceInfo& device_info,
                                   const GpuInfo& gpu_info,
                                   const int tensor_slices);

  void GetPossibleKernelWorkGroups(
      TuningType tuning_type, const DeviceInfo& device_info,
      TuningType tuning_type, const GpuInfo& gpu_info,
      const KernelInfo& kernel_info,
      std::vector<int3>* work_groups) const override {
    work_groups->push_back(work_group_size_);
@ -53,7 +53,7 @@ class MeanStdDevNormalization : public GPUOperation {
};

MeanStdDevNormalization CreateMeanStdDevNormalization(
    const OperationDef& definition, const DeviceInfo& device_info,
    const OperationDef& definition, const GpuInfo& gpu_info,
    const int tensor_slices);

}  // namespace cl

@ -25,7 +25,7 @@ namespace tflite {
namespace gpu {
namespace cl {

GPUOperation CreatePReLU(const DeviceInfo& device_info,
GPUOperation CreatePReLU(const GpuInfo& gpu_info,
                         const OperationDef& definition,
                         const PReLUAttributes& attr) {
  GPUOperation result(definition);
@ -51,7 +51,7 @@ GPUOperation CreatePReLU(const DeviceInfo& device_info,
  const BHWC shape =
      BHWC(1, alpha_hwc->shape.h, alpha_hwc->shape.w, alpha_hwc->shape.c);
  TensorStorageType storage_type = SelectBestStorageType(
      device_info, shape, definition.GetPrimaryStorageType(),
      gpu_info, shape, definition.GetPrimaryStorageType(),
      definition.GetDataType(), Layout::HWC);
  TensorDescriptor desc{definition.GetDataType(), storage_type, Layout::HWC};
  desc.UploadData(*alpha_hwc);

@ -31,7 +31,7 @@ namespace tflite {
namespace gpu {
namespace cl {

GPUOperation CreatePReLU(const DeviceInfo& device_info,
GPUOperation CreatePReLU(const GpuInfo& gpu_info,
                         const OperationDef& definition,
                         const PReLUAttributes& attr);

@ -53,7 +53,7 @@ TEST_F(OpenCLOperationTest, PReLUAlpha) {
      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
      TensorFloat32 dst_tensor;
      GPUOperation operation =
          CreatePReLU(creation_context_.GetDeviceInfo(), op_def, attr);
          CreatePReLU(creation_context_.GetGpuInfo(), op_def, attr);
      ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                    BHWC(1, 2, 1, 2), &dst_tensor));
      EXPECT_THAT(dst_tensor.data,
@ -84,7 +84,7 @@ TEST_F(OpenCLOperationTest, PReLUAlphaClip) {
      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
      TensorFloat32 dst_tensor;
      GPUOperation operation =
          CreatePReLU(creation_context_.GetDeviceInfo(), op_def, attr);
          CreatePReLU(creation_context_.GetGpuInfo(), op_def, attr);
      ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                    BHWC(1, 2, 1, 2), &dst_tensor));
      EXPECT_THAT(dst_tensor.data,
@ -115,7 +115,7 @@ TEST_F(OpenCLOperationTest, PReLUHWCAlpha) {
      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
      TensorFloat32 dst_tensor;
      GPUOperation operation =
          CreatePReLU(creation_context_.GetDeviceInfo(), op_def, attr);
          CreatePReLU(creation_context_.GetGpuInfo(), op_def, attr);
      ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
                                    BHWC(1, 2, 1, 2), &dst_tensor));
      EXPECT_THAT(dst_tensor.data,

@ -38,7 +38,7 @@ TEST_F(OpenCLOperationTest, ReduceSumChannels) {
  src_tensor.shape = BHWC(1, 2, 1, 5);
  src_tensor.data = {1.1, 2.1, 0.7, 0.3, 1.2, 3.1, 4.1, 0.0, 1.0, 4.4};
  ReduceAttributes attr;
  attr.axis = Axis::CHANNELS;
  attr.dims = {Axis::CHANNELS};

  for (auto storage : env_.GetSupportedStorages()) {
    for (auto precision : env_.GetSupportedPrecisions()) {
@ -63,7 +63,7 @@ TEST_F(OpenCLOperationTest, ReduceProductChannels) {
  src_tensor.shape = BHWC(1, 2, 1, 2);
  src_tensor.data = {1.1, 2.0, 3.1, 4.0};
  ReduceAttributes attr;
  attr.axis = Axis::CHANNELS;
  attr.dims = {Axis::CHANNELS};

  for (auto storage : env_.GetSupportedStorages()) {
    for (auto precision : env_.GetSupportedPrecisions()) {
@ -89,7 +89,7 @@ TEST_F(OpenCLOperationTest, ReduceMaxChannels) {
  src_tensor.data = {1.1, 2.0, -0.3, -100.0, 32.6, 1.1,
                     -3.1, -4.0, -5.0, -7.0, -2.0, -100.0};
  ReduceAttributes attr;
  attr.axis = Axis::CHANNELS;
  attr.dims = {Axis::CHANNELS};

  for (auto storage : env_.GetSupportedStorages()) {
    for (auto precision : env_.GetSupportedPrecisions()) {
@ -115,7 +115,7 @@ TEST_F(OpenCLOperationTest, ReduceMinChannels) {
  src_tensor.data = {1.1, 2.0, -0.3, -100.0, 32.6, 1.1,
                     -3.1, -4.0, -5.0, -7.0, -2.0, 100.0};
  ReduceAttributes attr;
  attr.axis = Axis::CHANNELS;
  attr.dims = {Axis::CHANNELS};

  for (auto storage : env_.GetSupportedStorages()) {
    for (auto precision : env_.GetSupportedPrecisions()) {

@ -29,7 +29,7 @@ class Softmax1x1 : public GPUOperation {
  Softmax1x1() = default;
  explicit Softmax1x1(const OperationDef& definition);
  void GetPossibleKernelWorkGroups(
      TuningType tuning_type, const DeviceInfo& device_info,
      TuningType tuning_type, const GpuInfo& gpu_info,
      const KernelInfo& kernel_info,
      std::vector<int3>* work_groups) const override {
    work_groups->push_back(work_group_size_);

@ -32,31 +32,31 @@ namespace tflite {
namespace gpu {
namespace cl {
namespace {
bool UseBufferForWeights(const DeviceInfo& device_info) {
  return device_info.IsAdreno() || device_info.IsAMD() || device_info.IsMali();
bool UseBufferForWeights(const GpuInfo& gpu_info) {
  return gpu_info.IsAdreno() || gpu_info.IsAMD() || gpu_info.IsMali();
}
}  // namespace

FCFCAdd::FCFCAdd(const OperationDef& definition, const DeviceInfo& device_info)
FCFCAdd::FCFCAdd(const OperationDef& definition, const GpuInfo& gpu_info)
    : GPUOperation(definition) {
  if (device_info.IsAdreno()) {
    if (device_info.adreno_info.IsAdreno3xx()) {
  if (gpu_info.IsAdreno()) {
    if (gpu_info.adreno_info.IsAdreno3xx()) {
      work_group_size_ = int3(16, 4, 1);
    } else if (device_info.adreno_info.IsAdreno4xx()) {
    } else if (gpu_info.adreno_info.IsAdreno4xx()) {
      work_group_size_ = int3(32, 4, 1);
    } else {
      work_group_size_ = int3(32, 4, 1);
    }
  } else if (device_info.IsIntel()) {
  } else if (gpu_info.IsIntel()) {
    work_group_size_ = int3(8, 4, 1);
  } else if (device_info.IsNvidia()) {
  } else if (gpu_info.IsNvidia()) {
    work_group_size_ = int3(8, 4, 1);
  } else if (device_info.IsPowerVR()) {
  } else if (gpu_info.IsPowerVR()) {
    work_group_size_ = int3(8, 4, 1);
  } else {
    work_group_size_ = int3(16, 4, 1);
  }
  code_ = GetFCFCAddKernelCode(definition_, device_info);
  code_ = GetFCFCAddKernelCode(definition_, gpu_info);
}

FCFCAdd::FCFCAdd(FCFCAdd&& kernel) : GPUOperation(std::move(kernel)) {}
@ -75,12 +75,12 @@ FCFCAdd& FCFCAdd::operator=(FCFCAdd&& kernel) {
// optimized shaders

std::string FCFCAdd::GetFCFCAddKernelCode(const OperationDef& op_def,
                                          const DeviceInfo& device_info) {
                                          const GpuInfo& gpu_info) {
  AddSrcTensor("src_tensor_0", op_def.src_tensors[0]);
  AddSrcTensor("src_tensor_1", op_def.src_tensors[1]);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

  const bool weights_are_buffer = UseBufferForWeights(device_info);
  const bool weights_are_buffer = UseBufferForWeights(gpu_info);

  std::string c = GetCommonDefines(op_def.precision);
  switch (op_def.precision) {
@ -172,15 +172,14 @@ std::string FCFCAdd::GetFCFCAddKernelCode(const OperationDef& op_def,

int3 FCFCAdd::GetGridSize() const { return int3(dst_[0]->Slices(), 1, 1); }

FCFCAdd CreateFCFCAdd(const DeviceInfo& device_info,
                      const OperationDef& definition,
FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info, const OperationDef& definition,
                      const FullyConnectedAttributes& attr0,
                      const FullyConnectedAttributes& attr1) {
  FCFCAdd result(definition, device_info);
  FCFCAdd result(definition, gpu_info);
  result.UploadWeights(attr0.weights, "weights0",
                       UseBufferForWeights(device_info));
                       UseBufferForWeights(gpu_info));
  result.UploadWeights(attr1.weights, "weights1",
                       UseBufferForWeights(device_info));
                       UseBufferForWeights(gpu_info));

  TensorLinearDescriptor desc0;
  desc0.storage_type = LinearStorageType::TEXTURE_2D;

@ -97,7 +97,7 @@ class FCFCAdd : public GPUOperation {
 public:
  FCFCAdd() = default;
  void GetPossibleKernelWorkGroups(
      TuningType tuning_type, const DeviceInfo& device_info,
      TuningType tuning_type, const GpuInfo& gpu_info,
      const KernelInfo& kernel_info,
      std::vector<int3>* work_groups) const override {
    work_groups->push_back(work_group_size_);
@ -111,8 +111,8 @@ class FCFCAdd : public GPUOperation {
  FCFCAdd& operator=(const FCFCAdd&) = delete;

 private:
  FCFCAdd(const OperationDef& definition, const DeviceInfo& device_info);
  friend FCFCAdd CreateFCFCAdd(const DeviceInfo& device_info,
  FCFCAdd(const OperationDef& definition, const GpuInfo& gpu_info);
  friend FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info,
                               const OperationDef& definition,
                               const FullyConnectedAttributes& attr0,
                               const FullyConnectedAttributes& attr1);
@ -122,7 +122,7 @@ class FCFCAdd : public GPUOperation {
                     const std::string& name, bool weights_are_buffer);

  std::string GetFCFCAddKernelCode(const OperationDef& op_def,
                                   const DeviceInfo& device_info);
                                   const GpuInfo& gpu_info);
};

template <DataType T>
@ -175,8 +175,7 @@ void FCFCAdd::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
  }
}

FCFCAdd CreateFCFCAdd(const DeviceInfo& device_info,
                      const OperationDef& definition,
FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info, const OperationDef& definition,
                      const FullyConnectedAttributes& attr0,
                      const FullyConnectedAttributes& attr1);

@ -27,7 +27,7 @@ enum class TuningType { EXHAUSTIVE, FAST };

struct TuningParameters {
  ProfilingCommandQueue* queue;
  const DeviceInfo* info;
  const GpuInfo* info;
  TuningType tuning_type = TuningType::EXHAUSTIVE;
};

@ -114,19 +114,19 @@ int3 GetFirstSuitableWorkGroup(const std::vector<int3>& wgs, int max_wg_size) {
  return {1, 1, 1};
}

int GetRecommendedBlockSizeForConv(const DeviceInfo& device_info,
int GetRecommendedBlockSizeForConv(const GpuInfo& gpu_info,
                                   CalculationsPrecision precision,
                                   int task_size) {
  const float task_size_per_cu =
      task_size / static_cast<float>(device_info.compute_units_count);
      task_size / static_cast<float>(gpu_info.compute_units_count);
  int block_size = 1;
  float threshold_1 = FLT_MAX;
  float threshold_2 = FLT_MAX;
  float threshold_4 = FLT_MAX;
  if (!device_info.IsMali()) {
  if (!gpu_info.IsMali()) {
    return 1;
  }
  MaliInfo mali_info = device_info.mali_info;
  MaliInfo mali_info = gpu_info.mali_info;
  switch (precision) {
    case CalculationsPrecision::F16:
      if (mali_info.IsBifrostGen1()) {

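GetRecommendedBlockSizeForConv above divides the task size by the number of compute units and compares the result against precision- and generation-specific thresholds; every non-Mali GPU simply gets block size 1. A hedged sketch of the shape of that heuristic; the two threshold constants below are illustrative placeholders, not the real per-generation values (which the hunk truncates):

#include <cstdio>

// Illustrative sketch only; threshold_2/threshold_4 are made-up placeholders.
int RecommendedBlockSize(bool is_mali, int compute_units, int task_size) {
  if (!is_mali || compute_units <= 0) return 1;
  const float task_size_per_cu =
      task_size / static_cast<float>(compute_units);
  const float threshold_2 = 256.0f;   // placeholder
  const float threshold_4 = 1024.0f;  // placeholder
  int block_size = 1;
  if (task_size_per_cu >= threshold_2) block_size = 2;
  if (task_size_per_cu >= threshold_4) block_size = 4;
  return block_size;
}

int main() {
  // 4096 FLT4 elements on 8 compute units -> 512 per CU -> block size 2 here.
  std::printf("%d\n", RecommendedBlockSize(true, 8, 4096));
  return 0;
}
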
@ -210,7 +210,7 @@ float4 GetMaskForLastPlane(int channels);
int3 GetFirstSuitableWorkGroup(const std::vector<int3>& wgs, int max_wg_size);

// task_size as amount of FLT4 processed elements.
int GetRecommendedBlockSizeForConv(const DeviceInfo& device,
int GetRecommendedBlockSizeForConv(const GpuInfo& gpu_info,
                                   CalculationsPrecision precision,
                                   int task_size);

@ -33,15 +33,15 @@ namespace cl {

Winograd4x4To36::Winograd4x4To36(const OperationDef& definition,
                                 const Padding2D& padding,
                                 const DeviceInfo& device_info)
                                 const GpuInfo& gpu_info)
    : GPUOperation(definition), padding_(padding) {
  work_group_size_ = int3(32, 1, 1);
  code_ = GetWinograd4x4To36Code(definition_);
  if (device_info.IsAdreno()) {
  if (gpu_info.IsAdreno()) {
    compiler_options_.push_back(CompilerOptions::ADRENO_MORE_WAVES);
  }
  if (definition_.precision == CalculationsPrecision::F16 &&
      device_info.IsPowerVR()) {
      gpu_info.IsPowerVR()) {
    compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
  }
}
@ -282,11 +282,11 @@ int3 Winograd4x4To36::GetGridSize() const {
}

void Winograd4x4To36::GetPossibleKernelWorkGroups(
    TuningType tuning_type, const DeviceInfo& device_info,
    TuningType tuning_type, const GpuInfo& gpu_info,
    const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
  switch (tuning_type) {
    case TuningType::EXHAUSTIVE:
      GetPossibleWorkGroups(tuning_type, device_info, kernel_info, grid_size_,
      GetPossibleWorkGroups(tuning_type, gpu_info, kernel_info, grid_size_,
                            work_groups);
      return;
    case TuningType::FAST:
@ -296,20 +296,20 @@ void Winograd4x4To36::GetPossibleKernelWorkGroups(
  }
}

Winograd4x4To36 CreateWinograd4x4To36(const DeviceInfo& device_info,
Winograd4x4To36 CreateWinograd4x4To36(const GpuInfo& gpu_info,
                                      const OperationDef& definition,
                                      const Padding2D& padding) {
  Winograd4x4To36 result(definition, padding, device_info);
  Winograd4x4To36 result(definition, padding, gpu_info);
  result.UploadBt();
  return result;
}

Winograd36To4x4::Winograd36To4x4(const OperationDef& definition,
                                 const DeviceInfo& device_info)
                                 const GpuInfo& gpu_info)
    : GPUOperation(definition) {
  work_group_size_ = int3(32, 1, 1);
  if (definition_.precision == CalculationsPrecision::F16 &&
      device_info.IsPowerVR()) {
      gpu_info.IsPowerVR()) {
    compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
  }
  code_ = GetWinograd36To4x4Code(definition_);
@ -478,11 +478,11 @@ int3 Winograd36To4x4::GetGridSize() const {
}

void Winograd36To4x4::GetPossibleKernelWorkGroups(
    TuningType tuning_type, const DeviceInfo& device_info,
    TuningType tuning_type, const GpuInfo& gpu_info,
    const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
  switch (tuning_type) {
    case TuningType::EXHAUSTIVE:
      GetPossibleWorkGroups(tuning_type, device_info, kernel_info, grid_size_,
      GetPossibleWorkGroups(tuning_type, gpu_info, kernel_info, grid_size_,
                            work_groups);
      return;
    case TuningType::FAST:
@ -493,9 +493,9 @@ void Winograd36To4x4::GetPossibleKernelWorkGroups(
}

Winograd36To4x4 CreateWinograd36To4x4(
    const DeviceInfo& device_info, const OperationDef& definition,
    const GpuInfo& gpu_info, const OperationDef& definition,
    const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases) {
  Winograd36To4x4 result(definition, device_info);
  Winograd36To4x4 result(definition, gpu_info);
  TensorLinearDescriptor desc;
  desc.storage_type = LinearStorageType::TEXTURE_2D;
  desc.element_type = definition.GetDataType();

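The Winograd constructors above derive compiler options from GpuInfo: Winograd4x4To36 adds the Adreno "more waves" hint, and both transforms add the PowerVR FP16 option when the precision is F16. A minimal sketch of that selection; the Option enum is a simplified stand-in for the real CompilerOptions values:

#include <cstdio>
#include <vector>

enum class Option { kAdrenoMoreWaves, kPowerVRFp16 };

// Sketch of the option accumulation in the constructors above.
std::vector<Option> PickWinogradCompilerOptions(bool is_adreno, bool is_powervr,
                                                bool precision_is_f16) {
  std::vector<Option> options;
  if (is_adreno) options.push_back(Option::kAdrenoMoreWaves);
  if (precision_is_f16 && is_powervr) options.push_back(Option::kPowerVRFp16);
  return options;
}

int main() {
  const auto opts = PickWinogradCompilerOptions(/*is_adreno=*/false,
                                                /*is_powervr=*/true,
                                                /*precision_is_f16=*/true);
  std::printf("%zu option(s) selected\n", opts.size());  // 1
  return 0;
}
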
@ -34,11 +34,11 @@ class Winograd4x4To36 : public GPUOperation {
 public:
  Winograd4x4To36() = default;
  Winograd4x4To36(const OperationDef& definition, const Padding2D& padding,
                  const DeviceInfo& device_info);
                  const GpuInfo& gpu_info);
  absl::Status BindArguments(ArgumentsBinder* args) override;
  int3 GetGridSize() const override;
  void GetPossibleKernelWorkGroups(
      TuningType tuning_type, const DeviceInfo& device_info,
      TuningType tuning_type, const GpuInfo& gpu_info,
      const KernelInfo& kernel_info,
      std::vector<int3>* work_groups) const override;

@ -49,7 +49,7 @@ class Winograd4x4To36 : public GPUOperation {
  Winograd4x4To36& operator=(const Winograd4x4To36&) = delete;

 private:
  friend Winograd4x4To36 CreateWinograd4x4To36(const DeviceInfo& device_info,
  friend Winograd4x4To36 CreateWinograd4x4To36(const GpuInfo& gpu_info,
                                               const OperationDef& definition,
                                               const Padding2D& padding);

@ -63,19 +63,18 @@ class Winograd4x4To36 : public GPUOperation {
  Padding2D padding_;
};

Winograd4x4To36 CreateWinograd4x4To36(const DeviceInfo& device_info,
Winograd4x4To36 CreateWinograd4x4To36(const GpuInfo& gpu_info,
                                      const OperationDef& definition,
                                      const Padding2D& padding);

class Winograd36To4x4 : public GPUOperation {
 public:
  Winograd36To4x4() = default;
  Winograd36To4x4(const OperationDef& definition,
                  const DeviceInfo& device_info);
  Winograd36To4x4(const OperationDef& definition, const GpuInfo& gpu_info);
  absl::Status BindArguments(ArgumentsBinder* args) override;
  int3 GetGridSize() const override;
  void GetPossibleKernelWorkGroups(
      TuningType tuning_type, const DeviceInfo& device_info,
      TuningType tuning_type, const GpuInfo& gpu_info,
      const KernelInfo& kernel_info,
      std::vector<int3>* work_groups) const override;

@ -87,7 +86,7 @@ class Winograd36To4x4 : public GPUOperation {

 private:
  friend Winograd36To4x4 CreateWinograd36To4x4(
      const DeviceInfo& device_info, const OperationDef& definition,
      const GpuInfo& gpu_info, const OperationDef& definition,
      const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases);

  void UploadAt();
@ -99,7 +98,7 @@ class Winograd36To4x4 : public GPUOperation {
};

Winograd36To4x4 CreateWinograd36To4x4(
    const DeviceInfo& device_info, const OperationDef& definition,
    const GpuInfo& gpu_info, const OperationDef& definition,
    const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases);

}  // namespace cl

@ -92,7 +92,7 @@ TEST_F(OpenCLOperationTest, Winograd4x4To36) {
      padding.prepended = HW(1, 1);
      padding.appended = HW(1, 1);
      Winograd4x4To36 wino_up = CreateWinograd4x4To36(
          creation_context_.GetDeviceInfo(), op_def, padding);
          creation_context_.GetGpuInfo(), op_def, padding);
      ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &wino_up,
                                    BHWC(1, 36, 1, 1), &dst_tensor));
      EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), dst_ref.data));
@ -159,8 +159,8 @@ TEST_F(OpenCLOperationTest, Winograd36To4x4) {
      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
      TensorFloat32 dst_tensor;
      Winograd36To4x4 wino_down = CreateWinograd36To4x4(
          creation_context_.GetDeviceInfo(), op_def, biases);
      Winograd36To4x4 wino_down =
          CreateWinograd36To4x4(creation_context_.GetGpuInfo(), op_def, biases);
      ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &wino_down,
                                    BHWC(1, 4, 4, 1), &dst_tensor));
      EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), dst_ref.data));

@ -35,7 +35,7 @@ std::vector<int2> Get2DWorkgroupsEqualTo128() {

std::vector<int3> GenerateWorkGroupSizesXYMultipleOf(
    int multiplier, int3 grid, const KernelInfo& kernel_info,
    const DeviceInfo& device_info, WorkGroupSizeAlignment z_alignment) {
    const GpuInfo& gpu_info, WorkGroupSizeAlignment z_alignment) {
  std::vector<int3> work_groups;
  work_groups.reserve(32);

@ -52,9 +52,9 @@ std::vector<int3> GenerateWorkGroupSizesXYMultipleOf(
      if (work_group_size_xy * z > kernel_info.max_work_group_size) {
        continue;
      }
      if (x <= device_info.max_work_group_size_x &&
          y <= device_info.max_work_group_size_y &&
          z <= device_info.max_work_group_size_z) {
      if (x <= gpu_info.max_work_group_size_x &&
          y <= gpu_info.max_work_group_size_y &&
          z <= gpu_info.max_work_group_size_z) {
        work_groups.push_back({x, y, z});
      }
    }
@ -65,7 +65,7 @@ std::vector<int3> GenerateWorkGroupSizesXYMultipleOf(

std::vector<int3> GenerateWorkGroupSizesXMultipleOf(
    int multiplier, int3 grid, const KernelInfo& kernel_info,
    const DeviceInfo& device_info, WorkGroupSizeAlignment z_alignment) {
    const GpuInfo& gpu_info, WorkGroupSizeAlignment z_alignment) {
  std::vector<int3> work_groups;
  work_groups.reserve(32);

@ -78,9 +78,9 @@ std::vector<int3> GenerateWorkGroupSizesXMultipleOf(
       x += multiplier) {
    for (auto y : possible_y_sizes) {
      for (auto z : possible_z_sizes) {
        if (x <= device_info.max_work_group_size_x &&
            y <= device_info.max_work_group_size_y &&
            z <= device_info.max_work_group_size_z &&
        if (x <= gpu_info.max_work_group_size_x &&
            y <= gpu_info.max_work_group_size_y &&
            z <= gpu_info.max_work_group_size_z &&
            x * y * z <= kernel_info.max_work_group_size) {
          work_groups.push_back({x, y, z});
        }
@ -90,13 +90,13 @@ std::vector<int3> GenerateWorkGroupSizesXMultipleOf(
  return work_groups;
}

void GetWorkGroupsAlignedToGrid(const DeviceInfo& device_info,
void GetWorkGroupsAlignedToGrid(const GpuInfo& gpu_info,
                                const KernelInfo& kernel_info, const int3& grid,
                                std::vector<int3>* work_groups) {
  int3 max_wg_size;
  max_wg_size.x = device_info.max_work_group_size_x;
  max_wg_size.y = device_info.max_work_group_size_y;
  max_wg_size.z = device_info.max_work_group_size_z;
  max_wg_size.x = gpu_info.max_work_group_size_x;
  max_wg_size.y = gpu_info.max_work_group_size_y;
  max_wg_size.z = gpu_info.max_work_group_size_z;
  GenerateWorkGroupSizesAlignedToGrid(
      grid, max_wg_size, kernel_info.max_work_group_size, work_groups);
}
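All of the generators above keep a candidate work group only if each dimension fits the device limit from GpuInfo and the total size fits the kernel's maximum work-group size. A standalone sketch of that filter; the Limits struct is a simplified stand-in for the GpuInfo/KernelInfo fields used here:

#include <cstdio>
#include <vector>

struct Int3 { int x, y, z; };

// Simplified stand-in for gpu_info.max_work_group_size_{x,y,z} and
// kernel_info.max_work_group_size.
struct Limits {
  int max_x, max_y, max_z;
  int max_total;
};

std::vector<Int3> FilterCandidates(const std::vector<Int3>& candidates,
                                   const Limits& limits) {
  std::vector<Int3> result;
  for (const Int3& wg : candidates) {
    if (wg.x <= limits.max_x && wg.y <= limits.max_y && wg.z <= limits.max_z &&
        wg.x * wg.y * wg.z <= limits.max_total) {
      result.push_back(wg);
    }
  }
  return result;
}

int main() {
  const Limits limits{256, 256, 64, 256};
  const auto kept =
      FilterCandidates({{8, 8, 1}, {32, 16, 1}, {16, 8, 2}}, limits);
  std::printf("kept %zu candidates\n", kept.size());  // 8x8x1 and 16x8x2 pass
  return 0;
}
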
@ -214,24 +214,22 @@ int3 GetWorkGroupConv(const int3& grid, int max_size, int max_z_size) {
  return int3(wg_x, wg_y, wg_z);
}

void GetPossibleWorkGroupsXYMultipleOf(int multiplier,
                                       const DeviceInfo& device_info,
void GetPossibleWorkGroupsXYMultipleOf(int multiplier, const GpuInfo& gpu_info,
                                       const KernelInfo& kernel_info,
                                       const int3& grid,
                                       WorkGroupSizeAlignment z_alignment,
                                       std::vector<int3>* work_groups) {
  *work_groups = GenerateWorkGroupSizesXYMultipleOf(
      multiplier, grid, kernel_info, device_info, z_alignment);
      multiplier, grid, kernel_info, gpu_info, z_alignment);
}

void GetPossibleWorkGroupsXMultipleOf(int multiplier,
                                      const DeviceInfo& device_info,
void GetPossibleWorkGroupsXMultipleOf(int multiplier, const GpuInfo& gpu_info,
                                      const KernelInfo& kernel_info,
                                      const int3& grid,
                                      WorkGroupSizeAlignment z_alignment,
                                      std::vector<int3>* work_groups) {
  *work_groups = GenerateWorkGroupSizesXMultipleOf(
      multiplier, grid, kernel_info, device_info, z_alignment);
      multiplier, grid, kernel_info, gpu_info, z_alignment);
}

bool XY128RequiresMoreWorkGroupsThenXY128Linear(int width, int height) {
@ -250,8 +248,7 @@ bool XY128RequiresMoreWorkGroupsThenXY128Linear(int width, int height) {
  return !have_equal_work_groups;
}

void GetPossibleWorkGroups(TuningType tuning_type,
                           const DeviceInfo& device_info,
void GetPossibleWorkGroups(TuningType tuning_type, const GpuInfo& gpu_info,
                           const KernelInfo& kernel_info, const int3& grid,
                           std::vector<int3>* work_groups) {
  switch (tuning_type) {
@ -260,7 +257,7 @@ void GetPossibleWorkGroups(TuningType tuning_type,
          GetWorkGroup(grid, kernel_info.max_work_group_size));
      return;
    case TuningType::EXHAUSTIVE: {
      GetWorkGroupsAlignedToGrid(device_info, kernel_info, grid, work_groups);
      GetWorkGroupsAlignedToGrid(gpu_info, kernel_info, grid, work_groups);
      return;
    }
    default:
@ -269,23 +266,22 @@ void GetPossibleWorkGroups(TuningType tuning_type,
  }
}

void GetPossibleWorkGroupsConv(TuningType tuning_type,
                               const DeviceInfo& device_info,
void GetPossibleWorkGroupsConv(TuningType tuning_type, const GpuInfo& gpu_info,
                               const KernelInfo& kernel_info, const int3& grid,
                               std::vector<int3>* work_groups) {
  switch (tuning_type) {
    case TuningType::FAST: {
      int max_z_size = 16;
      if (device_info.IsAdreno()) {
        max_z_size = device_info.adreno_info.IsAdreno3xx() ? 16 : 64;
      if (gpu_info.IsAdreno()) {
        max_z_size = gpu_info.adreno_info.IsAdreno3xx() ? 16 : 64;
      }
      max_z_size = std::min(max_z_size, device_info.max_work_group_size_z);
      max_z_size = std::min(max_z_size, gpu_info.max_work_group_size_z);
      work_groups->push_back(
          GetWorkGroupConv(grid, kernel_info.max_work_group_size, max_z_size));
      return;
    }
    case TuningType::EXHAUSTIVE: {
      GetWorkGroupsAlignedToGrid(device_info, kernel_info, grid, work_groups);
      GetWorkGroupsAlignedToGrid(gpu_info, kernel_info, grid, work_groups);
      return;
    }
    default:

@ -29,15 +29,13 @@ namespace gpu {
namespace cl {

// multiplier can be power of two only
void GetPossibleWorkGroupsXYMultipleOf(int multiplier,
                                       const DeviceInfo& device_info,
void GetPossibleWorkGroupsXYMultipleOf(int multiplier, const GpuInfo& gpu_info,
                                       const KernelInfo& kernel_info,
                                       const int3& grid,
                                       WorkGroupSizeAlignment z_alignment,
                                       std::vector<int3>* work_groups);

void GetPossibleWorkGroupsXMultipleOf(int multiplier,
                                      const DeviceInfo& device_info,
void GetPossibleWorkGroupsXMultipleOf(int multiplier, const GpuInfo& gpu_info,
                                      const KernelInfo& kernel_info,
                                      const int3& grid,
                                      WorkGroupSizeAlignment z_alignment,
@ -50,13 +48,11 @@ int3 GetWorkGroupXY128Conv(const int3& grid);

bool XY128RequiresMoreWorkGroupsThenXY128Linear(int width, int height);

void GetPossibleWorkGroups(TuningType tuning_type,
                           const DeviceInfo& device_info,
void GetPossibleWorkGroups(TuningType tuning_type, const GpuInfo& gpu_info,
                           const KernelInfo& kernel_info, const int3& grid,
                           std::vector<int3>* work_groups);

void GetPossibleWorkGroupsConv(TuningType tuning_type,
                               const DeviceInfo& device_info,
void GetPossibleWorkGroupsConv(TuningType tuning_type, const GpuInfo& gpu_info,
                               const KernelInfo& kernel_info, const int3& grid,
                               std::vector<int3>* work_groups);

@ -31,98 +31,98 @@ namespace {

std::unique_ptr<GPUOperation> SelectConvolutionAdreno(
    const Convolution2DAttributes& attr, const BHWC& dst_shape,
    const DeviceInfo& device_info, const OperationDef& op_def,
    const GpuInfo& gpu_info, const OperationDef& op_def,
    ModelHints hints) {
  if (IsConvConstantsSupported(device_info, op_def, attr)) {
    GPUOperation conv = CreateConvConstants(device_info, op_def, attr);
  if (IsConvConstantsSupported(gpu_info, op_def, attr)) {
    GPUOperation conv = CreateConvConstants(gpu_info, op_def, attr);
    return absl::make_unique<GPUOperation>(std::move(conv));
  } else {
    ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
    ConvPowerVR conv = CreateConvPowerVR(gpu_info, op_def, attr, &dst_shape);
    return absl::make_unique<ConvPowerVR>(std::move(conv));
  }
}

std::unique_ptr<GPUOperation> SelectConvolutionWinogradAdreno(
    const Convolution2DAttributes& attr, const BHWC& dst_shape,
    const DeviceInfo& device_info, const OperationDef& op_def,
    const GpuInfo& gpu_info, const OperationDef& op_def,
    ModelHints hints) {
  ConvPowerVR conv =
      CreateConvPowerVRWino4x4To6x6(device_info, op_def, attr, &dst_shape);
      CreateConvPowerVRWino4x4To6x6(gpu_info, op_def, attr, &dst_shape);
  return absl::make_unique<ConvPowerVR>(std::move(conv));
}

std::unique_ptr<GPUOperation> SelectConvolutionDynamicWeightsAdreno(
    const Convolution2DAttributes& attr, const BHWC& weights_shape,
    const BHWC& dst_shape, const DeviceInfo& device_info,
    const BHWC& dst_shape, const GpuInfo& gpu_info,
    const OperationDef& op_def, ModelHints hints,
    ConvWeightsDescription* weights_desc) {
  ConvPowerVR conv = CreateConvPowerVRDynamicWeights(
      device_info, op_def, attr, weights_shape, &dst_shape);
      gpu_info, op_def, attr, weights_shape, &dst_shape);
  *weights_desc = conv.GetConvWeightsDescription();
  return absl::make_unique<ConvPowerVR>(std::move(conv));
}

std::unique_ptr<GPUOperation> SelectConvolutionNVidia(
    const Convolution2DAttributes& attr, const BHWC& dst_shape,
    const DeviceInfo& device_info, const OperationDef& op_def) {
  if (IsConvConstantsSupported(device_info, op_def, attr)) {
    GPUOperation conv = CreateConvConstants(device_info, op_def, attr);
    const GpuInfo& gpu_info, const OperationDef& op_def) {
  if (IsConvConstantsSupported(gpu_info, op_def, attr)) {
    GPUOperation conv = CreateConvConstants(gpu_info, op_def, attr);
    return absl::make_unique<GPUOperation>(std::move(conv));
  } else {
    ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
    ConvPowerVR conv = CreateConvPowerVR(gpu_info, op_def, attr, &dst_shape);
    return absl::make_unique<ConvPowerVR>(std::move(conv));
  }
}

std::unique_ptr<GPUOperation> SelectConvolutionPowerVR(
    const Convolution2DAttributes& attr, const DeviceInfo& device_info,
    const Convolution2DAttributes& attr, const GpuInfo& gpu_info,
    const OperationDef& op_def) {
  ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr);
  ConvPowerVR conv = CreateConvPowerVR(gpu_info, op_def, attr);
  return absl::make_unique<ConvPowerVR>(std::move(conv));
}

std::unique_ptr<GPUOperation> SelectConvolutionMali(
    const Convolution2DAttributes& attr, const BHWC& dst_shape,
    const DeviceInfo& device_info, const OperationDef& op_def) {
    const GpuInfo& gpu_info, const OperationDef& op_def) {
  if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER &&
      IsConvBuffer1x1Supported(op_def, attr)) {
    ConvBuffer1x1 conv =
        CreateConvBuffer1x1(device_info, op_def, attr, &dst_shape);
        CreateConvBuffer1x1(gpu_info, op_def, attr, &dst_shape);
    return absl::make_unique<ConvBuffer1x1>(std::move(conv));
  } else {
    ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
    ConvPowerVR conv = CreateConvPowerVR(gpu_info, op_def, attr, &dst_shape);
    return absl::make_unique<ConvPowerVR>(std::move(conv));
  }
}

std::unique_ptr<GPUOperation> SelectConvolutionWinogradMali(
    const Convolution2DAttributes& attr, const BHWC& dst_shape,
    const DeviceInfo& device_info, const OperationDef& op_def) {
    const GpuInfo& gpu_info, const OperationDef& op_def) {
  if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER) {
    ConvBuffer1x1 conv =
        CreateConvBuffer1x1Wino4x4To6x6(device_info, op_def, attr, &dst_shape);
        CreateConvBuffer1x1Wino4x4To6x6(gpu_info, op_def, attr, &dst_shape);
    return absl::make_unique<ConvBuffer1x1>(std::move(conv));
  } else {
    ConvPowerVR conv =
        CreateConvPowerVRWino4x4To6x6(device_info, op_def, attr, &dst_shape);
        CreateConvPowerVRWino4x4To6x6(gpu_info, op_def, attr, &dst_shape);
    return absl::make_unique<ConvPowerVR>(std::move(conv));
  }
}

std::unique_ptr<GPUOperation> SelectConvolutionDynamicWeightsMali(
    const Convolution2DAttributes& attr, const BHWC& weights_shape,
    const BHWC& dst_shape, const DeviceInfo& device_info,
    const BHWC& dst_shape, const GpuInfo& gpu_info,
    const OperationDef& op_def, ModelHints hints,
    ConvWeightsDescription* weights_desc) {
  if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER &&
      IsConvBuffer1x1Supported(op_def, weights_shape, attr)) {
    ConvBuffer1x1 conv = CreateConvBuffer1x1DynamicWeights(
        device_info, op_def, attr, weights_shape, &dst_shape);
        gpu_info, op_def, attr, weights_shape, &dst_shape);
    *weights_desc = conv.GetConvWeightsDescription();
    return absl::make_unique<ConvBuffer1x1>(std::move(conv));
  } else {
    ConvPowerVR conv = CreateConvPowerVRDynamicWeights(
        device_info, op_def, attr, weights_shape, &dst_shape);
        gpu_info, op_def, attr, weights_shape, &dst_shape);
    *weights_desc = conv.GetConvWeightsDescription();
    return absl::make_unique<ConvPowerVR>(std::move(conv));
  }
@ -132,58 +132,58 @@ std::unique_ptr<GPUOperation> SelectConvolutionDynamicWeightsMali(

std::unique_ptr<GPUOperation> SelectConvolution(
    const Convolution2DAttributes& attr, const BHWC& dst_shape,
    const DeviceInfo& device_info, const OperationDef& op_def,
    const GpuInfo& gpu_info, const OperationDef& op_def,
    ModelHints hints) {
  if (device_info.IsAdreno()) {
    return SelectConvolutionAdreno(attr, dst_shape, device_info, op_def, hints);
  } else if (device_info.IsPowerVR() || device_info.IsAMD() ||
             device_info.IsIntel()) {
    return SelectConvolutionPowerVR(attr, device_info, op_def);
  } else if (device_info.IsNvidia()) {
    return SelectConvolutionNVidia(attr, dst_shape, device_info, op_def);
  } else if (device_info.IsMali()) {
    return SelectConvolutionMali(attr, dst_shape, device_info, op_def);
  if (gpu_info.IsAdreno()) {
    return SelectConvolutionAdreno(attr, dst_shape, gpu_info, op_def, hints);
  } else if (gpu_info.IsPowerVR() || gpu_info.IsAMD() ||
             gpu_info.IsIntel()) {
    return SelectConvolutionPowerVR(attr, gpu_info, op_def);
  } else if (gpu_info.IsNvidia()) {
    return SelectConvolutionNVidia(attr, dst_shape, gpu_info, op_def);
  } else if (gpu_info.IsMali()) {
    return SelectConvolutionMali(attr, dst_shape, gpu_info, op_def);
  } else {
    return SelectConvolutionAdreno(attr, dst_shape, device_info, op_def, hints);
    return SelectConvolutionAdreno(attr, dst_shape, gpu_info, op_def, hints);
  }
}

std::unique_ptr<GPUOperation> SelectConvolutionForWinograd(
    const Convolution2DAttributes& attr, const BHWC& dst_shape,
    const DeviceInfo& device_info, const OperationDef& op_def,
    const GpuInfo& gpu_info, const OperationDef& op_def,
    ModelHints hints) {
  if (device_info.IsAdreno()) {
    return SelectConvolutionWinogradAdreno(attr, dst_shape, device_info, op_def,
  if (gpu_info.IsAdreno()) {
    return SelectConvolutionWinogradAdreno(attr, dst_shape, gpu_info, op_def,
                                           hints);
  } else if (device_info.IsPowerVR() || device_info.IsAMD() ||
             device_info.IsNvidia() || device_info.IsIntel()) {
  } else if (gpu_info.IsPowerVR() || gpu_info.IsAMD() ||
             gpu_info.IsNvidia() || gpu_info.IsIntel()) {
    ConvPowerVR conv =
        CreateConvPowerVRWino4x4To6x6(device_info, op_def, attr, &dst_shape);
        CreateConvPowerVRWino4x4To6x6(gpu_info, op_def, attr, &dst_shape);
    return absl::make_unique<ConvPowerVR>(std::move(conv));
  } else if (device_info.IsMali()) {
    return SelectConvolutionWinogradMali(attr, dst_shape, device_info, op_def);
  } else if (gpu_info.IsMali()) {
    return SelectConvolutionWinogradMali(attr, dst_shape, gpu_info, op_def);
  } else {
    return SelectConvolutionWinogradAdreno(attr, dst_shape, device_info, op_def,
    return SelectConvolutionWinogradAdreno(attr, dst_shape, gpu_info, op_def,
                                           hints);
  }
}

std::unique_ptr<GPUOperation> SelectConvolutionWithDynamicWeights(
    const Convolution2DAttributes& attr, const BHWC& weights_shape,
    const BHWC& dst_shape, const DeviceInfo& device_info,
    const BHWC& dst_shape, const GpuInfo& gpu_info,
    const OperationDef& op_def, ModelHints hints,
    ConvWeightsDescription* weights_desc) {
  if (device_info.IsAdreno()) {
  if (gpu_info.IsAdreno()) {
    return SelectConvolutionDynamicWeightsAdreno(attr, weights_shape, dst_shape,
                                                 device_info, op_def, hints,
                                                 gpu_info, op_def, hints,
                                                 weights_desc);
  } else if (device_info.IsMali()) {
  } else if (gpu_info.IsMali()) {
    return SelectConvolutionDynamicWeightsMali(attr, weights_shape, dst_shape,
                                               device_info, op_def, hints,
                                               gpu_info, op_def, hints,
                                               weights_desc);
  } else {
    ConvPowerVR conv = CreateConvPowerVRDynamicWeights(
        device_info, op_def, attr, weights_shape, &dst_shape);
        gpu_info, op_def, attr, weights_shape, &dst_shape);
    *weights_desc = conv.GetConvWeightsDescription();
    return absl::make_unique<ConvPowerVR>(std::move(conv));
  }

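SelectConvolution and SelectConvolutionForWinograd above are plain vendor dispatches over GpuInfo. The sketch below returns a label naming the branch a given vendor would take instead of constructing kernels; it exists purely to make the dispatch order explicit and is not part of the real selector API:

#include <cstdio>
#include <string>

// Simplified stand-in for the vendor predicates on GpuInfo.
struct GpuVendor {
  bool adreno = false, powervr = false, amd = false, intel = false,
       nvidia = false, mali = false;
};

std::string ConvolutionBranch(const GpuVendor& v) {
  if (v.adreno) return "Adreno path (ConvConstants or ConvPowerVR)";
  if (v.powervr || v.amd || v.intel) return "PowerVR path (ConvPowerVR)";
  if (v.nvidia) return "NVidia path (ConvConstants or ConvPowerVR)";
  if (v.mali) return "Mali path (ConvBuffer1x1 or ConvPowerVR)";
  return "default: Adreno path";
}

int main() {
  GpuVendor v;
  v.mali = true;
  std::printf("%s\n", ConvolutionBranch(v).c_str());
  return 0;
}
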
@ -31,19 +31,16 @@ namespace cl {

std::unique_ptr<GPUOperation> SelectConvolution(
    const Convolution2DAttributes& attr, const BHWC& dst_shape,
    const DeviceInfo& device_info, const OperationDef& op_def,
    ModelHints hints);
    const GpuInfo& gpu_info, const OperationDef& op_def, ModelHints hints);

std::unique_ptr<GPUOperation> SelectConvolutionForWinograd(
    const Convolution2DAttributes& attr, const BHWC& dst_shape,
    const DeviceInfo& device_info, const OperationDef& op_def,
    ModelHints hints);
    const GpuInfo& gpu_info, const OperationDef& op_def, ModelHints hints);

std::unique_ptr<GPUOperation> SelectConvolutionWithDynamicWeights(
    const Convolution2DAttributes& attr, const BHWC& weights_shape,
    const BHWC& dst_shape, const DeviceInfo& device_info,
    const OperationDef& op_def, ModelHints hints,
    ConvWeightsDescription* weights_desc);
    const BHWC& dst_shape, const GpuInfo& gpu_info, const OperationDef& op_def,
    ModelHints hints, ConvWeightsDescription* weights_desc);

std::unique_ptr<GPUOperation> SelectConverterToConvWeights(
    const ConvWeightsDescription& weights_desc, const OperationDef& op_def,

@ -29,70 +29,70 @@ namespace cl {
|
||||
namespace {
|
||||
|
||||
std::unique_ptr<GPUOperation> SelectConvolutionTransposedAdreno(
|
||||
const ConvolutionTransposedAttributes& attr, const DeviceInfo& device_info,
|
||||
const ConvolutionTransposedAttributes& attr, const GpuInfo& gpu_info,
|
||||
const OperationDef& op_def) {
|
||||
if (IsConvolutionTransposedThinSupported(attr)) {
|
||||
ConvolutionTransposedThin conv =
|
||||
CreateConvolutionTransposedThin(device_info, op_def, attr);
|
||||
CreateConvolutionTransposedThin(gpu_info, op_def, attr);
|
||||
return absl::make_unique<ConvolutionTransposedThin>(std::move(conv));
|
||||
} else if (IsConvolutionTransposed3x3ThinSupported(attr)) {
|
||||
ConvolutionTransposed3x3Thin conv =
|
||||
CreateConvolutionTransposed3x3Thin(device_info, op_def, attr);
|
||||
CreateConvolutionTransposed3x3Thin(gpu_info, op_def, attr);
|
||||
return absl::make_unique<ConvolutionTransposed3x3Thin>(std::move(conv));
} else {
ConvolutionTransposed conv =
CreateConvolutionTransposed(device_info, op_def, attr);
CreateConvolutionTransposed(gpu_info, op_def, attr);
return absl::make_unique<ConvolutionTransposed>(std::move(conv));
}
}

std::unique_ptr<GPUOperation> SelectConvolutionTransposedPowerVR(
const ConvolutionTransposedAttributes& attr, const DeviceInfo& device_info,
const ConvolutionTransposedAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def) {
if (IsConvolutionTransposedThinSupported(attr)) {
ConvolutionTransposedThin conv =
CreateConvolutionTransposedThin(device_info, op_def, attr);
CreateConvolutionTransposedThin(gpu_info, op_def, attr);
return absl::make_unique<ConvolutionTransposedThin>(std::move(conv));
} else if (IsConvolutionTransposed3x3ThinSupported(attr)) {
ConvolutionTransposed3x3Thin conv =
CreateConvolutionTransposed3x3Thin(device_info, op_def, attr);
CreateConvolutionTransposed3x3Thin(gpu_info, op_def, attr);
return absl::make_unique<ConvolutionTransposed3x3Thin>(std::move(conv));
} else if (IsConvolutionTransposed3x3Supported(op_def, attr)) {
ConvolutionTransposed3x3 conv =
CreateConvolutionTransposed3x3(device_info, op_def, attr);
CreateConvolutionTransposed3x3(gpu_info, op_def, attr);
return absl::make_unique<ConvolutionTransposed3x3>(std::move(conv));
} else if (IsConvolutionTransposed4x4Supported(op_def, attr)) {
ConvolutionTransposed4x4 conv =
CreateConvolutionTransposed4x4(device_info, op_def, attr);
CreateConvolutionTransposed4x4(gpu_info, op_def, attr);
return absl::make_unique<ConvolutionTransposed4x4>(std::move(conv));
} else {
ConvolutionTransposed conv =
CreateConvolutionTransposed(device_info, op_def, attr);
CreateConvolutionTransposed(gpu_info, op_def, attr);
return absl::make_unique<ConvolutionTransposed>(std::move(conv));
}
}

std::unique_ptr<GPUOperation> SelectConvolutionTransposedMali(
const ConvolutionTransposedAttributes& attr, const DeviceInfo& device_info,
const ConvolutionTransposedAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def) {
ConvolutionTransposed conv =
CreateConvolutionTransposed(device_info, op_def, attr);
CreateConvolutionTransposed(gpu_info, op_def, attr);
return absl::make_unique<ConvolutionTransposed>(std::move(conv));
}
} // namespace

std::unique_ptr<GPUOperation> SelectConvolutionTransposed(
const ConvolutionTransposedAttributes& attr, const DeviceInfo& device_info,
const ConvolutionTransposedAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def) {
if (device_info.IsAdreno()) {
return SelectConvolutionTransposedAdreno(attr, device_info, op_def);
} else if (device_info.IsPowerVR() || device_info.IsAMD() ||
device_info.IsNvidia() || device_info.IsIntel()) {
return SelectConvolutionTransposedPowerVR(attr, device_info, op_def);
} else if (device_info.IsMali()) {
return SelectConvolutionTransposedMali(attr, device_info, op_def);
if (gpu_info.IsAdreno()) {
return SelectConvolutionTransposedAdreno(attr, gpu_info, op_def);
} else if (gpu_info.IsPowerVR() || gpu_info.IsAMD() ||
gpu_info.IsNvidia() || gpu_info.IsIntel()) {
return SelectConvolutionTransposedPowerVR(attr, gpu_info, op_def);
} else if (gpu_info.IsMali()) {
return SelectConvolutionTransposedMali(attr, gpu_info, op_def);
} else {
return SelectConvolutionTransposedAdreno(attr, device_info, op_def);
return SelectConvolutionTransposedAdreno(attr, gpu_info, op_def);
}
}

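The dispatcher above keys purely off the vendor reported by GpuInfo, with the Adreno path doubling as the fallback. A minimal caller-side sketch, assuming the selector header from this change is available at the usual path (the signature matches the declaration below; everything else here is illustrative only):

#include <memory>
#include "tensorflow/lite/delegates/gpu/cl/selectors/convolution_transposed_selector.h"

namespace tflite {
namespace gpu {
namespace cl {

std::unique_ptr<GPUOperation> PickTransposedConv(
    const ConvolutionTransposedAttributes& attr, const GpuInfo& gpu_info,
    const OperationDef& op_def) {
  // Vendor-specific kernels (Adreno/PowerVR/Mali/...) are chosen internally;
  // the caller only supplies the device description and operation definition.
  return SelectConvolutionTransposed(attr, gpu_info, op_def);
}

}  // namespace cl
}  // namespace gpu
}  // namespace tflite
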
@ -27,7 +27,7 @@ namespace gpu {
namespace cl {

std::unique_ptr<GPUOperation> SelectConvolutionTransposed(
const ConvolutionTransposedAttributes& attr, const DeviceInfo& device_info,
const ConvolutionTransposedAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def);

} // namespace cl

@ -27,9 +27,8 @@ namespace tflite {
namespace gpu {
namespace cl {

absl::Status SelectDefault(const DeviceInfo& device_info,
const OperationDef& op_def, ModelHints hints,
const std::vector<Value*>& inputs,
absl::Status SelectDefault(const GpuInfo& gpu_info, const OperationDef& op_def,
ModelHints hints, const std::vector<Value*>& inputs,
const std::vector<Value*>& outputs, const Node& node,
GPUOperationsSubgraph* gpu_subgraph) {
return absl::UnimplementedError(

@ -30,9 +30,8 @@ namespace tflite {
namespace gpu {
namespace cl {

absl::Status SelectDefault(const DeviceInfo& device_info,
const OperationDef& op_def, ModelHints hints,
const std::vector<Value*>& inputs,
absl::Status SelectDefault(const GpuInfo& gpu_info, const OperationDef& op_def,
ModelHints hints, const std::vector<Value*>& inputs,
const std::vector<Value*>& outputs, const Node& node,
GPUOperationsSubgraph* gpu_subgraph);

@ -26,58 +26,58 @@ namespace cl {
namespace {

std::unique_ptr<GPUOperation> SelectDWConvolutionAdreno(
const DepthwiseConvolution2DAttributes& attr, const DeviceInfo& device_info,
const DepthwiseConvolution2DAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def) {
if (IsDepthwiseConv3x3Supported(attr)) {
return absl::make_unique<DepthwiseConv3x3>(
CreateDepthwiseConv3x3(device_info, op_def, attr));
CreateDepthwiseConv3x3(gpu_info, op_def, attr));
} else {
return absl::make_unique<GPUOperation>(
CreateDepthwiseConvolution2D(device_info, op_def, attr));
CreateDepthwiseConvolution2D(gpu_info, op_def, attr));
}
}

std::unique_ptr<GPUOperation> SelectDWConvolutionPowerVR(
const DepthwiseConvolution2DAttributes& attr, const DeviceInfo& device_info,
const DepthwiseConvolution2DAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def) {
if (IsDepthwiseConv3x3Supported(attr)) {
return absl::make_unique<DepthwiseConv3x3>(
CreateDepthwiseConv3x3(device_info, op_def, attr));
CreateDepthwiseConv3x3(gpu_info, op_def, attr));
} else {
return absl::make_unique<GPUOperation>(
CreateDepthwiseConvolution2D(device_info, op_def, attr));
CreateDepthwiseConvolution2D(gpu_info, op_def, attr));
}
}

std::unique_ptr<GPUOperation> SelectDWConvolutionMali(
const DepthwiseConvolution2DAttributes& attr, const DeviceInfo& device_info,
const DepthwiseConvolution2DAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def) {
const auto storage_type = op_def.src_tensors[0].storage_type;
bool buffer_type = storage_type == TensorStorageType::BUFFER ||
storage_type == TensorStorageType::IMAGE_BUFFER;
const MaliInfo mali_info = device_info.mali_info;
const MaliInfo mali_info = gpu_info.mali_info;
if (IsDepthwiseConv3x3Supported(attr) && !mali_info.IsMidgard() &&
!buffer_type && op_def.precision != CalculationsPrecision::F32) {
return absl::make_unique<DepthwiseConv3x3>(
CreateDepthwiseConv3x3(device_info, op_def, attr));
CreateDepthwiseConv3x3(gpu_info, op_def, attr));
} else {
return absl::make_unique<GPUOperation>(
CreateDepthwiseConvolution2D(device_info, op_def, attr));
CreateDepthwiseConvolution2D(gpu_info, op_def, attr));
}
}
} // namespace

std::unique_ptr<GPUOperation> SelectDWConvolution(
const DepthwiseConvolution2DAttributes& attr, const DeviceInfo& device_info,
const DepthwiseConvolution2DAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def) {
if (device_info.IsAdreno()) {
return SelectDWConvolutionAdreno(attr, device_info, op_def);
} else if (device_info.IsPowerVR()) {
return SelectDWConvolutionPowerVR(attr, device_info, op_def);
} else if (device_info.IsMali()) {
return SelectDWConvolutionMali(attr, device_info, op_def);
if (gpu_info.IsAdreno()) {
return SelectDWConvolutionAdreno(attr, gpu_info, op_def);
} else if (gpu_info.IsPowerVR()) {
return SelectDWConvolutionPowerVR(attr, gpu_info, op_def);
} else if (gpu_info.IsMali()) {
return SelectDWConvolutionMali(attr, gpu_info, op_def);
} else {
return SelectDWConvolutionAdreno(attr, device_info, op_def);
return SelectDWConvolutionAdreno(attr, gpu_info, op_def);
}
}

@ -27,7 +27,7 @@ namespace gpu {
namespace cl {

std::unique_ptr<GPUOperation> SelectDWConvolution(
const DepthwiseConvolution2DAttributes& attr, const DeviceInfo& device_info,
const DepthwiseConvolution2DAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def);

} // namespace cl

@ -27,74 +27,74 @@ namespace gpu {
namespace cl {

std::unique_ptr<GPUOperation> SelectFullyConnectedGeneric(
const FullyConnectedAttributes& attr, const DeviceInfo& device_info,
const FullyConnectedAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def, int batch_size) {
if (op_def.IsBatchSupported()) {
BHWC dst_shape = BHWC(batch_size, 1, 1, attr.weights.shape.o);
ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
ConvPowerVR conv = CreateConvPowerVR(gpu_info, op_def, attr, &dst_shape);
return absl::make_unique<ConvPowerVR>(std::move(conv));
} else {
FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);
FullyConnected fc = CreateFullyConnected(gpu_info, op_def, attr);
return absl::make_unique<FullyConnected>(std::move(fc));
}
}

std::unique_ptr<GPUOperation> SelectFullyConnectedAdreno(
const FullyConnectedAttributes& attr, const DeviceInfo& device_info,
const FullyConnectedAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def, int batch_size) {
if (op_def.IsBatchSupported()) {
BHWC dst_shape = BHWC(batch_size, 1, 1, attr.weights.shape.o);
ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
ConvPowerVR conv = CreateConvPowerVR(gpu_info, op_def, attr, &dst_shape);
return absl::make_unique<ConvPowerVR>(std::move(conv));
} else {
FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);
FullyConnected fc = CreateFullyConnected(gpu_info, op_def, attr);
return absl::make_unique<FullyConnected>(std::move(fc));
}
}

std::unique_ptr<GPUOperation> SelectFullyConnectedPowerVR(
const FullyConnectedAttributes& attr, const DeviceInfo& device_info,
const FullyConnectedAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def, int batch_size) {
if (op_def.IsBatchSupported()) {
ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr);
ConvPowerVR conv = CreateConvPowerVR(gpu_info, op_def, attr);
return absl::make_unique<ConvPowerVR>(std::move(conv));
} else {
FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);
FullyConnected fc = CreateFullyConnected(gpu_info, op_def, attr);
return absl::make_unique<FullyConnected>(std::move(fc));
}
}

std::unique_ptr<GPUOperation> SelectFullyConnectedMali(
const FullyConnectedAttributes& attr, const DeviceInfo& device_info,
const FullyConnectedAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def, int batch_size) {
if (op_def.IsBatchSupported()) {
if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER) {
ConvBuffer1x1 conv = CreateConvBuffer1x1(device_info, op_def, attr);
ConvBuffer1x1 conv = CreateConvBuffer1x1(gpu_info, op_def, attr);
return absl::make_unique<ConvBuffer1x1>(std::move(conv));
} else {
BHWC dst_shape = BHWC(batch_size, 1, 1, attr.weights.shape.o);
ConvPowerVR conv =
CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
CreateConvPowerVR(gpu_info, op_def, attr, &dst_shape);
return absl::make_unique<ConvPowerVR>(std::move(conv));
}
} else {
FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);
FullyConnected fc = CreateFullyConnected(gpu_info, op_def, attr);
return absl::make_unique<FullyConnected>(std::move(fc));
}
}

std::unique_ptr<GPUOperation> SelectFullyConnected(
const FullyConnectedAttributes& attr, const DeviceInfo& device_info,
const FullyConnectedAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def, int batch_size) {
if (device_info.IsAdreno()) {
return SelectFullyConnectedAdreno(attr, device_info, op_def, batch_size);
} else if (device_info.IsPowerVR() || device_info.IsAMD() ||
device_info.IsNvidia() || device_info.IsIntel()) {
return SelectFullyConnectedPowerVR(attr, device_info, op_def, batch_size);
} else if (device_info.IsMali()) {
return SelectFullyConnectedMali(attr, device_info, op_def, batch_size);
if (gpu_info.IsAdreno()) {
return SelectFullyConnectedAdreno(attr, gpu_info, op_def, batch_size);
} else if (gpu_info.IsPowerVR() || gpu_info.IsAMD() ||
gpu_info.IsNvidia() || gpu_info.IsIntel()) {
return SelectFullyConnectedPowerVR(attr, gpu_info, op_def, batch_size);
} else if (gpu_info.IsMali()) {
return SelectFullyConnectedMali(attr, gpu_info, op_def, batch_size);
} else {
return SelectFullyConnectedGeneric(attr, device_info, op_def, batch_size);
return SelectFullyConnectedGeneric(attr, gpu_info, op_def, batch_size);
}
}

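In every batched branch above, the fully connected layer is effectively lowered to a 1x1 convolution over a batch x 1 x 1 x output_channels destination (ConvPowerVR or ConvBuffer1x1), and only the unbatched case keeps a dedicated FullyConnected kernel. A sketch of the destination shape this implies, using a hypothetical helper name (illustrative only; the shape expression is copied from the selectors above):

// Hypothetical helper: the destination shape used when a batched FC is
// planned as a 1x1 convolution.
BHWC FullyConnectedDstShape(int batch_size,
                            const FullyConnectedAttributes& attr) {
  return BHWC(batch_size, 1, 1, attr.weights.shape.o);
}
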
@ -27,7 +27,7 @@ namespace gpu {
namespace cl {

std::unique_ptr<GPUOperation> SelectFullyConnected(
const FullyConnectedAttributes& attr, const DeviceInfo& device_info,
const FullyConnectedAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def, int batch_size);

} // namespace cl

@ -41,7 +41,7 @@ namespace gpu {
namespace cl {
namespace {
bool IsSuitableForWinograd4x4To6x6(const Convolution2DAttributes& attr,
const DeviceInfo& device_info,
const GpuInfo& gpu_info,
const BHWC& dst_shape) {
const int tiles_x = DivideRoundUp(dst_shape.w, 4);
const int tiles_y = DivideRoundUp(dst_shape.h, 4);
@ -51,22 +51,22 @@ bool IsSuitableForWinograd4x4To6x6(const Convolution2DAttributes& attr,
attr.weights.shape.w == 3 && attr.weights.shape.h == 3 &&
attr.dilations == HW(1, 1) && attr.strides == HW(1, 1);
// Mali among other devices has smaller SIMD line size
const int min_depth = device_info.IsMali() ? 16 : 32;
const int min_hw = device_info.IsMali() ? 32 : 128;
const int min_depth = gpu_info.IsMali() ? 16 : 32;
const int min_hw = gpu_info.IsMali() ? 32 : 128;
const bool recommended_channels =
dst_depth % 4 == 0 && src_depth >= min_depth && dst_depth >= min_depth;
const bool recommended_hw = tiles_x * tiles_y >= min_hw;
return suitable_attributes && recommended_channels && recommended_hw;
}

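To make the thresholds concrete: for a 3x3, stride-1, dilation-1 convolution producing an 80x80 output with 128 input and 256 output channels on a non-Mali GPU, tiles_x = tiles_y = DivideRoundUp(80, 4) = 20, so tiles_x * tiles_y = 400 >= 128; assuming src_depth and dst_depth are the channel counts divided by four (32 and 64, computed in the lines this hunk skips), both clear the min_depth of 32 and dst_depth % 4 == 0, so the Winograd path is attempted. On Mali the same check uses the relaxed limits of 16 and 32.
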
absl::Status WinogradFromNode(const DeviceInfo& device_info,
absl::Status WinogradFromNode(const GpuInfo& gpu_info,
const std::vector<Value*>& inputs,
const std::vector<Value*>& outputs,
const OperationDef& op_def, ModelHints hints,
const BHWC& input_shape, const BHWC& output_shape,
const Convolution2DAttributes& attr,
GPUOperationsSubgraph* gpu_subgraph) {
if (!IsSuitableForWinograd4x4To6x6(attr, device_info, output_shape)) {
if (!IsSuitableForWinograd4x4To6x6(attr, gpu_info, output_shape)) {
return absl::UnimplementedError("No implementation for this case.");
}

@ -76,13 +76,13 @@ absl::Status WinogradFromNode(const DeviceInfo& device_info,
const BHWC shape_1{input_shape.b, 36, tiles_x * tiles_y, output_shape.c};
TensorDescriptor td_0;
td_0.storage_type = SelectBestStorageType(
device_info, shape_0, op_def.src_tensors[0].storage_type,
gpu_info, shape_0, op_def.src_tensors[0].storage_type,
op_def.src_tensors[0].data_type, op_def.src_tensors[0].layout);
td_0.data_type = op_def.src_tensors[0].data_type;
td_0.layout = op_def.src_tensors[0].layout;
TensorDescriptor td_1;
td_1.storage_type = SelectBestStorageType(
device_info, shape_1, op_def.src_tensors[0].storage_type,
gpu_info, shape_1, op_def.src_tensors[0].storage_type,
op_def.src_tensors[0].data_type, op_def.src_tensors[0].layout);
td_1.data_type = op_def.src_tensors[0].data_type;
td_1.layout = op_def.src_tensors[0].layout;
@ -96,7 +96,7 @@ absl::Status WinogradFromNode(const DeviceInfo& device_info,
winograd_up_def.dst_tensors.push_back(td_0);
auto& winograd_up = gpu_subgraph->operations[0];
winograd_up.operation =
SelectWinograd4x4To36(device_info, attr.padding, winograd_up_def);
SelectWinograd4x4To36(gpu_info, attr.padding, winograd_up_def);
winograd_up.input_ids = {static_cast<int>(inputs[0]->id)};
winograd_up.output_ids = {-1};

@ -107,7 +107,7 @@ absl::Status WinogradFromNode(const DeviceInfo& device_info,
auto& conv = gpu_subgraph->operations[1];
conv.input_ids = {-1};
conv.output_ids = {-2};
conv.operation = SelectConvolutionForWinograd(attr, input_shape, device_info,
conv.operation = SelectConvolutionForWinograd(attr, input_shape, gpu_info,
conv_def, hints);

OperationDef winograd_down_def;
@ -123,13 +123,13 @@ absl::Status WinogradFromNode(const DeviceInfo& device_info,
bias_copy.data.resize(attr.weights.shape.o);
}
winograd_down.operation =
SelectWinograd36To4x4(device_info, winograd_down_def, bias_copy);
SelectWinograd36To4x4(gpu_info, winograd_down_def, bias_copy);
return absl::OkStatus();
}

} // namespace

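Read together, WinogradFromNode replaces the single convolution with a three-operation subgraph; the negative ids mark the intermediate 36-row tile tensors (shape_0/shape_1 above) whose storage types were just chosen. A schematic of the wiring (the down-transform hunk is only partially visible, so its index and ids are inferred):

// operations[0]: Winograd4x4To36         inputs[0]->id  ->  -1
// operations[1]: conv over 36 x tiles           -1      ->  -2
// operations[2]: Winograd36To4x4 (+bias)        -2      ->  outputs[0]->id
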
absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
absl::Status GPUOperationFromNode(const GpuInfo& gpu_info,
const OperationDef& op_def, ModelHints hints,
const std::vector<Value*>& inputs,
const std::vector<Value*>& outputs,
@ -159,7 +159,7 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
auto attr =
absl::any_cast<ElementwiseAttributes>(node.operation.attributes);
GPUOperation operation =
CreateElementwise(device_info, op_def, op_type, attr);
CreateElementwise(gpu_info, op_def, op_type, attr);
*gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
return absl::OkStatus();
}
@ -191,7 +191,7 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
op_def.src_tensors[1].storage_type,
Layout::BHWC};
transposed_desc.storage_type = SelectBestStorageType(
device_info, weights_shape, transposed_desc.storage_type,
gpu_info, weights_shape, transposed_desc.storage_type,
transposed_desc.data_type, transposed_desc.layout);
TensorDescriptor weights_desc = {op_def.src_tensors[1].data_type,
TensorStorageType::BUFFER, Layout::BHWC};
@ -206,7 +206,7 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
conv_def.src_tensors[1] = weights_desc;
ConvWeightsDescription conv_weights_desc;
conv_op.operation = SelectConvolutionWithDynamicWeights(
attr, weights_shape, dst_shape, device_info, conv_def, hints,
attr, weights_shape, dst_shape, gpu_info, conv_def, hints,
&conv_weights_desc);

int aligned_output =
@ -246,7 +246,7 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
for (int i = 0; i < inputs.size(); ++i) {
channels[i] = inputs[i]->tensor.shape.c;
}
return SelectConcat(attr, channels, op_def, device_info, gpu_op);
return SelectConcat(attr, channels, op_def, gpu_info, gpu_op);
}
case OperationType::CONVOLUTION_2D: {
auto attr =
@ -254,14 +254,14 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
auto input_shape = inputs[0]->tensor.shape;
auto output_shape = outputs[0]->tensor.shape;
if (inputs.size() == 1) {
if (WinogradFromNode(device_info, inputs, outputs, op_def, hints,
if (WinogradFromNode(gpu_info, inputs, outputs, op_def, hints,
input_shape, output_shape, attr, gpu_subgraph)
.ok()) {
return absl::OkStatus();
} else {
gpu_op = InitSingleOpSubgraph(inputs, outputs, gpu_subgraph);
*gpu_op =
SelectConvolution(attr, output_shape, device_info, op_def, hints);
SelectConvolution(attr, output_shape, gpu_info, op_def, hints);
return absl::OkStatus();
}
} else {
@ -283,7 +283,7 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
conv_def.src_tensors[1] = weights_desc;
ConvWeightsDescription conv_weights_desc;
conv_op.operation = SelectConvolutionWithDynamicWeights(
attr, weights_shape, output_shape, device_info, conv_def, hints,
attr, weights_shape, output_shape, gpu_info, conv_def, hints,
&conv_weights_desc);

int aligned_output =
@ -309,33 +309,33 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
case OperationType::CONVOLUTION_TRANSPOSED: {
auto attr = absl::any_cast<ConvolutionTransposedAttributes>(
node.operation.attributes);
*gpu_op = SelectConvolutionTransposed(attr, device_info, op_def);
*gpu_op = SelectConvolutionTransposed(attr, gpu_info, op_def);
return absl::OkStatus();
}
case OperationType::DEPTHWISE_CONVOLUTION: {
auto attr = absl::any_cast<DepthwiseConvolution2DAttributes>(
node.operation.attributes);
if (inputs.size() == 1) {
*gpu_op = SelectDWConvolution(attr, device_info, op_def);
*gpu_op = SelectDWConvolution(attr, gpu_info, op_def);
} else {
if (inputs[1]->tensor.shape.b != 1) {
return absl::UnimplementedError(
"No support of depthwise runtime weights with channel multiplier "
"!= 1");
}
*gpu_op = SelectDWConvolutionDynamicWeights(attr, device_info, op_def);
*gpu_op = SelectDWConvolutionDynamicWeights(attr, gpu_info, op_def);
}
return absl::OkStatus();
}
case OperationType::FULLY_CONNECTED: {
auto attr =
absl::any_cast<FullyConnectedAttributes>(node.operation.attributes);
*gpu_op = SelectFullyConnected(attr, device_info, op_def,
*gpu_op = SelectFullyConnected(attr, gpu_info, op_def,
inputs[0]->tensor.shape.b);
return absl::OkStatus();
}
case OperationType::LSTM: {
*gpu_op = SelectLSTM(op_def, device_info);
*gpu_op = SelectLSTM(op_def, gpu_info);
return absl::OkStatus();
}
case OperationType::MAX_UNPOOLING_2D: {
@ -346,11 +346,11 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
}
case OperationType::MEAN: {
auto attr = absl::any_cast<MeanAttributes>(node.operation.attributes);
return SelectMean(attr, op_def, device_info, gpu_op);
return SelectMean(attr, op_def, gpu_info, gpu_op);
}
case OperationType::MEAN_STDDEV_NORMALIZATION: {
MeanStdDevNormalization operation = CreateMeanStdDevNormalization(
op_def, device_info, (inputs[0]->tensor.shape.c + 3) / 4);
op_def, gpu_info, (inputs[0]->tensor.shape.c + 3) / 4);
*gpu_op =
absl::make_unique<MeanStdDevNormalization>(std::move(operation));
return absl::OkStatus();
@ -368,7 +368,7 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
}
case OperationType::PRELU: {
auto attr = absl::any_cast<PReLUAttributes>(node.operation.attributes);
*gpu_op = SelectPReLU(attr, device_info, op_def);
*gpu_op = SelectPReLU(attr, gpu_info, op_def);
return absl::OkStatus();
}
case OperationType::QUANTIZE_AND_DEQUANTIZE: {
@ -453,7 +453,7 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
auto attr =
absl::any_cast<ElementwiseAttributes>(node.operation.attributes);
GPUOperation operation =
CreateElementwise(device_info, op_def, op_type, attr);
CreateElementwise(gpu_info, op_def, op_type, attr);
*gpu_op = absl::make_unique<GPUOperation>(std::move(operation));
return absl::OkStatus();
}
@ -474,7 +474,7 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
return absl::OkStatus();
}
default:
return SelectDefault(device_info, op_def, hints, inputs, outputs, node,
return SelectDefault(gpu_info, op_def, hints, inputs, outputs, node,
gpu_subgraph);
}
}

@ -29,7 +29,7 @@ namespace tflite {
namespace gpu {
namespace cl {

absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
absl::Status GPUOperationFromNode(const GpuInfo& gpu_info,
const OperationDef& op_def, ModelHints hints,
const std::vector<Value*>& inputs,
const std::vector<Value*>& outputs,

@ -47,8 +47,8 @@ namespace gpu {
namespace cl {

std::unique_ptr<GPUOperation> SelectLSTM(const OperationDef& op_def,
const DeviceInfo& device_info) {
return absl::make_unique<GPUOperation>(CreateLSTM(op_def, device_info));
const GpuInfo& gpu_info) {
return absl::make_unique<GPUOperation>(CreateLSTM(op_def, gpu_info));
}

std::unique_ptr<GPUOperation> SelectReLU(const ReLUAttributes& attr,
@ -57,10 +57,9 @@ std::unique_ptr<GPUOperation> SelectReLU(const ReLUAttributes& attr,
}

std::unique_ptr<GPUOperation> SelectPReLU(const PReLUAttributes& attr,
const DeviceInfo& device_info,
const GpuInfo& gpu_info,
const OperationDef& op_def) {
return absl::make_unique<GPUOperation>(
CreatePReLU(device_info, op_def, attr));
return absl::make_unique<GPUOperation>(CreatePReLU(gpu_info, op_def, attr));
}

std::unique_ptr<GPUOperation> SelectPooling(const Pooling2DAttributes& attr,
@ -89,12 +88,11 @@ absl::Status SelectResize(const Resize2DAttributes& attr,

absl::Status SelectConcat(const ConcatAttributes& attr,
const std::vector<int>& channels,
const OperationDef& op_def,
const DeviceInfo& device_info,
const OperationDef& op_def, const GpuInfo& gpu_info,
std::unique_ptr<GPUOperation>* ptr) {
switch (attr.axis) {
case Axis::CHANNELS: {
GPUOperation operation = CreateConcatZ(op_def, channels, device_info);
GPUOperation operation = CreateConcatZ(op_def, channels, gpu_info);
*ptr = absl::make_unique<GPUOperation>(std::move(operation));
return absl::OkStatus();
}
@ -112,10 +110,10 @@ absl::Status SelectConcat(const ConcatAttributes& attr,
}

std::unique_ptr<GPUOperation> SelectDWConvolutionDynamicWeights(
const DepthwiseConvolution2DAttributes& attr, const DeviceInfo& device_info,
const DepthwiseConvolution2DAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def) {
return absl::make_unique<GPUOperation>(
CreateDepthwiseConvolution2DDynamicWeights(device_info, op_def, attr));
CreateDepthwiseConvolution2DDynamicWeights(gpu_info, op_def, attr));
}

void SelectReshape(int src_channels, int dst_channels,
@ -150,12 +148,12 @@ void SelectStridedSlice(const SliceAttributes& attr, const OperationDef& op_def,
}

absl::Status SelectMean(const MeanAttributes& attr, const OperationDef& op_def,
const DeviceInfo& device_info,
const GpuInfo& gpu_info,
std::unique_ptr<GPUOperation>* ptr) {
if (attr.dims != std::set<Axis>({Axis::HEIGHT, Axis::WIDTH})) {
return absl::UnimplementedError("Mean operation supports only HW plane");
}
Mean operation = CreateMean(op_def, device_info);
Mean operation = CreateMean(op_def, gpu_info);
*ptr = absl::make_unique<Mean>(std::move(operation));
return absl::OkStatus();
}
@ -179,17 +177,17 @@ void SelectTranspose(const TransposeAttributes& attr,
}

std::unique_ptr<GPUOperation> SelectWinograd4x4To36(
const DeviceInfo& device_info, const Padding2D& padding,
const GpuInfo& gpu_info, const Padding2D& padding,
const OperationDef& op_def) {
return absl::make_unique<Winograd4x4To36>(
CreateWinograd4x4To36(device_info, op_def, padding));
CreateWinograd4x4To36(gpu_info, op_def, padding));
}

std::unique_ptr<GPUOperation> SelectWinograd36To4x4(
const DeviceInfo& device_info, const OperationDef& op_def,
const GpuInfo& gpu_info, const OperationDef& op_def,
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases) {
return absl::make_unique<Winograd36To4x4>(
CreateWinograd36To4x4(device_info, op_def, biases));
CreateWinograd36To4x4(gpu_info, op_def, biases));
}

std::unique_ptr<GPUOperation> SelectQuantizeAndDequantize(

@ -29,13 +29,13 @@ namespace gpu {
namespace cl {

std::unique_ptr<GPUOperation> SelectLSTM(const OperationDef& op_def,
const DeviceInfo& device_info);
const GpuInfo& gpu_info);

std::unique_ptr<GPUOperation> SelectReLU(const ReLUAttributes& attr,
const OperationDef& op_def);

std::unique_ptr<GPUOperation> SelectPReLU(const PReLUAttributes& attr,
const DeviceInfo& device_info,
const GpuInfo& gpu_info,
const OperationDef& op_def);

std::unique_ptr<GPUOperation> SelectPooling(const Pooling2DAttributes& attr,
@ -53,12 +53,11 @@ absl::Status SelectResize(const Resize2DAttributes& attr,

absl::Status SelectConcat(const ConcatAttributes& attr,
const std::vector<int>& channels,
const OperationDef& op_def,
const DeviceInfo& device_info,
const OperationDef& op_def, const GpuInfo& gpu_info,
std::unique_ptr<GPUOperation>* ptr);

std::unique_ptr<GPUOperation> SelectDWConvolutionDynamicWeights(
const DepthwiseConvolution2DAttributes& attr, const DeviceInfo& device_info,
const DepthwiseConvolution2DAttributes& attr, const GpuInfo& gpu_info,
const OperationDef& op_def);

void SelectReshape(int src_channels, int dst_channels,
@ -72,7 +71,7 @@ void SelectStridedSlice(const SliceAttributes& attr, const OperationDef& op_def,
std::unique_ptr<GPUOperation>* ptr);

absl::Status SelectMean(const MeanAttributes& attr, const OperationDef& op_def,
const DeviceInfo& device_info,
const GpuInfo& gpu_info,
std::unique_ptr<GPUOperation>* ptr);

void SelectSoftmax(const BHWC& shape, const OperationDef& op_def,
@ -86,12 +85,12 @@ void SelectTranspose(const TransposeAttributes& attr,
const OperationDef& op_def,
std::unique_ptr<GPUOperation>* ptr);

std::unique_ptr<GPUOperation> SelectWinograd4x4To36(
const DeviceInfo& device_info, const Padding2D& padding,
const OperationDef& op_def);
std::unique_ptr<GPUOperation> SelectWinograd4x4To36(const GpuInfo& gpu_info,
const Padding2D& padding,
const OperationDef& op_def);

std::unique_ptr<GPUOperation> SelectWinograd36To4x4(
const DeviceInfo& device_info, const OperationDef& op_def,
const GpuInfo& gpu_info, const OperationDef& op_def,
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases);

std::unique_ptr<GPUOperation> SelectQuantizeAndDequantize(

@ -89,7 +89,7 @@ absl::Status TryDepthwiseConvPlus1x1Conv(

// fully connected + fully connected + add
absl::Status TryFCFCAdd(
const DeviceInfo& device_info, CalculationsPrecision precision,
const GpuInfo& gpu_info, CalculationsPrecision precision,
const GraphFloat32& graph, NodeId first_node_id,
const std::map<ValueId, TensorDescriptor>& tensor_descriptors,
std::set<NodeId>* consumed_nodes, GPUOperationsSubgraph* gpu_subgraph) {
@ -160,7 +160,7 @@ absl::Status TryFCFCAdd(
}
std::unique_ptr<GPUOperation>* gpu_op =
InitSingleOpSubgraph(fc0_inputs, add_outputs, gpu_subgraph);
FCFCAdd fc = CreateFCFCAdd(device_info, op_def, fc0_attr, fc1_attr);
FCFCAdd fc = CreateFCFCAdd(gpu_info, op_def, fc0_attr, fc1_attr);
*gpu_op = absl::make_unique<FCFCAdd>(std::move(fc));
consumed_nodes->insert(fc0_node->id);
consumed_nodes->insert(fc1_node->id);
@ -170,12 +170,12 @@ absl::Status TryFCFCAdd(
} // namespace

absl::Status GPUSubgraphFromGraph(
const DeviceInfo& device_info, CalculationsPrecision precision,
const GpuInfo& gpu_info, CalculationsPrecision precision,
const GraphFloat32& graph, NodeId first_node_id,
const std::map<ValueId, TensorDescriptor>& tensor_descriptors,
std::set<NodeId>* consumed_nodes, GPUOperationsSubgraph* gpu_subgraph,
std::string* name) {
if ((device_info.IsAdreno() || device_info.IsNvidia()) &&
if ((gpu_info.IsAdreno() || gpu_info.IsNvidia()) &&
TryDepthwiseConvPlus1x1Conv(precision, graph, first_node_id,
tensor_descriptors, consumed_nodes,
gpu_subgraph)
@ -183,9 +183,9 @@ absl::Status GPUSubgraphFromGraph(
*name = "depthwise_conv_plus_1x1_conv";
return absl::OkStatus();
}
if ((device_info.IsIntel() || device_info.IsNvidia()) &&
TryFCFCAdd(device_info, precision, graph, first_node_id,
tensor_descriptors, consumed_nodes, gpu_subgraph)
if ((gpu_info.IsIntel() || gpu_info.IsNvidia()) &&
TryFCFCAdd(gpu_info, precision, graph, first_node_id, tensor_descriptors,
consumed_nodes, gpu_subgraph)
.ok()) {
*name = "fully_connected_x2_and_add";
return absl::OkStatus();

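GPUSubgraphFromGraph therefore probes the multi-node fusions in a fixed, vendor-gated order: depthwise-conv-plus-1x1-conv is only attempted on Adreno and Nvidia, and the FC+FC+Add fusion only on Intel and Nvidia; a successful probe names the fused subgraph and marks the matched nodes as consumed. On a GPU in neither list (Mali, for example) both probes are skipped, and the nodes presumably fall back to per-node selection via GPUOperationFromNode, which lies outside this hunk.
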
@ -31,7 +31,7 @@ namespace gpu {
namespace cl {

absl::Status GPUSubgraphFromGraph(
const DeviceInfo& device_info, CalculationsPrecision precision,
const GpuInfo& gpu_info, CalculationsPrecision precision,
const GraphFloat32& graph, NodeId first_node_id,
const std::map<ValueId, TensorDescriptor>& tensor_descriptors,
std::set<NodeId>* consumed_nodes, GPUOperationsSubgraph* gpu_subgraph,

@ -24,7 +24,7 @@ namespace tflite {
namespace gpu {
namespace cl {

bool CanCreateTensorWithShape(const DeviceInfo& device_info, const BHWDC& shape,
bool CanCreateTensorWithShape(const GpuInfo& gpu_info, const BHWDC& shape,
const TensorDescriptor& descriptor) {
const int slices = DivideRoundUp(shape.c, 4);
switch (descriptor.storage_type) {
@ -33,61 +33,61 @@ bool CanCreateTensorWithShape(const DeviceInfo& device_info, const BHWDC& shape,
4 * (descriptor.data_type == DataType::FLOAT32 ? 4 : 2);
const int buffer_size =
shape.b * shape.w * shape.h * shape.d * slices * flt4_size;
return buffer_size <= device_info.buffer_max_size;
return buffer_size <= gpu_info.buffer_max_size;
}
case TensorStorageType::IMAGE_BUFFER:
return shape.b * shape.w * shape.h * shape.d * slices <=
device_info.image_buffer_max_size;
gpu_info.image_buffer_max_size;
case TensorStorageType::TEXTURE_3D:
if (device_info.cl_version < OpenCLVersion::CL_1_2 && slices == 1) {
if (gpu_info.cl_version < OpenCLVersion::CL_1_2 && slices == 1) {
// clCreateImage3D (that used in CL 1.0/1.1) can not create image with
// depth = 1 by specification;
return false;
}
return shape.w * shape.b <= device_info.image3d_max_width &&
shape.h <= device_info.image3d_max_height &&
slices * shape.d <= device_info.image3d_max_depth;
return shape.w * shape.b <= gpu_info.image3d_max_width &&
shape.h <= gpu_info.image3d_max_height &&
slices * shape.d <= gpu_info.image3d_max_depth;
case TensorStorageType::TEXTURE_ARRAY:
// Bug on some Adreno. b/131099086
if (slices == 1 && device_info.IsAdreno() &&
!device_info.adreno_info.support_one_layer_texture_array) {
if (slices == 1 && gpu_info.IsAdreno() &&
!gpu_info.adreno_info.support_one_layer_texture_array) {
return false;
}
return shape.w * shape.b <= device_info.image2d_max_width &&
shape.h <= device_info.image2d_max_height &&
slices * shape.d <= device_info.image_array_max_layers;
return shape.w * shape.b <= gpu_info.image2d_max_width &&
shape.h <= gpu_info.image2d_max_height &&
slices * shape.d <= gpu_info.image_array_max_layers;
case TensorStorageType::TEXTURE_2D:
return shape.w * shape.b * shape.d <= device_info.image2d_max_width &&
shape.h * slices <= device_info.image2d_max_height;
return shape.w * shape.b * shape.d <= gpu_info.image2d_max_width &&
shape.h * slices <= gpu_info.image2d_max_height;
case TensorStorageType::SINGLE_TEXTURE_2D:
return shape.c <= 4 &&
device_info.SupportsFloatImage2D(descriptor.data_type, shape.c) &&
shape.w * shape.b * shape.d <= device_info.image2d_max_width &&
shape.h <= device_info.image2d_max_height;
gpu_info.SupportsFloatImage2D(descriptor.data_type, shape.c) &&
shape.w * shape.b * shape.d <= gpu_info.image2d_max_width &&
shape.h <= gpu_info.image2d_max_height;
default:
return false;
}
}

bool CanCreateTensorWithShape(const DeviceInfo& device_info, const BHWC& shape,
bool CanCreateTensorWithShape(const GpuInfo& gpu_info, const BHWC& shape,
const TensorDescriptor& descriptor) {
const BHWDC shape5D(shape.b, shape.h, shape.w, 1, shape.c);
return CanCreateTensorWithShape(device_info, shape5D, descriptor);
return CanCreateTensorWithShape(gpu_info, shape5D, descriptor);
}

TensorStorageType SelectBestStorageType(const DeviceInfo& device_info,
TensorStorageType SelectBestStorageType(const GpuInfo& gpu_info,
const BHWC& shape,
const TensorStorageType& desired,
const DataType& data_type,
const Layout& layout) {
if (CanCreateTensorWithShape(device_info, shape,
if (CanCreateTensorWithShape(gpu_info, shape,
TensorDescriptor{data_type, desired, layout})) {
return desired;
}
auto GetBestTypeAfterTextureArray = [&]() {
if (device_info.SupportsImageBuffer() &&
if (gpu_info.SupportsImageBuffer() &&
CanCreateTensorWithShape(
device_info, shape,
gpu_info, shape,
TensorDescriptor{data_type, TensorStorageType::IMAGE_BUFFER,
layout})) {
return TensorStorageType::IMAGE_BUFFER;
@ -96,9 +96,9 @@ TensorStorageType SelectBestStorageType(const DeviceInfo& device_info,
}
};
auto GetBestTypeAfterTexture2D = [&]() {
if (device_info.SupportsTextureArray() &&
if (gpu_info.SupportsTextureArray() &&
CanCreateTensorWithShape(
device_info, shape,
gpu_info, shape,
TensorDescriptor{data_type, TensorStorageType::TEXTURE_ARRAY,
layout})) {
return TensorStorageType::TEXTURE_ARRAY;
@ -108,7 +108,7 @@ TensorStorageType SelectBestStorageType(const DeviceInfo& device_info,
};
auto GetBestTypeAfterTexture3D = [&]() {
if (CanCreateTensorWithShape(
device_info, shape,
gpu_info, shape,
TensorDescriptor{data_type, TensorStorageType::TEXTURE_2D,
layout})) {
return TensorStorageType::TEXTURE_2D;

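SelectBestStorageType first checks whether the requested storage fits the device limits probed by CanCreateTensorWithShape, then walks down through progressively plainer storage types; the tail of each lambda falls outside this hunk, so the final buffer fallback is inferred. A caller-side sketch with made-up shapes (illustrative only; signatures taken from the declarations below):

// Hypothetical example: ask for a 3D texture and accept whatever still fits.
TensorStorageType storage = SelectBestStorageType(
    gpu_info, BHWC(1, 256, 256, 64), TensorStorageType::TEXTURE_3D,
    DataType::FLOAT16, Layout::HWC);
// If the 256x256x16-slice image exceeds the image3d limits, the result
// degrades toward TEXTURE_2D, TEXTURE_ARRAY, IMAGE_BUFFER or BUFFER.
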
@ -25,13 +25,13 @@ namespace tflite {
namespace gpu {
namespace cl {

bool CanCreateTensorWithShape(const DeviceInfo& device_info, const BHWDC& shape,
bool CanCreateTensorWithShape(const GpuInfo& gpu_info, const BHWDC& shape,
const TensorDescriptor& descriptor);

bool CanCreateTensorWithShape(const DeviceInfo& device_info, const BHWC& shape,
bool CanCreateTensorWithShape(const GpuInfo& gpu_info, const BHWC& shape,
const TensorDescriptor& descriptor);

TensorStorageType SelectBestStorageType(const DeviceInfo& device_info,
TensorStorageType SelectBestStorageType(const GpuInfo& gpu_info,
const BHWC& shape,
const TensorStorageType& desired,
const DataType& data_type,