Added OpenClInfo. Added methods to GpuInfo instead of direct field accesses.
PiperOrigin-RevId: 343511854 Change-Id: I1db674bf181b909bbb426863494ab28b13ef0c0f
This commit is contained in:
parent
0eae6e3b96
commit
7c29278eed
@ -54,29 +54,29 @@ void AddSupportedImageFormats(cl_context context, GpuInfo* info) {
|
|||||||
auto supported_formats =
|
auto supported_formats =
|
||||||
GetSupportedImage2DFormats(context, CL_MEM_READ_WRITE);
|
GetSupportedImage2DFormats(context, CL_MEM_READ_WRITE);
|
||||||
for (auto format : supported_formats) {
|
for (auto format : supported_formats) {
|
||||||
info->supports_r_f16_tex2d =
|
info->opencl_info.supports_r_f16_tex2d =
|
||||||
info->supports_r_f16_tex2d ||
|
info->opencl_info.supports_r_f16_tex2d ||
|
||||||
IsEqualToImageFormat(format, DataType::FLOAT16, 1);
|
IsEqualToImageFormat(format, DataType::FLOAT16, 1);
|
||||||
info->supports_rg_f16_tex2d =
|
info->opencl_info.supports_rg_f16_tex2d =
|
||||||
info->supports_rg_f16_tex2d ||
|
info->opencl_info.supports_rg_f16_tex2d ||
|
||||||
IsEqualToImageFormat(format, DataType::FLOAT16, 2);
|
IsEqualToImageFormat(format, DataType::FLOAT16, 2);
|
||||||
info->supports_rgb_f16_tex2d =
|
info->opencl_info.supports_rgb_f16_tex2d =
|
||||||
info->supports_rgb_f16_tex2d ||
|
info->opencl_info.supports_rgb_f16_tex2d ||
|
||||||
IsEqualToImageFormat(format, DataType::FLOAT16, 3);
|
IsEqualToImageFormat(format, DataType::FLOAT16, 3);
|
||||||
info->supports_rgba_f16_tex2d =
|
info->opencl_info.supports_rgba_f16_tex2d =
|
||||||
info->supports_rgba_f16_tex2d ||
|
info->opencl_info.supports_rgba_f16_tex2d ||
|
||||||
IsEqualToImageFormat(format, DataType::FLOAT16, 4);
|
IsEqualToImageFormat(format, DataType::FLOAT16, 4);
|
||||||
info->supports_r_f32_tex2d =
|
info->opencl_info.supports_r_f32_tex2d =
|
||||||
info->supports_r_f32_tex2d ||
|
info->opencl_info.supports_r_f32_tex2d ||
|
||||||
IsEqualToImageFormat(format, DataType::FLOAT32, 1);
|
IsEqualToImageFormat(format, DataType::FLOAT32, 1);
|
||||||
info->supports_rg_f32_tex2d =
|
info->opencl_info.supports_rg_f32_tex2d =
|
||||||
info->supports_rg_f32_tex2d ||
|
info->opencl_info.supports_rg_f32_tex2d ||
|
||||||
IsEqualToImageFormat(format, DataType::FLOAT32, 2);
|
IsEqualToImageFormat(format, DataType::FLOAT32, 2);
|
||||||
info->supports_rgb_f32_tex2d =
|
info->opencl_info.supports_rgb_f32_tex2d =
|
||||||
info->supports_rgb_f32_tex2d ||
|
info->opencl_info.supports_rgb_f32_tex2d ||
|
||||||
IsEqualToImageFormat(format, DataType::FLOAT32, 3);
|
IsEqualToImageFormat(format, DataType::FLOAT32, 3);
|
||||||
info->supports_rgba_f32_tex2d =
|
info->opencl_info.supports_rgba_f32_tex2d =
|
||||||
info->supports_rgba_f32_tex2d ||
|
info->opencl_info.supports_rgba_f32_tex2d ||
|
||||||
IsEqualToImageFormat(format, DataType::FLOAT32, 4);
|
IsEqualToImageFormat(format, DataType::FLOAT32, 4);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -148,7 +148,7 @@ absl::Status CreateCLGLContext(const CLDevice& device,
|
|||||||
cl_context_properties egl_context,
|
cl_context_properties egl_context,
|
||||||
cl_context_properties egl_display,
|
cl_context_properties egl_display,
|
||||||
CLContext* result) {
|
CLContext* result) {
|
||||||
if (!device.SupportsExtension("cl_khr_gl_sharing")) {
|
if (!device.GetInfo().SupportsExtension("cl_khr_gl_sharing")) {
|
||||||
return absl::UnavailableError("Device doesn't support CL-GL sharing.");
|
return absl::UnavailableError("Device doesn't support CL-GL sharing.");
|
||||||
}
|
}
|
||||||
cl_context_properties platform =
|
cl_context_properties platform =
|
||||||
|
@ -169,80 +169,80 @@ GpuInfo GpuInfoFromDeviceID(cl_device_id id) {
|
|||||||
info.mali_info = MaliInfo(device_name);
|
info.mali_info = MaliInfo(device_name);
|
||||||
}
|
}
|
||||||
info.opencl_info.cl_version = ParseCLVersion(opencl_c_version);
|
info.opencl_info.cl_version = ParseCLVersion(opencl_c_version);
|
||||||
info.extensions =
|
info.opencl_info.extensions =
|
||||||
absl::StrSplit(GetDeviceInfo<std::string>(id, CL_DEVICE_EXTENSIONS), ' ');
|
absl::StrSplit(GetDeviceInfo<std::string>(id, CL_DEVICE_EXTENSIONS), ' ');
|
||||||
info.supports_fp16 = false;
|
info.opencl_info.supports_fp16 = false;
|
||||||
info.supports_image3d_writes = false;
|
info.opencl_info.supports_image3d_writes = false;
|
||||||
for (const auto& ext : info.extensions) {
|
for (const auto& ext : info.opencl_info.extensions) {
|
||||||
if (ext == "cl_khr_fp16") {
|
if (ext == "cl_khr_fp16") {
|
||||||
info.supports_fp16 = true;
|
info.opencl_info.supports_fp16 = true;
|
||||||
}
|
}
|
||||||
if (ext == "cl_khr_3d_image_writes") {
|
if (ext == "cl_khr_3d_image_writes") {
|
||||||
info.supports_image3d_writes = true;
|
info.opencl_info.supports_image3d_writes = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
cl_device_fp_config f32_config =
|
cl_device_fp_config f32_config =
|
||||||
GetDeviceInfo<cl_device_fp_config>(id, CL_DEVICE_SINGLE_FP_CONFIG);
|
GetDeviceInfo<cl_device_fp_config>(id, CL_DEVICE_SINGLE_FP_CONFIG);
|
||||||
info.supports_fp32_rtn = f32_config & CL_FP_ROUND_TO_NEAREST;
|
info.opencl_info.supports_fp32_rtn = f32_config & CL_FP_ROUND_TO_NEAREST;
|
||||||
|
|
||||||
if (info.supports_fp16) {
|
if (info.opencl_info.supports_fp16) {
|
||||||
cl_device_fp_config f16_config;
|
cl_device_fp_config f16_config;
|
||||||
auto status = GetDeviceInfo<cl_device_fp_config>(
|
auto status = GetDeviceInfo<cl_device_fp_config>(
|
||||||
id, CL_DEVICE_HALF_FP_CONFIG, &f16_config);
|
id, CL_DEVICE_HALF_FP_CONFIG, &f16_config);
|
||||||
// AMD supports cl_khr_fp16 but CL_DEVICE_HALF_FP_CONFIG is empty.
|
// AMD supports cl_khr_fp16 but CL_DEVICE_HALF_FP_CONFIG is empty.
|
||||||
if (status.ok() && !info.IsAMD()) {
|
if (status.ok() && !info.IsAMD()) {
|
||||||
info.supports_fp16_rtn = f16_config & CL_FP_ROUND_TO_NEAREST;
|
info.opencl_info.supports_fp16_rtn = f16_config & CL_FP_ROUND_TO_NEAREST;
|
||||||
} else { // happens on PowerVR
|
} else { // happens on PowerVR
|
||||||
f16_config = f32_config;
|
f16_config = f32_config;
|
||||||
info.supports_fp16_rtn = info.supports_fp32_rtn;
|
info.opencl_info.supports_fp16_rtn = info.opencl_info.supports_fp32_rtn;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
info.supports_fp16_rtn = false;
|
info.opencl_info.supports_fp16_rtn = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (info.IsPowerVR() && !info.supports_fp16) {
|
if (info.IsPowerVR() && !info.opencl_info.supports_fp16) {
|
||||||
// PowerVR doesn't have full support of fp16 and so doesn't list this
|
// PowerVR doesn't have full support of fp16 and so doesn't list this
|
||||||
// extension. But it can support fp16 in MADs and as buffers/textures types,
|
// extension. But it can support fp16 in MADs and as buffers/textures types,
|
||||||
// so we will use it.
|
// so we will use it.
|
||||||
info.supports_fp16 = true;
|
info.opencl_info.supports_fp16 = true;
|
||||||
info.supports_fp16_rtn = info.supports_fp32_rtn;
|
info.opencl_info.supports_fp16_rtn = info.opencl_info.supports_fp32_rtn;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!info.supports_image3d_writes &&
|
if (!info.opencl_info.supports_image3d_writes &&
|
||||||
((info.IsAdreno() && info.adreno_info.IsAdreno4xx()) ||
|
((info.IsAdreno() && info.adreno_info.IsAdreno4xx()) ||
|
||||||
info.IsNvidia())) {
|
info.IsNvidia())) {
|
||||||
// in local tests Adreno 430 can write in image 3d, at least on small sizes,
|
// in local tests Adreno 430 can write in image 3d, at least on small sizes,
|
||||||
// but it doesn't have cl_khr_3d_image_writes in list of available
|
// but it doesn't have cl_khr_3d_image_writes in list of available
|
||||||
// extensions
|
// extensions
|
||||||
// The same for NVidia
|
// The same for NVidia
|
||||||
info.supports_image3d_writes = true;
|
info.opencl_info.supports_image3d_writes = true;
|
||||||
}
|
}
|
||||||
info.compute_units_count =
|
info.opencl_info.compute_units_count =
|
||||||
GetDeviceInfo<cl_uint>(id, CL_DEVICE_MAX_COMPUTE_UNITS);
|
GetDeviceInfo<cl_uint>(id, CL_DEVICE_MAX_COMPUTE_UNITS);
|
||||||
info.image2d_max_width =
|
info.opencl_info.image2d_max_width =
|
||||||
GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE2D_MAX_WIDTH);
|
GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE2D_MAX_WIDTH);
|
||||||
info.image2d_max_height =
|
info.opencl_info.image2d_max_height =
|
||||||
GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE2D_MAX_HEIGHT);
|
GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE2D_MAX_HEIGHT);
|
||||||
info.buffer_max_size =
|
info.opencl_info.buffer_max_size =
|
||||||
GetDeviceInfo<cl_ulong>(id, CL_DEVICE_MAX_MEM_ALLOC_SIZE);
|
GetDeviceInfo<cl_ulong>(id, CL_DEVICE_MAX_MEM_ALLOC_SIZE);
|
||||||
if (info.opencl_info.cl_version >= OpenClVersion::kCl1_2) {
|
if (info.opencl_info.cl_version >= OpenClVersion::kCl1_2) {
|
||||||
info.image_buffer_max_size =
|
info.opencl_info.image_buffer_max_size =
|
||||||
GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE);
|
GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE);
|
||||||
info.image_array_max_layers =
|
info.opencl_info.image_array_max_layers =
|
||||||
GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE_MAX_ARRAY_SIZE);
|
GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE_MAX_ARRAY_SIZE);
|
||||||
}
|
}
|
||||||
info.image3d_max_width =
|
info.opencl_info.image3d_max_width =
|
||||||
GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE3D_MAX_WIDTH);
|
GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE3D_MAX_WIDTH);
|
||||||
info.image3d_max_height =
|
info.opencl_info.image3d_max_height =
|
||||||
GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE2D_MAX_HEIGHT);
|
GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE2D_MAX_HEIGHT);
|
||||||
info.image3d_max_depth =
|
info.opencl_info.image3d_max_depth =
|
||||||
GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE3D_MAX_DEPTH);
|
GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE3D_MAX_DEPTH);
|
||||||
int3 max_work_group_sizes;
|
int3 max_work_group_sizes;
|
||||||
GetDeviceWorkDimsSizes(id, &max_work_group_sizes);
|
GetDeviceWorkDimsSizes(id, &max_work_group_sizes);
|
||||||
info.max_work_group_size_x = max_work_group_sizes.x;
|
info.opencl_info.max_work_group_size_x = max_work_group_sizes.x;
|
||||||
info.max_work_group_size_y = max_work_group_sizes.y;
|
info.opencl_info.max_work_group_size_y = max_work_group_sizes.y;
|
||||||
info.max_work_group_size_z = max_work_group_sizes.z;
|
info.opencl_info.max_work_group_size_z = max_work_group_sizes.z;
|
||||||
|
|
||||||
if (info.IsIntel()) {
|
if (info.IsIntel()) {
|
||||||
if (info.SupportsExtension("cl_intel_required_subgroup_size")) {
|
if (info.SupportsExtension("cl_intel_required_subgroup_size")) {
|
||||||
@ -300,48 +300,10 @@ CLDevice& CLDevice::operator=(CLDevice&& device) {
|
|||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool CLDevice::SupportsFP16() const { return info_.supports_fp16; }
|
|
||||||
|
|
||||||
bool CLDevice::SupportsExtension(const std::string& extension) const {
|
|
||||||
return info_.SupportsExtension(extension);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool CLDevice::SupportsTextureArray() const {
|
|
||||||
return info_.SupportsTextureArray();
|
|
||||||
}
|
|
||||||
|
|
||||||
bool CLDevice::SupportsImageBuffer() const {
|
|
||||||
return info_.SupportsImageBuffer();
|
|
||||||
}
|
|
||||||
|
|
||||||
bool CLDevice::SupportsImage3D() const { return info_.SupportsImage3D(); }
|
|
||||||
|
|
||||||
bool CLDevice::SupportsFP32RTN() const { return info_.supports_fp32_rtn; }
|
|
||||||
|
|
||||||
bool CLDevice::SupportsFP16RTN() const { return info_.supports_fp16_rtn; }
|
|
||||||
|
|
||||||
std::string CLDevice::GetPlatformVersion() const {
|
std::string CLDevice::GetPlatformVersion() const {
|
||||||
return GetPlatformInfo(platform_id_, CL_PLATFORM_VERSION);
|
return GetPlatformInfo(platform_id_, CL_PLATFORM_VERSION);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool CLDevice::IsCL20OrHigher() const { return info_.IsCL20OrHigher(); }
|
|
||||||
|
|
||||||
bool CLDevice::SupportsSubGroupWithSize(int sub_group_size) const {
|
|
||||||
return info_.SupportsSubGroupWithSize(sub_group_size);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool CLDevice::IsAdreno() const { return info_.IsAdreno(); }
|
|
||||||
|
|
||||||
bool CLDevice::IsPowerVR() const { return info_.IsPowerVR(); }
|
|
||||||
|
|
||||||
bool CLDevice::IsNvidia() const { return info_.IsNvidia(); }
|
|
||||||
|
|
||||||
bool CLDevice::IsMali() const { return info_.IsMali(); }
|
|
||||||
|
|
||||||
bool CLDevice::IsAMD() const { return info_.IsAMD(); }
|
|
||||||
|
|
||||||
bool CLDevice::IsIntel() const { return info_.IsIntel(); }
|
|
||||||
|
|
||||||
void CLDevice::DisableOneLayerTextureArray() {
|
void CLDevice::DisableOneLayerTextureArray() {
|
||||||
info_.adreno_info.support_one_layer_texture_array = false;
|
info_.adreno_info.support_one_layer_texture_array = false;
|
||||||
}
|
}
|
||||||
|
@ -46,23 +46,6 @@ class CLDevice {
|
|||||||
cl_platform_id platform() const { return platform_id_; }
|
cl_platform_id platform() const { return platform_id_; }
|
||||||
std::string GetPlatformVersion() const;
|
std::string GetPlatformVersion() const;
|
||||||
|
|
||||||
GpuVendor vendor() const { return info_.gpu_vendor; }
|
|
||||||
bool SupportsFP16() const;
|
|
||||||
bool SupportsTextureArray() const;
|
|
||||||
bool SupportsImageBuffer() const;
|
|
||||||
bool SupportsImage3D() const;
|
|
||||||
bool SupportsExtension(const std::string& extension) const;
|
|
||||||
bool SupportsFP32RTN() const;
|
|
||||||
bool SupportsFP16RTN() const;
|
|
||||||
bool IsCL20OrHigher() const;
|
|
||||||
bool SupportsSubGroupWithSize(int sub_group_size) const;
|
|
||||||
bool IsAdreno() const;
|
|
||||||
bool IsPowerVR() const;
|
|
||||||
bool IsNvidia() const;
|
|
||||||
bool IsMali() const;
|
|
||||||
bool IsAMD() const;
|
|
||||||
bool IsIntel() const;
|
|
||||||
|
|
||||||
// To track bug on some Adreno. b/131099086
|
// To track bug on some Adreno. b/131099086
|
||||||
void DisableOneLayerTextureArray();
|
void DisableOneLayerTextureArray();
|
||||||
|
|
||||||
|
@ -301,6 +301,8 @@ bool MaliInfo::IsValhall() const {
|
|||||||
gpu_version == MaliGpu::kG68 || gpu_version == MaliGpu::kG78;
|
gpu_version == MaliGpu::kG68 || gpu_version == MaliGpu::kG78;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool GpuInfo::SupportsFP16() const { return opencl_info.supports_fp16; }
|
||||||
|
|
||||||
bool GpuInfo::SupportsTextureArray() const {
|
bool GpuInfo::SupportsTextureArray() const {
|
||||||
return opencl_info.cl_version >= OpenClVersion::kCl1_2;
|
return opencl_info.cl_version >= OpenClVersion::kCl1_2;
|
||||||
}
|
}
|
||||||
@ -314,29 +316,29 @@ bool GpuInfo::SupportsImage3D() const {
|
|||||||
// On Mali T880 read_imageh doesn't compile with image3d_t
|
// On Mali T880 read_imageh doesn't compile with image3d_t
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return supports_image3d_writes;
|
return opencl_info.supports_image3d_writes;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool GpuInfo::SupportsFloatImage2D(DataType data_type, int channels) const {
|
bool GpuInfo::SupportsFloatImage2D(DataType data_type, int channels) const {
|
||||||
if (channels == 1) {
|
if (channels == 1) {
|
||||||
return data_type == DataType::FLOAT32 ? supports_r_f32_tex2d
|
return data_type == DataType::FLOAT32 ? opencl_info.supports_r_f32_tex2d
|
||||||
: supports_r_f16_tex2d;
|
: opencl_info.supports_r_f16_tex2d;
|
||||||
} else if (channels == 2) {
|
} else if (channels == 2) {
|
||||||
return data_type == DataType::FLOAT32 ? supports_rg_f32_tex2d
|
return data_type == DataType::FLOAT32 ? opencl_info.supports_rg_f32_tex2d
|
||||||
: supports_rg_f16_tex2d;
|
: opencl_info.supports_rg_f16_tex2d;
|
||||||
} else if (channels == 3) {
|
} else if (channels == 3) {
|
||||||
return data_type == DataType::FLOAT32 ? supports_rgb_f32_tex2d
|
return data_type == DataType::FLOAT32 ? opencl_info.supports_rgb_f32_tex2d
|
||||||
: supports_rgb_f16_tex2d;
|
: opencl_info.supports_rgb_f16_tex2d;
|
||||||
} else if (channels == 4) {
|
} else if (channels == 4) {
|
||||||
return data_type == DataType::FLOAT32 ? supports_rgba_f32_tex2d
|
return data_type == DataType::FLOAT32 ? opencl_info.supports_rgba_f32_tex2d
|
||||||
: supports_rgba_f16_tex2d;
|
: opencl_info.supports_rgba_f16_tex2d;
|
||||||
} else {
|
} else {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool GpuInfo::SupportsExtension(const std::string& extension) const {
|
bool GpuInfo::SupportsExtension(const std::string& extension) const {
|
||||||
for (const auto& ext : extensions) {
|
for (const auto& ext : opencl_info.extensions) {
|
||||||
if (ext == extension) {
|
if (ext == extension) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -365,6 +367,58 @@ bool GpuInfo::SupportsSubGroupWithSize(int sub_group_size) const {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int GpuInfo::GetComputeUnitsCount() const {
|
||||||
|
return opencl_info.compute_units_count;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool GpuInfo::IsRoundToNearestSupported() const {
|
||||||
|
return opencl_info.supports_fp16_rtn || opencl_info.supports_fp32_rtn;
|
||||||
|
}
|
||||||
|
|
||||||
|
int GpuInfo::GetMaxWorkGroupSizeForX() const {
|
||||||
|
return opencl_info.max_work_group_size_x;
|
||||||
|
}
|
||||||
|
|
||||||
|
int GpuInfo::GetMaxWorkGroupSizeForY() const {
|
||||||
|
return opencl_info.max_work_group_size_y;
|
||||||
|
}
|
||||||
|
|
||||||
|
int GpuInfo::GetMaxWorkGroupSizeForZ() const {
|
||||||
|
return opencl_info.max_work_group_size_z;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t GpuInfo::GetMaxImage2DWidth() const {
|
||||||
|
return opencl_info.image2d_max_width;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t GpuInfo::GetMaxImage2DHeight() const {
|
||||||
|
return opencl_info.image2d_max_height;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t GpuInfo::GetMaxImage3DWidth() const {
|
||||||
|
return opencl_info.image3d_max_width;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t GpuInfo::GetMaxImage3DHeight() const {
|
||||||
|
return opencl_info.image3d_max_height;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t GpuInfo::GetMaxImage3DDepth() const {
|
||||||
|
return opencl_info.image3d_max_depth;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t GpuInfo::GetMaxBufferSize() const {
|
||||||
|
return opencl_info.buffer_max_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t GpuInfo::GetMaxImageBufferWidth() const {
|
||||||
|
return opencl_info.image_buffer_max_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t GpuInfo::GetMaxImage2DArrayLayers() const {
|
||||||
|
return opencl_info.image_array_max_layers;
|
||||||
|
}
|
||||||
|
|
||||||
bool GpuInfo::IsAdreno() const { return gpu_vendor == GpuVendor::kQualcomm; }
|
bool GpuInfo::IsAdreno() const { return gpu_vendor == GpuVendor::kQualcomm; }
|
||||||
|
|
||||||
bool GpuInfo::IsApple() const { return gpu_vendor == GpuVendor::kApple; }
|
bool GpuInfo::IsApple() const { return gpu_vendor == GpuVendor::kApple; }
|
||||||
|
@ -179,34 +179,10 @@ std::string OpenClVersionToString(OpenClVersion version);
|
|||||||
|
|
||||||
struct OpenClInfo {
|
struct OpenClInfo {
|
||||||
OpenClVersion cl_version;
|
OpenClVersion cl_version;
|
||||||
};
|
|
||||||
|
|
||||||
struct GpuInfo {
|
|
||||||
GpuInfo() = default;
|
|
||||||
|
|
||||||
bool IsAdreno() const;
|
|
||||||
bool IsApple() const;
|
|
||||||
bool IsMali() const;
|
|
||||||
bool IsPowerVR() const;
|
|
||||||
bool IsNvidia() const;
|
|
||||||
bool IsAMD() const;
|
|
||||||
bool IsIntel() const;
|
|
||||||
|
|
||||||
bool SupportsTextureArray() const;
|
|
||||||
bool SupportsImageBuffer() const;
|
|
||||||
bool SupportsImage3D() const;
|
|
||||||
|
|
||||||
bool SupportsFloatImage2D(DataType data_type, int channels) const;
|
|
||||||
|
|
||||||
bool SupportsExtension(const std::string& extension) const;
|
|
||||||
bool IsCL20OrHigher() const;
|
|
||||||
bool IsCL30OrHigher() const;
|
|
||||||
bool SupportsSubGroupWithSize(int sub_group_size) const;
|
|
||||||
|
|
||||||
std::vector<std::string> extensions;
|
std::vector<std::string> extensions;
|
||||||
bool supports_fp16;
|
bool supports_fp16;
|
||||||
bool supports_image3d_writes;
|
bool supports_image3d_writes;
|
||||||
GpuVendor gpu_vendor;
|
|
||||||
int compute_units_count;
|
int compute_units_count;
|
||||||
uint64_t buffer_max_size;
|
uint64_t buffer_max_size;
|
||||||
uint64_t image2d_max_width;
|
uint64_t image2d_max_width;
|
||||||
@ -219,7 +195,6 @@ struct GpuInfo {
|
|||||||
int max_work_group_size_x;
|
int max_work_group_size_x;
|
||||||
int max_work_group_size_y;
|
int max_work_group_size_y;
|
||||||
int max_work_group_size_z;
|
int max_work_group_size_z;
|
||||||
std::vector<int> supported_subgroup_sizes;
|
|
||||||
|
|
||||||
// rtn is ROUND_TO_NEAREST
|
// rtn is ROUND_TO_NEAREST
|
||||||
// with rtn precision is much better then with rtz (ROUND_TO_ZERO)
|
// with rtn precision is much better then with rtz (ROUND_TO_ZERO)
|
||||||
@ -238,6 +213,54 @@ struct GpuInfo {
|
|||||||
bool supports_rg_f32_tex2d = false;
|
bool supports_rg_f32_tex2d = false;
|
||||||
bool supports_rgb_f32_tex2d = false;
|
bool supports_rgb_f32_tex2d = false;
|
||||||
bool supports_rgba_f32_tex2d = false;
|
bool supports_rgba_f32_tex2d = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct GpuInfo {
|
||||||
|
GpuInfo() = default;
|
||||||
|
|
||||||
|
bool IsAdreno() const;
|
||||||
|
bool IsApple() const;
|
||||||
|
bool IsMali() const;
|
||||||
|
bool IsPowerVR() const;
|
||||||
|
bool IsNvidia() const;
|
||||||
|
bool IsAMD() const;
|
||||||
|
bool IsIntel() const;
|
||||||
|
|
||||||
|
bool SupportsFP16() const;
|
||||||
|
|
||||||
|
bool SupportsTextureArray() const;
|
||||||
|
bool SupportsImageBuffer() const;
|
||||||
|
bool SupportsImage3D() const;
|
||||||
|
|
||||||
|
bool SupportsFloatImage2D(DataType data_type, int channels) const;
|
||||||
|
|
||||||
|
bool SupportsExtension(const std::string& extension) const;
|
||||||
|
bool IsCL20OrHigher() const;
|
||||||
|
bool IsCL30OrHigher() const;
|
||||||
|
bool SupportsSubGroupWithSize(int sub_group_size) const;
|
||||||
|
|
||||||
|
int GetComputeUnitsCount() const;
|
||||||
|
|
||||||
|
// floating point rounding mode
|
||||||
|
bool IsRoundToNearestSupported() const;
|
||||||
|
|
||||||
|
int GetMaxWorkGroupSizeForX() const;
|
||||||
|
int GetMaxWorkGroupSizeForY() const;
|
||||||
|
int GetMaxWorkGroupSizeForZ() const;
|
||||||
|
|
||||||
|
uint64_t GetMaxImage2DWidth() const;
|
||||||
|
uint64_t GetMaxImage2DHeight() const;
|
||||||
|
uint64_t GetMaxImage3DWidth() const;
|
||||||
|
uint64_t GetMaxImage3DHeight() const;
|
||||||
|
uint64_t GetMaxImage3DDepth() const;
|
||||||
|
|
||||||
|
uint64_t GetMaxBufferSize() const;
|
||||||
|
uint64_t GetMaxImageBufferWidth() const;
|
||||||
|
uint64_t GetMaxImage2DArrayLayers() const;
|
||||||
|
|
||||||
|
std::vector<int> supported_subgroup_sizes;
|
||||||
|
|
||||||
|
GpuVendor gpu_vendor;
|
||||||
|
|
||||||
AdrenoInfo adreno_info;
|
AdrenoInfo adreno_info;
|
||||||
MaliInfo mali_info;
|
MaliInfo mali_info;
|
||||||
|
@ -48,6 +48,39 @@ absl::Status CreateEnvironment(Environment* result, bool shared,
|
|||||||
return result->Init();
|
return result->Init();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool IsGpuSupportsStorageType(const GpuInfo& gpu_info,
|
||||||
|
TensorStorageType storage_type) {
|
||||||
|
switch (storage_type) {
|
||||||
|
case TensorStorageType::TEXTURE_2D:
|
||||||
|
return !gpu_info.IsAMD();
|
||||||
|
case TensorStorageType::BUFFER:
|
||||||
|
return true;
|
||||||
|
case TensorStorageType::TEXTURE_ARRAY:
|
||||||
|
return !gpu_info.IsAMD() && gpu_info.SupportsTextureArray();
|
||||||
|
case TensorStorageType::IMAGE_BUFFER:
|
||||||
|
return (gpu_info.IsAdreno() || gpu_info.IsAMD() || gpu_info.IsNvidia()) &&
|
||||||
|
gpu_info.SupportsImageBuffer();
|
||||||
|
case TensorStorageType::TEXTURE_3D:
|
||||||
|
return !gpu_info.IsAMD() && gpu_info.SupportsImage3D();
|
||||||
|
case TensorStorageType::SINGLE_TEXTURE_2D:
|
||||||
|
return false;
|
||||||
|
case TensorStorageType::UNKNOWN:
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool IsGpuSupportsPrecision(const GpuInfo& gpu_info,
|
||||||
|
CalculationsPrecision precision) {
|
||||||
|
switch (precision) {
|
||||||
|
case CalculationsPrecision::F32_F16:
|
||||||
|
case CalculationsPrecision::F16:
|
||||||
|
return gpu_info.SupportsFP16();
|
||||||
|
case CalculationsPrecision::F32:
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
Environment::Environment(CLDevice&& device, CLContext&& context,
|
Environment::Environment(CLDevice&& device, CLContext&& context,
|
||||||
@ -77,7 +110,8 @@ Environment& Environment::operator=(Environment&& environment) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
absl::Status Environment::Init() {
|
absl::Status Environment::Init() {
|
||||||
if (device().IsAdreno() && device().SupportsTextureArray()) {
|
if (device().GetInfo().IsAdreno() &&
|
||||||
|
device().GetInfo().SupportsTextureArray()) {
|
||||||
const auto& adreno_info = device().info_.adreno_info;
|
const auto& adreno_info = device().info_.adreno_info;
|
||||||
// Some Adreno < 600 have bug with one layer texture array. b/131099086
|
// Some Adreno < 600 have bug with one layer texture array. b/131099086
|
||||||
// If we have one layer texture array and will write smt from kernel to this
|
// If we have one layer texture array and will write smt from kernel to this
|
||||||
@ -117,13 +151,7 @@ std::vector<CalculationsPrecision> Environment::GetSupportedPrecisions() const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool Environment::IsSupported(CalculationsPrecision precision) const {
|
bool Environment::IsSupported(CalculationsPrecision precision) const {
|
||||||
switch (precision) {
|
return IsGpuSupportsPrecision(device_.GetInfo(), precision);
|
||||||
case CalculationsPrecision::F32_F16:
|
|
||||||
case CalculationsPrecision::F16:
|
|
||||||
return device_.SupportsFP16();
|
|
||||||
case CalculationsPrecision::F32:
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<TensorStorageType> Environment::GetSupportedStorages() const {
|
std::vector<TensorStorageType> Environment::GetSupportedStorages() const {
|
||||||
@ -153,24 +181,7 @@ Environment::GetSupportedStoragesWithHWZeroClampSupport() const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool Environment::IsSupported(TensorStorageType storage_type) const {
|
bool Environment::IsSupported(TensorStorageType storage_type) const {
|
||||||
switch (storage_type) {
|
return IsGpuSupportsStorageType(device_.GetInfo(), storage_type);
|
||||||
case TensorStorageType::TEXTURE_2D:
|
|
||||||
return !device_.IsAMD();
|
|
||||||
case TensorStorageType::BUFFER:
|
|
||||||
return true;
|
|
||||||
case TensorStorageType::TEXTURE_ARRAY:
|
|
||||||
return !device_.IsAMD() && device_.SupportsTextureArray();
|
|
||||||
case TensorStorageType::IMAGE_BUFFER:
|
|
||||||
return (device_.IsAdreno() || device_.IsAMD() || device_.IsNvidia()) &&
|
|
||||||
device_.SupportsImageBuffer();
|
|
||||||
case TensorStorageType::TEXTURE_3D:
|
|
||||||
return !device_.IsAMD() && device_.SupportsImage3D();
|
|
||||||
case TensorStorageType::SINGLE_TEXTURE_2D:
|
|
||||||
return false;
|
|
||||||
case TensorStorageType::UNKNOWN:
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
TensorStorageType GetFastestStorageType(const GpuInfo& gpu_info) {
|
TensorStorageType GetFastestStorageType(const GpuInfo& gpu_info) {
|
||||||
|
@ -89,7 +89,7 @@ absl::Status CreateClEventFromEglSync(cl_context context,
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool IsClEventFromEglSyncSupported(const CLDevice& device) {
|
bool IsClEventFromEglSyncSupported(const CLDevice& device) {
|
||||||
return device.SupportsExtension("cl_khr_egl_event");
|
return device.GetInfo().SupportsExtension("cl_khr_egl_event");
|
||||||
}
|
}
|
||||||
|
|
||||||
absl::Status CreateClMemoryFromGlBuffer(GLuint gl_ssbo_id,
|
absl::Status CreateClMemoryFromGlBuffer(GLuint gl_ssbo_id,
|
||||||
@ -126,7 +126,7 @@ absl::Status CreateClMemoryFromGlTexture(GLenum texture_target,
|
|||||||
|
|
||||||
bool IsGlSharingSupported(const CLDevice& device) {
|
bool IsGlSharingSupported(const CLDevice& device) {
|
||||||
return clCreateFromGLBuffer && clCreateFromGLTexture &&
|
return clCreateFromGLBuffer && clCreateFromGLTexture &&
|
||||||
device.SupportsExtension("cl_khr_gl_sharing");
|
device.GetInfo().SupportsExtension("cl_khr_gl_sharing");
|
||||||
}
|
}
|
||||||
|
|
||||||
AcquiredGlObjects::~AcquiredGlObjects() { Release({}, nullptr).IgnoreError(); }
|
AcquiredGlObjects::~AcquiredGlObjects() { Release({}, nullptr).IgnoreError(); }
|
||||||
|
@ -163,14 +163,14 @@ absl::Status InferenceContext::InitFromGraph(
|
|||||||
ReserveGraphTensors(create_info, creation_context.GetGpuInfo(), graph);
|
ReserveGraphTensors(create_info, creation_context.GetGpuInfo(), graph);
|
||||||
precision_ = create_info.precision;
|
precision_ = create_info.precision;
|
||||||
storage_type_ = create_info.storage_type;
|
storage_type_ = create_info.storage_type;
|
||||||
if (env->device().IsMali()) {
|
if (env->device().GetInfo().IsMali()) {
|
||||||
need_flush_ = true;
|
need_flush_ = true;
|
||||||
need_manual_release_ = true;
|
need_manual_release_ = true;
|
||||||
|
|
||||||
flush_periodically_ = true;
|
flush_periodically_ = true;
|
||||||
flush_period_ = 24;
|
flush_period_ = 24;
|
||||||
}
|
}
|
||||||
if (env->device().IsPowerVR()) {
|
if (env->device().GetInfo().IsPowerVR()) {
|
||||||
need_flush_ = true;
|
need_flush_ = true;
|
||||||
}
|
}
|
||||||
CopyInAndOutIds(graph);
|
CopyInAndOutIds(graph);
|
||||||
|
@ -139,7 +139,7 @@ ConvBuffer1x1::ConvParams GetBestParams(const GpuInfo& gpu_info,
|
|||||||
conv_params.element_size = 4;
|
conv_params.element_size = 4;
|
||||||
conv_params.block_size = int3(1, 1, 1);
|
conv_params.block_size = int3(1, 1, 1);
|
||||||
if (gpu_info.IsMali() && definition.precision == CalculationsPrecision::F16 &&
|
if (gpu_info.IsMali() && definition.precision == CalculationsPrecision::F16 &&
|
||||||
gpu_info.compute_units_count <= 4) {
|
gpu_info.GetComputeUnitsCount() <= 4) {
|
||||||
conv_params.block_size.x *= 2;
|
conv_params.block_size.x *= 2;
|
||||||
}
|
}
|
||||||
return conv_params;
|
return conv_params;
|
||||||
|
@ -1045,7 +1045,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
|
|||||||
if (dst_shape) {
|
if (dst_shape) {
|
||||||
int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth;
|
int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth;
|
||||||
float task_size_per_cu =
|
float task_size_per_cu =
|
||||||
static_cast<float>(task_size) / gpu_info.compute_units_count;
|
static_cast<float>(task_size) / gpu_info.GetComputeUnitsCount();
|
||||||
int block_size = conv_params.block_size.x * conv_params.block_size.y *
|
int block_size = conv_params.block_size.x * conv_params.block_size.y *
|
||||||
conv_params.block_size.w;
|
conv_params.block_size.w;
|
||||||
float threads_per_cu = task_size_per_cu / block_size;
|
float threads_per_cu = task_size_per_cu / block_size;
|
||||||
|
@ -95,7 +95,7 @@ MeanStdDevNormalization::MeanStdDevNormalization(const OperationDef& definition,
|
|||||||
// For now, fix workgroup size to the biggest supported by the device, but not
|
// For now, fix workgroup size to the biggest supported by the device, but not
|
||||||
// larger than the number of tensor slices.
|
// larger than the number of tensor slices.
|
||||||
int desired_work_group_size =
|
int desired_work_group_size =
|
||||||
std::min(tensor_slices, gpu_info.max_work_group_size_x);
|
std::min(tensor_slices, gpu_info.GetMaxWorkGroupSizeForX());
|
||||||
if (gpu_info.IsMali()) {
|
if (gpu_info.IsMali()) {
|
||||||
// Don't use more than 64 work items per work group on ARM Mali. They
|
// Don't use more than 64 work items per work group on ARM Mali. They
|
||||||
// implement local memory using the global memory, larger workgroups have
|
// implement local memory using the global memory, larger workgroups have
|
||||||
|
@ -118,7 +118,7 @@ int GetRecommendedBlockSizeForConv(const GpuInfo& gpu_info,
|
|||||||
CalculationsPrecision precision,
|
CalculationsPrecision precision,
|
||||||
int task_size) {
|
int task_size) {
|
||||||
const float task_size_per_cu =
|
const float task_size_per_cu =
|
||||||
task_size / static_cast<float>(gpu_info.compute_units_count);
|
task_size / static_cast<float>(gpu_info.GetComputeUnitsCount());
|
||||||
int block_size = 1;
|
int block_size = 1;
|
||||||
float threshold_1 = FLT_MAX;
|
float threshold_1 = FLT_MAX;
|
||||||
float threshold_2 = FLT_MAX;
|
float threshold_2 = FLT_MAX;
|
||||||
|
@ -78,9 +78,13 @@ TEST_F(OpenCLOperationTest, Winograd4x4To36) {
|
|||||||
for (auto precision : env_.GetSupportedPrecisions()) {
|
for (auto precision : env_.GetSupportedPrecisions()) {
|
||||||
float eps;
|
float eps;
|
||||||
if (precision == CalculationsPrecision::F32) {
|
if (precision == CalculationsPrecision::F32) {
|
||||||
eps = 1e-5f * (env_.device().SupportsFP32RTN() ? 1.0f : 4.0f);
|
eps = 1e-5f * (env_.device().GetInfo().opencl_info.supports_fp32_rtn
|
||||||
|
? 1.0f
|
||||||
|
: 4.0f);
|
||||||
} else {
|
} else {
|
||||||
eps = 1e-2f * (env_.device().SupportsFP16RTN() ? 1.0f : 4.0f);
|
eps = 1e-2f * (env_.device().GetInfo().opencl_info.supports_fp16_rtn
|
||||||
|
? 1.0f
|
||||||
|
: 4.0f);
|
||||||
}
|
}
|
||||||
OperationDef op_def;
|
OperationDef op_def;
|
||||||
op_def.precision = precision;
|
op_def.precision = precision;
|
||||||
@ -151,9 +155,13 @@ TEST_F(OpenCLOperationTest, Winograd36To4x4) {
|
|||||||
for (auto precision : env_.GetSupportedPrecisions()) {
|
for (auto precision : env_.GetSupportedPrecisions()) {
|
||||||
float eps;
|
float eps;
|
||||||
if (precision == CalculationsPrecision::F32) {
|
if (precision == CalculationsPrecision::F32) {
|
||||||
eps = 1e-5f * (env_.device().SupportsFP32RTN() ? 1.0f : 4.0f);
|
eps = 1e-5f * (env_.device().GetInfo().opencl_info.supports_fp32_rtn
|
||||||
|
? 1.0f
|
||||||
|
: 4.0f);
|
||||||
} else {
|
} else {
|
||||||
eps = 1e-2f * (env_.device().SupportsFP16RTN() ? 1.0f : 4.0f);
|
eps = 1e-2f * (env_.device().GetInfo().opencl_info.supports_fp16_rtn
|
||||||
|
? 1.0f
|
||||||
|
: 4.0f);
|
||||||
}
|
}
|
||||||
OperationDef op_def;
|
OperationDef op_def;
|
||||||
op_def.precision = precision;
|
op_def.precision = precision;
|
||||||
|
@ -52,9 +52,9 @@ std::vector<int3> GenerateWorkGroupSizesXYMultipleOf(
|
|||||||
if (work_group_size_xy * z > kernel_info.max_work_group_size) {
|
if (work_group_size_xy * z > kernel_info.max_work_group_size) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (x <= gpu_info.max_work_group_size_x &&
|
if (x <= gpu_info.GetMaxWorkGroupSizeForX() &&
|
||||||
y <= gpu_info.max_work_group_size_y &&
|
y <= gpu_info.GetMaxWorkGroupSizeForY() &&
|
||||||
z <= gpu_info.max_work_group_size_z) {
|
z <= gpu_info.GetMaxWorkGroupSizeForZ()) {
|
||||||
work_groups.push_back({x, y, z});
|
work_groups.push_back({x, y, z});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -78,9 +78,9 @@ std::vector<int3> GenerateWorkGroupSizesXMultipleOf(
|
|||||||
x += multiplier) {
|
x += multiplier) {
|
||||||
for (auto y : possible_y_sizes) {
|
for (auto y : possible_y_sizes) {
|
||||||
for (auto z : possible_z_sizes) {
|
for (auto z : possible_z_sizes) {
|
||||||
if (x <= gpu_info.max_work_group_size_x &&
|
if (x <= gpu_info.GetMaxWorkGroupSizeForX() &&
|
||||||
y <= gpu_info.max_work_group_size_y &&
|
y <= gpu_info.GetMaxWorkGroupSizeForY() &&
|
||||||
z <= gpu_info.max_work_group_size_z &&
|
z <= gpu_info.GetMaxWorkGroupSizeForZ() &&
|
||||||
x * y * z <= kernel_info.max_work_group_size) {
|
x * y * z <= kernel_info.max_work_group_size) {
|
||||||
work_groups.push_back({x, y, z});
|
work_groups.push_back({x, y, z});
|
||||||
}
|
}
|
||||||
@ -94,9 +94,9 @@ void GetWorkGroupsAlignedToGrid(const GpuInfo& gpu_info,
|
|||||||
const KernelInfo& kernel_info, const int3& grid,
|
const KernelInfo& kernel_info, const int3& grid,
|
||||||
std::vector<int3>* work_groups) {
|
std::vector<int3>* work_groups) {
|
||||||
int3 max_wg_size;
|
int3 max_wg_size;
|
||||||
max_wg_size.x = gpu_info.max_work_group_size_x;
|
max_wg_size.x = gpu_info.GetMaxWorkGroupSizeForX();
|
||||||
max_wg_size.y = gpu_info.max_work_group_size_y;
|
max_wg_size.y = gpu_info.GetMaxWorkGroupSizeForY();
|
||||||
max_wg_size.z = gpu_info.max_work_group_size_z;
|
max_wg_size.z = gpu_info.GetMaxWorkGroupSizeForZ();
|
||||||
GenerateWorkGroupSizesAlignedToGrid(
|
GenerateWorkGroupSizesAlignedToGrid(
|
||||||
grid, max_wg_size, kernel_info.max_work_group_size, work_groups);
|
grid, max_wg_size, kernel_info.max_work_group_size, work_groups);
|
||||||
}
|
}
|
||||||
@ -275,7 +275,7 @@ void GetPossibleWorkGroupsConv(TuningType tuning_type, const GpuInfo& gpu_info,
|
|||||||
if (gpu_info.IsAdreno()) {
|
if (gpu_info.IsAdreno()) {
|
||||||
max_z_size = gpu_info.adreno_info.IsAdreno3xx() ? 16 : 64;
|
max_z_size = gpu_info.adreno_info.IsAdreno3xx() ? 16 : 64;
|
||||||
}
|
}
|
||||||
max_z_size = std::min(max_z_size, gpu_info.max_work_group_size_z);
|
max_z_size = std::min(max_z_size, gpu_info.GetMaxWorkGroupSizeForZ());
|
||||||
work_groups->push_back(
|
work_groups->push_back(
|
||||||
GetWorkGroupConv(grid, kernel_info.max_work_group_size, max_z_size));
|
GetWorkGroupConv(grid, kernel_info.max_work_group_size, max_z_size));
|
||||||
return;
|
return;
|
||||||
|
@ -33,11 +33,11 @@ bool CanCreateTensorWithShape(const GpuInfo& gpu_info, const BHWDC& shape,
|
|||||||
4 * (descriptor.data_type == DataType::FLOAT32 ? 4 : 2);
|
4 * (descriptor.data_type == DataType::FLOAT32 ? 4 : 2);
|
||||||
const int buffer_size =
|
const int buffer_size =
|
||||||
shape.b * shape.w * shape.h * shape.d * slices * flt4_size;
|
shape.b * shape.w * shape.h * shape.d * slices * flt4_size;
|
||||||
return buffer_size <= gpu_info.buffer_max_size;
|
return buffer_size <= gpu_info.GetMaxBufferSize();
|
||||||
}
|
}
|
||||||
case TensorStorageType::IMAGE_BUFFER:
|
case TensorStorageType::IMAGE_BUFFER:
|
||||||
return shape.b * shape.w * shape.h * shape.d * slices <=
|
return shape.b * shape.w * shape.h * shape.d * slices <=
|
||||||
gpu_info.image_buffer_max_size;
|
gpu_info.GetMaxImageBufferWidth();
|
||||||
case TensorStorageType::TEXTURE_3D:
|
case TensorStorageType::TEXTURE_3D:
|
||||||
if (gpu_info.opencl_info.cl_version < OpenClVersion::kCl1_2 &&
|
if (gpu_info.opencl_info.cl_version < OpenClVersion::kCl1_2 &&
|
||||||
slices == 1) {
|
slices == 1) {
|
||||||
@ -45,26 +45,26 @@ bool CanCreateTensorWithShape(const GpuInfo& gpu_info, const BHWDC& shape,
|
|||||||
// depth = 1 by specification;
|
// depth = 1 by specification;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return shape.w * shape.b <= gpu_info.image3d_max_width &&
|
return shape.w * shape.b <= gpu_info.GetMaxImage3DWidth() &&
|
||||||
shape.h <= gpu_info.image3d_max_height &&
|
shape.h <= gpu_info.GetMaxImage3DHeight() &&
|
||||||
slices * shape.d <= gpu_info.image3d_max_depth;
|
slices * shape.d <= gpu_info.GetMaxImage3DDepth();
|
||||||
case TensorStorageType::TEXTURE_ARRAY:
|
case TensorStorageType::TEXTURE_ARRAY:
|
||||||
// Bug on some Adreno. b/131099086
|
// Bug on some Adreno. b/131099086
|
||||||
if (slices == 1 && gpu_info.IsAdreno() &&
|
if (slices == 1 && gpu_info.IsAdreno() &&
|
||||||
!gpu_info.adreno_info.support_one_layer_texture_array) {
|
!gpu_info.adreno_info.support_one_layer_texture_array) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return shape.w * shape.b <= gpu_info.image2d_max_width &&
|
return shape.w * shape.b <= gpu_info.GetMaxImage2DWidth() &&
|
||||||
shape.h <= gpu_info.image2d_max_height &&
|
shape.h <= gpu_info.GetMaxImage2DHeight() &&
|
||||||
slices * shape.d <= gpu_info.image_array_max_layers;
|
slices * shape.d <= gpu_info.GetMaxImage2DArrayLayers();
|
||||||
case TensorStorageType::TEXTURE_2D:
|
case TensorStorageType::TEXTURE_2D:
|
||||||
return shape.w * shape.b * shape.d <= gpu_info.image2d_max_width &&
|
return shape.w * shape.b * shape.d <= gpu_info.GetMaxImage2DWidth() &&
|
||||||
shape.h * slices <= gpu_info.image2d_max_height;
|
shape.h * slices <= gpu_info.GetMaxImage2DHeight();
|
||||||
case TensorStorageType::SINGLE_TEXTURE_2D:
|
case TensorStorageType::SINGLE_TEXTURE_2D:
|
||||||
return shape.c <= 4 &&
|
return shape.c <= 4 &&
|
||||||
gpu_info.SupportsFloatImage2D(descriptor.data_type, shape.c) &&
|
gpu_info.SupportsFloatImage2D(descriptor.data_type, shape.c) &&
|
||||||
shape.w * shape.b * shape.d <= gpu_info.image2d_max_width &&
|
shape.w * shape.b * shape.d <= gpu_info.GetMaxImage2DWidth() &&
|
||||||
shape.h <= gpu_info.image2d_max_height;
|
shape.h <= gpu_info.GetMaxImage2DHeight();
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user