Fixed TransposedConv4x4.
Changed Softmax/FullyConnected/Conv barrier definition. PiperOrigin-RevId: 304517962 Change-Id: Iedf1fb2c4d810abd5a14dbe245825aafe7a8ceb9
This commit is contained in:
parent
2883cb5fb1
commit
d27151c789
@ -111,7 +111,7 @@ using ::tflite::gpu::ValueId;
|
||||
@"TO_ACCUM2_TYPE" : toAccumulatorType2,
|
||||
@"TO_ACCUM3_TYPE" : toAccumulatorType3,
|
||||
@"TO_ACCUM4_TYPE" : toAccumulatorType4,
|
||||
@"BARRIER" : barrier,
|
||||
@"SIMDGROUP_BARRIER" : barrier,
|
||||
};
|
||||
|
||||
NSString* code = [NSString stringWithCString:desc->shader_source.c_str()
|
||||
|
@ -57,6 +57,9 @@ struct AppleGPUInfo {
|
||||
// floating point rounding mode
|
||||
bool IsRoundToNearestSupported() const;
|
||||
|
||||
// returns true if the device has a fixed wave size equal to 32
|
||||
bool IsWaveSizeEqualTo32() const;
|
||||
|
||||
int GetComputeUnitsCount() const;
|
||||
};
|
||||
|
||||
@ -75,6 +78,9 @@ struct DeviceInfo {
|
||||
// floating point rounding mode
|
||||
bool IsRoundToNearestSupported() const;
|
||||
|
||||
// returns true if the device has a fixed wave size equal to 32
|
||||
bool IsWaveSizeEqualTo32() const;
|
||||
|
||||
int GetComputeUnitsCount() const;
|
||||
};
|
||||
|
||||
|
@ -78,6 +78,10 @@ bool AppleGPUInfo::IsRoundToNearestSupported() const {
|
||||
return IsBionic();
|
||||
}
|
||||
|
||||
// Reports whether this Apple GPU has a fixed wave size equal to 32.
// The answer does not depend on gpu_type: it is unconditionally true.
bool AppleGPUInfo::IsWaveSizeEqualTo32() const {
|
||||
return true;
|
||||
}
|
||||
|
||||
int AppleGPUInfo::GetComputeUnitsCount() const {
|
||||
switch (gpu_type) {
|
||||
case AppleGPU::kA7:
|
||||
@ -135,6 +139,14 @@ bool DeviceInfo::IsRoundToNearestSupported() const {
|
||||
}
|
||||
}
|
||||
|
||||
// Reports whether the device has a fixed wave size equal to 32.
// Apple devices defer to apple_info; every other vendor reports false.
bool DeviceInfo::IsWaveSizeEqualTo32() const {
|
||||
if (vendor == Vendor::kApple) {
|
||||
return apple_info.IsWaveSizeEqualTo32();
|
||||
} else {
|
||||
// Non-Apple vendors are never reported as having a fixed wave size of 32.
return false;
|
||||
}
|
||||
}
|
||||
|
||||
int DeviceInfo::GetComputeUnitsCount() const {
|
||||
if (vendor == Vendor::kApple) {
|
||||
return apple_info.GetComputeUnitsCount();
|
||||
|
@ -397,11 +397,11 @@ kernel void ComputeFunction(
|
||||
const int total_work_items = params.work_group_size.x *
|
||||
params.work_group_size.y *
|
||||
params.work_group_size.z;
|
||||
c += " BARRIER(mem_flags::mem_none);\n";
|
||||
c += " SIMDGROUP_BARRIER(mem_flags::mem_none);\n";
|
||||
c += GenerateUploadByThreads("weights_cache", "tmp",
|
||||
/*global_offset_name*/ "", "tid",
|
||||
total_work_items, local_mem_size);
|
||||
c += " BARRIER(mem_flags::mem_threadgroup);\n";
|
||||
c += " SIMDGROUP_BARRIER(mem_flags::mem_threadgroup);\n";
|
||||
} else if (use_simd_broadcast) {
|
||||
int parts = local_mem_size / simd_size;
|
||||
int reminder = local_mem_size % simd_size;
|
||||
|
@ -42,8 +42,9 @@ std::string GetFullyConnectedCode(const DeviceInfo& device_info,
|
||||
bool shared_memory =
|
||||
device_info.IsAppleGPU() &&
|
||||
device_info.apple_info.IsLocalMemoryPreferredOverGlobal();
|
||||
const std::string barrier =
|
||||
device_info.IsAppleGPU() ? "BARRIER" : "threadgroup_barrier";
|
||||
const std::string barrier = device_info.IsWaveSizeEqualTo32()
|
||||
? "SIMDGROUP_BARRIER"
|
||||
: "threadgroup_barrier";
|
||||
const int src_depth = IntegralDivideRoundUp(src_channels, 4);
|
||||
std::stringstream code;
|
||||
code << R"(
|
||||
|
@ -33,8 +33,9 @@ namespace gpu {
|
||||
namespace metal {
|
||||
namespace {
|
||||
std::string GetSoftmax1x1Code(const DeviceInfo& device_info) {
|
||||
const std::string barrier =
|
||||
device_info.IsAppleGPU() ? "BARRIER" : "threadgroup_barrier";
|
||||
const std::string barrier = device_info.IsWaveSizeEqualTo32()
|
||||
? "SIMDGROUP_BARRIER"
|
||||
: "threadgroup_barrier";
|
||||
std::string code = R"(
|
||||
#include <metal_stdlib>
|
||||
using namespace metal;
|
||||
|
@ -275,7 +275,18 @@ std::string GetDeconvolutionShared(const ConvolutionTransposedAttributes& attr,
|
||||
src_local_size_x, src_local_size_y, workgroup_x, workgroup_y);
|
||||
}
|
||||
|
||||
std::string GetDeconvolution4x4(const int2& block_size, bool use_local_mem) {
|
||||
std::string GetDeconvolution4x4(const int2& block_size,
|
||||
const DeviceInfo& device_info) {
|
||||
bool use_local_mem = false;
|
||||
if (device_info.IsAppleGPU() && device_info.apple_info.IsBionic()) {
|
||||
use_local_mem = true;
|
||||
}
|
||||
if (device_info.IsIntelGPU()) {
|
||||
use_local_mem = true;
|
||||
}
|
||||
const std::string barrier = device_info.IsWaveSizeEqualTo32()
|
||||
? "SIMDGROUP_BARRIER"
|
||||
: "threadgroup_barrier";
|
||||
std::string c = R"(
|
||||
#include <metal_stdlib>
|
||||
using namespace metal;
|
||||
@ -349,7 +360,7 @@ std::string GetDeconvolution4x4(const int2& block_size, bool use_local_mem) {
|
||||
}
|
||||
c += " for (int s = 0; s < params.src_size.z; ++s) {\n";
|
||||
if (use_local_mem) {
|
||||
c += " BARRIER(mem_flags::mem_none);\n";
|
||||
c += " " + barrier + "(mem_flags::mem_none);\n";
|
||||
c += " weights_cache[local_id] = filters[f_offset + local_id];\n";
|
||||
c += " weights_cache[local_id + 32] = filters[f_offset + local_id + "
|
||||
"32];\n";
|
||||
@ -365,7 +376,7 @@ std::string GetDeconvolution4x4(const int2& block_size, bool use_local_mem) {
|
||||
}
|
||||
c += " f_offset += 64;\n";
|
||||
if (use_local_mem) {
|
||||
c += " BARRIER(mem_flags::mem_threadgroup);\n";
|
||||
c += " " + barrier + "(mem_flags::mem_threadgroup);\n";
|
||||
}
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
const int result_sub_pixel_id = i % 4;
|
||||
@ -595,12 +606,20 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed4x4(
|
||||
desc->id = id;
|
||||
desc->is_linkable = false;
|
||||
|
||||
const bool recommended_2x =
|
||||
device_info.apple_info.IsBionic() &&
|
||||
options.storage_precision == RuntimeOptions::Precision::FP16;
|
||||
const bool use_local_mem = !device_info.apple_info.IsBionic();
|
||||
bool recommended_2x = false;
|
||||
if (device_info.IsAppleGPU()) {
|
||||
if (device_info.apple_info.IsBionic() &&
|
||||
options.storage_precision == RuntimeOptions::Precision::FP16) {
|
||||
recommended_2x = true;
|
||||
}
|
||||
} else {
|
||||
if (options.storage_precision == RuntimeOptions::Precision::FP16) {
|
||||
recommended_2x = true;
|
||||
}
|
||||
}
|
||||
|
||||
const int2 block_size(recommended_2x ? 2 : 1, 1);
|
||||
desc->shader_source = GetDeconvolution4x4(block_size, use_local_mem);
|
||||
desc->shader_source = GetDeconvolution4x4(block_size, device_info);
|
||||
|
||||
desc->input_buffers = {
|
||||
{input_id, "device FLT4* const src_buffer"},
|
||||
|
Loading…
x
Reference in New Issue
Block a user