Fixed TransposedConv4x4.

Changed Softmax/FullyConnected/Conv barrier definition.

PiperOrigin-RevId: 304517962
Change-Id: Iedf1fb2c4d810abd5a14dbe245825aafe7a8ceb9
This commit is contained in:
Raman Sarokin 2020-04-02 18:17:37 -07:00 committed by TensorFlower Gardener
parent 2883cb5fb1
commit d27151c789
7 changed files with 54 additions and 15 deletions

View File

@ -111,7 +111,7 @@ using ::tflite::gpu::ValueId;
@"TO_ACCUM2_TYPE" : toAccumulatorType2,
@"TO_ACCUM3_TYPE" : toAccumulatorType3,
@"TO_ACCUM4_TYPE" : toAccumulatorType4,
@"BARRIER" : barrier,
@"SIMDGROUP_BARRIER" : barrier,
};
NSString* code = [NSString stringWithCString:desc->shader_source.c_str()

View File

@ -57,6 +57,9 @@ struct AppleGPUInfo {
// floating point rounding mode
bool IsRoundToNearestSupported() const;
// returns true if the device has a fixed wave size equal to 32
bool IsWaveSizeEqualTo32() const;
int GetComputeUnitsCount() const;
};
@ -75,6 +78,9 @@ struct DeviceInfo {
// floating point rounding mode
bool IsRoundToNearestSupported() const;
// returns true if the device has a fixed wave size equal to 32
bool IsWaveSizeEqualTo32() const;
int GetComputeUnitsCount() const;
};

View File

@ -78,6 +78,10 @@ bool AppleGPUInfo::IsRoundToNearestSupported() const {
return IsBionic();
}
// Reports whether this GPU has a fixed wave size equal to 32.
// The answer is unconditionally true: it does not depend on gpu_type or any
// other member.
bool AppleGPUInfo::IsWaveSizeEqualTo32() const {
return true;
}
int AppleGPUInfo::GetComputeUnitsCount() const {
switch (gpu_type) {
case AppleGPU::kA7:
@ -135,6 +139,14 @@ bool DeviceInfo::IsRoundToNearestSupported() const {
}
}
// Reports whether the device has a fixed wave size equal to 32.
// For Apple devices the question is forwarded to apple_info; every other
// vendor yields false.
bool DeviceInfo::IsWaveSizeEqualTo32() const {
  // Short-circuit: apple_info is consulted only when the vendor is Apple.
  return vendor == Vendor::kApple && apple_info.IsWaveSizeEqualTo32();
}
int DeviceInfo::GetComputeUnitsCount() const {
if (vendor == Vendor::kApple) {
return apple_info.GetComputeUnitsCount();

View File

@ -397,11 +397,11 @@ kernel void ComputeFunction(
const int total_work_items = params.work_group_size.x *
params.work_group_size.y *
params.work_group_size.z;
c += " BARRIER(mem_flags::mem_none);\n";
c += " SIMDGROUP_BARRIER(mem_flags::mem_none);\n";
c += GenerateUploadByThreads("weights_cache", "tmp",
/*global_offset_name*/ "", "tid",
total_work_items, local_mem_size);
c += " BARRIER(mem_flags::mem_threadgroup);\n";
c += " SIMDGROUP_BARRIER(mem_flags::mem_threadgroup);\n";
} else if (use_simd_broadcast) {
int parts = local_mem_size / simd_size;
int reminder = local_mem_size % simd_size;

View File

@ -42,8 +42,9 @@ std::string GetFullyConnectedCode(const DeviceInfo& device_info,
bool shared_memory =
device_info.IsAppleGPU() &&
device_info.apple_info.IsLocalMemoryPreferredOverGlobal();
const std::string barrier =
device_info.IsAppleGPU() ? "BARRIER" : "threadgroup_barrier";
const std::string barrier = device_info.IsWaveSizeEqualTo32()
? "SIMDGROUP_BARRIER"
: "threadgroup_barrier";
const int src_depth = IntegralDivideRoundUp(src_channels, 4);
std::stringstream code;
code << R"(

View File

@ -33,8 +33,9 @@ namespace gpu {
namespace metal {
namespace {
std::string GetSoftmax1x1Code(const DeviceInfo& device_info) {
const std::string barrier =
device_info.IsAppleGPU() ? "BARRIER" : "threadgroup_barrier";
const std::string barrier = device_info.IsWaveSizeEqualTo32()
? "SIMDGROUP_BARRIER"
: "threadgroup_barrier";
std::string code = R"(
#include <metal_stdlib>
using namespace metal;

View File

@ -275,7 +275,18 @@ std::string GetDeconvolutionShared(const ConvolutionTransposedAttributes& attr,
src_local_size_x, src_local_size_y, workgroup_x, workgroup_y);
}
std::string GetDeconvolution4x4(const int2& block_size, bool use_local_mem) {
std::string GetDeconvolution4x4(const int2& block_size,
const DeviceInfo& device_info) {
bool use_local_mem = false;
if (device_info.IsAppleGPU() && device_info.apple_info.IsBionic()) {
use_local_mem = true;
}
if (device_info.IsIntelGPU()) {
use_local_mem = true;
}
const std::string barrier = device_info.IsWaveSizeEqualTo32()
? "SIMDGROUP_BARRIER"
: "threadgroup_barrier";
std::string c = R"(
#include <metal_stdlib>
using namespace metal;
@ -349,7 +360,7 @@ std::string GetDeconvolution4x4(const int2& block_size, bool use_local_mem) {
}
c += " for (int s = 0; s < params.src_size.z; ++s) {\n";
if (use_local_mem) {
c += " BARRIER(mem_flags::mem_none);\n";
c += " " + barrier + "(mem_flags::mem_none);\n";
c += " weights_cache[local_id] = filters[f_offset + local_id];\n";
c += " weights_cache[local_id + 32] = filters[f_offset + local_id + "
"32];\n";
@ -365,7 +376,7 @@ std::string GetDeconvolution4x4(const int2& block_size, bool use_local_mem) {
}
c += " f_offset += 64;\n";
if (use_local_mem) {
c += " BARRIER(mem_flags::mem_threadgroup);\n";
c += " " + barrier + "(mem_flags::mem_threadgroup);\n";
}
for (int i = 0; i < 16; ++i) {
const int result_sub_pixel_id = i % 4;
@ -595,12 +606,20 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed4x4(
desc->id = id;
desc->is_linkable = false;
const bool recommended_2x =
device_info.apple_info.IsBionic() &&
options.storage_precision == RuntimeOptions::Precision::FP16;
const bool use_local_mem = !device_info.apple_info.IsBionic();
bool recommended_2x = false;
if (device_info.IsAppleGPU()) {
if (device_info.apple_info.IsBionic() &&
options.storage_precision == RuntimeOptions::Precision::FP16) {
recommended_2x = true;
}
} else {
if (options.storage_precision == RuntimeOptions::Precision::FP16) {
recommended_2x = true;
}
}
const int2 block_size(recommended_2x ? 2 : 1, 1);
desc->shader_source = GetDeconvolution4x4(block_size, use_local_mem);
desc->shader_source = GetDeconvolution4x4(block_size, device_info);
desc->input_buffers = {
{input_id, "device FLT4* const src_buffer"},