diff --git a/tensorflow/lite/delegates/gpu/metal/compute_task.mm b/tensorflow/lite/delegates/gpu/metal/compute_task.mm index d3e3466ca6f..88be8676651 100644 --- a/tensorflow/lite/delegates/gpu/metal/compute_task.mm +++ b/tensorflow/lite/delegates/gpu/metal/compute_task.mm @@ -111,7 +111,7 @@ using ::tflite::gpu::ValueId; @"TO_ACCUM2_TYPE" : toAccumulatorType2, @"TO_ACCUM3_TYPE" : toAccumulatorType3, @"TO_ACCUM4_TYPE" : toAccumulatorType4, - @"BARRIER" : barrier, + @"SIMDGROUP_BARRIER" : barrier, }; NSString* code = [NSString stringWithCString:desc->shader_source.c_str() diff --git a/tensorflow/lite/delegates/gpu/metal/environment.h b/tensorflow/lite/delegates/gpu/metal/environment.h index 732dbe1d18b..14c8860dee2 100644 --- a/tensorflow/lite/delegates/gpu/metal/environment.h +++ b/tensorflow/lite/delegates/gpu/metal/environment.h @@ -57,6 +57,9 @@ struct AppleGPUInfo { // floating point rounding mode bool IsRoundToNearestSupported() const; + // returns true if the device has a fixed wave size equal to 32 + bool IsWaveSizeEqualTo32() const; + int GetComputeUnitsCount() const; }; @@ -75,6 +78,9 @@ struct DeviceInfo { // floating point rounding mode bool IsRoundToNearestSupported() const; + // returns true if the device has a fixed wave size equal to 32 + bool IsWaveSizeEqualTo32() const; + int GetComputeUnitsCount() const; }; diff --git a/tensorflow/lite/delegates/gpu/metal/environment.mm b/tensorflow/lite/delegates/gpu/metal/environment.mm index 78376b70c8c..f08a9beef47 100644 --- a/tensorflow/lite/delegates/gpu/metal/environment.mm +++ b/tensorflow/lite/delegates/gpu/metal/environment.mm @@ -78,6 +78,10 @@ bool AppleGPUInfo::IsRoundToNearestSupported() const { return IsBionic(); } +bool AppleGPUInfo::IsWaveSizeEqualTo32() const { + return true; +} + int AppleGPUInfo::GetComputeUnitsCount() const { switch (gpu_type) { case AppleGPU::kA7: @@ -135,6 +139,14 @@ bool DeviceInfo::IsRoundToNearestSupported() const { } } +bool DeviceInfo::IsWaveSizeEqualTo32() const { + if (vendor 
== Vendor::kApple) { + return apple_info.IsWaveSizeEqualTo32(); + } else { + return false; + } +} + int DeviceInfo::GetComputeUnitsCount() const { if (vendor == Vendor::kApple) { return apple_info.GetComputeUnitsCount(); diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/conv.cc b/tensorflow/lite/delegates/gpu/metal/kernels/conv.cc index 0a851a00a57..f9ff87e75e2 100644 --- a/tensorflow/lite/delegates/gpu/metal/kernels/conv.cc +++ b/tensorflow/lite/delegates/gpu/metal/kernels/conv.cc @@ -397,11 +397,11 @@ kernel void ComputeFunction( const int total_work_items = params.work_group_size.x * params.work_group_size.y * params.work_group_size.z; - c += " BARRIER(mem_flags::mem_none);\n"; + c += " SIMDGROUP_BARRIER(mem_flags::mem_none);\n"; c += GenerateUploadByThreads("weights_cache", "tmp", /*global_offset_name*/ "", "tid", total_work_items, local_mem_size); - c += " BARRIER(mem_flags::mem_threadgroup);\n"; + c += " SIMDGROUP_BARRIER(mem_flags::mem_threadgroup);\n"; } else if (use_simd_broadcast) { int parts = local_mem_size / simd_size; int reminder = local_mem_size % simd_size; diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/fully_connected.cc b/tensorflow/lite/delegates/gpu/metal/kernels/fully_connected.cc index d3f46fad4d3..cfc727123a2 100644 --- a/tensorflow/lite/delegates/gpu/metal/kernels/fully_connected.cc +++ b/tensorflow/lite/delegates/gpu/metal/kernels/fully_connected.cc @@ -42,8 +42,9 @@ std::string GetFullyConnectedCode(const DeviceInfo& device_info, bool shared_memory = device_info.IsAppleGPU() && device_info.apple_info.IsLocalMemoryPreferredOverGlobal(); - const std::string barrier = - device_info.IsAppleGPU() ? "BARRIER" : "threadgroup_barrier"; + const std::string barrier = device_info.IsWaveSizeEqualTo32() + ? 
"SIMDGROUP_BARRIER" + : "threadgroup_barrier"; const int src_depth = IntegralDivideRoundUp(src_channels, 4); std::stringstream code; code << R"( diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/softmax.cc b/tensorflow/lite/delegates/gpu/metal/kernels/softmax.cc index de0c3885e6f..0ed2e0650e1 100644 --- a/tensorflow/lite/delegates/gpu/metal/kernels/softmax.cc +++ b/tensorflow/lite/delegates/gpu/metal/kernels/softmax.cc @@ -33,8 +33,9 @@ namespace gpu { namespace metal { namespace { std::string GetSoftmax1x1Code(const DeviceInfo& device_info) { - const std::string barrier = - device_info.IsAppleGPU() ? "BARRIER" : "threadgroup_barrier"; + const std::string barrier = device_info.IsWaveSizeEqualTo32() + ? "SIMDGROUP_BARRIER" + : "threadgroup_barrier"; std::string code = R"( #include using namespace metal; diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/transpose_conv.cc b/tensorflow/lite/delegates/gpu/metal/kernels/transpose_conv.cc index 1b6e6963fb5..56630c5d2af 100644 --- a/tensorflow/lite/delegates/gpu/metal/kernels/transpose_conv.cc +++ b/tensorflow/lite/delegates/gpu/metal/kernels/transpose_conv.cc @@ -275,7 +275,18 @@ std::string GetDeconvolutionShared(const ConvolutionTransposedAttributes& attr, src_local_size_x, src_local_size_y, workgroup_x, workgroup_y); } -std::string GetDeconvolution4x4(const int2& block_size, bool use_local_mem) { +std::string GetDeconvolution4x4(const int2& block_size, + const DeviceInfo& device_info) { + bool use_local_mem = false; + if (device_info.IsAppleGPU() && device_info.apple_info.IsBionic()) { + use_local_mem = true; + } + if (device_info.IsIntelGPU()) { + use_local_mem = true; + } + const std::string barrier = device_info.IsWaveSizeEqualTo32() + ? 
"SIMDGROUP_BARRIER" + : "threadgroup_barrier"; std::string c = R"( #include using namespace metal; @@ -349,7 +360,7 @@ std::string GetDeconvolution4x4(const int2& block_size, bool use_local_mem) { } c += " for (int s = 0; s < params.src_size.z; ++s) {\n"; if (use_local_mem) { - c += " BARRIER(mem_flags::mem_none);\n"; + c += " " + barrier + "(mem_flags::mem_none);\n"; c += " weights_cache[local_id] = filters[f_offset + local_id];\n"; c += " weights_cache[local_id + 32] = filters[f_offset + local_id + " "32];\n"; @@ -365,7 +376,7 @@ std::string GetDeconvolution4x4(const int2& block_size, bool use_local_mem) { } c += " f_offset += 64;\n"; if (use_local_mem) { - c += " BARRIER(mem_flags::mem_threadgroup);\n"; + c += " " + barrier + "(mem_flags::mem_threadgroup);\n"; } for (int i = 0; i < 16; ++i) { const int result_sub_pixel_id = i % 4; @@ -595,12 +606,20 @@ std::vector ConvolutionTransposed4x4( desc->id = id; desc->is_linkable = false; - const bool recommended_2x = - device_info.apple_info.IsBionic() && - options.storage_precision == RuntimeOptions::Precision::FP16; - const bool use_local_mem = !device_info.apple_info.IsBionic(); + bool recommended_2x = false; + if (device_info.IsAppleGPU()) { + if (device_info.apple_info.IsBionic() && + options.storage_precision == RuntimeOptions::Precision::FP16) { + recommended_2x = true; + } + } else { + if (options.storage_precision == RuntimeOptions::Precision::FP16) { + recommended_2x = true; + } + } + const int2 block_size(recommended_2x ? 2 : 1, 1); - desc->shader_source = GetDeconvolution4x4(block_size, use_local_mem); + desc->shader_source = GetDeconvolution4x4(block_size, device_info); desc->input_buffers = { {input_id, "device FLT4* const src_buffer"},