Return a failed status (instead of crashing) when block count is too high
PiperOrigin-RevId: 358360541
Change-Id: Ia08a289c1368bfb2c8b726e707c8bb94a208d224
parent 0a5daafe5d · commit db445bbf91
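Before this change, an HLO whose element count implied more thread blocks than the device's grid-x limit would fail at kernel-launch time; now CalculateLaunchDimensions returns StatusOr<LaunchDimensions> and surfaces the problem as an Unimplemented status during compilation. Below is a standalone sketch of that guard, not XLA's actual code: the pared-down GpuDeviceInfo, the CheckBlockCount helper, and the 1024-thread block are illustrative assumptions; only the field names and the error text mirror the diff.

// Standalone sketch (no TensorFlow dependencies) of the new block-count guard.
#include <cstdint>
#include <iostream>
#include <optional>
#include <string>

struct GpuDeviceInfo {
  int threads_per_block_limit = 1024;      // typical CUDA per-block limit (assumed)
  int64_t block_dim_limit_x = 2147483647;  // grid-x limit on CC >= 3.0 parts
};

// Returns an error message if the required block count exceeds the
// hardware grid-x limit, mirroring the new Unimplemented error path.
std::optional<std::string> CheckBlockCount(int64_t num_elements,
                                           const GpuDeviceInfo& info) {
  int64_t threads = info.threads_per_block_limit;
  int64_t block_count = (num_elements + threads - 1) / threads;  // ceil div
  if (info.block_dim_limit_x > 0 && block_count >= info.block_dim_limit_x) {
    return "Kernel launch needs more blocks (" + std::to_string(block_count) +
           ") than allowed by hardware (" +
           std::to_string(info.block_dim_limit_x) + ").";
  }
  return std::nullopt;
}

int main() {
  // 4 * 1048576 * 1048576 elements: the shape used by the regression test below.
  if (auto err = CheckBlockCount(4LL * 1048576 * 1048576, GpuDeviceInfo{})) {
    std::cout << *err << "\n";  // prints the "needs more blocks" error
  }
}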
@@ -615,8 +615,15 @@ static StatusOr<bool> DeviceCompare(se::Stream* stream,
   gpu_device_info.threads_per_core_limit =
       executor->GetDeviceDescription().threads_per_core_limit();
   gpu_device_info.core_count = executor->GetDeviceDescription().core_count();
-  LaunchDimensions dim =
-      CalculateLaunchDimensions(buffer_shape, gpu_device_info);
+  gpu_device_info.block_dim_limit_x =
+      executor->GetDeviceDescription().block_dim_limit().x;
+  gpu_device_info.block_dim_limit_y =
+      executor->GetDeviceDescription().block_dim_limit().y;
+  gpu_device_info.block_dim_limit_z =
+      executor->GetDeviceDescription().block_dim_limit().z;
+
+  TF_ASSIGN_OR_RETURN(LaunchDimensions dim,
+                      CalculateLaunchDimensions(buffer_shape, gpu_device_info));
 
   LaunchDimensions::Dim3D thread_counts = dim.thread_counts_per_block();
   LaunchDimensions::Dim3D block_counts = dim.block_counts();
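Every call site below adopts the same idiom: TF_ASSIGN_OR_RETURN unwraps the StatusOr on success and early-returns the error otherwise. A rough self-contained illustration using absl::StatusOr (the real TensorFlow macro hides the temporary behind a uniquely named variable):

// Simplified sketch of what TF_ASSIGN_OR_RETURN(int blocks, ComputeBlockCount())
// does; ComputeBlockCount is a hypothetical stand-in for CalculateLaunchDimensions.
#include <utility>

#include "absl/status/status.h"
#include "absl/status/statusor.h"

absl::StatusOr<int> ComputeBlockCount() { return 42; }

absl::Status Caller() {
  absl::StatusOr<int> blocks_or = ComputeBlockCount();
  if (!blocks_or.ok()) {
    return blocks_or.status();  // propagate the failure to the caller
  }
  int blocks = std::move(blocks_or).value();
  (void)blocks;  // used the way the emitters below use launch dimensions
  return absl::OkStatus();
}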
@@ -971,6 +971,12 @@ GpuDeviceInfo GetGpuDeviceInfo(se::StreamExecutor* stream_exec) {
   gpu_device_info.threads_per_core_limit =
       stream_exec->GetDeviceDescription().threads_per_core_limit();
   gpu_device_info.core_count = stream_exec->GetDeviceDescription().core_count();
+  gpu_device_info.block_dim_limit_x =
+      stream_exec->GetDeviceDescription().block_dim_limit().x;
+  gpu_device_info.block_dim_limit_y =
+      stream_exec->GetDeviceDescription().block_dim_limit().y;
+  gpu_device_info.block_dim_limit_z =
+      stream_exec->GetDeviceDescription().block_dim_limit().z;
   return gpu_device_info;
 }
 
@@ -34,6 +34,9 @@ struct GpuDeviceInfo {
   int shared_memory_per_block;
   int threads_per_core_limit;
   int core_count;
+  int block_dim_limit_x;
+  int block_dim_limit_y;
+  int block_dim_limit_z;
 };
 }  // namespace gpu
 }  // namespace xla
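The three new fields record the device's maximum grid dimensions, populated from StreamExecutor's DeviceDescription as shown above. For reference, these are the same limits the CUDA runtime reports through cudaDeviceProp::maxGridSize; on compute capability 3.0+ parts they are (2147483647, 65535, 65535), matching the constants hard-coded near the end of this diff. A small query program (assumes a CUDA toolchain; not part of this commit):

// Prints the per-device grid limits that the block_dim_limit_* fields mirror.
// Compile with nvcc.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  cudaDeviceProp prop;
  if (cudaGetDeviceProperties(&prop, /*device=*/0) != cudaSuccess) {
    std::fprintf(stderr, "no CUDA device found\n");
    return 1;
  }
  std::printf("grid limits: x=%d y=%d z=%d\n", prop.maxGridSize[0],
              prop.maxGridSize[1], prop.maxGridSize[2]);
  return 0;
}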
@@ -913,8 +913,10 @@ Status IrEmitterUnnested::EmitPadToStaticFromMlir(MlirEmitterInput mlir_input) {
     return Status::OK();
   };
 
-  LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
-      input_shape, ir_emitter_context_->gpu_device_info(), unroll_factor);
+  TF_ASSIGN_OR_RETURN(
+      LaunchDimensions launch_dimensions,
+      CalculateLaunchDimensions(
+          input_shape, ir_emitter_context_->gpu_device_info(), unroll_factor));
   UpdateLaunchDimensions(launch_dimensions, kernel_thunk.get(),
                          ir_emitter_context_->llvm_module());
   TF_RETURN_IF_ERROR(
@@ -1036,8 +1038,10 @@ Status IrEmitterUnnested::EmitSliceToDynamicFromMlir(
     return Status::OK();
   };
 
-  LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
-      input_shape, ir_emitter_context_->gpu_device_info(), unroll_factor);
+  TF_ASSIGN_OR_RETURN(
+      LaunchDimensions launch_dimensions,
+      CalculateLaunchDimensions(
+          input_shape, ir_emitter_context_->gpu_device_info(), unroll_factor));
   UpdateLaunchDimensions(launch_dimensions, kernel_thunk.get(),
                          ir_emitter_context_->llvm_module());
 
@@ -1830,9 +1834,10 @@ Status IrEmitterUnnested::EmitLoopFusionFromMlir(
   }();
 
   Shape element_shape = context.output_shapes[0];
-  LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
-      element_shape, ir_emitter_context_->gpu_device_info(), unroll_factor,
-      few_waves);
+  TF_ASSIGN_OR_RETURN(LaunchDimensions launch_dimensions,
+                      CalculateLaunchDimensions(
+                          element_shape, ir_emitter_context_->gpu_device_info(),
+                          unroll_factor, few_waves));
   UpdateLaunchDimensions(launch_dimensions, kernel_thunk,
                          ir_emitter_context_->llvm_module());
   llvm::Type* index_type = GetIndexTypeForKernelFromMlir(
@@ -1907,9 +1912,11 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
       auto unroll_factor =
          ComputeMaxUnrollFactor(fusion_op, hlo_module_config_);
       const Shape& element_shape = root->shape();
-      LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
-          element_shape, ir_emitter_context_->gpu_device_info(),
-          unroll_factor, /*few_waves=*/false);
+      TF_ASSIGN_OR_RETURN(
+          LaunchDimensions launch_dimensions,
+          CalculateLaunchDimensions(element_shape,
+                                    ir_emitter_context_->gpu_device_info(),
+                                    unroll_factor, /*few_waves=*/false));
       UpdateLaunchDimensions(launch_dimensions, thunks.back().get(),
                              ir_emitter_context_->llvm_module());
       TF_RETURN_IF_ERROR(
@@ -2033,8 +2040,10 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
   // same as operand 0's array.
   const IrArray& output_array = ir_arrays.back();
 
-  LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
-      update_shape, ir_emitter_context_->gpu_device_info());
+  TF_ASSIGN_OR_RETURN(
+      LaunchDimensions launch_dimensions,
+      CalculateLaunchDimensions(update_shape,
+                                ir_emitter_context_->gpu_device_info()));
   UpdateLaunchDimensions(launch_dimensions, fusion_thunk.get(),
                          ir_emitter_context_->llvm_module());
   AddThunkToThunkSequence(std::move(fusion_thunk));
@@ -2229,8 +2238,10 @@ Status IrEmitterUnnested::EmitSelectAndScatterFromMlir(
       TypeToShape(select_and_scatter_op.operand().getType());
   const int64 rank = operand_shape.rank();
 
-  LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
-      source_shape, ir_emitter_context_->gpu_device_info());
+  TF_ASSIGN_OR_RETURN(
+      LaunchDimensions launch_dimensions,
+      CalculateLaunchDimensions(source_shape,
+                                ir_emitter_context_->gpu_device_info()));
   llvm::Type* index_type = GetIndexTypeForKernelFromMlir(
       select_and_scatter_op, launch_dimensions.launch_bound(), &b_);
   auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
@@ -2713,8 +2724,10 @@ Status IrEmitterUnnested::EmitScatter(const ScatterDescriptor& desc,
   // Launch a kernel that reads every element in the updates tensor. We could
   // also do one kernel per window instead if bounds checks turn out to be a
   // bottleneck.
-  LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
-      desc.updates_shape, ir_emitter_context_->gpu_device_info());
+  TF_ASSIGN_OR_RETURN(
+      LaunchDimensions launch_dimensions,
+      CalculateLaunchDimensions(desc.updates_shape,
+                                ir_emitter_context_->gpu_device_info()));
   UpdateLaunchDimensions(launch_dimensions, thunk,
                          ir_emitter_context_->llvm_module());
 
@@ -2922,8 +2935,10 @@ Status IrEmitterUnnested::EmitSortFromMlir(MlirEmitterInput mlir_input) {
   uint64 standard_num_iterations_in_sort_dim = 1ULL << (num_stages - 1);
   standard_iteration_shape.set_dimensions(dimension_to_sort,
                                           standard_num_iterations_in_sort_dim);
-  LaunchDimensions standard_launch_dimensions = CalculateLaunchDimensions(
-      standard_iteration_shape, ir_emitter_context_->gpu_device_info());
+  TF_ASSIGN_OR_RETURN(
+      LaunchDimensions standard_launch_dimensions,
+      CalculateLaunchDimensions(standard_iteration_shape,
+                                ir_emitter_context_->gpu_device_info()));
 
   // Calculate the launch dimensions for the case where we use tiling. We split
   // the dimension that should be sorted into tiles of size 'kTileSize'. This
@@ -3664,8 +3679,9 @@ IrEmitterUnnested::BuildInitializerThunkForMlir(mlir::Operation* op,
   const llvm_ir::IrArray dest_array = ir_arrays[1];
 
   const Shape dest_shape = TypeToShape(dest.getType());
-  LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
-      dest_shape, ir_emitter_context_->gpu_device_info());
+  TF_ASSIGN_OR_RETURN(LaunchDimensions launch_dimensions,
+                      CalculateLaunchDimensions(
+                          dest_shape, ir_emitter_context_->gpu_device_info()));
   UpdateLaunchDimensions(launch_dimensions, kernel_thunk.get(),
                          ir_emitter_context_->llvm_module());
 
@@ -3708,8 +3724,9 @@ IrEmitterUnnested::BuildFusedInitializerThunkForMlir(
       ir_arrays[input_buffers.size() + output_index];
 
   const Shape dest_shape = TypeToShape(dest.getType());
-  LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
-      dest_shape, ir_emitter_context_->gpu_device_info());
+  TF_ASSIGN_OR_RETURN(LaunchDimensions launch_dimensions,
+                      CalculateLaunchDimensions(
+                          dest_shape, ir_emitter_context_->gpu_device_info()));
   UpdateLaunchDimensions(launch_dimensions, kernel_thunk.get(),
                          ir_emitter_context_->llvm_module());
 
@@ -5802,9 +5819,10 @@ Status IrEmitterUnnested::EmitInputFusibleNonStridedSlices(
 
   TF_ASSIGN_OR_RETURN(Shape element_shape,
                       GetConsistentInputShapeForRootSlices(fused_computation));
-  LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
-      element_shape, ir_emitter_context_->gpu_device_info(), unroll_factor);
+  TF_ASSIGN_OR_RETURN(LaunchDimensions launch_dimensions,
+                      CalculateLaunchDimensions(
+                          element_shape, ir_emitter_context_->gpu_device_info(),
+                          unroll_factor));
   UpdateLaunchDimensions(launch_dimensions, kernel_thunk.get(),
                          ir_emitter_context_->llvm_module());
 
@@ -54,9 +54,9 @@ static int64 ThreadsPerBlockLimit(GpuDeviceInfo gpu_device_info) {
 }
 
 // Calculates the launch dimensions used to invoke `hlo`.
-LaunchDimensions CalculateLaunchDimensions(const Shape& shape,
-                                           GpuDeviceInfo gpu_device_info,
-                                           int unroll_factor, bool few_waves) {
+StatusOr<LaunchDimensions> CalculateLaunchDimensions(
+    const Shape& shape, GpuDeviceInfo gpu_device_info, int unroll_factor,
+    bool few_waves) {
   int64 num_elements = ShapeUtil::ElementsIn(shape);
   if (num_elements <= 1) {
     return LaunchDimensions();
@@ -102,6 +102,15 @@ LaunchDimensions CalculateLaunchDimensions(const Shape& shape,
       block_count = capped_block_count;
     }
   }
+
+  if (gpu_device_info.block_dim_limit_x > 0 &&
+      block_count >= gpu_device_info.block_dim_limit_x) {
+    return tensorflow::errors::Unimplemented(
+        "Kernel launch needs more blocks (", block_count,
+        ") than allowed by hardware (", gpu_device_info.block_dim_limit_x,
+        ").");
+  }
+
   VLOG(2) << absl::StrFormat(
       "Initialized the block count to ceil(# of elements / threads per "
       "block) = ceil(%d/%d) = %d",
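A quick sanity check of when this guard fires, using the shape from the regression test below: f32[4,1048576,1048576,1] has 4 * 2^20 * 2^20 = 2^42 (about 4.4e12) elements. Assuming the usual 1024-thread block, that needs ceil(2^42 / 2^10) = 2^32 (about 4.3e9) blocks, well above the 2^31 - 1 grid-x limit, so the function now returns Unimplemented instead of emitting an unlaunchable kernel. (The 1024-thread figure is an assumption here; ThreadsPerBlockLimit derives the actual value from GpuDeviceInfo.)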
@@ -65,10 +65,9 @@ class LaunchDimensions {
 std::ostream& operator<<(std::ostream& out,
                          const LaunchDimensions& launch_dims);
 
-LaunchDimensions CalculateLaunchDimensions(const Shape& shape,
-                                           GpuDeviceInfo gpu_device_info,
-                                           int unroll_factor = 1,
-                                           bool few_waves = false);
+StatusOr<LaunchDimensions> CalculateLaunchDimensions(
+    const Shape& shape, GpuDeviceInfo gpu_device_info, int unroll_factor = 1,
+    bool few_waves = false);
 
 }  // namespace gpu
 }  // namespace xla
@@ -129,6 +129,21 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "gpu_too_many_blocks_test",
+    srcs = [
+        "gpu_too_many_blocks_test.cc",
+    ],
+    tags = tf_cuda_tests_tags(),
+    deps = [
+        ":gpu_codegen_test",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 tf_cc_test(
     name = "reduction_degenerate_dim_remover_test",
     srcs = [
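Given this BUILD target, the new test should be runnable on a CUDA machine with something like bazel test --config=cuda //tensorflow/compiler/xla/service/gpu/tests:gpu_too_many_blocks_test; the exact --config flag depends on the local TensorFlow setup, and tf_cuda_tests_tags() restricts it to GPU test runs.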
@@ -0,0 +1,60 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+
+namespace {
+
+class TooManyBlocksTest : public GpuCodegenTest {};
+
+TEST_F(TooManyBlocksTest, FailsWithInvalidStatus) {
+  const char* hlo_text = R"(
+HloModule primitive_computation_mul.8
+
+ENTRY primitive_computation_mul.8 {
+  parameter.1 = f32[4,1048576,1,1]{3,2,1,0} parameter(0)
+  reshape.3 = f32[4,1048576,1]{2,1,0} reshape(parameter.1)
+  broadcast.4 = f32[4,1048576,1048576,1]{3,2,1,0} broadcast(reshape.3), dimensions={0,1,3}
+  parameter.2 = f32[4,1,1048576,1]{3,2,1,0} parameter(1)
+  reshape.5 = f32[4,1048576,1]{2,1,0} reshape(parameter.2)
+  broadcast.6 = f32[4,1048576,1048576,1]{3,2,1,0} broadcast(reshape.5), dimensions={0,2,3}
+  ROOT multiply.7 = f32[4,1048576,1048576,1]{3,2,1,0} multiply(broadcast.4, broadcast.6)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> optimized_module,
+                          GetOptimizedModule(hlo_text));
+
+  StatusOr<std::unique_ptr<Executable>> failed_executable =
+      backend().compiler()->RunBackend(
+          std::move(optimized_module), backend().default_stream_executor(),
+          backend().default_stream_executor()->GetAllocator());
+
+  EXPECT_FALSE(failed_executable.ok());
+  EXPECT_THAT(failed_executable.status().ToString(),
+              ::testing::HasSubstr("Kernel launch needs more blocks"));
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
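The test compiles the module end-to-end: GetOptimizedModule runs the HLO optimization passes, then RunBackend exercises code generation, which is where CalculateLaunchDimensions now rejects the oversized launch. Asserting on the "Kernel launch needs more blocks" substring ties the failure to the new Unimplemented error rather than to some unrelated backend failure.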
@@ -49,6 +49,9 @@ xla::Status CompileAndPrintLlvmIr(const std::string& hlo_text) {
   gpu_device_info.shared_memory_per_block = 49152;
   gpu_device_info.core_count = 80;
   gpu_device_info.threads_per_core_limit = 2048;
+  gpu_device_info.block_dim_limit_x = 2147483647;
+  gpu_device_info.block_dim_limit_y = 65535;
+  gpu_device_info.block_dim_limit_z = 65535;
 
   xla::gpu::CudaComputeCapability cuda_compute_capability;
   cuda_compute_capability.cc_major = 7;