From a8a9a98935143de7ec7e08aaef03b73b6a8137cf Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 13 Jan 2020 09:31:14 -0800 Subject: [PATCH 01/52] Add env variable to allow different reduction case. --- .../xla/service/gpu/ir_emitter_unnested.cc | 88 +++++++++++++------ 1 file changed, 62 insertions(+), 26 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 528a847b3ed..35f5f128108 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1959,6 +1959,12 @@ void IrEmitterUnnested::EmitTile( auto ceil_of_ratio = [&](llvm::Value* a, llvm::Value* b) { return b_.CreateUDiv(b_.CreateAdd(b_.CreateAdd(a, b), constant(-1)), b); }; + int vec_stride = 1; + char* env = getenv("VEC_STRIDE"); + if (env) + vec_stride = atoi(env); + std::cout << 'VEC_STRIDE ' << vec_stride << std::endl; + std::cout << "num_threads_x " << num_threads_x << " num_threads_y " << num_threads_y << " tile_size_x " << tile_size_x << std::endl; // True iff all threads always execute all instructions in the tiling // dimension X. @@ -1978,32 +1984,62 @@ void IrEmitterUnnested::EmitTile( // // TODO(cheshire): Once ptxas is fixed and TF switches to it, remove the // workaround. - ksl->For(loop_name + "_y_in_tile", - /*start=*/constant(0), - /*end=*/ - ceil_of_ratio(b_.CreateSub(tile_height, thread_id_info.thread_id_y), - num_threads_y), - /*step=*/constant(1), [&](llvm::Value* y_indvar) { - llvm::Value* y_loc = - b_.CreateAdd(thread_id_info.thread_id_y, - b_.CreateMul(y_indvar, num_threads_y)); - for (int64 j = 0; j < x_num_steps; j++) { - llvm::Value* x_loc = - b_.CreateAdd(constant(j * step_x), start_offset_x, "x_loc"); - IrArray::Index source_idx_x = - source_idx.AddOffsetToDim(y_loc, kDimY, &b_) - .AddOffsetToDim(constant(j * step_x), kDimX, &b_); - auto emit_element = [&] { - return emit_elem_function(source_idx_x, y_loc, x_loc, j); - }; - if (!x_tile_fits) { - ksl->If(loop_name + "_x_in_tile", - b_.CreateICmpULT(x_loc, tile_width), emit_element); - } else { - emit_element(); - } - } - }); + ksl->For( + loop_name + "_y_in_tile", + /*start=*/constant(0), + /*end=*/ + ceil_of_ratio(b_.CreateSub(tile_height, thread_id_info.thread_id_y), + num_threads_y), + /*step=*/constant(1), [&](llvm::Value* y_indvar) { + + + + llvm::Value* y_loc = b_.CreateAdd( + thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); + if (vec_stride == 1) { + for (int64 j = 0; j < x_num_steps; j++) { + llvm::Value* x_loc = + b_.CreateAdd(constant(j * step_x), start_offset_x, "x_loc"); + IrArray::Index source_idx_x = + source_idx.AddOffsetToDim(y_loc, kDimY, &b_) + .AddOffsetToDim(constant(j * step_x), kDimX, &b_); + auto emit_element = [&] { + return emit_elem_function(source_idx_x, y_loc, x_loc, j); + }; + if (!x_tile_fits) { + ksl->If(loop_name + "_x_in_tile", + b_.CreateICmpULT(x_loc, tile_width), emit_element); + } else { + emit_element(); + } + } + } else { + for (int64 j = 0; j < x_num_steps/vec_stride; j++) { + //Prep some values. If we do not do this, LLVM doesn't vectorize. 
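+            // Hoisting x_loc_base / source_idx_x_base out of the inner
+            // vec_stride loop leaves each access as a common base plus a small
+            // constant offset, the pattern LLVM's load/store vectorizer can
+            // merge into a single wide load.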
+ llvm::Value* x_loc_base = + b_.CreateAdd(constant(j * step_x * vec_stride), start_offset_x, "x_loc_base"); + IrArray::Index source_idx_x_base = + source_idx.AddOffsetToDim(y_loc, kDimY, &b_) + .AddOffsetToDim(constant(j * step_x * vec_stride), kDimX, &b_); + + for (int i = 0; i < vec_stride; i++) { + int old_j = j * vec_stride + i; + llvm::Value* x_loc = b_.CreateAdd(constant(i), x_loc_base, "x_loc"); + IrArray::Index source_idx_x = source_idx_x_base.AddOffsetToDim( + constant(i), kDimX, &b_); + auto emit_element = [&] { + return emit_elem_function(source_idx_x, y_loc, x_loc, j); + }; + if (!x_tile_fits) { + ksl->If(loop_name + "_x_in_tile", + b_.CreateICmpULT(x_loc, tile_width), emit_element); + } else { + emit_element(); + } + } + } + } + }); } // Emits code to process a tensor element in a tile for the given kCopy HLO that From bbe5592773d203172aae07c878f720633d16c906 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Tue, 21 Jan 2020 09:11:49 -0800 Subject: [PATCH 02/52] Enable the new vectorized+strided iteration mode only for row reduction. --- .../xla/service/gpu/ir_emitter_unnested.cc | 50 +++++++++++++------ .../xla/service/gpu/kernel_mapping_scheme.h | 31 ++++++++---- 2 files changed, 54 insertions(+), 27 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 35f5f128108..9c9bb0bd6ba 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1919,9 +1919,13 @@ Status IrEmitterUnnested::EmitTargetElementLoop( static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, llvm::Value* thread_id_x, llvm::Type* index_ty, - llvm::IRBuilder<>* b) { - if (mapping_scheme.DilatedX()) { + llvm::IRBuilder<>* b, + int vec_stride = 1) { + if (mapping_scheme.GetIndexingOrder() == KernelMappingScheme::DilatedIndexingX) { return thread_id_x; + } else if (mapping_scheme.GetIndexingOrder() == + KernelMappingScheme::VectorizedCoalescedIndexingX) { + return b->CreateMul(thread_id_x, llvm::ConstantInt::get(index_ty, vec_stride)); } int64 x_num_steps = mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX(); @@ -1944,14 +1948,23 @@ void IrEmitterUnnested::EmitTile( int64 tile_size_x = mapping_scheme.GetTileSizeX(); int64 x_num_steps = tile_size_x / num_threads_x; - llvm::Value* start_offset_x = GetStartOffsetX( - mapping_scheme, thread_id_info.thread_id_x, index_ty, &b_); + int vec_stride = 2; + char* env = getenv("VEC_STRIDE"); + if (env) + vec_stride = atoi(env); + if (mapping_scheme.GetIndexingOrder() != KernelMappingScheme::VectorizedCoalescedIndexingX ) { + vec_stride = 1; + } + + // TODO: should I modify the other call to GetStartOffsetX? + llvm::Value* start_offset_x = + GetStartOffsetX(mapping_scheme, thread_id_info.thread_id_x, index_ty, &b_, vec_stride); // Using dilated mapping scheme, each thread steps with a stride of number // of threads. // Otherwise, the stride is one, but we multiply each offset by the limit of // number of steps which can be made. - int64 step_x = mapping_scheme.DilatedX() ? num_threads_x : 1; + int64 step_x = mapping_scheme.GetIndexingOrder() == KernelMappingScheme::LinearIndexingX? 
1 : num_threads_x; IrArray::Index source_idx = tile_origin_index.AddOffsetToDim(start_offset_x, kDimX, &b_); @@ -1959,13 +1972,6 @@ void IrEmitterUnnested::EmitTile( auto ceil_of_ratio = [&](llvm::Value* a, llvm::Value* b) { return b_.CreateUDiv(b_.CreateAdd(b_.CreateAdd(a, b), constant(-1)), b); }; - int vec_stride = 1; - char* env = getenv("VEC_STRIDE"); - if (env) - vec_stride = atoi(env); - std::cout << 'VEC_STRIDE ' << vec_stride << std::endl; - std::cout << "num_threads_x " << num_threads_x << " num_threads_y " << num_threads_y << " tile_size_x " << tile_size_x << std::endl; - // True iff all threads always execute all instructions in the tiling // dimension X. bool x_tile_fits = mapping_scheme.GetDimsInElems()[kDimX] % tile_size_x == 0; @@ -2122,7 +2128,8 @@ static int GetNumberOfPartialResults( if (reduction_info.IsRowReduction()) { return 1; } - int64 num_partial_results = mapping_scheme.DilatedX() ? 1 : 2; + int64 num_partial_results = mapping_scheme.GetIndexingOrder() == + KernelMappingScheme::DilatedIndexingX ? 1 : 2; CHECK_EQ(num_partial_results, (mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX())); return num_partial_results; @@ -2706,7 +2713,7 @@ void IrEmitterUnnested::EmitHlo021Tile( /*tile_sizes=*/{1, kWarpSize, kWarpSize}, /*num_threads_y=*/kNumRows, /*num_threads_x=*/kWarpSize, - /*is_dilated_x=*/false); + /*indexing_order=*/KernelMappingScheme::LinearIndexingX); LaunchDimensions launch_dimensions(mapping_scheme.GetNumberOfBlocks(), mapping_scheme.GetThreadsPerBlock()); llvm::Type* index_type = @@ -3152,7 +3159,18 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( !IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape, reduction_dimensions.dimensions[2]); - if (!dilated_x && !reduction_dimensions.is_row_reduction) { + KernelMappingScheme::IndexingOrder indexing_order; + if (reduction_dimensions.is_row_reduction) { + indexing_order = KernelMappingScheme::VectorizedCoalescedIndexingX; + } else if (IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape, + reduction_dimensions.dimensions[2])) { + indexing_order = KernelMappingScheme::LinearIndexingX; + } else { + indexing_order = KernelMappingScheme::DilatedIndexingX; + } + + if (indexing_order == KernelMappingScheme::LinearIndexingX && + !reduction_dimensions.is_row_reduction) { // Vectorized loads: a single thread reduces two adjacent columns. reduction_tiling[2] *= 2; } @@ -3173,7 +3191,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( reduction_dimensions.dimensions, {reduction_tiling[0], reduction_tiling[1] * num_threads_y, reduction_tiling[2] * num_threads_x}, - num_threads_y, num_threads_x, dilated_x); + num_threads_y, num_threads_x, indexing_order); return ReductionCodegenInfo(mapping_scheme, reduction_dimensions.is_row_reduction); } diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index eeab8d4dc80..022104361e6 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -76,19 +76,23 @@ namespace gpu { class KernelMappingScheme { public: enum { DimZ = 0, DimY, DimX, DimTot }; + // TODO: rename Dilated to Strided? 
+ enum IndexingOrder { LinearIndexingX, DilatedIndexingX, VectorizedCoalescedIndexingX }; + KernelMappingScheme(absl::Span dims_in_elems, absl::Span tile_sizes, int64 num_threads_y, - int64 num_threads_x, bool is_dilated_x) + int64 num_threads_x, IndexingOrder indexing_order) : dims_in_elems_{dims_in_elems[0], dims_in_elems[1], dims_in_elems[2]}, tile_sizes_{tile_sizes[0], tile_sizes[1], tile_sizes[2]}, num_threads_x_(num_threads_x), num_threads_y_(num_threads_y), - dilated_x_(is_dilated_x) { + indexing_order_(indexing_order) { CHECK_EQ(tile_sizes[1] % num_threads_y_, 0); CHECK_EQ(tile_sizes[2] % num_threads_x_, 0); VLOG(10) << "dims_in_elems_ = " << absl::StrJoin(dims_in_elems_, ","); - if (!dilated_x_) { - // dilated_x_=false is for the purpose of vectorization, which requires + if (indexing_order != LinearIndexingX) { + // DilatedIndexingX, and VectorizedCoalescedIndexingX + // is for the purpose of vectorization, which requires // GetTileSizeFor(DimX) to be a multiplier of num_threads_x_. CHECK_EQ(GetTileSizeFor(DimX) % num_threads_x_, 0); } @@ -118,7 +122,9 @@ class KernelMappingScheme { return GetNumThreadsX() * GetNumThreadsY(); } - bool DilatedX() const { return dilated_x_; } + IndexingOrder GetIndexingOrder() const { + return indexing_order_; + } private: // The number of elements in each dimension. @@ -133,12 +139,15 @@ class KernelMappingScheme { // Number of threads used to process elements in the Y direction of a tile. const int64 num_threads_y_; - // When num_threads_x threads process a total of tile_size_x elements in the - // X dimension of a tile, each threads process n=tile_size_x/num_threads_x - // elements. When dilated_x=false, the n elements processed by a thread are - // contiguous. On the other hand, when dilated_x=true the n elements are - // dilated by a factor of num_threads_x. - const bool dilated_x_; + // When num_threads_x threads process a total of tile_size_x + // elements in the X dimension of a tile, each threads process + // n=tile_size_x/num_threads_x elements. When indexing_order_ have + // the value LinearIndexingX, the n elements processed by a thread + // are contiguous. When the value is DilatedIndexingX, the n + // elements are dilated by a factor of num_threads_x. When the value + // is VectorizedCoalescedIndexingX, it is a mix of both to enable + // vectorizing while keeping memory coalescing. + const IndexingOrder indexing_order_; }; // Information to support the code generation for a tiled reduction kernel. 
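For reference, a minimal standalone sketch of how the three indexing orders above map one thread to the elements it visits within a tile row (illustrative code only, not part of any commit in this series: plain int64_t stands in for XLA's int64, and ElementsForThread is a hypothetical helper, not an XLA API). It assumes num_threads_x threads cover tile_size_x elements per row and that vector_size contiguous elements are read per step.

  #include <cstdint>
  #include <vector>

  enum IndexingOrder { LinearIndexingX, DilatedIndexingX, VectorizedCoalescedIndexingX };

  // Offsets within one tile row visited by a single thread, in visit order.
  std::vector<int64_t> ElementsForThread(int64_t thread_id_x, int64_t num_threads_x,
                                         int64_t tile_size_x, int64_t vector_size,
                                         IndexingOrder order) {
    std::vector<int64_t> offsets;
    const int64_t x_num_steps = tile_size_x / num_threads_x;
    if (order == LinearIndexingX) {
      // Contiguous chunk per thread: start = thread_id_x * x_num_steps, step = 1.
      for (int64_t j = 0; j < x_num_steps; ++j)
        offsets.push_back(thread_id_x * x_num_steps + j);
    } else if (order == DilatedIndexingX) {
      // Coalesced but scalar: start = thread_id_x, step = num_threads_x.
      for (int64_t j = 0; j < x_num_steps; ++j)
        offsets.push_back(thread_id_x + j * num_threads_x);
    } else {  // VectorizedCoalescedIndexingX (renamed LinearDilatedIndexingX in a later patch)
      // vector_size contiguous elements per step (vectorizable loads), then a
      // jump of num_threads_x * vector_size so each step of the warp as a
      // whole still touches one contiguous span (coalesced).
      for (int64_t j = 0; j < x_num_steps / vector_size; ++j)
        for (int64_t i = 0; i < vector_size; ++i)
          offsets.push_back(thread_id_x * vector_size +
                            j * num_threads_x * vector_size + i);
    }
    return offsets;
  }

For example, with num_threads_x = 32, tile_size_x = 128 and vector_size = 2, thread 0 visits {0, 1, 64, 65} and thread 1 visits {2, 3, 66, 67}: each thread can issue 2-wide vector loads while each step of the warp still covers one contiguous 64-element span.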
From 04594ea65bf0c3e74576295b91d3101b06a4795d Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Sat, 15 Feb 2020 15:54:51 -0800 Subject: [PATCH 03/52] Move the vector size into KernelMappingScheme --- .../xla/service/gpu/ir_emitter_unnested.cc | 52 +++++++++---------- .../xla/service/gpu/kernel_mapping_scheme.h | 24 ++++++--- 2 files changed, 44 insertions(+), 32 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 9c9bb0bd6ba..9a523d0c111 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1919,13 +1919,13 @@ Status IrEmitterUnnested::EmitTargetElementLoop( static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, llvm::Value* thread_id_x, llvm::Type* index_ty, - llvm::IRBuilder<>* b, - int vec_stride = 1) { + llvm::IRBuilder<>* b) { if (mapping_scheme.GetIndexingOrder() == KernelMappingScheme::DilatedIndexingX) { return thread_id_x; } else if (mapping_scheme.GetIndexingOrder() == - KernelMappingScheme::VectorizedCoalescedIndexingX) { - return b->CreateMul(thread_id_x, llvm::ConstantInt::get(index_ty, vec_stride)); + KernelMappingScheme::LinearDilatedIndexingX) { + int vector_size = mapping_scheme.GetVectorSize(); + return b->CreateMul(thread_id_x, llvm::ConstantInt::get(index_ty, vector_size)); } int64 x_num_steps = mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX(); @@ -1948,17 +1948,8 @@ void IrEmitterUnnested::EmitTile( int64 tile_size_x = mapping_scheme.GetTileSizeX(); int64 x_num_steps = tile_size_x / num_threads_x; - int vec_stride = 2; - char* env = getenv("VEC_STRIDE"); - if (env) - vec_stride = atoi(env); - if (mapping_scheme.GetIndexingOrder() != KernelMappingScheme::VectorizedCoalescedIndexingX ) { - vec_stride = 1; - } - - // TODO: should I modify the other call to GetStartOffsetX? - llvm::Value* start_offset_x = - GetStartOffsetX(mapping_scheme, thread_id_info.thread_id_x, index_ty, &b_, vec_stride); + llvm::Value* start_offset_x = GetStartOffsetX( + mapping_scheme, thread_id_info.thread_id_x, index_ty, &b_); // Using dilated mapping scheme, each thread steps with a stride of number // of threads. @@ -1972,6 +1963,7 @@ void IrEmitterUnnested::EmitTile( auto ceil_of_ratio = [&](llvm::Value* a, llvm::Value* b) { return b_.CreateUDiv(b_.CreateAdd(b_.CreateAdd(a, b), constant(-1)), b); }; + // True iff all threads always execute all instructions in the tiling // dimension X. bool x_tile_fits = mapping_scheme.GetDimsInElems()[kDimX] % tile_size_x == 0; @@ -1990,6 +1982,7 @@ void IrEmitterUnnested::EmitTile( // // TODO(cheshire): Once ptxas is fixed and TF switches to it, remove the // workaround. + int vec_stride = mapping_scheme.GetVectorSize(); ksl->For( loop_name + "_y_in_tile", /*start=*/constant(0), @@ -1997,9 +1990,7 @@ void IrEmitterUnnested::EmitTile( ceil_of_ratio(b_.CreateSub(tile_height, thread_id_info.thread_id_y), num_threads_y), /*step=*/constant(1), [&](llvm::Value* y_indvar) { - - - + printf("VEC_STRIDE %d\n", vec_stride); llvm::Value* y_loc = b_.CreateAdd( thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); if (vec_stride == 1) { @@ -2020,18 +2011,18 @@ void IrEmitterUnnested::EmitTile( } } } else { - for (int64 j = 0; j < x_num_steps/vec_stride; j++) { + for (int64 j = 0; j < x_num_steps/vec_stride; j++) { //Prep some values. If we do not do this, LLVM doesn't vectorize. 
llvm::Value* x_loc_base = - b_.CreateAdd(constant(j * step_x * vec_stride), start_offset_x, "x_loc_base"); + b_.CreateAdd(constant(j * step_x * vec_stride), start_offset_x, "x_loc_base"); IrArray::Index source_idx_x_base = source_idx.AddOffsetToDim(y_loc, kDimY, &b_) .AddOffsetToDim(constant(j * step_x * vec_stride), kDimX, &b_); - for (int i = 0; i < vec_stride; i++) { + for (int i = 0; i < vec_stride; i++) { int old_j = j * vec_stride + i; - llvm::Value* x_loc = b_.CreateAdd(constant(i), x_loc_base, "x_loc"); - IrArray::Index source_idx_x = source_idx_x_base.AddOffsetToDim( + llvm::Value* x_loc = b_.CreateAdd(constant(i), x_loc_base, "x_loc"); + IrArray::Index source_idx_x = source_idx_x_base.AddOffsetToDim( constant(i), kDimX, &b_); auto emit_element = [&] { return emit_elem_function(source_idx_x, y_loc, x_loc, j); @@ -2713,7 +2704,8 @@ void IrEmitterUnnested::EmitHlo021Tile( /*tile_sizes=*/{1, kWarpSize, kWarpSize}, /*num_threads_y=*/kNumRows, /*num_threads_x=*/kWarpSize, - /*indexing_order=*/KernelMappingScheme::LinearIndexingX); + /*indexing_order=*/KernelMappingScheme::LinearIndexingX, + /*vector_size*/1); LaunchDimensions launch_dimensions(mapping_scheme.GetNumberOfBlocks(), mapping_scheme.GetThreadsPerBlock()); llvm::Type* index_type = @@ -3161,7 +3153,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( KernelMappingScheme::IndexingOrder indexing_order; if (reduction_dimensions.is_row_reduction) { - indexing_order = KernelMappingScheme::VectorizedCoalescedIndexingX; + indexing_order = KernelMappingScheme::LinearDilatedIndexingX; } else if (IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape, reduction_dimensions.dimensions[2])) { indexing_order = KernelMappingScheme::LinearIndexingX; @@ -3187,11 +3179,19 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( return kWarpSize; }(); + int vec_stride = 2; + char* env = getenv("VEC_STRIDE"); + if (env) + vec_stride = atoi(env); + if (indexing_order != KernelMappingScheme::LinearDilatedIndexingX) { + vec_stride = 1; + } KernelMappingScheme mapping_scheme( reduction_dimensions.dimensions, {reduction_tiling[0], reduction_tiling[1] * num_threads_y, reduction_tiling[2] * num_threads_x}, - num_threads_y, num_threads_x, indexing_order); + num_threads_y, num_threads_x, indexing_order, + vec_stride); return ReductionCodegenInfo(mapping_scheme, reduction_dimensions.is_row_reduction); } diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index 022104361e6..c8f2864c3e9 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -77,21 +77,23 @@ class KernelMappingScheme { public: enum { DimZ = 0, DimY, DimX, DimTot }; // TODO: rename Dilated to Strided? 
- enum IndexingOrder { LinearIndexingX, DilatedIndexingX, VectorizedCoalescedIndexingX }; + enum IndexingOrder { LinearIndexingX, DilatedIndexingX, LinearDilatedIndexingX }; KernelMappingScheme(absl::Span dims_in_elems, absl::Span tile_sizes, int64 num_threads_y, - int64 num_threads_x, IndexingOrder indexing_order) + int64 num_threads_x, IndexingOrder indexing_order, + int vector_size) : dims_in_elems_{dims_in_elems[0], dims_in_elems[1], dims_in_elems[2]}, tile_sizes_{tile_sizes[0], tile_sizes[1], tile_sizes[2]}, num_threads_x_(num_threads_x), num_threads_y_(num_threads_y), - indexing_order_(indexing_order) { + indexing_order_(indexing_order), + vector_size_(vector_size){ CHECK_EQ(tile_sizes[1] % num_threads_y_, 0); CHECK_EQ(tile_sizes[2] % num_threads_x_, 0); VLOG(10) << "dims_in_elems_ = " << absl::StrJoin(dims_in_elems_, ","); if (indexing_order != LinearIndexingX) { - // DilatedIndexingX, and VectorizedCoalescedIndexingX + // DilatedIndexingX, and LinearDilatedIndexingX // is for the purpose of vectorization, which requires // GetTileSizeFor(DimX) to be a multiplier of num_threads_x_. CHECK_EQ(GetTileSizeFor(DimX) % num_threads_x_, 0); @@ -125,6 +127,9 @@ class KernelMappingScheme { IndexingOrder GetIndexingOrder() const { return indexing_order_; } + int GetVectorSize() const { + return vector_size_; + } private: // The number of elements in each dimension. @@ -145,9 +150,16 @@ class KernelMappingScheme { // the value LinearIndexingX, the n elements processed by a thread // are contiguous. When the value is DilatedIndexingX, the n // elements are dilated by a factor of num_threads_x. When the value - // is VectorizedCoalescedIndexingX, it is a mix of both to enable - // vectorizing while keeping memory coalescing. + // is LinearDilatedIndexingX, it is a mix of both to enable + // vectorizing while keeping memory coalescing. It reads + // vector_size_ elements, then do a step to the next vector_size_ + // elements. const IndexingOrder indexing_order_; + // vector_size_ only supported for row reduction. + // Must be a divisor of tile_sizes_[2]/num_threads_x. + // Interesting values are 2 and 4 to trigger vectorized loads on GPUs + // While keeping memory coalescing. + const int vector_size_; }; // Information to support the code generation for a tiled reduction kernel. From 5fb7dcbe077e66b6f473a6758a6e97ca766c7525 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Tue, 18 Feb 2020 13:59:05 -0800 Subject: [PATCH 04/52] If some blocks fit completly, unroll to vectorize for them. --- .../xla/service/gpu/ir_emitter_unnested.cc | 76 ++++++++++++++++++- 1 file changed, 73 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 9a523d0c111..135a6396c27 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2011,6 +2011,7 @@ void IrEmitterUnnested::EmitTile( } } } else { + auto unroll = [&](bool add_if, int64 max_element, int64 vec_stride) { for (int64 j = 0; j < x_num_steps/vec_stride; j++) { //Prep some values. If we do not do this, LLVM doesn't vectorize. 
llvm::Value* x_loc_base = @@ -2027,7 +2028,7 @@ void IrEmitterUnnested::EmitTile( auto emit_element = [&] { return emit_elem_function(source_idx_x, y_loc, x_loc, j); }; - if (!x_tile_fits) { + if (add_if) { ksl->If(loop_name + "_x_in_tile", b_.CreateICmpULT(x_loc, tile_width), emit_element); } else { @@ -2035,8 +2036,77 @@ void IrEmitterUnnested::EmitTile( } } } - } - }); + }; + + char * env_if = getenv("RED_IF"); + int red_if = 1; + if (env_if) { + red_if = atoi(env_if); + printf("RED_IF2 = %d %s\n", red_if, env_if); + } + if (red_if == 1) { + std::cout << "IF_NB 1: " << std::endl; + unroll(!x_tile_fits, x_num_steps, vec_stride); + } else if (red_if == 2) { + std::cout << "IF_NB 2" << std::endl; + ksl->If(loop_name + "_is_full_tile", + //b->CreateICmpULT(last_element, tile_width), + b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeFor(2)), tile_width), + [&] {unroll(false, x_num_steps, vec_stride);}, + [&] {unroll(true, x_num_steps, vec_stride);}); + } else { + std::cout << "IF_NB 3" << std::endl; + //b->CreateICmpULT(start_offset_x+j * step_x * vec_stride + i, tile_width) + int last_block_left_element = mapping_scheme.GetDimsInElems()[2] % x_num_steps; + std::cout << "MAPPING " << mapping_scheme.GetDimsInElems()[0] << " " + << mapping_scheme.GetDimsInElems()[1] << " " + << mapping_scheme.GetDimsInElems()[2] << std::endl; + std::cout << "LAST_BLOCK x_num_steps " << x_num_steps + << " last_block" << last_block_left_element << std::endl; + // NB block per reduction. + int nb_block = CeilOfRatio(mapping_scheme.GetDimsInElems()[2], + tile_size_x); + std::cout << "NB_BLOCK" << nb_block << std::endl; + if (x_tile_fits) { + // All threads will completly unroll + unroll(false, x_num_steps, vec_stride); + } else if(nb_block == 1) { + // No thread will completly unroll. + // TODO: unroll by the right amount + unroll(true, x_num_steps, vec_stride); + } else { + // For some blocks, all threads will will completly unroll. + // For other blocks, some of its threads will completly unroll, others will partially and some won't be used. + // So do an if(thread fully unroll) {code with no if between elements} else {code with if between each elements} + // TODO: in the else part, unroll without if but with the right number of elements left. + + llvm::Value* block_id = gpu::EmitCallToTargetIntrinsic( + gpu::TargetIntrinsicID::kBlockIdx, {}, {}, &b_); + llvm::Value* last_element = b_.CreateAdd(constant(x_num_steps * tile_size_x), + start_offset_x, "last_element"); + int x_num_steps_partial = mapping_scheme.GetDimsInElems()[2] % tile_size_x; + x_num_steps_partial *= 2; + //x_num_steps_partial = x_num_steps; + ksl->If(loop_name + "_is_full_tile", + // Test if all the elements of this thread is withing tile. + b_.CreateICmpULT(last_element, tile_width), + // Not the last block, so unroll without ifs. + [&] {unroll(false, x_num_steps, vec_stride);}, + // The last block isn't completly unrolled. + + // TODO: unroll the right size. Take care + // vec_stride must match the above unroll for + // now. + // TODO: after unroll of the right size, remove the IFs. 
+ // ONGOING, try to make it work with less + // then unroll x_num_steps + + [&] {unroll(true, x_num_steps, vec_stride);}); // works + //[&] {unroll(true, x_num_steps, x_num_steps);}); + //[&] {unroll(true, x_num_steps_partial, vec_stride);}); + } + } + }}); } // Emits code to process a tensor element in a tile for the given kCopy HLO that From 24ac05af45f90b7e6738587a64d713870379662a Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 19 Feb 2020 11:42:29 -0800 Subject: [PATCH 05/52] Add comment --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 135a6396c27..b63da486a1e 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2051,6 +2051,7 @@ void IrEmitterUnnested::EmitTile( std::cout << "IF_NB 2" << std::endl; ksl->If(loop_name + "_is_full_tile", //b->CreateICmpULT(last_element, tile_width), + // If (the thread fully unrolled) {no condition path} else {condition path} b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeFor(2)), tile_width), [&] {unroll(false, x_num_steps, vec_stride);}, [&] {unroll(true, x_num_steps, vec_stride);}); From 51229ce3ffd9fcdde61147d21c343c5773cc2c48 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 19 Feb 2020 07:40:59 -0800 Subject: [PATCH 06/52] Only vectorize when the tile completly fit. --- .../xla/service/gpu/ir_emitter_unnested.cc | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index b63da486a1e..7aebf2e9577 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -3250,17 +3250,20 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( return kWarpSize; }(); - int vec_stride = 2; + int tile_size_x = reduction_tiling[2] * num_threads_x; + + int vec_stride = 1; char* env = getenv("VEC_STRIDE"); - if (env) + if (indexing_order == KernelMappingScheme::LinearDilatedIndexingX && + reduction_dimensions.dimensions[2] % tile_size_x == 0) { + vec_stride = 2; + } + if (env) { vec_stride = atoi(env); - if (indexing_order != KernelMappingScheme::LinearDilatedIndexingX) { - vec_stride = 1; } KernelMappingScheme mapping_scheme( reduction_dimensions.dimensions, - {reduction_tiling[0], reduction_tiling[1] * num_threads_y, - reduction_tiling[2] * num_threads_x}, + {reduction_tiling[0], reduction_tiling[1] * num_threads_y, tile_size_x}, num_threads_y, num_threads_x, indexing_order, vec_stride); return ReductionCodegenInfo(mapping_scheme, From 8a0e361243629c9806a4c1183d1cb19a7ea19c14 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 19 Feb 2020 07:56:23 -0800 Subject: [PATCH 07/52] Limit default vectorization to we know LLVM won't vectorize --- .../xla/service/gpu/ir_emitter_unnested.cc | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 7aebf2e9577..7beb5ddbe50 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1860,12 +1860,13 @@ namespace { // Returns true if the fusion contains any 
instruction that is likely // translated to complex LLVM IR, such as loops, and prevent vectorization. -bool MayPreventVectorization(const HloInstruction& hlo) { +bool MayPreventVectorization(const HloInstruction& hlo, const bool tolerate_reduce=false) { if (hlo.opcode() == HloOpcode::kFusion) { return absl::c_any_of(hlo.fused_instructions_computation()->instructions(), - [](const HloInstruction* instr) { + [tolerate_reduce](const HloInstruction* instr) { switch (instr->opcode()) { case HloOpcode::kReduce: + return !tolerate_reduce; case HloOpcode::kReduceWindow: case HloOpcode::kSort: case HloOpcode::kDot: @@ -1892,6 +1893,8 @@ bool MayPreventVectorization(const HloInstruction& hlo) { default: return false; } + } else if (hlo.opcode() == HloOpcode::kReduce) { + return !tolerate_reduce; } return true; } @@ -3254,12 +3257,21 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( int vec_stride = 1; char* env = getenv("VEC_STRIDE"); - if (indexing_order == KernelMappingScheme::LinearDilatedIndexingX && - reduction_dimensions.dimensions[2] % tile_size_x == 0) { - vec_stride = 2; + if (indexing_order == KernelMappingScheme::LinearDilatedIndexingX) { + if (reduction_dimensions.dimensions[2] % tile_size_x == 0 && + // As XLA unroll and suppose LLVM will vectorize, + // disable the unroll for case that LLVM doesn't vectorize. + !MayPreventVectorization(*unnested_hlo, /*tolerate_reduce*/true)) { + vec_stride = 2; + } else { + indexing_order = KernelMappingScheme::DilatedIndexingX; + } } if (env) { vec_stride = atoi(env); + if (vec_stride > 1) { + indexing_order = KernelMappingScheme::LinearDilatedIndexingX; + } } KernelMappingScheme mapping_scheme( reduction_dimensions.dimensions, From 66f52846a68f01dc2442aad8a69b459cdc4d39d7 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 19 Feb 2020 08:04:29 -0800 Subject: [PATCH 08/52] Rename a variable --- .../compiler/xla/service/gpu/ir_emitter_unnested.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 7beb5ddbe50..f9958547a59 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -3255,21 +3255,21 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( int tile_size_x = reduction_tiling[2] * num_threads_x; - int vec_stride = 1; - char* env = getenv("VEC_STRIDE"); + int vector_size = 1; + char* env = getenv("VECTOR_SIZE"); if (indexing_order == KernelMappingScheme::LinearDilatedIndexingX) { if (reduction_dimensions.dimensions[2] % tile_size_x == 0 && // As XLA unroll and suppose LLVM will vectorize, // disable the unroll for case that LLVM doesn't vectorize. 
!MayPreventVectorization(*unnested_hlo, /*tolerate_reduce*/true)) { - vec_stride = 2; + vector_size = 2; } else { indexing_order = KernelMappingScheme::DilatedIndexingX; } } if (env) { - vec_stride = atoi(env); - if (vec_stride > 1) { + vector_size = atoi(env); + if (vector_size > 1) { indexing_order = KernelMappingScheme::LinearDilatedIndexingX; } } @@ -3277,7 +3277,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( reduction_dimensions.dimensions, {reduction_tiling[0], reduction_tiling[1] * num_threads_y, tile_size_x}, num_threads_y, num_threads_x, indexing_order, - vec_stride); + vector_size); return ReductionCodegenInfo(mapping_scheme, reduction_dimensions.is_row_reduction); } From fbfdfbbd5a668d509c84a8213217768768ef053e Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 24 Feb 2020 08:30:31 -0800 Subject: [PATCH 09/52] Enable vectorization for even row size by default. --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index f9958547a59..46bf010b65f 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -3258,7 +3258,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( int vector_size = 1; char* env = getenv("VECTOR_SIZE"); if (indexing_order == KernelMappingScheme::LinearDilatedIndexingX) { - if (reduction_dimensions.dimensions[2] % tile_size_x == 0 && + if (reduction_dimensions.dimensions[2] % 2 == 0 && // As XLA unroll and suppose LLVM will vectorize, // disable the unroll for case that LLVM doesn't vectorize. !MayPreventVectorization(*unnested_hlo, /*tolerate_reduce*/true)) { From ec3edaf0277041350ad312e477db48266cbd860f Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 24 Feb 2020 08:45:18 -0800 Subject: [PATCH 10/52] Remove not working code path for odd row size. --- .../xla/service/gpu/ir_emitter_unnested.cc | 62 ++----------------- 1 file changed, 6 insertions(+), 56 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 46bf010b65f..3e8703134d2 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2042,73 +2042,23 @@ void IrEmitterUnnested::EmitTile( }; char * env_if = getenv("RED_IF"); - int red_if = 1; + int red_if = 0; if (env_if) { red_if = atoi(env_if); printf("RED_IF2 = %d %s\n", red_if, env_if); } - if (red_if == 1) { + if (red_if == 1 || x_tile_fits) { std::cout << "IF_NB 1: " << std::endl; unroll(!x_tile_fits, x_num_steps, vec_stride); - } else if (red_if == 2) { + } else { std::cout << "IF_NB 2" << std::endl; ksl->If(loop_name + "_is_full_tile", - //b->CreateICmpULT(last_element, tile_width), - // If (the thread fully unrolled) {no condition path} else {condition path} + // if (block fully fit) {fast path} else {slow path} + // tile_width is always exact. For the last block, + // it will be the exact number of elements left. 
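+                // Roughly, the structure emitted here is:
+                //   if (tile_size_x == tile_width) {   // whole tile in bounds
+                //     unroll(/*add_if=*/false, ...);   // no per-element guard,
+                //                                      // so LLVM may vectorize
+                //   } else {                           // partial (last) tile
+                //     unroll(/*add_if=*/true, ...);    // per-element bounds check
+                //   }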
b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeFor(2)), tile_width), [&] {unroll(false, x_num_steps, vec_stride);}, [&] {unroll(true, x_num_steps, vec_stride);}); - } else { - std::cout << "IF_NB 3" << std::endl; - //b->CreateICmpULT(start_offset_x+j * step_x * vec_stride + i, tile_width) - int last_block_left_element = mapping_scheme.GetDimsInElems()[2] % x_num_steps; - std::cout << "MAPPING " << mapping_scheme.GetDimsInElems()[0] << " " - << mapping_scheme.GetDimsInElems()[1] << " " - << mapping_scheme.GetDimsInElems()[2] << std::endl; - std::cout << "LAST_BLOCK x_num_steps " << x_num_steps - << " last_block" << last_block_left_element << std::endl; - // NB block per reduction. - int nb_block = CeilOfRatio(mapping_scheme.GetDimsInElems()[2], - tile_size_x); - std::cout << "NB_BLOCK" << nb_block << std::endl; - if (x_tile_fits) { - // All threads will completly unroll - unroll(false, x_num_steps, vec_stride); - } else if(nb_block == 1) { - // No thread will completly unroll. - // TODO: unroll by the right amount - unroll(true, x_num_steps, vec_stride); - } else { - // For some blocks, all threads will will completly unroll. - // For other blocks, some of its threads will completly unroll, others will partially and some won't be used. - // So do an if(thread fully unroll) {code with no if between elements} else {code with if between each elements} - // TODO: in the else part, unroll without if but with the right number of elements left. - - llvm::Value* block_id = gpu::EmitCallToTargetIntrinsic( - gpu::TargetIntrinsicID::kBlockIdx, {}, {}, &b_); - llvm::Value* last_element = b_.CreateAdd(constant(x_num_steps * tile_size_x), - start_offset_x, "last_element"); - int x_num_steps_partial = mapping_scheme.GetDimsInElems()[2] % tile_size_x; - x_num_steps_partial *= 2; - //x_num_steps_partial = x_num_steps; - ksl->If(loop_name + "_is_full_tile", - // Test if all the elements of this thread is withing tile. - b_.CreateICmpULT(last_element, tile_width), - // Not the last block, so unroll without ifs. - [&] {unroll(false, x_num_steps, vec_stride);}, - // The last block isn't completly unrolled. - - // TODO: unroll the right size. Take care - // vec_stride must match the above unroll for - // now. - // TODO: after unroll of the right size, remove the IFs. 
- // ONGOING, try to make it work with less - // then unroll x_num_steps - - [&] {unroll(true, x_num_steps, vec_stride);}); // works - //[&] {unroll(true, x_num_steps, x_num_steps);}); - //[&] {unroll(true, x_num_steps_partial, vec_stride);}); - } } }}); } From e98822187cd53a75b2fb038c404333eba608f511 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 24 Feb 2020 09:57:02 -0800 Subject: [PATCH 11/52] Add test --- .../compiler/xla/service/gpu/tests/BUILD | 27 +++ .../gpu/tests/reduction_vectorization_test.cc | 192 ++++++++++++++++++ 2 files changed, 219 insertions(+) create mode 100644 tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD index 1fd51c78988..e04dba418d9 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD @@ -164,6 +164,33 @@ tf_cc_test( ], ) +tf_cc_test( + name = "reduction_vectorization_test", + srcs = [ + "reduction_vectorization_test.cc", + ], + tags = tf_cuda_tests_tags() + ["no_rocm"], + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/service:gpu_plugin", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/service/gpu:gemm_rewriter", + "//tensorflow/compiler/xla/service/gpu:gpu_executable", + "//tensorflow/compiler/xla/tests:filecheck", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:llvm_irgen_test_base", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/memory", + ], +) + tf_cc_test( name = "reduction_dimension_grouper_test", srcs = [ diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc new file mode 100644 index 00000000000..470696e003b --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -0,0 +1,192 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/tests/filecheck.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace xla { +namespace gpu { + +namespace { + +class ReductionVectorizationTest : public GpuCodegenTest { + + protected: + void EnsureDeterminism(absl::string_view hlo_text) { + std::vector profiles; + profiles.emplace_back(); + profiles.emplace_back(); + EXPECT_TRUE(RunMultipleTimes(hlo_text, + /*run_hlo_passes=*/true, + /*profiles=*/&profiles, + /*backend_config=*/"", + /*assert_determinism=*/true)); + } +}; + +TEST_F(ReductionVectorizationTest, Power2) { + const char* hlo_text = R"( +HloModule ReducePower2 + +%max_ (x.5: f32[], y.6: f32[]) -> f32[] { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y) +} + +ENTRY %cluster_0__XlaCompiledKernel_true__XlaNumConstantArgs_0__XlaNumResourceArgs_0_.25 (param_0: f32[5,131072]) -> f32[5]{0} { + %param_0 = f32[5,131072]{1,0} parameter(0), parameter_replication={false} + %constant.3 = f32[] constant(0) + ROOT %reduce.8 = f32[5]{0} reduce(f32[5,131072]{1,0} %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + CompileAndOptionallyVerifyPtx(std::move(optimized_module), + R"( +CHECK: ld.global.nc.v2.f32 +)"); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +TEST_F(ReductionVectorizationTest, TileFit) { + const char* hlo_text = R"( +HloModule ReduceTileFit + +%max_ (x.5: f32[], y.6: f32[]) -> f32[] { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y) +} + +ENTRY %cluster_0__XlaCompiledKernel_true__XlaNumConstantArgs_0__XlaNumResourceArgs_0_.25 (param_0: f32[5,122880]) -> f32[5]{0} { + %param_0 = f32[5,122880]{1,0} parameter(0), parameter_replication={false} + %constant.3 = f32[] constant(0) + ROOT %reduce.8 = f32[5]{0} reduce(f32[5,122880]{1,0} %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + CompileAndOptionallyVerifyPtx(std::move(optimized_module), + R"( +CHECK: ld.global.nc.v2.f32 +)"); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +TEST_F(ReductionVectorizationTest, DisableOddColumns) { + const char* hlo_text = R"( +HloModule ReduceTileFit + +%max_ { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %maximum.7 = f32[] maximum(%x, %y) +} + +ENTRY %cluster_0__XlaCompiledKernel_true__XlaNumConstantArgs_0__XlaNumResourceArgs_0_.25 (param_0: f32[5,131071]) -> f32[5]{0} { + %param_0 = f32[5,131071]{1,0} parameter(0), parameter_replication={false} + %constant.3 = f32[] constant(0) + ROOT %reduce.8 = f32[5]{0} reduce(f32[5,131071]{1,0} %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ +} +)"; + 
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + CompileAndOptionallyVerifyPtx(std::move(optimized_module), + R"( +CHECK-NOT: ld.global.nc.v2.f32 +CHECK-NOT: ld.global.nc.v4.f32 +CHECK-NOT: ld.global.nc.u64 +CHECK-NOT: ld.global.u64 +)"); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +TEST_F(ReductionVectorizationTest, Exp) { + const char* hlo_text = R"( +HloModule DisableSin + +%add_float { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add.17 = f32[] add(f32[] %x, f32[] %y) +} + +ENTRY %cluster_0 { + %arg0.1 = f32[5,131072]{1,0} parameter(0) + %sine = f32[5,131072]{1,0} exponential(f32[5,131072]{1,0} %arg0.1) + %constant.0 = f32[] constant(0) + ROOT %reduce.18 = f32[5]{0} reduce(f32[5,131072]{1,0} %sine, f32[] %constant.0), dimensions={1}, to_apply=%add_float +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + CompileAndOptionallyVerifyPtx(std::move(optimized_module), + R"( +CHECK: ld.global.nc.v2.f32 +)"); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +TEST_F(ReductionVectorizationTest, DisableSin) { + const char* hlo_text = R"( +HloModule DisableSin + +%add_float { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add.17 = f32[] add(f32[] %x, f32[] %y) +} + +ENTRY %cluster_0 { + %arg0.1 = f32[5,131072]{1,0} parameter(0) + %sine = f32[5,131072]{1,0} sine(f32[5,131072]{1,0} %arg0.1) + %constant.0 = f32[] constant(0) + ROOT %reduce.18 = f32[5]{0} reduce(f32[5,131072]{1,0} %sine, f32[] %constant.0), dimensions={1}, to_apply=%add_float +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + CompileAndOptionallyVerifyPtx(std::move(optimized_module), + R"( +CHECK-NOT: ld.global.nc.v2.f32 +CHECK-NOT: ld.global.nc.v4.f32 +CHECK-NOT: ld.global.nc.u64 +CHECK-NOT: ld.global.u64 +)"); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +} // namespace +} // namespace gpu +} // namespace xla From b23e1c4708f04543edf1cd2f5efaa0438ac5d789 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 26 Feb 2020 13:39:06 -0800 Subject: [PATCH 12/52] Better test --- .../gpu/tests/reduction_vectorization_test.cc | 81 ++++++++++++++----- 1 file changed, 62 insertions(+), 19 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc index 470696e003b..d811eeb52ae 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -51,16 +51,16 @@ TEST_F(ReductionVectorizationTest, Power2) { const char* hlo_text = R"( HloModule ReducePower2 -%max_ (x.5: f32[], y.6: f32[]) -> f32[] { +%max_ { %x = f32[] parameter(0) %y = f32[] parameter(1) ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y) } -ENTRY %cluster_0__XlaCompiledKernel_true__XlaNumConstantArgs_0__XlaNumResourceArgs_0_.25 (param_0: f32[5,131072]) -> f32[5]{0} { - %param_0 = f32[5,131072]{1,0} parameter(0), parameter_replication={false} +ENTRY %main { + %param_0 = f32[5,131072] parameter(0) %constant.3 = f32[] constant(0) - ROOT %reduce.8 = f32[5]{0} reduce(f32[5,131072]{1,0} %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ + ROOT %reduce.8 = f32[5] reduce(f32[5,131072] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ } )"; 
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, @@ -68,6 +68,9 @@ ENTRY %cluster_0__XlaCompiledKernel_true__XlaNumConstantArgs_0__XlaNumResourceAr CompileAndOptionallyVerifyPtx(std::move(optimized_module), R"( CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 )"); EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); @@ -77,23 +80,63 @@ TEST_F(ReductionVectorizationTest, TileFit) { const char* hlo_text = R"( HloModule ReduceTileFit -%max_ (x.5: f32[], y.6: f32[]) -> f32[] { +%max_ { %x = f32[] parameter(0) %y = f32[] parameter(1) ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y) } -ENTRY %cluster_0__XlaCompiledKernel_true__XlaNumConstantArgs_0__XlaNumResourceArgs_0_.25 (param_0: f32[5,122880]) -> f32[5]{0} { - %param_0 = f32[5,122880]{1,0} parameter(0), parameter_replication={false} +ENTRY %main { + %param_0 = f32[5,122880] parameter(0) %constant.3 = f32[] constant(0) - ROOT %reduce.8 = f32[5]{0} reduce(f32[5,122880]{1,0} %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ + ROOT %reduce.8 = f32[5] reduce(f32[5,122880] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ } )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, ParseAndReturnVerifiedModule(hlo_text)); CompileAndOptionallyVerifyPtx(std::move(optimized_module), R"( +// TODO: Make this a vectorized load +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +)"); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +TEST_F(ReductionVectorizationTest, EvenColums) { + const char* hlo_text = R"( +HloModule ReducePower2 + +%max_ { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y) +} + +ENTRY %main { + %param_0 = f32[5,131070] parameter(0) + %constant.3 = f32[] constant(0) + ROOT %reduce.8 = f32[5] reduce(f32[5,131070] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + CompileAndOptionallyVerifyPtx(std::move(optimized_module), + R"( +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +// TODO: Make this a vectorized load +CHECK-NOT: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK-NOT: ld.global.nc.v2.f32 +// TODO: find how to modify LLVM/opt to get this merged? vectorize before some loop opt? 
+CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 )"); EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); @@ -109,10 +152,10 @@ HloModule ReduceTileFit ROOT %maximum.7 = f32[] maximum(%x, %y) } -ENTRY %cluster_0__XlaCompiledKernel_true__XlaNumConstantArgs_0__XlaNumResourceArgs_0_.25 (param_0: f32[5,131071]) -> f32[5]{0} { - %param_0 = f32[5,131071]{1,0} parameter(0), parameter_replication={false} +ENTRY %main { + %param_0 = f32[5,131071] parameter(0) %constant.3 = f32[] constant(0) - ROOT %reduce.8 = f32[5]{0} reduce(f32[5,131071]{1,0} %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ + ROOT %reduce.8 = f32[5] reduce(f32[5,131071] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ } )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, @@ -138,11 +181,11 @@ HloModule DisableSin ROOT %add.17 = f32[] add(f32[] %x, f32[] %y) } -ENTRY %cluster_0 { - %arg0.1 = f32[5,131072]{1,0} parameter(0) - %sine = f32[5,131072]{1,0} exponential(f32[5,131072]{1,0} %arg0.1) +ENTRY %main { + %arg0.1 = f32[5,131072] parameter(0) + %sine = f32[5,131072] exponential(f32[5,131072] %arg0.1) %constant.0 = f32[] constant(0) - ROOT %reduce.18 = f32[5]{0} reduce(f32[5,131072]{1,0} %sine, f32[] %constant.0), dimensions={1}, to_apply=%add_float + ROOT %reduce.18 = f32[5] reduce(f32[5,131072] %sine, f32[] %constant.0), dimensions={1}, to_apply=%add_float } )"; @@ -166,11 +209,11 @@ HloModule DisableSin ROOT %add.17 = f32[] add(f32[] %x, f32[] %y) } -ENTRY %cluster_0 { - %arg0.1 = f32[5,131072]{1,0} parameter(0) - %sine = f32[5,131072]{1,0} sine(f32[5,131072]{1,0} %arg0.1) +ENTRY %main { + %arg0.1 = f32[5,131072] parameter(0) + %sine = f32[5,131072] sine(f32[5,131072] %arg0.1) %constant.0 = f32[] constant(0) - ROOT %reduce.18 = f32[5]{0} reduce(f32[5,131072]{1,0} %sine, f32[] %constant.0), dimensions={1}, to_apply=%add_float + ROOT %reduce.18 = f32[5] reduce(f32[5,131072] %sine, f32[] %constant.0), dimensions={1}, to_apply=%add_float } )"; From 601ad82b6b620053dea3c260c73ad81c1eea4d32 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 24 Feb 2020 12:51:49 -0800 Subject: [PATCH 13/52] Rename a variable --- .../xla/service/gpu/ir_emitter_unnested.cc | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 3e8703134d2..6733183644f 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1985,7 +1985,7 @@ void IrEmitterUnnested::EmitTile( // // TODO(cheshire): Once ptxas is fixed and TF switches to it, remove the // workaround. - int vec_stride = mapping_scheme.GetVectorSize(); + int vector_size = mapping_scheme.GetVectorSize(); ksl->For( loop_name + "_y_in_tile", /*start=*/constant(0), @@ -1993,10 +1993,11 @@ void IrEmitterUnnested::EmitTile( ceil_of_ratio(b_.CreateSub(tile_height, thread_id_info.thread_id_y), num_threads_y), /*step=*/constant(1), [&](llvm::Value* y_indvar) { - printf("VEC_STRIDE %d\n", vec_stride); + printf("VEC_STRIDE %d\n", vector_size); llvm::Value* y_loc = b_.CreateAdd( thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); - if (vec_stride == 1) { + // TODO: can I get rid of this conditions? 
+ if (vector_size == 1) { for (int64 j = 0; j < x_num_steps; j++) { llvm::Value* x_loc = b_.CreateAdd(constant(j * step_x), start_offset_x, "x_loc"); @@ -2014,17 +2015,17 @@ void IrEmitterUnnested::EmitTile( } } } else { - auto unroll = [&](bool add_if, int64 max_element, int64 vec_stride) { - for (int64 j = 0; j < x_num_steps/vec_stride; j++) { + auto unroll = [&](bool add_if, int64 max_element, int64 vector_size) { + for (int64 j = 0; j < x_num_steps/vector_size; j++) { //Prep some values. If we do not do this, LLVM doesn't vectorize. llvm::Value* x_loc_base = - b_.CreateAdd(constant(j * step_x * vec_stride), start_offset_x, "x_loc_base"); + b_.CreateAdd(constant(j * step_x * vector_size), start_offset_x, "x_loc_base"); IrArray::Index source_idx_x_base = source_idx.AddOffsetToDim(y_loc, kDimY, &b_) - .AddOffsetToDim(constant(j * step_x * vec_stride), kDimX, &b_); + .AddOffsetToDim(constant(j * step_x * vector_size), kDimX, &b_); - for (int i = 0; i < vec_stride; i++) { - int old_j = j * vec_stride + i; + for (int i = 0; i < vector_size; i++) { + int old_j = j * vector_size + i; llvm::Value* x_loc = b_.CreateAdd(constant(i), x_loc_base, "x_loc"); IrArray::Index source_idx_x = source_idx_x_base.AddOffsetToDim( constant(i), kDimX, &b_); @@ -2049,7 +2050,7 @@ void IrEmitterUnnested::EmitTile( } if (red_if == 1 || x_tile_fits) { std::cout << "IF_NB 1: " << std::endl; - unroll(!x_tile_fits, x_num_steps, vec_stride); + unroll(!x_tile_fits, x_num_steps, vector_size); } else { std::cout << "IF_NB 2" << std::endl; ksl->If(loop_name + "_is_full_tile", @@ -2057,8 +2058,8 @@ void IrEmitterUnnested::EmitTile( // tile_width is always exact. For the last block, // it will be the exact number of elements left. b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeFor(2)), tile_width), - [&] {unroll(false, x_num_steps, vec_stride);}, - [&] {unroll(true, x_num_steps, vec_stride);}); + [&] {unroll(false, x_num_steps, vector_size);}, + [&] {unroll(true, x_num_steps, vector_size);}); } }}); } From 3b8c3030b95463b60caf2b9686fa456f14d326b0 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 24 Feb 2020 12:58:41 -0800 Subject: [PATCH 14/52] Code simplification --- .../xla/service/gpu/ir_emitter_unnested.cc | 29 +++++-------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 6733183644f..310dd1e266e 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1996,25 +1996,7 @@ void IrEmitterUnnested::EmitTile( printf("VEC_STRIDE %d\n", vector_size); llvm::Value* y_loc = b_.CreateAdd( thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); - // TODO: can I get rid of this conditions? - if (vector_size == 1) { - for (int64 j = 0; j < x_num_steps; j++) { - llvm::Value* x_loc = - b_.CreateAdd(constant(j * step_x), start_offset_x, "x_loc"); - IrArray::Index source_idx_x = - source_idx.AddOffsetToDim(y_loc, kDimY, &b_) - .AddOffsetToDim(constant(j * step_x), kDimX, &b_); - auto emit_element = [&] { - return emit_elem_function(source_idx_x, y_loc, x_loc, j); - }; - if (!x_tile_fits) { - ksl->If(loop_name + "_x_in_tile", - b_.CreateICmpULT(x_loc, tile_width), emit_element); - } else { - emit_element(); - } - } - } else { + { auto unroll = [&](bool add_if, int64 max_element, int64 vector_size) { for (int64 j = 0; j < x_num_steps/vector_size; j++) { //Prep some values. 
If we do not do this, LLVM doesn't vectorize. @@ -2026,9 +2008,12 @@ void IrEmitterUnnested::EmitTile( for (int i = 0; i < vector_size; i++) { int old_j = j * vector_size + i; - llvm::Value* x_loc = b_.CreateAdd(constant(i), x_loc_base, "x_loc"); - IrArray::Index source_idx_x = source_idx_x_base.AddOffsetToDim( - constant(i), kDimX, &b_); + llvm::Value* x_loc = x_loc_base; + IrArray::Index source_idx_x = source_idx_x_base; + if (i > 0) { + x_loc = b_.CreateAdd(constant(i), x_loc_base, "x_loc"); + source_idx_x = source_idx_x_base.AddOffsetToDim(constant(i), kDimX, &b_); + } auto emit_element = [&] { return emit_elem_function(source_idx_x, y_loc, x_loc, j); }; From 407ed60524ab2e750158fee3e3e7728ab319d0b9 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 24 Feb 2020 12:59:25 -0800 Subject: [PATCH 15/52] Remove useless indentation --- .../xla/service/gpu/ir_emitter_unnested.cc | 51 +++++++++---------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 310dd1e266e..a1cf8f248f9 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1996,15 +1996,14 @@ void IrEmitterUnnested::EmitTile( printf("VEC_STRIDE %d\n", vector_size); llvm::Value* y_loc = b_.CreateAdd( thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); - { - auto unroll = [&](bool add_if, int64 max_element, int64 vector_size) { + auto unroll = [&](bool add_if, int64 max_element, int64 vector_size) { for (int64 j = 0; j < x_num_steps/vector_size; j++) { //Prep some values. If we do not do this, LLVM doesn't vectorize. llvm::Value* x_loc_base = - b_.CreateAdd(constant(j * step_x * vector_size), start_offset_x, "x_loc_base"); + b_.CreateAdd(constant(j * step_x * vector_size), start_offset_x, "x_loc_base"); IrArray::Index source_idx_x_base = - source_idx.AddOffsetToDim(y_loc, kDimY, &b_) - .AddOffsetToDim(constant(j * step_x * vector_size), kDimX, &b_); + source_idx.AddOffsetToDim(y_loc, kDimY, &b_) + .AddOffsetToDim(constant(j * step_x * vector_size), kDimX, &b_); for (int i = 0; i < vector_size; i++) { int old_j = j * vector_size + i; @@ -2025,28 +2024,28 @@ void IrEmitterUnnested::EmitTile( } } } - }; + }; - char * env_if = getenv("RED_IF"); - int red_if = 0; - if (env_if) { - red_if = atoi(env_if); - printf("RED_IF2 = %d %s\n", red_if, env_if); - } - if (red_if == 1 || x_tile_fits) { - std::cout << "IF_NB 1: " << std::endl; - unroll(!x_tile_fits, x_num_steps, vector_size); - } else { - std::cout << "IF_NB 2" << std::endl; - ksl->If(loop_name + "_is_full_tile", - // if (block fully fit) {fast path} else {slow path} - // tile_width is always exact. For the last block, - // it will be the exact number of elements left. - b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeFor(2)), tile_width), - [&] {unroll(false, x_num_steps, vector_size);}, - [&] {unroll(true, x_num_steps, vector_size);}); - } - }}); + char * env_if = getenv("RED_IF"); + int red_if = 0; + if (env_if) { + red_if = atoi(env_if); + printf("RED_IF2 = %d %s\n", red_if, env_if); + } + if (red_if == 1 || x_tile_fits) { + std::cout << "IF_NB 1: " << std::endl; + unroll(!x_tile_fits, x_num_steps, vector_size); + } else { + std::cout << "IF_NB 2" << std::endl; + ksl->If(loop_name + "_is_full_tile", + // if (block fully fit) {fast path} else {slow path} + // tile_width is always exact. 
For the last block, + // it will be the exact number of elements left. + b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeFor(2)), tile_width), + [&] {unroll(false, x_num_steps, vector_size);}, + [&] {unroll(true, x_num_steps, vector_size);}); + } + }); } // Emits code to process a tensor element in a tile for the given kCopy HLO that From 5e84cdf212ed55ea366bb7de5f0d945a1931d0f2 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Tue, 25 Feb 2020 10:22:39 -0800 Subject: [PATCH 16/52] Remove print --- .../xla/service/gpu/ir_emitter_unnested.cc | 34 ++++++++----------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index a1cf8f248f9..c5b4269cdb4 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1993,7 +1993,6 @@ void IrEmitterUnnested::EmitTile( ceil_of_ratio(b_.CreateSub(tile_height, thread_id_info.thread_id_y), num_threads_y), /*step=*/constant(1), [&](llvm::Value* y_indvar) { - printf("VEC_STRIDE %d\n", vector_size); llvm::Value* y_loc = b_.CreateAdd( thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); auto unroll = [&](bool add_if, int64 max_element, int64 vector_size) { @@ -2026,17 +2025,13 @@ void IrEmitterUnnested::EmitTile( } }; - char * env_if = getenv("RED_IF"); - int red_if = 0; - if (env_if) { - red_if = atoi(env_if); - printf("RED_IF2 = %d %s\n", red_if, env_if); - } - if (red_if == 1 || x_tile_fits) { - std::cout << "IF_NB 1: " << std::endl; - unroll(!x_tile_fits, x_num_steps, vector_size); - } else { - std::cout << "IF_NB 2" << std::endl; + if (!x_tile_fits && + mapping_scheme.GetIndexingOrder() == + KernelMappingScheme::LinearDilatedIndexingX) { + // Only try this path when we try to vectorize the loads. + + // Special case when the tile doesn't fit completly for even row size. + // For odd row size every other row isn't aligned, so can't be vectorized. ksl->If(loop_name + "_is_full_tile", // if (block fully fit) {fast path} else {slow path} // tile_width is always exact. For the last block, @@ -2044,7 +2039,10 @@ void IrEmitterUnnested::EmitTile( b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeFor(2)), tile_width), [&] {unroll(false, x_num_steps, vector_size);}, [&] {unroll(true, x_num_steps, vector_size);}); + } else { + unroll(!x_tile_fits, x_num_steps, vector_size); } + }); } @@ -3161,7 +3159,10 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( reduction_dimensions.dimensions[2]); KernelMappingScheme::IndexingOrder indexing_order; - if (reduction_dimensions.is_row_reduction) { + if (reduction_dimensions.is_row_reduction && + // Only try to vectorize+coales memory access for row of even size. + // For odd row size, every other row isn't aligned, so can't be vectorized. 
+ reduction_dimensions.dimensions[2] % 2 == 0) { indexing_order = KernelMappingScheme::LinearDilatedIndexingX; } else if (IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape, reduction_dimensions.dimensions[2])) { @@ -3191,7 +3192,6 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( int tile_size_x = reduction_tiling[2] * num_threads_x; int vector_size = 1; - char* env = getenv("VECTOR_SIZE"); if (indexing_order == KernelMappingScheme::LinearDilatedIndexingX) { if (reduction_dimensions.dimensions[2] % 2 == 0 && // As XLA unroll and suppose LLVM will vectorize, @@ -3202,12 +3202,6 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( indexing_order = KernelMappingScheme::DilatedIndexingX; } } - if (env) { - vector_size = atoi(env); - if (vector_size > 1) { - indexing_order = KernelMappingScheme::LinearDilatedIndexingX; - } - } KernelMappingScheme mapping_scheme( reduction_dimensions.dimensions, {reduction_tiling[0], reduction_tiling[1] * num_threads_y, tile_size_x}, From f0fa15e9466f747bda563e7cd85143707d86e628 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 26 Feb 2020 06:34:27 -0800 Subject: [PATCH 17/52] Fix new latent bug. --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index c5b4269cdb4..be0e605e76d 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2013,7 +2013,7 @@ void IrEmitterUnnested::EmitTile( source_idx_x = source_idx_x_base.AddOffsetToDim(constant(i), kDimX, &b_); } auto emit_element = [&] { - return emit_elem_function(source_idx_x, y_loc, x_loc, j); + return emit_elem_function(source_idx_x, y_loc, x_loc, old_j); }; if (add_if) { ksl->If(loop_name + "_x_in_tile", From 2aac34b5171b304df84147abbb54dd17dbd35ce6 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 2 Mar 2020 14:18:11 -0800 Subject: [PATCH 18/52] NFC: clang-format --- .../xla/service/gpu/ir_emitter_unnested.cc | 77 +++++++++++-------- .../xla/service/gpu/kernel_mapping_scheme.h | 16 ++-- .../gpu/tests/reduction_vectorization_test.cc | 1 - 3 files changed, 53 insertions(+), 41 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index be0e605e76d..bc9a01af7c2 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1860,7 +1860,8 @@ namespace { // Returns true if the fusion contains any instruction that is likely // translated to complex LLVM IR, such as loops, and prevent vectorization. 
-bool MayPreventVectorization(const HloInstruction& hlo, const bool tolerate_reduce=false) { +bool MayPreventVectorization(const HloInstruction& hlo, + const bool tolerate_reduce = false) { if (hlo.opcode() == HloOpcode::kFusion) { return absl::c_any_of(hlo.fused_instructions_computation()->instructions(), [tolerate_reduce](const HloInstruction* instr) { @@ -1923,12 +1924,14 @@ static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, llvm::Value* thread_id_x, llvm::Type* index_ty, llvm::IRBuilder<>* b) { - if (mapping_scheme.GetIndexingOrder() == KernelMappingScheme::DilatedIndexingX) { + if (mapping_scheme.GetIndexingOrder() == + KernelMappingScheme::DilatedIndexingX) { return thread_id_x; } else if (mapping_scheme.GetIndexingOrder() == KernelMappingScheme::LinearDilatedIndexingX) { int vector_size = mapping_scheme.GetVectorSize(); - return b->CreateMul(thread_id_x, llvm::ConstantInt::get(index_ty, vector_size)); + return b->CreateMul(thread_id_x, + llvm::ConstantInt::get(index_ty, vector_size)); } int64 x_num_steps = mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX(); @@ -1958,7 +1961,10 @@ void IrEmitterUnnested::EmitTile( // of threads. // Otherwise, the stride is one, but we multiply each offset by the limit of // number of steps which can be made. - int64 step_x = mapping_scheme.GetIndexingOrder() == KernelMappingScheme::LinearIndexingX? 1 : num_threads_x; + int64 step_x = + mapping_scheme.GetIndexingOrder() == KernelMappingScheme::LinearIndexingX + ? 1 + : num_threads_x; IrArray::Index source_idx = tile_origin_index.AddOffsetToDim(start_offset_x, kDimX, &b_); @@ -1996,13 +2002,15 @@ void IrEmitterUnnested::EmitTile( llvm::Value* y_loc = b_.CreateAdd( thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); auto unroll = [&](bool add_if, int64 max_element, int64 vector_size) { - for (int64 j = 0; j < x_num_steps/vector_size; j++) { - //Prep some values. If we do not do this, LLVM doesn't vectorize. + for (int64 j = 0; j < x_num_steps / vector_size; j++) { + // Prep some values. If we do not do this, LLVM doesn't vectorize. llvm::Value* x_loc_base = - b_.CreateAdd(constant(j * step_x * vector_size), start_offset_x, "x_loc_base"); + b_.CreateAdd(constant(j * step_x * vector_size), start_offset_x, + "x_loc_base"); IrArray::Index source_idx_x_base = - source_idx.AddOffsetToDim(y_loc, kDimY, &b_) - .AddOffsetToDim(constant(j * step_x * vector_size), kDimX, &b_); + source_idx.AddOffsetToDim(y_loc, kDimY, &b_) + .AddOffsetToDim(constant(j * step_x * vector_size), kDimX, + &b_); for (int i = 0; i < vector_size; i++) { int old_j = j * vector_size + i; @@ -2010,7 +2018,8 @@ void IrEmitterUnnested::EmitTile( IrArray::Index source_idx_x = source_idx_x_base; if (i > 0) { x_loc = b_.CreateAdd(constant(i), x_loc_base, "x_loc"); - source_idx_x = source_idx_x_base.AddOffsetToDim(constant(i), kDimX, &b_); + source_idx_x = + source_idx_x_base.AddOffsetToDim(constant(i), kDimX, &b_); } auto emit_element = [&] { return emit_elem_function(source_idx_x, y_loc, x_loc, old_j); @@ -2025,24 +2034,24 @@ void IrEmitterUnnested::EmitTile( } }; - if (!x_tile_fits && - mapping_scheme.GetIndexingOrder() == - KernelMappingScheme::LinearDilatedIndexingX) { + if (!x_tile_fits && mapping_scheme.GetIndexingOrder() == + KernelMappingScheme::LinearDilatedIndexingX) { // Only try this path when we try to vectorize the loads. // Special case when the tile doesn't fit completly for even row size. - // For odd row size every other row isn't aligned, so can't be vectorized. 
+ // For odd row size every other row isn't aligned, so can't be + // vectorized. ksl->If(loop_name + "_is_full_tile", // if (block fully fit) {fast path} else {slow path} // tile_width is always exact. For the last block, // it will be the exact number of elements left. - b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeFor(2)), tile_width), - [&] {unroll(false, x_num_steps, vector_size);}, - [&] {unroll(true, x_num_steps, vector_size);}); + b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeFor(2)), + tile_width), + [&] { unroll(false, x_num_steps, vector_size); }, + [&] { unroll(true, x_num_steps, vector_size); }); } else { unroll(!x_tile_fits, x_num_steps, vector_size); } - }); } @@ -2126,8 +2135,10 @@ static int GetNumberOfPartialResults( if (reduction_info.IsRowReduction()) { return 1; } - int64 num_partial_results = mapping_scheme.GetIndexingOrder() == - KernelMappingScheme::DilatedIndexingX ? 1 : 2; + int64 num_partial_results = + mapping_scheme.GetIndexingOrder() == KernelMappingScheme::DilatedIndexingX + ? 1 + : 2; CHECK_EQ(num_partial_results, (mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX())); return num_partial_results; @@ -2707,12 +2718,13 @@ void IrEmitterUnnested::EmitHlo021Tile( absl::Span reduced_output_dims, absl::Span tiled_param_ids) { constexpr int kNumRows = 4; - KernelMappingScheme mapping_scheme(reduced_output_dims, - /*tile_sizes=*/{1, kWarpSize, kWarpSize}, - /*num_threads_y=*/kNumRows, - /*num_threads_x=*/kWarpSize, - /*indexing_order=*/KernelMappingScheme::LinearIndexingX, - /*vector_size*/1); + KernelMappingScheme mapping_scheme( + reduced_output_dims, + /*tile_sizes=*/{1, kWarpSize, kWarpSize}, + /*num_threads_y=*/kNumRows, + /*num_threads_x=*/kWarpSize, + /*indexing_order=*/KernelMappingScheme::LinearIndexingX, + /*vector_size*/ 1); LaunchDimensions launch_dimensions(mapping_scheme.GetNumberOfBlocks(), mapping_scheme.GetThreadsPerBlock()); llvm::Type* index_type = @@ -3161,11 +3173,13 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( KernelMappingScheme::IndexingOrder indexing_order; if (reduction_dimensions.is_row_reduction && // Only try to vectorize+coales memory access for row of even size. - // For odd row size, every other row isn't aligned, so can't be vectorized. + // For odd row size, every other row isn't aligned, so can't be + // vectorized. reduction_dimensions.dimensions[2] % 2 == 0) { indexing_order = KernelMappingScheme::LinearDilatedIndexingX; - } else if (IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape, - reduction_dimensions.dimensions[2])) { + } else if (IsUnrollingColumnReductionBeneficial( + unnested_hlo, input_shape, + reduction_dimensions.dimensions[2])) { indexing_order = KernelMappingScheme::LinearIndexingX; } else { indexing_order = KernelMappingScheme::DilatedIndexingX; @@ -3196,7 +3210,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( if (reduction_dimensions.dimensions[2] % 2 == 0 && // As XLA unroll and suppose LLVM will vectorize, // disable the unroll for case that LLVM doesn't vectorize. 
- !MayPreventVectorization(*unnested_hlo, /*tolerate_reduce*/true)) { + !MayPreventVectorization(*unnested_hlo, /*tolerate_reduce*/ true)) { vector_size = 2; } else { indexing_order = KernelMappingScheme::DilatedIndexingX; @@ -3205,8 +3219,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( KernelMappingScheme mapping_scheme( reduction_dimensions.dimensions, {reduction_tiling[0], reduction_tiling[1] * num_threads_y, tile_size_x}, - num_threads_y, num_threads_x, indexing_order, - vector_size); + num_threads_y, num_threads_x, indexing_order, vector_size); return ReductionCodegenInfo(mapping_scheme, reduction_dimensions.is_row_reduction); } diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index c8f2864c3e9..d90d75a67c5 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -77,7 +77,11 @@ class KernelMappingScheme { public: enum { DimZ = 0, DimY, DimX, DimTot }; // TODO: rename Dilated to Strided? - enum IndexingOrder { LinearIndexingX, DilatedIndexingX, LinearDilatedIndexingX }; + enum IndexingOrder { + LinearIndexingX, + DilatedIndexingX, + LinearDilatedIndexingX + }; KernelMappingScheme(absl::Span dims_in_elems, absl::Span tile_sizes, int64 num_threads_y, @@ -88,7 +92,7 @@ class KernelMappingScheme { num_threads_x_(num_threads_x), num_threads_y_(num_threads_y), indexing_order_(indexing_order), - vector_size_(vector_size){ + vector_size_(vector_size) { CHECK_EQ(tile_sizes[1] % num_threads_y_, 0); CHECK_EQ(tile_sizes[2] % num_threads_x_, 0); VLOG(10) << "dims_in_elems_ = " << absl::StrJoin(dims_in_elems_, ","); @@ -124,12 +128,8 @@ class KernelMappingScheme { return GetNumThreadsX() * GetNumThreadsY(); } - IndexingOrder GetIndexingOrder() const { - return indexing_order_; - } - int GetVectorSize() const { - return vector_size_; - } + IndexingOrder GetIndexingOrder() const { return indexing_order_; } + int GetVectorSize() const { return vector_size_; } private: // The number of elements in each dimension. diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc index d811eeb52ae..cd3fac63fb8 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -33,7 +33,6 @@ namespace gpu { namespace { class ReductionVectorizationTest : public GpuCodegenTest { - protected: void EnsureDeterminism(absl::string_view hlo_text) { std::vector profiles; From c9d2dea5a84b99ee6e61fc9b6c0b3100bf15b9a1 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Tue, 3 Mar 2020 07:08:12 -0800 Subject: [PATCH 19/52] Better comment --- .../xla/service/gpu/kernel_mapping_scheme.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index d90d75a67c5..e07efb21e9a 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -77,6 +77,12 @@ class KernelMappingScheme { public: enum { DimZ = 0, DimY, DimX, DimTot }; // TODO: rename Dilated to Strided? + // LinearIndexing mean each thread reads consecutive elements. + // DilatedIndexingX mean each thread reads Dilated(Strided) + // elements. 
This conserve memory coalescing. + // LinearDilatedIndexingX mean each thread read a few consecutive + // elements then take a bigger step. The goal is to trigger + // vectorized reads and keep memory coalescing. enum IndexingOrder { LinearIndexingX, DilatedIndexingX, @@ -146,14 +152,8 @@ class KernelMappingScheme { // When num_threads_x threads process a total of tile_size_x // elements in the X dimension of a tile, each threads process - // n=tile_size_x/num_threads_x elements. When indexing_order_ have - // the value LinearIndexingX, the n elements processed by a thread - // are contiguous. When the value is DilatedIndexingX, the n - // elements are dilated by a factor of num_threads_x. When the value - // is LinearDilatedIndexingX, it is a mix of both to enable - // vectorizing while keeping memory coalescing. It reads - // vector_size_ elements, then do a step to the next vector_size_ - // elements. + // n=tile_size_x/num_threads_x elements. + // indexing_order_ define which tile's elements each thread reads. const IndexingOrder indexing_order_; // vector_size_ only supported for row reduction. // Must be a divisor of tile_sizes_[2]/num_threads_x. From dda66aabfc3ad0a41a840b86f3b36cab61fa02c9 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 4 Mar 2020 10:25:42 -0800 Subject: [PATCH 20/52] Use constant to make the code clearer. --- .../xla/service/gpu/ir_emitter_unnested.cc | 48 +++++++++---------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index bc9a01af7c2..11560ca554b 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -106,6 +106,11 @@ const auto kDimY = KernelMappingScheme::DimY; const auto kDimZ = KernelMappingScheme::DimZ; const auto kDimTot = KernelMappingScheme::DimTot; +const auto kLinearIndexingX = KernelMappingScheme::LinearIndexingX; +const auto kDilatedIndexingX = KernelMappingScheme::DilatedIndexingX; +const auto kLinearDilatedIndexingX = + KernelMappingScheme::LinearDilatedIndexingX; + // If a dimensions is smaller than this, untiled transposition may be more // efficient. const int64 kMinDimensionToTransposeTiled = 16; @@ -1924,11 +1929,9 @@ static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, llvm::Value* thread_id_x, llvm::Type* index_ty, llvm::IRBuilder<>* b) { - if (mapping_scheme.GetIndexingOrder() == - KernelMappingScheme::DilatedIndexingX) { + if (mapping_scheme.GetIndexingOrder() == kDilatedIndexingX) { return thread_id_x; - } else if (mapping_scheme.GetIndexingOrder() == - KernelMappingScheme::LinearDilatedIndexingX) { + } else if (mapping_scheme.GetIndexingOrder() == kLinearDilatedIndexingX) { int vector_size = mapping_scheme.GetVectorSize(); return b->CreateMul(thread_id_x, llvm::ConstantInt::get(index_ty, vector_size)); @@ -1962,9 +1965,7 @@ void IrEmitterUnnested::EmitTile( // Otherwise, the stride is one, but we multiply each offset by the limit of // number of steps which can be made. int64 step_x = - mapping_scheme.GetIndexingOrder() == KernelMappingScheme::LinearIndexingX - ? 1 - : num_threads_x; + mapping_scheme.GetIndexingOrder() == kLinearIndexingX ? 
1 : num_threads_x; IrArray::Index source_idx = tile_origin_index.AddOffsetToDim(start_offset_x, kDimX, &b_); @@ -2034,8 +2035,8 @@ void IrEmitterUnnested::EmitTile( } }; - if (!x_tile_fits && mapping_scheme.GetIndexingOrder() == - KernelMappingScheme::LinearDilatedIndexingX) { + if (!x_tile_fits && + mapping_scheme.GetIndexingOrder() == kLinearDilatedIndexingX) { // Only try this path when we try to vectorize the loads. // Special case when the tile doesn't fit completly for even row size. @@ -2136,9 +2137,7 @@ static int GetNumberOfPartialResults( return 1; } int64 num_partial_results = - mapping_scheme.GetIndexingOrder() == KernelMappingScheme::DilatedIndexingX - ? 1 - : 2; + mapping_scheme.GetIndexingOrder() == kDilatedIndexingX ? 1 : 2; CHECK_EQ(num_partial_results, (mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX())); return num_partial_results; @@ -2718,13 +2717,12 @@ void IrEmitterUnnested::EmitHlo021Tile( absl::Span reduced_output_dims, absl::Span tiled_param_ids) { constexpr int kNumRows = 4; - KernelMappingScheme mapping_scheme( - reduced_output_dims, - /*tile_sizes=*/{1, kWarpSize, kWarpSize}, - /*num_threads_y=*/kNumRows, - /*num_threads_x=*/kWarpSize, - /*indexing_order=*/KernelMappingScheme::LinearIndexingX, - /*vector_size*/ 1); + KernelMappingScheme mapping_scheme(reduced_output_dims, + /*tile_sizes=*/{1, kWarpSize, kWarpSize}, + /*num_threads_y=*/kNumRows, + /*num_threads_x=*/kWarpSize, + /*indexing_order=*/kLinearIndexingX, + /*vector_size*/ 1); LaunchDimensions launch_dimensions(mapping_scheme.GetNumberOfBlocks(), mapping_scheme.GetThreadsPerBlock()); llvm::Type* index_type = @@ -3176,16 +3174,16 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( // For odd row size, every other row isn't aligned, so can't be // vectorized. reduction_dimensions.dimensions[2] % 2 == 0) { - indexing_order = KernelMappingScheme::LinearDilatedIndexingX; + indexing_order = kLinearDilatedIndexingX; } else if (IsUnrollingColumnReductionBeneficial( unnested_hlo, input_shape, reduction_dimensions.dimensions[2])) { - indexing_order = KernelMappingScheme::LinearIndexingX; + indexing_order = kLinearIndexingX; } else { - indexing_order = KernelMappingScheme::DilatedIndexingX; + indexing_order = kDilatedIndexingX; } - if (indexing_order == KernelMappingScheme::LinearIndexingX && + if (indexing_order == kLinearIndexingX && !reduction_dimensions.is_row_reduction) { // Vectorized loads: a single thread reduces two adjacent columns. reduction_tiling[2] *= 2; @@ -3206,14 +3204,14 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( int tile_size_x = reduction_tiling[2] * num_threads_x; int vector_size = 1; - if (indexing_order == KernelMappingScheme::LinearDilatedIndexingX) { + if (indexing_order == kLinearDilatedIndexingX) { if (reduction_dimensions.dimensions[2] % 2 == 0 && // As XLA unroll and suppose LLVM will vectorize, // disable the unroll for case that LLVM doesn't vectorize. !MayPreventVectorization(*unnested_hlo, /*tolerate_reduce*/ true)) { vector_size = 2; } else { - indexing_order = KernelMappingScheme::DilatedIndexingX; + indexing_order = kDilatedIndexingX; } } KernelMappingScheme mapping_scheme( From 24ad7db0407ceec0bd31c93484db965add1e03ac Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 4 Mar 2020 10:51:55 -0800 Subject: [PATCH 21/52] Small NFC from review. 
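For intuition, the control flow EmitTile emits for the kLinearStridedIndexingX path can be pictured with the plain C++ sketch below. It is illustrative only: TileBodySketch and its scalar parameters stand in for the LLVM IR values the real emitter works with, and the commented call marks where emit_elem_function would be invoked.

#include <cstdint>

// Mirrors the emitted logic: skip per-element bounds checks when the tile
// is known to be full, otherwise guard every element.
void TileBodySketch(int64_t tile_width, int64_t tile_size_x,
                    int64_t x_num_steps, int64_t step_x, int64_t vector_size,
                    int64_t start_offset_x) {
  auto visit = [&](bool check_x_tile_bounds) {
    for (int64_t j = 0; j < x_num_steps / vector_size; ++j) {
      for (int64_t i = 0; i < vector_size; ++i) {
        int64_t x_loc = start_offset_x + j * step_x * vector_size + i;
        if (!check_x_tile_bounds || x_loc < tile_width) {
          (void)x_loc;  // emit_elem_function(source_idx_x, y_loc, x_loc,
                        //                    j * vector_size + i);
        }
      }
    }
  };
  if (tile_width == tile_size_x) {
    visit(/*check_x_tile_bounds=*/false);  // full tile: branch-free fast path
  } else {
    visit(/*check_x_tile_bounds=*/true);   // partial tile: guard each element
  }
}

The point of the outer branch is that the full-tile body is straight-line code with no data-dependent bounds checks, which is the shape LLVM can turn into vectorized loads.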
--- .../xla/service/gpu/ir_emitter_unnested.cc | 16 ++++++++-------- .../xla/service/gpu/kernel_mapping_scheme.h | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 11560ca554b..6806f235d28 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1866,10 +1866,10 @@ namespace { // Returns true if the fusion contains any instruction that is likely // translated to complex LLVM IR, such as loops, and prevent vectorization. bool MayPreventVectorization(const HloInstruction& hlo, - const bool tolerate_reduce = false) { + bool tolerate_reduce = false) { if (hlo.opcode() == HloOpcode::kFusion) { return absl::c_any_of(hlo.fused_instructions_computation()->instructions(), - [tolerate_reduce](const HloInstruction* instr) { + [&](const HloInstruction* instr) { switch (instr->opcode()) { case HloOpcode::kReduce: return !tolerate_reduce; @@ -2002,7 +2002,7 @@ void IrEmitterUnnested::EmitTile( /*step=*/constant(1), [&](llvm::Value* y_indvar) { llvm::Value* y_loc = b_.CreateAdd( thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); - auto unroll = [&](bool add_if, int64 max_element, int64 vector_size) { + auto unroll = [&](bool add_if, int64 vector_size) { for (int64 j = 0; j < x_num_steps / vector_size; j++) { // Prep some values. If we do not do this, LLVM doesn't vectorize. llvm::Value* x_loc_base = @@ -2046,12 +2046,12 @@ void IrEmitterUnnested::EmitTile( // if (block fully fit) {fast path} else {slow path} // tile_width is always exact. For the last block, // it will be the exact number of elements left. - b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeFor(2)), + b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), tile_width), - [&] { unroll(false, x_num_steps, vector_size); }, - [&] { unroll(true, x_num_steps, vector_size); }); + [&] { unroll(/*add_if=*/false, vector_size); }, + [&] { unroll(/*add_if=*/true, vector_size); }); } else { - unroll(!x_tile_fits, x_num_steps, vector_size); + unroll(/*add_if=*/!x_tile_fits, vector_size); } }); } @@ -2722,7 +2722,7 @@ void IrEmitterUnnested::EmitHlo021Tile( /*num_threads_y=*/kNumRows, /*num_threads_x=*/kWarpSize, /*indexing_order=*/kLinearIndexingX, - /*vector_size*/ 1); + /*vector_size=*/1); LaunchDimensions launch_dimensions(mapping_scheme.GetNumberOfBlocks(), mapping_scheme.GetThreadsPerBlock()); llvm::Type* index_type = diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index e07efb21e9a..1bea776275a 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -158,7 +158,7 @@ class KernelMappingScheme { // vector_size_ only supported for row reduction. // Must be a divisor of tile_sizes_[2]/num_threads_x. // Interesting values are 2 and 4 to trigger vectorized loads on GPUs - // While keeping memory coalescing. + // while keeping memory coalescing. const int vector_size_; }; From d40cb300200bb70e070192c378d2e1511a97b6f4 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 4 Mar 2020 11:14:30 -0800 Subject: [PATCH 22/52] Use strided instead of dilated. This is more know in our community working on the software. 
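In terms of memory access, the renamed orders differ only in which x offsets a thread visits. The standalone sketch below is not an XLA function; it assumes an example configuration of num_threads_x = 32, tile_size_x = 128 (so x_num_steps = 4) and vector_size = 2, and prints the offsets one thread reads under each order.

#include <cstdio>

// LinearIndexingX:        thread t reads x_num_steps consecutive elements.
// StridedIndexingX:       thread t reads elements strided by num_threads_x.
// LinearStridedIndexingX: thread t reads vector_size consecutive elements,
//                         then strides by num_threads_x * vector_size.
void PrintThreadOffsets(int t) {
  const int num_threads_x = 32, tile_size_x = 128, vector_size = 2;
  const int x_num_steps = tile_size_x / num_threads_x;  // 4
  for (int j = 0; j < x_num_steps; ++j)
    std::printf("%d ", t * x_num_steps + j);
  std::printf("(linear)\n");
  for (int j = 0; j < x_num_steps; ++j)
    std::printf("%d ", t + j * num_threads_x);
  std::printf("(strided)\n");
  for (int j = 0; j < x_num_steps / vector_size; ++j)
    for (int i = 0; i < vector_size; ++i)
      std::printf("%d ", t * vector_size + j * num_threads_x * vector_size + i);
  std::printf("(linear strided)\n");
}

int main() { PrintThreadOffsets(1); }

For t = 1 this prints 4 5 6 7 (linear), 1 33 65 97 (strided) and 2 3 66 67 (linear strided); the last pattern keeps adjacent threads coalesced while still giving each thread pairs of consecutive elements that LLVM can load as a vector.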
--- .../xla/service/gpu/ir_emitter_unnested.cc | 22 +++++++++---------- .../xla/service/gpu/kernel_mapping_scheme.h | 13 +++++------ 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 6806f235d28..4cb26e03867 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -107,9 +107,9 @@ const auto kDimZ = KernelMappingScheme::DimZ; const auto kDimTot = KernelMappingScheme::DimTot; const auto kLinearIndexingX = KernelMappingScheme::LinearIndexingX; -const auto kDilatedIndexingX = KernelMappingScheme::DilatedIndexingX; -const auto kLinearDilatedIndexingX = - KernelMappingScheme::LinearDilatedIndexingX; +const auto kStridedIndexingX = KernelMappingScheme::StridedIndexingX; +const auto kLinearStridedIndexingX = + KernelMappingScheme::LinearStridedIndexingX; // If a dimensions is smaller than this, untiled transposition may be more // efficient. @@ -1929,9 +1929,9 @@ static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, llvm::Value* thread_id_x, llvm::Type* index_ty, llvm::IRBuilder<>* b) { - if (mapping_scheme.GetIndexingOrder() == kDilatedIndexingX) { + if (mapping_scheme.GetIndexingOrder() == kStridedIndexingX) { return thread_id_x; - } else if (mapping_scheme.GetIndexingOrder() == kLinearDilatedIndexingX) { + } else if (mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { int vector_size = mapping_scheme.GetVectorSize(); return b->CreateMul(thread_id_x, llvm::ConstantInt::get(index_ty, vector_size)); @@ -2036,7 +2036,7 @@ void IrEmitterUnnested::EmitTile( }; if (!x_tile_fits && - mapping_scheme.GetIndexingOrder() == kLinearDilatedIndexingX) { + mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { // Only try this path when we try to vectorize the loads. // Special case when the tile doesn't fit completly for even row size. @@ -2137,7 +2137,7 @@ static int GetNumberOfPartialResults( return 1; } int64 num_partial_results = - mapping_scheme.GetIndexingOrder() == kDilatedIndexingX ? 1 : 2; + mapping_scheme.GetIndexingOrder() == kStridedIndexingX ? 1 : 2; CHECK_EQ(num_partial_results, (mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX())); return num_partial_results; @@ -3174,13 +3174,13 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( // For odd row size, every other row isn't aligned, so can't be // vectorized. reduction_dimensions.dimensions[2] % 2 == 0) { - indexing_order = kLinearDilatedIndexingX; + indexing_order = kLinearStridedIndexingX; } else if (IsUnrollingColumnReductionBeneficial( unnested_hlo, input_shape, reduction_dimensions.dimensions[2])) { indexing_order = kLinearIndexingX; } else { - indexing_order = kDilatedIndexingX; + indexing_order = kStridedIndexingX; } if (indexing_order == kLinearIndexingX && @@ -3204,14 +3204,14 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( int tile_size_x = reduction_tiling[2] * num_threads_x; int vector_size = 1; - if (indexing_order == kLinearDilatedIndexingX) { + if (indexing_order == kLinearStridedIndexingX) { if (reduction_dimensions.dimensions[2] % 2 == 0 && // As XLA unroll and suppose LLVM will vectorize, // disable the unroll for case that LLVM doesn't vectorize. 
!MayPreventVectorization(*unnested_hlo, /*tolerate_reduce*/ true)) { vector_size = 2; } else { - indexing_order = kDilatedIndexingX; + indexing_order = kStridedIndexingX; } } KernelMappingScheme mapping_scheme( diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index 1bea776275a..67e1c3fc546 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -76,17 +76,16 @@ namespace gpu { class KernelMappingScheme { public: enum { DimZ = 0, DimY, DimX, DimTot }; - // TODO: rename Dilated to Strided? // LinearIndexing mean each thread reads consecutive elements. - // DilatedIndexingX mean each thread reads Dilated(Strided) - // elements. This conserve memory coalescing. - // LinearDilatedIndexingX mean each thread read a few consecutive + // StridedIndexingX mean each thread reads strided elements. + // This conserve memory coalescing. + // LinearStridedIndexingX mean each thread read a few consecutive // elements then take a bigger step. The goal is to trigger // vectorized reads and keep memory coalescing. enum IndexingOrder { LinearIndexingX, - DilatedIndexingX, - LinearDilatedIndexingX + StridedIndexingX, + LinearStridedIndexingX }; KernelMappingScheme(absl::Span dims_in_elems, @@ -103,7 +102,7 @@ class KernelMappingScheme { CHECK_EQ(tile_sizes[2] % num_threads_x_, 0); VLOG(10) << "dims_in_elems_ = " << absl::StrJoin(dims_in_elems_, ","); if (indexing_order != LinearIndexingX) { - // DilatedIndexingX, and LinearDilatedIndexingX + // StridedIndexingX, and LinearStridedIndexingX // is for the purpose of vectorization, which requires // GetTileSizeFor(DimX) to be a multiplier of num_threads_x_. CHECK_EQ(GetTileSizeFor(DimX) % num_threads_x_, 0); From ecac846c420d6c231d280bb87cb26eaf1250aa83 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 4 Mar 2020 11:53:51 -0800 Subject: [PATCH 23/52] NFC: Rename a variable --- .../xla/service/gpu/ir_emitter_unnested.cc | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 4cb26e03867..11912e056e7 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2002,7 +2002,8 @@ void IrEmitterUnnested::EmitTile( /*step=*/constant(1), [&](llvm::Value* y_indvar) { llvm::Value* y_loc = b_.CreateAdd( thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); - auto unroll = [&](bool add_if, int64 vector_size) { + auto unroll = [&](bool add_index_boundary_condition, + int64 vector_size) { for (int64 j = 0; j < x_num_steps / vector_size; j++) { // Prep some values. If we do not do this, LLVM doesn't vectorize. llvm::Value* x_loc_base = @@ -2025,7 +2026,7 @@ void IrEmitterUnnested::EmitTile( auto emit_element = [&] { return emit_elem_function(source_idx_x, y_loc, x_loc, old_j); }; - if (add_if) { + if (add_index_boundary_condition) { ksl->If(loop_name + "_x_in_tile", b_.CreateICmpULT(x_loc, tile_width), emit_element); } else { @@ -2048,10 +2049,14 @@ void IrEmitterUnnested::EmitTile( // it will be the exact number of elements left. 
b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), tile_width), - [&] { unroll(/*add_if=*/false, vector_size); }, - [&] { unroll(/*add_if=*/true, vector_size); }); + [&] { + unroll(/*add_index_boundary_condition=*/false, vector_size); + }, + [&] { + unroll(/*add_index_boundary_condition=*/true, vector_size); + }); } else { - unroll(/*add_if=*/!x_tile_fits, vector_size); + unroll(/*add_index_boundary_condition=*/!x_tile_fits, vector_size); } }); } From 150c2abcb33526e05aa168198a525efd7d95c55e Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 4 Mar 2020 11:55:02 -0800 Subject: [PATCH 24/52] Simplify the code. The condition is only used in one of the path, so no need to test it. --- .../compiler/xla/service/gpu/ir_emitter_unnested.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 11912e056e7..177ce7be558 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1865,14 +1865,13 @@ namespace { // Returns true if the fusion contains any instruction that is likely // translated to complex LLVM IR, such as loops, and prevent vectorization. -bool MayPreventVectorization(const HloInstruction& hlo, - bool tolerate_reduce = false) { +bool MayPreventVectorization(const HloInstruction& hlo) { if (hlo.opcode() == HloOpcode::kFusion) { return absl::c_any_of(hlo.fused_instructions_computation()->instructions(), [&](const HloInstruction* instr) { switch (instr->opcode()) { case HloOpcode::kReduce: - return !tolerate_reduce; + return false; case HloOpcode::kReduceWindow: case HloOpcode::kSort: case HloOpcode::kDot: @@ -1900,7 +1899,7 @@ bool MayPreventVectorization(const HloInstruction& hlo, return false; } } else if (hlo.opcode() == HloOpcode::kReduce) { - return !tolerate_reduce; + return false; } return true; } @@ -3213,7 +3212,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( if (reduction_dimensions.dimensions[2] % 2 == 0 && // As XLA unroll and suppose LLVM will vectorize, // disable the unroll for case that LLVM doesn't vectorize. - !MayPreventVectorization(*unnested_hlo, /*tolerate_reduce*/ true)) { + !MayPreventVectorization(*unnested_hlo)) { vector_size = 2; } else { indexing_order = kStridedIndexingX; From 865f3600b7f7c5b6ee096ef12f528318ff975c44 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 4 Mar 2020 12:13:41 -0800 Subject: [PATCH 25/52] Remove a condition that is optimized by LLVM and doesn't remove vectorization. 
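The removed branch only avoided emitting an add of the constant 0 for the first element of each group, and LLVM's standard simplifications fold an add of zero away, so the unconditional form ends up generating the same code. A schematic C++ analogue (illustrative, not the emitter itself):

#include <cstdint>

// Before: skip the add for i == 0.
int64_t XLocWithBranch(int64_t x_loc_base, int i) {
  return (i > 0) ? x_loc_base + i : x_loc_base;
}

// After: always add; for i == 0 this is x_loc_base + 0, which LLVM removes.
int64_t XLocUnconditional(int64_t x_loc_base, int i) {
  return x_loc_base + i;
}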
--- .../compiler/xla/service/gpu/ir_emitter_unnested.cc | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 177ce7be558..5d46d940bf1 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2015,13 +2015,9 @@ void IrEmitterUnnested::EmitTile( for (int i = 0; i < vector_size; i++) { int old_j = j * vector_size + i; - llvm::Value* x_loc = x_loc_base; - IrArray::Index source_idx_x = source_idx_x_base; - if (i > 0) { - x_loc = b_.CreateAdd(constant(i), x_loc_base, "x_loc"); - source_idx_x = - source_idx_x_base.AddOffsetToDim(constant(i), kDimX, &b_); - } + llvm::Value* x_loc = b_.CreateAdd(constant(i), x_loc_base, "x_loc"); + IrArray::Index source_idx_x = + source_idx_x_base.AddOffsetToDim(constant(i), kDimX, &b_); auto emit_element = [&] { return emit_elem_function(source_idx_x, y_loc, x_loc, old_j); }; From 38a064a347f791b9a27a53fd29aae6b761b29ec9 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 4 Mar 2020 12:18:49 -0800 Subject: [PATCH 26/52] Simplify code. This isn't needed anymore. --- .../xla/service/gpu/ir_emitter_unnested.cc | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 5d46d940bf1..666d8388510 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2004,20 +2004,15 @@ void IrEmitterUnnested::EmitTile( auto unroll = [&](bool add_index_boundary_condition, int64 vector_size) { for (int64 j = 0; j < x_num_steps / vector_size; j++) { - // Prep some values. If we do not do this, LLVM doesn't vectorize. - llvm::Value* x_loc_base = - b_.CreateAdd(constant(j * step_x * vector_size), start_offset_x, - "x_loc_base"); - IrArray::Index source_idx_x_base = - source_idx.AddOffsetToDim(y_loc, kDimY, &b_) - .AddOffsetToDim(constant(j * step_x * vector_size), kDimX, - &b_); - for (int i = 0; i < vector_size; i++) { int old_j = j * vector_size + i; - llvm::Value* x_loc = b_.CreateAdd(constant(i), x_loc_base, "x_loc"); + llvm::Value* x_loc = + b_.CreateAdd(constant(j * step_x * vector_size + i), + start_offset_x, "x_loc"); IrArray::Index source_idx_x = - source_idx_x_base.AddOffsetToDim(constant(i), kDimX, &b_); + source_idx.AddOffsetToDim(y_loc, kDimY, &b_) + .AddOffsetToDim(constant(j * step_x * vector_size + i), + kDimX, &b_); auto emit_element = [&] { return emit_elem_function(source_idx_x, y_loc, x_loc, old_j); }; From 18ae338407eaac894e52f2eef7dbeac6b15a1a88 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 4 Mar 2020 12:41:45 -0800 Subject: [PATCH 27/52] Simplify comment. --- .../compiler/xla/service/gpu/kernel_mapping_scheme.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index 67e1c3fc546..11f257fa0ab 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -76,15 +76,14 @@ namespace gpu { class KernelMappingScheme { public: enum { DimZ = 0, DimY, DimX, DimTot }; - // LinearIndexing mean each thread reads consecutive elements. 
- // StridedIndexingX mean each thread reads strided elements. - // This conserve memory coalescing. - // LinearStridedIndexingX mean each thread read a few consecutive - // elements then take a bigger step. The goal is to trigger - // vectorized reads and keep memory coalescing. enum IndexingOrder { + // Threads read consecutive elements. LinearIndexingX, + // Threads read strided elements while keeping memory coalescing. StridedIndexingX, + // Threads read a few consecutive elements then take a strided + // step. This can trigger vectorized reads and keep memory + // coalescing. LinearStridedIndexingX }; From 7309beda2532384ba5a7bc8c34c10642ff707fcd Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 5 Mar 2020 06:32:54 -0800 Subject: [PATCH 28/52] NFC: better code/comments following code review. --- .../xla/service/gpu/ir_emitter_unnested.cc | 69 +++++++++---------- .../xla/service/gpu/kernel_mapping_scheme.h | 19 ++--- .../gpu/tests/reduction_vectorization_test.cc | 4 +- 3 files changed, 45 insertions(+), 47 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 666d8388510..74fac7c47a0 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1870,8 +1870,6 @@ bool MayPreventVectorization(const HloInstruction& hlo) { return absl::c_any_of(hlo.fused_instructions_computation()->instructions(), [&](const HloInstruction* instr) { switch (instr->opcode()) { - case HloOpcode::kReduce: - return false; case HloOpcode::kReduceWindow: case HloOpcode::kSort: case HloOpcode::kDot: @@ -1880,6 +1878,8 @@ bool MayPreventVectorization(const HloInstruction& hlo) { case HloOpcode::kPower: case HloOpcode::kAtan2: return true; + case HloOpcode::kReduce: + // TODO: check the to_apply() attribute. default: return false; } @@ -1899,6 +1899,7 @@ bool MayPreventVectorization(const HloInstruction& hlo) { return false; } } else if (hlo.opcode() == HloOpcode::kReduce) { + // TODO: check the to_apply() attribute. return false; } return true; @@ -1928,17 +1929,17 @@ static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, llvm::Value* thread_id_x, llvm::Type* index_ty, llvm::IRBuilder<>* b) { + auto constant = [&](int64 val) { + return llvm::ConstantInt::get(index_ty, val); + }; if (mapping_scheme.GetIndexingOrder() == kStridedIndexingX) { return thread_id_x; } else if (mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { - int vector_size = mapping_scheme.GetVectorSize(); - return b->CreateMul(thread_id_x, - llvm::ConstantInt::get(index_ty, vector_size)); + return b->CreateMul(thread_id_x, constant(mapping_scheme.GetVectorSize())); } int64 x_num_steps = mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX(); - return b->CreateMul(thread_id_x, - llvm::ConstantInt::get(index_ty, x_num_steps)); + return b->CreateMul(thread_id_x, constant(x_num_steps)); } void IrEmitterUnnested::EmitTile( @@ -1965,6 +1966,7 @@ void IrEmitterUnnested::EmitTile( // number of steps which can be made. int64 step_x = mapping_scheme.GetIndexingOrder() == kLinearIndexingX ? 1 : num_threads_x; + int64 vector_size = mapping_scheme.GetVectorSize(); IrArray::Index source_idx = tile_origin_index.AddOffsetToDim(start_offset_x, kDimX, &b_); @@ -1991,7 +1993,6 @@ void IrEmitterUnnested::EmitTile( // // TODO(cheshire): Once ptxas is fixed and TF switches to it, remove the // workaround. 
- int vector_size = mapping_scheme.GetVectorSize(); ksl->For( loop_name + "_y_in_tile", /*start=*/constant(0), @@ -2001,11 +2002,10 @@ void IrEmitterUnnested::EmitTile( /*step=*/constant(1), [&](llvm::Value* y_indvar) { llvm::Value* y_loc = b_.CreateAdd( thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); - auto unroll = [&](bool add_index_boundary_condition, - int64 vector_size) { - for (int64 j = 0; j < x_num_steps / vector_size; j++) { + auto unroll = [&](bool add_index_boundary_condition) { + for (int j = 0; j < x_num_steps / vector_size; j++) { for (int i = 0; i < vector_size; i++) { - int old_j = j * vector_size + i; + int linear_index = j * vector_size + i; llvm::Value* x_loc = b_.CreateAdd(constant(j * step_x * vector_size + i), start_offset_x, "x_loc"); @@ -2014,7 +2014,8 @@ void IrEmitterUnnested::EmitTile( .AddOffsetToDim(constant(j * step_x * vector_size + i), kDimX, &b_); auto emit_element = [&] { - return emit_elem_function(source_idx_x, y_loc, x_loc, old_j); + return emit_elem_function(source_idx_x, y_loc, x_loc, + linear_index); }; if (add_index_boundary_condition) { ksl->If(loop_name + "_x_in_tile", @@ -2039,14 +2040,10 @@ void IrEmitterUnnested::EmitTile( // it will be the exact number of elements left. b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), tile_width), - [&] { - unroll(/*add_index_boundary_condition=*/false, vector_size); - }, - [&] { - unroll(/*add_index_boundary_condition=*/true, vector_size); - }); + [&] { unroll(/*add_index_boundary_condition=*/false); }, + [&] { unroll(/*add_index_boundary_condition=*/true); }); } else { - unroll(/*add_index_boundary_condition=*/!x_tile_fits, vector_size); + unroll(/*add_index_boundary_condition=*/!x_tile_fits); } }); } @@ -3163,21 +3160,21 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( !IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape, reduction_dimensions.dimensions[2]); - KernelMappingScheme::IndexingOrder indexing_order; - if (reduction_dimensions.is_row_reduction && - // Only try to vectorize+coales memory access for row of even size. - // For odd row size, every other row isn't aligned, so can't be - // vectorized. - reduction_dimensions.dimensions[2] % 2 == 0) { - indexing_order = kLinearStridedIndexingX; - } else if (IsUnrollingColumnReductionBeneficial( - unnested_hlo, input_shape, - reduction_dimensions.dimensions[2])) { - indexing_order = kLinearIndexingX; - } else { - indexing_order = kStridedIndexingX; - } - + KernelMappingScheme::IndexingOrder indexing_order = [&]() { + if (reduction_dimensions.is_row_reduction && + // Only try to vectorize+coales memory access for row of even size. + // For odd row size, every other row isn't aligned, so can't be + // vectorized. + reduction_dimensions.dimensions[2] % 2 == 0) { + return kLinearStridedIndexingX; + } else if (IsUnrollingColumnReductionBeneficial( + unnested_hlo, input_shape, + reduction_dimensions.dimensions[2])) { + return kLinearIndexingX; + } else { + return kStridedIndexingX; + } + }(); if (indexing_order == kLinearIndexingX && !reduction_dimensions.is_row_reduction) { // Vectorized loads: a single thread reduces two adjacent columns. 
@@ -3201,7 +3198,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( int vector_size = 1; if (indexing_order == kLinearStridedIndexingX) { if (reduction_dimensions.dimensions[2] % 2 == 0 && - // As XLA unroll and suppose LLVM will vectorize, + // Assuming XLA will perform the unrolling and LLVM will vectorize, // disable the unroll for case that LLVM doesn't vectorize. !MayPreventVectorization(*unnested_hlo)) { vector_size = 2; diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index 11f257fa0ab..02dd2f3885e 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -77,13 +77,13 @@ class KernelMappingScheme { public: enum { DimZ = 0, DimY, DimX, DimTot }; enum IndexingOrder { - // Threads read consecutive elements. + // Thread reads consecutive elements. LinearIndexingX, - // Threads read strided elements while keeping memory coalescing. + // Thread reads strided elements while keeping memory coalescing. StridedIndexingX, - // Threads read a few consecutive elements then take a strided - // step. This can trigger vectorized reads and keep memory - // coalescing. + // Thread reads a few consecutive elements then take a strided + // step. This can trigger vectorized reads and keep memory + // coalescing. LinearStridedIndexingX }; @@ -152,11 +152,12 @@ class KernelMappingScheme { // elements in the X dimension of a tile, each threads process // n=tile_size_x/num_threads_x elements. // indexing_order_ define which tile's elements each thread reads. + const IndexingOrder indexing_order_; - // vector_size_ only supported for row reduction. - // Must be a divisor of tile_sizes_[2]/num_threads_x. - // Interesting values are 2 and 4 to trigger vectorized loads on GPUs - // while keeping memory coalescing. + // vector_size_ only supported for row reduction and must be a divisor + // of tile_sizes_[2]/num_threads_x. Interesting values are 2 and 4 + // to trigger vectorized loads on GPUs while keeping memory + // coalescing. const int vector_size_; }; diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc index cd3fac63fb8..589bff1c160 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -106,7 +106,7 @@ CHECK: ld.global.nc.v2.f32 EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); } -TEST_F(ReductionVectorizationTest, EvenColums) { +TEST_F(ReductionVectorizationTest, EvenColumns) { const char* hlo_text = R"( HloModule ReducePower2 @@ -133,7 +133,7 @@ CHECK-NOT: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 CHECK-NOT: ld.global.nc.v2.f32 -// TODO: find how to modify LLVM/opt to get this merged? vectorize before some loop opt? +// TODO: Make this a vectorized load CHECK: ld.global.nc.f32 CHECK: ld.global.nc.f32 )"); From dc1c95f8436968f6527d128b2f858877475b0871 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 5 Mar 2020 11:15:41 -0800 Subject: [PATCH 29/52] Comment and reuse useless case. 
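The even-row-size condition in the comment is ultimately an alignment requirement. As a plain C++ analogue (illustrative only; the emitter produces LLVM IR, and Float2 / PartialRowSum are names invented here), each thread in the vectorized order reads pairs of consecutive floats. When the rows of a packed 2-D buffer have an even length, every row starts on an 8-byte boundary, so each pair can become the two-element load the reduction vectorization test checks for (ld.global.nc.v2.f32); with an odd row length every other row starts misaligned and the pairs cannot be combined.

#include <cstdint>

struct Float2 { float x, y; };  // stands in for one vectorized load

// Thread t of num_threads_x accumulates its share of one row, reading two
// consecutive floats and then striding by 2 * num_threads_x, which matches
// kLinearStridedIndexingX with vector_size = 2.
float PartialRowSum(const float* row, int64_t row_size, int t,
                    int num_threads_x) {
  float acc = 0.f;
  for (int64_t j = 2 * t; j + 1 < row_size; j += 2 * num_threads_x) {
    Float2 v{row[j], row[j + 1]};  // 8-byte aligned pair when rows are even
    acc += v.x + v.y;
  }
  return acc;  // the emitted kernel then combines partials across threads
}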
--- .../compiler/xla/service/gpu/ir_emitter_unnested.cc | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 74fac7c47a0..eceed11714f 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1878,8 +1878,6 @@ bool MayPreventVectorization(const HloInstruction& hlo) { case HloOpcode::kPower: case HloOpcode::kAtan2: return true; - case HloOpcode::kReduce: - // TODO: check the to_apply() attribute. default: return false; } @@ -2027,13 +2025,12 @@ void IrEmitterUnnested::EmitTile( } }; + // Only try this path when we try to vectorize the loads. + // Special case when the tile doesn't fit completly for even row size. + // For odd row size every other row isn't aligned, so can't be + // vectorized this way by LLVM. if (!x_tile_fits && mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { - // Only try this path when we try to vectorize the loads. - - // Special case when the tile doesn't fit completly for even row size. - // For odd row size every other row isn't aligned, so can't be - // vectorized. ksl->If(loop_name + "_is_full_tile", // if (block fully fit) {fast path} else {slow path} // tile_width is always exact. For the last block, From ab41708e12cb7cd1239766b90b31e51381ce1ef4 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 5 Mar 2020 12:23:07 -0800 Subject: [PATCH 30/52] Move a lambda to its own function. --- .../xla/service/gpu/ir_emitter_unnested.cc | 115 ++++++++++-------- 1 file changed, 64 insertions(+), 51 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index eceed11714f..cddd3aca6cc 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1940,6 +1940,38 @@ static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, return b->CreateMul(thread_id_x, constant(x_num_steps)); } +auto Unroll(bool add_index_boundary_condition, int64 x_num_steps, int64 step_x, + int64 vector_size, const string& loop_name, + KernelSupportLibrary* ksl, llvm::Value* start_offset_x, + llvm::Value* y_loc, llvm::Value* tile_width, + IrArray::Index& source_idx, llvm::IRBuilder<>& b_, + const IrEmitterUnnested::EmitElementFunction* emit_elem_function) { + llvm::Type* index_ty = tile_width->getType(); + auto constant = [&](int64 val) { + return llvm::ConstantInt::get(index_ty, val); + }; + for (int j = 0; j < x_num_steps / vector_size; j++) { + for (int i = 0; i < vector_size; i++) { + int linear_index = j * vector_size + i; + llvm::Value* x_loc = b_.CreateAdd(constant(j * step_x * vector_size + i), + start_offset_x, "x_loc"); + IrArray::Index source_idx_x = + source_idx.AddOffsetToDim(y_loc, kDimY, &b_) + .AddOffsetToDim(constant(j * step_x * vector_size + i), kDimX, + &b_); + auto emit_element = [&] { + return (*emit_elem_function)(source_idx_x, y_loc, x_loc, linear_index); + }; + if (add_index_boundary_condition) { + ksl->If(loop_name + "_x_in_tile", b_.CreateICmpULT(x_loc, tile_width), + emit_element); + } else { + emit_element(); + } + } + } +} + void IrEmitterUnnested::EmitTile( const KernelMappingScheme& mapping_scheme, const IrArray::Index& tile_origin_index, const string& loop_name, @@ -1991,58 +2023,39 @@ void IrEmitterUnnested::EmitTile( // // 
TODO(cheshire): Once ptxas is fixed and TF switches to it, remove the // workaround. - ksl->For( - loop_name + "_y_in_tile", - /*start=*/constant(0), - /*end=*/ - ceil_of_ratio(b_.CreateSub(tile_height, thread_id_info.thread_id_y), - num_threads_y), - /*step=*/constant(1), [&](llvm::Value* y_indvar) { - llvm::Value* y_loc = b_.CreateAdd( - thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); - auto unroll = [&](bool add_index_boundary_condition) { - for (int j = 0; j < x_num_steps / vector_size; j++) { - for (int i = 0; i < vector_size; i++) { - int linear_index = j * vector_size + i; - llvm::Value* x_loc = - b_.CreateAdd(constant(j * step_x * vector_size + i), - start_offset_x, "x_loc"); - IrArray::Index source_idx_x = - source_idx.AddOffsetToDim(y_loc, kDimY, &b_) - .AddOffsetToDim(constant(j * step_x * vector_size + i), - kDimX, &b_); - auto emit_element = [&] { - return emit_elem_function(source_idx_x, y_loc, x_loc, - linear_index); - }; - if (add_index_boundary_condition) { - ksl->If(loop_name + "_x_in_tile", - b_.CreateICmpULT(x_loc, tile_width), emit_element); - } else { - emit_element(); - } - } - } - }; + ksl->For(loop_name + "_y_in_tile", + /*start=*/constant(0), + /*end=*/ + ceil_of_ratio(b_.CreateSub(tile_height, thread_id_info.thread_id_y), + num_threads_y), + /*step=*/constant(1), [&](llvm::Value* y_indvar) { + llvm::Value* y_loc = + b_.CreateAdd(thread_id_info.thread_id_y, + b_.CreateMul(y_indvar, num_threads_y)); + auto unroll = [&](bool add_index_boundary_condition) { + return Unroll(add_index_boundary_condition, x_num_steps, step_x, + vector_size, loop_name, ksl, start_offset_x, y_loc, + tile_width, source_idx, b_, &emit_elem_function); + }; - // Only try this path when we try to vectorize the loads. - // Special case when the tile doesn't fit completly for even row size. - // For odd row size every other row isn't aligned, so can't be - // vectorized this way by LLVM. - if (!x_tile_fits && - mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { - ksl->If(loop_name + "_is_full_tile", - // if (block fully fit) {fast path} else {slow path} - // tile_width is always exact. For the last block, - // it will be the exact number of elements left. - b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), - tile_width), - [&] { unroll(/*add_index_boundary_condition=*/false); }, - [&] { unroll(/*add_index_boundary_condition=*/true); }); - } else { - unroll(/*add_index_boundary_condition=*/!x_tile_fits); - } - }); + // Only try this path when we try to vectorize the loads. + // Special case when the tile doesn't fit completly for even row + // size. For odd row size every other row isn't aligned, so can't + // be vectorized this way by LLVM. + if (!x_tile_fits && + mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { + ksl->If(loop_name + "_is_full_tile", + // if (block fully fit) {fast path} else {slow path} + // tile_width is always exact. For the last block, + // it will be the exact number of elements left. + b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), + tile_width), + [&] { unroll(/*add_index_boundary_condition=*/false); }, + [&] { unroll(/*add_index_boundary_condition=*/true); }); + } else { + unroll(/*add_index_boundary_condition=*/!x_tile_fits); + } + }); } // Emits code to process a tensor element in a tile for the given kCopy HLO that From 5dadd1069c9e238f14b22535f2dc1a527c1d08aa Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 5 Mar 2020 12:25:26 -0800 Subject: [PATCH 31/52] Add a CHECK. 
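Pulling the pieces of this series together, the choice made in ComputeReductionCodegenInfo amounts to roughly the following. This is an illustrative paraphrase with invented names (Order, MappingChoice, ChooseRowReductionMapping), not the actual code.

#include <cstdint>

enum class Order { Linear, Strided, LinearStrided };
struct MappingChoice { Order order; int vector_size; };

// Row reductions over an even-sized minor dimension try the vectorized
// order; if the fusion contains ops that keep LLVM from vectorizing, the
// choice falls back to plain strided indexing with scalar loads.
MappingChoice ChooseRowReductionMapping(bool is_row_reduction,
                                        int64_t minor_dim,
                                        bool unrolling_columns_beneficial,
                                        bool may_prevent_vectorization) {
  Order order;
  if (is_row_reduction && minor_dim % 2 == 0) {
    order = Order::LinearStrided;
  } else if (unrolling_columns_beneficial) {
    order = Order::Linear;
  } else {
    order = Order::Strided;
  }
  int vector_size = 1;
  if (order == Order::LinearStrided) {
    if (!may_prevent_vectorization) {
      vector_size = 2;
    } else {
      order = Order::Strided;
    }
  }
  return {order, vector_size};
}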
--- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index cddd3aca6cc..06d547a0224 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1935,6 +1935,7 @@ static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, } else if (mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { return b->CreateMul(thread_id_x, constant(mapping_scheme.GetVectorSize())); } + CHECK_EQ(mapping_scheme.GetIndexingOrder(), kLinearIndexingX); int64 x_num_steps = mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX(); return b->CreateMul(thread_id_x, constant(x_num_steps)); From 291f43e0eb14a4720fccb82471cba0b7d5947db3 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 9 Mar 2020 07:28:23 -0700 Subject: [PATCH 32/52] Small change following code review. --- .../xla/service/gpu/ir_emitter_unnested.cc | 56 +++++++++++-------- .../xla/service/gpu/kernel_mapping_scheme.h | 4 +- .../gpu/tests/reduction_vectorization_test.cc | 14 +---- 3 files changed, 36 insertions(+), 38 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 06d547a0224..1fa5d54f2ae 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1897,7 +1897,8 @@ bool MayPreventVectorization(const HloInstruction& hlo) { return false; } } else if (hlo.opcode() == HloOpcode::kReduce) { - // TODO: check the to_apply() attribute. + // TODO: check if the to_apply() attribute contain instruction + // that break LLVM vectorization. return false; } return true; @@ -1941,12 +1942,22 @@ static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, return b->CreateMul(thread_id_x, constant(x_num_steps)); } -auto Unroll(bool add_index_boundary_condition, int64 x_num_steps, int64 step_x, - int64 vector_size, const string& loop_name, - KernelSupportLibrary* ksl, llvm::Value* start_offset_x, - llvm::Value* y_loc, llvm::Value* tile_width, - IrArray::Index& source_idx, llvm::IRBuilder<>& b_, - const IrEmitterUnnested::EmitElementFunction* emit_elem_function) { +// This function calls emit_elem_function() x_num_steps times. If +// vector_size==1, then each element index passed to +// emit_elem_function() will be separated by step_x. If vector_size>1, +// then it must be a multiple of x_num_steps. In that case, it +// triggers a different indexing order that is vectorizable by +// LLVM. It generates many groups of calls to emit_elem_function. Each +// group is separated by step_x elements. Inside a group, elements +// are consecutive. If check_x_tile_bounds is true, then it will check +// if the element index is in bound compared to tile_width before +// calling emit_elem_function. 
+static void Unroll( + bool check_x_tile_bounds, int64 x_num_steps, int64 step_x, + int64 vector_size, const string& loop_name, KernelSupportLibrary* ksl, + llvm::Value* start_offset_x, llvm::Value* y_loc, llvm::Value* tile_width, + IrArray::Index& source_idx, llvm::IRBuilder<>& b_, + const IrEmitterUnnested::EmitElementFunction* emit_elem_function) { llvm::Type* index_ty = tile_width->getType(); auto constant = [&](int64 val) { return llvm::ConstantInt::get(index_ty, val); @@ -1963,7 +1974,7 @@ auto Unroll(bool add_index_boundary_condition, int64 x_num_steps, int64 step_x, auto emit_element = [&] { return (*emit_elem_function)(source_idx_x, y_loc, x_loc, linear_index); }; - if (add_index_boundary_condition) { + if (check_x_tile_bounds) { ksl->If(loop_name + "_x_in_tile", b_.CreateICmpULT(x_loc, tile_width), emit_element); } else { @@ -2033,28 +2044,27 @@ void IrEmitterUnnested::EmitTile( llvm::Value* y_loc = b_.CreateAdd(thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); - auto unroll = [&](bool add_index_boundary_condition) { - return Unroll(add_index_boundary_condition, x_num_steps, step_x, + auto unroll = [&](bool check_x_tile_bounds) { + return Unroll(check_x_tile_bounds, x_num_steps, step_x, vector_size, loop_name, ksl, start_offset_x, y_loc, tile_width, source_idx, b_, &emit_elem_function); }; - // Only try this path when we try to vectorize the loads. - // Special case when the tile doesn't fit completly for even row - // size. For odd row size every other row isn't aligned, so can't - // be vectorized this way by LLVM. + // Only take this path when we unroll in a way vectorizable by + // LLVM. Special case when the tile doesn't fit completely for even + // row size. For odd row size every other row isn't aligned to the + // vectorized size, so it can't be vectorized by LLVM. if (!x_tile_fits && mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { ksl->If(loop_name + "_is_full_tile", - // if (block fully fit) {fast path} else {slow path} - // tile_width is always exact. For the last block, - // it will be the exact number of elements left. + // For the last block, tile_width will be the number of + // elements left. b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), tile_width), - [&] { unroll(/*add_index_boundary_condition=*/false); }, - [&] { unroll(/*add_index_boundary_condition=*/true); }); + [&] { unroll(/*check_x_tile_bounds=*/false); }, + [&] { unroll(/*check_x_tile_bounds=*/true); }); } else { - unroll(/*add_index_boundary_condition=*/!x_tile_fits); + unroll(/*check_x_tile_bounds=*/!x_tile_fits); } }); } @@ -3173,8 +3183,8 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( KernelMappingScheme::IndexingOrder indexing_order = [&]() { if (reduction_dimensions.is_row_reduction && - // Only try to vectorize+coales memory access for row of even size. - // For odd row size, every other row isn't aligned, so can't be + // Only try to vectorize+coales memory access for rows of even size. + // For odd row sizes, every other row isn't aligned, so it can't be // vectorized. reduction_dimensions.dimensions[2] % 2 == 0) { return kLinearStridedIndexingX; @@ -3210,7 +3220,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( if (indexing_order == kLinearStridedIndexingX) { if (reduction_dimensions.dimensions[2] % 2 == 0 && // Assuming XLA will perform the unrolling and LLVM will vectorize, - // disable the unroll for case that LLVM doesn't vectorize. + // disable the unroll for the cases that LLVM doesn't vectorize. 
!MayPreventVectorization(*unnested_hlo)) { vector_size = 2; } else { diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index 02dd2f3885e..359ee3c900a 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -151,9 +151,9 @@ class KernelMappingScheme { // When num_threads_x threads process a total of tile_size_x // elements in the X dimension of a tile, each threads process // n=tile_size_x/num_threads_x elements. - // indexing_order_ define which tile's elements each thread reads. - + // indexing_order defines which tile's elements each thread reads. const IndexingOrder indexing_order_; + // vector_size_ only supported for row reduction and must be a divisor // of tile_sizes_[2]/num_threads_x. Interesting values are 2 and 4 // to trigger vectorized loads on GPUs while keeping memory diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc index 589bff1c160..c6cd1b79672 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -32,19 +32,7 @@ namespace gpu { namespace { -class ReductionVectorizationTest : public GpuCodegenTest { - protected: - void EnsureDeterminism(absl::string_view hlo_text) { - std::vector profiles; - profiles.emplace_back(); - profiles.emplace_back(); - EXPECT_TRUE(RunMultipleTimes(hlo_text, - /*run_hlo_passes=*/true, - /*profiles=*/&profiles, - /*backend_config=*/"", - /*assert_determinism=*/true)); - } -}; +class ReductionVectorizationTest : public GpuCodegenTest {}; TEST_F(ReductionVectorizationTest, Power2) { const char* hlo_text = R"( From a13b9d4d58e311729de3a967d8c780a95f6691ae Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 9 Mar 2020 13:57:17 -0700 Subject: [PATCH 33/52] NFC following code review. comment typo and rename unroll to unrollInnerTileLoop. --- .../xla/service/gpu/ir_emitter_unnested.cc | 85 ++++++++++--------- 1 file changed, 43 insertions(+), 42 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 1fa5d54f2ae..14346fa3dac 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1897,7 +1897,7 @@ bool MayPreventVectorization(const HloInstruction& hlo) { return false; } } else if (hlo.opcode() == HloOpcode::kReduce) { - // TODO: check if the to_apply() attribute contain instruction + // TODO: check if the to_apply() attribute contains instruction // that break LLVM vectorization. return false; } @@ -1942,17 +1942,17 @@ static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, return b->CreateMul(thread_id_x, constant(x_num_steps)); } -// This function calls emit_elem_function() x_num_steps times. If -// vector_size==1, then each element index passed to -// emit_elem_function() will be separated by step_x. If vector_size>1, -// then it must be a multiple of x_num_steps. In that case, it +// Calls `emit_elem_function()` `x_num_steps` times. If +// `vector_size`==1, then each element index passed to +// `emit_elem_function()` will be separated by `step_x`. If `vector_size`>1, +// then it must be a multiple of `x_num_steps`. 
In that case, it // triggers a different indexing order that is vectorizable by -// LLVM. It generates many groups of calls to emit_elem_function. Each -// group is separated by step_x elements. Inside a group, elements -// are consecutive. If check_x_tile_bounds is true, then it will check -// if the element index is in bound compared to tile_width before -// calling emit_elem_function. -static void Unroll( +// LLVM. It generates many groups of calls to `emit_elem_function`. Each +// group is separated by `step_x` elements. Inside a group, elements +// are consecutive. If `check_x_tile_bounds` is true, then it will check +// if the element index is in bound compared to `tile_width` before +// calling `emit_elem_function`. +static void UnrollInnerTileLoop( bool check_x_tile_bounds, int64 x_num_steps, int64 step_x, int64 vector_size, const string& loop_name, KernelSupportLibrary* ksl, llvm::Value* start_offset_x, llvm::Value* y_loc, llvm::Value* tile_width, @@ -2035,38 +2035,39 @@ void IrEmitterUnnested::EmitTile( // // TODO(cheshire): Once ptxas is fixed and TF switches to it, remove the // workaround. - ksl->For(loop_name + "_y_in_tile", - /*start=*/constant(0), - /*end=*/ - ceil_of_ratio(b_.CreateSub(tile_height, thread_id_info.thread_id_y), - num_threads_y), - /*step=*/constant(1), [&](llvm::Value* y_indvar) { - llvm::Value* y_loc = - b_.CreateAdd(thread_id_info.thread_id_y, - b_.CreateMul(y_indvar, num_threads_y)); - auto unroll = [&](bool check_x_tile_bounds) { - return Unroll(check_x_tile_bounds, x_num_steps, step_x, - vector_size, loop_name, ksl, start_offset_x, y_loc, - tile_width, source_idx, b_, &emit_elem_function); - }; + ksl->For( + loop_name + "_y_in_tile", + /*start=*/constant(0), + /*end=*/ + ceil_of_ratio(b_.CreateSub(tile_height, thread_id_info.thread_id_y), + num_threads_y), + /*step=*/constant(1), [&](llvm::Value* y_indvar) { + llvm::Value* y_loc = b_.CreateAdd( + thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); + auto unrollInnerTileLoop = [&](bool check_x_tile_bounds) { + return UnrollInnerTileLoop(check_x_tile_bounds, x_num_steps, step_x, + vector_size, loop_name, ksl, + start_offset_x, y_loc, tile_width, + source_idx, b_, &emit_elem_function); + }; - // Only take this path when we unroll in a way vectorizable by - // LLVM. Special case when the tile doesn't fit completely for even - // row size. For odd row size every other row isn't aligned to the - // vectorized size, so it can't be vectorized by LLVM. - if (!x_tile_fits && - mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { - ksl->If(loop_name + "_is_full_tile", - // For the last block, tile_width will be the number of - // elements left. - b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), - tile_width), - [&] { unroll(/*check_x_tile_bounds=*/false); }, - [&] { unroll(/*check_x_tile_bounds=*/true); }); - } else { - unroll(/*check_x_tile_bounds=*/!x_tile_fits); - } - }); + // Only take this path when we unroll in a way vectorizable by + // LLVM. Special case when the tile doesn't fit completely for even + // row size. For odd row size every other row isn't aligned to the + // vectorized size, so it can't be vectorized by LLVM. + if (!x_tile_fits && + mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { + ksl->If(loop_name + "_is_full_tile", + // For the last block, tile_width will be the number of + // elements left. 
+ b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), + tile_width), + [&] { unrollInnerTileLoop(/*check_x_tile_bounds=*/false); }, + [&] { unrollInnerTileLoop(/*check_x_tile_bounds=*/true); }); + } else { + unrollInnerTileLoop(/*check_x_tile_bounds=*/!x_tile_fits); + } + }); } // Emits code to process a tensor element in a tile for the given kCopy HLO that From 3dadac1d4a43dd58062e7ee345372b138211da86 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Sun, 15 Mar 2020 17:39:00 -0700 Subject: [PATCH 34/52] Enable more vectorization by LLVMby generating simpler indexing. LLVM doesn't simplify correctly the complicated indexing generated. This souldn't affect the speed as ptxas was vectorizing the missing vectorization. --- .../compiler/xla/service/gpu/ir_emitter_unnested.cc | 10 ++++++++++ .../service/gpu/tests/reduction_vectorization_test.cc | 7 ++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 14346fa3dac..b02356e8cc0 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2099,6 +2099,16 @@ static IrArray::Index GetUnnormalizedIndex( const Shape& unnormalized_shape, llvm::IRBuilder<>* b_, const KernelMappingScheme& kernel_mapping_scheme) { DCHECK_EQ(normalized_shape_index.size(), 3); + // If the normalization only add a new dimensions of size 1, + // generate simpler indexing. LLVM doesn't always simplify the more + // complicated indexing and this prevent him from vectorizing some + // cases. + if (unnormalized_shape.rank() == 2) { + DCHECK_EQ(normalized_shape_index.dims()[0], 0); + auto multidim = normalized_shape_index.multidim(); + return IrArray::Index({multidim[1], multidim[2]}, unnormalized_shape, + normalized_shape_index.GetType()); + } llvm::Value* linear = normalized_shape_index.Linearize( kernel_mapping_scheme.GetDimsInElems(), b_); return IrArray::Index(linear, unnormalized_shape, b_); diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc index c6cd1b79672..10ad4ef3fda 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -83,9 +83,7 @@ ENTRY %main { ParseAndReturnVerifiedModule(hlo_text)); CompileAndOptionallyVerifyPtx(std::move(optimized_module), R"( -// TODO: Make this a vectorized load -CHECK: ld.global.nc.f32 -CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 @@ -116,8 +114,7 @@ ENTRY %main { R"( CHECK: ld.global.nc.f32 CHECK: ld.global.nc.f32 -// TODO: Make this a vectorized load -CHECK-NOT: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 CHECK-NOT: ld.global.nc.v2.f32 From 5704a9a295711ee46b66ad2e33c690c325982dd0 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Sun, 15 Mar 2020 17:53:05 -0700 Subject: [PATCH 35/52] Vectorize reduction based on arch and dtypes. 
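The gate added below only enables the vectorized strided indexing when the architecture and element type are expected to benefit: on compute capability 6.x (P100) only for dtypes of at most 16 bits, on 7.x and newer for any dtype, and always requiring an even-sized row. A standalone sketch of that predicate (the helper is hypothetical; the names and thresholds are copied from the patch, but the function itself is not XLA code):

#include <cstdint>
#include <cstdio>

// Sketch of the decision introduced here: use vectorized+coalesced row
// reduction only where it is known to help.
bool UseVectorizedRowReduction(bool is_row_reduction, int64_t row_size,
                               int cc_major, int smallest_input_dtype_bits) {
  return is_row_reduction &&
         row_size % 2 == 0 &&  // odd rows leave every other row misaligned
         ((cc_major == 6 && smallest_input_dtype_bits <= 16) ||  // P100: small dtypes only
          cc_major >= 7);                                        // V100 and newer: any dtype
}

int main() {
  std::printf("%d\n", UseVectorizedRowReduction(true, 1024, 7, 32));  // 1
  std::printf("%d\n", UseVectorizedRowReduction(true, 1024, 6, 32));  // 0
  std::printf("%d\n", UseVectorizedRowReduction(true, 1023, 7, 32));  // 0
  return 0;
}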
--- .../compiler/xla/service/gpu/ir_emitter_unnested.cc | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index b02356e8cc0..240fa76e199 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -3192,12 +3192,20 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( !IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape, reduction_dimensions.dimensions[2]); + int cc_major = 0, cc_minor = 0; + ir_emitter_context_->device_description().cuda_compute_capability(&cc_major, + &cc_minor); + KernelMappingScheme::IndexingOrder indexing_order = [&]() { if (reduction_dimensions.is_row_reduction && // Only try to vectorize+coales memory access for rows of even size. // For odd row sizes, every other row isn't aligned, so it can't be // vectorized. - reduction_dimensions.dimensions[2] % 2 == 0) { + reduction_dimensions.dimensions[2] % 2 == 0 && + // Vectorization on P100 speed up only float16 and smaller + // dtype. Vectorization on P100 speed up or do not hurt all + // dtypes. + ((cc_major == 6 && smallest_input_dtype_bits <= 16) || cc_major >= 7)) { return kLinearStridedIndexingX; } else if (IsUnrollingColumnReductionBeneficial( unnested_hlo, input_shape, From 2dabd537f22e3a5f8780fd89b24655e32cfdf1a8 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 16 Mar 2020 07:37:31 -0700 Subject: [PATCH 36/52] Only trigger reduction columns indexing for columns reductions. --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 240fa76e199..690581ef8dd 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -3207,7 +3207,8 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( // dtypes. ((cc_major == 6 && smallest_input_dtype_bits <= 16) || cc_major >= 7)) { return kLinearStridedIndexingX; - } else if (IsUnrollingColumnReductionBeneficial( + } else if (!reduction_dimensions.is_row_reduction && + IsUnrollingColumnReductionBeneficial( unnested_hlo, input_shape, reduction_dimensions.dimensions[2])) { return kLinearIndexingX; From 2231173d4684ad0983ab1672502c4587522ebd61 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 16 Mar 2020 07:37:53 -0700 Subject: [PATCH 37/52] For row reduction do not try to vectorize on P100 when the tile doesn't fit perfectly and for dtype bigger then 32 bits. --- .../xla/service/gpu/ir_emitter_unnested.cc | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 690581ef8dd..d21702672d6 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -3192,20 +3192,34 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( !IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape, reduction_dimensions.dimensions[2]); + int64 num_threads_y = reduction_dimensions.is_row_reduction ? 
1 : kWarpSize; + int64 num_threads_x = [&] { + if (reduction_dimensions.is_row_reduction) { + return std::min( + kWarpSize * kWarpSize, + RoundUpToNearest(CeilOfRatio(reduction_dimensions.dimensions[2], + reduction_tiling[2]), + kWarpSize)); + } + return kWarpSize; + }(); + + int tile_size_x = reduction_tiling[2] * num_threads_x; + bool tile_fit = reduction_dimensions.dimensions[kDimX] % tile_size_x == 0; + int cc_major = 0, cc_minor = 0; ir_emitter_context_->device_description().cuda_compute_capability(&cc_major, &cc_minor); KernelMappingScheme::IndexingOrder indexing_order = [&]() { if (reduction_dimensions.is_row_reduction && - // Only try to vectorize+coales memory access for rows of even size. - // For odd row sizes, every other row isn't aligned, so it can't be - // vectorized. - reduction_dimensions.dimensions[2] % 2 == 0 && - // Vectorization on P100 speed up only float16 and smaller - // dtype. Vectorization on P100 speed up or do not hurt all - // dtypes. - ((cc_major == 6 && smallest_input_dtype_bits <= 16) || cc_major >= 7)) { + // P100, only ttry to vectorize+coales memory access when the + // tile size fit exactly and dtypes <= 32 bits + ((cc_major == 6 && smallest_input_dtype_bits <= 32 && tile_fit) || + // On V100, only try to vectorize+coales memory access for + // rows of even size. For odd row sizes, every other row + // isn't aligned, so it can't be vectorized. + (cc_major >= 7 && reduction_dimensions.dimensions[2] % 2 == 0))) { return kLinearStridedIndexingX; } else if (!reduction_dimensions.is_row_reduction && IsUnrollingColumnReductionBeneficial( @@ -3222,20 +3236,6 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( reduction_tiling[2] *= 2; } - int64 num_threads_y = reduction_dimensions.is_row_reduction ? 1 : kWarpSize; - int64 num_threads_x = [&] { - if (reduction_dimensions.is_row_reduction) { - return std::min( - kWarpSize * kWarpSize, - RoundUpToNearest(CeilOfRatio(reduction_dimensions.dimensions[2], - reduction_tiling[2]), - kWarpSize)); - } - return kWarpSize; - }(); - - int tile_size_x = reduction_tiling[2] * num_threads_x; - int vector_size = 1; if (indexing_order == kLinearStridedIndexingX) { if (reduction_dimensions.dimensions[2] % 2 == 0 && From f0b32e48c06d4d8e228e786ab0ff027e19848849 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 16 Mar 2020 08:31:28 -0700 Subject: [PATCH 38/52] Update test to check the GPU arch for the expected ptx. 
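The updated tests choose their FileCheck expectations from the device's compute capability rather than assuming a single architecture. A minimal sketch of that selection pattern (illustrative only; the real tests query se::StreamExecutor::GetDeviceDescription().cuda_compute_capability and use much longer CHECK lists):

#include <cstdio>
#include <string>

// Pick the expected PTX pattern for a row reduction based on compute capability.
std::string ExpectedRowReductionPtx(int cc_major) {
  if (cc_major >= 6) {
    // Pascal and newer: the loads should come out as 2-wide vector loads.
    return "CHECK: ld.global.nc.v2.f32";
  }
  // Older GPUs: scalar loads only.
  return "CHECK-NOT: ld.global.nc.v2.f32\nCHECK: ld.global.nc.f32";
}

int main() {
  std::printf("%s\n", ExpectedRowReductionPtx(7).c_str());
  std::printf("%s\n", ExpectedRowReductionPtx(3).c_str());
  return 0;
}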
--- .../gpu/tests/reduction_vectorization_test.cc | 104 ++++++++++++++++-- 1 file changed, 92 insertions(+), 12 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc index 10ad4ef3fda..5f27df08b90 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -52,13 +52,32 @@ ENTRY %main { )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, ParseAndReturnVerifiedModule(hlo_text)); - CompileAndOptionallyVerifyPtx(std::move(optimized_module), - R"( + se::StreamExecutor* executor = backend().default_stream_executor(); + int cc_major = 0, cc_minor = 0; + executor->GetDeviceDescription().cuda_compute_capability(&cc_major, + &cc_minor); + string expected_ptx; + if (cc_major >= 6) { + expected_ptx = R"( CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 -)"); +)"; + } else { + expected_ptx = R"( +CHECK-NOT: ld.global.nc.v2.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +)"; + } + CompileAndOptionallyVerifyPtx(std::move(optimized_module), expected_ptx); EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); } @@ -81,13 +100,32 @@ ENTRY %main { )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, ParseAndReturnVerifiedModule(hlo_text)); - CompileAndOptionallyVerifyPtx(std::move(optimized_module), - R"( + se::StreamExecutor* executor = backend().default_stream_executor(); + int cc_major = 0, cc_minor = 0; + executor->GetDeviceDescription().cuda_compute_capability(&cc_major, + &cc_minor); + string expected_ptx; + if (cc_major >= 6) { + expected_ptx = R"( CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 -)"); +)"; + } else { + expected_ptx = R"( +CHECK-NOT: ld.global.nc.v2.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +)"; + } + CompileAndOptionallyVerifyPtx(std::move(optimized_module), expected_ptx); EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); } @@ -110,8 +148,13 @@ ENTRY %main { )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, ParseAndReturnVerifiedModule(hlo_text)); - CompileAndOptionallyVerifyPtx(std::move(optimized_module), - R"( + se::StreamExecutor* executor = backend().default_stream_executor(); + int cc_major = 0, cc_minor = 0; + executor->GetDeviceDescription().cuda_compute_capability(&cc_major, + &cc_minor); + string expected_ptx; + if (cc_major >= 7) { + expected_ptx = R"( CHECK: ld.global.nc.f32 CHECK: ld.global.nc.f32 CHECK: ld.global.nc.v2.f32 @@ -121,7 +164,22 @@ CHECK-NOT: ld.global.nc.v2.f32 // TODO: Make this a vectorized load CHECK: ld.global.nc.f32 CHECK: ld.global.nc.f32 -)"); +)"; + } else { + expected_ptx = R"( +CHECK-NOT: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +)"; + } + CompileAndOptionallyVerifyPtx(std::move(optimized_module), expected_ptx); 
EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); } @@ -175,10 +233,32 @@ ENTRY %main { TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, ParseAndReturnVerifiedModule(hlo_text)); - CompileAndOptionallyVerifyPtx(std::move(optimized_module), - R"( + se::StreamExecutor* executor = backend().default_stream_executor(); + int cc_major = 0, cc_minor = 0; + executor->GetDeviceDescription().cuda_compute_capability(&cc_major, + &cc_minor); + string expected_ptx; + if (cc_major >= 6) { + expected_ptx = R"( CHECK: ld.global.nc.v2.f32 -)"); +CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +)"; + } else { + expected_ptx = R"( +CHECK-NOT: ld.global.nc.v2.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +)"; + } + CompileAndOptionallyVerifyPtx(std::move(optimized_module), expected_ptx); EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); } From a54445d7001c370b668bddf1083b481524b53c6d Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 16 Mar 2020 08:31:40 -0700 Subject: [PATCH 39/52] clang-format --- .../xla/service/gpu/ir_emitter_unnested.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index d21702672d6..4131ff0eecc 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -3213,16 +3213,16 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( KernelMappingScheme::IndexingOrder indexing_order = [&]() { if (reduction_dimensions.is_row_reduction && - // P100, only ttry to vectorize+coales memory access when the - // tile size fit exactly and dtypes <= 32 bits - ((cc_major == 6 && smallest_input_dtype_bits <= 32 && tile_fit) || - // On V100, only try to vectorize+coales memory access for - // rows of even size. For odd row sizes, every other row - // isn't aligned, so it can't be vectorized. - (cc_major >= 7 && reduction_dimensions.dimensions[2] % 2 == 0))) { + // P100, only ttry to vectorize+coales memory access when the + // tile size fit exactly and dtypes <= 32 bits + ((cc_major == 6 && smallest_input_dtype_bits <= 32 && tile_fit) || + // On V100, only try to vectorize+coales memory access for + // rows of even size. For odd row sizes, every other row + // isn't aligned, so it can't be vectorized. + (cc_major >= 7 && reduction_dimensions.dimensions[2] % 2 == 0))) { return kLinearStridedIndexingX; } else if (!reduction_dimensions.is_row_reduction && - IsUnrollingColumnReductionBeneficial( + IsUnrollingColumnReductionBeneficial( unnested_hlo, input_shape, reduction_dimensions.dimensions[2])) { return kLinearIndexingX; From 229326ce9c379a7a7d9c08d5975becc069103121 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 16 Mar 2020 10:26:19 -0700 Subject: [PATCH 40/52] Crash fix. We where not returning the right reduction tile size. 
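The fix keeps three quantities consistent: the per-thread tiling in X, the tile size handed to the mapping scheme, and the number of partial results a thread accumulates when a column reduction is unrolled. A standalone sketch of the invariant being restored (the concrete numbers are illustrative, not the real tiling heuristics):

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t num_threads_x = 32;  // kWarpSize
  int64_t reduction_tiling_x = 1;    // per-thread tiling in X before unrolling (illustrative)
  int num_partial_results = 1;

  const bool unroll_column_reduction = true;  // the kLinearIndexingX path
  if (unroll_column_reduction) {
    num_partial_results = 2;                  // one thread reduces two adjacent columns
    reduction_tiling_x *= num_partial_results;
  }

  // The tile size given to the mapping scheme must reflect the doubled tiling,
  // otherwise the partial-result check in the emitter fails.
  const int64_t tile_size_x = reduction_tiling_x * num_threads_x;
  assert(num_partial_results == tile_size_x / num_threads_x);
  std::printf("tile_size_x=%lld num_partial_results=%d\n",
              (long long)tile_size_x, num_partial_results);
  return 0;
}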
--- .../xla/service/gpu/ir_emitter_unnested.cc | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 4131ff0eecc..174db1e19b9 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2161,7 +2161,8 @@ static int GetNumberOfPartialResults( return 1; } int64 num_partial_results = - mapping_scheme.GetIndexingOrder() == kStridedIndexingX ? 1 : 2; + mapping_scheme.GetIndexingOrder() == kLinearIndexingX ? 2 : 1; + CHECK_EQ(num_partial_results, (mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX())); return num_partial_results; @@ -3204,8 +3205,9 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( return kWarpSize; }(); - int tile_size_x = reduction_tiling[2] * num_threads_x; - bool tile_fit = reduction_dimensions.dimensions[kDimX] % tile_size_x == 0; + bool tile_fit = reduction_dimensions.dimensions[kDimX] % + (reduction_tiling[2] * num_threads_x) == + 0; int cc_major = 0, cc_minor = 0; ir_emitter_context_->device_description().cuda_compute_capability(&cc_major, @@ -3225,16 +3227,12 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( IsUnrollingColumnReductionBeneficial( unnested_hlo, input_shape, reduction_dimensions.dimensions[2])) { + reduction_tiling[2] *= 2; return kLinearIndexingX; } else { return kStridedIndexingX; } }(); - if (indexing_order == kLinearIndexingX && - !reduction_dimensions.is_row_reduction) { - // Vectorized loads: a single thread reduces two adjacent columns. - reduction_tiling[2] *= 2; - } int vector_size = 1; if (indexing_order == kLinearStridedIndexingX) { @@ -3249,7 +3247,8 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( } KernelMappingScheme mapping_scheme( reduction_dimensions.dimensions, - {reduction_tiling[0], reduction_tiling[1] * num_threads_y, tile_size_x}, + {reduction_tiling[0], reduction_tiling[1] * num_threads_y, + reduction_tiling[2] * num_threads_x}, num_threads_y, num_threads_x, indexing_order, vector_size); return ReductionCodegenInfo(mapping_scheme, reduction_dimensions.is_row_reduction); From 46cf90800e9a0be1fd4647d5e4fa9de29d3e1c42 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 16 Mar 2020 10:36:50 -0700 Subject: [PATCH 41/52] Small refactoring. Instead of recomputing a value, store it. This is safer as all the numbers are computed and validated once. --- .../xla/service/gpu/ir_emitter_unnested.cc | 29 +++++-------------- .../xla/service/gpu/kernel_mapping_scheme.h | 10 ++++++- 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 174db1e19b9..ec749a48bcb 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2151,23 +2151,6 @@ void IrEmitterUnnested::EmitTileElementForFusion( } } -// Gets the number of partial results accumulated by a single thread performing -// reduction. -static int GetNumberOfPartialResults( - const ReductionCodegenInfo& reduction_info) { - const KernelMappingScheme& mapping_scheme = - reduction_info.GetKernelMappingScheme(); - if (reduction_info.IsRowReduction()) { - return 1; - } - int64 num_partial_results = - mapping_scheme.GetIndexingOrder() == kLinearIndexingX ? 
2 : 1; - - CHECK_EQ(num_partial_results, - (mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX())); - return num_partial_results; -} - void IrEmitterUnnested::EmitPrologueForReduction( HloInstruction* unnested_hlo, ReductionCodegenInfo* reduction_info, absl::Span reduce_instructions, @@ -2194,7 +2177,7 @@ void IrEmitterUnnested::EmitPrologueForReduction( llvm::AllocaInst* reduction_input_address = Alloca(element_type); reduction_input_addresses->push_back(reduction_input_address); - int num_partial_results = GetNumberOfPartialResults(*reduction_info); + int num_partial_results = reduction_info->GetNumPartialResults(); AddressVector* partial_result_addresses = reduction_info->GetMutablePartialResultAddresses(); llvm::AllocaInst* partial_result_address = @@ -2346,7 +2329,7 @@ void IrEmitterUnnested::EmitEpilogueForReduction( absl::Span partial_result_addresses = reduction_info.GetPartialResultAddresses(); - int num_partial_results = GetNumberOfPartialResults(reduction_info); + int num_partial_results = reduction_info.GetNumPartialResults(); // Emit an atomic operation that accumulates the partial reduction to the // output element. For row reduction, this is only for lane 0 due to the @@ -2560,7 +2543,7 @@ void IrEmitterUnnested::EmitTileElementForReduction( // GetElementPointer with array types. This enables the vectorization of // the computation for different partial results. Use this index if // 'num_partial_results > 1'. - int num_partial_results = GetNumberOfPartialResults(reduction_info); + int num_partial_results = reduction_info.GetNumPartialResults(); auto index_without_linear = IrArray::Index( input_index.multidim(), reduction_operand_shape, input_index.GetType()); @@ -3213,6 +3196,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( ir_emitter_context_->device_description().cuda_compute_capability(&cc_major, &cc_minor); + int num_partial_results = 1; KernelMappingScheme::IndexingOrder indexing_order = [&]() { if (reduction_dimensions.is_row_reduction && // P100, only ttry to vectorize+coales memory access when the @@ -3227,7 +3211,8 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( IsUnrollingColumnReductionBeneficial( unnested_hlo, input_shape, reduction_dimensions.dimensions[2])) { - reduction_tiling[2] *= 2; + num_partial_results = 2; + reduction_tiling[2] *= num_partial_results; return kLinearIndexingX; } else { return kStridedIndexingX; @@ -3250,7 +3235,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( {reduction_tiling[0], reduction_tiling[1] * num_threads_y, reduction_tiling[2] * num_threads_x}, num_threads_y, num_threads_x, indexing_order, vector_size); - return ReductionCodegenInfo(mapping_scheme, + return ReductionCodegenInfo(mapping_scheme, num_partial_results, reduction_dimensions.is_row_reduction); } diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index 359ee3c900a..9291d7bc6a3 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -166,8 +166,14 @@ using AddressVector = absl::InlinedVector; class ReductionCodegenInfo { public: explicit ReductionCodegenInfo(KernelMappingScheme mapping_scheme, + int num_partial_results, bool is_row_reduction) - : mapping_scheme_(mapping_scheme), is_row_reduction_(is_row_reduction) {} + : mapping_scheme_(mapping_scheme), num_partial_results_(num_partial_results), 
is_row_reduction_(is_row_reduction) { + if (num_partial_results > 1) { + CHECK_EQ(num_partial_results, + (mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX())); + } + } const KernelMappingScheme& GetKernelMappingScheme() const { return mapping_scheme_; @@ -203,6 +209,7 @@ class ReductionCodegenInfo { return reduction_input_addresses_; } + int GetNumPartialResults() const { return num_partial_results_; } bool IsRowReduction() const { return is_row_reduction_; } // Gets a pointer to a mutable shared cache used by reduction. @@ -221,6 +228,7 @@ class ReductionCodegenInfo { const KernelMappingScheme mapping_scheme_; AddressVector partial_result_addresses_; AddressVector reduction_input_addresses_; + int num_partial_results_; bool is_row_reduction_; }; From 89db28e2fffeaeafce3f430de4de554e76cd378a Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Fri, 27 Mar 2020 07:24:36 -0700 Subject: [PATCH 42/52] Fix comment typo --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index ec749a48bcb..cd0e034e486 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2101,7 +2101,7 @@ static IrArray::Index GetUnnormalizedIndex( DCHECK_EQ(normalized_shape_index.size(), 3); // If the normalization only add a new dimensions of size 1, // generate simpler indexing. LLVM doesn't always simplify the more - // complicated indexing and this prevent him from vectorizing some + // complicated indexing and this prevents it from vectorizing some // cases. if (unnormalized_shape.rank() == 2) { DCHECK_EQ(normalized_shape_index.dims()[0], 0); @@ -3199,8 +3199,8 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( int num_partial_results = 1; KernelMappingScheme::IndexingOrder indexing_order = [&]() { if (reduction_dimensions.is_row_reduction && - // P100, only ttry to vectorize+coales memory access when the - // tile size fit exactly and dtypes <= 32 bits + // P100, only try to vectorize+coales memory access when the + // tile size fits exactly and dtypes <= 32 bits ((cc_major == 6 && smallest_input_dtype_bits <= 32 && tile_fit) || // On V100, only try to vectorize+coales memory access for // rows of even size. For odd row sizes, every other row From 1b7fef45a5f7339a4202444e41706c9e5fd5c80a Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 30 Mar 2020 13:01:47 -0700 Subject: [PATCH 43/52] Do a simple unnormalize only for row major memory format and exact shape. --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index cd0e034e486..512280e8a03 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2102,8 +2102,11 @@ static IrArray::Index GetUnnormalizedIndex( // If the normalization only add a new dimensions of size 1, // generate simpler indexing. LLVM doesn't always simplify the more // complicated indexing and this prevents it from vectorizing some - // cases. - if (unnormalized_shape.rank() == 2) { + // cases. We do this only for major_to_minor memory layout. 
+ if (unnormalized_shape.rank() == 2 && unnormalized_shape.has_layout() && + unnormalized_shape.dimensions()[0] == normalized_shape_index.dims()[1] && + unnormalized_shape.dimensions()[1] == normalized_shape_index.dims()[2] && + unnormalized_shape.layout().minor_to_major(1) == 0) { DCHECK_EQ(normalized_shape_index.dims()[0], 0); auto multidim = normalized_shape_index.multidim(); return IrArray::Index({multidim[1], multidim[2]}, unnormalized_shape, From abc7c0a03dd6f21e36d077a1f2caba3b0d950825 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 1 Apr 2020 09:52:21 -0700 Subject: [PATCH 44/52] Fix DCHECK and make it a CHECK. --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 512280e8a03..ffc4df06749 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2107,7 +2107,7 @@ static IrArray::Index GetUnnormalizedIndex( unnormalized_shape.dimensions()[0] == normalized_shape_index.dims()[1] && unnormalized_shape.dimensions()[1] == normalized_shape_index.dims()[2] && unnormalized_shape.layout().minor_to_major(1) == 0) { - DCHECK_EQ(normalized_shape_index.dims()[0], 0); + CHECK_EQ(normalized_shape_index.dims()[0], 1); auto multidim = normalized_shape_index.multidim(); return IrArray::Index({multidim[1], multidim[2]}, unnormalized_shape, normalized_shape_index.GetType()); From e1db51a42fcb79a63e8ab1cd9ab4660b79052f20 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 1 Apr 2020 09:54:16 -0700 Subject: [PATCH 45/52] Code clean up --- .../xla/service/gpu/ir_emitter_unnested.cc | 34 ++++++++----------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index ffc4df06749..863079d1978 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1956,26 +1956,26 @@ static void UnrollInnerTileLoop( bool check_x_tile_bounds, int64 x_num_steps, int64 step_x, int64 vector_size, const string& loop_name, KernelSupportLibrary* ksl, llvm::Value* start_offset_x, llvm::Value* y_loc, llvm::Value* tile_width, - IrArray::Index& source_idx, llvm::IRBuilder<>& b_, + const IrArray::Index& source_idx, llvm::IRBuilder<>* b_, const IrEmitterUnnested::EmitElementFunction* emit_elem_function) { llvm::Type* index_ty = tile_width->getType(); auto constant = [&](int64 val) { return llvm::ConstantInt::get(index_ty, val); }; - for (int j = 0; j < x_num_steps / vector_size; j++) { - for (int i = 0; i < vector_size; i++) { + for (int64 j = 0; j < x_num_steps / vector_size; j++) { + IrArray::Index source_idx_x_base = + source_idx.AddOffsetToDim(y_loc, kDimY, b_); + for (int64 i = 0; i < vector_size; i++) { int linear_index = j * vector_size + i; - llvm::Value* x_loc = b_.CreateAdd(constant(j * step_x * vector_size + i), - start_offset_x, "x_loc"); - IrArray::Index source_idx_x = - source_idx.AddOffsetToDim(y_loc, kDimY, &b_) - .AddOffsetToDim(constant(j * step_x * vector_size + i), kDimX, - &b_); + llvm::Value* x_loc = b_->CreateAdd(constant(j * step_x * vector_size + i), + start_offset_x, "x_loc"); + IrArray::Index source_idx_x = source_idx_x_base.AddOffsetToDim( + constant(j * step_x * vector_size + i), kDimX, b_); auto 
emit_element = [&] { return (*emit_elem_function)(source_idx_x, y_loc, x_loc, linear_index); }; if (check_x_tile_bounds) { - ksl->If(loop_name + "_x_in_tile", b_.CreateICmpULT(x_loc, tile_width), + ksl->If(loop_name + "_x_in_tile", b_->CreateICmpULT(x_loc, tile_width), emit_element); } else { emit_element(); @@ -2044,11 +2044,11 @@ void IrEmitterUnnested::EmitTile( /*step=*/constant(1), [&](llvm::Value* y_indvar) { llvm::Value* y_loc = b_.CreateAdd( thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); - auto unrollInnerTileLoop = [&](bool check_x_tile_bounds) { + auto unroll_inner_tile_loop = [&](bool check_x_tile_bounds) { return UnrollInnerTileLoop(check_x_tile_bounds, x_num_steps, step_x, vector_size, loop_name, ksl, start_offset_x, y_loc, tile_width, - source_idx, b_, &emit_elem_function); + source_idx, &b_, &emit_elem_function); }; // Only take this path when we unroll in a way vectorizable by @@ -2062,10 +2062,10 @@ void IrEmitterUnnested::EmitTile( // elements left. b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), tile_width), - [&] { unrollInnerTileLoop(/*check_x_tile_bounds=*/false); }, - [&] { unrollInnerTileLoop(/*check_x_tile_bounds=*/true); }); + [&] { unroll_inner_tile_loop(/*check_x_tile_bounds=*/false); }, + [&] { unroll_inner_tile_loop(/*check_x_tile_bounds=*/true); }); } else { - unrollInnerTileLoop(/*check_x_tile_bounds=*/!x_tile_fits); + unroll_inner_tile_loop(/*check_x_tile_bounds=*/!x_tile_fits); } }); } @@ -3174,10 +3174,6 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( std::array reduction_tiling = GetReductionTiling(reduction_dimensions, smallest_input_dtype_bits, &ir_emitter_context_->device_description()); - bool dilated_x = - reduction_dimensions.is_row_reduction || - !IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape, - reduction_dimensions.dimensions[2]); int64 num_threads_y = reduction_dimensions.is_row_reduction ? 
1 : kWarpSize; int64 num_threads_x = [&] { From cb538d1fe9771986833bffdd6688d208bbe71974 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 2 Apr 2020 06:32:03 -0700 Subject: [PATCH 46/52] NFC: rename a variable --- .../compiler/xla/service/gpu/ir_emitter_unnested.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 863079d1978..cf5f3716dbb 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1956,7 +1956,7 @@ static void UnrollInnerTileLoop( bool check_x_tile_bounds, int64 x_num_steps, int64 step_x, int64 vector_size, const string& loop_name, KernelSupportLibrary* ksl, llvm::Value* start_offset_x, llvm::Value* y_loc, llvm::Value* tile_width, - const IrArray::Index& source_idx, llvm::IRBuilder<>* b_, + const IrArray::Index& source_idx, llvm::IRBuilder<>* b, const IrEmitterUnnested::EmitElementFunction* emit_elem_function) { llvm::Type* index_ty = tile_width->getType(); auto constant = [&](int64 val) { @@ -1964,18 +1964,18 @@ static void UnrollInnerTileLoop( }; for (int64 j = 0; j < x_num_steps / vector_size; j++) { IrArray::Index source_idx_x_base = - source_idx.AddOffsetToDim(y_loc, kDimY, b_); + source_idx.AddOffsetToDim(y_loc, kDimY, b); for (int64 i = 0; i < vector_size; i++) { int linear_index = j * vector_size + i; - llvm::Value* x_loc = b_->CreateAdd(constant(j * step_x * vector_size + i), - start_offset_x, "x_loc"); + llvm::Value* x_loc = b->CreateAdd(constant(j * step_x * vector_size + i), + start_offset_x, "x_loc"); IrArray::Index source_idx_x = source_idx_x_base.AddOffsetToDim( - constant(j * step_x * vector_size + i), kDimX, b_); + constant(j * step_x * vector_size + i), kDimX, b); auto emit_element = [&] { return (*emit_elem_function)(source_idx_x, y_loc, x_loc, linear_index); }; if (check_x_tile_bounds) { - ksl->If(loop_name + "_x_in_tile", b_->CreateICmpULT(x_loc, tile_width), + ksl->If(loop_name + "_x_in_tile", b->CreateICmpULT(x_loc, tile_width), emit_element); } else { emit_element(); From ab8eb15d1f25dfb95a55f3a498ea024bd08cc158 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 2 Apr 2020 06:32:12 -0700 Subject: [PATCH 47/52] clang-format --- .../xla/service/gpu/ir_emitter_unnested.cc | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index cf5f3716dbb..7493a943329 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2057,13 +2057,14 @@ void IrEmitterUnnested::EmitTile( // vectorized size, so it can't be vectorized by LLVM. if (!x_tile_fits && mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { - ksl->If(loop_name + "_is_full_tile", - // For the last block, tile_width will be the number of - // elements left. - b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), - tile_width), - [&] { unroll_inner_tile_loop(/*check_x_tile_bounds=*/false); }, - [&] { unroll_inner_tile_loop(/*check_x_tile_bounds=*/true); }); + ksl->If( + loop_name + "_is_full_tile", + // For the last block, tile_width will be the number of + // elements left. 
+ b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), + tile_width), + [&] { unroll_inner_tile_loop(/*check_x_tile_bounds=*/false); }, + [&] { unroll_inner_tile_loop(/*check_x_tile_bounds=*/true); }); } else { unroll_inner_tile_loop(/*check_x_tile_bounds=*/!x_tile_fits); } From 8d724f869346f0bfd4cd57bbcd9b42bab9ef500e Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Fri, 3 Apr 2020 14:45:34 -0700 Subject: [PATCH 48/52] Rename a variable --- .../compiler/xla/service/gpu/ir_emitter_unnested.cc | 12 ++++++------ .../compiler/xla/service/gpu/kernel_mapping_scheme.h | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 7493a943329..73c78204f69 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -108,8 +108,8 @@ const auto kDimTot = KernelMappingScheme::DimTot; const auto kLinearIndexingX = KernelMappingScheme::LinearIndexingX; const auto kStridedIndexingX = KernelMappingScheme::StridedIndexingX; -const auto kLinearStridedIndexingX = - KernelMappingScheme::LinearStridedIndexingX; +const auto kStridedLinearIndexingX = + KernelMappingScheme::StridedLinearIndexingX; // If a dimensions is smaller than this, untiled transposition may be more // efficient. @@ -1933,7 +1933,7 @@ static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, }; if (mapping_scheme.GetIndexingOrder() == kStridedIndexingX) { return thread_id_x; - } else if (mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { + } else if (mapping_scheme.GetIndexingOrder() == kStridedLinearIndexingX) { return b->CreateMul(thread_id_x, constant(mapping_scheme.GetVectorSize())); } CHECK_EQ(mapping_scheme.GetIndexingOrder(), kLinearIndexingX); @@ -2056,7 +2056,7 @@ void IrEmitterUnnested::EmitTile( // row size. For odd row size every other row isn't aligned to the // vectorized size, so it can't be vectorized by LLVM. if (!x_tile_fits && - mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { + mapping_scheme.GetIndexingOrder() == kStridedLinearIndexingX) { ksl->If( loop_name + "_is_full_tile", // For the last block, tile_width will be the number of @@ -3206,7 +3206,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( // rows of even size. For odd row sizes, every other row // isn't aligned, so it can't be vectorized. (cc_major >= 7 && reduction_dimensions.dimensions[2] % 2 == 0))) { - return kLinearStridedIndexingX; + return kStridedLinearIndexingX; } else if (!reduction_dimensions.is_row_reduction && IsUnrollingColumnReductionBeneficial( unnested_hlo, input_shape, @@ -3220,7 +3220,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( }(); int vector_size = 1; - if (indexing_order == kLinearStridedIndexingX) { + if (indexing_order == kStridedLinearIndexingX) { if (reduction_dimensions.dimensions[2] % 2 == 0 && // Assuming XLA will perform the unrolling and LLVM will vectorize, // disable the unroll for the cases that LLVM doesn't vectorize. 
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index 9291d7bc6a3..3123063cae8 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -84,7 +84,7 @@ class KernelMappingScheme { // Thread reads a few consecutive elements then take a strided // step. This can trigger vectorized reads and keep memory // coalescing. - LinearStridedIndexingX + StridedLinearIndexingX }; KernelMappingScheme(absl::Span dims_in_elems, @@ -101,7 +101,7 @@ class KernelMappingScheme { CHECK_EQ(tile_sizes[2] % num_threads_x_, 0); VLOG(10) << "dims_in_elems_ = " << absl::StrJoin(dims_in_elems_, ","); if (indexing_order != LinearIndexingX) { - // StridedIndexingX, and LinearStridedIndexingX + // StridedIndexingX, and StridedLinearIndexingX // is for the purpose of vectorization, which requires // GetTileSizeFor(DimX) to be a multiplier of num_threads_x_. CHECK_EQ(GetTileSizeFor(DimX) % num_threads_x_, 0); From 8779db9c79385c000b22ec6649e41aaad9fd42c8 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Fri, 3 Apr 2020 14:45:52 -0700 Subject: [PATCH 49/52] Small simplification --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 4 ++-- .../xla/service/gpu/tests/reduction_vectorization_test.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 73c78204f69..d98c63e3370 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1962,9 +1962,9 @@ static void UnrollInnerTileLoop( auto constant = [&](int64 val) { return llvm::ConstantInt::get(index_ty, val); }; + IrArray::Index source_idx_x_base = + source_idx.AddOffsetToDim(y_loc, kDimY, b); for (int64 j = 0; j < x_num_steps / vector_size; j++) { - IrArray::Index source_idx_x_base = - source_idx.AddOffsetToDim(y_loc, kDimY, b); for (int64 i = 0; i < vector_size; i++) { int linear_index = j * vector_size + i; llvm::Value* x_loc = b->CreateAdd(constant(j * step_x * vector_size + i), diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc index 5f27df08b90..8ee0eed3ca3 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -184,7 +184,7 @@ CHECK: ld.global.nc.f32 EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); } -TEST_F(ReductionVectorizationTest, DisableOddColumns) { +TEST_F(ReductionVectorizationTest, DisabledOddColumns) { const char* hlo_text = R"( HloModule ReduceTileFit From dd571a0133195923ba65afccebbcd91707a9c260 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 6 Apr 2020 14:28:24 -0700 Subject: [PATCH 50/52] Fix the NCHECK failure. EmitTargetElementLoop call ComputeMaxUnrollFactor when MayPreventVectorization return false. But ComputeMaxUnrollFactor call ElementsIn that works only on DenseArray. 
Now behave as before in that case --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index d98c63e3370..725209f7c45 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1896,7 +1896,7 @@ bool MayPreventVectorization(const HloInstruction& hlo) { default: return false; } - } else if (hlo.opcode() == HloOpcode::kReduce) { + } else if (hlo.opcode() == HloOpcode::kReduce && hlo.shape().IsArray()) { // TODO: check if the to_apply() attribute contains instruction // that break LLVM vectorization. return false; From 45f3ebb53fb594bbfb06ad6ae23a21d7c88d885c Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 9 Apr 2020 14:44:57 -0700 Subject: [PATCH 51/52] Make XLA emit the Store annotation about aliasing. This allows LLVM to vectorize them and fix the speed regression reported. --- .../xla/service/gpu/ir_emitter_unnested.cc | 7 +-- .../gpu/tests/reduction_vectorization_test.cc | 61 +++++++++++++++++++ 2 files changed, 63 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 725209f7c45..32170229813 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -538,13 +538,10 @@ Status IrEmitterUnnested::EmitExtraOutputsForReduce( absl::Span> extra_output_gens) { for (int i = 0; i != extra_output_gens.size(); ++i) { - llvm::Value* extra_output_address = - GetIrArray(*unnested_hlo, *unnested_hlo, extra_output_gens[i].second) - .EmitArrayElementAddress(index, &b_, "extra_output_element_address", - use_linear_index); TF_ASSIGN_OR_RETURN(llvm::Value* const extra_output_ir_value, extra_output_gens[i].first(index)); - Store(extra_output_ir_value, extra_output_address); + GetIrArray(*unnested_hlo, *unnested_hlo, extra_output_gens[i].second) + .EmitWriteArrayElement(index, extra_output_ir_value, &b_, use_linear_index); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc index 8ee0eed3ca3..040025ee1a2 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -294,6 +294,67 @@ CHECK-NOT: ld.global.u64 EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); } +class ReductionVectorizationNoOptTest : public GpuCodegenTest { + DebugOptions GetDebugOptionsForTest() override { + DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest(); + // The test MultiOutputStore contain a MOF fusion and XLA optimizer pass doesn't like this. 
+ debug_options.set_xla_disable_all_hlo_passes(true); + return debug_options; + } +}; + +TEST_F(ReductionVectorizationNoOptTest, MultiOutputStore) { + const char* hlo_text = R"( +HloModule MultiOutputStore + +%add_f32 { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add = f32[] add(%x, %y) +} + +%fused_computation { + %param_0 = f32[2,384,1024] parameter(0) + %param_1 = f32[2,384] parameter(1) + %constant0 = f32[] constant(0.0009765625) + %broadcast0 = f32[2,384] broadcast(%constant0), dimensions={} + %multiply0 = f32[2,384] multiply(%param_1, %broadcast0) + %broadcast1 = f32[2,384,1024] broadcast(%multiply0), dimensions={0,1} + %subtract = f32[2,384,1024] subtract(%param_0, %broadcast1) + %multiply1 = f32[2,384,1024] multiply(%subtract, %subtract) + %constant1 = f32[] constant(0) + %reduce = f32[2,384] reduce(%multiply1, %constant1), dimensions={2}, to_apply=%add_f32 + ROOT %tuple = (f32[2,384], f32[2,384,1024], f32[2,384,1024]) tuple(%reduce, %subtract, %broadcast1) +} + +ENTRY %cluster { + %param0 = f32[2,384,1024] parameter(0) + %param1 = f32[2,384] parameter(1) + ROOT %fusion = (f32[2,384], f32[2,384,1024], f32[2,384,1024]) fusion(%param0, %param1), kind=kInput, calls=%fused_computation +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + CompileAndOptionallyVerifyPtx(std::move(optimized_module), + R"( +CHECK: ld.global.nc.v2.f32 +CHECK: st.global.v2.f32 +CHECK: st.global.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: st.global.v2.f32 +CHECK: st.global.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: st.global.v2.f32 +CHECK: st.global.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: st.global.v2.f32 +CHECK: st.global.v2.f32 +)"); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + } // namespace } // namespace gpu } // namespace xla From 502c51cc351b9ea9f8e5a18141be8ce83679f791 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Tue, 14 Apr 2020 07:49:40 -0700 Subject: [PATCH 52/52] Fix test on some GPUs/env combination. --- .../xla/service/gpu/tests/reduction_vectorization_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc index 040025ee1a2..4f8ada9ef93 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -176,7 +176,6 @@ CHECK: ld.global.nc.f32 CHECK: ld.global.nc.f32 CHECK: ld.global.nc.f32 CHECK: ld.global.nc.f32 -CHECK: ld.global.nc.f32 )"; } CompileAndOptionallyVerifyPtx(std::move(optimized_module), expected_ptx);
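Taken together, the series changes the inner tile loop so that, in the vectorized StridedLinearIndexingX mode, a thread owns x_num_steps/vector_size groups of vector_size consecutive elements rather than a purely strided sequence of single elements; the consecutive groups are what let LLVM and ptxas emit ld.global.nc.v2 and st.global.v2 accesses. A standalone C++ sketch of the resulting per-thread element offsets (parameter values are illustrative):

#include <cstdint>
#include <cstdio>
#include <vector>

// Enumerates the in-tile offsets visited by one thread under the vectorized
// strided indexing, following UnrollInnerTileLoop:
//   start_offset_x = thread_id_x * vector_size
//   offset         = start_offset_x + j * step_x * vector_size + i
// with step_x = num_threads_x and j/i the group and in-group counters.
std::vector<int64_t> ThreadElementOffsets(int64_t thread_id_x,
                                          int64_t x_num_steps,
                                          int64_t num_threads_x,
                                          int64_t vector_size) {
  std::vector<int64_t> offsets;
  const int64_t step_x = num_threads_x;
  const int64_t start_offset_x = thread_id_x * vector_size;
  for (int64_t j = 0; j < x_num_steps / vector_size; ++j) {
    for (int64_t i = 0; i < vector_size; ++i) {
      offsets.push_back(start_offset_x + j * step_x * vector_size + i);
    }
  }
  return offsets;
}

int main() {
  // 4 threads, 4 steps each, vector size 2 covers a 16-element tile:
  // thread 0 -> 0 1 8 9, thread 1 -> 2 3 10 11, thread 2 -> 4 5 12 13, ...
  for (int64_t t = 0; t < 4; ++t) {
    std::printf("thread %lld:", (long long)t);
    for (int64_t off : ThreadElementOffsets(t, /*x_num_steps=*/4,
                                            /*num_threads_x=*/4,
                                            /*vector_size=*/2)) {
      std::printf(" %lld", (long long)off);
    }
    std::printf("\n");
  }
  return 0;
}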