From a8a9a98935143de7ec7e08aaef03b73b6a8137cf Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 13 Jan 2020 09:31:14 -0800 Subject: [PATCH 01/52] Add env variable to allow different reduction case. --- .../xla/service/gpu/ir_emitter_unnested.cc | 88 +++++++++++++------ 1 file changed, 62 insertions(+), 26 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 528a847b3ed..35f5f128108 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1959,6 +1959,12 @@ void IrEmitterUnnested::EmitTile( auto ceil_of_ratio = [&](llvm::Value* a, llvm::Value* b) { return b_.CreateUDiv(b_.CreateAdd(b_.CreateAdd(a, b), constant(-1)), b); }; + int vec_stride = 1; + char* env = getenv("VEC_STRIDE"); + if (env) + vec_stride = atoi(env); + std::cout << 'VEC_STRIDE ' << vec_stride << std::endl; + std::cout << "num_threads_x " << num_threads_x << " num_threads_y " << num_threads_y << " tile_size_x " << tile_size_x << std::endl; // True iff all threads always execute all instructions in the tiling // dimension X. @@ -1978,32 +1984,62 @@ void IrEmitterUnnested::EmitTile( // // TODO(cheshire): Once ptxas is fixed and TF switches to it, remove the // workaround. - ksl->For(loop_name + "_y_in_tile", - /*start=*/constant(0), - /*end=*/ - ceil_of_ratio(b_.CreateSub(tile_height, thread_id_info.thread_id_y), - num_threads_y), - /*step=*/constant(1), [&](llvm::Value* y_indvar) { - llvm::Value* y_loc = - b_.CreateAdd(thread_id_info.thread_id_y, - b_.CreateMul(y_indvar, num_threads_y)); - for (int64 j = 0; j < x_num_steps; j++) { - llvm::Value* x_loc = - b_.CreateAdd(constant(j * step_x), start_offset_x, "x_loc"); - IrArray::Index source_idx_x = - source_idx.AddOffsetToDim(y_loc, kDimY, &b_) - .AddOffsetToDim(constant(j * step_x), kDimX, &b_); - auto emit_element = [&] { - return emit_elem_function(source_idx_x, y_loc, x_loc, j); - }; - if (!x_tile_fits) { - ksl->If(loop_name + "_x_in_tile", - b_.CreateICmpULT(x_loc, tile_width), emit_element); - } else { - emit_element(); - } - } - }); + ksl->For( + loop_name + "_y_in_tile", + /*start=*/constant(0), + /*end=*/ + ceil_of_ratio(b_.CreateSub(tile_height, thread_id_info.thread_id_y), + num_threads_y), + /*step=*/constant(1), [&](llvm::Value* y_indvar) { + + + + llvm::Value* y_loc = b_.CreateAdd( + thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); + if (vec_stride == 1) { + for (int64 j = 0; j < x_num_steps; j++) { + llvm::Value* x_loc = + b_.CreateAdd(constant(j * step_x), start_offset_x, "x_loc"); + IrArray::Index source_idx_x = + source_idx.AddOffsetToDim(y_loc, kDimY, &b_) + .AddOffsetToDim(constant(j * step_x), kDimX, &b_); + auto emit_element = [&] { + return emit_elem_function(source_idx_x, y_loc, x_loc, j); + }; + if (!x_tile_fits) { + ksl->If(loop_name + "_x_in_tile", + b_.CreateICmpULT(x_loc, tile_width), emit_element); + } else { + emit_element(); + } + } + } else { + for (int64 j = 0; j < x_num_steps/vec_stride; j++) { + //Prep some values. If we do not do this, LLVM doesn't vectorize. 
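+            // Hoisting x_loc_base / source_idx_x_base out of the inner
+            // vec_stride loop leaves each access as a common base plus a small
+            // constant offset, the pattern LLVM's load/store vectorizer can
+            // merge into a single wide load.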
+ llvm::Value* x_loc_base = + b_.CreateAdd(constant(j * step_x * vec_stride), start_offset_x, "x_loc_base"); + IrArray::Index source_idx_x_base = + source_idx.AddOffsetToDim(y_loc, kDimY, &b_) + .AddOffsetToDim(constant(j * step_x * vec_stride), kDimX, &b_); + + for (int i = 0; i < vec_stride; i++) { + int old_j = j * vec_stride + i; + llvm::Value* x_loc = b_.CreateAdd(constant(i), x_loc_base, "x_loc"); + IrArray::Index source_idx_x = source_idx_x_base.AddOffsetToDim( + constant(i), kDimX, &b_); + auto emit_element = [&] { + return emit_elem_function(source_idx_x, y_loc, x_loc, j); + }; + if (!x_tile_fits) { + ksl->If(loop_name + "_x_in_tile", + b_.CreateICmpULT(x_loc, tile_width), emit_element); + } else { + emit_element(); + } + } + } + } + }); } // Emits code to process a tensor element in a tile for the given kCopy HLO that From bbe5592773d203172aae07c878f720633d16c906 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Tue, 21 Jan 2020 09:11:49 -0800 Subject: [PATCH 02/52] Enable the new vectorized+strided iteration mode only for row reduction. --- .../xla/service/gpu/ir_emitter_unnested.cc | 50 +++++++++++++------ .../xla/service/gpu/kernel_mapping_scheme.h | 31 ++++++++---- 2 files changed, 54 insertions(+), 27 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 35f5f128108..9c9bb0bd6ba 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1919,9 +1919,13 @@ Status IrEmitterUnnested::EmitTargetElementLoop( static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, llvm::Value* thread_id_x, llvm::Type* index_ty, - llvm::IRBuilder<>* b) { - if (mapping_scheme.DilatedX()) { + llvm::IRBuilder<>* b, + int vec_stride = 1) { + if (mapping_scheme.GetIndexingOrder() == KernelMappingScheme::DilatedIndexingX) { return thread_id_x; + } else if (mapping_scheme.GetIndexingOrder() == + KernelMappingScheme::VectorizedCoalescedIndexingX) { + return b->CreateMul(thread_id_x, llvm::ConstantInt::get(index_ty, vec_stride)); } int64 x_num_steps = mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX(); @@ -1944,14 +1948,23 @@ void IrEmitterUnnested::EmitTile( int64 tile_size_x = mapping_scheme.GetTileSizeX(); int64 x_num_steps = tile_size_x / num_threads_x; - llvm::Value* start_offset_x = GetStartOffsetX( - mapping_scheme, thread_id_info.thread_id_x, index_ty, &b_); + int vec_stride = 2; + char* env = getenv("VEC_STRIDE"); + if (env) + vec_stride = atoi(env); + if (mapping_scheme.GetIndexingOrder() != KernelMappingScheme::VectorizedCoalescedIndexingX ) { + vec_stride = 1; + } + + // TODO: should I modify the other call to GetStartOffsetX? + llvm::Value* start_offset_x = + GetStartOffsetX(mapping_scheme, thread_id_info.thread_id_x, index_ty, &b_, vec_stride); // Using dilated mapping scheme, each thread steps with a stride of number // of threads. // Otherwise, the stride is one, but we multiply each offset by the limit of // number of steps which can be made. - int64 step_x = mapping_scheme.DilatedX() ? num_threads_x : 1; + int64 step_x = mapping_scheme.GetIndexingOrder() == KernelMappingScheme::LinearIndexingX? 
1 : num_threads_x; IrArray::Index source_idx = tile_origin_index.AddOffsetToDim(start_offset_x, kDimX, &b_); @@ -1959,13 +1972,6 @@ void IrEmitterUnnested::EmitTile( auto ceil_of_ratio = [&](llvm::Value* a, llvm::Value* b) { return b_.CreateUDiv(b_.CreateAdd(b_.CreateAdd(a, b), constant(-1)), b); }; - int vec_stride = 1; - char* env = getenv("VEC_STRIDE"); - if (env) - vec_stride = atoi(env); - std::cout << 'VEC_STRIDE ' << vec_stride << std::endl; - std::cout << "num_threads_x " << num_threads_x << " num_threads_y " << num_threads_y << " tile_size_x " << tile_size_x << std::endl; - // True iff all threads always execute all instructions in the tiling // dimension X. bool x_tile_fits = mapping_scheme.GetDimsInElems()[kDimX] % tile_size_x == 0; @@ -2122,7 +2128,8 @@ static int GetNumberOfPartialResults( if (reduction_info.IsRowReduction()) { return 1; } - int64 num_partial_results = mapping_scheme.DilatedX() ? 1 : 2; + int64 num_partial_results = mapping_scheme.GetIndexingOrder() == + KernelMappingScheme::DilatedIndexingX ? 1 : 2; CHECK_EQ(num_partial_results, (mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX())); return num_partial_results; @@ -2706,7 +2713,7 @@ void IrEmitterUnnested::EmitHlo021Tile( /*tile_sizes=*/{1, kWarpSize, kWarpSize}, /*num_threads_y=*/kNumRows, /*num_threads_x=*/kWarpSize, - /*is_dilated_x=*/false); + /*indexing_order=*/KernelMappingScheme::LinearIndexingX); LaunchDimensions launch_dimensions(mapping_scheme.GetNumberOfBlocks(), mapping_scheme.GetThreadsPerBlock()); llvm::Type* index_type = @@ -3152,7 +3159,18 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( !IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape, reduction_dimensions.dimensions[2]); - if (!dilated_x && !reduction_dimensions.is_row_reduction) { + KernelMappingScheme::IndexingOrder indexing_order; + if (reduction_dimensions.is_row_reduction) { + indexing_order = KernelMappingScheme::VectorizedCoalescedIndexingX; + } else if (IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape, + reduction_dimensions.dimensions[2])) { + indexing_order = KernelMappingScheme::LinearIndexingX; + } else { + indexing_order = KernelMappingScheme::DilatedIndexingX; + } + + if (indexing_order == KernelMappingScheme::LinearIndexingX && + !reduction_dimensions.is_row_reduction) { // Vectorized loads: a single thread reduces two adjacent columns. reduction_tiling[2] *= 2; } @@ -3173,7 +3191,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( reduction_dimensions.dimensions, {reduction_tiling[0], reduction_tiling[1] * num_threads_y, reduction_tiling[2] * num_threads_x}, - num_threads_y, num_threads_x, dilated_x); + num_threads_y, num_threads_x, indexing_order); return ReductionCodegenInfo(mapping_scheme, reduction_dimensions.is_row_reduction); } diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index eeab8d4dc80..022104361e6 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -76,19 +76,23 @@ namespace gpu { class KernelMappingScheme { public: enum { DimZ = 0, DimY, DimX, DimTot }; + // TODO: rename Dilated to Strided? 
+ enum IndexingOrder { LinearIndexingX, DilatedIndexingX, VectorizedCoalescedIndexingX }; + KernelMappingScheme(absl::Span dims_in_elems, absl::Span tile_sizes, int64 num_threads_y, - int64 num_threads_x, bool is_dilated_x) + int64 num_threads_x, IndexingOrder indexing_order) : dims_in_elems_{dims_in_elems[0], dims_in_elems[1], dims_in_elems[2]}, tile_sizes_{tile_sizes[0], tile_sizes[1], tile_sizes[2]}, num_threads_x_(num_threads_x), num_threads_y_(num_threads_y), - dilated_x_(is_dilated_x) { + indexing_order_(indexing_order) { CHECK_EQ(tile_sizes[1] % num_threads_y_, 0); CHECK_EQ(tile_sizes[2] % num_threads_x_, 0); VLOG(10) << "dims_in_elems_ = " << absl::StrJoin(dims_in_elems_, ","); - if (!dilated_x_) { - // dilated_x_=false is for the purpose of vectorization, which requires + if (indexing_order != LinearIndexingX) { + // DilatedIndexingX, and VectorizedCoalescedIndexingX + // is for the purpose of vectorization, which requires // GetTileSizeFor(DimX) to be a multiplier of num_threads_x_. CHECK_EQ(GetTileSizeFor(DimX) % num_threads_x_, 0); } @@ -118,7 +122,9 @@ class KernelMappingScheme { return GetNumThreadsX() * GetNumThreadsY(); } - bool DilatedX() const { return dilated_x_; } + IndexingOrder GetIndexingOrder() const { + return indexing_order_; + } private: // The number of elements in each dimension. @@ -133,12 +139,15 @@ class KernelMappingScheme { // Number of threads used to process elements in the Y direction of a tile. const int64 num_threads_y_; - // When num_threads_x threads process a total of tile_size_x elements in the - // X dimension of a tile, each threads process n=tile_size_x/num_threads_x - // elements. When dilated_x=false, the n elements processed by a thread are - // contiguous. On the other hand, when dilated_x=true the n elements are - // dilated by a factor of num_threads_x. - const bool dilated_x_; + // When num_threads_x threads process a total of tile_size_x + // elements in the X dimension of a tile, each threads process + // n=tile_size_x/num_threads_x elements. When indexing_order_ have + // the value LinearIndexingX, the n elements processed by a thread + // are contiguous. When the value is DilatedIndexingX, the n + // elements are dilated by a factor of num_threads_x. When the value + // is VectorizedCoalescedIndexingX, it is a mix of both to enable + // vectorizing while keeping memory coalescing. + const IndexingOrder indexing_order_; }; // Information to support the code generation for a tiled reduction kernel. 
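For reference, a minimal standalone sketch of how the three indexing orders above map one thread to the elements it visits within a tile row (illustrative code only, not part of any commit in this series: plain int64_t stands in for XLA's int64, and ElementsForThread is a hypothetical helper, not an XLA API). It assumes num_threads_x threads cover tile_size_x elements per row and that vector_size contiguous elements are read per step.

  #include <cstdint>
  #include <vector>

  enum IndexingOrder { LinearIndexingX, DilatedIndexingX, VectorizedCoalescedIndexingX };

  // Offsets within one tile row visited by a single thread, in visit order.
  std::vector<int64_t> ElementsForThread(int64_t thread_id_x, int64_t num_threads_x,
                                         int64_t tile_size_x, int64_t vector_size,
                                         IndexingOrder order) {
    std::vector<int64_t> offsets;
    const int64_t x_num_steps = tile_size_x / num_threads_x;
    if (order == LinearIndexingX) {
      // Contiguous chunk per thread: start = thread_id_x * x_num_steps, step = 1.
      for (int64_t j = 0; j < x_num_steps; ++j)
        offsets.push_back(thread_id_x * x_num_steps + j);
    } else if (order == DilatedIndexingX) {
      // Coalesced but scalar: start = thread_id_x, step = num_threads_x.
      for (int64_t j = 0; j < x_num_steps; ++j)
        offsets.push_back(thread_id_x + j * num_threads_x);
    } else {  // VectorizedCoalescedIndexingX (renamed LinearDilatedIndexingX in a later patch)
      // vector_size contiguous elements per step (vectorizable loads), then a
      // jump of num_threads_x * vector_size so each step of the warp as a
      // whole still touches one contiguous span (coalesced).
      for (int64_t j = 0; j < x_num_steps / vector_size; ++j)
        for (int64_t i = 0; i < vector_size; ++i)
          offsets.push_back(thread_id_x * vector_size +
                            j * num_threads_x * vector_size + i);
    }
    return offsets;
  }

For example, with num_threads_x = 32, tile_size_x = 128 and vector_size = 2, thread 0 visits {0, 1, 64, 65} and thread 1 visits {2, 3, 66, 67}: each thread can issue 2-wide vector loads while each step of the warp still covers one contiguous 64-element span.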
From 04594ea65bf0c3e74576295b91d3101b06a4795d Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Sat, 15 Feb 2020 15:54:51 -0800 Subject: [PATCH 03/52] Move the vector size into KernelMappingScheme --- .../xla/service/gpu/ir_emitter_unnested.cc | 52 +++++++++---------- .../xla/service/gpu/kernel_mapping_scheme.h | 24 ++++++--- 2 files changed, 44 insertions(+), 32 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 9c9bb0bd6ba..9a523d0c111 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1919,13 +1919,13 @@ Status IrEmitterUnnested::EmitTargetElementLoop( static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, llvm::Value* thread_id_x, llvm::Type* index_ty, - llvm::IRBuilder<>* b, - int vec_stride = 1) { + llvm::IRBuilder<>* b) { if (mapping_scheme.GetIndexingOrder() == KernelMappingScheme::DilatedIndexingX) { return thread_id_x; } else if (mapping_scheme.GetIndexingOrder() == - KernelMappingScheme::VectorizedCoalescedIndexingX) { - return b->CreateMul(thread_id_x, llvm::ConstantInt::get(index_ty, vec_stride)); + KernelMappingScheme::LinearDilatedIndexingX) { + int vector_size = mapping_scheme.GetVectorSize(); + return b->CreateMul(thread_id_x, llvm::ConstantInt::get(index_ty, vector_size)); } int64 x_num_steps = mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX(); @@ -1948,17 +1948,8 @@ void IrEmitterUnnested::EmitTile( int64 tile_size_x = mapping_scheme.GetTileSizeX(); int64 x_num_steps = tile_size_x / num_threads_x; - int vec_stride = 2; - char* env = getenv("VEC_STRIDE"); - if (env) - vec_stride = atoi(env); - if (mapping_scheme.GetIndexingOrder() != KernelMappingScheme::VectorizedCoalescedIndexingX ) { - vec_stride = 1; - } - - // TODO: should I modify the other call to GetStartOffsetX? - llvm::Value* start_offset_x = - GetStartOffsetX(mapping_scheme, thread_id_info.thread_id_x, index_ty, &b_, vec_stride); + llvm::Value* start_offset_x = GetStartOffsetX( + mapping_scheme, thread_id_info.thread_id_x, index_ty, &b_); // Using dilated mapping scheme, each thread steps with a stride of number // of threads. @@ -1972,6 +1963,7 @@ void IrEmitterUnnested::EmitTile( auto ceil_of_ratio = [&](llvm::Value* a, llvm::Value* b) { return b_.CreateUDiv(b_.CreateAdd(b_.CreateAdd(a, b), constant(-1)), b); }; + // True iff all threads always execute all instructions in the tiling // dimension X. bool x_tile_fits = mapping_scheme.GetDimsInElems()[kDimX] % tile_size_x == 0; @@ -1990,6 +1982,7 @@ void IrEmitterUnnested::EmitTile( // // TODO(cheshire): Once ptxas is fixed and TF switches to it, remove the // workaround. + int vec_stride = mapping_scheme.GetVectorSize(); ksl->For( loop_name + "_y_in_tile", /*start=*/constant(0), @@ -1997,9 +1990,7 @@ void IrEmitterUnnested::EmitTile( ceil_of_ratio(b_.CreateSub(tile_height, thread_id_info.thread_id_y), num_threads_y), /*step=*/constant(1), [&](llvm::Value* y_indvar) { - - - + printf("VEC_STRIDE %d\n", vec_stride); llvm::Value* y_loc = b_.CreateAdd( thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); if (vec_stride == 1) { @@ -2020,18 +2011,18 @@ void IrEmitterUnnested::EmitTile( } } } else { - for (int64 j = 0; j < x_num_steps/vec_stride; j++) { + for (int64 j = 0; j < x_num_steps/vec_stride; j++) { //Prep some values. If we do not do this, LLVM doesn't vectorize. 
llvm::Value* x_loc_base = - b_.CreateAdd(constant(j * step_x * vec_stride), start_offset_x, "x_loc_base"); + b_.CreateAdd(constant(j * step_x * vec_stride), start_offset_x, "x_loc_base"); IrArray::Index source_idx_x_base = source_idx.AddOffsetToDim(y_loc, kDimY, &b_) .AddOffsetToDim(constant(j * step_x * vec_stride), kDimX, &b_); - for (int i = 0; i < vec_stride; i++) { + for (int i = 0; i < vec_stride; i++) { int old_j = j * vec_stride + i; - llvm::Value* x_loc = b_.CreateAdd(constant(i), x_loc_base, "x_loc"); - IrArray::Index source_idx_x = source_idx_x_base.AddOffsetToDim( + llvm::Value* x_loc = b_.CreateAdd(constant(i), x_loc_base, "x_loc"); + IrArray::Index source_idx_x = source_idx_x_base.AddOffsetToDim( constant(i), kDimX, &b_); auto emit_element = [&] { return emit_elem_function(source_idx_x, y_loc, x_loc, j); @@ -2713,7 +2704,8 @@ void IrEmitterUnnested::EmitHlo021Tile( /*tile_sizes=*/{1, kWarpSize, kWarpSize}, /*num_threads_y=*/kNumRows, /*num_threads_x=*/kWarpSize, - /*indexing_order=*/KernelMappingScheme::LinearIndexingX); + /*indexing_order=*/KernelMappingScheme::LinearIndexingX, + /*vector_size*/1); LaunchDimensions launch_dimensions(mapping_scheme.GetNumberOfBlocks(), mapping_scheme.GetThreadsPerBlock()); llvm::Type* index_type = @@ -3161,7 +3153,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( KernelMappingScheme::IndexingOrder indexing_order; if (reduction_dimensions.is_row_reduction) { - indexing_order = KernelMappingScheme::VectorizedCoalescedIndexingX; + indexing_order = KernelMappingScheme::LinearDilatedIndexingX; } else if (IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape, reduction_dimensions.dimensions[2])) { indexing_order = KernelMappingScheme::LinearIndexingX; @@ -3187,11 +3179,19 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( return kWarpSize; }(); + int vec_stride = 2; + char* env = getenv("VEC_STRIDE"); + if (env) + vec_stride = atoi(env); + if (indexing_order != KernelMappingScheme::LinearDilatedIndexingX) { + vec_stride = 1; + } KernelMappingScheme mapping_scheme( reduction_dimensions.dimensions, {reduction_tiling[0], reduction_tiling[1] * num_threads_y, reduction_tiling[2] * num_threads_x}, - num_threads_y, num_threads_x, indexing_order); + num_threads_y, num_threads_x, indexing_order, + vec_stride); return ReductionCodegenInfo(mapping_scheme, reduction_dimensions.is_row_reduction); } diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index 022104361e6..c8f2864c3e9 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -77,21 +77,23 @@ class KernelMappingScheme { public: enum { DimZ = 0, DimY, DimX, DimTot }; // TODO: rename Dilated to Strided? 
- enum IndexingOrder { LinearIndexingX, DilatedIndexingX, VectorizedCoalescedIndexingX }; + enum IndexingOrder { LinearIndexingX, DilatedIndexingX, LinearDilatedIndexingX }; KernelMappingScheme(absl::Span dims_in_elems, absl::Span tile_sizes, int64 num_threads_y, - int64 num_threads_x, IndexingOrder indexing_order) + int64 num_threads_x, IndexingOrder indexing_order, + int vector_size) : dims_in_elems_{dims_in_elems[0], dims_in_elems[1], dims_in_elems[2]}, tile_sizes_{tile_sizes[0], tile_sizes[1], tile_sizes[2]}, num_threads_x_(num_threads_x), num_threads_y_(num_threads_y), - indexing_order_(indexing_order) { + indexing_order_(indexing_order), + vector_size_(vector_size){ CHECK_EQ(tile_sizes[1] % num_threads_y_, 0); CHECK_EQ(tile_sizes[2] % num_threads_x_, 0); VLOG(10) << "dims_in_elems_ = " << absl::StrJoin(dims_in_elems_, ","); if (indexing_order != LinearIndexingX) { - // DilatedIndexingX, and VectorizedCoalescedIndexingX + // DilatedIndexingX, and LinearDilatedIndexingX // is for the purpose of vectorization, which requires // GetTileSizeFor(DimX) to be a multiplier of num_threads_x_. CHECK_EQ(GetTileSizeFor(DimX) % num_threads_x_, 0); @@ -125,6 +127,9 @@ class KernelMappingScheme { IndexingOrder GetIndexingOrder() const { return indexing_order_; } + int GetVectorSize() const { + return vector_size_; + } private: // The number of elements in each dimension. @@ -145,9 +150,16 @@ class KernelMappingScheme { // the value LinearIndexingX, the n elements processed by a thread // are contiguous. When the value is DilatedIndexingX, the n // elements are dilated by a factor of num_threads_x. When the value - // is VectorizedCoalescedIndexingX, it is a mix of both to enable - // vectorizing while keeping memory coalescing. + // is LinearDilatedIndexingX, it is a mix of both to enable + // vectorizing while keeping memory coalescing. It reads + // vector_size_ elements, then do a step to the next vector_size_ + // elements. const IndexingOrder indexing_order_; + // vector_size_ only supported for row reduction. + // Must be a divisor of tile_sizes_[2]/num_threads_x. + // Interesting values are 2 and 4 to trigger vectorized loads on GPUs + // While keeping memory coalescing. + const int vector_size_; }; // Information to support the code generation for a tiled reduction kernel. From 5fb7dcbe077e66b6f473a6758a6e97ca766c7525 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Tue, 18 Feb 2020 13:59:05 -0800 Subject: [PATCH 04/52] If some blocks fit completly, unroll to vectorize for them. --- .../xla/service/gpu/ir_emitter_unnested.cc | 76 ++++++++++++++++++- 1 file changed, 73 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 9a523d0c111..135a6396c27 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2011,6 +2011,7 @@ void IrEmitterUnnested::EmitTile( } } } else { + auto unroll = [&](bool add_if, int64 max_element, int64 vec_stride) { for (int64 j = 0; j < x_num_steps/vec_stride; j++) { //Prep some values. If we do not do this, LLVM doesn't vectorize. 
llvm::Value* x_loc_base = @@ -2027,7 +2028,7 @@ void IrEmitterUnnested::EmitTile( auto emit_element = [&] { return emit_elem_function(source_idx_x, y_loc, x_loc, j); }; - if (!x_tile_fits) { + if (add_if) { ksl->If(loop_name + "_x_in_tile", b_.CreateICmpULT(x_loc, tile_width), emit_element); } else { @@ -2035,8 +2036,77 @@ void IrEmitterUnnested::EmitTile( } } } - } - }); + }; + + char * env_if = getenv("RED_IF"); + int red_if = 1; + if (env_if) { + red_if = atoi(env_if); + printf("RED_IF2 = %d %s\n", red_if, env_if); + } + if (red_if == 1) { + std::cout << "IF_NB 1: " << std::endl; + unroll(!x_tile_fits, x_num_steps, vec_stride); + } else if (red_if == 2) { + std::cout << "IF_NB 2" << std::endl; + ksl->If(loop_name + "_is_full_tile", + //b->CreateICmpULT(last_element, tile_width), + b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeFor(2)), tile_width), + [&] {unroll(false, x_num_steps, vec_stride);}, + [&] {unroll(true, x_num_steps, vec_stride);}); + } else { + std::cout << "IF_NB 3" << std::endl; + //b->CreateICmpULT(start_offset_x+j * step_x * vec_stride + i, tile_width) + int last_block_left_element = mapping_scheme.GetDimsInElems()[2] % x_num_steps; + std::cout << "MAPPING " << mapping_scheme.GetDimsInElems()[0] << " " + << mapping_scheme.GetDimsInElems()[1] << " " + << mapping_scheme.GetDimsInElems()[2] << std::endl; + std::cout << "LAST_BLOCK x_num_steps " << x_num_steps + << " last_block" << last_block_left_element << std::endl; + // NB block per reduction. + int nb_block = CeilOfRatio(mapping_scheme.GetDimsInElems()[2], + tile_size_x); + std::cout << "NB_BLOCK" << nb_block << std::endl; + if (x_tile_fits) { + // All threads will completly unroll + unroll(false, x_num_steps, vec_stride); + } else if(nb_block == 1) { + // No thread will completly unroll. + // TODO: unroll by the right amount + unroll(true, x_num_steps, vec_stride); + } else { + // For some blocks, all threads will will completly unroll. + // For other blocks, some of its threads will completly unroll, others will partially and some won't be used. + // So do an if(thread fully unroll) {code with no if between elements} else {code with if between each elements} + // TODO: in the else part, unroll without if but with the right number of elements left. + + llvm::Value* block_id = gpu::EmitCallToTargetIntrinsic( + gpu::TargetIntrinsicID::kBlockIdx, {}, {}, &b_); + llvm::Value* last_element = b_.CreateAdd(constant(x_num_steps * tile_size_x), + start_offset_x, "last_element"); + int x_num_steps_partial = mapping_scheme.GetDimsInElems()[2] % tile_size_x; + x_num_steps_partial *= 2; + //x_num_steps_partial = x_num_steps; + ksl->If(loop_name + "_is_full_tile", + // Test if all the elements of this thread is withing tile. + b_.CreateICmpULT(last_element, tile_width), + // Not the last block, so unroll without ifs. + [&] {unroll(false, x_num_steps, vec_stride);}, + // The last block isn't completly unrolled. + + // TODO: unroll the right size. Take care + // vec_stride must match the above unroll for + // now. + // TODO: after unroll of the right size, remove the IFs. 
+ // ONGOING, try to make it work with less + // then unroll x_num_steps + + [&] {unroll(true, x_num_steps, vec_stride);}); // works + //[&] {unroll(true, x_num_steps, x_num_steps);}); + //[&] {unroll(true, x_num_steps_partial, vec_stride);}); + } + } + }}); } // Emits code to process a tensor element in a tile for the given kCopy HLO that From 24ac05af45f90b7e6738587a64d713870379662a Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 19 Feb 2020 11:42:29 -0800 Subject: [PATCH 05/52] Add comment --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 135a6396c27..b63da486a1e 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2051,6 +2051,7 @@ void IrEmitterUnnested::EmitTile( std::cout << "IF_NB 2" << std::endl; ksl->If(loop_name + "_is_full_tile", //b->CreateICmpULT(last_element, tile_width), + // If (the thread fully unrolled) {no condition path} else {condition path} b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeFor(2)), tile_width), [&] {unroll(false, x_num_steps, vec_stride);}, [&] {unroll(true, x_num_steps, vec_stride);}); From 51229ce3ffd9fcdde61147d21c343c5773cc2c48 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 19 Feb 2020 07:40:59 -0800 Subject: [PATCH 06/52] Only vectorize when the tile completly fit. --- .../xla/service/gpu/ir_emitter_unnested.cc | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index b63da486a1e..7aebf2e9577 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -3250,17 +3250,20 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( return kWarpSize; }(); - int vec_stride = 2; + int tile_size_x = reduction_tiling[2] * num_threads_x; + + int vec_stride = 1; char* env = getenv("VEC_STRIDE"); - if (env) + if (indexing_order == KernelMappingScheme::LinearDilatedIndexingX && + reduction_dimensions.dimensions[2] % tile_size_x == 0) { + vec_stride = 2; + } + if (env) { vec_stride = atoi(env); - if (indexing_order != KernelMappingScheme::LinearDilatedIndexingX) { - vec_stride = 1; } KernelMappingScheme mapping_scheme( reduction_dimensions.dimensions, - {reduction_tiling[0], reduction_tiling[1] * num_threads_y, - reduction_tiling[2] * num_threads_x}, + {reduction_tiling[0], reduction_tiling[1] * num_threads_y, tile_size_x}, num_threads_y, num_threads_x, indexing_order, vec_stride); return ReductionCodegenInfo(mapping_scheme, From 8a0e361243629c9806a4c1183d1cb19a7ea19c14 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 19 Feb 2020 07:56:23 -0800 Subject: [PATCH 07/52] Limit default vectorization to we know LLVM won't vectorize --- .../xla/service/gpu/ir_emitter_unnested.cc | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 7aebf2e9577..7beb5ddbe50 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1860,12 +1860,13 @@ namespace { // Returns true if the fusion contains any 
instruction that is likely // translated to complex LLVM IR, such as loops, and prevent vectorization. -bool MayPreventVectorization(const HloInstruction& hlo) { +bool MayPreventVectorization(const HloInstruction& hlo, const bool tolerate_reduce=false) { if (hlo.opcode() == HloOpcode::kFusion) { return absl::c_any_of(hlo.fused_instructions_computation()->instructions(), - [](const HloInstruction* instr) { + [tolerate_reduce](const HloInstruction* instr) { switch (instr->opcode()) { case HloOpcode::kReduce: + return !tolerate_reduce; case HloOpcode::kReduceWindow: case HloOpcode::kSort: case HloOpcode::kDot: @@ -1892,6 +1893,8 @@ bool MayPreventVectorization(const HloInstruction& hlo) { default: return false; } + } else if (hlo.opcode() == HloOpcode::kReduce) { + return !tolerate_reduce; } return true; } @@ -3254,12 +3257,21 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( int vec_stride = 1; char* env = getenv("VEC_STRIDE"); - if (indexing_order == KernelMappingScheme::LinearDilatedIndexingX && - reduction_dimensions.dimensions[2] % tile_size_x == 0) { - vec_stride = 2; + if (indexing_order == KernelMappingScheme::LinearDilatedIndexingX) { + if (reduction_dimensions.dimensions[2] % tile_size_x == 0 && + // As XLA unroll and suppose LLVM will vectorize, + // disable the unroll for case that LLVM doesn't vectorize. + !MayPreventVectorization(*unnested_hlo, /*tolerate_reduce*/true)) { + vec_stride = 2; + } else { + indexing_order = KernelMappingScheme::DilatedIndexingX; + } } if (env) { vec_stride = atoi(env); + if (vec_stride > 1) { + indexing_order = KernelMappingScheme::LinearDilatedIndexingX; + } } KernelMappingScheme mapping_scheme( reduction_dimensions.dimensions, From 66f52846a68f01dc2442aad8a69b459cdc4d39d7 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 19 Feb 2020 08:04:29 -0800 Subject: [PATCH 08/52] Rename a variable --- .../compiler/xla/service/gpu/ir_emitter_unnested.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 7beb5ddbe50..f9958547a59 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -3255,21 +3255,21 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( int tile_size_x = reduction_tiling[2] * num_threads_x; - int vec_stride = 1; - char* env = getenv("VEC_STRIDE"); + int vector_size = 1; + char* env = getenv("VECTOR_SIZE"); if (indexing_order == KernelMappingScheme::LinearDilatedIndexingX) { if (reduction_dimensions.dimensions[2] % tile_size_x == 0 && // As XLA unroll and suppose LLVM will vectorize, // disable the unroll for case that LLVM doesn't vectorize. 
!MayPreventVectorization(*unnested_hlo, /*tolerate_reduce*/true)) { - vec_stride = 2; + vector_size = 2; } else { indexing_order = KernelMappingScheme::DilatedIndexingX; } } if (env) { - vec_stride = atoi(env); - if (vec_stride > 1) { + vector_size = atoi(env); + if (vector_size > 1) { indexing_order = KernelMappingScheme::LinearDilatedIndexingX; } } @@ -3277,7 +3277,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( reduction_dimensions.dimensions, {reduction_tiling[0], reduction_tiling[1] * num_threads_y, tile_size_x}, num_threads_y, num_threads_x, indexing_order, - vec_stride); + vector_size); return ReductionCodegenInfo(mapping_scheme, reduction_dimensions.is_row_reduction); } From fbfdfbbd5a668d509c84a8213217768768ef053e Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 24 Feb 2020 08:30:31 -0800 Subject: [PATCH 09/52] Enable vectorization for even row size by default. --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index f9958547a59..46bf010b65f 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -3258,7 +3258,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( int vector_size = 1; char* env = getenv("VECTOR_SIZE"); if (indexing_order == KernelMappingScheme::LinearDilatedIndexingX) { - if (reduction_dimensions.dimensions[2] % tile_size_x == 0 && + if (reduction_dimensions.dimensions[2] % 2 == 0 && // As XLA unroll and suppose LLVM will vectorize, // disable the unroll for case that LLVM doesn't vectorize. !MayPreventVectorization(*unnested_hlo, /*tolerate_reduce*/true)) { From ec3edaf0277041350ad312e477db48266cbd860f Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 24 Feb 2020 08:45:18 -0800 Subject: [PATCH 10/52] Remove not working code path for odd row size. --- .../xla/service/gpu/ir_emitter_unnested.cc | 62 ++----------------- 1 file changed, 6 insertions(+), 56 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 46bf010b65f..3e8703134d2 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2042,73 +2042,23 @@ void IrEmitterUnnested::EmitTile( }; char * env_if = getenv("RED_IF"); - int red_if = 1; + int red_if = 0; if (env_if) { red_if = atoi(env_if); printf("RED_IF2 = %d %s\n", red_if, env_if); } - if (red_if == 1) { + if (red_if == 1 || x_tile_fits) { std::cout << "IF_NB 1: " << std::endl; unroll(!x_tile_fits, x_num_steps, vec_stride); - } else if (red_if == 2) { + } else { std::cout << "IF_NB 2" << std::endl; ksl->If(loop_name + "_is_full_tile", - //b->CreateICmpULT(last_element, tile_width), - // If (the thread fully unrolled) {no condition path} else {condition path} + // if (block fully fit) {fast path} else {slow path} + // tile_width is always exact. For the last block, + // it will be the exact number of elements left. 
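+                // Roughly, the structure emitted here is:
+                //   if (tile_size_x == tile_width) {   // whole tile in bounds
+                //     unroll(/*add_if=*/false, ...);   // no per-element guard,
+                //                                      // so LLVM may vectorize
+                //   } else {                           // partial (last) tile
+                //     unroll(/*add_if=*/true, ...);    // per-element bounds check
+                //   }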
b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeFor(2)), tile_width), [&] {unroll(false, x_num_steps, vec_stride);}, [&] {unroll(true, x_num_steps, vec_stride);}); - } else { - std::cout << "IF_NB 3" << std::endl; - //b->CreateICmpULT(start_offset_x+j * step_x * vec_stride + i, tile_width) - int last_block_left_element = mapping_scheme.GetDimsInElems()[2] % x_num_steps; - std::cout << "MAPPING " << mapping_scheme.GetDimsInElems()[0] << " " - << mapping_scheme.GetDimsInElems()[1] << " " - << mapping_scheme.GetDimsInElems()[2] << std::endl; - std::cout << "LAST_BLOCK x_num_steps " << x_num_steps - << " last_block" << last_block_left_element << std::endl; - // NB block per reduction. - int nb_block = CeilOfRatio(mapping_scheme.GetDimsInElems()[2], - tile_size_x); - std::cout << "NB_BLOCK" << nb_block << std::endl; - if (x_tile_fits) { - // All threads will completly unroll - unroll(false, x_num_steps, vec_stride); - } else if(nb_block == 1) { - // No thread will completly unroll. - // TODO: unroll by the right amount - unroll(true, x_num_steps, vec_stride); - } else { - // For some blocks, all threads will will completly unroll. - // For other blocks, some of its threads will completly unroll, others will partially and some won't be used. - // So do an if(thread fully unroll) {code with no if between elements} else {code with if between each elements} - // TODO: in the else part, unroll without if but with the right number of elements left. - - llvm::Value* block_id = gpu::EmitCallToTargetIntrinsic( - gpu::TargetIntrinsicID::kBlockIdx, {}, {}, &b_); - llvm::Value* last_element = b_.CreateAdd(constant(x_num_steps * tile_size_x), - start_offset_x, "last_element"); - int x_num_steps_partial = mapping_scheme.GetDimsInElems()[2] % tile_size_x; - x_num_steps_partial *= 2; - //x_num_steps_partial = x_num_steps; - ksl->If(loop_name + "_is_full_tile", - // Test if all the elements of this thread is withing tile. - b_.CreateICmpULT(last_element, tile_width), - // Not the last block, so unroll without ifs. - [&] {unroll(false, x_num_steps, vec_stride);}, - // The last block isn't completly unrolled. - - // TODO: unroll the right size. Take care - // vec_stride must match the above unroll for - // now. - // TODO: after unroll of the right size, remove the IFs. 
- // ONGOING, try to make it work with less - // then unroll x_num_steps - - [&] {unroll(true, x_num_steps, vec_stride);}); // works - //[&] {unroll(true, x_num_steps, x_num_steps);}); - //[&] {unroll(true, x_num_steps_partial, vec_stride);}); - } } }}); } From e98822187cd53a75b2fb038c404333eba608f511 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 24 Feb 2020 09:57:02 -0800 Subject: [PATCH 11/52] Add test --- .../compiler/xla/service/gpu/tests/BUILD | 27 +++ .../gpu/tests/reduction_vectorization_test.cc | 192 ++++++++++++++++++ 2 files changed, 219 insertions(+) create mode 100644 tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD index 1fd51c78988..e04dba418d9 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD @@ -164,6 +164,33 @@ tf_cc_test( ], ) +tf_cc_test( + name = "reduction_vectorization_test", + srcs = [ + "reduction_vectorization_test.cc", + ], + tags = tf_cuda_tests_tags() + ["no_rocm"], + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/service:gpu_plugin", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/service/gpu:gemm_rewriter", + "//tensorflow/compiler/xla/service/gpu:gpu_executable", + "//tensorflow/compiler/xla/tests:filecheck", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:llvm_irgen_test_base", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/memory", + ], +) + tf_cc_test( name = "reduction_dimension_grouper_test", srcs = [ diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc new file mode 100644 index 00000000000..470696e003b --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -0,0 +1,192 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/tests/filecheck.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace xla { +namespace gpu { + +namespace { + +class ReductionVectorizationTest : public GpuCodegenTest { + + protected: + void EnsureDeterminism(absl::string_view hlo_text) { + std::vector profiles; + profiles.emplace_back(); + profiles.emplace_back(); + EXPECT_TRUE(RunMultipleTimes(hlo_text, + /*run_hlo_passes=*/true, + /*profiles=*/&profiles, + /*backend_config=*/"", + /*assert_determinism=*/true)); + } +}; + +TEST_F(ReductionVectorizationTest, Power2) { + const char* hlo_text = R"( +HloModule ReducePower2 + +%max_ (x.5: f32[], y.6: f32[]) -> f32[] { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y) +} + +ENTRY %cluster_0__XlaCompiledKernel_true__XlaNumConstantArgs_0__XlaNumResourceArgs_0_.25 (param_0: f32[5,131072]) -> f32[5]{0} { + %param_0 = f32[5,131072]{1,0} parameter(0), parameter_replication={false} + %constant.3 = f32[] constant(0) + ROOT %reduce.8 = f32[5]{0} reduce(f32[5,131072]{1,0} %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + CompileAndOptionallyVerifyPtx(std::move(optimized_module), + R"( +CHECK: ld.global.nc.v2.f32 +)"); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +TEST_F(ReductionVectorizationTest, TileFit) { + const char* hlo_text = R"( +HloModule ReduceTileFit + +%max_ (x.5: f32[], y.6: f32[]) -> f32[] { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y) +} + +ENTRY %cluster_0__XlaCompiledKernel_true__XlaNumConstantArgs_0__XlaNumResourceArgs_0_.25 (param_0: f32[5,122880]) -> f32[5]{0} { + %param_0 = f32[5,122880]{1,0} parameter(0), parameter_replication={false} + %constant.3 = f32[] constant(0) + ROOT %reduce.8 = f32[5]{0} reduce(f32[5,122880]{1,0} %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + CompileAndOptionallyVerifyPtx(std::move(optimized_module), + R"( +CHECK: ld.global.nc.v2.f32 +)"); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +TEST_F(ReductionVectorizationTest, DisableOddColumns) { + const char* hlo_text = R"( +HloModule ReduceTileFit + +%max_ { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %maximum.7 = f32[] maximum(%x, %y) +} + +ENTRY %cluster_0__XlaCompiledKernel_true__XlaNumConstantArgs_0__XlaNumResourceArgs_0_.25 (param_0: f32[5,131071]) -> f32[5]{0} { + %param_0 = f32[5,131071]{1,0} parameter(0), parameter_replication={false} + %constant.3 = f32[] constant(0) + ROOT %reduce.8 = f32[5]{0} reduce(f32[5,131071]{1,0} %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ +} +)"; + 
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + CompileAndOptionallyVerifyPtx(std::move(optimized_module), + R"( +CHECK-NOT: ld.global.nc.v2.f32 +CHECK-NOT: ld.global.nc.v4.f32 +CHECK-NOT: ld.global.nc.u64 +CHECK-NOT: ld.global.u64 +)"); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +TEST_F(ReductionVectorizationTest, Exp) { + const char* hlo_text = R"( +HloModule DisableSin + +%add_float { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add.17 = f32[] add(f32[] %x, f32[] %y) +} + +ENTRY %cluster_0 { + %arg0.1 = f32[5,131072]{1,0} parameter(0) + %sine = f32[5,131072]{1,0} exponential(f32[5,131072]{1,0} %arg0.1) + %constant.0 = f32[] constant(0) + ROOT %reduce.18 = f32[5]{0} reduce(f32[5,131072]{1,0} %sine, f32[] %constant.0), dimensions={1}, to_apply=%add_float +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + CompileAndOptionallyVerifyPtx(std::move(optimized_module), + R"( +CHECK: ld.global.nc.v2.f32 +)"); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +TEST_F(ReductionVectorizationTest, DisableSin) { + const char* hlo_text = R"( +HloModule DisableSin + +%add_float { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add.17 = f32[] add(f32[] %x, f32[] %y) +} + +ENTRY %cluster_0 { + %arg0.1 = f32[5,131072]{1,0} parameter(0) + %sine = f32[5,131072]{1,0} sine(f32[5,131072]{1,0} %arg0.1) + %constant.0 = f32[] constant(0) + ROOT %reduce.18 = f32[5]{0} reduce(f32[5,131072]{1,0} %sine, f32[] %constant.0), dimensions={1}, to_apply=%add_float +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + CompileAndOptionallyVerifyPtx(std::move(optimized_module), + R"( +CHECK-NOT: ld.global.nc.v2.f32 +CHECK-NOT: ld.global.nc.v4.f32 +CHECK-NOT: ld.global.nc.u64 +CHECK-NOT: ld.global.u64 +)"); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +} // namespace +} // namespace gpu +} // namespace xla From b23e1c4708f04543edf1cd2f5efaa0438ac5d789 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 26 Feb 2020 13:39:06 -0800 Subject: [PATCH 12/52] Better test --- .../gpu/tests/reduction_vectorization_test.cc | 81 ++++++++++++++----- 1 file changed, 62 insertions(+), 19 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc index 470696e003b..d811eeb52ae 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -51,16 +51,16 @@ TEST_F(ReductionVectorizationTest, Power2) { const char* hlo_text = R"( HloModule ReducePower2 -%max_ (x.5: f32[], y.6: f32[]) -> f32[] { +%max_ { %x = f32[] parameter(0) %y = f32[] parameter(1) ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y) } -ENTRY %cluster_0__XlaCompiledKernel_true__XlaNumConstantArgs_0__XlaNumResourceArgs_0_.25 (param_0: f32[5,131072]) -> f32[5]{0} { - %param_0 = f32[5,131072]{1,0} parameter(0), parameter_replication={false} +ENTRY %main { + %param_0 = f32[5,131072] parameter(0) %constant.3 = f32[] constant(0) - ROOT %reduce.8 = f32[5]{0} reduce(f32[5,131072]{1,0} %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ + ROOT %reduce.8 = f32[5] reduce(f32[5,131072] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ } )"; 
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, @@ -68,6 +68,9 @@ ENTRY %cluster_0__XlaCompiledKernel_true__XlaNumConstantArgs_0__XlaNumResourceAr CompileAndOptionallyVerifyPtx(std::move(optimized_module), R"( CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 )"); EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); @@ -77,23 +80,63 @@ TEST_F(ReductionVectorizationTest, TileFit) { const char* hlo_text = R"( HloModule ReduceTileFit -%max_ (x.5: f32[], y.6: f32[]) -> f32[] { +%max_ { %x = f32[] parameter(0) %y = f32[] parameter(1) ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y) } -ENTRY %cluster_0__XlaCompiledKernel_true__XlaNumConstantArgs_0__XlaNumResourceArgs_0_.25 (param_0: f32[5,122880]) -> f32[5]{0} { - %param_0 = f32[5,122880]{1,0} parameter(0), parameter_replication={false} +ENTRY %main { + %param_0 = f32[5,122880] parameter(0) %constant.3 = f32[] constant(0) - ROOT %reduce.8 = f32[5]{0} reduce(f32[5,122880]{1,0} %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ + ROOT %reduce.8 = f32[5] reduce(f32[5,122880] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ } )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, ParseAndReturnVerifiedModule(hlo_text)); CompileAndOptionallyVerifyPtx(std::move(optimized_module), R"( +// TODO: Make this a vectorized load +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +)"); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +TEST_F(ReductionVectorizationTest, EvenColums) { + const char* hlo_text = R"( +HloModule ReducePower2 + +%max_ { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y) +} + +ENTRY %main { + %param_0 = f32[5,131070] parameter(0) + %constant.3 = f32[] constant(0) + ROOT %reduce.8 = f32[5] reduce(f32[5,131070] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + CompileAndOptionallyVerifyPtx(std::move(optimized_module), + R"( +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +// TODO: Make this a vectorized load +CHECK-NOT: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK-NOT: ld.global.nc.v2.f32 +// TODO: find how to modify LLVM/opt to get this merged? vectorize before some loop opt? 
+CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 )"); EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); @@ -109,10 +152,10 @@ HloModule ReduceTileFit ROOT %maximum.7 = f32[] maximum(%x, %y) } -ENTRY %cluster_0__XlaCompiledKernel_true__XlaNumConstantArgs_0__XlaNumResourceArgs_0_.25 (param_0: f32[5,131071]) -> f32[5]{0} { - %param_0 = f32[5,131071]{1,0} parameter(0), parameter_replication={false} +ENTRY %main { + %param_0 = f32[5,131071] parameter(0) %constant.3 = f32[] constant(0) - ROOT %reduce.8 = f32[5]{0} reduce(f32[5,131071]{1,0} %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ + ROOT %reduce.8 = f32[5] reduce(f32[5,131071] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_ } )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, @@ -138,11 +181,11 @@ HloModule DisableSin ROOT %add.17 = f32[] add(f32[] %x, f32[] %y) } -ENTRY %cluster_0 { - %arg0.1 = f32[5,131072]{1,0} parameter(0) - %sine = f32[5,131072]{1,0} exponential(f32[5,131072]{1,0} %arg0.1) +ENTRY %main { + %arg0.1 = f32[5,131072] parameter(0) + %sine = f32[5,131072] exponential(f32[5,131072] %arg0.1) %constant.0 = f32[] constant(0) - ROOT %reduce.18 = f32[5]{0} reduce(f32[5,131072]{1,0} %sine, f32[] %constant.0), dimensions={1}, to_apply=%add_float + ROOT %reduce.18 = f32[5] reduce(f32[5,131072] %sine, f32[] %constant.0), dimensions={1}, to_apply=%add_float } )"; @@ -166,11 +209,11 @@ HloModule DisableSin ROOT %add.17 = f32[] add(f32[] %x, f32[] %y) } -ENTRY %cluster_0 { - %arg0.1 = f32[5,131072]{1,0} parameter(0) - %sine = f32[5,131072]{1,0} sine(f32[5,131072]{1,0} %arg0.1) +ENTRY %main { + %arg0.1 = f32[5,131072] parameter(0) + %sine = f32[5,131072] sine(f32[5,131072] %arg0.1) %constant.0 = f32[] constant(0) - ROOT %reduce.18 = f32[5]{0} reduce(f32[5,131072]{1,0} %sine, f32[] %constant.0), dimensions={1}, to_apply=%add_float + ROOT %reduce.18 = f32[5] reduce(f32[5,131072] %sine, f32[] %constant.0), dimensions={1}, to_apply=%add_float } )"; From 601ad82b6b620053dea3c260c73ad81c1eea4d32 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 24 Feb 2020 12:51:49 -0800 Subject: [PATCH 13/52] Rename a variable --- .../xla/service/gpu/ir_emitter_unnested.cc | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 3e8703134d2..6733183644f 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1985,7 +1985,7 @@ void IrEmitterUnnested::EmitTile( // // TODO(cheshire): Once ptxas is fixed and TF switches to it, remove the // workaround. - int vec_stride = mapping_scheme.GetVectorSize(); + int vector_size = mapping_scheme.GetVectorSize(); ksl->For( loop_name + "_y_in_tile", /*start=*/constant(0), @@ -1993,10 +1993,11 @@ void IrEmitterUnnested::EmitTile( ceil_of_ratio(b_.CreateSub(tile_height, thread_id_info.thread_id_y), num_threads_y), /*step=*/constant(1), [&](llvm::Value* y_indvar) { - printf("VEC_STRIDE %d\n", vec_stride); + printf("VEC_STRIDE %d\n", vector_size); llvm::Value* y_loc = b_.CreateAdd( thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); - if (vec_stride == 1) { + // TODO: can I get rid of this conditions? 
+ if (vector_size == 1) { for (int64 j = 0; j < x_num_steps; j++) { llvm::Value* x_loc = b_.CreateAdd(constant(j * step_x), start_offset_x, "x_loc"); @@ -2014,17 +2015,17 @@ void IrEmitterUnnested::EmitTile( } } } else { - auto unroll = [&](bool add_if, int64 max_element, int64 vec_stride) { - for (int64 j = 0; j < x_num_steps/vec_stride; j++) { + auto unroll = [&](bool add_if, int64 max_element, int64 vector_size) { + for (int64 j = 0; j < x_num_steps/vector_size; j++) { //Prep some values. If we do not do this, LLVM doesn't vectorize. llvm::Value* x_loc_base = - b_.CreateAdd(constant(j * step_x * vec_stride), start_offset_x, "x_loc_base"); + b_.CreateAdd(constant(j * step_x * vector_size), start_offset_x, "x_loc_base"); IrArray::Index source_idx_x_base = source_idx.AddOffsetToDim(y_loc, kDimY, &b_) - .AddOffsetToDim(constant(j * step_x * vec_stride), kDimX, &b_); + .AddOffsetToDim(constant(j * step_x * vector_size), kDimX, &b_); - for (int i = 0; i < vec_stride; i++) { - int old_j = j * vec_stride + i; + for (int i = 0; i < vector_size; i++) { + int old_j = j * vector_size + i; llvm::Value* x_loc = b_.CreateAdd(constant(i), x_loc_base, "x_loc"); IrArray::Index source_idx_x = source_idx_x_base.AddOffsetToDim( constant(i), kDimX, &b_); @@ -2049,7 +2050,7 @@ void IrEmitterUnnested::EmitTile( } if (red_if == 1 || x_tile_fits) { std::cout << "IF_NB 1: " << std::endl; - unroll(!x_tile_fits, x_num_steps, vec_stride); + unroll(!x_tile_fits, x_num_steps, vector_size); } else { std::cout << "IF_NB 2" << std::endl; ksl->If(loop_name + "_is_full_tile", @@ -2057,8 +2058,8 @@ void IrEmitterUnnested::EmitTile( // tile_width is always exact. For the last block, // it will be the exact number of elements left. b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeFor(2)), tile_width), - [&] {unroll(false, x_num_steps, vec_stride);}, - [&] {unroll(true, x_num_steps, vec_stride);}); + [&] {unroll(false, x_num_steps, vector_size);}, + [&] {unroll(true, x_num_steps, vector_size);}); } }}); } From 3b8c3030b95463b60caf2b9686fa456f14d326b0 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 24 Feb 2020 12:58:41 -0800 Subject: [PATCH 14/52] Code simplification --- .../xla/service/gpu/ir_emitter_unnested.cc | 29 +++++-------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 6733183644f..310dd1e266e 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1996,25 +1996,7 @@ void IrEmitterUnnested::EmitTile( printf("VEC_STRIDE %d\n", vector_size); llvm::Value* y_loc = b_.CreateAdd( thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); - // TODO: can I get rid of this conditions? - if (vector_size == 1) { - for (int64 j = 0; j < x_num_steps; j++) { - llvm::Value* x_loc = - b_.CreateAdd(constant(j * step_x), start_offset_x, "x_loc"); - IrArray::Index source_idx_x = - source_idx.AddOffsetToDim(y_loc, kDimY, &b_) - .AddOffsetToDim(constant(j * step_x), kDimX, &b_); - auto emit_element = [&] { - return emit_elem_function(source_idx_x, y_loc, x_loc, j); - }; - if (!x_tile_fits) { - ksl->If(loop_name + "_x_in_tile", - b_.CreateICmpULT(x_loc, tile_width), emit_element); - } else { - emit_element(); - } - } - } else { + { auto unroll = [&](bool add_if, int64 max_element, int64 vector_size) { for (int64 j = 0; j < x_num_steps/vector_size; j++) { //Prep some values. 
If we do not do this, LLVM doesn't vectorize. @@ -2026,9 +2008,12 @@ void IrEmitterUnnested::EmitTile( for (int i = 0; i < vector_size; i++) { int old_j = j * vector_size + i; - llvm::Value* x_loc = b_.CreateAdd(constant(i), x_loc_base, "x_loc"); - IrArray::Index source_idx_x = source_idx_x_base.AddOffsetToDim( - constant(i), kDimX, &b_); + llvm::Value* x_loc = x_loc_base; + IrArray::Index source_idx_x = source_idx_x_base; + if (i > 0) { + x_loc = b_.CreateAdd(constant(i), x_loc_base, "x_loc"); + source_idx_x = source_idx_x_base.AddOffsetToDim(constant(i), kDimX, &b_); + } auto emit_element = [&] { return emit_elem_function(source_idx_x, y_loc, x_loc, j); }; From 407ed60524ab2e750158fee3e3e7728ab319d0b9 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 24 Feb 2020 12:59:25 -0800 Subject: [PATCH 15/52] Remove useless indentation --- .../xla/service/gpu/ir_emitter_unnested.cc | 51 +++++++++---------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 310dd1e266e..a1cf8f248f9 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1996,15 +1996,14 @@ void IrEmitterUnnested::EmitTile( printf("VEC_STRIDE %d\n", vector_size); llvm::Value* y_loc = b_.CreateAdd( thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); - { - auto unroll = [&](bool add_if, int64 max_element, int64 vector_size) { + auto unroll = [&](bool add_if, int64 max_element, int64 vector_size) { for (int64 j = 0; j < x_num_steps/vector_size; j++) { //Prep some values. If we do not do this, LLVM doesn't vectorize. llvm::Value* x_loc_base = - b_.CreateAdd(constant(j * step_x * vector_size), start_offset_x, "x_loc_base"); + b_.CreateAdd(constant(j * step_x * vector_size), start_offset_x, "x_loc_base"); IrArray::Index source_idx_x_base = - source_idx.AddOffsetToDim(y_loc, kDimY, &b_) - .AddOffsetToDim(constant(j * step_x * vector_size), kDimX, &b_); + source_idx.AddOffsetToDim(y_loc, kDimY, &b_) + .AddOffsetToDim(constant(j * step_x * vector_size), kDimX, &b_); for (int i = 0; i < vector_size; i++) { int old_j = j * vector_size + i; @@ -2025,28 +2024,28 @@ void IrEmitterUnnested::EmitTile( } } } - }; + }; - char * env_if = getenv("RED_IF"); - int red_if = 0; - if (env_if) { - red_if = atoi(env_if); - printf("RED_IF2 = %d %s\n", red_if, env_if); - } - if (red_if == 1 || x_tile_fits) { - std::cout << "IF_NB 1: " << std::endl; - unroll(!x_tile_fits, x_num_steps, vector_size); - } else { - std::cout << "IF_NB 2" << std::endl; - ksl->If(loop_name + "_is_full_tile", - // if (block fully fit) {fast path} else {slow path} - // tile_width is always exact. For the last block, - // it will be the exact number of elements left. - b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeFor(2)), tile_width), - [&] {unroll(false, x_num_steps, vector_size);}, - [&] {unroll(true, x_num_steps, vector_size);}); - } - }}); + char * env_if = getenv("RED_IF"); + int red_if = 0; + if (env_if) { + red_if = atoi(env_if); + printf("RED_IF2 = %d %s\n", red_if, env_if); + } + if (red_if == 1 || x_tile_fits) { + std::cout << "IF_NB 1: " << std::endl; + unroll(!x_tile_fits, x_num_steps, vector_size); + } else { + std::cout << "IF_NB 2" << std::endl; + ksl->If(loop_name + "_is_full_tile", + // if (block fully fit) {fast path} else {slow path} + // tile_width is always exact. 
For the last block, + // it will be the exact number of elements left. + b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeFor(2)), tile_width), + [&] {unroll(false, x_num_steps, vector_size);}, + [&] {unroll(true, x_num_steps, vector_size);}); + } + }); } // Emits code to process a tensor element in a tile for the given kCopy HLO that From 5e84cdf212ed55ea366bb7de5f0d945a1931d0f2 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Tue, 25 Feb 2020 10:22:39 -0800 Subject: [PATCH 16/52] Remove print --- .../xla/service/gpu/ir_emitter_unnested.cc | 34 ++++++++----------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index a1cf8f248f9..c5b4269cdb4 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1993,7 +1993,6 @@ void IrEmitterUnnested::EmitTile( ceil_of_ratio(b_.CreateSub(tile_height, thread_id_info.thread_id_y), num_threads_y), /*step=*/constant(1), [&](llvm::Value* y_indvar) { - printf("VEC_STRIDE %d\n", vector_size); llvm::Value* y_loc = b_.CreateAdd( thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); auto unroll = [&](bool add_if, int64 max_element, int64 vector_size) { @@ -2026,17 +2025,13 @@ void IrEmitterUnnested::EmitTile( } }; - char * env_if = getenv("RED_IF"); - int red_if = 0; - if (env_if) { - red_if = atoi(env_if); - printf("RED_IF2 = %d %s\n", red_if, env_if); - } - if (red_if == 1 || x_tile_fits) { - std::cout << "IF_NB 1: " << std::endl; - unroll(!x_tile_fits, x_num_steps, vector_size); - } else { - std::cout << "IF_NB 2" << std::endl; + if (!x_tile_fits && + mapping_scheme.GetIndexingOrder() == + KernelMappingScheme::LinearDilatedIndexingX) { + // Only try this path when we try to vectorize the loads. + + // Special case when the tile doesn't fit completly for even row size. + // For odd row size every other row isn't aligned, so can't be vectorized. ksl->If(loop_name + "_is_full_tile", // if (block fully fit) {fast path} else {slow path} // tile_width is always exact. For the last block, @@ -2044,7 +2039,10 @@ void IrEmitterUnnested::EmitTile( b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeFor(2)), tile_width), [&] {unroll(false, x_num_steps, vector_size);}, [&] {unroll(true, x_num_steps, vector_size);}); + } else { + unroll(!x_tile_fits, x_num_steps, vector_size); } + }); } @@ -3161,7 +3159,10 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( reduction_dimensions.dimensions[2]); KernelMappingScheme::IndexingOrder indexing_order; - if (reduction_dimensions.is_row_reduction) { + if (reduction_dimensions.is_row_reduction && + // Only try to vectorize+coales memory access for row of even size. + // For odd row size, every other row isn't aligned, so can't be vectorized. 
+ reduction_dimensions.dimensions[2] % 2 == 0) { indexing_order = KernelMappingScheme::LinearDilatedIndexingX; } else if (IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape, reduction_dimensions.dimensions[2])) { @@ -3191,7 +3192,6 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( int tile_size_x = reduction_tiling[2] * num_threads_x; int vector_size = 1; - char* env = getenv("VECTOR_SIZE"); if (indexing_order == KernelMappingScheme::LinearDilatedIndexingX) { if (reduction_dimensions.dimensions[2] % 2 == 0 && // As XLA unroll and suppose LLVM will vectorize, @@ -3202,12 +3202,6 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( indexing_order = KernelMappingScheme::DilatedIndexingX; } } - if (env) { - vector_size = atoi(env); - if (vector_size > 1) { - indexing_order = KernelMappingScheme::LinearDilatedIndexingX; - } - } KernelMappingScheme mapping_scheme( reduction_dimensions.dimensions, {reduction_tiling[0], reduction_tiling[1] * num_threads_y, tile_size_x}, From f0fa15e9466f747bda563e7cd85143707d86e628 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 26 Feb 2020 06:34:27 -0800 Subject: [PATCH 17/52] Fix new latent bug. --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index c5b4269cdb4..be0e605e76d 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2013,7 +2013,7 @@ void IrEmitterUnnested::EmitTile( source_idx_x = source_idx_x_base.AddOffsetToDim(constant(i), kDimX, &b_); } auto emit_element = [&] { - return emit_elem_function(source_idx_x, y_loc, x_loc, j); + return emit_elem_function(source_idx_x, y_loc, x_loc, old_j); }; if (add_if) { ksl->If(loop_name + "_x_in_tile", From 2aac34b5171b304df84147abbb54dd17dbd35ce6 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 2 Mar 2020 14:18:11 -0800 Subject: [PATCH 18/52] NFC: clang-format --- .../xla/service/gpu/ir_emitter_unnested.cc | 77 +++++++++++-------- .../xla/service/gpu/kernel_mapping_scheme.h | 16 ++-- .../gpu/tests/reduction_vectorization_test.cc | 1 - 3 files changed, 53 insertions(+), 41 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index be0e605e76d..bc9a01af7c2 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1860,7 +1860,8 @@ namespace { // Returns true if the fusion contains any instruction that is likely // translated to complex LLVM IR, such as loops, and prevent vectorization. 
-bool MayPreventVectorization(const HloInstruction& hlo, const bool tolerate_reduce=false) { +bool MayPreventVectorization(const HloInstruction& hlo, + const bool tolerate_reduce = false) { if (hlo.opcode() == HloOpcode::kFusion) { return absl::c_any_of(hlo.fused_instructions_computation()->instructions(), [tolerate_reduce](const HloInstruction* instr) { @@ -1923,12 +1924,14 @@ static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, llvm::Value* thread_id_x, llvm::Type* index_ty, llvm::IRBuilder<>* b) { - if (mapping_scheme.GetIndexingOrder() == KernelMappingScheme::DilatedIndexingX) { + if (mapping_scheme.GetIndexingOrder() == + KernelMappingScheme::DilatedIndexingX) { return thread_id_x; } else if (mapping_scheme.GetIndexingOrder() == KernelMappingScheme::LinearDilatedIndexingX) { int vector_size = mapping_scheme.GetVectorSize(); - return b->CreateMul(thread_id_x, llvm::ConstantInt::get(index_ty, vector_size)); + return b->CreateMul(thread_id_x, + llvm::ConstantInt::get(index_ty, vector_size)); } int64 x_num_steps = mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX(); @@ -1958,7 +1961,10 @@ void IrEmitterUnnested::EmitTile( // of threads. // Otherwise, the stride is one, but we multiply each offset by the limit of // number of steps which can be made. - int64 step_x = mapping_scheme.GetIndexingOrder() == KernelMappingScheme::LinearIndexingX? 1 : num_threads_x; + int64 step_x = + mapping_scheme.GetIndexingOrder() == KernelMappingScheme::LinearIndexingX + ? 1 + : num_threads_x; IrArray::Index source_idx = tile_origin_index.AddOffsetToDim(start_offset_x, kDimX, &b_); @@ -1996,13 +2002,15 @@ void IrEmitterUnnested::EmitTile( llvm::Value* y_loc = b_.CreateAdd( thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); auto unroll = [&](bool add_if, int64 max_element, int64 vector_size) { - for (int64 j = 0; j < x_num_steps/vector_size; j++) { - //Prep some values. If we do not do this, LLVM doesn't vectorize. + for (int64 j = 0; j < x_num_steps / vector_size; j++) { + // Prep some values. If we do not do this, LLVM doesn't vectorize. llvm::Value* x_loc_base = - b_.CreateAdd(constant(j * step_x * vector_size), start_offset_x, "x_loc_base"); + b_.CreateAdd(constant(j * step_x * vector_size), start_offset_x, + "x_loc_base"); IrArray::Index source_idx_x_base = - source_idx.AddOffsetToDim(y_loc, kDimY, &b_) - .AddOffsetToDim(constant(j * step_x * vector_size), kDimX, &b_); + source_idx.AddOffsetToDim(y_loc, kDimY, &b_) + .AddOffsetToDim(constant(j * step_x * vector_size), kDimX, + &b_); for (int i = 0; i < vector_size; i++) { int old_j = j * vector_size + i; @@ -2010,7 +2018,8 @@ void IrEmitterUnnested::EmitTile( IrArray::Index source_idx_x = source_idx_x_base; if (i > 0) { x_loc = b_.CreateAdd(constant(i), x_loc_base, "x_loc"); - source_idx_x = source_idx_x_base.AddOffsetToDim(constant(i), kDimX, &b_); + source_idx_x = + source_idx_x_base.AddOffsetToDim(constant(i), kDimX, &b_); } auto emit_element = [&] { return emit_elem_function(source_idx_x, y_loc, x_loc, old_j); @@ -2025,24 +2034,24 @@ void IrEmitterUnnested::EmitTile( } }; - if (!x_tile_fits && - mapping_scheme.GetIndexingOrder() == - KernelMappingScheme::LinearDilatedIndexingX) { + if (!x_tile_fits && mapping_scheme.GetIndexingOrder() == + KernelMappingScheme::LinearDilatedIndexingX) { // Only try this path when we try to vectorize the loads. // Special case when the tile doesn't fit completly for even row size. - // For odd row size every other row isn't aligned, so can't be vectorized. 
+ // For odd row size every other row isn't aligned, so can't be + // vectorized. ksl->If(loop_name + "_is_full_tile", // if (block fully fit) {fast path} else {slow path} // tile_width is always exact. For the last block, // it will be the exact number of elements left. - b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeFor(2)), tile_width), - [&] {unroll(false, x_num_steps, vector_size);}, - [&] {unroll(true, x_num_steps, vector_size);}); + b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeFor(2)), + tile_width), + [&] { unroll(false, x_num_steps, vector_size); }, + [&] { unroll(true, x_num_steps, vector_size); }); } else { unroll(!x_tile_fits, x_num_steps, vector_size); } - }); } @@ -2126,8 +2135,10 @@ static int GetNumberOfPartialResults( if (reduction_info.IsRowReduction()) { return 1; } - int64 num_partial_results = mapping_scheme.GetIndexingOrder() == - KernelMappingScheme::DilatedIndexingX ? 1 : 2; + int64 num_partial_results = + mapping_scheme.GetIndexingOrder() == KernelMappingScheme::DilatedIndexingX + ? 1 + : 2; CHECK_EQ(num_partial_results, (mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX())); return num_partial_results; @@ -2707,12 +2718,13 @@ void IrEmitterUnnested::EmitHlo021Tile( absl::Span reduced_output_dims, absl::Span tiled_param_ids) { constexpr int kNumRows = 4; - KernelMappingScheme mapping_scheme(reduced_output_dims, - /*tile_sizes=*/{1, kWarpSize, kWarpSize}, - /*num_threads_y=*/kNumRows, - /*num_threads_x=*/kWarpSize, - /*indexing_order=*/KernelMappingScheme::LinearIndexingX, - /*vector_size*/1); + KernelMappingScheme mapping_scheme( + reduced_output_dims, + /*tile_sizes=*/{1, kWarpSize, kWarpSize}, + /*num_threads_y=*/kNumRows, + /*num_threads_x=*/kWarpSize, + /*indexing_order=*/KernelMappingScheme::LinearIndexingX, + /*vector_size*/ 1); LaunchDimensions launch_dimensions(mapping_scheme.GetNumberOfBlocks(), mapping_scheme.GetThreadsPerBlock()); llvm::Type* index_type = @@ -3161,11 +3173,13 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( KernelMappingScheme::IndexingOrder indexing_order; if (reduction_dimensions.is_row_reduction && // Only try to vectorize+coales memory access for row of even size. - // For odd row size, every other row isn't aligned, so can't be vectorized. + // For odd row size, every other row isn't aligned, so can't be + // vectorized. reduction_dimensions.dimensions[2] % 2 == 0) { indexing_order = KernelMappingScheme::LinearDilatedIndexingX; - } else if (IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape, - reduction_dimensions.dimensions[2])) { + } else if (IsUnrollingColumnReductionBeneficial( + unnested_hlo, input_shape, + reduction_dimensions.dimensions[2])) { indexing_order = KernelMappingScheme::LinearIndexingX; } else { indexing_order = KernelMappingScheme::DilatedIndexingX; @@ -3196,7 +3210,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( if (reduction_dimensions.dimensions[2] % 2 == 0 && // As XLA unroll and suppose LLVM will vectorize, // disable the unroll for case that LLVM doesn't vectorize. 
- !MayPreventVectorization(*unnested_hlo, /*tolerate_reduce*/true)) { + !MayPreventVectorization(*unnested_hlo, /*tolerate_reduce*/ true)) { vector_size = 2; } else { indexing_order = KernelMappingScheme::DilatedIndexingX; @@ -3205,8 +3219,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( KernelMappingScheme mapping_scheme( reduction_dimensions.dimensions, {reduction_tiling[0], reduction_tiling[1] * num_threads_y, tile_size_x}, - num_threads_y, num_threads_x, indexing_order, - vector_size); + num_threads_y, num_threads_x, indexing_order, vector_size); return ReductionCodegenInfo(mapping_scheme, reduction_dimensions.is_row_reduction); } diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index c8f2864c3e9..d90d75a67c5 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -77,7 +77,11 @@ class KernelMappingScheme { public: enum { DimZ = 0, DimY, DimX, DimTot }; // TODO: rename Dilated to Strided? - enum IndexingOrder { LinearIndexingX, DilatedIndexingX, LinearDilatedIndexingX }; + enum IndexingOrder { + LinearIndexingX, + DilatedIndexingX, + LinearDilatedIndexingX + }; KernelMappingScheme(absl::Span dims_in_elems, absl::Span tile_sizes, int64 num_threads_y, @@ -88,7 +92,7 @@ class KernelMappingScheme { num_threads_x_(num_threads_x), num_threads_y_(num_threads_y), indexing_order_(indexing_order), - vector_size_(vector_size){ + vector_size_(vector_size) { CHECK_EQ(tile_sizes[1] % num_threads_y_, 0); CHECK_EQ(tile_sizes[2] % num_threads_x_, 0); VLOG(10) << "dims_in_elems_ = " << absl::StrJoin(dims_in_elems_, ","); @@ -124,12 +128,8 @@ class KernelMappingScheme { return GetNumThreadsX() * GetNumThreadsY(); } - IndexingOrder GetIndexingOrder() const { - return indexing_order_; - } - int GetVectorSize() const { - return vector_size_; - } + IndexingOrder GetIndexingOrder() const { return indexing_order_; } + int GetVectorSize() const { return vector_size_; } private: // The number of elements in each dimension. diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc index d811eeb52ae..cd3fac63fb8 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -33,7 +33,6 @@ namespace gpu { namespace { class ReductionVectorizationTest : public GpuCodegenTest { - protected: void EnsureDeterminism(absl::string_view hlo_text) { std::vector profiles; From c9d2dea5a84b99ee6e61fc9b6c0b3100bf15b9a1 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Tue, 3 Mar 2020 07:08:12 -0800 Subject: [PATCH 19/52] Better comment --- .../xla/service/gpu/kernel_mapping_scheme.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index d90d75a67c5..e07efb21e9a 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -77,6 +77,12 @@ class KernelMappingScheme { public: enum { DimZ = 0, DimY, DimX, DimTot }; // TODO: rename Dilated to Strided? + // LinearIndexing mean each thread reads consecutive elements. + // DilatedIndexingX mean each thread reads Dilated(Strided) + // elements. 
This conserve memory coalescing. + // LinearDilatedIndexingX mean each thread read a few consecutive + // elements then take a bigger step. The goal is to trigger + // vectorized reads and keep memory coalescing. enum IndexingOrder { LinearIndexingX, DilatedIndexingX, @@ -146,14 +152,8 @@ class KernelMappingScheme { // When num_threads_x threads process a total of tile_size_x // elements in the X dimension of a tile, each threads process - // n=tile_size_x/num_threads_x elements. When indexing_order_ have - // the value LinearIndexingX, the n elements processed by a thread - // are contiguous. When the value is DilatedIndexingX, the n - // elements are dilated by a factor of num_threads_x. When the value - // is LinearDilatedIndexingX, it is a mix of both to enable - // vectorizing while keeping memory coalescing. It reads - // vector_size_ elements, then do a step to the next vector_size_ - // elements. + // n=tile_size_x/num_threads_x elements. + // indexing_order_ define which tile's elements each thread reads. const IndexingOrder indexing_order_; // vector_size_ only supported for row reduction. // Must be a divisor of tile_sizes_[2]/num_threads_x. From dda66aabfc3ad0a41a840b86f3b36cab61fa02c9 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 4 Mar 2020 10:25:42 -0800 Subject: [PATCH 20/52] Use constant to make the code clearer. --- .../xla/service/gpu/ir_emitter_unnested.cc | 48 +++++++++---------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index bc9a01af7c2..11560ca554b 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -106,6 +106,11 @@ const auto kDimY = KernelMappingScheme::DimY; const auto kDimZ = KernelMappingScheme::DimZ; const auto kDimTot = KernelMappingScheme::DimTot; +const auto kLinearIndexingX = KernelMappingScheme::LinearIndexingX; +const auto kDilatedIndexingX = KernelMappingScheme::DilatedIndexingX; +const auto kLinearDilatedIndexingX = + KernelMappingScheme::LinearDilatedIndexingX; + // If a dimensions is smaller than this, untiled transposition may be more // efficient. const int64 kMinDimensionToTransposeTiled = 16; @@ -1924,11 +1929,9 @@ static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, llvm::Value* thread_id_x, llvm::Type* index_ty, llvm::IRBuilder<>* b) { - if (mapping_scheme.GetIndexingOrder() == - KernelMappingScheme::DilatedIndexingX) { + if (mapping_scheme.GetIndexingOrder() == kDilatedIndexingX) { return thread_id_x; - } else if (mapping_scheme.GetIndexingOrder() == - KernelMappingScheme::LinearDilatedIndexingX) { + } else if (mapping_scheme.GetIndexingOrder() == kLinearDilatedIndexingX) { int vector_size = mapping_scheme.GetVectorSize(); return b->CreateMul(thread_id_x, llvm::ConstantInt::get(index_ty, vector_size)); @@ -1962,9 +1965,7 @@ void IrEmitterUnnested::EmitTile( // Otherwise, the stride is one, but we multiply each offset by the limit of // number of steps which can be made. int64 step_x = - mapping_scheme.GetIndexingOrder() == KernelMappingScheme::LinearIndexingX - ? 1 - : num_threads_x; + mapping_scheme.GetIndexingOrder() == kLinearIndexingX ? 
1 : num_threads_x; IrArray::Index source_idx = tile_origin_index.AddOffsetToDim(start_offset_x, kDimX, &b_); @@ -2034,8 +2035,8 @@ void IrEmitterUnnested::EmitTile( } }; - if (!x_tile_fits && mapping_scheme.GetIndexingOrder() == - KernelMappingScheme::LinearDilatedIndexingX) { + if (!x_tile_fits && + mapping_scheme.GetIndexingOrder() == kLinearDilatedIndexingX) { // Only try this path when we try to vectorize the loads. // Special case when the tile doesn't fit completly for even row size. @@ -2136,9 +2137,7 @@ static int GetNumberOfPartialResults( return 1; } int64 num_partial_results = - mapping_scheme.GetIndexingOrder() == KernelMappingScheme::DilatedIndexingX - ? 1 - : 2; + mapping_scheme.GetIndexingOrder() == kDilatedIndexingX ? 1 : 2; CHECK_EQ(num_partial_results, (mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX())); return num_partial_results; @@ -2718,13 +2717,12 @@ void IrEmitterUnnested::EmitHlo021Tile( absl::Span reduced_output_dims, absl::Span tiled_param_ids) { constexpr int kNumRows = 4; - KernelMappingScheme mapping_scheme( - reduced_output_dims, - /*tile_sizes=*/{1, kWarpSize, kWarpSize}, - /*num_threads_y=*/kNumRows, - /*num_threads_x=*/kWarpSize, - /*indexing_order=*/KernelMappingScheme::LinearIndexingX, - /*vector_size*/ 1); + KernelMappingScheme mapping_scheme(reduced_output_dims, + /*tile_sizes=*/{1, kWarpSize, kWarpSize}, + /*num_threads_y=*/kNumRows, + /*num_threads_x=*/kWarpSize, + /*indexing_order=*/kLinearIndexingX, + /*vector_size*/ 1); LaunchDimensions launch_dimensions(mapping_scheme.GetNumberOfBlocks(), mapping_scheme.GetThreadsPerBlock()); llvm::Type* index_type = @@ -3176,16 +3174,16 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( // For odd row size, every other row isn't aligned, so can't be // vectorized. reduction_dimensions.dimensions[2] % 2 == 0) { - indexing_order = KernelMappingScheme::LinearDilatedIndexingX; + indexing_order = kLinearDilatedIndexingX; } else if (IsUnrollingColumnReductionBeneficial( unnested_hlo, input_shape, reduction_dimensions.dimensions[2])) { - indexing_order = KernelMappingScheme::LinearIndexingX; + indexing_order = kLinearIndexingX; } else { - indexing_order = KernelMappingScheme::DilatedIndexingX; + indexing_order = kDilatedIndexingX; } - if (indexing_order == KernelMappingScheme::LinearIndexingX && + if (indexing_order == kLinearIndexingX && !reduction_dimensions.is_row_reduction) { // Vectorized loads: a single thread reduces two adjacent columns. reduction_tiling[2] *= 2; @@ -3206,14 +3204,14 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( int tile_size_x = reduction_tiling[2] * num_threads_x; int vector_size = 1; - if (indexing_order == KernelMappingScheme::LinearDilatedIndexingX) { + if (indexing_order == kLinearDilatedIndexingX) { if (reduction_dimensions.dimensions[2] % 2 == 0 && // As XLA unroll and suppose LLVM will vectorize, // disable the unroll for case that LLVM doesn't vectorize. !MayPreventVectorization(*unnested_hlo, /*tolerate_reduce*/ true)) { vector_size = 2; } else { - indexing_order = KernelMappingScheme::DilatedIndexingX; + indexing_order = kDilatedIndexingX; } } KernelMappingScheme mapping_scheme( From 24ad7db0407ceec0bd31c93484db965add1e03ac Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 4 Mar 2020 10:51:55 -0800 Subject: [PATCH 21/52] Small NFC from review. 
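For intuition, the control flow EmitTile emits for the kLinearStridedIndexingX path can be pictured with the plain C++ sketch below. It is illustrative only: TileBodySketch and its scalar parameters stand in for the LLVM IR values the real emitter works with, and the commented call marks where emit_elem_function would be invoked.

#include <cstdint>

// Mirrors the emitted logic: skip per-element bounds checks when the tile
// is known to be full, otherwise guard every element.
void TileBodySketch(int64_t tile_width, int64_t tile_size_x,
                    int64_t x_num_steps, int64_t step_x, int64_t vector_size,
                    int64_t start_offset_x) {
  auto visit = [&](bool check_x_tile_bounds) {
    for (int64_t j = 0; j < x_num_steps / vector_size; ++j) {
      for (int64_t i = 0; i < vector_size; ++i) {
        int64_t x_loc = start_offset_x + j * step_x * vector_size + i;
        if (!check_x_tile_bounds || x_loc < tile_width) {
          (void)x_loc;  // emit_elem_function(source_idx_x, y_loc, x_loc,
                        //                    j * vector_size + i);
        }
      }
    }
  };
  if (tile_width == tile_size_x) {
    visit(/*check_x_tile_bounds=*/false);  // full tile: branch-free fast path
  } else {
    visit(/*check_x_tile_bounds=*/true);   // partial tile: guard each element
  }
}

The point of the outer branch is that the full-tile body is straight-line code with no data-dependent bounds checks, which is the shape LLVM can turn into vectorized loads.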
--- .../xla/service/gpu/ir_emitter_unnested.cc | 16 ++++++++-------- .../xla/service/gpu/kernel_mapping_scheme.h | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 11560ca554b..6806f235d28 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1866,10 +1866,10 @@ namespace { // Returns true if the fusion contains any instruction that is likely // translated to complex LLVM IR, such as loops, and prevent vectorization. bool MayPreventVectorization(const HloInstruction& hlo, - const bool tolerate_reduce = false) { + bool tolerate_reduce = false) { if (hlo.opcode() == HloOpcode::kFusion) { return absl::c_any_of(hlo.fused_instructions_computation()->instructions(), - [tolerate_reduce](const HloInstruction* instr) { + [&](const HloInstruction* instr) { switch (instr->opcode()) { case HloOpcode::kReduce: return !tolerate_reduce; @@ -2002,7 +2002,7 @@ void IrEmitterUnnested::EmitTile( /*step=*/constant(1), [&](llvm::Value* y_indvar) { llvm::Value* y_loc = b_.CreateAdd( thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); - auto unroll = [&](bool add_if, int64 max_element, int64 vector_size) { + auto unroll = [&](bool add_if, int64 vector_size) { for (int64 j = 0; j < x_num_steps / vector_size; j++) { // Prep some values. If we do not do this, LLVM doesn't vectorize. llvm::Value* x_loc_base = @@ -2046,12 +2046,12 @@ void IrEmitterUnnested::EmitTile( // if (block fully fit) {fast path} else {slow path} // tile_width is always exact. For the last block, // it will be the exact number of elements left. - b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeFor(2)), + b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), tile_width), - [&] { unroll(false, x_num_steps, vector_size); }, - [&] { unroll(true, x_num_steps, vector_size); }); + [&] { unroll(/*add_if=*/false, vector_size); }, + [&] { unroll(/*add_if=*/true, vector_size); }); } else { - unroll(!x_tile_fits, x_num_steps, vector_size); + unroll(/*add_if=*/!x_tile_fits, vector_size); } }); } @@ -2722,7 +2722,7 @@ void IrEmitterUnnested::EmitHlo021Tile( /*num_threads_y=*/kNumRows, /*num_threads_x=*/kWarpSize, /*indexing_order=*/kLinearIndexingX, - /*vector_size*/ 1); + /*vector_size=*/1); LaunchDimensions launch_dimensions(mapping_scheme.GetNumberOfBlocks(), mapping_scheme.GetThreadsPerBlock()); llvm::Type* index_type = diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index e07efb21e9a..1bea776275a 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -158,7 +158,7 @@ class KernelMappingScheme { // vector_size_ only supported for row reduction. // Must be a divisor of tile_sizes_[2]/num_threads_x. // Interesting values are 2 and 4 to trigger vectorized loads on GPUs - // While keeping memory coalescing. + // while keeping memory coalescing. const int vector_size_; }; From d40cb300200bb70e070192c378d2e1511a97b6f4 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 4 Mar 2020 11:14:30 -0800 Subject: [PATCH 22/52] Use strided instead of dilated. This is more know in our community working on the software. 
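In terms of memory access, the renamed orders differ only in which x offsets a thread visits. The standalone sketch below is not an XLA function; it assumes an example configuration of num_threads_x = 32, tile_size_x = 128 (so x_num_steps = 4) and vector_size = 2, and prints the offsets one thread reads under each order.

#include <cstdio>

// LinearIndexingX:        thread t reads x_num_steps consecutive elements.
// StridedIndexingX:       thread t reads elements strided by num_threads_x.
// LinearStridedIndexingX: thread t reads vector_size consecutive elements,
//                         then strides by num_threads_x * vector_size.
void PrintThreadOffsets(int t) {
  const int num_threads_x = 32, tile_size_x = 128, vector_size = 2;
  const int x_num_steps = tile_size_x / num_threads_x;  // 4
  for (int j = 0; j < x_num_steps; ++j)
    std::printf("%d ", t * x_num_steps + j);
  std::printf("(linear)\n");
  for (int j = 0; j < x_num_steps; ++j)
    std::printf("%d ", t + j * num_threads_x);
  std::printf("(strided)\n");
  for (int j = 0; j < x_num_steps / vector_size; ++j)
    for (int i = 0; i < vector_size; ++i)
      std::printf("%d ", t * vector_size + j * num_threads_x * vector_size + i);
  std::printf("(linear strided)\n");
}

int main() { PrintThreadOffsets(1); }

For t = 1 this prints 4 5 6 7 (linear), 1 33 65 97 (strided) and 2 3 66 67 (linear strided); the last pattern keeps adjacent threads coalesced while still giving each thread pairs of consecutive elements that LLVM can load as a vector.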
--- .../xla/service/gpu/ir_emitter_unnested.cc | 22 +++++++++---------- .../xla/service/gpu/kernel_mapping_scheme.h | 13 +++++------ 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 6806f235d28..4cb26e03867 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -107,9 +107,9 @@ const auto kDimZ = KernelMappingScheme::DimZ; const auto kDimTot = KernelMappingScheme::DimTot; const auto kLinearIndexingX = KernelMappingScheme::LinearIndexingX; -const auto kDilatedIndexingX = KernelMappingScheme::DilatedIndexingX; -const auto kLinearDilatedIndexingX = - KernelMappingScheme::LinearDilatedIndexingX; +const auto kStridedIndexingX = KernelMappingScheme::StridedIndexingX; +const auto kLinearStridedIndexingX = + KernelMappingScheme::LinearStridedIndexingX; // If a dimensions is smaller than this, untiled transposition may be more // efficient. @@ -1929,9 +1929,9 @@ static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, llvm::Value* thread_id_x, llvm::Type* index_ty, llvm::IRBuilder<>* b) { - if (mapping_scheme.GetIndexingOrder() == kDilatedIndexingX) { + if (mapping_scheme.GetIndexingOrder() == kStridedIndexingX) { return thread_id_x; - } else if (mapping_scheme.GetIndexingOrder() == kLinearDilatedIndexingX) { + } else if (mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { int vector_size = mapping_scheme.GetVectorSize(); return b->CreateMul(thread_id_x, llvm::ConstantInt::get(index_ty, vector_size)); @@ -2036,7 +2036,7 @@ void IrEmitterUnnested::EmitTile( }; if (!x_tile_fits && - mapping_scheme.GetIndexingOrder() == kLinearDilatedIndexingX) { + mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { // Only try this path when we try to vectorize the loads. // Special case when the tile doesn't fit completly for even row size. @@ -2137,7 +2137,7 @@ static int GetNumberOfPartialResults( return 1; } int64 num_partial_results = - mapping_scheme.GetIndexingOrder() == kDilatedIndexingX ? 1 : 2; + mapping_scheme.GetIndexingOrder() == kStridedIndexingX ? 1 : 2; CHECK_EQ(num_partial_results, (mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX())); return num_partial_results; @@ -3174,13 +3174,13 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( // For odd row size, every other row isn't aligned, so can't be // vectorized. reduction_dimensions.dimensions[2] % 2 == 0) { - indexing_order = kLinearDilatedIndexingX; + indexing_order = kLinearStridedIndexingX; } else if (IsUnrollingColumnReductionBeneficial( unnested_hlo, input_shape, reduction_dimensions.dimensions[2])) { indexing_order = kLinearIndexingX; } else { - indexing_order = kDilatedIndexingX; + indexing_order = kStridedIndexingX; } if (indexing_order == kLinearIndexingX && @@ -3204,14 +3204,14 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( int tile_size_x = reduction_tiling[2] * num_threads_x; int vector_size = 1; - if (indexing_order == kLinearDilatedIndexingX) { + if (indexing_order == kLinearStridedIndexingX) { if (reduction_dimensions.dimensions[2] % 2 == 0 && // As XLA unroll and suppose LLVM will vectorize, // disable the unroll for case that LLVM doesn't vectorize. 
!MayPreventVectorization(*unnested_hlo, /*tolerate_reduce*/ true)) { vector_size = 2; } else { - indexing_order = kDilatedIndexingX; + indexing_order = kStridedIndexingX; } } KernelMappingScheme mapping_scheme( diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index 1bea776275a..67e1c3fc546 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -76,17 +76,16 @@ namespace gpu { class KernelMappingScheme { public: enum { DimZ = 0, DimY, DimX, DimTot }; - // TODO: rename Dilated to Strided? // LinearIndexing mean each thread reads consecutive elements. - // DilatedIndexingX mean each thread reads Dilated(Strided) - // elements. This conserve memory coalescing. - // LinearDilatedIndexingX mean each thread read a few consecutive + // StridedIndexingX mean each thread reads strided elements. + // This conserve memory coalescing. + // LinearStridedIndexingX mean each thread read a few consecutive // elements then take a bigger step. The goal is to trigger // vectorized reads and keep memory coalescing. enum IndexingOrder { LinearIndexingX, - DilatedIndexingX, - LinearDilatedIndexingX + StridedIndexingX, + LinearStridedIndexingX }; KernelMappingScheme(absl::Span dims_in_elems, @@ -103,7 +102,7 @@ class KernelMappingScheme { CHECK_EQ(tile_sizes[2] % num_threads_x_, 0); VLOG(10) << "dims_in_elems_ = " << absl::StrJoin(dims_in_elems_, ","); if (indexing_order != LinearIndexingX) { - // DilatedIndexingX, and LinearDilatedIndexingX + // StridedIndexingX, and LinearStridedIndexingX // is for the purpose of vectorization, which requires // GetTileSizeFor(DimX) to be a multiplier of num_threads_x_. CHECK_EQ(GetTileSizeFor(DimX) % num_threads_x_, 0); From ecac846c420d6c231d280bb87cb26eaf1250aa83 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 4 Mar 2020 11:53:51 -0800 Subject: [PATCH 23/52] NFC: Rename a variable --- .../xla/service/gpu/ir_emitter_unnested.cc | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 4cb26e03867..11912e056e7 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2002,7 +2002,8 @@ void IrEmitterUnnested::EmitTile( /*step=*/constant(1), [&](llvm::Value* y_indvar) { llvm::Value* y_loc = b_.CreateAdd( thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); - auto unroll = [&](bool add_if, int64 vector_size) { + auto unroll = [&](bool add_index_boundary_condition, + int64 vector_size) { for (int64 j = 0; j < x_num_steps / vector_size; j++) { // Prep some values. If we do not do this, LLVM doesn't vectorize. llvm::Value* x_loc_base = @@ -2025,7 +2026,7 @@ void IrEmitterUnnested::EmitTile( auto emit_element = [&] { return emit_elem_function(source_idx_x, y_loc, x_loc, old_j); }; - if (add_if) { + if (add_index_boundary_condition) { ksl->If(loop_name + "_x_in_tile", b_.CreateICmpULT(x_loc, tile_width), emit_element); } else { @@ -2048,10 +2049,14 @@ void IrEmitterUnnested::EmitTile( // it will be the exact number of elements left. 
b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), tile_width), - [&] { unroll(/*add_if=*/false, vector_size); }, - [&] { unroll(/*add_if=*/true, vector_size); }); + [&] { + unroll(/*add_index_boundary_condition=*/false, vector_size); + }, + [&] { + unroll(/*add_index_boundary_condition=*/true, vector_size); + }); } else { - unroll(/*add_if=*/!x_tile_fits, vector_size); + unroll(/*add_index_boundary_condition=*/!x_tile_fits, vector_size); } }); } From 150c2abcb33526e05aa168198a525efd7d95c55e Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 4 Mar 2020 11:55:02 -0800 Subject: [PATCH 24/52] Simplify the code. The condition is only used in one of the path, so no need to test it. --- .../compiler/xla/service/gpu/ir_emitter_unnested.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 11912e056e7..177ce7be558 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1865,14 +1865,13 @@ namespace { // Returns true if the fusion contains any instruction that is likely // translated to complex LLVM IR, such as loops, and prevent vectorization. -bool MayPreventVectorization(const HloInstruction& hlo, - bool tolerate_reduce = false) { +bool MayPreventVectorization(const HloInstruction& hlo) { if (hlo.opcode() == HloOpcode::kFusion) { return absl::c_any_of(hlo.fused_instructions_computation()->instructions(), [&](const HloInstruction* instr) { switch (instr->opcode()) { case HloOpcode::kReduce: - return !tolerate_reduce; + return false; case HloOpcode::kReduceWindow: case HloOpcode::kSort: case HloOpcode::kDot: @@ -1900,7 +1899,7 @@ bool MayPreventVectorization(const HloInstruction& hlo, return false; } } else if (hlo.opcode() == HloOpcode::kReduce) { - return !tolerate_reduce; + return false; } return true; } @@ -3213,7 +3212,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( if (reduction_dimensions.dimensions[2] % 2 == 0 && // As XLA unroll and suppose LLVM will vectorize, // disable the unroll for case that LLVM doesn't vectorize. - !MayPreventVectorization(*unnested_hlo, /*tolerate_reduce*/ true)) { + !MayPreventVectorization(*unnested_hlo)) { vector_size = 2; } else { indexing_order = kStridedIndexingX; From 865f3600b7f7c5b6ee096ef12f528318ff975c44 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 4 Mar 2020 12:13:41 -0800 Subject: [PATCH 25/52] Remove a condition that is optimized by LLVM and doesn't remove vectorization. 
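The removed branch only avoided emitting an add of the constant 0 for the first element of each group, and LLVM's standard simplifications fold an add of zero away, so the unconditional form ends up generating the same code. A schematic C++ analogue (illustrative, not the emitter itself):

#include <cstdint>

// Before: skip the add for i == 0.
int64_t XLocWithBranch(int64_t x_loc_base, int i) {
  return (i > 0) ? x_loc_base + i : x_loc_base;
}

// After: always add; for i == 0 this is x_loc_base + 0, which LLVM removes.
int64_t XLocUnconditional(int64_t x_loc_base, int i) {
  return x_loc_base + i;
}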
--- .../compiler/xla/service/gpu/ir_emitter_unnested.cc | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 177ce7be558..5d46d940bf1 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2015,13 +2015,9 @@ void IrEmitterUnnested::EmitTile( for (int i = 0; i < vector_size; i++) { int old_j = j * vector_size + i; - llvm::Value* x_loc = x_loc_base; - IrArray::Index source_idx_x = source_idx_x_base; - if (i > 0) { - x_loc = b_.CreateAdd(constant(i), x_loc_base, "x_loc"); - source_idx_x = - source_idx_x_base.AddOffsetToDim(constant(i), kDimX, &b_); - } + llvm::Value* x_loc = b_.CreateAdd(constant(i), x_loc_base, "x_loc"); + IrArray::Index source_idx_x = + source_idx_x_base.AddOffsetToDim(constant(i), kDimX, &b_); auto emit_element = [&] { return emit_elem_function(source_idx_x, y_loc, x_loc, old_j); }; From 38a064a347f791b9a27a53fd29aae6b761b29ec9 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 4 Mar 2020 12:18:49 -0800 Subject: [PATCH 26/52] Simplify code. This isn't needed anymore. --- .../xla/service/gpu/ir_emitter_unnested.cc | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 5d46d940bf1..666d8388510 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2004,20 +2004,15 @@ void IrEmitterUnnested::EmitTile( auto unroll = [&](bool add_index_boundary_condition, int64 vector_size) { for (int64 j = 0; j < x_num_steps / vector_size; j++) { - // Prep some values. If we do not do this, LLVM doesn't vectorize. - llvm::Value* x_loc_base = - b_.CreateAdd(constant(j * step_x * vector_size), start_offset_x, - "x_loc_base"); - IrArray::Index source_idx_x_base = - source_idx.AddOffsetToDim(y_loc, kDimY, &b_) - .AddOffsetToDim(constant(j * step_x * vector_size), kDimX, - &b_); - for (int i = 0; i < vector_size; i++) { int old_j = j * vector_size + i; - llvm::Value* x_loc = b_.CreateAdd(constant(i), x_loc_base, "x_loc"); + llvm::Value* x_loc = + b_.CreateAdd(constant(j * step_x * vector_size + i), + start_offset_x, "x_loc"); IrArray::Index source_idx_x = - source_idx_x_base.AddOffsetToDim(constant(i), kDimX, &b_); + source_idx.AddOffsetToDim(y_loc, kDimY, &b_) + .AddOffsetToDim(constant(j * step_x * vector_size + i), + kDimX, &b_); auto emit_element = [&] { return emit_elem_function(source_idx_x, y_loc, x_loc, old_j); }; From 18ae338407eaac894e52f2eef7dbeac6b15a1a88 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 4 Mar 2020 12:41:45 -0800 Subject: [PATCH 27/52] Simplify comment. --- .../compiler/xla/service/gpu/kernel_mapping_scheme.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index 67e1c3fc546..11f257fa0ab 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -76,15 +76,14 @@ namespace gpu { class KernelMappingScheme { public: enum { DimZ = 0, DimY, DimX, DimTot }; - // LinearIndexing mean each thread reads consecutive elements. 
- // StridedIndexingX mean each thread reads strided elements. - // This conserve memory coalescing. - // LinearStridedIndexingX mean each thread read a few consecutive - // elements then take a bigger step. The goal is to trigger - // vectorized reads and keep memory coalescing. enum IndexingOrder { + // Threads read consecutive elements. LinearIndexingX, + // Threads read strided elements while keeping memory coalescing. StridedIndexingX, + // Threads read a few consecutive elements then take a strided + // step. This can trigger vectorized reads and keep memory + // coalescing. LinearStridedIndexingX }; From 7309beda2532384ba5a7bc8c34c10642ff707fcd Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 5 Mar 2020 06:32:54 -0800 Subject: [PATCH 28/52] NFC: better code/comments following code review. --- .../xla/service/gpu/ir_emitter_unnested.cc | 69 +++++++++---------- .../xla/service/gpu/kernel_mapping_scheme.h | 19 ++--- .../gpu/tests/reduction_vectorization_test.cc | 4 +- 3 files changed, 45 insertions(+), 47 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 666d8388510..74fac7c47a0 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1870,8 +1870,6 @@ bool MayPreventVectorization(const HloInstruction& hlo) { return absl::c_any_of(hlo.fused_instructions_computation()->instructions(), [&](const HloInstruction* instr) { switch (instr->opcode()) { - case HloOpcode::kReduce: - return false; case HloOpcode::kReduceWindow: case HloOpcode::kSort: case HloOpcode::kDot: @@ -1880,6 +1878,8 @@ bool MayPreventVectorization(const HloInstruction& hlo) { case HloOpcode::kPower: case HloOpcode::kAtan2: return true; + case HloOpcode::kReduce: + // TODO: check the to_apply() attribute. default: return false; } @@ -1899,6 +1899,7 @@ bool MayPreventVectorization(const HloInstruction& hlo) { return false; } } else if (hlo.opcode() == HloOpcode::kReduce) { + // TODO: check the to_apply() attribute. return false; } return true; @@ -1928,17 +1929,17 @@ static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, llvm::Value* thread_id_x, llvm::Type* index_ty, llvm::IRBuilder<>* b) { + auto constant = [&](int64 val) { + return llvm::ConstantInt::get(index_ty, val); + }; if (mapping_scheme.GetIndexingOrder() == kStridedIndexingX) { return thread_id_x; } else if (mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { - int vector_size = mapping_scheme.GetVectorSize(); - return b->CreateMul(thread_id_x, - llvm::ConstantInt::get(index_ty, vector_size)); + return b->CreateMul(thread_id_x, constant(mapping_scheme.GetVectorSize())); } int64 x_num_steps = mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX(); - return b->CreateMul(thread_id_x, - llvm::ConstantInt::get(index_ty, x_num_steps)); + return b->CreateMul(thread_id_x, constant(x_num_steps)); } void IrEmitterUnnested::EmitTile( @@ -1965,6 +1966,7 @@ void IrEmitterUnnested::EmitTile( // number of steps which can be made. int64 step_x = mapping_scheme.GetIndexingOrder() == kLinearIndexingX ? 1 : num_threads_x; + int64 vector_size = mapping_scheme.GetVectorSize(); IrArray::Index source_idx = tile_origin_index.AddOffsetToDim(start_offset_x, kDimX, &b_); @@ -1991,7 +1993,6 @@ void IrEmitterUnnested::EmitTile( // // TODO(cheshire): Once ptxas is fixed and TF switches to it, remove the // workaround. 
- int vector_size = mapping_scheme.GetVectorSize(); ksl->For( loop_name + "_y_in_tile", /*start=*/constant(0), @@ -2001,11 +2002,10 @@ void IrEmitterUnnested::EmitTile( /*step=*/constant(1), [&](llvm::Value* y_indvar) { llvm::Value* y_loc = b_.CreateAdd( thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); - auto unroll = [&](bool add_index_boundary_condition, - int64 vector_size) { - for (int64 j = 0; j < x_num_steps / vector_size; j++) { + auto unroll = [&](bool add_index_boundary_condition) { + for (int j = 0; j < x_num_steps / vector_size; j++) { for (int i = 0; i < vector_size; i++) { - int old_j = j * vector_size + i; + int linear_index = j * vector_size + i; llvm::Value* x_loc = b_.CreateAdd(constant(j * step_x * vector_size + i), start_offset_x, "x_loc"); @@ -2014,7 +2014,8 @@ void IrEmitterUnnested::EmitTile( .AddOffsetToDim(constant(j * step_x * vector_size + i), kDimX, &b_); auto emit_element = [&] { - return emit_elem_function(source_idx_x, y_loc, x_loc, old_j); + return emit_elem_function(source_idx_x, y_loc, x_loc, + linear_index); }; if (add_index_boundary_condition) { ksl->If(loop_name + "_x_in_tile", @@ -2039,14 +2040,10 @@ void IrEmitterUnnested::EmitTile( // it will be the exact number of elements left. b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), tile_width), - [&] { - unroll(/*add_index_boundary_condition=*/false, vector_size); - }, - [&] { - unroll(/*add_index_boundary_condition=*/true, vector_size); - }); + [&] { unroll(/*add_index_boundary_condition=*/false); }, + [&] { unroll(/*add_index_boundary_condition=*/true); }); } else { - unroll(/*add_index_boundary_condition=*/!x_tile_fits, vector_size); + unroll(/*add_index_boundary_condition=*/!x_tile_fits); } }); } @@ -3163,21 +3160,21 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( !IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape, reduction_dimensions.dimensions[2]); - KernelMappingScheme::IndexingOrder indexing_order; - if (reduction_dimensions.is_row_reduction && - // Only try to vectorize+coales memory access for row of even size. - // For odd row size, every other row isn't aligned, so can't be - // vectorized. - reduction_dimensions.dimensions[2] % 2 == 0) { - indexing_order = kLinearStridedIndexingX; - } else if (IsUnrollingColumnReductionBeneficial( - unnested_hlo, input_shape, - reduction_dimensions.dimensions[2])) { - indexing_order = kLinearIndexingX; - } else { - indexing_order = kStridedIndexingX; - } - + KernelMappingScheme::IndexingOrder indexing_order = [&]() { + if (reduction_dimensions.is_row_reduction && + // Only try to vectorize+coales memory access for row of even size. + // For odd row size, every other row isn't aligned, so can't be + // vectorized. + reduction_dimensions.dimensions[2] % 2 == 0) { + return kLinearStridedIndexingX; + } else if (IsUnrollingColumnReductionBeneficial( + unnested_hlo, input_shape, + reduction_dimensions.dimensions[2])) { + return kLinearIndexingX; + } else { + return kStridedIndexingX; + } + }(); if (indexing_order == kLinearIndexingX && !reduction_dimensions.is_row_reduction) { // Vectorized loads: a single thread reduces two adjacent columns. 
@@ -3201,7 +3198,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( int vector_size = 1; if (indexing_order == kLinearStridedIndexingX) { if (reduction_dimensions.dimensions[2] % 2 == 0 && - // As XLA unroll and suppose LLVM will vectorize, + // Assuming XLA will perform the unrolling and LLVM will vectorize, // disable the unroll for case that LLVM doesn't vectorize. !MayPreventVectorization(*unnested_hlo)) { vector_size = 2; diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index 11f257fa0ab..02dd2f3885e 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -77,13 +77,13 @@ class KernelMappingScheme { public: enum { DimZ = 0, DimY, DimX, DimTot }; enum IndexingOrder { - // Threads read consecutive elements. + // Thread reads consecutive elements. LinearIndexingX, - // Threads read strided elements while keeping memory coalescing. + // Thread reads strided elements while keeping memory coalescing. StridedIndexingX, - // Threads read a few consecutive elements then take a strided - // step. This can trigger vectorized reads and keep memory - // coalescing. + // Thread reads a few consecutive elements then take a strided + // step. This can trigger vectorized reads and keep memory + // coalescing. LinearStridedIndexingX }; @@ -152,11 +152,12 @@ class KernelMappingScheme { // elements in the X dimension of a tile, each threads process // n=tile_size_x/num_threads_x elements. // indexing_order_ define which tile's elements each thread reads. + const IndexingOrder indexing_order_; - // vector_size_ only supported for row reduction. - // Must be a divisor of tile_sizes_[2]/num_threads_x. - // Interesting values are 2 and 4 to trigger vectorized loads on GPUs - // while keeping memory coalescing. + // vector_size_ only supported for row reduction and must be a divisor + // of tile_sizes_[2]/num_threads_x. Interesting values are 2 and 4 + // to trigger vectorized loads on GPUs while keeping memory + // coalescing. const int vector_size_; }; diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc index cd3fac63fb8..589bff1c160 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -106,7 +106,7 @@ CHECK: ld.global.nc.v2.f32 EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); } -TEST_F(ReductionVectorizationTest, EvenColums) { +TEST_F(ReductionVectorizationTest, EvenColumns) { const char* hlo_text = R"( HloModule ReducePower2 @@ -133,7 +133,7 @@ CHECK-NOT: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 CHECK-NOT: ld.global.nc.v2.f32 -// TODO: find how to modify LLVM/opt to get this merged? vectorize before some loop opt? +// TODO: Make this a vectorized load CHECK: ld.global.nc.f32 CHECK: ld.global.nc.f32 )"); From dc1c95f8436968f6527d128b2f858877475b0871 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 5 Mar 2020 11:15:41 -0800 Subject: [PATCH 29/52] Comment and reuse useless case. 
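The even-row-size condition in the comment is ultimately an alignment requirement. As a plain C++ analogue (illustrative only; the emitter produces LLVM IR, and Float2 / PartialRowSum are names invented here), each thread in the vectorized order reads pairs of consecutive floats. When the rows of a packed 2-D buffer have an even length, every row starts on an 8-byte boundary, so each pair can become the two-element load the reduction vectorization test checks for (ld.global.nc.v2.f32); with an odd row length every other row starts misaligned and the pairs cannot be combined.

#include <cstdint>

struct Float2 { float x, y; };  // stands in for one vectorized load

// Thread t of num_threads_x accumulates its share of one row, reading two
// consecutive floats and then striding by 2 * num_threads_x, which matches
// kLinearStridedIndexingX with vector_size = 2.
float PartialRowSum(const float* row, int64_t row_size, int t,
                    int num_threads_x) {
  float acc = 0.f;
  for (int64_t j = 2 * t; j + 1 < row_size; j += 2 * num_threads_x) {
    Float2 v{row[j], row[j + 1]};  // 8-byte aligned pair when rows are even
    acc += v.x + v.y;
  }
  return acc;  // the emitted kernel then combines partials across threads
}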
--- .../compiler/xla/service/gpu/ir_emitter_unnested.cc | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 74fac7c47a0..eceed11714f 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1878,8 +1878,6 @@ bool MayPreventVectorization(const HloInstruction& hlo) { case HloOpcode::kPower: case HloOpcode::kAtan2: return true; - case HloOpcode::kReduce: - // TODO: check the to_apply() attribute. default: return false; } @@ -2027,13 +2025,12 @@ void IrEmitterUnnested::EmitTile( } }; + // Only try this path when we try to vectorize the loads. + // Special case when the tile doesn't fit completly for even row size. + // For odd row size every other row isn't aligned, so can't be + // vectorized this way by LLVM. if (!x_tile_fits && mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { - // Only try this path when we try to vectorize the loads. - - // Special case when the tile doesn't fit completly for even row size. - // For odd row size every other row isn't aligned, so can't be - // vectorized. ksl->If(loop_name + "_is_full_tile", // if (block fully fit) {fast path} else {slow path} // tile_width is always exact. For the last block, From ab41708e12cb7cd1239766b90b31e51381ce1ef4 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 5 Mar 2020 12:23:07 -0800 Subject: [PATCH 30/52] Move a lambda to its own function. --- .../xla/service/gpu/ir_emitter_unnested.cc | 115 ++++++++++-------- 1 file changed, 64 insertions(+), 51 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index eceed11714f..cddd3aca6cc 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1940,6 +1940,38 @@ static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, return b->CreateMul(thread_id_x, constant(x_num_steps)); } +auto Unroll(bool add_index_boundary_condition, int64 x_num_steps, int64 step_x, + int64 vector_size, const string& loop_name, + KernelSupportLibrary* ksl, llvm::Value* start_offset_x, + llvm::Value* y_loc, llvm::Value* tile_width, + IrArray::Index& source_idx, llvm::IRBuilder<>& b_, + const IrEmitterUnnested::EmitElementFunction* emit_elem_function) { + llvm::Type* index_ty = tile_width->getType(); + auto constant = [&](int64 val) { + return llvm::ConstantInt::get(index_ty, val); + }; + for (int j = 0; j < x_num_steps / vector_size; j++) { + for (int i = 0; i < vector_size; i++) { + int linear_index = j * vector_size + i; + llvm::Value* x_loc = b_.CreateAdd(constant(j * step_x * vector_size + i), + start_offset_x, "x_loc"); + IrArray::Index source_idx_x = + source_idx.AddOffsetToDim(y_loc, kDimY, &b_) + .AddOffsetToDim(constant(j * step_x * vector_size + i), kDimX, + &b_); + auto emit_element = [&] { + return (*emit_elem_function)(source_idx_x, y_loc, x_loc, linear_index); + }; + if (add_index_boundary_condition) { + ksl->If(loop_name + "_x_in_tile", b_.CreateICmpULT(x_loc, tile_width), + emit_element); + } else { + emit_element(); + } + } + } +} + void IrEmitterUnnested::EmitTile( const KernelMappingScheme& mapping_scheme, const IrArray::Index& tile_origin_index, const string& loop_name, @@ -1991,58 +2023,39 @@ void IrEmitterUnnested::EmitTile( // // 
TODO(cheshire): Once ptxas is fixed and TF switches to it, remove the // workaround. - ksl->For( - loop_name + "_y_in_tile", - /*start=*/constant(0), - /*end=*/ - ceil_of_ratio(b_.CreateSub(tile_height, thread_id_info.thread_id_y), - num_threads_y), - /*step=*/constant(1), [&](llvm::Value* y_indvar) { - llvm::Value* y_loc = b_.CreateAdd( - thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); - auto unroll = [&](bool add_index_boundary_condition) { - for (int j = 0; j < x_num_steps / vector_size; j++) { - for (int i = 0; i < vector_size; i++) { - int linear_index = j * vector_size + i; - llvm::Value* x_loc = - b_.CreateAdd(constant(j * step_x * vector_size + i), - start_offset_x, "x_loc"); - IrArray::Index source_idx_x = - source_idx.AddOffsetToDim(y_loc, kDimY, &b_) - .AddOffsetToDim(constant(j * step_x * vector_size + i), - kDimX, &b_); - auto emit_element = [&] { - return emit_elem_function(source_idx_x, y_loc, x_loc, - linear_index); - }; - if (add_index_boundary_condition) { - ksl->If(loop_name + "_x_in_tile", - b_.CreateICmpULT(x_loc, tile_width), emit_element); - } else { - emit_element(); - } - } - } - }; + ksl->For(loop_name + "_y_in_tile", + /*start=*/constant(0), + /*end=*/ + ceil_of_ratio(b_.CreateSub(tile_height, thread_id_info.thread_id_y), + num_threads_y), + /*step=*/constant(1), [&](llvm::Value* y_indvar) { + llvm::Value* y_loc = + b_.CreateAdd(thread_id_info.thread_id_y, + b_.CreateMul(y_indvar, num_threads_y)); + auto unroll = [&](bool add_index_boundary_condition) { + return Unroll(add_index_boundary_condition, x_num_steps, step_x, + vector_size, loop_name, ksl, start_offset_x, y_loc, + tile_width, source_idx, b_, &emit_elem_function); + }; - // Only try this path when we try to vectorize the loads. - // Special case when the tile doesn't fit completly for even row size. - // For odd row size every other row isn't aligned, so can't be - // vectorized this way by LLVM. - if (!x_tile_fits && - mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { - ksl->If(loop_name + "_is_full_tile", - // if (block fully fit) {fast path} else {slow path} - // tile_width is always exact. For the last block, - // it will be the exact number of elements left. - b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), - tile_width), - [&] { unroll(/*add_index_boundary_condition=*/false); }, - [&] { unroll(/*add_index_boundary_condition=*/true); }); - } else { - unroll(/*add_index_boundary_condition=*/!x_tile_fits); - } - }); + // Only try this path when we try to vectorize the loads. + // Special case when the tile doesn't fit completly for even row + // size. For odd row size every other row isn't aligned, so can't + // be vectorized this way by LLVM. + if (!x_tile_fits && + mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { + ksl->If(loop_name + "_is_full_tile", + // if (block fully fit) {fast path} else {slow path} + // tile_width is always exact. For the last block, + // it will be the exact number of elements left. + b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), + tile_width), + [&] { unroll(/*add_index_boundary_condition=*/false); }, + [&] { unroll(/*add_index_boundary_condition=*/true); }); + } else { + unroll(/*add_index_boundary_condition=*/!x_tile_fits); + } + }); } // Emits code to process a tensor element in a tile for the given kCopy HLO that From 5dadd1069c9e238f14b22535f2dc1a527c1d08aa Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 5 Mar 2020 12:25:26 -0800 Subject: [PATCH 31/52] Add a CHECK. 
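Pulling the pieces of this series together, the choice made in ComputeReductionCodegenInfo amounts to roughly the following. This is an illustrative paraphrase with invented names (Order, MappingChoice, ChooseRowReductionMapping), not the actual code.

#include <cstdint>

enum class Order { Linear, Strided, LinearStrided };
struct MappingChoice { Order order; int vector_size; };

// Row reductions over an even-sized minor dimension try the vectorized
// order; if the fusion contains ops that keep LLVM from vectorizing, the
// choice falls back to plain strided indexing with scalar loads.
MappingChoice ChooseRowReductionMapping(bool is_row_reduction,
                                        int64_t minor_dim,
                                        bool unrolling_columns_beneficial,
                                        bool may_prevent_vectorization) {
  Order order;
  if (is_row_reduction && minor_dim % 2 == 0) {
    order = Order::LinearStrided;
  } else if (unrolling_columns_beneficial) {
    order = Order::Linear;
  } else {
    order = Order::Strided;
  }
  int vector_size = 1;
  if (order == Order::LinearStrided) {
    if (!may_prevent_vectorization) {
      vector_size = 2;
    } else {
      order = Order::Strided;
    }
  }
  return {order, vector_size};
}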
--- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index cddd3aca6cc..06d547a0224 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1935,6 +1935,7 @@ static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, } else if (mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { return b->CreateMul(thread_id_x, constant(mapping_scheme.GetVectorSize())); } + CHECK_EQ(mapping_scheme.GetIndexingOrder(), kLinearIndexingX); int64 x_num_steps = mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX(); return b->CreateMul(thread_id_x, constant(x_num_steps)); From 291f43e0eb14a4720fccb82471cba0b7d5947db3 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 9 Mar 2020 07:28:23 -0700 Subject: [PATCH 32/52] Small change following code review. --- .../xla/service/gpu/ir_emitter_unnested.cc | 56 +++++++++++-------- .../xla/service/gpu/kernel_mapping_scheme.h | 4 +- .../gpu/tests/reduction_vectorization_test.cc | 14 +---- 3 files changed, 36 insertions(+), 38 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 06d547a0224..1fa5d54f2ae 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1897,7 +1897,8 @@ bool MayPreventVectorization(const HloInstruction& hlo) { return false; } } else if (hlo.opcode() == HloOpcode::kReduce) { - // TODO: check the to_apply() attribute. + // TODO: check if the to_apply() attribute contain instruction + // that break LLVM vectorization. return false; } return true; @@ -1941,12 +1942,22 @@ static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, return b->CreateMul(thread_id_x, constant(x_num_steps)); } -auto Unroll(bool add_index_boundary_condition, int64 x_num_steps, int64 step_x, - int64 vector_size, const string& loop_name, - KernelSupportLibrary* ksl, llvm::Value* start_offset_x, - llvm::Value* y_loc, llvm::Value* tile_width, - IrArray::Index& source_idx, llvm::IRBuilder<>& b_, - const IrEmitterUnnested::EmitElementFunction* emit_elem_function) { +// This function calls emit_elem_function() x_num_steps times. If +// vector_size==1, then each element index passed to +// emit_elem_function() will be separated by step_x. If vector_size>1, +// then it must be a multiple of x_num_steps. In that case, it +// triggers a different indexing order that is vectorizable by +// LLVM. It generates many groups of calls to emit_elem_function. Each +// group is separated by step_x elements. Inside a group, elements +// are consecutive. If check_x_tile_bounds is true, then it will check +// if the element index is in bound compared to tile_width before +// calling emit_elem_function. 
+static void Unroll( + bool check_x_tile_bounds, int64 x_num_steps, int64 step_x, + int64 vector_size, const string& loop_name, KernelSupportLibrary* ksl, + llvm::Value* start_offset_x, llvm::Value* y_loc, llvm::Value* tile_width, + IrArray::Index& source_idx, llvm::IRBuilder<>& b_, + const IrEmitterUnnested::EmitElementFunction* emit_elem_function) { llvm::Type* index_ty = tile_width->getType(); auto constant = [&](int64 val) { return llvm::ConstantInt::get(index_ty, val); @@ -1963,7 +1974,7 @@ auto Unroll(bool add_index_boundary_condition, int64 x_num_steps, int64 step_x, auto emit_element = [&] { return (*emit_elem_function)(source_idx_x, y_loc, x_loc, linear_index); }; - if (add_index_boundary_condition) { + if (check_x_tile_bounds) { ksl->If(loop_name + "_x_in_tile", b_.CreateICmpULT(x_loc, tile_width), emit_element); } else { @@ -2033,28 +2044,27 @@ void IrEmitterUnnested::EmitTile( llvm::Value* y_loc = b_.CreateAdd(thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); - auto unroll = [&](bool add_index_boundary_condition) { - return Unroll(add_index_boundary_condition, x_num_steps, step_x, + auto unroll = [&](bool check_x_tile_bounds) { + return Unroll(check_x_tile_bounds, x_num_steps, step_x, vector_size, loop_name, ksl, start_offset_x, y_loc, tile_width, source_idx, b_, &emit_elem_function); }; - // Only try this path when we try to vectorize the loads. - // Special case when the tile doesn't fit completly for even row - // size. For odd row size every other row isn't aligned, so can't - // be vectorized this way by LLVM. + // Only take this path when we unroll in a way vectorizable by + // LLVM. Special case when the tile doesn't fit completely for even + // row size. For odd row size every other row isn't aligned to the + // vectorized size, so it can't be vectorized by LLVM. if (!x_tile_fits && mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { ksl->If(loop_name + "_is_full_tile", - // if (block fully fit) {fast path} else {slow path} - // tile_width is always exact. For the last block, - // it will be the exact number of elements left. + // For the last block, tile_width will be the number of + // elements left. b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), tile_width), - [&] { unroll(/*add_index_boundary_condition=*/false); }, - [&] { unroll(/*add_index_boundary_condition=*/true); }); + [&] { unroll(/*check_x_tile_bounds=*/false); }, + [&] { unroll(/*check_x_tile_bounds=*/true); }); } else { - unroll(/*add_index_boundary_condition=*/!x_tile_fits); + unroll(/*check_x_tile_bounds=*/!x_tile_fits); } }); } @@ -3173,8 +3183,8 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( KernelMappingScheme::IndexingOrder indexing_order = [&]() { if (reduction_dimensions.is_row_reduction && - // Only try to vectorize+coales memory access for row of even size. - // For odd row size, every other row isn't aligned, so can't be + // Only try to vectorize+coales memory access for rows of even size. + // For odd row sizes, every other row isn't aligned, so it can't be // vectorized. reduction_dimensions.dimensions[2] % 2 == 0) { return kLinearStridedIndexingX; @@ -3210,7 +3220,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( if (indexing_order == kLinearStridedIndexingX) { if (reduction_dimensions.dimensions[2] % 2 == 0 && // Assuming XLA will perform the unrolling and LLVM will vectorize, - // disable the unroll for case that LLVM doesn't vectorize. + // disable the unroll for the cases that LLVM doesn't vectorize. 
!MayPreventVectorization(*unnested_hlo)) { vector_size = 2; } else { diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index 02dd2f3885e..359ee3c900a 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -151,9 +151,9 @@ class KernelMappingScheme { // When num_threads_x threads process a total of tile_size_x // elements in the X dimension of a tile, each threads process // n=tile_size_x/num_threads_x elements. - // indexing_order_ define which tile's elements each thread reads. - + // indexing_order defines which tile's elements each thread reads. const IndexingOrder indexing_order_; + // vector_size_ only supported for row reduction and must be a divisor // of tile_sizes_[2]/num_threads_x. Interesting values are 2 and 4 // to trigger vectorized loads on GPUs while keeping memory diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc index 589bff1c160..c6cd1b79672 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -32,19 +32,7 @@ namespace gpu { namespace { -class ReductionVectorizationTest : public GpuCodegenTest { - protected: - void EnsureDeterminism(absl::string_view hlo_text) { - std::vector profiles; - profiles.emplace_back(); - profiles.emplace_back(); - EXPECT_TRUE(RunMultipleTimes(hlo_text, - /*run_hlo_passes=*/true, - /*profiles=*/&profiles, - /*backend_config=*/"", - /*assert_determinism=*/true)); - } -}; +class ReductionVectorizationTest : public GpuCodegenTest {}; TEST_F(ReductionVectorizationTest, Power2) { const char* hlo_text = R"( From a13b9d4d58e311729de3a967d8c780a95f6691ae Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 9 Mar 2020 13:57:17 -0700 Subject: [PATCH 33/52] NFC following code review. comment typo and rename unroll to unrollInnerTileLoop. --- .../xla/service/gpu/ir_emitter_unnested.cc | 85 ++++++++++--------- 1 file changed, 43 insertions(+), 42 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 1fa5d54f2ae..14346fa3dac 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1897,7 +1897,7 @@ bool MayPreventVectorization(const HloInstruction& hlo) { return false; } } else if (hlo.opcode() == HloOpcode::kReduce) { - // TODO: check if the to_apply() attribute contain instruction + // TODO: check if the to_apply() attribute contains instruction // that break LLVM vectorization. return false; } @@ -1942,17 +1942,17 @@ static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, return b->CreateMul(thread_id_x, constant(x_num_steps)); } -// This function calls emit_elem_function() x_num_steps times. If -// vector_size==1, then each element index passed to -// emit_elem_function() will be separated by step_x. If vector_size>1, -// then it must be a multiple of x_num_steps. In that case, it +// Calls `emit_elem_function()` `x_num_steps` times. If +// `vector_size`==1, then each element index passed to +// `emit_elem_function()` will be separated by `step_x`. If `vector_size`>1, +// then it must be a multiple of `x_num_steps`. 
In that case, it // triggers a different indexing order that is vectorizable by -// LLVM. It generates many groups of calls to emit_elem_function. Each -// group is separated by step_x elements. Inside a group, elements -// are consecutive. If check_x_tile_bounds is true, then it will check -// if the element index is in bound compared to tile_width before -// calling emit_elem_function. -static void Unroll( +// LLVM. It generates many groups of calls to `emit_elem_function`. Each +// group is separated by `step_x` elements. Inside a group, elements +// are consecutive. If `check_x_tile_bounds` is true, then it will check +// if the element index is in bound compared to `tile_width` before +// calling `emit_elem_function`. +static void UnrollInnerTileLoop( bool check_x_tile_bounds, int64 x_num_steps, int64 step_x, int64 vector_size, const string& loop_name, KernelSupportLibrary* ksl, llvm::Value* start_offset_x, llvm::Value* y_loc, llvm::Value* tile_width, @@ -2035,38 +2035,39 @@ void IrEmitterUnnested::EmitTile( // // TODO(cheshire): Once ptxas is fixed and TF switches to it, remove the // workaround. - ksl->For(loop_name + "_y_in_tile", - /*start=*/constant(0), - /*end=*/ - ceil_of_ratio(b_.CreateSub(tile_height, thread_id_info.thread_id_y), - num_threads_y), - /*step=*/constant(1), [&](llvm::Value* y_indvar) { - llvm::Value* y_loc = - b_.CreateAdd(thread_id_info.thread_id_y, - b_.CreateMul(y_indvar, num_threads_y)); - auto unroll = [&](bool check_x_tile_bounds) { - return Unroll(check_x_tile_bounds, x_num_steps, step_x, - vector_size, loop_name, ksl, start_offset_x, y_loc, - tile_width, source_idx, b_, &emit_elem_function); - }; + ksl->For( + loop_name + "_y_in_tile", + /*start=*/constant(0), + /*end=*/ + ceil_of_ratio(b_.CreateSub(tile_height, thread_id_info.thread_id_y), + num_threads_y), + /*step=*/constant(1), [&](llvm::Value* y_indvar) { + llvm::Value* y_loc = b_.CreateAdd( + thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); + auto unrollInnerTileLoop = [&](bool check_x_tile_bounds) { + return UnrollInnerTileLoop(check_x_tile_bounds, x_num_steps, step_x, + vector_size, loop_name, ksl, + start_offset_x, y_loc, tile_width, + source_idx, b_, &emit_elem_function); + }; - // Only take this path when we unroll in a way vectorizable by - // LLVM. Special case when the tile doesn't fit completely for even - // row size. For odd row size every other row isn't aligned to the - // vectorized size, so it can't be vectorized by LLVM. - if (!x_tile_fits && - mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { - ksl->If(loop_name + "_is_full_tile", - // For the last block, tile_width will be the number of - // elements left. - b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), - tile_width), - [&] { unroll(/*check_x_tile_bounds=*/false); }, - [&] { unroll(/*check_x_tile_bounds=*/true); }); - } else { - unroll(/*check_x_tile_bounds=*/!x_tile_fits); - } - }); + // Only take this path when we unroll in a way vectorizable by + // LLVM. Special case when the tile doesn't fit completely for even + // row size. For odd row size every other row isn't aligned to the + // vectorized size, so it can't be vectorized by LLVM. + if (!x_tile_fits && + mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { + ksl->If(loop_name + "_is_full_tile", + // For the last block, tile_width will be the number of + // elements left. 
+ b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), + tile_width), + [&] { unrollInnerTileLoop(/*check_x_tile_bounds=*/false); }, + [&] { unrollInnerTileLoop(/*check_x_tile_bounds=*/true); }); + } else { + unrollInnerTileLoop(/*check_x_tile_bounds=*/!x_tile_fits); + } + }); } // Emits code to process a tensor element in a tile for the given kCopy HLO that From 3dadac1d4a43dd58062e7ee345372b138211da86 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Sun, 15 Mar 2020 17:39:00 -0700 Subject: [PATCH 34/52] Enable more vectorization by LLVMby generating simpler indexing. LLVM doesn't simplify correctly the complicated indexing generated. This souldn't affect the speed as ptxas was vectorizing the missing vectorization. --- .../compiler/xla/service/gpu/ir_emitter_unnested.cc | 10 ++++++++++ .../service/gpu/tests/reduction_vectorization_test.cc | 7 ++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 14346fa3dac..b02356e8cc0 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2099,6 +2099,16 @@ static IrArray::Index GetUnnormalizedIndex( const Shape& unnormalized_shape, llvm::IRBuilder<>* b_, const KernelMappingScheme& kernel_mapping_scheme) { DCHECK_EQ(normalized_shape_index.size(), 3); + // If the normalization only add a new dimensions of size 1, + // generate simpler indexing. LLVM doesn't always simplify the more + // complicated indexing and this prevent him from vectorizing some + // cases. + if (unnormalized_shape.rank() == 2) { + DCHECK_EQ(normalized_shape_index.dims()[0], 0); + auto multidim = normalized_shape_index.multidim(); + return IrArray::Index({multidim[1], multidim[2]}, unnormalized_shape, + normalized_shape_index.GetType()); + } llvm::Value* linear = normalized_shape_index.Linearize( kernel_mapping_scheme.GetDimsInElems(), b_); return IrArray::Index(linear, unnormalized_shape, b_); diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc index c6cd1b79672..10ad4ef3fda 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -83,9 +83,7 @@ ENTRY %main { ParseAndReturnVerifiedModule(hlo_text)); CompileAndOptionallyVerifyPtx(std::move(optimized_module), R"( -// TODO: Make this a vectorized load -CHECK: ld.global.nc.f32 -CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 @@ -116,8 +114,7 @@ ENTRY %main { R"( CHECK: ld.global.nc.f32 CHECK: ld.global.nc.f32 -// TODO: Make this a vectorized load -CHECK-NOT: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 CHECK-NOT: ld.global.nc.v2.f32 From 5704a9a295711ee46b66ad2e33c690c325982dd0 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Sun, 15 Mar 2020 17:53:05 -0700 Subject: [PATCH 35/52] Vectorize reduction based on arch and dtypes. 
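The gate added below only enables the vectorized strided indexing when the architecture and element type are expected to benefit: on compute capability 6.x (P100) only for dtypes of at most 16 bits, on 7.x and newer for any dtype, and always requiring an even-sized row. A standalone sketch of that predicate (the helper is hypothetical; the names and thresholds are copied from the patch, but the function itself is not XLA code):

#include <cstdint>
#include <cstdio>

// Sketch of the decision introduced here: use vectorized+coalesced row
// reduction only where it is known to help.
bool UseVectorizedRowReduction(bool is_row_reduction, int64_t row_size,
                               int cc_major, int smallest_input_dtype_bits) {
  return is_row_reduction &&
         row_size % 2 == 0 &&  // odd rows leave every other row misaligned
         ((cc_major == 6 && smallest_input_dtype_bits <= 16) ||  // P100: small dtypes only
          cc_major >= 7);                                        // V100 and newer: any dtype
}

int main() {
  std::printf("%d\n", UseVectorizedRowReduction(true, 1024, 7, 32));  // 1
  std::printf("%d\n", UseVectorizedRowReduction(true, 1024, 6, 32));  // 0
  std::printf("%d\n", UseVectorizedRowReduction(true, 1023, 7, 32));  // 0
  return 0;
}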
--- .../compiler/xla/service/gpu/ir_emitter_unnested.cc | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index b02356e8cc0..240fa76e199 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -3192,12 +3192,20 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( !IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape, reduction_dimensions.dimensions[2]); + int cc_major = 0, cc_minor = 0; + ir_emitter_context_->device_description().cuda_compute_capability(&cc_major, + &cc_minor); + KernelMappingScheme::IndexingOrder indexing_order = [&]() { if (reduction_dimensions.is_row_reduction && // Only try to vectorize+coales memory access for rows of even size. // For odd row sizes, every other row isn't aligned, so it can't be // vectorized. - reduction_dimensions.dimensions[2] % 2 == 0) { + reduction_dimensions.dimensions[2] % 2 == 0 && + // Vectorization on P100 speed up only float16 and smaller + // dtype. Vectorization on P100 speed up or do not hurt all + // dtypes. + ((cc_major == 6 && smallest_input_dtype_bits <= 16) || cc_major >= 7)) { return kLinearStridedIndexingX; } else if (IsUnrollingColumnReductionBeneficial( unnested_hlo, input_shape, From 2dabd537f22e3a5f8780fd89b24655e32cfdf1a8 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 16 Mar 2020 07:37:31 -0700 Subject: [PATCH 36/52] Only trigger reduction columns indexing for columns reductions. --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 240fa76e199..690581ef8dd 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -3207,7 +3207,8 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( // dtypes. ((cc_major == 6 && smallest_input_dtype_bits <= 16) || cc_major >= 7)) { return kLinearStridedIndexingX; - } else if (IsUnrollingColumnReductionBeneficial( + } else if (!reduction_dimensions.is_row_reduction && + IsUnrollingColumnReductionBeneficial( unnested_hlo, input_shape, reduction_dimensions.dimensions[2])) { return kLinearIndexingX; From 2231173d4684ad0983ab1672502c4587522ebd61 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 16 Mar 2020 07:37:53 -0700 Subject: [PATCH 37/52] For row reduction do not try to vectorize on P100 when the tile doesn't fit perfectly and for dtype bigger then 32 bits. --- .../xla/service/gpu/ir_emitter_unnested.cc | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 690581ef8dd..d21702672d6 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -3192,20 +3192,34 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( !IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape, reduction_dimensions.dimensions[2]); + int64 num_threads_y = reduction_dimensions.is_row_reduction ? 
1 : kWarpSize; + int64 num_threads_x = [&] { + if (reduction_dimensions.is_row_reduction) { + return std::min( + kWarpSize * kWarpSize, + RoundUpToNearest(CeilOfRatio(reduction_dimensions.dimensions[2], + reduction_tiling[2]), + kWarpSize)); + } + return kWarpSize; + }(); + + int tile_size_x = reduction_tiling[2] * num_threads_x; + bool tile_fit = reduction_dimensions.dimensions[kDimX] % tile_size_x == 0; + int cc_major = 0, cc_minor = 0; ir_emitter_context_->device_description().cuda_compute_capability(&cc_major, &cc_minor); KernelMappingScheme::IndexingOrder indexing_order = [&]() { if (reduction_dimensions.is_row_reduction && - // Only try to vectorize+coales memory access for rows of even size. - // For odd row sizes, every other row isn't aligned, so it can't be - // vectorized. - reduction_dimensions.dimensions[2] % 2 == 0 && - // Vectorization on P100 speed up only float16 and smaller - // dtype. Vectorization on P100 speed up or do not hurt all - // dtypes. - ((cc_major == 6 && smallest_input_dtype_bits <= 16) || cc_major >= 7)) { + // P100, only ttry to vectorize+coales memory access when the + // tile size fit exactly and dtypes <= 32 bits + ((cc_major == 6 && smallest_input_dtype_bits <= 32 && tile_fit) || + // On V100, only try to vectorize+coales memory access for + // rows of even size. For odd row sizes, every other row + // isn't aligned, so it can't be vectorized. + (cc_major >= 7 && reduction_dimensions.dimensions[2] % 2 == 0))) { return kLinearStridedIndexingX; } else if (!reduction_dimensions.is_row_reduction && IsUnrollingColumnReductionBeneficial( @@ -3222,20 +3236,6 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( reduction_tiling[2] *= 2; } - int64 num_threads_y = reduction_dimensions.is_row_reduction ? 1 : kWarpSize; - int64 num_threads_x = [&] { - if (reduction_dimensions.is_row_reduction) { - return std::min( - kWarpSize * kWarpSize, - RoundUpToNearest(CeilOfRatio(reduction_dimensions.dimensions[2], - reduction_tiling[2]), - kWarpSize)); - } - return kWarpSize; - }(); - - int tile_size_x = reduction_tiling[2] * num_threads_x; - int vector_size = 1; if (indexing_order == kLinearStridedIndexingX) { if (reduction_dimensions.dimensions[2] % 2 == 0 && From f0b32e48c06d4d8e228e786ab0ff027e19848849 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 16 Mar 2020 08:31:28 -0700 Subject: [PATCH 38/52] Update test to check the GPU arch for the expected ptx. 
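The updated tests choose their FileCheck expectations from the device's compute capability rather than assuming a single architecture. A minimal sketch of that selection pattern (illustrative only; the real tests query se::StreamExecutor::GetDeviceDescription().cuda_compute_capability and use much longer CHECK lists):

#include <cstdio>
#include <string>

// Pick the expected PTX pattern for a row reduction based on compute capability.
std::string ExpectedRowReductionPtx(int cc_major) {
  if (cc_major >= 6) {
    // Pascal and newer: the loads should come out as 2-wide vector loads.
    return "CHECK: ld.global.nc.v2.f32";
  }
  // Older GPUs: scalar loads only.
  return "CHECK-NOT: ld.global.nc.v2.f32\nCHECK: ld.global.nc.f32";
}

int main() {
  std::printf("%s\n", ExpectedRowReductionPtx(7).c_str());
  std::printf("%s\n", ExpectedRowReductionPtx(3).c_str());
  return 0;
}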
--- .../gpu/tests/reduction_vectorization_test.cc | 104 ++++++++++++++++-- 1 file changed, 92 insertions(+), 12 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc index 10ad4ef3fda..5f27df08b90 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -52,13 +52,32 @@ ENTRY %main { )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, ParseAndReturnVerifiedModule(hlo_text)); - CompileAndOptionallyVerifyPtx(std::move(optimized_module), - R"( + se::StreamExecutor* executor = backend().default_stream_executor(); + int cc_major = 0, cc_minor = 0; + executor->GetDeviceDescription().cuda_compute_capability(&cc_major, + &cc_minor); + string expected_ptx; + if (cc_major >= 6) { + expected_ptx = R"( CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 -)"); +)"; + } else { + expected_ptx = R"( +CHECK-NOT: ld.global.nc.v2.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +)"; + } + CompileAndOptionallyVerifyPtx(std::move(optimized_module), expected_ptx); EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); } @@ -81,13 +100,32 @@ ENTRY %main { )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, ParseAndReturnVerifiedModule(hlo_text)); - CompileAndOptionallyVerifyPtx(std::move(optimized_module), - R"( + se::StreamExecutor* executor = backend().default_stream_executor(); + int cc_major = 0, cc_minor = 0; + executor->GetDeviceDescription().cuda_compute_capability(&cc_major, + &cc_minor); + string expected_ptx; + if (cc_major >= 6) { + expected_ptx = R"( CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 CHECK: ld.global.nc.v2.f32 -)"); +)"; + } else { + expected_ptx = R"( +CHECK-NOT: ld.global.nc.v2.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +)"; + } + CompileAndOptionallyVerifyPtx(std::move(optimized_module), expected_ptx); EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); } @@ -110,8 +148,13 @@ ENTRY %main { )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, ParseAndReturnVerifiedModule(hlo_text)); - CompileAndOptionallyVerifyPtx(std::move(optimized_module), - R"( + se::StreamExecutor* executor = backend().default_stream_executor(); + int cc_major = 0, cc_minor = 0; + executor->GetDeviceDescription().cuda_compute_capability(&cc_major, + &cc_minor); + string expected_ptx; + if (cc_major >= 7) { + expected_ptx = R"( CHECK: ld.global.nc.f32 CHECK: ld.global.nc.f32 CHECK: ld.global.nc.v2.f32 @@ -121,7 +164,22 @@ CHECK-NOT: ld.global.nc.v2.f32 // TODO: Make this a vectorized load CHECK: ld.global.nc.f32 CHECK: ld.global.nc.f32 -)"); +)"; + } else { + expected_ptx = R"( +CHECK-NOT: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +)"; + } + CompileAndOptionallyVerifyPtx(std::move(optimized_module), expected_ptx); 
EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); } @@ -175,10 +233,32 @@ ENTRY %main { TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, ParseAndReturnVerifiedModule(hlo_text)); - CompileAndOptionallyVerifyPtx(std::move(optimized_module), - R"( + se::StreamExecutor* executor = backend().default_stream_executor(); + int cc_major = 0, cc_minor = 0; + executor->GetDeviceDescription().cuda_compute_capability(&cc_major, + &cc_minor); + string expected_ptx; + if (cc_major >= 6) { + expected_ptx = R"( CHECK: ld.global.nc.v2.f32 -)"); +CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: ld.global.nc.v2.f32 +)"; + } else { + expected_ptx = R"( +CHECK-NOT: ld.global.nc.v2.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +CHECK: ld.global.nc.f32 +)"; + } + CompileAndOptionallyVerifyPtx(std::move(optimized_module), expected_ptx); EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); } From a54445d7001c370b668bddf1083b481524b53c6d Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 16 Mar 2020 08:31:40 -0700 Subject: [PATCH 39/52] clang-format --- .../xla/service/gpu/ir_emitter_unnested.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index d21702672d6..4131ff0eecc 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -3213,16 +3213,16 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( KernelMappingScheme::IndexingOrder indexing_order = [&]() { if (reduction_dimensions.is_row_reduction && - // P100, only ttry to vectorize+coales memory access when the - // tile size fit exactly and dtypes <= 32 bits - ((cc_major == 6 && smallest_input_dtype_bits <= 32 && tile_fit) || - // On V100, only try to vectorize+coales memory access for - // rows of even size. For odd row sizes, every other row - // isn't aligned, so it can't be vectorized. - (cc_major >= 7 && reduction_dimensions.dimensions[2] % 2 == 0))) { + // P100, only ttry to vectorize+coales memory access when the + // tile size fit exactly and dtypes <= 32 bits + ((cc_major == 6 && smallest_input_dtype_bits <= 32 && tile_fit) || + // On V100, only try to vectorize+coales memory access for + // rows of even size. For odd row sizes, every other row + // isn't aligned, so it can't be vectorized. + (cc_major >= 7 && reduction_dimensions.dimensions[2] % 2 == 0))) { return kLinearStridedIndexingX; } else if (!reduction_dimensions.is_row_reduction && - IsUnrollingColumnReductionBeneficial( + IsUnrollingColumnReductionBeneficial( unnested_hlo, input_shape, reduction_dimensions.dimensions[2])) { return kLinearIndexingX; From 229326ce9c379a7a7d9c08d5975becc069103121 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 16 Mar 2020 10:26:19 -0700 Subject: [PATCH 40/52] Crash fix. We where not returning the right reduction tile size. 
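The fix keeps three quantities consistent: the per-thread tiling in X, the tile size handed to the mapping scheme, and the number of partial results a thread accumulates when a column reduction is unrolled. A standalone sketch of the invariant being restored (the concrete numbers are illustrative, not the real tiling heuristics):

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t num_threads_x = 32;  // kWarpSize
  int64_t reduction_tiling_x = 1;    // per-thread tiling in X before unrolling (illustrative)
  int num_partial_results = 1;

  const bool unroll_column_reduction = true;  // the kLinearIndexingX path
  if (unroll_column_reduction) {
    num_partial_results = 2;                  // one thread reduces two adjacent columns
    reduction_tiling_x *= num_partial_results;
  }

  // The tile size given to the mapping scheme must reflect the doubled tiling,
  // otherwise the partial-result check in the emitter fails.
  const int64_t tile_size_x = reduction_tiling_x * num_threads_x;
  assert(num_partial_results == tile_size_x / num_threads_x);
  std::printf("tile_size_x=%lld num_partial_results=%d\n",
              (long long)tile_size_x, num_partial_results);
  return 0;
}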
--- .../xla/service/gpu/ir_emitter_unnested.cc | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 4131ff0eecc..174db1e19b9 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2161,7 +2161,8 @@ static int GetNumberOfPartialResults( return 1; } int64 num_partial_results = - mapping_scheme.GetIndexingOrder() == kStridedIndexingX ? 1 : 2; + mapping_scheme.GetIndexingOrder() == kLinearIndexingX ? 2 : 1; + CHECK_EQ(num_partial_results, (mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX())); return num_partial_results; @@ -3204,8 +3205,9 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( return kWarpSize; }(); - int tile_size_x = reduction_tiling[2] * num_threads_x; - bool tile_fit = reduction_dimensions.dimensions[kDimX] % tile_size_x == 0; + bool tile_fit = reduction_dimensions.dimensions[kDimX] % + (reduction_tiling[2] * num_threads_x) == + 0; int cc_major = 0, cc_minor = 0; ir_emitter_context_->device_description().cuda_compute_capability(&cc_major, @@ -3225,16 +3227,12 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( IsUnrollingColumnReductionBeneficial( unnested_hlo, input_shape, reduction_dimensions.dimensions[2])) { + reduction_tiling[2] *= 2; return kLinearIndexingX; } else { return kStridedIndexingX; } }(); - if (indexing_order == kLinearIndexingX && - !reduction_dimensions.is_row_reduction) { - // Vectorized loads: a single thread reduces two adjacent columns. - reduction_tiling[2] *= 2; - } int vector_size = 1; if (indexing_order == kLinearStridedIndexingX) { @@ -3249,7 +3247,8 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( } KernelMappingScheme mapping_scheme( reduction_dimensions.dimensions, - {reduction_tiling[0], reduction_tiling[1] * num_threads_y, tile_size_x}, + {reduction_tiling[0], reduction_tiling[1] * num_threads_y, + reduction_tiling[2] * num_threads_x}, num_threads_y, num_threads_x, indexing_order, vector_size); return ReductionCodegenInfo(mapping_scheme, reduction_dimensions.is_row_reduction); From 46cf90800e9a0be1fd4647d5e4fa9de29d3e1c42 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 16 Mar 2020 10:36:50 -0700 Subject: [PATCH 41/52] Small refactoring. Instead of recomputing a value, store it. This is safer as all the numbers are computed and validated once. --- .../xla/service/gpu/ir_emitter_unnested.cc | 29 +++++-------------- .../xla/service/gpu/kernel_mapping_scheme.h | 10 ++++++- 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 174db1e19b9..ec749a48bcb 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2151,23 +2151,6 @@ void IrEmitterUnnested::EmitTileElementForFusion( } } -// Gets the number of partial results accumulated by a single thread performing -// reduction. -static int GetNumberOfPartialResults( - const ReductionCodegenInfo& reduction_info) { - const KernelMappingScheme& mapping_scheme = - reduction_info.GetKernelMappingScheme(); - if (reduction_info.IsRowReduction()) { - return 1; - } - int64 num_partial_results = - mapping_scheme.GetIndexingOrder() == kLinearIndexingX ? 
2 : 1; - - CHECK_EQ(num_partial_results, - (mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX())); - return num_partial_results; -} - void IrEmitterUnnested::EmitPrologueForReduction( HloInstruction* unnested_hlo, ReductionCodegenInfo* reduction_info, absl::Span reduce_instructions, @@ -2194,7 +2177,7 @@ void IrEmitterUnnested::EmitPrologueForReduction( llvm::AllocaInst* reduction_input_address = Alloca(element_type); reduction_input_addresses->push_back(reduction_input_address); - int num_partial_results = GetNumberOfPartialResults(*reduction_info); + int num_partial_results = reduction_info->GetNumPartialResults(); AddressVector* partial_result_addresses = reduction_info->GetMutablePartialResultAddresses(); llvm::AllocaInst* partial_result_address = @@ -2346,7 +2329,7 @@ void IrEmitterUnnested::EmitEpilogueForReduction( absl::Span partial_result_addresses = reduction_info.GetPartialResultAddresses(); - int num_partial_results = GetNumberOfPartialResults(reduction_info); + int num_partial_results = reduction_info.GetNumPartialResults(); // Emit an atomic operation that accumulates the partial reduction to the // output element. For row reduction, this is only for lane 0 due to the @@ -2560,7 +2543,7 @@ void IrEmitterUnnested::EmitTileElementForReduction( // GetElementPointer with array types. This enables the vectorization of // the computation for different partial results. Use this index if // 'num_partial_results > 1'. - int num_partial_results = GetNumberOfPartialResults(reduction_info); + int num_partial_results = reduction_info.GetNumPartialResults(); auto index_without_linear = IrArray::Index( input_index.multidim(), reduction_operand_shape, input_index.GetType()); @@ -3213,6 +3196,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( ir_emitter_context_->device_description().cuda_compute_capability(&cc_major, &cc_minor); + int num_partial_results = 1; KernelMappingScheme::IndexingOrder indexing_order = [&]() { if (reduction_dimensions.is_row_reduction && // P100, only ttry to vectorize+coales memory access when the @@ -3227,7 +3211,8 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( IsUnrollingColumnReductionBeneficial( unnested_hlo, input_shape, reduction_dimensions.dimensions[2])) { - reduction_tiling[2] *= 2; + num_partial_results = 2; + reduction_tiling[2] *= num_partial_results; return kLinearIndexingX; } else { return kStridedIndexingX; @@ -3250,7 +3235,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( {reduction_tiling[0], reduction_tiling[1] * num_threads_y, reduction_tiling[2] * num_threads_x}, num_threads_y, num_threads_x, indexing_order, vector_size); - return ReductionCodegenInfo(mapping_scheme, + return ReductionCodegenInfo(mapping_scheme, num_partial_results, reduction_dimensions.is_row_reduction); } diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index 359ee3c900a..9291d7bc6a3 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -166,8 +166,14 @@ using AddressVector = absl::InlinedVector; class ReductionCodegenInfo { public: explicit ReductionCodegenInfo(KernelMappingScheme mapping_scheme, + int num_partial_results, bool is_row_reduction) - : mapping_scheme_(mapping_scheme), is_row_reduction_(is_row_reduction) {} + : mapping_scheme_(mapping_scheme), num_partial_results_(num_partial_results), 
is_row_reduction_(is_row_reduction) { + if (num_partial_results > 1) { + CHECK_EQ(num_partial_results, + (mapping_scheme.GetTileSizeX() / mapping_scheme.GetNumThreadsX())); + } + } const KernelMappingScheme& GetKernelMappingScheme() const { return mapping_scheme_; @@ -203,6 +209,7 @@ class ReductionCodegenInfo { return reduction_input_addresses_; } + int GetNumPartialResults() const { return num_partial_results_; } bool IsRowReduction() const { return is_row_reduction_; } // Gets a pointer to a mutable shared cache used by reduction. @@ -221,6 +228,7 @@ class ReductionCodegenInfo { const KernelMappingScheme mapping_scheme_; AddressVector partial_result_addresses_; AddressVector reduction_input_addresses_; + int num_partial_results_; bool is_row_reduction_; }; From 89db28e2fffeaeafce3f430de4de554e76cd378a Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Fri, 27 Mar 2020 07:24:36 -0700 Subject: [PATCH 42/52] Fix comment typo --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index ec749a48bcb..cd0e034e486 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2101,7 +2101,7 @@ static IrArray::Index GetUnnormalizedIndex( DCHECK_EQ(normalized_shape_index.size(), 3); // If the normalization only add a new dimensions of size 1, // generate simpler indexing. LLVM doesn't always simplify the more - // complicated indexing and this prevent him from vectorizing some + // complicated indexing and this prevents it from vectorizing some // cases. if (unnormalized_shape.rank() == 2) { DCHECK_EQ(normalized_shape_index.dims()[0], 0); @@ -3199,8 +3199,8 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( int num_partial_results = 1; KernelMappingScheme::IndexingOrder indexing_order = [&]() { if (reduction_dimensions.is_row_reduction && - // P100, only ttry to vectorize+coales memory access when the - // tile size fit exactly and dtypes <= 32 bits + // P100, only try to vectorize+coales memory access when the + // tile size fits exactly and dtypes <= 32 bits ((cc_major == 6 && smallest_input_dtype_bits <= 32 && tile_fit) || // On V100, only try to vectorize+coales memory access for // rows of even size. For odd row sizes, every other row From 1b7fef45a5f7339a4202444e41706c9e5fd5c80a Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 30 Mar 2020 13:01:47 -0700 Subject: [PATCH 43/52] Do a simple unnormalize only for row major memory format and exact shape. --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index cd0e034e486..512280e8a03 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2102,8 +2102,11 @@ static IrArray::Index GetUnnormalizedIndex( // If the normalization only add a new dimensions of size 1, // generate simpler indexing. LLVM doesn't always simplify the more // complicated indexing and this prevents it from vectorizing some - // cases. - if (unnormalized_shape.rank() == 2) { + // cases. We do this only for major_to_minor memory layout. 
+ if (unnormalized_shape.rank() == 2 && unnormalized_shape.has_layout() && + unnormalized_shape.dimensions()[0] == normalized_shape_index.dims()[1] && + unnormalized_shape.dimensions()[1] == normalized_shape_index.dims()[2] && + unnormalized_shape.layout().minor_to_major(1) == 0) { DCHECK_EQ(normalized_shape_index.dims()[0], 0); auto multidim = normalized_shape_index.multidim(); return IrArray::Index({multidim[1], multidim[2]}, unnormalized_shape, From abc7c0a03dd6f21e36d077a1f2caba3b0d950825 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 1 Apr 2020 09:52:21 -0700 Subject: [PATCH 44/52] Fix DCHECK and make it a CHECK. --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 512280e8a03..ffc4df06749 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2107,7 +2107,7 @@ static IrArray::Index GetUnnormalizedIndex( unnormalized_shape.dimensions()[0] == normalized_shape_index.dims()[1] && unnormalized_shape.dimensions()[1] == normalized_shape_index.dims()[2] && unnormalized_shape.layout().minor_to_major(1) == 0) { - DCHECK_EQ(normalized_shape_index.dims()[0], 0); + CHECK_EQ(normalized_shape_index.dims()[0], 1); auto multidim = normalized_shape_index.multidim(); return IrArray::Index({multidim[1], multidim[2]}, unnormalized_shape, normalized_shape_index.GetType()); From e1db51a42fcb79a63e8ab1cd9ab4660b79052f20 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 1 Apr 2020 09:54:16 -0700 Subject: [PATCH 45/52] Code clean up --- .../xla/service/gpu/ir_emitter_unnested.cc | 34 ++++++++----------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index ffc4df06749..863079d1978 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1956,26 +1956,26 @@ static void UnrollInnerTileLoop( bool check_x_tile_bounds, int64 x_num_steps, int64 step_x, int64 vector_size, const string& loop_name, KernelSupportLibrary* ksl, llvm::Value* start_offset_x, llvm::Value* y_loc, llvm::Value* tile_width, - IrArray::Index& source_idx, llvm::IRBuilder<>& b_, + const IrArray::Index& source_idx, llvm::IRBuilder<>* b_, const IrEmitterUnnested::EmitElementFunction* emit_elem_function) { llvm::Type* index_ty = tile_width->getType(); auto constant = [&](int64 val) { return llvm::ConstantInt::get(index_ty, val); }; - for (int j = 0; j < x_num_steps / vector_size; j++) { - for (int i = 0; i < vector_size; i++) { + for (int64 j = 0; j < x_num_steps / vector_size; j++) { + IrArray::Index source_idx_x_base = + source_idx.AddOffsetToDim(y_loc, kDimY, b_); + for (int64 i = 0; i < vector_size; i++) { int linear_index = j * vector_size + i; - llvm::Value* x_loc = b_.CreateAdd(constant(j * step_x * vector_size + i), - start_offset_x, "x_loc"); - IrArray::Index source_idx_x = - source_idx.AddOffsetToDim(y_loc, kDimY, &b_) - .AddOffsetToDim(constant(j * step_x * vector_size + i), kDimX, - &b_); + llvm::Value* x_loc = b_->CreateAdd(constant(j * step_x * vector_size + i), + start_offset_x, "x_loc"); + IrArray::Index source_idx_x = source_idx_x_base.AddOffsetToDim( + constant(j * step_x * vector_size + i), kDimX, b_); auto 
emit_element = [&] { return (*emit_elem_function)(source_idx_x, y_loc, x_loc, linear_index); }; if (check_x_tile_bounds) { - ksl->If(loop_name + "_x_in_tile", b_.CreateICmpULT(x_loc, tile_width), + ksl->If(loop_name + "_x_in_tile", b_->CreateICmpULT(x_loc, tile_width), emit_element); } else { emit_element(); @@ -2044,11 +2044,11 @@ void IrEmitterUnnested::EmitTile( /*step=*/constant(1), [&](llvm::Value* y_indvar) { llvm::Value* y_loc = b_.CreateAdd( thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); - auto unrollInnerTileLoop = [&](bool check_x_tile_bounds) { + auto unroll_inner_tile_loop = [&](bool check_x_tile_bounds) { return UnrollInnerTileLoop(check_x_tile_bounds, x_num_steps, step_x, vector_size, loop_name, ksl, start_offset_x, y_loc, tile_width, - source_idx, b_, &emit_elem_function); + source_idx, &b_, &emit_elem_function); }; // Only take this path when we unroll in a way vectorizable by @@ -2062,10 +2062,10 @@ void IrEmitterUnnested::EmitTile( // elements left. b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), tile_width), - [&] { unrollInnerTileLoop(/*check_x_tile_bounds=*/false); }, - [&] { unrollInnerTileLoop(/*check_x_tile_bounds=*/true); }); + [&] { unroll_inner_tile_loop(/*check_x_tile_bounds=*/false); }, + [&] { unroll_inner_tile_loop(/*check_x_tile_bounds=*/true); }); } else { - unrollInnerTileLoop(/*check_x_tile_bounds=*/!x_tile_fits); + unroll_inner_tile_loop(/*check_x_tile_bounds=*/!x_tile_fits); } }); } @@ -3174,10 +3174,6 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( std::array reduction_tiling = GetReductionTiling(reduction_dimensions, smallest_input_dtype_bits, &ir_emitter_context_->device_description()); - bool dilated_x = - reduction_dimensions.is_row_reduction || - !IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape, - reduction_dimensions.dimensions[2]); int64 num_threads_y = reduction_dimensions.is_row_reduction ? 
1 : kWarpSize; int64 num_threads_x = [&] { From cb538d1fe9771986833bffdd6688d208bbe71974 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 2 Apr 2020 06:32:03 -0700 Subject: [PATCH 46/52] NFC: rename a variable --- .../compiler/xla/service/gpu/ir_emitter_unnested.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 863079d1978..cf5f3716dbb 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1956,7 +1956,7 @@ static void UnrollInnerTileLoop( bool check_x_tile_bounds, int64 x_num_steps, int64 step_x, int64 vector_size, const string& loop_name, KernelSupportLibrary* ksl, llvm::Value* start_offset_x, llvm::Value* y_loc, llvm::Value* tile_width, - const IrArray::Index& source_idx, llvm::IRBuilder<>* b_, + const IrArray::Index& source_idx, llvm::IRBuilder<>* b, const IrEmitterUnnested::EmitElementFunction* emit_elem_function) { llvm::Type* index_ty = tile_width->getType(); auto constant = [&](int64 val) { @@ -1964,18 +1964,18 @@ static void UnrollInnerTileLoop( }; for (int64 j = 0; j < x_num_steps / vector_size; j++) { IrArray::Index source_idx_x_base = - source_idx.AddOffsetToDim(y_loc, kDimY, b_); + source_idx.AddOffsetToDim(y_loc, kDimY, b); for (int64 i = 0; i < vector_size; i++) { int linear_index = j * vector_size + i; - llvm::Value* x_loc = b_->CreateAdd(constant(j * step_x * vector_size + i), - start_offset_x, "x_loc"); + llvm::Value* x_loc = b->CreateAdd(constant(j * step_x * vector_size + i), + start_offset_x, "x_loc"); IrArray::Index source_idx_x = source_idx_x_base.AddOffsetToDim( - constant(j * step_x * vector_size + i), kDimX, b_); + constant(j * step_x * vector_size + i), kDimX, b); auto emit_element = [&] { return (*emit_elem_function)(source_idx_x, y_loc, x_loc, linear_index); }; if (check_x_tile_bounds) { - ksl->If(loop_name + "_x_in_tile", b_->CreateICmpULT(x_loc, tile_width), + ksl->If(loop_name + "_x_in_tile", b->CreateICmpULT(x_loc, tile_width), emit_element); } else { emit_element(); From ab8eb15d1f25dfb95a55f3a498ea024bd08cc158 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 2 Apr 2020 06:32:12 -0700 Subject: [PATCH 47/52] clang-format --- .../xla/service/gpu/ir_emitter_unnested.cc | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index cf5f3716dbb..7493a943329 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2057,13 +2057,14 @@ void IrEmitterUnnested::EmitTile( // vectorized size, so it can't be vectorized by LLVM. if (!x_tile_fits && mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { - ksl->If(loop_name + "_is_full_tile", - // For the last block, tile_width will be the number of - // elements left. - b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), - tile_width), - [&] { unroll_inner_tile_loop(/*check_x_tile_bounds=*/false); }, - [&] { unroll_inner_tile_loop(/*check_x_tile_bounds=*/true); }); + ksl->If( + loop_name + "_is_full_tile", + // For the last block, tile_width will be the number of + // elements left. 
+ b_.CreateICmpEQ(constant(mapping_scheme.GetTileSizeX()), + tile_width), + [&] { unroll_inner_tile_loop(/*check_x_tile_bounds=*/false); }, + [&] { unroll_inner_tile_loop(/*check_x_tile_bounds=*/true); }); } else { unroll_inner_tile_loop(/*check_x_tile_bounds=*/!x_tile_fits); } From 8d724f869346f0bfd4cd57bbcd9b42bab9ef500e Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Fri, 3 Apr 2020 14:45:34 -0700 Subject: [PATCH 48/52] Rename a variable --- .../compiler/xla/service/gpu/ir_emitter_unnested.cc | 12 ++++++------ .../compiler/xla/service/gpu/kernel_mapping_scheme.h | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 7493a943329..73c78204f69 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -108,8 +108,8 @@ const auto kDimTot = KernelMappingScheme::DimTot; const auto kLinearIndexingX = KernelMappingScheme::LinearIndexingX; const auto kStridedIndexingX = KernelMappingScheme::StridedIndexingX; -const auto kLinearStridedIndexingX = - KernelMappingScheme::LinearStridedIndexingX; +const auto kStridedLinearIndexingX = + KernelMappingScheme::StridedLinearIndexingX; // If a dimensions is smaller than this, untiled transposition may be more // efficient. @@ -1933,7 +1933,7 @@ static llvm::Value* GetStartOffsetX(const KernelMappingScheme& mapping_scheme, }; if (mapping_scheme.GetIndexingOrder() == kStridedIndexingX) { return thread_id_x; - } else if (mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { + } else if (mapping_scheme.GetIndexingOrder() == kStridedLinearIndexingX) { return b->CreateMul(thread_id_x, constant(mapping_scheme.GetVectorSize())); } CHECK_EQ(mapping_scheme.GetIndexingOrder(), kLinearIndexingX); @@ -2056,7 +2056,7 @@ void IrEmitterUnnested::EmitTile( // row size. For odd row size every other row isn't aligned to the // vectorized size, so it can't be vectorized by LLVM. if (!x_tile_fits && - mapping_scheme.GetIndexingOrder() == kLinearStridedIndexingX) { + mapping_scheme.GetIndexingOrder() == kStridedLinearIndexingX) { ksl->If( loop_name + "_is_full_tile", // For the last block, tile_width will be the number of @@ -3206,7 +3206,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( // rows of even size. For odd row sizes, every other row // isn't aligned, so it can't be vectorized. (cc_major >= 7 && reduction_dimensions.dimensions[2] % 2 == 0))) { - return kLinearStridedIndexingX; + return kStridedLinearIndexingX; } else if (!reduction_dimensions.is_row_reduction && IsUnrollingColumnReductionBeneficial( unnested_hlo, input_shape, @@ -3220,7 +3220,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( }(); int vector_size = 1; - if (indexing_order == kLinearStridedIndexingX) { + if (indexing_order == kStridedLinearIndexingX) { if (reduction_dimensions.dimensions[2] % 2 == 0 && // Assuming XLA will perform the unrolling and LLVM will vectorize, // disable the unroll for the cases that LLVM doesn't vectorize. 
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index 9291d7bc6a3..3123063cae8 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -84,7 +84,7 @@ class KernelMappingScheme { // Thread reads a few consecutive elements then take a strided // step. This can trigger vectorized reads and keep memory // coalescing. - LinearStridedIndexingX + StridedLinearIndexingX }; KernelMappingScheme(absl::Span dims_in_elems, @@ -101,7 +101,7 @@ class KernelMappingScheme { CHECK_EQ(tile_sizes[2] % num_threads_x_, 0); VLOG(10) << "dims_in_elems_ = " << absl::StrJoin(dims_in_elems_, ","); if (indexing_order != LinearIndexingX) { - // StridedIndexingX, and LinearStridedIndexingX + // StridedIndexingX, and StridedLinearIndexingX // is for the purpose of vectorization, which requires // GetTileSizeFor(DimX) to be a multiplier of num_threads_x_. CHECK_EQ(GetTileSizeFor(DimX) % num_threads_x_, 0); From 8779db9c79385c000b22ec6649e41aaad9fd42c8 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Fri, 3 Apr 2020 14:45:52 -0700 Subject: [PATCH 49/52] Small simplification --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 4 ++-- .../xla/service/gpu/tests/reduction_vectorization_test.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 73c78204f69..d98c63e3370 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1962,9 +1962,9 @@ static void UnrollInnerTileLoop( auto constant = [&](int64 val) { return llvm::ConstantInt::get(index_ty, val); }; + IrArray::Index source_idx_x_base = + source_idx.AddOffsetToDim(y_loc, kDimY, b); for (int64 j = 0; j < x_num_steps / vector_size; j++) { - IrArray::Index source_idx_x_base = - source_idx.AddOffsetToDim(y_loc, kDimY, b); for (int64 i = 0; i < vector_size; i++) { int linear_index = j * vector_size + i; llvm::Value* x_loc = b->CreateAdd(constant(j * step_x * vector_size + i), diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc index 5f27df08b90..8ee0eed3ca3 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -184,7 +184,7 @@ CHECK: ld.global.nc.f32 EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); } -TEST_F(ReductionVectorizationTest, DisableOddColumns) { +TEST_F(ReductionVectorizationTest, DisabledOddColumns) { const char* hlo_text = R"( HloModule ReduceTileFit From dd571a0133195923ba65afccebbcd91707a9c260 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 6 Apr 2020 14:28:24 -0700 Subject: [PATCH 50/52] Fix the NCHECK failure. EmitTargetElementLoop call ComputeMaxUnrollFactor when MayPreventVectorization return false. But ComputeMaxUnrollFactor call ElementsIn that works only on DenseArray. 
Now behave as before in that case --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index d98c63e3370..725209f7c45 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1896,7 +1896,7 @@ bool MayPreventVectorization(const HloInstruction& hlo) { default: return false; } - } else if (hlo.opcode() == HloOpcode::kReduce) { + } else if (hlo.opcode() == HloOpcode::kReduce && hlo.shape().IsArray()) { // TODO: check if the to_apply() attribute contains instruction // that break LLVM vectorization. return false; From 45f3ebb53fb594bbfb06ad6ae23a21d7c88d885c Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 9 Apr 2020 14:44:57 -0700 Subject: [PATCH 51/52] Make XLA emit the Store annotation about aliasing. This allows LLVM to vectorize them and fix the speed regression reported. --- .../xla/service/gpu/ir_emitter_unnested.cc | 7 +-- .../gpu/tests/reduction_vectorization_test.cc | 61 +++++++++++++++++++ 2 files changed, 63 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 725209f7c45..32170229813 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -538,13 +538,10 @@ Status IrEmitterUnnested::EmitExtraOutputsForReduce( absl::Span> extra_output_gens) { for (int i = 0; i != extra_output_gens.size(); ++i) { - llvm::Value* extra_output_address = - GetIrArray(*unnested_hlo, *unnested_hlo, extra_output_gens[i].second) - .EmitArrayElementAddress(index, &b_, "extra_output_element_address", - use_linear_index); TF_ASSIGN_OR_RETURN(llvm::Value* const extra_output_ir_value, extra_output_gens[i].first(index)); - Store(extra_output_ir_value, extra_output_address); + GetIrArray(*unnested_hlo, *unnested_hlo, extra_output_gens[i].second) + .EmitWriteArrayElement(index, extra_output_ir_value, &b_, use_linear_index); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc index 8ee0eed3ca3..040025ee1a2 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -294,6 +294,67 @@ CHECK-NOT: ld.global.u64 EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); } +class ReductionVectorizationNoOptTest : public GpuCodegenTest { + DebugOptions GetDebugOptionsForTest() override { + DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest(); + // The test MultiOutputStore contain a MOF fusion and XLA optimizer pass doesn't like this. 
+ debug_options.set_xla_disable_all_hlo_passes(true); + return debug_options; + } +}; + +TEST_F(ReductionVectorizationNoOptTest, MultiOutputStore) { + const char* hlo_text = R"( +HloModule MultiOutputStore + +%add_f32 { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add = f32[] add(%x, %y) +} + +%fused_computation { + %param_0 = f32[2,384,1024] parameter(0) + %param_1 = f32[2,384] parameter(1) + %constant0 = f32[] constant(0.0009765625) + %broadcast0 = f32[2,384] broadcast(%constant0), dimensions={} + %multiply0 = f32[2,384] multiply(%param_1, %broadcast0) + %broadcast1 = f32[2,384,1024] broadcast(%multiply0), dimensions={0,1} + %subtract = f32[2,384,1024] subtract(%param_0, %broadcast1) + %multiply1 = f32[2,384,1024] multiply(%subtract, %subtract) + %constant1 = f32[] constant(0) + %reduce = f32[2,384] reduce(%multiply1, %constant1), dimensions={2}, to_apply=%add_f32 + ROOT %tuple = (f32[2,384], f32[2,384,1024], f32[2,384,1024]) tuple(%reduce, %subtract, %broadcast1) +} + +ENTRY %cluster { + %param0 = f32[2,384,1024] parameter(0) + %param1 = f32[2,384] parameter(1) + ROOT %fusion = (f32[2,384], f32[2,384,1024], f32[2,384,1024]) fusion(%param0, %param1), kind=kInput, calls=%fused_computation +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + CompileAndOptionallyVerifyPtx(std::move(optimized_module), + R"( +CHECK: ld.global.nc.v2.f32 +CHECK: st.global.v2.f32 +CHECK: st.global.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: st.global.v2.f32 +CHECK: st.global.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: st.global.v2.f32 +CHECK: st.global.v2.f32 +CHECK: ld.global.nc.v2.f32 +CHECK: st.global.v2.f32 +CHECK: st.global.v2.f32 +)"); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + } // namespace } // namespace gpu } // namespace xla From 502c51cc351b9ea9f8e5a18141be8ce83679f791 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Tue, 14 Apr 2020 07:49:40 -0700 Subject: [PATCH 52/52] Fix test on some GPUs/env combination. --- .../xla/service/gpu/tests/reduction_vectorization_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc index 040025ee1a2..4f8ada9ef93 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -176,7 +176,6 @@ CHECK: ld.global.nc.f32 CHECK: ld.global.nc.f32 CHECK: ld.global.nc.f32 CHECK: ld.global.nc.f32 -CHECK: ld.global.nc.f32 )"; } CompileAndOptionallyVerifyPtx(std::move(optimized_module), expected_ptx);
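Taken together, the series changes the inner tile loop so that, in the vectorized StridedLinearIndexingX mode, a thread owns x_num_steps/vector_size groups of vector_size consecutive elements rather than a purely strided sequence of single elements; the consecutive groups are what let LLVM and ptxas emit ld.global.nc.v2 and st.global.v2 accesses. A standalone C++ sketch of the resulting per-thread element offsets (parameter values are illustrative):

#include <cstdint>
#include <cstdio>
#include <vector>

// Enumerates the in-tile offsets visited by one thread under the vectorized
// strided indexing, following UnrollInnerTileLoop:
//   start_offset_x = thread_id_x * vector_size
//   offset         = start_offset_x + j * step_x * vector_size + i
// with step_x = num_threads_x and j/i the group and in-group counters.
std::vector<int64_t> ThreadElementOffsets(int64_t thread_id_x,
                                          int64_t x_num_steps,
                                          int64_t num_threads_x,
                                          int64_t vector_size) {
  std::vector<int64_t> offsets;
  const int64_t step_x = num_threads_x;
  const int64_t start_offset_x = thread_id_x * vector_size;
  for (int64_t j = 0; j < x_num_steps / vector_size; ++j) {
    for (int64_t i = 0; i < vector_size; ++i) {
      offsets.push_back(start_offset_x + j * step_x * vector_size + i);
    }
  }
  return offsets;
}

int main() {
  // 4 threads, 4 steps each, vector size 2 covers a 16-element tile:
  // thread 0 -> 0 1 8 9, thread 1 -> 2 3 10 11, thread 2 -> 4 5 12 13, ...
  for (int64_t t = 0; t < 4; ++t) {
    std::printf("thread %lld:", (long long)t);
    for (int64_t off : ThreadElementOffsets(t, /*x_num_steps=*/4,
                                            /*num_threads_x=*/4,
                                            /*vector_size=*/2)) {
      std::printf(" %lld", (long long)off);
    }
    std::printf("\n");
  }
  return 0;
}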