From fcad65986c5d0550a7ef6bf02fa9b8c13b277de9 Mon Sep 17 00:00:00 2001
From: George Karpenkov
Date: Thu, 23 Jan 2020 08:50:16 -0800
Subject: [PATCH] [XLA/GPU] [NFC] Introduce a function for emitting a
 __syncthreads() call

PiperOrigin-RevId: 291175258
Change-Id: Ibefbbbd6b6b4ec6b38079a94907640cac182549c
---
 .../xla/service/gpu/ir_emitter_unnested.cc         | 14 +++++++++-----
 .../compiler/xla/service/gpu/ir_emitter_unnested.h |  7 +++++++
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 64b84195cd5..6c1fb50d00b 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -2440,12 +2440,12 @@ IrArray::Index IrEmitterUnnested::EmitTilingKernel(
        ++i) {
     int64 tile_size_for_dim = mapping_scheme.GetTileSizeFor(i);
     // Only last row or column may not have full size.
-    llvm::Value* is_last_row =
+    llvm::Value* is_last =
         b_.CreateICmpEQ(tile_index[i], constant(dims_in_blocks[i] - 1));
-    int64 partial_row_size =
+    int64 partial_row =
         dims_in_elems[i] - (dims_in_blocks[i] - 1) * tile_size_for_dim;
     output_tile_bounds[i] =
-        b_.CreateSelect(is_last_row, constant(partial_row_size),
+        b_.CreateSelect(is_last, constant(partial_row),
                         constant(tile_size_for_dim), "tile_bound");
   }
   IrArray::Index tile_origin =
@@ -2484,6 +2484,10 @@ IrArray::Index IrEmitterUnnested::EmitTilingKernel(
   return GetElementIndexForTileOrigin(starting_tile, mapping_scheme, &b_);
 }
 
+llvm::CallInst* IrEmitterUnnested::EmitSyncThreads() {
+  return EmitCallToTargetIntrinsic(TargetIntrinsicID::kBarrierId, {}, {}, &b_);
+}
+
 // Emits a kernel for the given hlo instruction using a tiled 0-2-1 transpose
 // algorithm to improve the memory access patterns for the input parameters
 // with a shape that is a 0-2-1 transpose of the output tensor shape. The caller
@@ -2617,7 +2621,7 @@ void IrEmitterUnnested::EmitHlo021Tile(
 
           // Wait for all threads to reach this point using `__syncthreads` in
           // CUDA.
-          EmitCallToTargetIntrinsic(TargetIntrinsicID::kBarrierId, {}, {}, &b_);
+          EmitSyncThreads();
         }
 
         EmitTile(mapping_scheme, index, loop_name, ksl, &b_, y, x, tile_height,
@@ -2629,7 +2633,7 @@ void IrEmitterUnnested::EmitHlo021Tile(
         // memory buffer for the current tile before we move on to process the
         // next tile and overwrite the shared memory buffers.
         if (block_contains_multi_tiles && !tiled_param_ids.empty()) {
-          EmitCallToTargetIntrinsic(TargetIntrinsicID::kBarrierId, {}, {}, &b_);
+          EmitSyncThreads();
         }
       };
 
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index 0fd15dd3064..aaa4510d0c9 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -335,9 +335,16 @@ class IrEmitterUnnested : public IrEmitter,
     // Lane id: `thread_id % kWarpSize`
     llvm::Value* lane_id;
   };
+
+  // Emits the LLVM values for thread_id, thread_id.x, thread_id.y and lane id.
+  //
+  // Returns a struct containing these values.
   ThreadIdInfo EmitThreadIdInfo(int64 threads_per_block, llvm::Type* index_ty,
                                 int64 num_threads_x);
 
+  // Emits __syncthreads(), a synchronization barrier for all threads in a block.
+  llvm::CallInst* EmitSyncThreads();
+
   // Emits current block id.
   llvm::Value* EmitBlockId();
 
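
For orientation, below is a minimal CUDA analogue of the shared-memory tiled
0-2-1 transpose pattern that EmitHlo021Tile generates, showing what the two
EmitSyncThreads() call sites in this patch correspond to. The kernel name,
TILE_DIM, and element type are illustrative assumptions; the emitter itself
produces LLVM IR through TargetIntrinsicID::kBarrierId rather than CUDA C++.

// Illustrative sketch only; not the emitter's actual output.
constexpr int TILE_DIM = 32;

__global__ void Transpose021(const float* in, float* out, int rows, int cols) {
  // Padding the minor dimension by 1 avoids shared-memory bank conflicts
  // when the tile is read back column-wise.
  __shared__ float tile[TILE_DIM][TILE_DIM + 1];

  int x = blockIdx.x * TILE_DIM + threadIdx.x;  // column in `in`
  int y = blockIdx.y * TILE_DIM + threadIdx.y;  // row in `in`
  if (x < cols && y < rows) {
    tile[threadIdx.y][threadIdx.x] = in[y * cols + x];
  }

  // First barrier (first EmitSyncThreads() call site): the whole tile must be
  // in shared memory before any thread reads it back transposed.
  __syncthreads();

  x = blockIdx.y * TILE_DIM + threadIdx.x;  // column in `out`
  y = blockIdx.x * TILE_DIM + threadIdx.y;  // row in `out`
  if (x < rows && y < cols) {
    out[y * rows + x] = tile[threadIdx.x][threadIdx.y];
  }

  // Second barrier (second EmitSyncThreads() call site): needed when a block
  // loops over several tiles (`block_contains_multi_tiles` in the patch), so
  // that no thread overwrites the shared buffer while another thread is still
  // reading the previous tile.
  __syncthreads();
}

Wrapping the intrinsic call in EmitSyncThreads() keeps the barrier's intent
visible at each call site without repeating the intrinsic plumbing, which is
the whole point of this NFC change.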