From 46644c6c58ab585b6796631f670cca3cd4aaa839 Mon Sep 17 00:00:00 2001
From: jerryyin <zhuoryin@amd.com>
Date: Tue, 28 Jan 2020 02:19:09 -0800
Subject: [PATCH] PR #36187: [ROCm][XLA:GPU] Fixing Atomic CAS codegen in
 ir_emitter

Imported from GitHub PR https://github.com/tensorflow/tensorflow/pull/36187

This is a follow-up to #35881 and #36110. This CL introduce changes in XLA ir emission logic to fix atomic CAS code-gen in `AMDGPU` side.

- Line 321-324 make sure that GPU kernel variables are allocated (in private register files) at function entry, and made available before execution.
- Line 347-359 make sure `cas_new_out_address` is cast to default address space.

@cheshire @whchung
Copybara import of the project:

--
f6cddc4c6142a03c0f0f836850f02759d8208801 by jerryyin <zhuoryin@amd.com>:

Fixing Atomic CAS codegen in ir_emitter

COPYBARA_INTEGRATE_REVIEW=https://github.com/tensorflow/tensorflow/pull/36187 from ROCmSoftwarePlatform:google-upstream-pr-sync-atomic_cas f6cddc4c6142a03c0f0f836850f02759d8208801
PiperOrigin-RevId: 291892668
Change-Id: I9553727ba477f255e6634d0363284901579d3cde
---
 .../compiler/xla/service/gpu/ir_emitter.cc    | 25 ++++++++++++-------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index fcf8b4b4e9d..011eb07d3bd 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -336,7 +336,6 @@ Status IrEmitter::EmitAtomicOperationUsingCAS(const HloComputation& computation,
   // element_type is the data type for the binary operation.
   llvm::Type* element_type = output_address_type->getPointerElementType();
   int element_size = llvm_ir::GetSizeInBits(element_type);
-  llvm::Type* element_address_type = element_type->getPointerTo();
 
   int atomic_size = (element_size < 32) ? 32 : element_size;
   llvm::Type* atomic_type = b_.getIntNTy(atomic_size);
@@ -346,10 +345,10 @@ Status IrEmitter::EmitAtomicOperationUsingCAS(const HloComputation& computation,
   // cas_old_output_address and cas_new_output_address point to the scratch
   // memory where we store the old and new values for the repeated atomicCAS
   // operations.
-  llvm::Value* cas_old_output_address =
-      Alloca(atomic_type, /*ArraySize=*/nullptr, "cas_old_output_address");
-  llvm::Value* cas_new_output_address =
-      Alloca(atomic_type, /*ArraySize=*/nullptr, "cas_new_output_address");
+  llvm::Value* cas_old_output_address = llvm_ir::EmitAllocaAtFunctionEntry(
+      atomic_type, "cas_old_output_address", &b_);
+  llvm::Value* cas_new_output_address = llvm_ir::EmitAllocaAtFunctionEntry(
+      atomic_type, "cas_new_output_address", &b_);
 
   // Emit preparation code to the preheader.
   llvm::BasicBlock* loop_preheader_bb = b_.GetInsertBlock();
@@ -372,11 +371,19 @@ Status IrEmitter::EmitAtomicOperationUsingCAS(const HloComputation& computation,
         IntToPtr(atomic_memory_address, atomic_address_type);
     binop_output_address =
         Add(PtrToInt(cas_new_output_address, address_int_type), offset);
-    binop_output_address = IntToPtr(binop_output_address, element_address_type);
+    binop_output_address = IntToPtr(
+        binop_output_address,
+        llvm::PointerType::get(
+            element_type,
+            cas_new_output_address->getType()->getPointerAddressSpace()));
   } else {
-    atomic_memory_address = BitCast(output_address, atomic_address_type);
-    binop_output_address =
-        BitCast(cas_new_output_address, element_address_type);
+    atomic_memory_address = b_.CreatePointerBitCastOrAddrSpaceCast(
+        output_address, atomic_address_type);
+    binop_output_address = b_.CreatePointerBitCastOrAddrSpaceCast(
+        cas_new_output_address,
+        llvm::PointerType::get(
+            element_type,
+            cas_new_output_address->getType()->getPointerAddressSpace()));
   }
 
   // Use the value from the memory that atomicCAS operates on to initialize