Merge pull request #29093 from ROCmSoftwarePlatform:rocm_kernel
PiperOrigin-RevId: 251388968
This commit is contained in:
commit
0b1f96ed96
@ -238,18 +238,11 @@ llvm::Function* IrEmitterUnnested::BuildKernelPrototype(
|
||||
}
|
||||
}
|
||||
|
||||
AnnotateFunctionAsGpuKernel(module, kernel, &b_);
|
||||
|
||||
// TODO(b/65380986): Investigate if adding fast math flags for generated
|
||||
// kernels makes sense.
|
||||
|
||||
// Add the declaration of this kernel to llvm.nvvm.annotations so that NVPTX
|
||||
// treats it as a CUDA kernel.
|
||||
llvm::NamedMDNode* nvvm_annotations_node =
|
||||
module->getOrInsertNamedMetadata("nvvm.annotations");
|
||||
nvvm_annotations_node->addOperand(llvm::MDNode::get(
|
||||
context, {llvm::ConstantAsMetadata::get(kernel),
|
||||
llvm::MDString::get(context, "kernel"),
|
||||
llvm::ConstantAsMetadata::get(b_.getInt32(1))}));
|
||||
|
||||
// Update the insert point to the entry basic block.
|
||||
llvm::BasicBlock* entry_bb =
|
||||
llvm::BasicBlock::Create(context, /*Name=*/"entry", /*Parent=*/kernel);
|
||||
|
||||
@ -76,8 +76,7 @@ llvm::CallInst* EmitCallToTargetIntrinsic(
|
||||
llvm::Triple target_triple = llvm::Triple(module->getTargetTriple());
|
||||
llvm::Intrinsic::ID llvm_intrinsic_id = llvm::Intrinsic::not_intrinsic;
|
||||
|
||||
if ((target_triple.getArch() == llvm::Triple::nvptx) ||
|
||||
(target_triple.getArch() == llvm::Triple::nvptx64)) {
|
||||
if (target_triple.isNVPTX()) {
|
||||
llvm_intrinsic_id = gpu_intrinsic_id.nvptx_intrinsic;
|
||||
} else if (target_triple.getArch() == llvm::Triple::amdgcn) {
|
||||
llvm_intrinsic_id = gpu_intrinsic_id.amdgpu_intrinsic;
|
||||
@ -90,5 +89,28 @@ llvm::CallInst* EmitCallToTargetIntrinsic(
|
||||
return b->CreateCall(intrinsic, llvm_ir::AsArrayRef(operands));
|
||||
}
|
||||
|
||||
void AnnotateFunctionAsGpuKernel(llvm::Module* module, llvm::Function* func,
                                 llvm::IRBuilder<>* b) {
  // Marks `func` as a GPU kernel entry point, using whichever mechanism the
  // target encoded in `module`'s triple requires. LOG(FATAL)s on any triple
  // that is neither NVPTX nor AMDGCN.
  const llvm::Triple target_triple(module->getTargetTriple());
  if (target_triple.isNVPTX()) {
    // NVPTX: add the function to the module-level "nvvm.annotations" named
    // metadata so the backend treats it as a CUDA kernel.
    llvm::LLVMContext& context = module->getContext();
    llvm::Metadata* md_operands[] = {
        llvm::ConstantAsMetadata::get(func),
        llvm::MDString::get(context, "kernel"),
        llvm::ConstantAsMetadata::get(b->getInt32(1)),
    };
    module->getOrInsertNamedMetadata("nvvm.annotations")
        ->addOperand(llvm::MDNode::get(context, md_operands));
  } else if (target_triple.getArch() == llvm::Triple::amdgcn) {
    // AMDGPU: the dedicated calling convention plus the flat work-group-size
    // attribute mark the function as an AMDGPU kernel.
    func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
    func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
  } else {
    LOG(FATAL) << "Invalid triple " << target_triple.str();
  }
}
|
||||
|
||||
} // namespace gpu
|
||||
} // namespace xla
|
||||
|
||||
@ -49,6 +49,10 @@ llvm::CallInst* EmitCallToTargetIntrinsic(
|
||||
TargetIntrinsicID intrinsic_id, absl::Span<llvm::Value* const> operands,
|
||||
absl::Span<llvm::Type* const> overloaded_types, llvm::IRBuilder<>* b);
|
||||
|
||||
// Annotates `func` as a GPU kernel entry point according to the GPU target
// (NVPTX or AMDGPU) named in `module`'s target triple.
void AnnotateFunctionAsGpuKernel(llvm::Module* module, llvm::Function* func,
                                 llvm::IRBuilder<>* b);
|
||||
|
||||
} // namespace gpu
|
||||
} // namespace xla
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user