Make gpu_ftz_test ready for CUDA 11.

CUDA 11 changed the libdevice implementation of expf: it now uses a single ex2.approx.ftz instruction where it previously used two ex2.approx instructions. Change the CHECK directives to handle both cases. PiperOrigin-RevId: 317698875 Change-Id: Idd7b28c77427f299b80fa1b7f4b9be8c7881f963
2020-06-22 11:32:54 -07:00 · 2020-06-22 11:32:54 -07:00 · 8785b4f5b8
commit 8785b4f5b8
parent 18569dde74
1 changed files with 7 additions and 10 deletions
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
@@ -92,26 +92,23 @@ TEST_F(GpuFtzDisabledTest, MultiplyFtz) {
 }

 // In NVPTX, exp(float) is implemented in libdevice, and consults __nvvm_reflect
-// to determine whether or not ftz is enabled.  The implementation uses two
-// calls to ex2.approx.  When ftz is on, we get two calls to the ftz version;
-// when ftz is off, we get one call to the ftz version and one call to the
-// regular version.
+// to determine whether or not ftz is enabled.
+// The implementation in CUDA 11 uses one ex2.approx.ftz, irrespective of ftz
+// being enabled or not. In previous CUDA versions, there is a leading
+// ex2.approx that does obey the ftz setting.
+// Rather than pattern-matching implementation details, it might be better to
+// value-test the actual result. TODO(csigg): change to value-test.
 TEST_F(GpuFtzEnabledTest, ExpFtz) {
  CompileAndOptionallyVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"(
    CHECK-NOT: ex2.approx.f32
    CHECK:     ex2.approx.ftz.f32
    CHECK-NOT: ex2.approx.f32
-    CHECK:     ex2.approx.ftz.f32
-    CHECK-NOT: ex2.approx.f32
-    CHECK-NOT: ex2.approx.ftz.f32
  )");
 }

 TEST_F(GpuFtzDisabledTest, ExpFtz) {
  CompileAndOptionallyVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"(
-    CHECK-NOT: ex2.approx.f32
-    CHECK-DAG: ex2.approx.ftz.f32
-    CHECK-DAG: ex2.approx.f32
+    CHECK:     ex2.approx.ftz.f32
    CHECK-NOT: ex2.approx.f32
    CHECK-NOT: ex2.approx.ftz.f32
  )");