Merge pull request #31259 from yongfeng-nv:xla-modify-column-reduction-threshold

PiperOrigin-RevId: 263012404
2019-08-12 15:56:41 -07:00 · 2019-08-12 15:56:41 -07:00 · 7685639e2f
commit 7685639e2f
parent 0157067029 cc55b7f346
3 changed files with 68 additions and 2 deletions
--- a/tensorflow/compiler/tests/image_ops_test.py
+++ b/tensorflow/compiler/tests/image_ops_test.py
@ -514,6 +514,27 @@ class ResizeNearestNeighborTest(xla_test.XLATestCase):
                           [7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9]],
                          dtype=np.float32))
  def testAlignCorners3x3To12x12_uint8(self):
    # TODO(b/72099414): enable the test for TPU when the issue is fixed.
    if (self.device not in ["XLA_GPU", "XLA_CPU"]):
      return
    # Ensure that resize with convolution works on XLA/GPU for integer types
    self._assertForwardOpMatchesExpected(
        np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.uint8), [12, 12],
        expected=np.array([[1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3],
                           [1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3],
                           [1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3],
                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
                           [7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9],
                           [7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9],
                           [7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9]],
                          dtype=np.uint8))
 class ResizeBilinearTest(parameterized.TestCase, xla_test.XLATestCase):
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@ -220,9 +220,9 @@ bool IsReductionFromOrToContiguousDimensions(const HloInstruction& reduce) {
  }
  // For column reduction, the tile block is tize_size_y x tile_size_x, and we
-  // are reducing along tile_size_y. Both tile_size_x and tile_size_y need to be
+  // are reducing along tile_size_y. Only tile_size_y needs to be
  // large enough to make the tiling implementation efficient.
-  return dims_in_elem[2] >= kWarpSize && dims_in_elem[1] >= kWarpSize;
+  return dims_in_elem[1] >= kWarpSize;
 }
 std::pair<bool, DimensionVector> GetReductionKindAndContiguousComponents(
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
@ -536,6 +536,51 @@ TEST_F(GpuKernelTilingTest,
  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.001}));
 }
 TEST_F(GpuKernelTilingTest, ColumnReductionSmallTileSizeX) {
  const char *const kHloString = R"(
  HloModule Test
  scalar_add_computation.1 {
    scalar_lhs.1 = f32[] parameter(0)
    scalar_rhs.1 = f32[] parameter(1)
    ROOT add.6 = f32[] add(scalar_lhs.1, scalar_rhs.1)
  }
  ENTRY Test {
    param_3.241 = f16[512,2,9,9]{1,3,2,0} parameter(3)
    constant_661 = f16[] constant(0)
    broadcast.695 = f16[512,2,9,9]{1,3,2,0} broadcast(constant_661), dimensions={}
    compare.42 = pred[512,2,9,9]{1,3,2,0} compare(param_3.241, broadcast.695), direction=GT
    param_2.401 = f16[512,2,9,9]{1,3,2,0} parameter(2)
    select.40 = f16[512,2,9,9]{1,3,2,0} select(compare.42, param_2.401, broadcast.695)
    convert.196 = f32[512,2,9,9]{1,3,2,0} convert(select.40)
    param_1.809 = f16[512,2,9,9]{1,3,2,0} parameter(1)
    copy.335 = f16[512,2,9,9]{1,3,2,0} copy(param_1.809)
    convert.218 = f32[512,2,9,9]{1,3,2,0} convert(copy.335)
    param_0.668 = f32[2]{0} parameter(0)
    broadcast.687 = f32[512,2,9,9]{1,3,2,0} broadcast(param_0.668), dimensions={1}
    subtract.136 = f32[512,2,9,9]{1,3,2,0} subtract(convert.218, broadcast.687)
    multiply.579 = f32[512,2,9,9]{1,3,2,0} multiply(convert.196, subtract.136)
    constant_485 = f32[] constant(0)
    reduce.139 = f32[2]{0} reduce(multiply.579, constant_485), dimensions={0,2,3}, to_apply=scalar_add_computation.1
    reduce.140.clone.1 = f32[2]{0} reduce(convert.196, constant_485), dimensions={0,2,3}, to_apply=scalar_add_computation.1
    ROOT tuple.102 = (f32[2]{0}, f32[2]{0}) tuple(reduce.139, reduce.140.clone.1)
  })";
  // Check that no loop is generated for reduction.
  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
          .ValueOrDie();
  CompileAndVerifyIr(std::move(hlo_module),
                     R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK-NOT: reduce.0.loop_header
 ; CHECK: }
 )",
                     /*match_optimized_ir=*/true);
  // Check that the kernel runs correctly.
  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
 }
 TEST_F(GpuKernelTilingTest, RowReductionWithSmallDimensionNotTiled) {
  const char *const kHloString = R"(
    HloModule reduction