diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py
index fb4b2711905..5889a011296 100644
--- a/tensorflow/compiler/tests/image_ops_test.py
+++ b/tensorflow/compiler/tests/image_ops_test.py
@@ -514,6 +514,27 @@ class ResizeNearestNeighborTest(xla_test.XLATestCase):
                          [7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9]],
                         dtype=np.float32))
 
+  def testAlignCorners3x3To12x12_uint8(self):
+    # TODO(b/72099414): enable the test for TPU when the issue is fixed.
+    if (self.device not in ["XLA_GPU", "XLA_CPU"]):
+      return
+    # Ensure that resize with convolution works on XLA/GPU for integer types
+    self._assertForwardOpMatchesExpected(
+        np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.uint8), [12, 12],
+        expected=np.array([[1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3],
+                           [1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3],
+                           [1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3],
+                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
+                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
+                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
+                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
+                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
+                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
+                           [7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9],
+                           [7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9],
+                           [7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9]],
+                          dtype=np.uint8))
+
 
 
 class ResizeBilinearTest(parameterized.TestCase, xla_test.XLATestCase):
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index 404d3347772..78f8e22a857 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -220,9 +220,9 @@ bool IsReductionFromOrToContiguousDimensions(const HloInstruction& reduce) {
   }
 
   // For column reduction, the tile block is tile_size_y x tile_size_x, and we
-  // are reducing along tile_size_y. Both tile_size_x and tile_size_y need to be
+  // are reducing along tile_size_y. Only tile_size_y needs to be
   // large enough to make the tiling implementation efficient.
-  return dims_in_elem[2] >= kWarpSize && dims_in_elem[1] >= kWarpSize;
+  return dims_in_elem[1] >= kWarpSize;
 }
 
 std::pair GetReductionKindAndContiguousComponents(
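Note on the ir_emission_utils.cc change above: the tiled column-reduction path is now chosen whenever the reduced dimension reaches a full warp, even if the kept minor dimension is small. The following standalone C++ sketch (not XLA code) illustrates the old versus new predicate; the names OldCheck/NewCheck, the assumed dims_in_elem ordering of {major kept, reduced, minor kept}, and the mapping of the f16[512,2,9,9]{1,3,2,0} reduction from the new test below to {1, 41472, 2} are illustrative assumptions, not taken from the patch.

// Minimal sketch of the relaxed column-reduction condition (assumptions noted above).
#include <array>
#include <cstdint>
#include <iostream>

constexpr int64_t kWarpSize = 32;

// Old check: both the reduced dimension and the kept minor dimension had to
// reach a full warp before the tiled column-reduction emitter was used.
bool OldCheck(const std::array<int64_t, 3>& dims_in_elem) {
  return dims_in_elem[2] >= kWarpSize && dims_in_elem[1] >= kWarpSize;
}

// New check: only the reduced dimension (the tile_size_y direction) must be
// large; a narrow kept minor dimension no longer forces the loop fallback.
bool NewCheck(const std::array<int64_t, 3>& dims_in_elem) {
  return dims_in_elem[1] >= kWarpSize;
}

int main() {
  // Assumed mapping for the f16[512,2,9,9]{1,3,2,0} reduce over dimensions
  // {0,2,3}: 512*9*9 = 41472 reduced elements, kept minor dimension of 2.
  std::array<int64_t, 3> dims = {1, 512 * 9 * 9, 2};
  std::cout << std::boolalpha << "old: " << OldCheck(dims)   // false
            << " new: " << NewCheck(dims) << "\n";           // true
  return 0;
}

Under these assumptions the old predicate rejected such shapes (kept minor dimension 2 < 32) and fell back to a loop-based reduction, which is what the CHECK-NOT on reduce.0.loop_header in the test below guards against.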
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
index d722973e55e..92bb84065a2 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
@@ -536,6 +536,51 @@ TEST_F(GpuKernelTilingTest,
   EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.001}));
 }
 
+TEST_F(GpuKernelTilingTest, ColumnReductionSmallTileSizeX) {
+  const char *const kHloString = R"(
+  HloModule Test
+
+  scalar_add_computation.1 {
+    scalar_lhs.1 = f32[] parameter(0)
+    scalar_rhs.1 = f32[] parameter(1)
+    ROOT add.6 = f32[] add(scalar_lhs.1, scalar_rhs.1)
+  }
+  ENTRY Test {
+    param_3.241 = f16[512,2,9,9]{1,3,2,0} parameter(3)
+    constant_661 = f16[] constant(0)
+    broadcast.695 = f16[512,2,9,9]{1,3,2,0} broadcast(constant_661), dimensions={}
+    compare.42 = pred[512,2,9,9]{1,3,2,0} compare(param_3.241, broadcast.695), direction=GT
+    param_2.401 = f16[512,2,9,9]{1,3,2,0} parameter(2)
+    select.40 = f16[512,2,9,9]{1,3,2,0} select(compare.42, param_2.401, broadcast.695)
+    convert.196 = f32[512,2,9,9]{1,3,2,0} convert(select.40)
+    param_1.809 = f16[512,2,9,9]{1,3,2,0} parameter(1)
+    copy.335 = f16[512,2,9,9]{1,3,2,0} copy(param_1.809)
+    convert.218 = f32[512,2,9,9]{1,3,2,0} convert(copy.335)
+    param_0.668 = f32[2]{0} parameter(0)
+    broadcast.687 = f32[512,2,9,9]{1,3,2,0} broadcast(param_0.668), dimensions={1}
+    subtract.136 = f32[512,2,9,9]{1,3,2,0} subtract(convert.218, broadcast.687)
+    multiply.579 = f32[512,2,9,9]{1,3,2,0} multiply(convert.196, subtract.136)
+    constant_485 = f32[] constant(0)
+    reduce.139 = f32[2]{0} reduce(multiply.579, constant_485), dimensions={0,2,3}, to_apply=scalar_add_computation.1
+    reduce.140.clone.1 = f32[2]{0} reduce(convert.196, constant_485), dimensions={0,2,3}, to_apply=scalar_add_computation.1
+    ROOT tuple.102 = (f32[2]{0}, f32[2]{0}) tuple(reduce.139, reduce.140.clone.1)
+  })";
+
+  // Check that no loop is generated for reduction.
+  auto hlo_module =
+      ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
+          .ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @fusion
+; CHECK-NOT: reduce.0.loop_header
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+  // Check that the kernel runs correctly.
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
+}
+
 TEST_F(GpuKernelTilingTest, RowReductionWithSmallDimensionNotTiled) {
   const char *const kHloString = R"(
     HloModule reduction