Merge pull request #31259 from yongfeng-nv:xla-modify-column-reduction-threshold

PiperOrigin-RevId: 263012404
This commit is contained in:
TensorFlower Gardener 2019-08-12 15:56:41 -07:00
commit 7685639e2f
3 changed files with 68 additions and 2 deletions

View File

@ -514,6 +514,27 @@ class ResizeNearestNeighborTest(xla_test.XLATestCase):
[7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9]],
dtype=np.float32))
def testAlignCorners3x3To12x12_uint8(self):
# TODO(b/72099414): enable the test for TPU when the issue is fixed.
if (self.device not in ["XLA_GPU", "XLA_CPU"]):
return
# Ensure that resize with convolution works on XLA/GPU for integer types
self._assertForwardOpMatchesExpected(
np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.uint8), [12, 12],
expected=np.array([[1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3],
[1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3],
[1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3],
[4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
[4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
[4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
[4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
[4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
[4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
[7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9],
[7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9],
[7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9]],
dtype=np.uint8))
class ResizeBilinearTest(parameterized.TestCase, xla_test.XLATestCase):

View File

@ -220,9 +220,9 @@ bool IsReductionFromOrToContiguousDimensions(const HloInstruction& reduce) {
}
// For column reduction, the tile block is tize_size_y x tile_size_x, and we
// are reducing along tile_size_y. Both tile_size_x and tile_size_y need to be
// are reducing along tile_size_y. Only tile_size_y needs to be
// large enough to make the tiling implementation efficient.
return dims_in_elem[2] >= kWarpSize && dims_in_elem[1] >= kWarpSize;
return dims_in_elem[1] >= kWarpSize;
}
std::pair<bool, DimensionVector> GetReductionKindAndContiguousComponents(

View File

@ -536,6 +536,51 @@ TEST_F(GpuKernelTilingTest,
EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.001}));
}
TEST_F(GpuKernelTilingTest, ColumnReductionSmallTileSizeX) {
const char *const kHloString = R"(
HloModule Test
scalar_add_computation.1 {
scalar_lhs.1 = f32[] parameter(0)
scalar_rhs.1 = f32[] parameter(1)
ROOT add.6 = f32[] add(scalar_lhs.1, scalar_rhs.1)
}
ENTRY Test {
param_3.241 = f16[512,2,9,9]{1,3,2,0} parameter(3)
constant_661 = f16[] constant(0)
broadcast.695 = f16[512,2,9,9]{1,3,2,0} broadcast(constant_661), dimensions={}
compare.42 = pred[512,2,9,9]{1,3,2,0} compare(param_3.241, broadcast.695), direction=GT
param_2.401 = f16[512,2,9,9]{1,3,2,0} parameter(2)
select.40 = f16[512,2,9,9]{1,3,2,0} select(compare.42, param_2.401, broadcast.695)
convert.196 = f32[512,2,9,9]{1,3,2,0} convert(select.40)
param_1.809 = f16[512,2,9,9]{1,3,2,0} parameter(1)
copy.335 = f16[512,2,9,9]{1,3,2,0} copy(param_1.809)
convert.218 = f32[512,2,9,9]{1,3,2,0} convert(copy.335)
param_0.668 = f32[2]{0} parameter(0)
broadcast.687 = f32[512,2,9,9]{1,3,2,0} broadcast(param_0.668), dimensions={1}
subtract.136 = f32[512,2,9,9]{1,3,2,0} subtract(convert.218, broadcast.687)
multiply.579 = f32[512,2,9,9]{1,3,2,0} multiply(convert.196, subtract.136)
constant_485 = f32[] constant(0)
reduce.139 = f32[2]{0} reduce(multiply.579, constant_485), dimensions={0,2,3}, to_apply=scalar_add_computation.1
reduce.140.clone.1 = f32[2]{0} reduce(convert.196, constant_485), dimensions={0,2,3}, to_apply=scalar_add_computation.1
ROOT tuple.102 = (f32[2]{0}, f32[2]{0}) tuple(reduce.139, reduce.140.clone.1)
})";
// Check that no loop is generated for reduction.
auto hlo_module =
ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
.ValueOrDie();
CompileAndVerifyIr(std::move(hlo_module),
R"(
; CHECK-LABEL: define void @fusion
; CHECK-NOT: reduce.0.loop_header
; CHECK: }
)",
/*match_optimized_ir=*/true);
// Check that the kernel runs correctly.
EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
}
TEST_F(GpuKernelTilingTest, RowReductionWithSmallDimensionNotTiled) {
const char *const kHloString = R"(
HloModule reduction