Convert 4D convolutions with trivial dimensions to lower-dimensional convolutions.

cuDNN cannot handle convolutions with 4 or more spatial dimensions. However we can work around this in some cases by removing trivial dimensions. PiperOrigin-RevId: 291894511 Change-Id: Ic0e3fa4f4181e105ca62f92a235a55413acd7253
2020-01-28 02:37:03 -08:00 · 2020-01-28 02:37:03 -08:00 · 682cef5c04
commit 682cef5c04
parent e2b70c0f80
6 changed files with 420 additions and 0 deletions
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@ -1767,6 +1767,36 @@ cc_library(
    ],
 )
 cc_library(
    name = "convolution_4d_expander",
    srcs = ["convolution_4d_expander.cc"],
    hdrs = ["convolution_4d_expander.h"],
    deps = [
        ":hlo",
        ":op_expander_pass",
        "//tensorflow/compiler/xla:shape_util",
        "//tensorflow/compiler/xla:status_macros",
        "//tensorflow/compiler/xla:statusor",
        "//tensorflow/compiler/xla:xla_data_proto_cc",
        "@com_google_absl//absl/algorithm:container",
        "@com_google_absl//absl/strings",
    ],
 )
 tf_cc_test(
    name = "convolution_4d_expander_test",
    srcs = ["convolution_4d_expander_test.cc"],
    deps = [
        "convolution_4d_expander",
        ":hlo",
        ":hlo_matchers",
        "//tensorflow/compiler/xla:test",
        "//tensorflow/compiler/xla:types",
        "//tensorflow/compiler/xla/tests:hlo_test_base",
        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
    ],
 )
 tf_cc_test(
    name = "batchnorm_expander_test",
    size = "small",
--- a/tensorflow/compiler/xla/service/convolution_4d_expander.cc
+++ b/tensorflow/compiler/xla/service/convolution_4d_expander.cc
@ -0,0 +1,175 @@
 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/xla/service/convolution_4d_expander.h"
 #include <algorithm>
 #include <functional>
 #include <vector>
 #include "absl/algorithm/container.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 namespace xla {
 bool Convolution4DExpander::InstructionMatchesPattern(
    HloInstruction* instruction) {
  if (instruction->opcode() != HloOpcode::kConvolution) {
    return false;
  }
  // Check whether it is a 4D convolution and whether there is at least one
  // trivial dimension.
  const ConvolutionDimensionNumbers& dim_nums =
      instruction->convolution_dimension_numbers();
  if (dim_nums.input_spatial_dimensions().size() != 4) {
    return false;
  }
  Shape input = instruction->operand(0)->shape();
  for (int64 i = 0; i < dim_nums.input_spatial_dimensions().size(); ++i) {
    int64 spatial_dim = dim_nums.input_spatial_dimensions(i);
    if (input.dimensions(spatial_dim) == 1 &&
        instruction->window().dimensions(i).padding_low() == 0 &&
        instruction->window().dimensions(i).padding_high() == 0) {
      return true;
    }
  }
  return false;
 }
 StatusOr<HloInstruction*> Convolution4DExpander::ExpandInstruction(
    HloInstruction* instruction) {
  HloComputation* computation = instruction->parent();
  ConvolutionDimensionNumbers dim_nums =
      instruction->convolution_dimension_numbers();
  ConvolutionDimensionNumbers new_dim_nums = dim_nums;
  std::vector<int64> removed_input_dimensions;
  std::vector<int64> removed_kernel_dimensions;
  std::vector<int64> removed_output_dimensions;
  new_dim_nums.clear_input_spatial_dimensions();
  new_dim_nums.clear_output_spatial_dimensions();
  new_dim_nums.clear_kernel_spatial_dimensions();
  Window new_window;
  HloInstruction* input = instruction->mutable_operand(0);
  // Collect all trivial input spatial dimensions, and the corresponding
  // dimensions of the kernel and the output. Those will be removed.
  for (int64 i = 0; i < dim_nums.input_spatial_dimensions().size(); ++i) {
    int64 input_spatial_dim = dim_nums.input_spatial_dimensions(i);
    int64 output_spatial_dim = dim_nums.output_spatial_dimensions(i);
    int64 kernel_spatial_dim = dim_nums.kernel_spatial_dimensions(i);
    if (input->shape().dimensions(input_spatial_dim) == 1 &&
        instruction->window().dimensions(i).padding_low() == 0 &&
        instruction->window().dimensions(i).padding_high() == 0) {
      removed_input_dimensions.push_back(input_spatial_dim);
      removed_output_dimensions.push_back(output_spatial_dim);
      removed_kernel_dimensions.push_back(kernel_spatial_dim);
    } else {
      *new_window.add_dimensions() = instruction->window().dimensions(i);
      new_dim_nums.add_input_spatial_dimensions(input_spatial_dim);
      new_dim_nums.add_output_spatial_dimensions(output_spatial_dim);
      new_dim_nums.add_kernel_spatial_dimensions(kernel_spatial_dim);
    }
  }
  // We sort the removed dimensions into descending order, because we need to
  // delete higher dimensions first, otherwise we would have to adjust dimension
  // indices.
  std::sort(removed_input_dimensions.begin(), removed_input_dimensions.end(),
            std::greater<>());
  std::sort(removed_output_dimensions.begin(), removed_output_dimensions.end(),
            std::greater<>());
  std::sort(removed_kernel_dimensions.begin(), removed_kernel_dimensions.end(),
            std::greater<>());
  // Compute the new shapes.
  Shape new_input_shape = input->shape();
  for (int64 dim : removed_input_dimensions) {
    new_input_shape.DeleteDimension(dim);
  }
  HloInstruction* kernel = instruction->mutable_operand(1);
  Shape new_kernel_shape = kernel->shape();
  for (int64 dim : removed_kernel_dimensions) {
    new_kernel_shape.DeleteDimension(dim);
  }
  Shape new_output_shape = instruction->shape();
  for (int64 dim : removed_output_dimensions) {
    new_output_shape.DeleteDimension(dim);
  }
  // Relabel the dimension numbers to account for the deleted dimensions. For
  // each dimension number, we need to reduce its value by the number of removed
  // smaller dimensions.
  auto compute_new_dimension = [](const std::vector<int64>& removed_dimensions,
                                  int64 old_dimension) {
    int64 num_smaller = absl::c_count_if(
        removed_dimensions, [old_dimension](int64 removed_dimension) {
          return removed_dimension < old_dimension;
        });
    return old_dimension - num_smaller;
  };
  new_dim_nums.set_input_batch_dimension(compute_new_dimension(
      removed_input_dimensions, new_dim_nums.input_batch_dimension()));
  new_dim_nums.set_input_feature_dimension(compute_new_dimension(
      removed_input_dimensions, new_dim_nums.input_feature_dimension()));
  for (int64 i = 0; i < new_dim_nums.input_spatial_dimensions().size(); ++i) {
    new_dim_nums.set_input_spatial_dimensions(
        i, compute_new_dimension(removed_input_dimensions,
                                 new_dim_nums.input_spatial_dimensions(i)));
  }
  new_dim_nums.set_output_batch_dimension(compute_new_dimension(
      removed_output_dimensions, new_dim_nums.output_batch_dimension()));
  new_dim_nums.set_output_feature_dimension(compute_new_dimension(
      removed_output_dimensions, new_dim_nums.output_feature_dimension()));
  for (int64 i = 0; i < new_dim_nums.output_spatial_dimensions().size(); ++i) {
    new_dim_nums.set_output_spatial_dimensions(
        i, compute_new_dimension(removed_output_dimensions,
                                 new_dim_nums.output_spatial_dimensions(i)));
  }
  new_dim_nums.set_kernel_input_feature_dimension(
      compute_new_dimension(removed_kernel_dimensions,
                            new_dim_nums.kernel_input_feature_dimension()));
  new_dim_nums.set_kernel_output_feature_dimension(
      compute_new_dimension(removed_kernel_dimensions,
                            new_dim_nums.kernel_output_feature_dimension()));
  for (int64 i = 0; i < new_dim_nums.kernel_spatial_dimensions().size(); ++i) {
    new_dim_nums.set_kernel_spatial_dimensions(
        i, compute_new_dimension(removed_kernel_dimensions,
                                 new_dim_nums.kernel_spatial_dimensions(i)));
  }
  // Reshape the input and the kernel.
  HloInstruction* reshaped_input = computation->AddInstruction(
      HloInstruction::CreateReshape(new_input_shape, input));
  HloInstruction* reshaped_kernel = computation->AddInstruction(
      HloInstruction::CreateReshape(new_kernel_shape, kernel));
  // We want to use CloneWithNewOperands, but that doesn't support substituting
  // the window and the ConvolutionDimensionNumbers. So we set this on the old
  // instruction (which is going to be removed anyway) before cloning it.
  instruction->set_convolution_dimension_numbers(new_dim_nums);
  instruction->set_window(new_window);
  HloInstruction* new_convolution =
      computation->AddInstruction(instruction->CloneWithNewOperands(
          new_output_shape, {reshaped_input, reshaped_kernel}));
  return computation->AddInstruction(
      HloInstruction::CreateReshape(instruction->shape(), new_convolution));
 }
 }  // namespace xla
--- a/tensorflow/compiler/xla/service/convolution_4d_expander.h
+++ b/tensorflow/compiler/xla/service/convolution_4d_expander.h
@ -0,0 +1,39 @@
 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CONVOLUTION_4D_EXPANDER_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_CONVOLUTION_4D_EXPANDER_H_
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/op_expander_pass.h"
 #include "tensorflow/compiler/xla/statusor.h"
 namespace xla {
 class Convolution4DExpander : public OpExpanderPass {
 public:
  absl::string_view name() const override { return "convolution_4d_expander"; }
 protected:
  bool InstructionMatchesPattern(HloInstruction* instruction) override;
  StatusOr<HloInstruction*> ExpandInstruction(
      HloInstruction* instruction) override;
 };
 }  // namespace xla
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CONVOLUTION_4D_EXPANDER_H_
--- a/tensorflow/compiler/xla/service/convolution_4d_expander_test.cc
+++ b/tensorflow/compiler/xla/service/convolution_4d_expander_test.cc
@ -0,0 +1,172 @@
 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/xla/service/convolution_4d_expander.h"
 #include <memory>
 #include <string>
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 namespace xla {
 namespace {
 using Convolution4DExpanderTest = HloTestBase;
 TEST_F(Convolution4DExpanderTest, ConvertTo2DConvolution) {
  string hlo_string = R"(HloModule convolution_4d_fp32
 ENTRY convolution_computation {
  input = f32[1,10,1,10,5,20]{5,4,3,2,1,0} parameter(0)
  kernel = f32[20,1,2,1,4,15]{5,4,3,2,1,0} parameter(1)
  ROOT conv = f32[15,1,9,1,7,5]{5,4,3,2,1,0} convolution(input, kernel), dim_labels=0123bf_i0123o->f0123b, window={size=1x2x1x4}
 })";
  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                          ParseAndReturnVerifiedModule(hlo_string));
  auto computation = module->entry_computation();
  HloInstruction* root = computation->root_instruction();
  EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
  EXPECT_EQ(root->window().dimensions_size(), 4);
  Convolution4DExpander expander_pass;
  ASSERT_TRUE(expander_pass.Run(module.get()).ValueOrDie());
  root = computation->root_instruction();
  EXPECT_EQ(root->opcode(), HloOpcode::kReshape);
  const HloInstruction* new_convolution = root->operand(0);
  // Check that the new convolution has 2 spatial dimensions.
  EXPECT_EQ(new_convolution->opcode(), HloOpcode::kConvolution);
  EXPECT_EQ(new_convolution->window().dimensions_size(), 2);
 }
 TEST_F(Convolution4DExpanderTest, ConvertTo3DConvolution) {
  string hlo_string = R"(HloModule convolution_4d_fp32
 ENTRY convolution_computation {
  input = f32[1,10,1,10,5,20]{5,4,3,2,1,0} parameter(0)
  kernel = f32[20,1,2,1,4,15]{5,4,3,2,1,0} parameter(1)
  ROOT conv = f32[15,1,9,2,7,5]{5,4,3,2,1,0} convolution(input, kernel), dim_labels=0123bf_i0123o->f0123b, window={size=1x2x1x4 pad=0_0x0_0x1_0x0_0}
 })";
  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                          ParseAndReturnVerifiedModule(hlo_string));
  auto computation = module->entry_computation();
  HloInstruction* root = computation->root_instruction();
  EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
  EXPECT_EQ(root->window().dimensions_size(), 4);
  Convolution4DExpander expander_pass;
  ASSERT_TRUE(expander_pass.Run(module.get()).ValueOrDie());
  root = computation->root_instruction();
  EXPECT_EQ(root->opcode(), HloOpcode::kReshape);
  const HloInstruction* new_convolution = root->operand(0);
  // Check that the new convolution has 3 spatial dimensions. Note that although
  // there are 2 input dimensions of size 1, one of them is not trivial because
  // with the low padding the output dimension will be 2.
  EXPECT_EQ(new_convolution->opcode(), HloOpcode::kConvolution);
  EXPECT_EQ(new_convolution->window().dimensions_size(), 3);
 }
 TEST_F(Convolution4DExpanderTest, ConvertTo0DConvolution) {
  string hlo_string = R"(HloModule convolution_4d_fp32
 ENTRY convolution_computation {
  input = f32[1,1,1,1,5,20]{5,4,3,2,1,0} parameter(0)
  kernel = f32[20,1,1,1,1,15]{5,4,3,2,1,0} parameter(1)
  ROOT conv = f32[15,1,1,1,1,5]{5,4,3,2,1,0} convolution(input, kernel), dim_labels=0123bf_i0123o->f0123b, window={size=1x1x1x1}
 })";
  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                          ParseAndReturnVerifiedModule(hlo_string));
  auto computation = module->entry_computation();
  HloInstruction* root = computation->root_instruction();
  EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
  EXPECT_EQ(root->window().dimensions_size(), 4);
  Convolution4DExpander expander_pass;
  ASSERT_TRUE(expander_pass.Run(module.get()).ValueOrDie());
  root = computation->root_instruction();
  EXPECT_EQ(root->opcode(), HloOpcode::kReshape);
  const HloInstruction* new_convolution = root->operand(0);
  // Check that the new convolution has 0 spatial dimensions.
  EXPECT_EQ(new_convolution->opcode(), HloOpcode::kConvolution);
  EXPECT_EQ(new_convolution->window().dimensions_size(), 0);
 }
 TEST_F(Convolution4DExpanderTest, DontConvert3DConvolution) {
  string hlo_string = R"(HloModule convolution_4d_fp32
 ENTRY convolution_computation {
  input = f32[1,1,1,5,20]{4,3,2,1,0} parameter(0)
  kernel = f32[20,1,1,1,15]{4,3,2,1,0} parameter(1)
  ROOT conv = f32[15,1,1,1,5]{4,3,2,1,0} convolution(input, kernel), dim_labels=012bf_i012o->f012b, window={size=1x1x1}
 })";
  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                          ParseAndReturnVerifiedModule(hlo_string));
  auto computation = module->entry_computation();
  HloInstruction* root = computation->root_instruction();
  EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
  EXPECT_EQ(root->window().dimensions_size(), 3);
  Convolution4DExpander expander_pass;
  ASSERT_FALSE(expander_pass.Run(module.get()).ValueOrDie());
 }
 TEST_F(Convolution4DExpanderTest, DontConvertIfNoTrivialDimensionAvailable) {
  string hlo_string = R"(HloModule convolution_4d_fp32
 ENTRY convolution_computation {
  input = f32[2,10,2,10,5,20]{5,4,3,2,1,0} parameter(0)
  kernel = f32[20,2,2,2,4,15]{5,4,3,2,1,0} parameter(1)
  ROOT conv = f32[15,1,9,1,7,5]{5,4,3,2,1,0} convolution(input, kernel), dim_labels=0123bf_i0123o->f0123b, window={size=2x2x2x4}
 })";
  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                          ParseAndReturnVerifiedModule(hlo_string));
  auto computation = module->entry_computation();
  HloInstruction* root = computation->root_instruction();
  EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
  EXPECT_EQ(root->window().dimensions_size(), 4);
  Convolution4DExpander expander_pass;
  ASSERT_FALSE(expander_pass.Run(module.get()).ValueOrDie());
 }
 TEST_F(Convolution4DExpanderTest, DontConvertIfPaddingIsNonzero) {
  string hlo_string = R"(HloModule convolution_4d_fp32
 ENTRY convolution_computation {
  input = f32[1,10,1,10,5,20]{5,4,3,2,1,0} parameter(0)
  kernel = f32[20,1,2,1,4,15]{5,4,3,2,1,0} parameter(1)
  ROOT conv = f32[15,1,9,1,7,5]{5,4,3,2,1,0} convolution(input, kernel), dim_labels=0123bf_i0123o->f0123b, window={size=1x2x1x4 stride=2x1x2x1 pad=1_0x0_0x0_1x0_0}
 })";
  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                          ParseAndReturnVerifiedModule(hlo_string));
  auto computation = module->entry_computation();
  HloInstruction* root = computation->root_instruction();
  EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
  EXPECT_EQ(root->window().dimensions_size(), 4);
  Convolution4DExpander expander_pass;
  // Although we have two spatial input dimensions of size 1, and the
  // corresponding spatial output dimensions are also of size 1, these
  // dimensions are not trivial because they involve lower and/or higher padding
  // plus stride.
  ASSERT_FALSE(expander_pass.Run(module.get()).ValueOrDie());
 }
 }  // namespace
 }  // namespace xla
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@ -1122,6 +1122,7 @@ cc_library(
        "//tensorflow/compiler/xla/service:buffer_assignment",
        "//tensorflow/compiler/xla/service:call_inliner",
        "//tensorflow/compiler/xla/service:conditional_simplifier",
        "//tensorflow/compiler/xla/service:convolution_4d_expander",
        "//tensorflow/compiler/xla/service:convolution_group_converter",
        "//tensorflow/compiler/xla/service:depthwise_convolution_converter",
        "//tensorflow/compiler/xla/service:dot_decomposer",
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@ -36,6 +36,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
 #include "tensorflow/compiler/xla/service/conditional_simplifier.h"
 #include "tensorflow/compiler/xla/service/convolution_4d_expander.h"
 #include "tensorflow/compiler/xla/service/convolution_group_converter.h"
 #include "tensorflow/compiler/xla/service/depthwise_convolution_converter.h"
 #include "tensorflow/compiler/xla/service/dot_decomposer.h"
@ -140,6 +141,8 @@ Status GpuCompiler::OptimizeHloModule(
    pipeline.AddPass<DotDecomposer>();
    pipeline.AddPass<Convolution4DExpander>();
    auto cost_model = [](HloInstruction*) {
      // We need a cost model for GPUs. Currently, do nothing.
      return false;