From 7691e99586e336c5dc4b7209f355c79019b8cf3e Mon Sep 17 00:00:00 2001
From: amoitra
Date: Wed, 3 Jul 2019 12:57:46 -0700
Subject: [PATCH 1/8] Enable use of cudnn backprop APIs for grouped convolutions

---
 .../xla/service/gpu/cudnn_conv_rewriter.cc    | 51 +++++++-----
 .../service/gpu/cudnn_conv_rewriter_test.cc   | 80 +++++++++++++++++++
 2 files changed, 111 insertions(+), 20 deletions(-)
 mode change 100644 => 100755 tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc

diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
old mode 100644
new mode 100755
index e81850db69e..21ef810e64b
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
@@ -89,13 +89,11 @@ bool CanImplementAsCudnnForwardConv(HloInstruction* conv) {
 
 // Try to match a backward filter pattern that contains "conv".
 // Precondition: "conv" is a kConvolution.
-std::tuple<bool, Window, ConvolutionDimensionNumbers> MatchBackwardFilter(
-    HloInstruction* conv) {
+std::tuple<bool, Window, ConvolutionDimensionNumbers, HloInstruction*>
+MatchBackwardFilter(HloInstruction* conv) {
   const auto no_match_result =
-      std::make_tuple(false, Window(), ConvolutionDimensionNumbers());
-  if (conv->feature_group_count() > 1) {
-    return no_match_result;
-  }
+      std::make_tuple(false, Window(), ConvolutionDimensionNumbers(), nullptr);
+
   // Step 1: match the instruction pattern without considering the paddings and
   // dimension numbers just yet. We may need some generic pattern matcher
   // similar to third_party/llvm/llvm/include/llvm/IR/PatternMatch.h
@@ -248,7 +246,29 @@ std::tuple<bool, Window, ConvolutionDimensionNumbers> MatchBackwardFilter(
     backward_conv_dnums.add_kernel_spatial_dimensions(output_spatial_dims[i]);
   }
 
-  return std::make_tuple(true, backward_conv_window, backward_conv_dnums);
+  HloInstruction* lhs = conv->mutable_operand(0);
+  if (conv->feature_group_count() == 1) {
+    return std::make_tuple(true, backward_conv_window, backward_conv_dnums,
+                           lhs);
+  }
+  Shape new_shape = lhs->shape();
+
+  int64 input_batch_dimension = backward_conv_dnums.input_batch_dimension();
+  int64 input_feature_dimension = backward_conv_dnums.input_feature_dimension();
+
+  int64 input_batch = new_shape.dimensions(input_batch_dimension);
+  int64 input_feature = new_shape.dimensions(input_feature_dimension);
+
+  // Ensure that input_batch is an exact multiple of conv->feature_group_count()
+  CHECK_EQ(input_batch % conv->feature_group_count(), 0);
+  new_shape.set_dimensions(input_batch_dimension,
+                           input_batch / conv->feature_group_count());
+  new_shape.set_dimensions(input_feature_dimension,
+                           input_feature * conv->feature_group_count());
+
+  HloComputation* c = conv->parent();
+  lhs = c->AddInstruction(HloInstruction::CreateReshape(new_shape, lhs));
+  return std::make_tuple(true, backward_conv_window, backward_conv_dnums, lhs);
 }
 
 // Try to match a backward input pattern that contains "conv".
@@ -258,15 +278,6 @@ MatchBackwardInput(HloInstruction* conv) {
   const auto no_match_result =
       std::make_tuple(false, Window(), ConvolutionDimensionNumbers(), nullptr);
 
-  // TODO(b/119479517): Theoretically cuDNN supports grouped convolutions also
-  // for the backward input convolution, but at least for now with version 7.1.4
-  // it is slower. This needs to be re-evaluated for future cuDNN versions.
-  // Note that we already have the necessary code down below, the only thing to
-  // enable it is to remove the following early return.
-  if (conv->feature_group_count() > 1) {
-    return no_match_result;
-  }
-
   // Match instruction pattern.
   CHECK_EQ(HloOpcode::kConvolution, conv->opcode());
   HloInstruction* reverse_filter = conv->mutable_operand(1);
 
@@ -503,13 +514,13 @@ StatusOr<bool> RunOnInstruction(HloInstruction* conv) {
     Window window;
     ConvolutionDimensionNumbers dnums;
     HloInstruction* rhs;
+    HloInstruction* lhs;
 
-    std::tie(match, window, dnums) = MatchBackwardFilter(conv);
+    std::tie(match, window, dnums, lhs) = MatchBackwardFilter(conv);
     if (match) {
       return CreateCudnnConv(kCudnnConvBackwardFilterCallTarget, conv->shape(),
-                             conv->mutable_operand(0), conv->mutable_operand(1),
-                             window, dnums, conv->feature_group_count(),
-                             conv->metadata());
+                             lhs, conv->mutable_operand(1), window, dnums,
+                             conv->feature_group_count(), conv->metadata());
     }
 
     std::tie(match, window, dnums, rhs) = MatchBackwardInput(conv);

diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc
index dbcdc2b075b..362d8d13aab 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc
@@ -135,6 +135,86 @@ TEST_F(CudnnConvRewriterTest, BackwardFilterConvolve) {
       << md_after_opt.DebugString() << " vs " << metadata.DebugString();
 }
 
+TEST_F(CudnnConvRewriterTest, BackwardFilterGroupConvolve) {
+  // In a nutshell, before the pass:
+  // Input->batch_dim: 3 input_shape(3) = 4
+  // Input->feature_dim: 0 input_shape(0) = 32
+  // Kernel(gradient)->kernel_input_feature_dim (gradient_batch_dimension): 0
+  // Kernel(gradient)->kernel_output_feature_dim (gradient_feature_dimension): 3
+  // Output(dkernel)->output_batch_dim (dkernel_input_feature_dim): 2
+  // Output(dkernel)->output_feature_dim (dkernel_output_feature_dim): 3
+
+  // After the pass: all shapes and the dimension layout are brought
+  // back to normal, as accepted by cudnn:
+  // Input->batch_dim: 0 input_shape(0) = 8
+  // Input->feature_dim: 3 input_shape(3) = 16
+  // Kernel(gradient)->kernel_input_feature_dim (gradient_batch_dimension): 2
+  // Kernel(gradient)->kernel_output_feature_dim (gradient_feature_dimension): 3
+  // Output(dkernel)->output_batch_dim (dkernel_input_feature_dim): 0
+  // Output(dkernel)->output_feature_dim (dkernel_output_feature_dim): 3
+  HloComputation::Builder builder(TestName());
+  HloInstruction* activations =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {32, 1, 3, 4}), "activations"));
+  HloInstruction* gradients =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          1, ShapeUtil::MakeShape(F32, {8, 1, 2, 16}), "gradients"));
+  Window conv_window = default_conv_window_;
+  conv_window.mutable_dimensions(1)->set_size(2);
+  conv_window.mutable_dimensions(1)->set_window_dilation(2);
+  auto* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeInference::InferConvolveShape(
+          activations->shape(), gradients->shape(), /*feature_group_count=*/4,
+          /*batch_group_count=*/1, conv_window,
+          tf_default_dnums_for_backward_filter_)
+          .ConsumeValueOrDie(),
+      activations, gradients, /*feature_group_count=*/4,
+      /*batch_group_count=*/1, conv_window,
+      tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2)));
+  OpMetadata metadata;
+  metadata.set_op_name("bar");
+  conv->set_metadata(metadata);
+  auto module = CreateNewVerifiedModule();
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(RunPass(module.get()));
+  ASSERT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
+  // Check that metadata was preserved.
+  const auto& md_after_opt =
+      entry_computation->root_instruction()->operand(0)->metadata();
+  EXPECT_TRUE(protobuf_util::ProtobufEquals(md_after_opt, metadata))
+      << md_after_opt.DebugString() << " vs " << metadata.DebugString();
+  const HloInstruction* custom_call =
+      entry_computation->root_instruction()->operand(0);
+  const ConvolutionDimensionNumbers conv_dim =
+      custom_call->convolution_dimension_numbers();
+  const auto lhs_a = custom_call->operand(0);
+  const auto input_shape = lhs_a->shape();
+  // The input (lhs) batch_dim (dim 0 in the original NHWC layout) gets mapped
+  // to be the feature_dim (dim 3) with a value of N*g = 32 in tf2xla. As
+  // described in conv_grad_ops.h, this swap is required to implement backprop
+  // using fwd conv. After the pass the batch_dim gets remapped to dim 0. The
+  // batch_dim value gets scaled to N = N*g/g = 32/4 = 8 to be compatible with
+  // cudnn.
+  EXPECT_EQ(0, conv_dim.input_batch_dimension());
+  EXPECT_EQ(8, input_shape.dimensions(conv_dim.input_batch_dimension()));
+  // Similarly, the input (lhs) feature_dim (dim 3 in the original NHWC layout)
+  // gets mapped to be the batch_dim (dim 0) with a value of C/g = 4 in tf2xla.
+  // After the pass the feature_dim gets remapped to dim 3. The feature_dim
+  // value gets scaled to C = C/g*g = 4*4 = 16 to be compatible with cudnn.
+  EXPECT_EQ(3, conv_dim.input_feature_dimension());
+  EXPECT_EQ(16, input_shape.dimensions(conv_dim.input_feature_dimension()));
+  // Similarly, the feature and batch dims of the incoming gradients (used as
+  // rhs) and the in/out dims of the output of the convolution, i.e., dkernel,
+  // have been modified in tf2xla (as described in conv_grad_ops.h). This pass
+  // remaps everything back for the layout to be compatible with cudnn backprop
+  // APIs.
+  EXPECT_EQ(2, conv_dim.kernel_input_feature_dimension());
+  EXPECT_EQ(3, conv_dim.kernel_output_feature_dimension());
+  EXPECT_EQ(0, conv_dim.output_batch_dimension());
+  EXPECT_EQ(3, conv_dim.output_feature_dimension());
+}
+
 TEST_F(CudnnConvRewriterTest,
        BackwardFilterConvolveEquivalentToForwardConvolution) {
   HloComputation::Builder builder(TestName());
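Note: the dimension bookkeeping the new test checks is easiest to see as plain shape arithmetic. The sketch below mirrors what PATCH 1's MatchBackwardFilter does to the activations shape, on the test's concrete values; it assumes the NHWC-style dimension numbers set up by tf_default_dnums_for_backward_filter_ (batch dim 0, feature dim 3 after the rewrite) and is an editor's illustration, not code from the patch.

    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main() {
      const int64_t feature_group_count = 4;  // g, from the test
      // activations as built by the test: {N*g, H, W, C/g} = {32, 1, 3, 4}
      std::vector<int64_t> lhs_dims = {32, 1, 3, 4};
      const int input_batch_dimension = 0;    // backward_conv_dnums
      const int input_feature_dimension = 3;  // backward_conv_dnums
      // Mirrors the CHECK_EQ in the patch.
      assert(lhs_dims[input_batch_dimension] % feature_group_count == 0);
      lhs_dims[input_batch_dimension] /= feature_group_count;    // 32/4 = 8 = N
      lhs_dims[input_feature_dimension] *= feature_group_count;  //  4*4 = 16 = C
      // lhs_dims is now {8, 1, 3, 16}: exactly the values the test asserts
      // via input_shape.dimensions(...).
      return 0;
    }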
From e7c6533b7d3f1997bfabe9043210845f016ab688 Mon Sep 17 00:00:00 2001
From: amoitra
Date: Tue, 9 Jul 2019 14:40:29 -0700
Subject: [PATCH 2/8] Incorporate Thomas's comments

---
 .../compiler/xla/service/gpu/cudnn_conv_rewriter.cc | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
index 21ef810e64b..ca8d63cbcc7 100755
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
@@ -251,16 +251,17 @@ MatchBackwardFilter(HloInstruction* conv) {
     return std::make_tuple(true, backward_conv_window, backward_conv_dnums,
                            lhs);
   }
-  Shape new_shape = lhs->shape();
 
   int64 input_batch_dimension = backward_conv_dnums.input_batch_dimension();
   int64 input_feature_dimension = backward_conv_dnums.input_feature_dimension();
 
-  int64 input_batch = new_shape.dimensions(input_batch_dimension);
-  int64 input_feature = new_shape.dimensions(input_feature_dimension);
-
+  int64 input_batch = lhs->shape().dimensions(input_batch_dimension);
   // Ensure that input_batch is an exact multiple of conv->feature_group_count()
-  CHECK_EQ(input_batch % conv->feature_group_count(), 0);
+  CHECK_EQ(input_batch % conv->feature_group_count(), 0)
+      << "Input batch should be an exact multiple of feature group count";
+  int64 input_feature = lhs->shape().dimensions(input_feature_dimension);
+
+  Shape new_shape = lhs->shape();
   new_shape.set_dimensions(input_batch_dimension,
                            input_batch / conv->feature_group_count());
   new_shape.set_dimensions(input_feature_dimension,

From 0404f60b100a77059c5164d6da9953b6c18cb8f4 Mon Sep 17 00:00:00 2001
From: amoitra
Date: Tue, 16 Jul 2019 13:31:45 -0700
Subject: [PATCH 3/8] Add check for depthwise fwd conv addressing test
 failures and reverting change for MatchBackwardInput

---
 .../xla/service/gpu/cudnn_conv_rewriter.cc | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
index ca8d63cbcc7..9e59b1290ed 100755
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
@@ -153,6 +153,15 @@ MatchBackwardFilter(HloInstruction* conv) {
         "to fold it to a backward filter convolution.";
     return no_match_result;
   }
+  auto rhs_in =
+      conv->mutable_operand(1)->shape().dimensions(kernel_input_feature_dim);
+  if ((conv->feature_group_count() > 1) && (rhs_in == 1) &&
+      (input_batch_dim == output_batch_dim)) {
+    VLOG(1) << conv->ToString()
+            << " is a depthwise forward convolution. No need to fold to "
+               "backward filter.";
+    return no_match_result;
+  }
 
   // Step 3: fuse the matched HLOs into a backward convolution instruction.
   //
@@ -279,6 +288,15 @@ MatchBackwardInput(HloInstruction* conv) {
   const auto no_match_result =
       std::make_tuple(false, Window(), ConvolutionDimensionNumbers(), nullptr);
 
+  // TODO(b/119479517): Theoretically cuDNN supports grouped convolutions also
+  // for the backward input convolution, but at least for now with version 7.1.4
+  // it is slower. This needs to be re-evaluated for future cuDNN versions.
+  // Note that we already have the necessary code down below, the only thing to
+  // enable it is to remove the following early return.
+  if (conv->feature_group_count() > 1) {
+    return no_match_result;
+  }
+
   // Match instruction pattern.
   CHECK_EQ(HloOpcode::kConvolution, conv->opcode());
   HloInstruction* reverse_filter = conv->mutable_operand(1);

From 3e2958befaa22595b754018e7e2ef089420ff17d Mon Sep 17 00:00:00 2001
From: amoitra
Date: Sat, 20 Jul 2019 11:05:22 -0700
Subject: [PATCH 4/8] Added Transpose and a reshape

---
 .../xla/service/gpu/cudnn_conv_rewriter.cc | 33 +++++++++++++++----
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
index 9e59b1290ed..066e2daf52d 100755
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
@@ -265,18 +265,37 @@ MatchBackwardFilter(HloInstruction* conv) {
   int64 input_feature_dimension = backward_conv_dnums.input_feature_dimension();
 
   int64 input_batch = lhs->shape().dimensions(input_batch_dimension);
+  int64 input_feature = lhs->shape().dimensions(input_feature_dimension);
+
+  // Reshape batch_dim G*N -> [G,N]
+  std::vector<int64> reshape_dims = lhs->shape().dimensions();
+  auto num_groups = conv->feature_group_count();
   // Ensure that input_batch is an exact multiple of conv->feature_group_count()
   CHECK_EQ(input_batch % conv->feature_group_count(), 0)
       << "Input batch should be an exact multiple of feature group count";
-  int64 input_feature = lhs->shape().dimensions(input_feature_dimension);
-
-  Shape new_shape = lhs->shape();
-  new_shape.set_dimensions(input_batch_dimension,
-                           input_batch / conv->feature_group_count());
-  new_shape.set_dimensions(input_feature_dimension,
-                           input_feature * conv->feature_group_count());
+  reshape_dims[input_batch_dimension] =
+      reshape_dims[input_batch_dimension] / num_groups;
+  reshape_dims.insert(reshape_dims.begin() + input_batch_dimension, num_groups);
 
   HloComputation* c = conv->parent();
+  lhs = c->AddInstruction(HloInstruction::CreateReshape(
+      ShapeUtil::MakeShape(lhs->shape().element_type(), reshape_dims), lhs));
+
+  // Transpose G to the axis before C/G, e.g.: [G, N, C/G, H, W] -> [N, G,
+  // C/G, H, W]
+  std::vector<int64> transpose_dims(lhs->shape().dimensions_size());
+  std::iota(transpose_dims.begin(), transpose_dims.end(), 0);
+  transpose_dims.erase(transpose_dims.begin() + input_batch_dimension);
+  transpose_dims.insert(transpose_dims.begin() + input_feature_dimension,
+                        input_batch_dimension);
+  lhs = c->AddInstruction(
+      HloInstruction::CreateTranspose(lhs->shape(), lhs, transpose_dims));
+
+  // Merge [G,C/G] -> [C]
+  Shape new_shape = lhs->shape();
+  new_shape.DeleteDimension(input_feature_dimension);
+  new_shape.set_dimensions(input_feature_dimension,
+                           input_feature * conv->feature_group_count());
   lhs = c->AddInstruction(HloInstruction::CreateReshape(new_shape, lhs));
   return std::make_tuple(true, backward_conv_window, backward_conv_dnums, lhs);
 }
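Note: the reshape-transpose-reshape sequence this patch introduces is easiest to follow on concrete shapes. The sketch below replays the patch's dimension bookkeeping as plain vector manipulation, using the shapes from the test above (batch dim 0, feature dim 3, G = 4); it is an editor's illustration under those assumptions, not the pass itself.

    #include <cassert>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    int main() {
      const int64_t g = 4;  // num_groups
      const int batch_dim = 0, feature_dim = 3;
      std::vector<int64_t> dims = {32, 1, 3, 4};  // {G*N, H, W, C/G}

      // Reshape 1: split the batch G*N -> [G, N]   => {4, 8, 1, 3, 4}
      dims[batch_dim] /= g;
      dims.insert(dims.begin() + batch_dim, g);

      // Transpose: move G to just before C/G       => {8, 1, 3, 4, 4}
      std::vector<int64_t> perm(dims.size());
      std::iota(perm.begin(), perm.end(), 0);
      perm.erase(perm.begin() + batch_dim);
      perm.insert(perm.begin() + feature_dim, batch_dim);  // perm = {1,2,3,0,4}
      std::vector<int64_t> transposed(dims.size());
      for (size_t i = 0; i < perm.size(); ++i) transposed[i] = dims[perm[i]];

      // Reshape 2: merge [G, C/G] -> C             => {8, 1, 3, 16}
      transposed.erase(transposed.begin() + feature_dim);
      transposed[feature_dim] *= g;
      assert((transposed == std::vector<int64_t>{8, 1, 3, 16}));
      return 0;
    }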
From 4390c4f8463bc5fb8e52fc2b4749951cdfca64ce Mon Sep 17 00:00:00 2001
From: amoitra
Date: Sun, 21 Jul 2019 20:55:35 -0700
Subject: [PATCH 5/8] minor fix - missed something during merge conflict
 resolution

---
 tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
index ffda48872f2..25a821cb078 100755
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
@@ -561,7 +561,7 @@ StatusOr<bool> RunOnInstruction(HloInstruction* conv) {
                              conv->feature_group_count(), conv->metadata());
   }
 
-    std::tie(match, window, dnums) = MatchBackwardFilter(conv);
+    std::tie(match, window, dnums, lhs) = MatchBackwardFilter(conv);
     if (match) {
       return CreateCudnnConv(kCudnnConvBackwardFilterCallTarget, conv->shape(),
                              lhs, conv->mutable_operand(1), window, dnums,

From 5471b5f66ed10ef49bce250746e7e73ec0ccf2be Mon Sep 17 00:00:00 2001
From: amoitra
Date: Mon, 22 Jul 2019 00:33:24 -0700
Subject: [PATCH 6/8] Few more changes

---
 .../xla/service/gpu/cudnn_conv_rewriter.cc | 30 ++++++++++++++-----
 1 file changed, 22 insertions(+), 8 deletions(-)
 mode change 100755 => 100644 tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc

diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
old mode 100755
new mode 100644
index 25a821cb078..a441e70510a
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
@@ -278,26 +278,40 @@ MatchBackwardFilter(HloInstruction* conv) {
   reshape_dims.insert(reshape_dims.begin() + input_batch_dimension, num_groups);
 
   HloComputation* c = conv->parent();
-  lhs = c->AddInstruction(HloInstruction::CreateReshape(
-      ShapeUtil::MakeShape(lhs->shape().element_type(), reshape_dims), lhs));
+  HloInstruction* lhs_reshape_1 =
+      c->AddInstruction(HloInstruction::CreateReshape(
+          ShapeUtil::MakeShape(lhs->shape().element_type(), reshape_dims),
+          lhs));
 
   // Transpose G to the axis before C/G, e.g.: [G, N, C/G, H, W] -> [N, G,
   // C/G, H, W]
-  std::vector<int64> transpose_dims(lhs->shape().dimensions_size());
+  std::vector<int64> transpose_dims(lhs_reshape_1->shape().dimensions_size());
   std::iota(transpose_dims.begin(), transpose_dims.end(), 0);
   transpose_dims.erase(transpose_dims.begin() + input_batch_dimension);
   transpose_dims.insert(transpose_dims.begin() + input_feature_dimension,
                         input_batch_dimension);
-  lhs = c->AddInstruction(
-      HloInstruction::CreateTranspose(lhs->shape(), lhs, transpose_dims));
+  std::vector<int64> transpose_reshape_dims =
+      lhs_reshape_1->shape().dimensions();
+  transpose_reshape_dims.erase(transpose_reshape_dims.begin() +
+                               input_batch_dimension);
+  transpose_reshape_dims.insert(
+      transpose_reshape_dims.begin() + input_feature_dimension, num_groups);
+
+  HloInstruction* lhs_transpose =
+      c->AddInstruction(HloInstruction::CreateTranspose(
+          ShapeUtil::MakeShape(lhs_reshape_1->shape().element_type(),
+                               transpose_reshape_dims),
+          lhs_reshape_1, transpose_dims));
 
   // Merge [G,C/G] -> [C]
-  Shape new_shape = lhs->shape();
+  Shape new_shape = lhs_transpose->shape();
   new_shape.DeleteDimension(input_feature_dimension);
   new_shape.set_dimensions(input_feature_dimension,
                            input_feature * conv->feature_group_count());
-  lhs = c->AddInstruction(HloInstruction::CreateReshape(new_shape, lhs));
-  return std::make_tuple(true, backward_conv_window, backward_conv_dnums, lhs);
+  HloInstruction* lhs_reshape_2 = c->AddInstruction(
+      HloInstruction::CreateReshape(new_shape, lhs_transpose));
+  return std::make_tuple(true, backward_conv_window, backward_conv_dnums,
+                         lhs_reshape_2);
 }
 
 // Try to match a backward input pattern that contains "conv".
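Note: beyond naming the intermediate instructions (lhs_reshape_1, lhs_transpose, lhs_reshape_2), the functional change in PATCH 6 is that CreateTranspose now receives the explicitly permuted shape (transpose_reshape_dims) rather than the operand's own shape. An HLO transpose's result shape must be the operand shape permuted by transpose_dims, which PATCH 4's lhs->shape() only satisfies when the permuted dimensions happen to coincide. On the shapes from the test above (editor's illustration):

    // Operand (after the split):  {G, N, H, W, C/G} = {4, 8, 1, 3, 4}
    // transpose_dims:             {1, 2, 3, 0, 4}
    // Result shape must be        {8, 1, 3, 4, 4}  (operand dims permuted),
    // not the operand's own {4, 8, 1, 3, 4} that PATCH 4 passed in.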
From 808a8068ad9a206d979d34b33357dd92f21ba786 Mon Sep 17 00:00:00 2001
From: amoitra
Date: Mon, 22 Jul 2019 11:51:18 -0700
Subject: [PATCH 7/8] Incorporate Adrian's comments

---
 tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)
 mode change 100644 => 100755 tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc

diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
old mode 100644
new mode 100755
index a441e70510a..9c859a00dbc
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
@@ -155,8 +155,8 @@ MatchBackwardFilter(HloInstruction* conv) {
   }
   auto rhs_in =
       conv->mutable_operand(1)->shape().dimensions(kernel_input_feature_dim);
-  if ((conv->feature_group_count() > 1) && (rhs_in == 1) &&
-      (input_batch_dim == output_batch_dim)) {
+  if (conv->feature_group_count() > 1 && rhs_in == 1 &&
+      input_batch_dim == output_batch_dim) {
     VLOG(1) << conv->ToString()
             << " is a depthwise forward convolution. No need to fold to "
                "backward filter.";
@@ -270,8 +270,7 @@ MatchBackwardFilter(HloInstruction* conv) {
   // Reshape batch_dim G*N -> [G,N]
   std::vector<int64> reshape_dims = lhs->shape().dimensions();
   auto num_groups = conv->feature_group_count();
-  // Ensure that input_batch is an exact multiple of conv->feature_group_count()
-  CHECK_EQ(input_batch % conv->feature_group_count(), 0)
+  CHECK_EQ(input_batch % num_groups, 0)
       << "Input batch should be an exact multiple of feature group count";
   reshape_dims[input_batch_dimension] =
       reshape_dims[input_batch_dimension] / num_groups;

From 969a4b05b4b7bbda14c4b4b44a94137220340bb7 Mon Sep 17 00:00:00 2001
From: amoitra
Date: Mon, 22 Jul 2019 11:58:33 -0700
Subject: [PATCH 8/8] minor edit

---
 tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
index 9c859a00dbc..33486608c1c 100755
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
@@ -328,7 +328,7 @@ MatchBackwardInput(HloInstruction* conv) {
   if (conv->feature_group_count() > 1) {
     return no_match_result;
   }
-  
+
   // Match instruction pattern.
   CHECK_EQ(HloOpcode::kConvolution, conv->opcode());
   HloInstruction* reverse_filter = conv->mutable_operand(1);
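Note: why a reshape-transpose-reshape chain rather than PATCH 1's single reshape? A row-major reinterpretation of {G*N, H, W, C/G} as {N, H, W, C} would interleave group and batch indices, whereas splitting out G, moving it next to C/G, and merging places each group's channels contiguously (channel index c' = g*(C/G) + c), which is the layout a grouped convolution expects. The standalone check below demonstrates that index mapping under the same G-major batch assumption the pass makes (the split treats the batch dimension as [G, N]); it is an editor's illustration, not code from the patches.

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Models the lhs buffer as row-major {G*N, C/G} (spatial dims elided) and
    // applies the only data-moving step, the {G, N, Cg} -> {N, G, Cg}
    // transpose; the two reshapes around it are layout-preserving bitcasts.
    int main() {
      const int64_t G = 2, N = 3, Cg = 2;
      std::vector<int64_t> src(G * N * Cg), dst(G * N * Cg);
      for (int64_t i = 0; i < static_cast<int64_t>(src.size()); ++i) src[i] = i;

      for (int64_t g = 0; g < G; ++g)
        for (int64_t n = 0; n < N; ++n)
          for (int64_t c = 0; c < Cg; ++c)
            dst[(n * G + g) * Cg + c] = src[(g * N + n) * Cg + c];

      // In the merged {N, G*Cg} view, batch n / channel c' = g*Cg + c holds
      // the element that was batch g*N + n / channel c of the input: each
      // group's channels end up contiguous, as cudnn's grouped convolution
      // expects.
      for (int64_t g = 0; g < G; ++g)
        for (int64_t n = 0; n < N; ++n)
          for (int64_t c = 0; c < Cg; ++c)
            assert(dst[n * (G * Cg) + (g * Cg + c)] ==
                   src[(g * N + n) * Cg + c]);
      return 0;
    }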