Split up conv_ops_fused kernels.

This improves build times by allowing the double, float, and half implementations to build in parallel.

PiperOrigin-RevId: 235576953
James Keeling 2019-02-25 12:08:41 -08:00 committed by TensorFlower Gardener
parent c715e350ca
commit 1c6f10152f
6 changed files with 123 additions and 33 deletions
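The mechanics of the split: the shared template body moves into conv_ops_fused_impl.h, and each supported dtype gets a thin .cc file that includes the header and expands the registration macros for exactly one type, so the heavyweight instantiations land in separate translation units. A rough sketch of the pattern only (hypothetical names, not the actual TensorFlow sources):

// my_op_impl.h -- the shared (and expensive-to-compile) template body.
#ifndef MY_OP_IMPL_H_
#define MY_OP_IMPL_H_

template <typename T>
T MyOp(T x) {
  return x * x;  // stands in for the heavyweight fused-conv code
}

// Each per-type .cc file expands this once to explicitly instantiate
// the template for its type.
#define INSTANTIATE_MY_OP(T) template T MyOp<T>(T)

#endif  // MY_OP_IMPL_H_

// my_op_float.cc -- one translation unit per type; my_op_double.cc and
// my_op_half.cc look identical, and the three compile in parallel.
#include "my_op_impl.h"
INSTANTIATE_MY_OP(float);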

tensorflow/contrib/makefile/tf_op_files.txt

@@ -43,7 +43,9 @@ tensorflow/core/kernels/conv_grad_input_ops.cc
 tensorflow/core/kernels/conv_grad_ops.cc
 tensorflow/core/kernels/conv_ops.cc
 tensorflow/core/kernels/conv_ops_3d.cc
-tensorflow/core/kernels/conv_ops_fused.cc
+tensorflow/core/kernels/conv_ops_fused_double.cc
+tensorflow/core/kernels/conv_ops_fused_float.cc
+tensorflow/core/kernels/conv_ops_fused_half.cc
 tensorflow/core/kernels/conv_ops_using_gemm.cc
 tensorflow/core/kernels/crop_and_resize_op.cc
 tensorflow/core/kernels/ctc_decoder_ops.cc

tensorflow/core/kernels/BUILD

@@ -5624,7 +5624,10 @@ filegroup(
         "conv_grad_ops.h",
         "conv_ops.cc",
         "conv_ops_3d.cc",
-        "conv_ops_fused.cc",
+        "conv_ops_fused_double.cc",
+        "conv_ops_fused_float.cc",
+        "conv_ops_fused_half.cc",
+        "conv_ops_fused_impl.h",
         "conv_ops_using_gemm.cc",
         "crop_and_resize_op.cc",
         "crop_and_resize_op.h",

tensorflow/core/kernels/conv_ops_fused_double.cc (new file)

@@ -0,0 +1,39 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/conv_ops_fused_impl.h"

namespace tensorflow {

// If we're using the alternative GEMM-based implementation of Conv2D for the
// CPU implementation, don't register this EigenTensor-based version.
// TODO(b/119765980): Upgrade upstream Eigen to set `m_can_use_xsmm=false` for
// contractions with non-default contraction output kernels.
#if !defined(USE_GEMM_FOR_CONV) && !defined(EIGEN_USE_LIBXSMM)
TF_CALL_double(REGISTER_FUSED_CPU_CONV2D);
#endif  // !USE_GEMM_FOR_CONV

#if GOOGLE_CUDA

namespace functor {
DECLARE_FUNCTOR_GPU_SPEC(double);
}  // namespace functor

TF_CALL_double(REGISTER_FUSED_GPU_CONV2D);

#endif  // GOOGLE_CUDA

}  // namespace tensorflow
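For reference, TF_CALL_double(m) in tensorflow/core/framework/register_types.h expands to m(double), so with the REGISTER_FUSED_CPU_CONV2D definition kept in conv_ops_fused_impl.h (see its diff below), the CPU registration line above boils down to a single kernel registration:

// Expansion of TF_CALL_double(REGISTER_FUSED_CPU_CONV2D):
REGISTER_KERNEL_BUILDER(
    Name("_FusedConv2D").Device(DEVICE_CPU).TypeConstraint<double>("T"),
    FusedConv2DOp<CPUDevice, double>);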

tensorflow/core/kernels/conv_ops_fused_float.cc (new file)

@@ -0,0 +1,39 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/conv_ops_fused_impl.h"

namespace tensorflow {

// If we're using the alternative GEMM-based implementation of Conv2D for the
// CPU implementation, don't register this EigenTensor-based version.
// TODO(b/119765980): Upgrade upstream Eigen to set `m_can_use_xsmm=false` for
// contractions with non-default contraction output kernels.
#if !defined(USE_GEMM_FOR_CONV) && !defined(EIGEN_USE_LIBXSMM)
TF_CALL_float(REGISTER_FUSED_CPU_CONV2D);
#endif  // !USE_GEMM_FOR_CONV

#if GOOGLE_CUDA

namespace functor {
DECLARE_FUNCTOR_GPU_SPEC(float);
}  // namespace functor

TF_CALL_float(REGISTER_FUSED_GPU_CONV2D);

#endif  // GOOGLE_CUDA

}  // namespace tensorflow

tensorflow/core/kernels/conv_ops_fused_half.cc (new file)

@@ -0,0 +1,29 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/conv_ops_fused_impl.h"

namespace tensorflow {

#if GOOGLE_CUDA

namespace functor {
DECLARE_FUNCTOR_GPU_SPEC(Eigen::half);
}  // namespace functor

#endif  // GOOGLE_CUDA

}  // namespace tensorflow

tensorflow/core/kernels/conv_ops_fused_impl.h (renamed from tensorflow/core/kernels/conv_ops_fused.cc)

@@ -28,6 +28,9 @@ limitations under the License.
 //
 // NOTE: GPU only supports fusion of Conv2D + BiasAdd + <optional Relu>.
+#ifndef TENSORFLOW_CORE_KERNELS_CONV_OPS_FUSED_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_CONV_OPS_FUSED_IMPL_H_
+
 #define USE_EIGEN_TENSOR
 #define EIGEN_USE_THREADS
@@ -63,7 +66,6 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
-namespace {
 // Supported Conv2D fusions. Not all of them supported on all type of devices.
 enum class FusedComputationType {
   // NOTE(ezhulenev): CuDNN `cudnnConvolutionBiasActivationForward` supports
@@ -463,12 +465,12 @@ class FusedConvParameters : public ConvParameters {
   se::dnn::ActivationMode activation_mode_;
 };
-bool operator==(const FusedConvParameters& lhs,
+inline bool operator==(const FusedConvParameters& lhs,
                 const FusedConvParameters& rhs) {
   return lhs.get_data_as_tuple() == rhs.get_data_as_tuple();
 }
-bool operator!=(const FusedConvParameters& lhs,
+inline bool operator!=(const FusedConvParameters& lhs,
                 const FusedConvParameters& rhs) {
   return !(lhs == rhs);
 }
@@ -482,7 +484,7 @@ using AutoTuneFusedConv =
     AutoTuneSingleton<FusedConvAutoTuneGroup, FusedConvParameters,
                       se::dnn::AlgorithmConfig>;
-int64 ConvolveScratchSize() {
+inline int64 ConvolveScratchSize() {
   static int64 convolve_scratch_size = GetDnnWorkspaceLimit(
       // default value is in bytes despite the name of the environment variable
       "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
@@ -822,8 +824,6 @@ struct LaunchFusedConv2DOp<GPUDevice, T> {
 #endif  // GOOGLE_CUDA
-}  // namespace
 template <typename Device, typename T>
 class FusedConv2DOp : public OpKernel {
  public:
@@ -962,22 +962,9 @@ class FusedConv2DOp : public OpKernel {
       Name("_FusedConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
       FusedConv2DOp<CPUDevice, T>);
-// If we're using the alternative GEMM-based implementation of Conv2D for the
-// CPU implementation, don't register this EigenTensor-based version.
-// TODO(b/119765980): Upgrade upstream Eigen to set `m_can_use_xsmm=false` for
-// contractions with non-default contraction output kernels.
-#if !defined(USE_GEMM_FOR_CONV) && !defined(EIGEN_USE_LIBXSMM)
-TF_CALL_float(REGISTER_FUSED_CPU_CONV2D);
-TF_CALL_double(REGISTER_FUSED_CPU_CONV2D);
-#endif  // !USE_GEMM_FOR_CONV
-#undef REGISTER_FUSED_CPU_CONV2D
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T) \
+#define DECLARE_FUNCTOR_GPU_SPEC(T) \
   template <> \
   void TransformFilter<GPUDevice, T, int, 4>::operator()( \
       const GPUDevice& d, FilterTensorFormat dst_filter_format, \
@@ -992,23 +979,14 @@ namespace functor {
       typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format); \
   extern template struct PadInput<GPUDevice, T, int, 4>
-DECLARE_GPU_SPEC(float);
-DECLARE_GPU_SPEC(Eigen::half);
-DECLARE_GPU_SPEC(double);
-#undef DECLARE_GPU_SPEC
 }  // namespace functor
 // Registration of the GPU implementations.
 #define REGISTER_FUSED_GPU_CONV2D(T) \
   REGISTER_KERNEL_BUILDER( \
       Name("_FusedConv2D").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
       FusedConv2DOp<GPUDevice, T>);
-TF_CALL_float(REGISTER_FUSED_GPU_CONV2D);
-TF_CALL_double(REGISTER_FUSED_GPU_CONV2D);
-#undef REGISTER_FUSED_GPU_CONV2D
 #endif  // GOOGLE_CUDA
 }  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_CONV_OPS_FUSED_IMPL_H_
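A note on the `inline` qualifiers added above: as conv_ops_fused.cc, operator==, operator!=, and ConvolveScratchSize were compiled exactly once, but as conv_ops_fused_impl.h the same definitions are textually included into three translation units, so each must be `inline` to satisfy the One Definition Rule at link time (the anonymous namespace that served this purpose in the .cc is dropped for the same reason). A minimal sketch of the failure mode, with hypothetical names:

// scratch.h -- included by more than one .cc file.
#ifndef SCRATCH_H_
#define SCRATCH_H_

// Without `inline`, every including translation unit emits its own
// external definition of ScratchSize and linking fails with a
// duplicate-symbol error; with `inline`, the linker folds the copies
// into one. (The include guard only prevents double inclusion within
// a single translation unit, not across them.)
inline long long ScratchSize() { return 1LL << 32; }  // 4GB, as in the diff

#endif  // SCRATCH_H_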