Optimize depthwise convolution. pscatter is somewhat slow, especially on older hardware, so replace it with overlapping stores. For depth == 1 this results in a 2x speed-up; for other sizes the speed-up ranges from 0 to ~25%.

PiperOrigin-RevId: 346382015
Change-Id: I60663cfbdc373bba95894906305bef509d36f325
parent a32059d2f8
commit 42c2ae6091
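The heart of the change, before the diff: the old code replicated each input element across depth_multiplier output slots with Eigen's pscatter (a strided scatter, one element per packet lane), which is slow on older hardware. The new code broadcasts one element to a full packet with pset1 and writes the whole packet with an unaligned pstoreu, deliberately letting consecutive stores overlap; each later store overwrites the previous store's spilled lanes with correct values. A minimal standalone sketch of the idea, not the TensorFlow code itself — the helper name and the over-allocation contract are illustrative, and it assumes depth <= packet size:

  #include <Eigen/Core>

  // Replicate each of 'n' input values 'depth' times into 'out' using
  // overlapping unaligned stores instead of pscatter. Assumes
  // depth <= packet size; 'out' must be over-allocated by one packet
  // so the final store may spill past n * depth without going out of
  // bounds.
  template <typename T>
  void ReplicateOverlapping(const T* in, T* out, int n, int depth) {
    typedef typename Eigen::internal::packet_traits<T>::type Packet;
    for (int d = 0; d < n; ++d) {
      // Broadcast one scalar to every packet lane...
      const Packet p = Eigen::internal::pset1<Packet>(in[d]);
      // ...and store the whole packet; the lanes beyond 'depth' are
      // overwritten with correct values by the next iteration's store.
      Eigen::internal::pstoreu<T>(out, p);
      out += depth;
    }
  }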
@@ -193,27 +193,19 @@ struct DepthwiseInputCopyOp {
                   const int64 padded_filter_inner_dim_size, const int64 out_r,
                   const int64 out_c, const T* input, T* input_buffer) {
     typedef typename Eigen::internal::packet_traits<T>::type Packet;
-    static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
+    static const int64 kPacketSize = Eigen::internal::packet_traits<T>::size;
 
+    const int64 kDepth = args.depth_multiplier;
     // Calculate vectorized and scalar (residual) lengths for 'in_depth'.
     const int64 input_vectorized_size =
         (args.in_depth / kPacketSize) * kPacketSize;
-    const int64 input_scalar_size = args.in_depth % kPacketSize;
+    const int64 input_scalar_size = args.in_depth - input_vectorized_size;
 
-    // Calculate vectorized and scalar (residual) lengths for
-    // 'depth_multiplier'. This is used to efficiently replicate data for
-    // when 'depth_multiplier' > kPacketSize.
-    const int64 dm_vectorized_size =
-        (args.depth_multiplier / kPacketSize) * kPacketSize;
-    const int64 dm_scalar_size = args.depth_multiplier % kPacketSize;
-
     // Calculate output padding length.
     const int64 output_scalar_size = args.out_depth % kPacketSize;
     const int64 output_pad_size =
         output_scalar_size > 0 ? kPacketSize - output_scalar_size : 0;
 
-    const int64 replicated_packet_size = kPacketSize * args.depth_multiplier;
-
     // Iterate through all rows x cols reading 'in_depth' from 'input' and
     // replicating by 'depth_multiplier' into 'input_buffer' (otherwise
     // zero-padding input buffer as needed).
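A note on the input_scalar_size change above: since input_vectorized_size is floor(args.in_depth / kPacketSize) * kPacketSize, the new expression args.in_depth - input_vectorized_size equals the old args.in_depth % kPacketSize; the residual count is unchanged, just derived from the value already computed. The dm_* quantities and replicated_packet_size either move into the branch that needs them or disappear entirely, as the next hunk shows.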
@@ -221,6 +213,62 @@ struct DepthwiseInputCopyOp {
     const int64 in_r_start = out_r * args.stride - args.pad_rows;
     const int64 in_c_start = out_c * args.stride - args.pad_cols;
 
+    // TODO: add a ploaddup variant for depth == 2 if needed.
+    if (kDepth > 1 && kDepth <= kPacketSize) {
+      for (int64 f_r = 0; f_r < args.filter_rows; ++f_r) {
+        const int64 in_r = in_r_start + f_r;
+
+        for (int64 f_c = 0; f_c < args.filter_cols; ++f_c) {
+          const int64 in_c = in_c_start + f_c;
+
+          if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 &&
+              in_c < args.in_cols) {
+            const auto* in =
+                input + (in_r * args.in_cols + in_c) * args.in_depth;
+            int64 limit = args.in_depth;
+            // This will overwrite up to kPacketSize next elements,
+            // this is ok on all iterations except the last one, since
+            // we will write correct values on a next iteration.
+            if (f_c == args.filter_cols - 1) {
+              limit -= (kPacketSize - kDepth) / kDepth + 1;
+              if (limit < 0) {
+                limit = 0;
+              }
+            }
+            // Copy vectorized portion of inner dimension.
+            for (int64 d = 0; d < limit; d++) {
+              const auto p = Eigen::internal::pset1<Packet>(in[d]);
+              Eigen::internal::pstoreu<T>(in_buf, p);
+              in_buf += kDepth;
+            }
+
+            // Copy the scalar portion.
+            for (int64 d = limit; d < args.in_depth; d++) {
+              const auto value = in[d];
+              for (int64 dm = 0; dm < kDepth; dm++) {
+                in_buf[dm] = value;
+              }
+              in_buf += kDepth;
+            }
+
+            // Pad the remainder of the output to vector register boundary.
+            for (int64 d = 0; d < output_pad_size; ++d) {
+              in_buf[d] = static_cast<T>(0);
+            }
+            in_buf += output_pad_size;
+          } else {
+            // Zero pad.
+            memset(in_buf, 0, sizeof(T) * padded_filter_inner_dim_size);
+            in_buf += padded_filter_inner_dim_size;
+          }
+        }
+      }
+    } else if (kDepth > kPacketSize) {
+      // Calculate vectorized and scalar (residual) lengths for
+      // 'depth_multiplier'. This is used to efficiently replicate data for
+      // when 'depth_multiplier' > kPacketSize.
+      const int64 dm_vectorized_size = (kDepth / kPacketSize) * kPacketSize;
+
       for (int64 f_r = 0; f_r < args.filter_rows; ++f_r) {
         const int64 in_r = in_r_start + f_r;
 
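To make the 'limit' adjustment in the hunk above concrete (a worked example, not from the commit): with 8-lane float packets and kDepth = 3, each pstoreu writes 8 elements into a slot that owns only 3, spilling 5 elements forward. Inside the row that spill is harmless, since the next iteration's store overwrites it, but on the last filter column no later store repairs it, so the last (8 - 3) / 3 + 1 = 2 input elements fall back to the scalar copy.

The TODO refers to Eigen's ploaddup, which loads half a packet and duplicates each element in place ({a, a, b, b, ...}) — exactly a depth == 2 replication in a single load. A hypothetical inner loop using it, written in terms of the surrounding variables and not part of the commit, might look like:

  // Hypothetical depth == 2 fast path: each ploaddup replicates
  // kPacketSize / 2 consecutive input elements in one load.
  for (int64 d = 0; d + kPacketSize / 2 <= limit; d += kPacketSize / 2) {
    const auto p = Eigen::internal::ploaddup<Packet>(in + d);
    Eigen::internal::pstoreu<T>(in_buf, p);
    in_buf += kPacketSize;
  }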
@@ -229,48 +277,57 @@ struct DepthwiseInputCopyOp {
 
           if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 &&
               in_c < args.in_cols) {
-            auto* in = input + (in_r * args.in_cols + in_c) * args.in_depth;
+            const auto* in =
+                input + (in_r * args.in_cols + in_c) * args.in_depth;
             // Copy vectorized portion of inner dimension.
-            for (int64 d = 0; d < input_vectorized_size; d += kPacketSize) {
-              auto v = Eigen::internal::ploadu<Packet>(in + d);
-              for (int dm = 0; dm < args.depth_multiplier; ++dm) {
-                Eigen::internal::pscatter<T, Packet>(in_buf + dm, v,
-                                                     args.depth_multiplier);
+            for (int64 d = 0; d < args.in_depth; d++) {
+              const auto p = Eigen::internal::pset1<Packet>(in[d]);
+              for (int64 dm = 0; dm < dm_vectorized_size; dm += kPacketSize) {
+                Eigen::internal::pstoreu<T>(in_buf + dm, p);
               }
-              in_buf += replicated_packet_size;
+              // Overlapping store for the remainder.
+              Eigen::internal::pstoreu<T>(in_buf + kDepth - kPacketSize, p);
+              in_buf += kDepth;
             }
+            // Pad the remainder of the output to vector register boundary.
+            for (int64 d = 0; d < output_pad_size; ++d) {
+              in_buf[d] = static_cast<T>(0);
+            }
+            in_buf += output_pad_size;
+          } else {
+            // Zero pad.
+            memset(in_buf, 0, sizeof(T) * padded_filter_inner_dim_size);
+            in_buf += padded_filter_inner_dim_size;
+          }
+        }
+      }
+    } else if (kDepth == 1) {
+      for (int64 f_r = 0; f_r < args.filter_rows; ++f_r) {
+        const int64 in_r = in_r_start + f_r;
 
-            // Copy scalar portion of inner dimension.
+        for (int64 f_c = 0; f_c < args.filter_cols; ++f_c) {
+          const int64 in_c = in_c_start + f_c;
+
+          if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 &&
+              in_c < args.in_cols) {
+            const auto* in =
+                input + (in_r * args.in_cols + in_c) * args.in_depth;
+            for (int64 d = 0; d < input_vectorized_size; d += kPacketSize) {
+              const auto p = Eigen::internal::ploadu<Packet>(in + d);
+              Eigen::internal::pstoreu<T>(in_buf, p);
+              in_buf += kPacketSize;
+            }
             for (int64 d = 0; d < input_scalar_size; ++d) {
               T v = in[input_vectorized_size + d];
-              const int64 base = d * args.depth_multiplier;
-              if (dm_vectorized_size > 0) {
-                // Copy vectorized portion of replicated output.
-                // This branch is only taken if 'args.depth_multiplier' is
-                // vectorizable (i.e. args.depth_multiplier >= register width).
-                auto p = Eigen::internal::pset1<Packet>(v);
-                for (int64 dm = 0; dm < dm_vectorized_size; dm += kPacketSize) {
-                  Eigen::internal::pstoreu<T>(in_buf + base + dm, p);
-                }
-                // Copy scalar portion of replicated output.
-                for (int64 dm = 0; dm < dm_scalar_size; ++dm) {
-                  in_buf[base + dm_vectorized_size + dm] = v;
-                }
-              } else {
-                // Depth multiplier is less than one packet: scalar copy.
-                for (int dm = 0; dm < args.depth_multiplier; ++dm) {
-                  in_buf[base + dm] = v;
-                }
-              }
+              in_buf[d] = v;
             }
-            in_buf += input_scalar_size * args.depth_multiplier;
+            in_buf += input_scalar_size;
 
             // Pad the remainder of the output to vector register boundary.
             for (int64 d = 0; d < output_pad_size; ++d) {
               in_buf[d] = static_cast<T>(0);
             }
             in_buf += output_pad_size;
           } else {
             // Zero pad.
             memset(in_buf, 0, sizeof(T) * padded_filter_inner_dim_size);
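As a sanity check that the two strategies produce the same layout, a small standalone harness — hypothetical, not part of the commit, and assuming Eigen is on the include path — can fill one buffer with the old ploadu + pscatter pattern and another with the new overlapping stores, then compare the replicated region:

  #include <Eigen/Core>
  #include <cassert>
  #include <vector>

  int main() {
    typedef Eigen::internal::packet_traits<float>::type Packet;
    const int kPacketSize = Eigen::internal::packet_traits<float>::size;
    const int depth = 3;               // stand-in for args.depth_multiplier
    const int in_depth = kPacketSize;  // one full packet of input

    // The overlapping-store trick assumes depth <= packet width.
    if (depth > kPacketSize) return 0;

    std::vector<float> in(in_depth);
    for (int i = 0; i < in_depth; ++i) in[i] = static_cast<float>(i);

    // Old pattern: scatter each packet lane with stride 'depth'.
    std::vector<float> a(in_depth * depth + kPacketSize, 0.f);
    const Packet v = Eigen::internal::ploadu<Packet>(in.data());
    for (int dm = 0; dm < depth; ++dm) {
      Eigen::internal::pscatter<float, Packet>(a.data() + dm, v, depth);
    }

    // New pattern: broadcast each element and store a whole packet;
    // overlapping stores resolve to the same layout, and the final
    // spill lands in the one-packet slack at the end of the buffer.
    std::vector<float> b(in_depth * depth + kPacketSize, 0.f);
    float* out = b.data();
    for (int d = 0; d < in_depth; ++d) {
      const Packet p = Eigen::internal::pset1<Packet>(in[d]);
      Eigen::internal::pstoreu<float>(out, p);
      out += depth;
    }

    // The replicated region must match element for element.
    for (int i = 0; i < in_depth * depth; ++i) assert(a[i] == b[i]);
    return 0;
  }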
@@ -279,6 +336,7 @@ struct DepthwiseInputCopyOp {
         }
       }
+    }
   }
 };
 
 } // namespace functor