Optimize OneHot op on CPU

PiperOrigin-RevId: 276122680
Change-Id: I17a72abf2818223dee4fb86517f94ebf5c045309
This commit is contained in:
Eugene Zhulenev 2019-10-22 12:44:51 -07:00 committed by TensorFlower Gardener
parent 6037196225
commit 13772b2e69
3 changed files with 162 additions and 0 deletions

View File

@ -2159,6 +2159,25 @@ tf_cc_test(
],
)
# Unit-test / benchmark target for the CPU OneHot kernel
# (micro-benchmarks live in one_hot_op_test.cc).
tf_cc_test(
name = "one_hot_op_test",
size = "small",
srcs = ["one_hot_op_test.cc"],
deps = [
# Kernel under test.
":one_hot_op",
":ops_testutil",
":ops_util",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"//tensorflow/core:testlib",
# NOTE(review): cudnn_plugin mirrors sibling kernel-test targets; the
# test itself only benchmarks the CPU path -- confirm this dep is needed.
"//tensorflow/stream_executor/cuda:cudnn_plugin",
],
)
tf_cc_test(
name = "reverse_op_test",
size = "small",

View File

@ -19,12 +19,18 @@ limitations under the License.
#define TENSORFLOW_CORE_KERNELS_ONE_HOT_OP_H_
// Generator definition for OneHotOp, must be compilable by nvcc.
#define EIGEN_USE_THREADS
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
namespace generator {
template <typename T, typename TI>
@ -65,6 +71,53 @@ struct OneHot {
}
};
// CPU specialization: instead of evaluating a generator expression for every
// output coefficient, pre-fill the output with `off_value` and then write
// only the (few) `on_value` coefficients selected by `indices`.
template <typename T, typename TI>
struct OneHot<CPUDevice, T, TI> {
  EIGEN_ALWAYS_INLINE static void Compute(
      const CPUDevice& d, const typename TTypes<TI>::ConstMatrix& indices,
      const typename TTypes<T>::ConstScalar& on_value,
      const typename TTypes<T>::ConstScalar& off_value,
      typename TTypes<T, 3>::Tensor* output) {
    // Step 1: bulk-fill the whole output with `off_value`.
    output->device(d) = output->constant(off_value());

    // Output is viewed as [prefix, depth, suffix]; `indices` is
    // [prefix, suffix] and selects one coordinate along the depth dimension.
    const Eigen::Index prefix_size = output->dimensions()[0];
    const Eigen::Index depth_size = output->dimensions()[1];
    const Eigen::Index suffix_size = output->dimensions()[2];

    // Cost model for one `on_value` write: load + store of a single T.
    const Eigen::TensorOpCost cost(/*bytes_loaded=*/sizeof(T),
                                   /*bytes_stored=*/sizeof(T),
                                   /*compute_cycles=*/0.0);

    if (suffix_size == 1) {
      // Fast path: depth is the innermost dimension, so each row of
      // `indices` maps to exactly one output coefficient.
      d.parallelFor(
          prefix_size, cost,
          [&](Eigen::Index first, Eigen::Index last) {
            for (Eigen::Index row = first; row < last; ++row) {
              const TI depth = internal::SubtleMustCopy(indices(row, 0));
              // Out-of-range indices are left as `off_value`.
              if (FastBoundsCheck(depth, depth_size)) {
                (*output)(row, depth, 0) = on_value();
              }
            }
          });
    } else {
      // General path: shard over all prefix*suffix index entries and
      // recover the (outer, inner) coordinates from the flat index.
      d.parallelFor(
          prefix_size * suffix_size, cost * suffix_size,
          [&](Eigen::Index first, Eigen::Index last) {
            for (Eigen::Index i = first; i < last; ++i) {
              const Eigen::Index outer = i / suffix_size;
              const Eigen::Index inner = i - outer * suffix_size;
              const TI depth = internal::SubtleMustCopy(indices(outer, inner));
              if (FastBoundsCheck(depth, depth_size)) {
                (*output)(outer, depth, inner) = on_value();
              }
            }
          });
    }
  }
};
} // namespace functor
} // namespace tensorflow

View File

@ -0,0 +1,90 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <random>
#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"
namespace tensorflow {
// Builds a graph containing a single OneHot op: `batch_size` random class
// ids in [0, num_classes) are one-hot encoded along `axis` with
// on_value = 1.0f and off_value = 0.0f. Caller owns the returned Graph.
static Graph* OneHot(int batch_size, int num_classes, int axis) {
  Graph* g = new Graph(OpRegistry::Global());

  Tensor indices(DT_INT32, TensorShape({batch_size}));
  // Fixed seed: benchmark inputs must be deterministic, otherwise results
  // from different runs are not comparable. (std::random_device would
  // reseed on every run.)
  std::mt19937 gen(42);
  std::uniform_int_distribution<> dist(0, num_classes - 1);
  auto indices_t = indices.flat<int32>();
  for (int i = 0; i < batch_size; ++i) {
    indices_t(i) = dist(gen);
  }

  Tensor depth(DT_INT32, TensorShape({}));
  depth.scalar<int32>()() = num_classes;

  Tensor on_value(DT_FLOAT, TensorShape({}));
  on_value.scalar<float>()() = 1.0f;

  Tensor off_value(DT_FLOAT, TensorShape({}));
  off_value.scalar<float>()() = 0.0f;

  test::graph::Multi(g, "OneHot",
                     {
                         test::graph::Constant(g, indices),
                         test::graph::Constant(g, depth),
                         test::graph::Constant(g, on_value),
                         test::graph::Constant(g, off_value),
                     })
      ->AddAttr("axis", axis);

  return g;
}
// Registers benchmark BM_OneHot_<batch>_<classes>_<axis>_<device>, which runs
// the OneHot graph built above and reports BATCH * CLASS items processed per
// iteration.
#define BM_OneHot(BATCH, CLASS, AXIS, DEVICE) \
static void BM_OneHot##_##BATCH##_##CLASS##_##AXIS##_##DEVICE(int iters) { \
testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * CLASS); \
test::Benchmark(#DEVICE, OneHot(BATCH, CLASS, AXIS)).Run(iters); \
} \
BENCHMARK(BM_OneHot##_##BATCH##_##CLASS##_##AXIS##_##DEVICE);
// CPU
// axis=1: depth becomes the innermost output dimension -- presumably hits
// the suffix_size == 1 fast path in one_hot_op.h; confirm.
BM_OneHot(32, 512, 1, cpu);
BM_OneHot(64, 512, 1, cpu);
BM_OneHot(128, 512, 1, cpu);
BM_OneHot(32, 1024, 1, cpu);
BM_OneHot(64, 1024, 1, cpu);
BM_OneHot(128, 1024, 1, cpu);
BM_OneHot(32, 10000, 1, cpu);
BM_OneHot(64, 10000, 1, cpu);
BM_OneHot(128, 10000, 1, cpu);
// axis=0: depth is the outer dimension -- presumably exercises the strided
// (suffix_size > 1) path; confirm.
BM_OneHot(32, 512, 0, cpu);
BM_OneHot(64, 512, 0, cpu);
BM_OneHot(128, 512, 0, cpu);
BM_OneHot(32, 1024, 0, cpu);
BM_OneHot(64, 1024, 0, cpu);
BM_OneHot(128, 1024, 0, cpu);
BM_OneHot(32, 10000, 0, cpu);
BM_OneHot(64, 10000, 0, cpu);
BM_OneHot(128, 10000, 0, cpu);
} // end namespace tensorflow