[tf.unique()] Optimize the hash table implementation in UniqueOp::Compute().

This change makes two improvements to the `UniqueOp` implementation:

1. Use `absl::flat_hash_map` instead of `std::unordered_map`.
2. For the `tstring` implementation, key the hash map by `absl::string_view` (TensorFlow's `StringPiece`) instead of `tstring`, which avoids copying the input strings into the map (see the sketch after this list).
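
The core of both improvements is the dedup loop over a hash map. Below is a minimal, self-contained sketch of the `tstring` case, not the kernel code itself (the helper name `UniqueIndices` is illustrative): keying an `absl::flat_hash_map` by `absl::string_view` means the map stores views into the input buffer rather than string copies, which is safe because the input outlives the map.

```cpp
#include <cstdint>
#include <string>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/strings/string_view.h"

// Illustrative standalone helper (not the kernel code): for each input string,
// computes the index of its value in the deduplicated output. The map is keyed
// by string_view, so no string data is copied; the views point into `inputs`,
// which outlives `uniq` -- the same lifetime relationship the kernel relies on,
// since the input tensor outlives the per-Compute() map.
std::vector<int32_t> UniqueIndices(const std::vector<std::string>& inputs) {
  absl::flat_hash_map<absl::string_view, int32_t> uniq;
  uniq.reserve(2 * inputs.size());  // mirrors the kernel's reserve(2 * N)
  std::vector<int32_t> idx(inputs.size());
  int32_t next_id = 0;
  for (size_t i = 0; i < inputs.size(); ++i) {
    auto it = uniq.emplace(absl::string_view(inputs[i]), next_id);
    idx[i] = it.first->second;   // id of the unique value for inputs[i]
    if (it.second) ++next_id;    // a new unique element was inserted
  }
  return idx;
}
```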

In addition, this change switches the microbenchmarks in unique_op_test.cc to use the `SINGLE_THREADED_EXECUTOR`, which removes thread-scheduling overhead from the microbenchmarks and reduces noise in the results.
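
Concretely, each benchmark now fixes up the hand-built graph and passes the executor name to `test::Benchmark`. A condensed excerpt of the pattern from the diff below (the graph construction and BM_* boilerplate are elided; `g` is the benchmark graph and `iters` the iteration count supplied by the benchmark harness):

```cpp
// Condensed from the updated benchmarks in unique_op_test.cc below. The last
// constructor argument names the executor; "SINGLE_THREADED_EXECUTOR" is
// provided by //tensorflow/core/kernels/data:single_threaded_executor, hence
// the new dependency in the test target.
FixupSourceAndSinkEdges(g);  // connect the hand-built graph to SOURCE/SINK nodes
testing::UseRealTime();
testing::StartTiming();
test::Benchmark("cpu", g, nullptr, nullptr, nullptr, "SINGLE_THREADED_EXECUTOR")
    .Run(iters);
```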

Microbenchmark results show savings of between 0% and 65% on BM_Unique_INT32, between 8% and 26% on BM_Unique_INT32_Repeat, and between 17% and 40% on BM_Unique_STRING.

PiperOrigin-RevId: 307647292
Change-Id: If4367df37b856bf1c4cf91fcb34eea479014077f
Author: Derek Murray, 2020-04-21 11:35:19 -07:00 (committed by TensorFlower Gardener)
Commit: 37d16f759a (parent: e2bb4b2acd)
3 changed files with 61 additions and 30 deletions

File: tensorflow/core/kernels/BUILD

@@ -1371,7 +1371,9 @@ tf_kernel_library(
tf_kernel_library(
name = "unique_op",
prefix = "unique_op",
deps = ARRAY_DEPS,
deps = ARRAY_DEPS + [
"@com_google_absl//absl/container:flat_hash_map",
],
)
tf_kernel_library(
@@ -2335,6 +2337,7 @@ tf_cc_test(
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"//tensorflow/core:testlib",
"//tensorflow/core/kernels/data:single_threaded_executor",
],
)

File: tensorflow/core/kernels/unique_op.cc

@@ -17,6 +17,7 @@ limitations under the License.
#include <unordered_map>
#include <utility>
#include "absl/container/flat_hash_map.h"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
@@ -26,10 +27,19 @@ limitations under the License.
#include "tensorflow/core/lib/hash/hash.h"
namespace tensorflow {
namespace {
typedef Eigen::ThreadPoolDevice CPUDevice;
template <typename T, typename TIndex>
// `UniqueOp` computes the unique elements in the input tensor.
//
// * `T` is the element type.
// * `TKey` is the key type used in a local hash map. It must be explicitly
// convertible from `T`. For POD inputs, `TKey = T`. For `tstring` inputs,
// `TKey = absl::string_view` avoids copying the input strings into the map.
// * `TIndex` is the type used to represent indices in the output, either
// `int32` or `int64`.
template <typename T, typename TKey, typename TIndex>
class UniqueOp : public OpKernel {
public:
explicit UniqueOp(OpKernelConstruction* context) : OpKernel(context) {}
@@ -106,10 +116,10 @@ class UniqueOp : public OpKernel {
auto Tin = input.flat<T>();
const int64 N = static_cast<int64>(Tin.size());
std::unordered_map<T, TIndex> uniq;
absl::flat_hash_map<TKey, TIndex> uniq;
uniq.reserve(2 * N);
for (Eigen::Index i = 0, j = 0; i < N; ++i) {
auto it = uniq.insert(std::make_pair(Tin(i), j));
auto it = uniq.emplace(TKey(Tin(i)), j);
idx_vec(i) = it.first->second;
if (it.second) {
++j;
@@ -153,13 +163,14 @@ class UniqueOp : public OpKernel {
return true;
};
std::unordered_map<int64, int64, decltype(hash_fn), decltype(equal_to_fn)>
absl::flat_hash_map<int64, int64, decltype(hash_fn),
decltype(equal_to_fn)>
uniq(0, hash_fn, equal_to_fn);
uniq.reserve(2 * Tin.dimension(1));
for (int64 i = 0, j = 0; i < Tin.dimension(1); ++i) {
auto it = uniq.insert(std::make_pair(i, j));
auto it = uniq.emplace(i, j);
idx_vec(i) = it.first->second;
if (it.second) {
++j;
@@ -194,51 +205,56 @@ class UniqueOp : public OpKernel {
}
};
#define REGISTER_UNIQUE(type) \
#define REGISTER_UNIQUE_WITH_KEY_TYPE(type, key_type) \
REGISTER_KERNEL_BUILDER(Name("Unique") \
.Device(DEVICE_CPU) \
.TypeConstraint<type>("T") \
.TypeConstraint<int32>("out_idx"), \
UniqueOp<type, int32>); \
UniqueOp<type, key_type, int32>); \
REGISTER_KERNEL_BUILDER(Name("Unique") \
.Device(DEVICE_CPU) \
.TypeConstraint<type>("T") \
.TypeConstraint<int64>("out_idx"), \
UniqueOp<type, int64>); \
UniqueOp<type, key_type, int64>); \
REGISTER_KERNEL_BUILDER(Name("UniqueV2") \
.Device(DEVICE_CPU) \
.TypeConstraint<type>("T") \
.TypeConstraint<int32>("out_idx"), \
UniqueOp<type, int32>); \
UniqueOp<type, key_type, int32>); \
REGISTER_KERNEL_BUILDER(Name("UniqueV2") \
.Device(DEVICE_CPU) \
.TypeConstraint<type>("T") \
.TypeConstraint<int64>("out_idx"), \
UniqueOp<type, int64>); \
UniqueOp<type, key_type, int64>); \
REGISTER_KERNEL_BUILDER(Name("UniqueWithCounts") \
.Device(DEVICE_CPU) \
.TypeConstraint<type>("T") \
.TypeConstraint<int32>("out_idx"), \
UniqueOp<type, int32>) \
UniqueOp<type, key_type, int32>) \
REGISTER_KERNEL_BUILDER(Name("UniqueWithCounts") \
.Device(DEVICE_CPU) \
.TypeConstraint<type>("T") \
.TypeConstraint<int64>("out_idx"), \
UniqueOp<type, int64>); \
UniqueOp<type, key_type, int64>); \
REGISTER_KERNEL_BUILDER(Name("UniqueWithCountsV2") \
.Device(DEVICE_CPU) \
.TypeConstraint<type>("T") \
.TypeConstraint<int32>("out_idx"), \
UniqueOp<type, int32>) \
UniqueOp<type, key_type, int32>) \
REGISTER_KERNEL_BUILDER(Name("UniqueWithCountsV2") \
.Device(DEVICE_CPU) \
.TypeConstraint<type>("T") \
.TypeConstraint<int64>("out_idx"), \
UniqueOp<type, int64>)
TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE);
REGISTER_UNIQUE(tstring)
REGISTER_UNIQUE(bool)
#undef REGISTER_UNIQUE
UniqueOp<type, key_type, int64>)
#define REGISTER_UNIQUE_WITH_SAME_KEY_TYPE(type) \
REGISTER_UNIQUE_WITH_KEY_TYPE(type, type)
TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE_WITH_SAME_KEY_TYPE);
REGISTER_UNIQUE_WITH_SAME_KEY_TYPE(bool)
#undef REGISTER_UNIQUE_WITH_SAME_KEY_TYPE
REGISTER_UNIQUE_WITH_KEY_TYPE(tstring, absl::string_view)
#undef REGISTER_UNIQUE_WITH_KEY_TYPE
// Fake integer GPU kernels so that the use of Unique in optimizers (to
// de-duplicate sparse gradient indices) does not conflict with gradients being
@@ -251,7 +267,7 @@ REGISTER_KERNEL_BUILDER(Name("Unique")
.HostMemory("x")
.HostMemory("y")
.HostMemory("idx"),
UniqueOp<int32, int32>);
UniqueOp<int32, int32, int32>);
REGISTER_KERNEL_BUILDER(Name("Unique")
.Device(DEVICE_GPU)
.TypeConstraint<int32>("T")
@@ -259,7 +275,7 @@ REGISTER_KERNEL_BUILDER(Name("Unique")
.HostMemory("x")
.HostMemory("y")
.HostMemory("idx"),
UniqueOp<int32, int64>);
UniqueOp<int32, int32, int64>);
REGISTER_KERNEL_BUILDER(Name("Unique")
.Device(DEVICE_GPU)
.TypeConstraint<int64>("T")
@@ -267,7 +283,7 @@ REGISTER_KERNEL_BUILDER(Name("Unique")
.HostMemory("x")
.HostMemory("y")
.HostMemory("idx"),
UniqueOp<int64, int32>);
UniqueOp<int64, int64, int32>);
REGISTER_KERNEL_BUILDER(Name("Unique")
.Device(DEVICE_GPU)
.TypeConstraint<int64>("T")
@@ -275,7 +291,7 @@ REGISTER_KERNEL_BUILDER(Name("Unique")
.HostMemory("x")
.HostMemory("y")
.HostMemory("idx"),
UniqueOp<int64, int64>);
UniqueOp<int64, int64, int64>);
#ifdef TENSORFLOW_USE_SYCL
REGISTER_KERNEL_BUILDER(Name("Unique")
@@ -285,7 +301,7 @@ REGISTER_KERNEL_BUILDER(Name("Unique")
.HostMemory("x")
.HostMemory("y")
.HostMemory("idx"),
UniqueOp<int32, int32>);
UniqueOp<int32, int32, int32>);
REGISTER_KERNEL_BUILDER(Name("Unique")
.Device(DEVICE_SYCL)
.TypeConstraint<int64>("T")
@@ -293,7 +309,7 @@ REGISTER_KERNEL_BUILDER(Name("Unique")
.HostMemory("x")
.HostMemory("y")
.HostMemory("idx"),
UniqueOp<int64, int32>);
UniqueOp<int64, int64, int32>);
REGISTER_KERNEL_BUILDER(Name("Unique")
.Device(DEVICE_SYCL)
.TypeConstraint<int32>("T")
@@ -301,7 +317,7 @@ REGISTER_KERNEL_BUILDER(Name("Unique")
.HostMemory("x")
.HostMemory("y")
.HostMemory("idx"),
UniqueOp<int32, int64>);
UniqueOp<int32, int32, int64>);
REGISTER_KERNEL_BUILDER(Name("Unique")
.Device(DEVICE_SYCL)
.TypeConstraint<int64>("T")
@@ -309,6 +325,8 @@ REGISTER_KERNEL_BUILDER(Name("Unique")
.HostMemory("x")
.HostMemory("y")
.HostMemory("idx"),
UniqueOp<int64, int64>);
UniqueOp<int64, int64, int64>);
#endif // TENSORFLOW_USE_SYCL
} // namespace
} // namespace tensorflow

File: tensorflow/core/kernels/unique_op_test.cc

@@ -22,6 +22,7 @@ limitations under the License.
#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/node_builder.h"
#include "tensorflow/core/graph/testlib.h"
#include "tensorflow/core/kernels/ops_testutil.h"
@@ -75,11 +76,14 @@ static void BM_Unique_INT32(int iters, int dim, int max_int) {
.Input(test::graph::Constant(g, input))
.Attr("T", DT_INT32)
.Finalize(g, &node));
FixupSourceAndSinkEdges(g);
testing::BytesProcessed(static_cast<int64>(iters) * dim * sizeof(int32));
testing::UseRealTime();
testing::StartTiming();
test::Benchmark("cpu", g).Run(iters);
test::Benchmark("cpu", g, nullptr, nullptr, nullptr,
"SINGLE_THREADED_EXECUTOR")
.Run(iters);
}
static void BM_Unique_INT32_Repeat(int iters, int dim, int max_int) {
@@ -95,12 +99,15 @@ static void BM_Unique_INT32_Repeat(int iters, int dim, int max_int) {
.Input(test::graph::Constant(g, input))
.Attr("T", DT_INT32)
.Finalize(g, &node));
FixupSourceAndSinkEdges(g);
testing::BytesProcessed(static_cast<int64>(iters) * dim * 200 *
sizeof(int32));
testing::UseRealTime();
testing::StartTiming();
test::Benchmark("cpu", g).Run(iters);
test::Benchmark("cpu", g, nullptr, nullptr, nullptr,
"SINGLE_THREADED_EXECUTOR")
.Run(iters);
}
TensorProto GetRandomStringsTensorProto(int dim, int max_str_len) {
@@ -132,11 +139,14 @@ static void BM_Unique_STRING(int iters, int dim) {
.Input(test::graph::Constant(g, input))
.Attr("T", DT_STRING)
.Finalize(g, &node));
FixupSourceAndSinkEdges(g);
testing::BytesProcessed(static_cast<int64>(iters) * dim * sizeof(tstring));
testing::UseRealTime();
testing::StartTiming();
test::Benchmark("cpu", g).Run(iters);
test::Benchmark("cpu", g, nullptr, nullptr, nullptr,
"SINGLE_THREADED_EXECUTOR")
.Run(iters);
}
BENCHMARK(BM_Unique_INT32)