From 37d16f759ae9b0a665ad42f7e0d9f1ccade6bb9c Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Tue, 21 Apr 2020 11:35:19 -0700 Subject: [PATCH] [tf.unique()] Optimize the hash table implementation in `UniqueOp::Compute()`. This change makes two improvements to the `UniqueOp` implementation: 1. Use `absl::flat_hash_map` instead of `std::unordered_map`. 2. For the `tstring` implementation, use `StringPiece` as the key instead of `tstring`, which avoids copying the strings into the map. In addition, this change switches the microbenchmarks in unique_op_test.cc to use the SINGLE_THREADED_EXECUTOR, which removes thread scheduling overhead from the microbenchmark, and reduces noise in the results. Microbenchmark results show a saving of between 0% and 65% on BM_Unique_INT32, between 8% and 26% on BM_Unique_INT32_Repeat, and between 17% and 40% on BM_Unique_STRING. PiperOrigin-RevId: 307647292 Change-Id: If4367df37b856bf1c4cf91fcb34eea479014077f --- tensorflow/core/kernels/BUILD | 5 +- tensorflow/core/kernels/unique_op.cc | 70 ++++++++++++++--------- tensorflow/core/kernels/unique_op_test.cc | 16 +++++- 3 files changed, 61 insertions(+), 30 deletions(-) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 0510730cf07..201661235be 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -1371,7 +1371,9 @@ tf_kernel_library( tf_kernel_library( name = "unique_op", prefix = "unique_op", - deps = ARRAY_DEPS, + deps = ARRAY_DEPS + [ + "@com_google_absl//absl/container:flat_hash_map", + ], ) tf_kernel_library( @@ -2335,6 +2337,7 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", + "//tensorflow/core/kernels/data:single_threaded_executor", ], ) diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc index 8a9965fe16e..2c1505fe17c 100644 --- a/tensorflow/core/kernels/unique_op.cc +++ b/tensorflow/core/kernels/unique_op.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include +#include "absl/container/flat_hash_map.h" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -26,10 +27,19 @@ limitations under the License. #include "tensorflow/core/lib/hash/hash.h" namespace tensorflow { +namespace { typedef Eigen::ThreadPoolDevice CPUDevice; -template +// `UniqueOp` computes the unique elements in the input tensor. +// +// * `T` is the element type. +// * `TKey` is the key type used in a local hash map. It must be explicitly +// convertible from `T`. For POD inputs, `TKey = T`. For `tstring` inputs, +// `TKey = absl::string_view` avoids copying the input strings into the map. +// * `TIndex` is the type used to represent indices in the output, either +// `int32` or `int64`. +template class UniqueOp : public OpKernel { public: explicit UniqueOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -106,10 +116,10 @@ class UniqueOp : public OpKernel { auto Tin = input.flat(); const int64 N = static_cast(Tin.size()); - std::unordered_map uniq; + absl::flat_hash_map uniq; uniq.reserve(2 * N); for (Eigen::Index i = 0, j = 0; i < N; ++i) { - auto it = uniq.insert(std::make_pair(Tin(i), j)); + auto it = uniq.emplace(TKey(Tin(i)), j); idx_vec(i) = it.first->second; if (it.second) { ++j; @@ -153,13 +163,14 @@ class UniqueOp : public OpKernel { return true; }; - std::unordered_map + absl::flat_hash_map uniq(0, hash_fn, equal_to_fn); uniq.reserve(2 * Tin.dimension(1)); for (int64 i = 0, j = 0; i < Tin.dimension(1); ++i) { - auto it = uniq.insert(std::make_pair(i, j)); + auto it = uniq.emplace(i, j); idx_vec(i) = it.first->second; if (it.second) { ++j; @@ -194,51 +205,56 @@ class UniqueOp : public OpKernel { } }; -#define REGISTER_UNIQUE(type) \ +#define REGISTER_UNIQUE_WITH_KEY_TYPE(type, key_type) \ REGISTER_KERNEL_BUILDER(Name("Unique") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueOp); \ + UniqueOp); \ REGISTER_KERNEL_BUILDER(Name("Unique") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueOp); \ + UniqueOp); \ REGISTER_KERNEL_BUILDER(Name("UniqueV2") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueOp); \ + UniqueOp); \ REGISTER_KERNEL_BUILDER(Name("UniqueV2") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueOp); \ + UniqueOp); \ REGISTER_KERNEL_BUILDER(Name("UniqueWithCounts") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueOp) \ + UniqueOp) \ REGISTER_KERNEL_BUILDER(Name("UniqueWithCounts") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueOp); \ + UniqueOp); \ REGISTER_KERNEL_BUILDER(Name("UniqueWithCountsV2") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueOp) \ + UniqueOp) \ REGISTER_KERNEL_BUILDER(Name("UniqueWithCountsV2") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueOp) -TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE); -REGISTER_UNIQUE(tstring) -REGISTER_UNIQUE(bool) -#undef REGISTER_UNIQUE + UniqueOp) +#define REGISTER_UNIQUE_WITH_SAME_KEY_TYPE(type) \ + REGISTER_UNIQUE_WITH_KEY_TYPE(type, type) + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE_WITH_SAME_KEY_TYPE); +REGISTER_UNIQUE_WITH_SAME_KEY_TYPE(bool) +#undef REGISTER_UNIQUE_WITH_SAME_KEY_TYPE + +REGISTER_UNIQUE_WITH_KEY_TYPE(tstring, absl::string_view) +#undef REGISTER_UNIQUE_WITH_KEY_TYPE // Fake integer GPU kernels so that the use of Unique in optimizers (to // de-duplicate sparse gradient indices) does not conflict with gradients being @@ -251,7 +267,7 @@ REGISTER_KERNEL_BUILDER(Name("Unique") .HostMemory("x") .HostMemory("y") .HostMemory("idx"), - UniqueOp); + UniqueOp); REGISTER_KERNEL_BUILDER(Name("Unique") .Device(DEVICE_GPU) .TypeConstraint("T") @@ -259,7 +275,7 @@ REGISTER_KERNEL_BUILDER(Name("Unique") .HostMemory("x") .HostMemory("y") .HostMemory("idx"), - UniqueOp); + UniqueOp); REGISTER_KERNEL_BUILDER(Name("Unique") .Device(DEVICE_GPU) .TypeConstraint("T") @@ -267,7 +283,7 @@ REGISTER_KERNEL_BUILDER(Name("Unique") .HostMemory("x") .HostMemory("y") .HostMemory("idx"), - UniqueOp); + UniqueOp); REGISTER_KERNEL_BUILDER(Name("Unique") .Device(DEVICE_GPU) .TypeConstraint("T") @@ -275,7 +291,7 @@ REGISTER_KERNEL_BUILDER(Name("Unique") .HostMemory("x") .HostMemory("y") .HostMemory("idx"), - UniqueOp); + UniqueOp); #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("Unique") @@ -285,7 +301,7 @@ REGISTER_KERNEL_BUILDER(Name("Unique") .HostMemory("x") .HostMemory("y") .HostMemory("idx"), - UniqueOp); + UniqueOp); REGISTER_KERNEL_BUILDER(Name("Unique") .Device(DEVICE_SYCL) .TypeConstraint("T") @@ -293,7 +309,7 @@ REGISTER_KERNEL_BUILDER(Name("Unique") .HostMemory("x") .HostMemory("y") .HostMemory("idx"), - UniqueOp); + UniqueOp); REGISTER_KERNEL_BUILDER(Name("Unique") .Device(DEVICE_SYCL) .TypeConstraint("T") @@ -301,7 +317,7 @@ REGISTER_KERNEL_BUILDER(Name("Unique") .HostMemory("x") .HostMemory("y") .HostMemory("idx"), - UniqueOp); + UniqueOp); REGISTER_KERNEL_BUILDER(Name("Unique") .Device(DEVICE_SYCL) .TypeConstraint("T") @@ -309,6 +325,8 @@ REGISTER_KERNEL_BUILDER(Name("Unique") .HostMemory("x") .HostMemory("y") .HostMemory("idx"), - UniqueOp); + UniqueOp); #endif // TENSORFLOW_USE_SYCL + +} // namespace } // namespace tensorflow diff --git a/tensorflow/core/kernels/unique_op_test.cc b/tensorflow/core/kernels/unique_op_test.cc index 4861a45848c..a0249d9bc4c 100644 --- a/tensorflow/core/kernels/unique_op_test.cc +++ b/tensorflow/core/kernels/unique_op_test.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/graph/testlib.h" #include "tensorflow/core/kernels/ops_testutil.h" @@ -75,11 +76,14 @@ static void BM_Unique_INT32(int iters, int dim, int max_int) { .Input(test::graph::Constant(g, input)) .Attr("T", DT_INT32) .Finalize(g, &node)); + FixupSourceAndSinkEdges(g); testing::BytesProcessed(static_cast(iters) * dim * sizeof(int32)); testing::UseRealTime(); testing::StartTiming(); - test::Benchmark("cpu", g).Run(iters); + test::Benchmark("cpu", g, nullptr, nullptr, nullptr, + "SINGLE_THREADED_EXECUTOR") + .Run(iters); } static void BM_Unique_INT32_Repeat(int iters, int dim, int max_int) { @@ -95,12 +99,15 @@ static void BM_Unique_INT32_Repeat(int iters, int dim, int max_int) { .Input(test::graph::Constant(g, input)) .Attr("T", DT_INT32) .Finalize(g, &node)); + FixupSourceAndSinkEdges(g); testing::BytesProcessed(static_cast(iters) * dim * 200 * sizeof(int32)); testing::UseRealTime(); testing::StartTiming(); - test::Benchmark("cpu", g).Run(iters); + test::Benchmark("cpu", g, nullptr, nullptr, nullptr, + "SINGLE_THREADED_EXECUTOR") + .Run(iters); } TensorProto GetRandomStringsTensorProto(int dim, int max_str_len) { @@ -132,11 +139,14 @@ static void BM_Unique_STRING(int iters, int dim) { .Input(test::graph::Constant(g, input)) .Attr("T", DT_STRING) .Finalize(g, &node)); + FixupSourceAndSinkEdges(g); testing::BytesProcessed(static_cast(iters) * dim * sizeof(tstring)); testing::UseRealTime(); testing::StartTiming(); - test::Benchmark("cpu", g).Run(iters); + test::Benchmark("cpu", g, nullptr, nullptr, nullptr, + "SINGLE_THREADED_EXECUTOR") + .Run(iters); } BENCHMARK(BM_Unique_INT32)