Internal change

PiperOrigin-RevId: 276815994
Change-Id: I61be890666a04db4e4ef8c1a7bfce59e79836650
Authored by Jeffrey A. Dean on 2019-10-25 21:31:33 -07:00, committed by TensorFlower Gardener
parent 1799941f59
commit dea39f6b3d
3 changed files with 94 additions and 20 deletions


@@ -45,6 +45,55 @@ struct MemCpyCopier<ResourceHandle> {
}
};
template <typename T>
int64 EstimateBytesPerElement(
const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
inputs) {
return sizeof(T);
}
// EstimateBytesPerElement for strings estimates the total bytes involved in
// concatenating the strings in the "inputs" matrices (higher-level code
// reshapes all the inputs to matrices), by sampling the lengths of the actual
// strings in the various tensors.
template <>
int64 EstimateBytesPerElement<std::string>(
const std::vector<
std::unique_ptr<typename TTypes<std::string, 2>::ConstMatrix>>&
inputs) {
// Sample a few of the input strings at fixed positions to get a sense of the
// average size of each element
int num_samples = 0;
int64 num_bytes_in_samples = 0;
for (const auto& input : inputs) {
const auto dim0 = input->dimension(0);
const auto dim1 = input->dimension(1);
const auto zero = dim0 - dim0; // Make type match
if (dim0 > 0 && dim1 > 0) {
// Draw 9 samples of string sizes from the input, in this sort of pattern
// ("*" is sample), to get an estimate of the lengths of each string
// element in the tensors:
//
// *...*...*
// .........
// *...*...*
// .........
// *...*...*
for (auto i : {zero, dim0 / 2, dim0 - 1}) {
for (auto j : {zero, dim1 / 2, dim1 - 1}) {
num_bytes_in_samples += (*input)(i, j).size();
num_samples++;
}
}
}
}
// We don't use sizeof(std::string) as the overhead, since that would
// overestimate the memory touched for copying a string.
int64 string_overhead = sizeof(char*) + sizeof(size_t);
return string_overhead +
((num_samples > 0) ? (num_bytes_in_samples / num_samples) : 0);
}
} // namespace
template <typename T>
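To make the estimate concrete, here is a standalone sketch of the same corner/midpoint/center sampling (illustrative only: it is not part of the commit and uses a plain std::vector matrix rather than TTypes<T, 2>::ConstMatrix). On a typical 64-bit build the overhead term sizeof(char*) + sizeof(size_t) is 16 bytes, so a matrix whose sampled strings average 100 bytes yields roughly 116 bytes per element, rather than the flat 100000 cost used before this change.

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Same sampling pattern as the specialization above: the four corners, the
// four edge midpoints, and the center of the matrix (9 samples total).
int64_t EstimateStringBytesPerElementSketch(
    const std::vector<std::vector<std::string>>& matrix) {
  int num_samples = 0;
  int64_t num_bytes_in_samples = 0;
  const int64_t dim0 = static_cast<int64_t>(matrix.size());
  const int64_t dim1 = dim0 > 0 ? static_cast<int64_t>(matrix[0].size()) : 0;
  if (dim0 > 0 && dim1 > 0) {
    for (int64_t i : {int64_t{0}, dim0 / 2, dim0 - 1}) {
      for (int64_t j : {int64_t{0}, dim1 / 2, dim1 - 1}) {
        num_bytes_in_samples += static_cast<int64_t>(matrix[i][j].size());
        ++num_samples;
      }
    }
  }
  const int64_t string_overhead = sizeof(char*) + sizeof(size_t);
  return string_overhead +
         (num_samples > 0 ? num_bytes_in_samples / num_samples : 0);
}

int main() {
  // 3x3 matrix of 100-byte strings: prints 116 on a 64-bit build.
  std::vector<std::vector<std::string>> m(
      3, std::vector<std::string>(3, std::string(100, 'x')));
  std::cout << EstimateStringBytesPerElementSketch(m) << "\n";
}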
@@ -53,13 +102,8 @@ void ConcatCPU(
const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
inputs,
typename TTypes<T, 2>::Matrix* output) {
- if (std::is_same<T, string>::value) {
- // use a large cost here to force strings to be handled by separate threads
- ConcatCPUImpl<T>(d, inputs, 100000, MemCpyCopier<T>(), output);
- } else {
- ConcatCPUImpl<T>(d, inputs, sizeof(T) /* cost_per_unit */,
- MemCpyCopier<T>(), output);
- }
int64 cost_per_unit = EstimateBytesPerElement<T>(inputs);
ConcatCPUImpl<T>(d, inputs, cost_per_unit, MemCpyCopier<T>(), output);
}
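For non-string element types this call site ends up passing the same cost as before, since the generic EstimateBytesPerElement<T> simply returns sizeof(T); only the string path changes, replacing the hard-coded 100000 with the sampled estimate that the following file's ConcatCPUImpl converts into a total-cost-based thread count.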
#define REGISTER(T) \


@@ -45,14 +45,15 @@ void ConcatCPUImpl(
row_size += sizes.back();
}
// cost_per_unit is estimated bytes to copy per output array element (for
// strings this includes an estimate of the number of bytes of the actual
// string data, as well).
const int64 estimated_total_cost = output->size() * cost_per_unit;
auto worker_threads = d->tensorflow_cpu_worker_threads();
int num_threads = std::min(4, worker_threads->num_threads);
- // strings define a different amount of work (generally much more) compared
- // with standard POD, so we parallelize differently.
- if (!std::is_same<T, string>::value) {
- num_threads =
- static_cast<int>(std::min<int64>(num_threads, output->size() / 4096));
- }
num_threads = static_cast<int>(
std::min<int64>(num_threads, estimated_total_cost / 16384));
// Single threaded mode.
// TODO(dga): Deduplicate this code w.r.t. sharded code below.
if (num_threads == 0) {
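To see what the new heuristic does in practice, here is a standalone sketch with illustrative sizes (not code from the commit; the real path also caps the count by the device's worker-thread pool):

#include <algorithm>
#include <cstdint>
#include <iostream>

// Mirrors the num_threads computation above: at most 4 threads, and roughly
// 16 KB of estimated copying per thread before sharding is worthwhile.
int NumThreadsSketch(int64_t output_elements, int64_t cost_per_unit) {
  const int64_t estimated_total_cost = output_elements * cost_per_unit;
  int num_threads = 4;  // stand-in for std::min(4, worker_threads->num_threads)
  num_threads = static_cast<int>(
      std::min<int64_t>(num_threads, estimated_total_cost / 16384));
  return num_threads;
}

int main() {
  std::cout << NumThreadsSketch(1000000, 4) << "\n";  // floats, ~4 MB: 4 threads
  std::cout << NumThreadsSketch(1000, 4) << "\n";     // floats, ~4 KB: 0 (single-threaded)
  std::cout << NumThreadsSketch(1000, 116) << "\n";   // strings, ~116 KB: 4 threads
}

Previously, string concats always took the flat 100000 cost and skipped the size-based reduction, so even tiny string concats were dispatched to worker threads; with this change, small jobs of any element type fall into the single-threaded branch.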


@@ -35,10 +35,30 @@ limitations under the License.
namespace tensorflow {
namespace {
- // For the benchmark, we set up two 2-dimensional tensors, each kDim1 x 'dim'
- // in size, and concat them together along "concat_dimension"
template <typename T>
- static void ConcatHelper(int iters, int concat_dimension, int dim2) {
void FillTensorWithRandomValues(Tensor* t, int string_length, int64* bytes) {
t->flat<T>().setRandom();
*bytes = t->flat<T>().size() * sizeof(T);
}
template <>
void FillTensorWithRandomValues<std::string>(Tensor* t, int string_length,
int64* bytes) {
auto ts = t->flat<string>();
*bytes = 0;
for (int i = 0; i < ts.size(); i++) {
ts(i) = string(string_length, 'x');
*bytes += sizeof(ts(i)) + ts(i).size();
}
}
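Each string element's contribution counts both the in-object std::string footprint (sizeof(ts(i)), commonly 24-32 bytes on 64-bit standard libraries) and its character payload, so with string_length = 16 an element is on the order of 40-48 bytes. The benchmark below feeds these per-tensor totals to testing::BytesProcessed, so the reported throughput for the new string cases reflects the actual bytes touched rather than a sizeof(T)-based formula.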
// For the benchmark, we set up two 2-dimensional tensors, each kDim1 x 'dim2'
// in size, and concat them together along "concat_dimension". If T is
// std::string, then the individual strings in the tensors will have length
// "string_length".
template <typename T>
static void ConcatHelper(int iters, int concat_dimension, int dim2,
int string_length = 0) {
testing::StopTiming();
Graph* g = new Graph(OpRegistry::Global());
@@ -47,9 +67,10 @@ static void ConcatHelper(int iters, int concat_dimension, int dim2) {
Tensor concat_dim(DT_INT32, TensorShape({}));
concat_dim.scalar<int32>()() = concat_dimension;
Tensor in0(dt, TensorShape({kDim1, dim2}));
- in0.flat<T>().setRandom();
Tensor in1(dt, TensorShape({kDim1, dim2}));
- in1.flat<T>().setRandom();
int64 in0_bytes, in1_bytes;
FillTensorWithRandomValues<T>(&in0, string_length, &in0_bytes);
FillTensorWithRandomValues<T>(&in1, string_length, &in1_bytes);
Node* node;
TF_CHECK_OK(
@@ -60,8 +81,7 @@ static void ConcatHelper(int iters, int concat_dimension, int dim2) {
.Attr("T", dt)
.Finalize(g, &node));
- testing::BytesProcessed(static_cast<int64>(iters) *
- ((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(T));
testing::BytesProcessed(static_cast<int64>(iters) * (in0_bytes + in1_bytes));
testing::StartTiming();
test::Benchmark("cpu", g).Run(iters);
testing::UseRealTime();
@@ -78,6 +98,15 @@ static void BM_ConcatDim1Float(int iters, int dim2) {
BENCHMARK(BM_ConcatDim0Float)->Arg(1000)->Arg(100000)->Arg(1000000);
BENCHMARK(BM_ConcatDim1Float)->Arg(1000)->Arg(100000)->Arg(1000000);
static void BM_ConcatDim0String(int iters, int dim2, int string_length) {
ConcatHelper<string>(iters, 0, dim2, string_length);
}
BENCHMARK(BM_ConcatDim0String)
->ArgPair(1, 16)
->ArgPair(1, 10000)
->ArgPair(100, 16);
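The two ArgPair values map to (dim2, string_length): (1, 16) concatenates two kDim1 x 1 tensors of 16-character strings, (1, 10000) covers few but very long strings, and (100, 16) covers many short strings per row.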
static void BM_ConcatDim1uint8(int iters, int dim2) {
ConcatHelper<uint8>(iters, 1, dim2);
}