Remove heuristic caps on parallelism that should now be handled by the cost model.

Adjust the cost model for FloatToBFloat16 and BFloat16ToFloat: they do not take 100 cycles per element.

This CL is a companion to cl/122779011, which makes the caps effective again even with the non-blocking threadpool.
Change: 123144919
A. Unique TensorFlower 2016-05-24 13:14:03 -08:00 committed by TensorFlower Gardener
parent 67ddfa5b34
commit 3669479261
3 changed files with 18 additions and 33 deletions
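
For context on the deletions below: the old code hard-coded thread counts (4 for the casts, 6 for Philox). With a per-unit cost estimate, the sharder can derive the shard count from the total amount of work instead. A minimal sketch of that idea follows; the kMinCostPerShard threshold is an assumption for illustration, and the real tensorflow::Shard (core/util/work_sharder.cc) differs in detail.

    #include <algorithm>
    #include <cstdint>
    #include <functional>

    // Sketch only: derive the shard count from estimated total cost,
    // capped by the thread count rather than by an ad-hoc constant.
    void ShardSketch(int max_parallelism, int64_t total, int64_t cost_per_unit,
                     const std::function<void(int64_t, int64_t)>& work) {
      constexpr int64_t kMinCostPerShard = 10000;  // assumed threshold
      const int64_t num_shards = std::max<int64_t>(
          1, std::min<int64_t>(max_parallelism,
                               total * cost_per_unit / kMinCostPerShard));
      const int64_t block = (total + num_shards - 1) / num_shards;
      for (int64_t start = 0; start < total; start += block) {
        // A real implementation enqueues each block on a thread pool.
        work(start, std::min<int64_t>(start + block, total));
      }
    }

Cheap or small workloads collapse to a single shard automatically, so the hand-tuned caps become unnecessary.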


@@ -150,19 +150,11 @@ class CpuCastOp : public CastOpBase {
     work_ = [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) {
       int64 N = out->NumElements();
       auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
-      int num_threads = static_cast<int>(std::min(
-          static_cast<int64>(std::min(4, worker_threads->num_threads)),
-          N / 4096));
-      if (num_threads < 1) {
-        BFloat16ToFloat(inp.flat<bfloat16>().data(),
-                        out->flat<float>().data(), N);
-      } else {
-        auto work = [&inp, &out](int64 start, int64 end) {
-          BFloat16ToFloat(inp.flat<bfloat16>().data() + start,
-                          out->flat<float>().data() + start, end - start);
-        };
-        Shard(num_threads, worker_threads->workers, N, 100, work);
-      }
+      auto work = [&inp, &out](int64 start, int64 end) {
+        BFloat16ToFloat(inp.flat<bfloat16>().data() + start,
+                        out->flat<float>().data() + start, end - start);
+      };
+      Shard(worker_threads->num_threads, worker_threads->workers, N, 2, work);
     };
     return Status::OK();
   }
@@ -170,19 +162,11 @@ class CpuCastOp : public CastOpBase {
     work_ = [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) {
       int64 N = out->NumElements();
       auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
-      int num_threads = static_cast<int>(std::min(
-          static_cast<int64>(std::min(4, worker_threads->num_threads)),
-          N / 4096));
-      if (num_threads < 1) {
-        FloatToBFloat16(inp.flat<float>().data(),
-                        out->flat<bfloat16>().data(), N);
-      } else {
-        auto work = [&inp, &out](int64 start, int64 end) {
-          FloatToBFloat16(inp.flat<float>().data() + start,
-                          out->flat<bfloat16>().data() + start, end - start);
-        };
-        Shard(num_threads, worker_threads->workers, N, 100, work);
-      }
+      auto work = [&inp, &out](int64 start, int64 end) {
+        FloatToBFloat16(inp.flat<float>().data() + start,
+                        out->flat<bfloat16>().data() + start, end - start);
+      };
+      Shard(worker_threads->num_threads, worker_threads->workers, N, 2, work);
     };
     return Status::OK();
   }
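
Why the per-element cost drops from 100 to 2: bfloat16 is simply the upper 16 bits of an IEEE-754 float32 (sign, 8-bit exponent, 7 mantissa bits), so each conversion is a shift and a 16-bit move. An illustrative sketch, not TensorFlow's exact implementation:

    #include <cstdint>
    #include <cstring>

    struct BF16 { uint16_t bits; };  // sign + 8-bit exponent + 7-bit mantissa

    inline BF16 FloatToBF16Sketch(float f) {
      uint32_t u;
      std::memcpy(&u, &f, sizeof(u));               // safe type pun
      return BF16{static_cast<uint16_t>(u >> 16)};  // truncate low mantissa bits
    }

    inline float BF16ToFloatSketch(BF16 b) {
      uint32_t u = static_cast<uint32_t>(b.bits) << 16;  // zero the low half
      float f;
      std::memcpy(&f, &u, sizeof(f));
      return f;
    }

A load, a shift, and a store cost on the order of 2 cycles per element, so the old estimate of 100 overstated the work by roughly 50x and skewed the sharding decision.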


@@ -123,7 +123,8 @@ void ConcatCPU(DeviceBase* d,
       }
     }
   };
-  Shard(num_threads, worker_threads->workers, output->size(), sizeof(T), work);
+  Shard(worker_threads->num_threads, worker_threads->workers, output->size(),
+        sizeof(T), work);
 }
 #define REGISTER(T) \
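
Here sizeof(T) serves as the per-element cost estimate: concatenation is a pure memory copy, so the work per element scales with its byte width rather than with any arithmetic.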


@@ -161,13 +161,11 @@ struct FillPhiloxRandom<CPUDevice, Distribution> {
     int64 total_group_count = (size + kGroupSize - 1) / kGroupSize;
-    // Limit to maximum six threads for now. The performance scaling is very
-    // sub-linear. Too many threads causes a much worse overall performance.
-    int num_workers = 6;
     const int kGroupCost =
         random::PhiloxRandom::kResultElementCount *
         (random::PhiloxRandom::kElementCost + Distribution::kElementCost);
-    Shard(num_workers, worker_threads.workers, total_group_count, kGroupCost,
+    Shard(worker_threads.num_threads, worker_threads.workers, total_group_count,
+          kGroupCost,
           [&gen, data, size, dist](int64 start_group, int64 limit_group) {
             FillPhiloxRandomTask<
                 Distribution,
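
To make kGroupCost concrete, a worked example with assumed values (the real constants live on random::PhiloxRandom and the Distribution types; the numbers below are illustrative, not from this diff):

    // Hypothetical values, for illustration only.
    constexpr int kResultElementCount = 4;  // elements per Philox invocation
    constexpr int kPhiloxElementCost = 10;  // rough cycles per raw element
    constexpr int kDistElementCost = 3;     // rough cycles for a cheap transform
    constexpr int kGroupCost =
        kResultElementCount * (kPhiloxElementCost + kDistElementCost);  // = 52

Filling 2^20 elements with a group size of 4 gives 262144 groups, so the sharder sees about 13.6M cycles of total work, easily enough to occupy every worker thread without the old six-thread limit.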
@@ -399,8 +397,10 @@ class MultinomialOp : public OpKernel {
                sizeof(int64) * num_samples);
       }
     };
-    Shard(std::min(batch_size, worker_threads.num_threads),
-          worker_threads.workers, batch_size, num_samples * num_classes * 2,
+    // Rough estimate, log2() takes from 58-680 cycles on Haswell.
+    // The functor here calls log twice for each element.
+    const int64 cost = 500 * num_samples * num_classes;
+    Shard(worker_threads.num_threads, worker_threads.workers, batch_size, cost,
           DoWork);
   }
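
A quick sanity check on the new multinomial estimate (numbers chosen for illustration): with num_samples = 64 and num_classes = 1000, cost = 500 * 64 * 1000 = 32,000,000 cycle-equivalents per batch element. The old estimate of num_samples * num_classes * 2 = 128,000 understated the same work by a factor of 250, which could lead the sharder to use too few threads; dropping the min(batch_size, worker_threads.num_threads) cap is safe because Shard never creates more shards than there are units of work.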