From 810b4541693f7aa8afd7eeed466e2b307ad3c1bd Mon Sep 17 00:00:00 2001
From: Justin Lebar
Date: Sun, 19 May 2019 08:30:28 -0700
Subject: [PATCH] [XLA:CPU] Don't create more shards than the max allowed parallelism.

The max allowed parallelism might e.g. be 1, in which case creating shards is unnecessary and unhelpful.

PiperOrigin-RevId: 248941468
---
 .../compiler/xla/service/cpu/parallel_task_assignment.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
index 234fa91fe3e..b894bf502ca 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
@@ -74,8 +74,9 @@ class DefaultCostModel : public ParallelCostModel {
       // Limit max parallelism for I/O bound instructions by assuming a
       // sub-linear scaling function (fit based on empirical benchmark results).
       // TODO(b/29630486) Develop system bandwidth model.
-      max_parallelism =
-          std::ceil(std::sqrt(tensorflow::port::NumSchedulableCPUs()));
+      max_parallelism = std::min<int64>(
+          max_parallelism_,
+          std::ceil(std::sqrt(tensorflow::port::NumSchedulableCPUs())));
       // Use shape size instruction cost and L2 cache size min per-thread cost.
       instruction_cost = shape_size_(instruction->shape());
       min_cost_per_thread = 256LL << 10;  // 256KB L2 Cache size.