From 810b4541693f7aa8afd7eeed466e2b307ad3c1bd Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Sun, 19 May 2019 08:30:28 -0700
Subject: [PATCH] [XLA:CPU] Don't create more shards than the max allowed
 parallelism.

The max allowed parallelism may be as low as 1, for example, in which case
creating shards is unnecessary and unhelpful.

PiperOrigin-RevId: 248941468
---
 .../compiler/xla/service/cpu/parallel_task_assignment.cc     | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
index 234fa91fe3e..b894bf502ca 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
@@ -74,8 +74,9 @@ class DefaultCostModel : public ParallelCostModel {
       // Limit max parallelism for I/O bound instructions by assuming a
       // sub-linear scaling function (fit based on empirical benchmark results).
       // TODO(b/29630486) Develop system bandwidth model.
-      max_parallelism =
-          std::ceil(std::sqrt(tensorflow::port::NumSchedulableCPUs()));
+      max_parallelism = std::min<int64>(
+          max_parallelism_,
+          std::ceil(std::sqrt(tensorflow::port::NumSchedulableCPUs())));
       // Use shape size instruction cost and L2 cache size min per-thread cost.
       instruction_cost = shape_size_(instruction->shape());
       min_cost_per_thread = 256LL << 10;  // 256KB L2 Cache size.