From bd20260350de4d0bc870fdc2bda39e62844a3e7b Mon Sep 17 00:00:00 2001
From: Sreeni Kesavarapu
Date: Tue, 2 Jun 2020 22:26:01 -0700
Subject: [PATCH] Implement GPU device priority for virtual devices.

- This adds a field in the virtual device options to specify a priority for
  each virtual device.
- When a priority is specified, it is used to create the CUDA streams for
  that virtual device with the given priority.
- This is backwards compatible: when no priorities are specified, streams
  continue to be created without an explicit priority, as before, while any
  non-zero priorities that are specified are used to create streams with
  those priorities.

PiperOrigin-RevId: 314470276
Change-Id: I4943f71e901245fb21b6f7e833adbdcd8126f1fa
---
 tensorflow/core/common_runtime/gpu/BUILD      |  10 +-
 .../core/common_runtime/gpu/gpu_device.cc     | 161 ++++++++++++++++--
 .../core/common_runtime/gpu/gpu_device.h     |   6 +
 .../common_runtime/gpu/gpu_device_test.cc     |  93 +++++++++-
 tensorflow/core/protobuf/config.proto         |  12 ++
 tensorflow/python/eager/context.py            |  24 ++-
 .../stream_executor/cuda/cuda_driver.cc       |  14 +-
 tensorflow/stream_executor/gpu/gpu_driver.h   |   3 +-
 tensorflow/stream_executor/gpu/gpu_stream.cc  |   3 +-
 tensorflow/stream_executor/gpu/gpu_stream.h   |   3 +
 .../stream_executor/rocm/rocm_driver.cc       |   8 +-
 .../golden/v1/tensorflow.-g-p-u-options.pbtxt |   6 +
 ...config.-logical-device-configuration.pbtxt |   4 +
 ...mental.-virtual-device-configuration.pbtxt |   4 +
 ...config.-logical-device-configuration.pbtxt |   4 +
 ...mental.-virtual-device-configuration.pbtxt |   4 +
 16 files changed, 331 insertions(+), 28 deletions(-)

diff --git a/tensorflow/core/common_runtime/gpu/BUILD b/tensorflow/core/common_runtime/gpu/BUILD
index 3744eb967d3..922a366fbfb 100644
--- a/tensorflow/core/common_runtime/gpu/BUILD
+++ b/tensorflow/core/common_runtime/gpu/BUILD
@@ -28,6 +28,10 @@ load(
     "if_static",
     "tf_cuda_tests_tags",
 )
+load(
+    "//tensorflow/stream_executor:build_defs.bzl",
+    "if_gpu_is_configured",
+)
 
 package(
     default_visibility = [
@@ -149,6 +153,7 @@ tf_cuda_library(
         ":gpu_id_impl",
         ":gpu_init_impl",
         ":gpu_lib",
+        "//third_party/eigen3",
         "//tensorflow/core:core_cpu_impl",
         "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:framework",
@@ -160,8 +165,9 @@ tf_cuda_library(
         "//tensorflow/core:stream_executor",
         "//tensorflow/core/profiler/lib:annotated_traceme",
         "//tensorflow/core/profiler/lib:scoped_annotation",
-        "//third_party/eigen3",
-    ],
+    ] + if_gpu_is_configured([
+        "//tensorflow/stream_executor/cuda:cuda_platform",
+    ]),
     alwayslink = 1,
 )
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index e11b079b7ec..e47f56b2624 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -73,6 +73,7 @@ limitations under the License.
#include "tensorflow/core/util/device_name_utils.h" #include "tensorflow/core/util/env_var.h" #include "tensorflow/core/util/stream_executor_util.h" +#include "tensorflow/stream_executor/gpu/gpu_stream.h" #include "tensorflow/stream_executor/platform/dso_loader.h" #if !defined(PLATFORM_GOOGLE) @@ -243,16 +244,18 @@ class BaseGPUDevice::StreamGroupFactory { StreamGroup* group = &streams_[key_type(tf_gpu_id.value(), stream_group_within_gpu)]; if (!group->compute) { - group->compute = new se::Stream(executor); + int priority = GetPriority(tf_gpu_id.value(), options); + group->priority = priority; + group->compute = GetStream(executor, priority); group->compute->Init(); VLOG(2) << "Created stream[" << stream_group_within_gpu - << "] = " << group->compute; + << "] = " << group->compute << " with priority: " << priority; #if TENSORFLOW_USE_ROCM // ROCm streams are lightweight and will not necessarily trigger device // queue init until they are first used. For optimal performance, // compute and nccl streams must be immediate siblings. - group->nccl = new se::Stream(executor); + group->nccl = GetStream(executor, priority); group->nccl->Init(); VLOG(2) << "Created nccl_stream[" << stream_group_within_gpu << "] = " << group->nccl; @@ -262,12 +265,12 @@ class BaseGPUDevice::StreamGroupFactory { group->nccl->ThenWaitFor(group->compute); #endif - group->host_to_device = new se::Stream(executor); + group->host_to_device = GetStream(executor, priority); group->host_to_device->Init(); VLOG(2) << "Created host_to_device_stream[" << stream_group_within_gpu << "] = " << group->host_to_device; - group->device_to_host = new se::Stream(executor); + group->device_to_host = GetStream(executor, priority); group->device_to_host->Init(); VLOG(2) << "Created device_to_host_stream[" << stream_group_within_gpu << "] = " << group->device_to_host; @@ -282,7 +285,7 @@ class BaseGPUDevice::StreamGroupFactory { num_d2d_streams = 1; } for (int i = 0; i < num_d2d_streams; ++i) { - se::Stream* stream = new se::Stream(executor); + se::Stream* stream = GetStream(executor, priority); stream->Init(); group->device_to_device.push_back(stream); VLOG(2) << "Created device_to_device_stream[" << stream_group_within_gpu @@ -299,7 +302,70 @@ class BaseGPUDevice::StreamGroupFactory { return *instance; } + // Helper method for unit tests to reset the streams. Never use in production. + void TestOnlyReset() { + mutex_lock guard(lock_); + for (auto& item : streams_) { + auto& stream = item.second; + if (stream.compute) { + delete stream.compute; + stream.compute = nullptr; + } +#if TENSORFLOW_USE_ROCM + if (stream.nccl) { + delete stream.nccl; + stream.nccl = nullptr; + } +#endif + if (stream.host_to_device) { + delete stream.host_to_device; + stream.host_to_device = nullptr; + } + if (stream.device_to_host) { + delete stream.device_to_host; + stream.device_to_host = nullptr; + } + while (!stream.device_to_device.empty()) { + auto back = stream.device_to_device.back(); + if (back) { + delete back; + } + stream.device_to_device.pop_back(); + } + } + streams_.clear(); + } + private: + // Returns priority for the given virtual GPU id from the session options. + // Returns 0 if no virtual devices are specified. 
+  int GetPriority(int tf_gpu_id, const GPUOptions& options) {
+    int id = tf_gpu_id;
+    int i = 0;
+    int priority = 0;
+    while (i < options.experimental().virtual_devices_size()) {
+      const int size =
+          options.experimental().virtual_devices().Get(i).priority_size();
+      if (id >= size) {
+        id -= size;
+      } else {
+        priority =
+            options.experimental().virtual_devices().Get(i).priority().Get(id);
+        break;
+      }
+      i++;
+    }
+    return priority;
+  }
+
+  // Returns a Stream with the underlying GPUStream with the given priority.
+  se::Stream* GetStream(se::StreamExecutor* executor, int priority) {
+    auto stream = new se::Stream(executor);
+    static_cast<stream_executor::gpu::GpuStream*>(stream->implementation())
+        ->SetPriority(priority);
+    return stream;
+  }
+
   mutex lock_;
   using key_type = std::tuple<int, int>;
   std::map<key_type, StreamGroup> streams_;
@@ -751,7 +817,8 @@ Status ParseVisibleDeviceList(const string& visible_device_list,
 Status VerifyVirtualDeviceSettings(
     const size_t num_gpus_to_use, const GPUOptions& gpu_options,
     const std::vector<PlatformGpuId>& visible_gpu_order,
-    const std::vector<PlatformGpuId>& valid_platform_gpu_ids) {
+    const std::vector<PlatformGpuId>& valid_platform_gpu_ids,
+    const std::map<int, std::pair<int, int>>& supported_priority_ranges) {
   const auto& virtual_devices = gpu_options.experimental().virtual_devices();
   CHECK(!virtual_devices.empty());
   if (gpu_options.per_process_gpu_memory_fraction() > 0) {
@@ -780,6 +847,63 @@ Status VerifyVirtualDeviceSettings(
         " #valid GPUs: ", valid_platform_gpu_ids.size(),
         " virtual_devices.size(): ", virtual_devices.size());
   }
+#if GOOGLE_CUDA
+  // Check memory_limit_mb and priority sizes match if priority is non-empty.
+  bool priority_exists = !virtual_devices.Get(0).priority().empty();
+  for (int i = 0; i < virtual_devices.size(); ++i) {
+    const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb();
+    const auto& priority = virtual_devices.Get(i).priority();
+    // If the priority is empty in the first one then treat this as having no
+    // priority set in any of the virtual devices for backward compatibility.
+    // Either it's set for all or none.
+    if (!priority_exists) {
+      if (!priority.empty()) {
+        return errors::InvalidArgument(
+            "Priority must be set for all virtual devices or none. But the "
+            "priority is specified for ",
+            i,
+            " while previous devices didn't "
+            "have any set.");
+      }
+    }
+    if (priority_exists && memory_limit_mb.size() != priority.size()) {
+      return errors::InvalidArgument(
+          "Number of virtual device priorities specified doesn't "
+          "match with number of memory_limit_mb specified for GPU# ",
+          i, " memory_limit_mb size: ", memory_limit_mb.size(),
+          " and priority size: ", priority.size());
+    }
+    const int gpu_id = valid_platform_gpu_ids[i].value();
+    auto it = supported_priority_ranges.find(gpu_id);
+    if (it == supported_priority_ranges.end()) {
+      return errors::Internal(
+          "Failed to find supported priority range for GPU"
+          " device ",
+          gpu_id);
+    }
+    const std::pair<int, int>& priority_range = it->second;
+    for (int p : priority) {
+      if (p > priority_range.first || p < priority_range.second) {
+        return errors::InvalidArgument(
+            "Priority ", p,
+            " is outside the range of supported priorities "
+            "[",
+            priority_range.second, ",", priority_range.first,
+            "] for virtual device ", i, " on GPU# ", gpu_id);
+      }
+    }
+  }
+#elif TENSORFLOW_USE_ROCM
+  for (int i = 0; i < virtual_devices.size(); ++i) {
+    if (!virtual_devices.Get(i).priority().empty()) {
+      return errors::InvalidArgument(
+          "Priority is supported only on Nvidia GPUs."
+ " However, priority is set for virtual device ", + i, ", which corresponds to a non Nvidia GPU"); + } + } +#endif + return Status::OK(); } @@ -1002,6 +1126,7 @@ Status BaseGPUDeviceFactory::CreateDevices( if (num_gpus_to_use > valid_platform_gpu_ids.size()) { num_gpus_to_use = valid_platform_gpu_ids.size(); } + std::map> supported_priority_ranges; if (!valid_platform_gpu_ids.empty()) { // Save the original device. int original_device = 0; @@ -1035,6 +1160,18 @@ Status BaseGPUDeviceFactory::CreateDevices( platform_gpu_id.value(), " failed. Status: ", cudaGetErrorString(err)); } + int priority_low, priority_high; + cudaDeviceGetStreamPriorityRange(&priority_low, &priority_high); + if (err != cudaSuccess) { + return errors::Internal( + "cudaDeviceGetStreamPriorityRange() on GPU:", original_device, + " failed. Status: ", cudaGetErrorString(err)); + } + VLOG(1) << "Cuda stream priority range on GPU(" << original_device + << "): " << priority_high << "," << priority_low; + supported_priority_ranges.insert( + std::make_pair(platform_gpu_id.value(), + std::make_pair(priority_low, priority_high))); #elif TENSORFLOW_USE_ROCM err = hipSetDevice(platform_gpu_id.value()); if (err != hipSuccess) { @@ -1106,9 +1243,9 @@ Status BaseGPUDeviceFactory::CreateDevices( const auto& virtual_devices = gpu_options.experimental().virtual_devices(); if (!virtual_devices.empty()) { - TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings(num_gpus_to_use, gpu_options, - visible_gpu_order, - valid_platform_gpu_ids)); + TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings( + num_gpus_to_use, gpu_options, visible_gpu_order, valid_platform_gpu_ids, + supported_priority_ranges)); // We've verified that num_gpus_to_use >= virtual_devices.size(). num_gpus_to_use = virtual_devices.size(); CHECK(gpu_options.visible_device_list().empty() || @@ -1703,6 +1840,10 @@ int BaseGPUDevice::PendingKernels() { return 0; } +void BaseGPUDevice::TestOnlyReset() { + StreamGroupFactory::Global().TestOnlyReset(); +} + uint64 GPUKernelTracker::MaybeQueue(OpKernelContext* ctx) { mutex_lock l(mu_); ++ops_since_last_; diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h index 3646c59cec1..32c7738d916 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.h +++ b/tensorflow/core/common_runtime/gpu/gpu_device.h @@ -114,6 +114,11 @@ class BaseGPUDevice : public LocalDevice { // the compute stream and are not yet known to have completed. int PendingKernels(); + int priority() const { return stream_->priority; } + + // Helper method for unit tests to reset the streams. Never use in production. 
+  static void TestOnlyReset();
+
  protected:
   Allocator* gpu_allocator_;  // not owned
   Allocator* cpu_allocator_;  // not owned
@@ -131,6 +136,7 @@ class BaseGPUDevice : public LocalDevice {
     se::Stream* host_to_device = nullptr;
     se::Stream* device_to_host = nullptr;
     gtl::InlinedVector<se::Stream*, 4> device_to_device;
+    int priority = 0;
   };
   class StreamGroupFactory;
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
index 0d66324a8e5..26312d35af6 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
@@ -61,13 +61,17 @@ void ExpectErrorMessageSubstr(const Status& s, StringPiece substr) {
 class GPUDeviceTest : public ::testing::Test {
  public:
-  void TearDown() override { GPUProcessState::singleton()->TestOnlyReset(); }
+  void TearDown() override {
+    BaseGPUDevice::TestOnlyReset();
+    GPUProcessState::singleton()->TestOnlyReset();
+  }
 
  protected:
   static SessionOptions MakeSessionOptions(
       const string& visible_device_list = "",
       double per_process_gpu_memory_fraction = 0, int gpu_device_count = 1,
-      const std::vector<std::vector<float>>& memory_limit_mb = {}) {
+      const std::vector<std::vector<float>>& memory_limit_mb = {},
+      const std::vector<std::vector<int32>>& priority = {}) {
     SessionOptions options;
     ConfigProto* config = &options.config;
     (*config->mutable_device_count())["GPU"] = gpu_device_count;
@@ -75,12 +79,17 @@ class GPUDeviceTest : public ::testing::Test {
     gpu_options->set_visible_device_list(visible_device_list);
     gpu_options->set_per_process_gpu_memory_fraction(
         per_process_gpu_memory_fraction);
-    for (const auto& v : memory_limit_mb) {
+    for (int i = 0; i < memory_limit_mb.size(); ++i) {
       auto virtual_devices =
           gpu_options->mutable_experimental()->add_virtual_devices();
-      for (float mb : v) {
+      for (float mb : memory_limit_mb[i]) {
         virtual_devices->add_memory_limit_mb(mb);
       }
+      if (i < priority.size()) {
+        for (int p : priority[i]) {
+          virtual_devices->add_priority(p);
+        }
+      }
     }
     return options;
   }
@@ -193,6 +202,7 @@ TEST_F(GPUDeviceTest, EmptyVirtualDeviceConfig) {
       opts, kDeviceNamePrefix, &devices));
   EXPECT_EQ(1, devices.size());
   EXPECT_GE(devices[0]->attributes().memory_limit(), 0);
+  EXPECT_EQ(0, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
 }
 
 TEST_F(GPUDeviceTest, SingleVirtualDeviceWithNoMemoryLimit) {
@@ -204,25 +214,67 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithNoMemoryLimit) {
       opts, kDeviceNamePrefix, &devices));
   EXPECT_EQ(1, devices.size());
   EXPECT_GE(devices[0]->attributes().memory_limit(), 0);
+  EXPECT_EQ(0, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
 }
 
-TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimit) {
+TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimitAndNoPriority) {
   SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123}});
   std::vector<std::unique_ptr<Device>> devices;
   TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices));
   EXPECT_EQ(1, devices.size());
   EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit());
+  EXPECT_EQ(0, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
+}
+
+TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) {
+  {
+    // Priority outside the supported range [-1, 0].
+    SessionOptions opts =
+        MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-2, 0}});
+    std::vector<std::unique_ptr<Device>> devices;
+    Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+        opts, kDeviceNamePrefix, &devices);
+    EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+    ExpectErrorMessageSubstr(
+        status,
+        "Priority -2 is outside the range of supported priorities [-1,0] for"
+        " virtual device 0 on GPU# 0");
+  }
+  {
+    // Priority outside the supported range [-1, 0].
+    SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, 1}});
+    std::vector<std::unique_ptr<Device>> devices;
+    Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+        opts, kDeviceNamePrefix, &devices);
+    EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+    ExpectErrorMessageSubstr(
+        status,
+        "Priority 1 is outside the range of supported priorities [-1,0] for"
+        " virtual device 0 on GPU# 0");
+  }
+}
+
+TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimitAndPriority) {
+  SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123}}, {{-1}});
+  std::vector<std::unique_ptr<Device>> devices;
+  TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices));
+  EXPECT_EQ(1, devices.size());
+  EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit());
+  EXPECT_EQ(-1, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
 }
 
 TEST_F(GPUDeviceTest, MultipleVirtualDevices) {
-  SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}});
+  SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, -1}});
   std::vector<std::unique_ptr<Device>> devices;
   TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices));
   EXPECT_EQ(2, devices.size());
   EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit());
   EXPECT_EQ(456 << 20, devices[1]->attributes().memory_limit());
+  EXPECT_EQ(0, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
+  EXPECT_EQ(-1, static_cast<BaseGPUDevice*>(devices[1].get())->priority());
   ASSERT_EQ(1, devices[0]->attributes().locality().links().link_size());
   ASSERT_EQ(1, devices[1]->attributes().locality().links().link_size());
   EXPECT_EQ(1, devices[0]->attributes().locality().links().link(0).device_id());
@@ -237,6 +289,35 @@ TEST_F(GPUDeviceTest, MultipleVirtualDevices) {
             devices[1]->attributes().locality().links().link(0).strength());
 }
 
+TEST_F(GPUDeviceTest, MultipleVirtualDevicesWithPriority) {
+  {
+    // Multiple virtual devices with fewer priorities than devices.
+    SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-1}});
+    std::vector<std::unique_ptr<Device>> devices;
+    Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+        opts, kDeviceNamePrefix, &devices);
+    EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+    ExpectErrorMessageSubstr(
+        status,
+        "Number of virtual device priorities specified doesn't "
+        "match with number of memory_limit_mb specified for GPU# 0"
+        " memory_limit_mb size: 2 and priority size: 1");
+  }
+  {
+    // Multiple virtual devices with a matching number of priorities.
+    SessionOptions opts =
+        MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-1, 0}});
+    std::vector<std::unique_ptr<Device>> devices;
+    TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
+        opts, kDeviceNamePrefix, &devices));
+    EXPECT_EQ(2, devices.size());
+    EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit());
+    EXPECT_EQ(456 << 20, devices[1]->attributes().memory_limit());
+    EXPECT_EQ(-1, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
+    EXPECT_EQ(0, static_cast<BaseGPUDevice*>(devices[1].get())->priority());
+  }
+}
+
 // Enabling unified memory on pre-Pascal GPUs results in an initialization
 // error.
 TEST_F(GPUDeviceTest, UnifiedMemoryUnavailableOnPrePascalGpus) {
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 7973e002762..3d20d004d46 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -110,6 +110,18 @@ message GPUOptions {
       // For the concept of "visible" and "virtual" GPU, see the comments for
       // "visible_device_list" above for more information.
       repeated float memory_limit_mb = 1;
+
+      // Priority values to use with the virtual devices. Use the CUDA function
+      // cudaDeviceGetStreamPriorityRange to query the valid range of priority
+      // values.
+      //
+      // On a P4000 GPU with CUDA 10.1, the priority range reported was 0 for
+      // least priority and -1 for greatest priority.
+      //
+      // If this field is not specified, then the virtual devices will be
+      // created with the default priority. If this field is set, its size
+      // must match that of memory_limit_mb above.
+      repeated int32 priority = 2;
     }
 
     // The multi virtual device settings. If empty (not set), it will create
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index 604a960afd5..aa760583800 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -263,7 +263,8 @@ class LogicalDevice(
 @tf_export("config.LogicalDeviceConfiguration",
            "config.experimental.VirtualDeviceConfiguration")
 class LogicalDeviceConfiguration(
-    collections.namedtuple("LogicalDeviceConfiguration", ["memory_limit"])):
+    collections.namedtuple("LogicalDeviceConfiguration",
+                           ["memory_limit", "experimental_priority"])):
   """Configuration class for a logical devices.
 
   The class specifies the parameters to configure a `tf.config.PhysicalDevice`
@@ -276,10 +277,15 @@ class LogicalDeviceConfiguration(
   Fields:
     memory_limit: (optional) Maximum memory (in MB) to allocate on the virtual
       device. Currently only supported for GPUs.
+    experimental_priority: (optional) Priority to assign to a virtual device.
+      Lower values have higher priorities and 0 is the default.
+      Within a physical GPU, the GPU scheduler will prioritize ops on virtual
+      devices with higher priority. Currently only supported for Nvidia GPUs.
   """
 
-  def __new__(cls, memory_limit=None):
-    return super(LogicalDeviceConfiguration, cls).__new__(cls, memory_limit)
+  def __new__(cls, memory_limit=None, experimental_priority=None):
+    return super(LogicalDeviceConfiguration,
+                 cls).__new__(cls, memory_limit, experimental_priority)
 
 
 @tf_export("config.PhysicalDevice")
@@ -1019,12 +1025,19 @@ class Context(object):
       if self._virtual_device_map:
         vdevs = self._virtual_device_map.get(dev, [])
         device_limits = []
+        priority = []
         for virt_dev in vdevs:
           device_limits.append(virt_dev.memory_limit)
+          if virt_dev.experimental_priority is not None:
+            priority.append(virt_dev.experimental_priority)
+        # If priority is specified, it must be specified for all virtual
+        # devices.
+ if priority and len(device_limits) != len(priority): + raise ValueError("priority must be specified for all virtual devices") virtual_devices.append( config_pb2.GPUOptions.Experimental.VirtualDevices( - memory_limit_mb=device_limits)) + memory_limit_mb=device_limits, priority=priority)) # Only compute growth if virtual devices have not been configured and we # have GPUs @@ -1394,6 +1407,9 @@ class Context(object): if vdev.memory_limit is not None: raise ValueError("Setting memory limit on CPU virtual devices is " "currently not supported") + if vdev.experimental_priority is not None: + raise ValueError("Setting experimental_priority on CPU virtual " + " devices is currently not supported") elif dev.device_type == "GPU": for vdev in virtual_devices: if vdev.memory_limit is None: diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc index 210c5436fad..3d011123d87 100644 --- a/tensorflow/stream_executor/cuda/cuda_driver.cc +++ b/tensorflow/stream_executor/cuda/cuda_driver.cc @@ -713,13 +713,21 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { absl::StrCat("failed to get device for context: ", ToString(result))); } -/* static */ bool GpuDriver::CreateStream(GpuContext* context, - CUstream* stream) { +/* static */ bool GpuDriver::CreateStream(GpuContext* context, CUstream* stream, + int priority) { // TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess // up synchronization with respect to memsets and any other things that have // to occur on the default stream? ScopedActivateContext activated{context}; - CUresult res = cuStreamCreate(stream, 0); + CUresult res; + // If the priority is 0, then use the previous api to create the stream with + // the default priority for backward compatibility. Probably there is no + // difference in using the new api call but leaving it as is for now. + if (priority == 0) { + res = cuStreamCreate(stream, 0); + } else { + res = cuStreamCreateWithPriority(stream, 0, priority); + } if (res != CUDA_SUCCESS) { LOG(ERROR) << "could not allocate CUDA stream for context " << context->context() << ": " << ToString(res); diff --git a/tensorflow/stream_executor/gpu/gpu_driver.h b/tensorflow/stream_executor/gpu/gpu_driver.h index f72c9a129cf..25b90be1bd2 100644 --- a/tensorflow/stream_executor/gpu/gpu_driver.h +++ b/tensorflow/stream_executor/gpu/gpu_driver.h @@ -71,7 +71,8 @@ class GpuDriver { // cuStreamCreate. // stream is an outparam owned by the caller, must not be null. // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4 - static bool CreateStream(GpuContext* context, GpuStreamHandle* stream); + static bool CreateStream(GpuContext* context, GpuStreamHandle* stream, + int priority = 0); // Destroys a CUDA stream associated with the given context. 
// stream is owned by the caller, must not be null, and *stream is set to null diff --git a/tensorflow/stream_executor/gpu/gpu_stream.cc b/tensorflow/stream_executor/gpu/gpu_stream.cc index 887522cf3ae..9899bbb04a3 100644 --- a/tensorflow/stream_executor/gpu/gpu_stream.cc +++ b/tensorflow/stream_executor/gpu/gpu_stream.cc @@ -23,7 +23,8 @@ namespace stream_executor { namespace gpu { bool GpuStream::Init() { - if (!GpuDriver::CreateStream(parent_->gpu_context(), &gpu_stream_)) { + if (!GpuDriver::CreateStream(parent_->gpu_context(), &gpu_stream_, + priority_)) { return false; } return GpuDriver::InitEvent(parent_->gpu_context(), &completed_event_, diff --git a/tensorflow/stream_executor/gpu/gpu_stream.h b/tensorflow/stream_executor/gpu/gpu_stream.h index db0eec53b9c..e58ac72dd56 100644 --- a/tensorflow/stream_executor/gpu/gpu_stream.h +++ b/tensorflow/stream_executor/gpu/gpu_stream.h @@ -48,6 +48,8 @@ class GpuStream : public internal::StreamInterface { // Explicitly initialize the CUDA resources associated with this stream, used // by StreamExecutor::AllocateStream(). bool Init(); + void SetPriority(int priority) { priority_ = priority; } + int priority() const { return priority_; } // Explicitly destroy the CUDA resources associated with this stream, used by // StreamExecutor::DeallocateStream(). @@ -78,6 +80,7 @@ class GpuStream : public internal::StreamInterface { private: GpuExecutor* parent_; // Executor that spawned this stream. GpuStreamHandle gpu_stream_; // Wrapped CUDA stream handle. + int priority_ = 0; // Event that indicates this stream has completed. GpuEventHandle completed_event_ = nullptr; diff --git a/tensorflow/stream_executor/rocm/rocm_driver.cc b/tensorflow/stream_executor/rocm/rocm_driver.cc index 210e9d7a9fa..5a8154f1df8 100644 --- a/tensorflow/stream_executor/rocm/rocm_driver.cc +++ b/tensorflow/stream_executor/rocm/rocm_driver.cc @@ -558,7 +558,13 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { } /* static */ bool GpuDriver::CreateStream(GpuContext* context, - GpuStreamHandle* stream) { + GpuStreamHandle* stream, + int priority) { + if (priority != 0) { + LOG(ERROR) << "ROCM stream doesn't support priority. " + << " Should be set to 0 but given: " << priority; + return false; + } ScopedActivateContext activated{context}; hipError_t res = tensorflow::wrap::hipStreamCreateWithFlags( stream, hipStreamDefault); // switch to hipStreamNonBlocking? 
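
For reviewers unfamiliar with CUDA stream priorities, the following standalone sketch (not part of this patch) shows the runtime-API analogue of what the change does through the driver API: `cudaDeviceGetStreamPriorityRange` is the same call `CreateDevices()` uses to record the supported range per GPU, and `cudaStreamCreateWithPriority` is the runtime counterpart of the `cuStreamCreateWithPriority` call added to `GpuDriver::CreateStream`. The file name and setup are illustrative only.

```cpp
// stream_priority_demo.cu -- illustrative only; not part of this change.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  // Query the supported priority range for the current device. Numerically
  // lower values mean higher scheduling priority; on many GPUs the range is
  // [-1, 0], matching the example in the config.proto comment.
  int least_priority = 0;     // numerically largest == lowest priority
  int greatest_priority = 0;  // numerically smallest == highest priority
  cudaError_t err =
      cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "priority query failed: %s\n",
                 cudaGetErrorString(err));
    return 1;
  }
  std::printf("supported priorities: [%d, %d]\n", greatest_priority,
              least_priority);

  // One stream at the default priority (0) and one at the highest priority,
  // analogous to two virtual devices configured with priorities {0, -1}.
  cudaStream_t default_stream;
  cudaStream_t high_priority_stream;
  cudaStreamCreate(&default_stream);
  cudaStreamCreateWithPriority(&high_priority_stream, cudaStreamDefault,
                               greatest_priority);

  // Work launched on high_priority_stream is preferred by the hardware
  // scheduler over runnable work on default_stream.

  cudaStreamDestroy(high_priority_stream);
  cudaStreamDestroy(default_stream);
  return 0;
}
```
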
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt index 79c33f7e304..62bc83a3a69 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt @@ -116,6 +116,12 @@ tf_proto { label: LABEL_REPEATED type: TYPE_FLOAT } + field { + name: "priority" + number: 2 + label: LABEL_REPEATED + type: TYPE_INT32 + } } } } diff --git a/tensorflow/tools/api/golden/v1/tensorflow.config.-logical-device-configuration.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.config.-logical-device-configuration.pbtxt index 3f6c6e636a1..49750b0af85 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.config.-logical-device-configuration.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.config.-logical-device-configuration.pbtxt @@ -3,6 +3,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "experimental_priority" + mtype: "" + } member { name: "memory_limit" mtype: "" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.config.experimental.-virtual-device-configuration.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.config.experimental.-virtual-device-configuration.pbtxt index 25b6b6e216e..9b2a7d846b9 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.config.experimental.-virtual-device-configuration.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.config.experimental.-virtual-device-configuration.pbtxt @@ -3,6 +3,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "experimental_priority" + mtype: "" + } member { name: "memory_limit" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.config.-logical-device-configuration.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.config.-logical-device-configuration.pbtxt index 3f6c6e636a1..49750b0af85 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.config.-logical-device-configuration.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.config.-logical-device-configuration.pbtxt @@ -3,6 +3,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "experimental_priority" + mtype: "" + } member { name: "memory_limit" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.config.experimental.-virtual-device-configuration.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.config.experimental.-virtual-device-configuration.pbtxt index 25b6b6e216e..9b2a7d846b9 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.config.experimental.-virtual-device-configuration.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.config.experimental.-virtual-device-configuration.pbtxt @@ -3,6 +3,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "experimental_priority" + mtype: "" + } member { name: "memory_limit" mtype: ""
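
As a usage illustration (not part of the patch), here is roughly how a C++ client could populate the new `priority` field alongside `memory_limit_mb` when splitting one visible GPU into two virtual devices. The helper name `MakeTwoVirtualDeviceConfig` is hypothetical; the message and field names come from the `config.proto` change above, and the accessors are the standard protobuf-generated ones.

```cpp
#include "tensorflow/core/protobuf/config.pb.h"

// Hypothetical helper: splits visible GPU 0 into two virtual devices, the
// second one with the highest stream priority (-1 on most Nvidia GPUs).
tensorflow::ConfigProto MakeTwoVirtualDeviceConfig() {
  tensorflow::ConfigProto config;
  auto* gpu_options = config.mutable_gpu_options();
  gpu_options->set_visible_device_list("0");

  // One VirtualDevices entry per visible physical GPU; each element of
  // memory_limit_mb/priority describes one virtual device carved out of it.
  auto* virtual_devices =
      gpu_options->mutable_experimental()->add_virtual_devices();

  virtual_devices->add_memory_limit_mb(2048);  // virtual device 0
  virtual_devices->add_priority(0);            // default priority

  virtual_devices->add_memory_limit_mb(2048);  // virtual device 1
  virtual_devices->add_priority(-1);           // highest priority

  return config;
}
```

In Python, the equivalent configuration goes through the `experimental_priority` field added to `tf.config.LogicalDeviceConfiguration` in the `context.py` change above.
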