From bd20260350de4d0bc870fdc2bda39e62844a3e7b Mon Sep 17 00:00:00 2001
From: Sreeni Kesavarapu
Date: Tue, 2 Jun 2020 22:26:01 -0700
Subject: [PATCH] Implement GPU device priority for virtual devices.

- This adds a field in the virtual device options to specify a priority for
  each virtual device.
- When a priority is specified, it is used to create the CUDA streams for
  that virtual device with the given priority.
- This is backwards compatible: when no priorities are specified, streams
  continue to be created without an explicit priority, as before, while any
  non-zero priorities that are specified are used to create streams with
  those priorities.

PiperOrigin-RevId: 314470276
Change-Id: I4943f71e901245fb21b6f7e833adbdcd8126f1fa
---
 tensorflow/core/common_runtime/gpu/BUILD      |  10 +-
 .../core/common_runtime/gpu/gpu_device.cc     | 161 ++++++++++++++++--
 .../core/common_runtime/gpu/gpu_device.h     |   6 +
 .../common_runtime/gpu/gpu_device_test.cc     |  93 +++++++++-
 tensorflow/core/protobuf/config.proto         |  12 ++
 tensorflow/python/eager/context.py            |  24 ++-
 .../stream_executor/cuda/cuda_driver.cc       |  14 +-
 tensorflow/stream_executor/gpu/gpu_driver.h   |   3 +-
 tensorflow/stream_executor/gpu/gpu_stream.cc  |   3 +-
 tensorflow/stream_executor/gpu/gpu_stream.h   |   3 +
 .../stream_executor/rocm/rocm_driver.cc       |   8 +-
 .../golden/v1/tensorflow.-g-p-u-options.pbtxt |   6 +
 ...config.-logical-device-configuration.pbtxt |   4 +
 ...mental.-virtual-device-configuration.pbtxt |   4 +
 ...config.-logical-device-configuration.pbtxt |   4 +
 ...mental.-virtual-device-configuration.pbtxt |   4 +
 16 files changed, 331 insertions(+), 28 deletions(-)

diff --git a/tensorflow/core/common_runtime/gpu/BUILD b/tensorflow/core/common_runtime/gpu/BUILD
index 3744eb967d3..922a366fbfb 100644
--- a/tensorflow/core/common_runtime/gpu/BUILD
+++ b/tensorflow/core/common_runtime/gpu/BUILD
@@ -28,6 +28,10 @@ load(
     "if_static",
     "tf_cuda_tests_tags",
 )
+load(
+    "//tensorflow/stream_executor:build_defs.bzl",
+    "if_gpu_is_configured",
+)
 
 package(
     default_visibility = [
@@ -149,6 +153,7 @@ tf_cuda_library(
         ":gpu_id_impl",
         ":gpu_init_impl",
         ":gpu_lib",
+        "//third_party/eigen3",
         "//tensorflow/core:core_cpu_impl",
         "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:framework",
@@ -160,8 +165,9 @@ tf_cuda_library(
         "//tensorflow/core:stream_executor",
         "//tensorflow/core/profiler/lib:annotated_traceme",
         "//tensorflow/core/profiler/lib:scoped_annotation",
-        "//third_party/eigen3",
-    ],
+    ] + if_gpu_is_configured([
+        "//tensorflow/stream_executor/cuda:cuda_platform",
+    ]),
     alwayslink = 1,
 )
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index e11b079b7ec..e47f56b2624 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -73,6 +73,7 @@ limitations under the License.
#include "tensorflow/core/util/device_name_utils.h" #include "tensorflow/core/util/env_var.h" #include "tensorflow/core/util/stream_executor_util.h" +#include "tensorflow/stream_executor/gpu/gpu_stream.h" #include "tensorflow/stream_executor/platform/dso_loader.h" #if !defined(PLATFORM_GOOGLE) @@ -243,16 +244,18 @@ class BaseGPUDevice::StreamGroupFactory { StreamGroup* group = &streams_[key_type(tf_gpu_id.value(), stream_group_within_gpu)]; if (!group->compute) { - group->compute = new se::Stream(executor); + int priority = GetPriority(tf_gpu_id.value(), options); + group->priority = priority; + group->compute = GetStream(executor, priority); group->compute->Init(); VLOG(2) << "Created stream[" << stream_group_within_gpu - << "] = " << group->compute; + << "] = " << group->compute << " with priority: " << priority; #if TENSORFLOW_USE_ROCM // ROCm streams are lightweight and will not necessarily trigger device // queue init until they are first used. For optimal performance, // compute and nccl streams must be immediate siblings. - group->nccl = new se::Stream(executor); + group->nccl = GetStream(executor, priority); group->nccl->Init(); VLOG(2) << "Created nccl_stream[" << stream_group_within_gpu << "] = " << group->nccl; @@ -262,12 +265,12 @@ class BaseGPUDevice::StreamGroupFactory { group->nccl->ThenWaitFor(group->compute); #endif - group->host_to_device = new se::Stream(executor); + group->host_to_device = GetStream(executor, priority); group->host_to_device->Init(); VLOG(2) << "Created host_to_device_stream[" << stream_group_within_gpu << "] = " << group->host_to_device; - group->device_to_host = new se::Stream(executor); + group->device_to_host = GetStream(executor, priority); group->device_to_host->Init(); VLOG(2) << "Created device_to_host_stream[" << stream_group_within_gpu << "] = " << group->device_to_host; @@ -282,7 +285,7 @@ class BaseGPUDevice::StreamGroupFactory { num_d2d_streams = 1; } for (int i = 0; i < num_d2d_streams; ++i) { - se::Stream* stream = new se::Stream(executor); + se::Stream* stream = GetStream(executor, priority); stream->Init(); group->device_to_device.push_back(stream); VLOG(2) << "Created device_to_device_stream[" << stream_group_within_gpu @@ -299,7 +302,70 @@ class BaseGPUDevice::StreamGroupFactory { return *instance; } + // Helper method for unit tests to reset the streams. Never use in production. + void TestOnlyReset() { + mutex_lock guard(lock_); + for (auto& item : streams_) { + auto& stream = item.second; + if (stream.compute) { + delete stream.compute; + stream.compute = nullptr; + } +#if TENSORFLOW_USE_ROCM + if (stream.nccl) { + delete stream.nccl; + stream.nccl = nullptr; + } +#endif + if (stream.host_to_device) { + delete stream.host_to_device; + stream.host_to_device = nullptr; + } + if (stream.device_to_host) { + delete stream.device_to_host; + stream.device_to_host = nullptr; + } + while (!stream.device_to_device.empty()) { + auto back = stream.device_to_device.back(); + if (back) { + delete back; + } + stream.device_to_device.pop_back(); + } + } + streams_.clear(); + } + private: + // Returns priority for the given virtual GPU id from the session options. + // Returns 0 if no virtual devices are specified. 
+  int GetPriority(int tf_gpu_id, const GPUOptions& options) {
+    int id = tf_gpu_id;
+    int i = 0;
+    int priority = 0;
+    while (i < options.experimental().virtual_devices_size()) {
+      const int size =
+          options.experimental().virtual_devices().Get(i).priority_size();
+      if (id >= size) {
+        id -= size;
+      } else {
+        priority =
+            options.experimental().virtual_devices().Get(i).priority().Get(id);
+        break;
+      }
+      i++;
+    }
+    return priority;
+  }
+
+  // Returns a Stream with the underlying GPUStream with the given priority.
+  se::Stream* GetStream(se::StreamExecutor* executor, int priority) {
+    auto stream = new se::Stream(executor);
+    static_cast<stream_executor::gpu::GpuStream*>(stream->implementation())
+        ->SetPriority(priority);
+    return stream;
+  }
+
   mutex lock_;
   using key_type = std::tuple<int, int>;
   std::map<key_type, StreamGroup> streams_;
@@ -751,7 +817,8 @@ Status ParseVisibleDeviceList(const string& visible_device_list,
 Status VerifyVirtualDeviceSettings(
     const size_t num_gpus_to_use, const GPUOptions& gpu_options,
     const std::vector<PlatformGpuId>& visible_gpu_order,
-    const std::vector<PlatformGpuId>& valid_platform_gpu_ids) {
+    const std::vector<PlatformGpuId>& valid_platform_gpu_ids,
+    const std::map<int, std::pair<int, int>>& supported_priority_ranges) {
   const auto& virtual_devices = gpu_options.experimental().virtual_devices();
   CHECK(!virtual_devices.empty());
   if (gpu_options.per_process_gpu_memory_fraction() > 0) {
@@ -780,6 +847,63 @@ Status VerifyVirtualDeviceSettings(
         " #valid GPUs: ", valid_platform_gpu_ids.size(),
         " virtual_devices.size(): ", virtual_devices.size());
   }
+#if GOOGLE_CUDA
+  // Check memory_limit_mb and priority sizes match if priority is non-empty.
+  bool priority_exists = !virtual_devices.Get(0).priority().empty();
+  for (int i = 0; i < virtual_devices.size(); ++i) {
+    const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb();
+    const auto& priority = virtual_devices.Get(i).priority();
+    // If the priority is empty in the first one then treat this as having no
+    // priority set in any of the virtual devices for backward compatibility.
+    // Either it's set for all or none.
+    if (!priority_exists) {
+      if (!priority.empty()) {
+        return errors::InvalidArgument(
+            "Priority must be set for all virtual devices or none. But the "
+            "priority is specified for ",
+            i,
+            " while previous devices didn't "
+            "have any set.");
+      }
+    }
+    if (priority_exists && memory_limit_mb.size() != priority.size()) {
+      return errors::InvalidArgument(
+          "Number of virtual device priorities specified doesn't "
+          "match with number of memory_limit_mb specified for GPU# ",
+          i, " memory_limit_mb size: ", memory_limit_mb.size(),
+          " and priority size: ", priority.size());
+    }
+    const int gpu_id = valid_platform_gpu_ids[i].value();
+    auto it = supported_priority_ranges.find(gpu_id);
+    if (it == supported_priority_ranges.end()) {
+      return errors::Internal(
+          "Failed to find supported priority range for GPU"
+          " device ",
+          gpu_id);
+    }
+    const std::pair<int, int>& priority_range = it->second;
+    for (int p : priority) {
+      if (p > priority_range.first || p < priority_range.second) {
+        return errors::InvalidArgument(
+            "Priority ", p,
+            " is outside the range of supported priorities "
+            "[",
+            priority_range.second, ",", priority_range.first,
+            "] for virtual device ", i, " on GPU# ", gpu_id);
+      }
+    }
+  }
+#elif TENSORFLOW_USE_ROCM
+  for (int i = 0; i < virtual_devices.size(); ++i) {
+    if (!virtual_devices.Get(i).priority().empty()) {
+      return errors::InvalidArgument(
+          "Priority is supported only on Nvidia GPUs."
+ " However, priority is set for virtual device ", + i, ", which corresponds to a non Nvidia GPU"); + } + } +#endif + return Status::OK(); } @@ -1002,6 +1126,7 @@ Status BaseGPUDeviceFactory::CreateDevices( if (num_gpus_to_use > valid_platform_gpu_ids.size()) { num_gpus_to_use = valid_platform_gpu_ids.size(); } + std::map> supported_priority_ranges; if (!valid_platform_gpu_ids.empty()) { // Save the original device. int original_device = 0; @@ -1035,6 +1160,18 @@ Status BaseGPUDeviceFactory::CreateDevices( platform_gpu_id.value(), " failed. Status: ", cudaGetErrorString(err)); } + int priority_low, priority_high; + cudaDeviceGetStreamPriorityRange(&priority_low, &priority_high); + if (err != cudaSuccess) { + return errors::Internal( + "cudaDeviceGetStreamPriorityRange() on GPU:", original_device, + " failed. Status: ", cudaGetErrorString(err)); + } + VLOG(1) << "Cuda stream priority range on GPU(" << original_device + << "): " << priority_high << "," << priority_low; + supported_priority_ranges.insert( + std::make_pair(platform_gpu_id.value(), + std::make_pair(priority_low, priority_high))); #elif TENSORFLOW_USE_ROCM err = hipSetDevice(platform_gpu_id.value()); if (err != hipSuccess) { @@ -1106,9 +1243,9 @@ Status BaseGPUDeviceFactory::CreateDevices( const auto& virtual_devices = gpu_options.experimental().virtual_devices(); if (!virtual_devices.empty()) { - TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings(num_gpus_to_use, gpu_options, - visible_gpu_order, - valid_platform_gpu_ids)); + TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings( + num_gpus_to_use, gpu_options, visible_gpu_order, valid_platform_gpu_ids, + supported_priority_ranges)); // We've verified that num_gpus_to_use >= virtual_devices.size(). num_gpus_to_use = virtual_devices.size(); CHECK(gpu_options.visible_device_list().empty() || @@ -1703,6 +1840,10 @@ int BaseGPUDevice::PendingKernels() { return 0; } +void BaseGPUDevice::TestOnlyReset() { + StreamGroupFactory::Global().TestOnlyReset(); +} + uint64 GPUKernelTracker::MaybeQueue(OpKernelContext* ctx) { mutex_lock l(mu_); ++ops_since_last_; diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h index 3646c59cec1..32c7738d916 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.h +++ b/tensorflow/core/common_runtime/gpu/gpu_device.h @@ -114,6 +114,11 @@ class BaseGPUDevice : public LocalDevice { // the compute stream and are not yet known to have completed. int PendingKernels(); + int priority() const { return stream_->priority; } + + // Helper method for unit tests to reset the streams. Never use in production. 
+  static void TestOnlyReset();
+
  protected:
   Allocator* gpu_allocator_;  // not owned
   Allocator* cpu_allocator_;  // not owned
@@ -131,6 +136,7 @@ class BaseGPUDevice : public LocalDevice {
     se::Stream* host_to_device = nullptr;
     se::Stream* device_to_host = nullptr;
     gtl::InlinedVector<se::Stream*, 4> device_to_device;
+    int priority = 0;
   };
   class StreamGroupFactory;
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
index 0d66324a8e5..26312d35af6 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
@@ -61,13 +61,17 @@ void ExpectErrorMessageSubstr(const Status& s, StringPiece substr) {
 class GPUDeviceTest : public ::testing::Test {
  public:
-  void TearDown() override { GPUProcessState::singleton()->TestOnlyReset(); }
+  void TearDown() override {
+    BaseGPUDevice::TestOnlyReset();
+    GPUProcessState::singleton()->TestOnlyReset();
+  }
 
  protected:
   static SessionOptions MakeSessionOptions(
       const string& visible_device_list = "",
       double per_process_gpu_memory_fraction = 0, int gpu_device_count = 1,
-      const std::vector<std::vector<float>>& memory_limit_mb = {}) {
+      const std::vector<std::vector<float>>& memory_limit_mb = {},
+      const std::vector<std::vector<int32>>& priority = {}) {
     SessionOptions options;
     ConfigProto* config = &options.config;
     (*config->mutable_device_count())["GPU"] = gpu_device_count;
@@ -75,12 +79,17 @@ class GPUDeviceTest : public ::testing::Test {
     gpu_options->set_visible_device_list(visible_device_list);
     gpu_options->set_per_process_gpu_memory_fraction(
         per_process_gpu_memory_fraction);
-    for (const auto& v : memory_limit_mb) {
+    for (int i = 0; i < memory_limit_mb.size(); ++i) {
       auto virtual_devices =
           gpu_options->mutable_experimental()->add_virtual_devices();
-      for (float mb : v) {
+      for (float mb : memory_limit_mb[i]) {
         virtual_devices->add_memory_limit_mb(mb);
       }
+      if (i < priority.size()) {
+        for (int p : priority[i]) {
+          virtual_devices->add_priority(p);
+        }
+      }
     }
     return options;
   }
@@ -193,6 +202,7 @@ TEST_F(GPUDeviceTest, EmptyVirtualDeviceConfig) {
       opts, kDeviceNamePrefix, &devices));
   EXPECT_EQ(1, devices.size());
   EXPECT_GE(devices[0]->attributes().memory_limit(), 0);
+  EXPECT_EQ(0, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
 }
 
 TEST_F(GPUDeviceTest, SingleVirtualDeviceWithNoMemoryLimit) {
@@ -204,25 +214,67 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithNoMemoryLimit) {
       opts, kDeviceNamePrefix, &devices));
   EXPECT_EQ(1, devices.size());
   EXPECT_GE(devices[0]->attributes().memory_limit(), 0);
+  EXPECT_EQ(0, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
 }
 
-TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimit) {
+TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimitAndNoPriority) {
   SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123}});
   std::vector<std::unique_ptr<Device>> devices;
   TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices));
   EXPECT_EQ(1, devices.size());
   EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit());
+  EXPECT_EQ(0, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
+}
+
+TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) {
+  {
+    // Priority outside the supported range [-1, 0].
+    SessionOptions opts =
+        MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-2, 0}});
+    std::vector<std::unique_ptr<Device>> devices;
+    Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+        opts, kDeviceNamePrefix, &devices);
+    EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+    ExpectErrorMessageSubstr(
+        status,
+        "Priority -2 is outside the range of supported priorities [-1,0] for"
+        " virtual device 0 on GPU# 0");
+  }
+  {
+    // Priority outside the supported range [-1, 0].
+    SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, 1}});
+    std::vector<std::unique_ptr<Device>> devices;
+    Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+        opts, kDeviceNamePrefix, &devices);
+    EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+    ExpectErrorMessageSubstr(
+        status,
+        "Priority 1 is outside the range of supported priorities [-1,0] for"
+        " virtual device 0 on GPU# 0");
+  }
+}
+
+TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimitAndPriority) {
+  SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123}}, {{-1}});
+  std::vector<std::unique_ptr<Device>> devices;
+  TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices));
+  EXPECT_EQ(1, devices.size());
+  EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit());
+  EXPECT_EQ(-1, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
 }
 
 TEST_F(GPUDeviceTest, MultipleVirtualDevices) {
-  SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}});
+  SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, -1}});
   std::vector<std::unique_ptr<Device>> devices;
   TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices));
   EXPECT_EQ(2, devices.size());
   EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit());
   EXPECT_EQ(456 << 20, devices[1]->attributes().memory_limit());
+  EXPECT_EQ(0, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
+  EXPECT_EQ(-1, static_cast<BaseGPUDevice*>(devices[1].get())->priority());
   ASSERT_EQ(1, devices[0]->attributes().locality().links().link_size());
   ASSERT_EQ(1, devices[1]->attributes().locality().links().link_size());
   EXPECT_EQ(1, devices[0]->attributes().locality().links().link(0).device_id());
@@ -237,6 +289,35 @@ TEST_F(GPUDeviceTest, MultipleVirtualDevices) {
             devices[1]->attributes().locality().links().link(0).strength());
 }
 
+TEST_F(GPUDeviceTest, MultipleVirtualDevicesWithPriority) {
+  {
+    // Multiple virtual devices with fewer priorities than devices.
+    SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-1}});
+    std::vector<std::unique_ptr<Device>> devices;
+    Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+        opts, kDeviceNamePrefix, &devices);
+    EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+    ExpectErrorMessageSubstr(
+        status,
+        "Number of virtual device priorities specified doesn't "
+        "match with number of memory_limit_mb specified for GPU# 0"
+        " memory_limit_mb size: 2 and priority size: 1");
+  }
+  {
+    // Multiple virtual devices with a matching number of priorities.
+    SessionOptions opts =
+        MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-1, 0}});
+    std::vector<std::unique_ptr<Device>> devices;
+    TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
+        opts, kDeviceNamePrefix, &devices));
+    EXPECT_EQ(2, devices.size());
+    EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit());
+    EXPECT_EQ(456 << 20, devices[1]->attributes().memory_limit());
+    EXPECT_EQ(-1, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
+    EXPECT_EQ(0, static_cast<BaseGPUDevice*>(devices[1].get())->priority());
+  }
+}
+
 // Enabling unified memory on pre-Pascal GPUs results in an initialization
 // error.
 TEST_F(GPUDeviceTest, UnifiedMemoryUnavailableOnPrePascalGpus) {
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 7973e002762..3d20d004d46 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -110,6 +110,18 @@ message GPUOptions {
       // For the concept of "visible" and "virtual" GPU, see the comments for
       // "visible_device_list" above for more information.
       repeated float memory_limit_mb = 1;
+
+      // Priority values to use with the virtual devices. Use the CUDA function
+      // cudaDeviceGetStreamPriorityRange to query the valid range of priority
+      // values.
+      //
+      // On a P4000 GPU with CUDA 10.1, the priority range reported was 0 for
+      // least priority and -1 for greatest priority.
+      //
+      // If this field is not specified, then the virtual devices will be
+      // created with the default priority. If this field is set, its size
+      // must match that of memory_limit_mb above.
+      repeated int32 priority = 2;
     }
 
     // The multi virtual device settings. If empty (not set), it will create
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index 604a960afd5..aa760583800 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -263,7 +263,8 @@ class LogicalDevice(
 @tf_export("config.LogicalDeviceConfiguration",
            "config.experimental.VirtualDeviceConfiguration")
 class LogicalDeviceConfiguration(
-    collections.namedtuple("LogicalDeviceConfiguration", ["memory_limit"])):
+    collections.namedtuple("LogicalDeviceConfiguration",
+                           ["memory_limit", "experimental_priority"])):
   """Configuration class for a logical devices.
 
   The class specifies the parameters to configure a `tf.config.PhysicalDevice`
@@ -276,10 +277,15 @@ class LogicalDeviceConfiguration(
   Fields:
     memory_limit: (optional) Maximum memory (in MB) to allocate on the virtual
       device. Currently only supported for GPUs.
+    experimental_priority: (optional) Priority to assign to a virtual device.
+      Lower values have higher priorities and 0 is the default.
+      Within a physical GPU, the GPU scheduler will prioritize ops on virtual
+      devices with higher priority. Currently only supported for Nvidia GPUs.
   """
 
-  def __new__(cls, memory_limit=None):
-    return super(LogicalDeviceConfiguration, cls).__new__(cls, memory_limit)
+  def __new__(cls, memory_limit=None, experimental_priority=None):
+    return super(LogicalDeviceConfiguration,
+                 cls).__new__(cls, memory_limit, experimental_priority)
 
 
 @tf_export("config.PhysicalDevice")
@@ -1019,12 +1025,19 @@ class Context(object):
       if self._virtual_device_map:
         vdevs = self._virtual_device_map.get(dev, [])
         device_limits = []
+        priority = []
         for virt_dev in vdevs:
           device_limits.append(virt_dev.memory_limit)
+          if virt_dev.experimental_priority is not None:
+            priority.append(virt_dev.experimental_priority)
+        # If priority is specified, it must be specified for all virtual
+        # devices.
+ if priority and len(device_limits) != len(priority): + raise ValueError("priority must be specified for all virtual devices") virtual_devices.append( config_pb2.GPUOptions.Experimental.VirtualDevices( - memory_limit_mb=device_limits)) + memory_limit_mb=device_limits, priority=priority)) # Only compute growth if virtual devices have not been configured and we # have GPUs @@ -1394,6 +1407,9 @@ class Context(object): if vdev.memory_limit is not None: raise ValueError("Setting memory limit on CPU virtual devices is " "currently not supported") + if vdev.experimental_priority is not None: + raise ValueError("Setting experimental_priority on CPU virtual " + " devices is currently not supported") elif dev.device_type == "GPU": for vdev in virtual_devices: if vdev.memory_limit is None: diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc index 210c5436fad..3d011123d87 100644 --- a/tensorflow/stream_executor/cuda/cuda_driver.cc +++ b/tensorflow/stream_executor/cuda/cuda_driver.cc @@ -713,13 +713,21 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { absl::StrCat("failed to get device for context: ", ToString(result))); } -/* static */ bool GpuDriver::CreateStream(GpuContext* context, - CUstream* stream) { +/* static */ bool GpuDriver::CreateStream(GpuContext* context, CUstream* stream, + int priority) { // TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess // up synchronization with respect to memsets and any other things that have // to occur on the default stream? ScopedActivateContext activated{context}; - CUresult res = cuStreamCreate(stream, 0); + CUresult res; + // If the priority is 0, then use the previous api to create the stream with + // the default priority for backward compatibility. Probably there is no + // difference in using the new api call but leaving it as is for now. + if (priority == 0) { + res = cuStreamCreate(stream, 0); + } else { + res = cuStreamCreateWithPriority(stream, 0, priority); + } if (res != CUDA_SUCCESS) { LOG(ERROR) << "could not allocate CUDA stream for context " << context->context() << ": " << ToString(res); diff --git a/tensorflow/stream_executor/gpu/gpu_driver.h b/tensorflow/stream_executor/gpu/gpu_driver.h index f72c9a129cf..25b90be1bd2 100644 --- a/tensorflow/stream_executor/gpu/gpu_driver.h +++ b/tensorflow/stream_executor/gpu/gpu_driver.h @@ -71,7 +71,8 @@ class GpuDriver { // cuStreamCreate. // stream is an outparam owned by the caller, must not be null. // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4 - static bool CreateStream(GpuContext* context, GpuStreamHandle* stream); + static bool CreateStream(GpuContext* context, GpuStreamHandle* stream, + int priority = 0); // Destroys a CUDA stream associated with the given context. 
// stream is owned by the caller, must not be null, and *stream is set to null diff --git a/tensorflow/stream_executor/gpu/gpu_stream.cc b/tensorflow/stream_executor/gpu/gpu_stream.cc index 887522cf3ae..9899bbb04a3 100644 --- a/tensorflow/stream_executor/gpu/gpu_stream.cc +++ b/tensorflow/stream_executor/gpu/gpu_stream.cc @@ -23,7 +23,8 @@ namespace stream_executor { namespace gpu { bool GpuStream::Init() { - if (!GpuDriver::CreateStream(parent_->gpu_context(), &gpu_stream_)) { + if (!GpuDriver::CreateStream(parent_->gpu_context(), &gpu_stream_, + priority_)) { return false; } return GpuDriver::InitEvent(parent_->gpu_context(), &completed_event_, diff --git a/tensorflow/stream_executor/gpu/gpu_stream.h b/tensorflow/stream_executor/gpu/gpu_stream.h index db0eec53b9c..e58ac72dd56 100644 --- a/tensorflow/stream_executor/gpu/gpu_stream.h +++ b/tensorflow/stream_executor/gpu/gpu_stream.h @@ -48,6 +48,8 @@ class GpuStream : public internal::StreamInterface { // Explicitly initialize the CUDA resources associated with this stream, used // by StreamExecutor::AllocateStream(). bool Init(); + void SetPriority(int priority) { priority_ = priority; } + int priority() const { return priority_; } // Explicitly destroy the CUDA resources associated with this stream, used by // StreamExecutor::DeallocateStream(). @@ -78,6 +80,7 @@ class GpuStream : public internal::StreamInterface { private: GpuExecutor* parent_; // Executor that spawned this stream. GpuStreamHandle gpu_stream_; // Wrapped CUDA stream handle. + int priority_ = 0; // Event that indicates this stream has completed. GpuEventHandle completed_event_ = nullptr; diff --git a/tensorflow/stream_executor/rocm/rocm_driver.cc b/tensorflow/stream_executor/rocm/rocm_driver.cc index 210e9d7a9fa..5a8154f1df8 100644 --- a/tensorflow/stream_executor/rocm/rocm_driver.cc +++ b/tensorflow/stream_executor/rocm/rocm_driver.cc @@ -558,7 +558,13 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { } /* static */ bool GpuDriver::CreateStream(GpuContext* context, - GpuStreamHandle* stream) { + GpuStreamHandle* stream, + int priority) { + if (priority != 0) { + LOG(ERROR) << "ROCM stream doesn't support priority. " + << " Should be set to 0 but given: " << priority; + return false; + } ScopedActivateContext activated{context}; hipError_t res = tensorflow::wrap::hipStreamCreateWithFlags( stream, hipStreamDefault); // switch to hipStreamNonBlocking? 
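
For reviewers unfamiliar with CUDA stream priorities, the following standalone sketch (not part of this patch) shows the runtime-API analogue of what the change does through the driver API: `cudaDeviceGetStreamPriorityRange` is the same call `CreateDevices()` uses to record the supported range per GPU, and `cudaStreamCreateWithPriority` is the runtime counterpart of the `cuStreamCreateWithPriority` call added to `GpuDriver::CreateStream`. The file name and setup are illustrative only.

```cpp
// stream_priority_demo.cu -- illustrative only; not part of this change.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  // Query the supported priority range for the current device. Numerically
  // lower values mean higher scheduling priority; on many GPUs the range is
  // [-1, 0], matching the example in the config.proto comment.
  int least_priority = 0;     // numerically largest == lowest priority
  int greatest_priority = 0;  // numerically smallest == highest priority
  cudaError_t err =
      cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "priority query failed: %s\n",
                 cudaGetErrorString(err));
    return 1;
  }
  std::printf("supported priorities: [%d, %d]\n", greatest_priority,
              least_priority);

  // One stream at the default priority (0) and one at the highest priority,
  // analogous to two virtual devices configured with priorities {0, -1}.
  cudaStream_t default_stream;
  cudaStream_t high_priority_stream;
  cudaStreamCreate(&default_stream);
  cudaStreamCreateWithPriority(&high_priority_stream, cudaStreamDefault,
                               greatest_priority);

  // Work launched on high_priority_stream is preferred by the hardware
  // scheduler over runnable work on default_stream.

  cudaStreamDestroy(high_priority_stream);
  cudaStreamDestroy(default_stream);
  return 0;
}
```
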
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt index 79c33f7e304..62bc83a3a69 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt @@ -116,6 +116,12 @@ tf_proto { label: LABEL_REPEATED type: TYPE_FLOAT } + field { + name: "priority" + number: 2 + label: LABEL_REPEATED + type: TYPE_INT32 + } } } } diff --git a/tensorflow/tools/api/golden/v1/tensorflow.config.-logical-device-configuration.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.config.-logical-device-configuration.pbtxt index 3f6c6e636a1..49750b0af85 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.config.-logical-device-configuration.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.config.-logical-device-configuration.pbtxt @@ -3,6 +3,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "experimental_priority" + mtype: "" + } member { name: "memory_limit" mtype: "" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.config.experimental.-virtual-device-configuration.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.config.experimental.-virtual-device-configuration.pbtxt index 25b6b6e216e..9b2a7d846b9 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.config.experimental.-virtual-device-configuration.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.config.experimental.-virtual-device-configuration.pbtxt @@ -3,6 +3,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "experimental_priority" + mtype: "" + } member { name: "memory_limit" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.config.-logical-device-configuration.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.config.-logical-device-configuration.pbtxt index 3f6c6e636a1..49750b0af85 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.config.-logical-device-configuration.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.config.-logical-device-configuration.pbtxt @@ -3,6 +3,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "experimental_priority" + mtype: "" + } member { name: "memory_limit" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.config.experimental.-virtual-device-configuration.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.config.experimental.-virtual-device-configuration.pbtxt index 25b6b6e216e..9b2a7d846b9 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.config.experimental.-virtual-device-configuration.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.config.experimental.-virtual-device-configuration.pbtxt @@ -3,6 +3,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "experimental_priority" + mtype: "" + } member { name: "memory_limit" mtype: ""
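
As a usage illustration (not part of the patch), here is roughly how a C++ client could populate the new `priority` field alongside `memory_limit_mb` when splitting one visible GPU into two virtual devices. The helper name `MakeTwoVirtualDeviceConfig` is hypothetical; the message and field names come from the `config.proto` change above, and the accessors are the standard protobuf-generated ones.

```cpp
#include "tensorflow/core/protobuf/config.pb.h"

// Hypothetical helper: splits visible GPU 0 into two virtual devices, the
// second one with the highest stream priority (-1 on most Nvidia GPUs).
tensorflow::ConfigProto MakeTwoVirtualDeviceConfig() {
  tensorflow::ConfigProto config;
  auto* gpu_options = config.mutable_gpu_options();
  gpu_options->set_visible_device_list("0");

  // One VirtualDevices entry per visible physical GPU; each element of
  // memory_limit_mb/priority describes one virtual device carved out of it.
  auto* virtual_devices =
      gpu_options->mutable_experimental()->add_virtual_devices();

  virtual_devices->add_memory_limit_mb(2048);  // virtual device 0
  virtual_devices->add_priority(0);            // default priority

  virtual_devices->add_memory_limit_mb(2048);  // virtual device 1
  virtual_devices->add_priority(-1);           // highest priority

  return config;
}
```

In Python, the equivalent configuration goes through the `experimental_priority` field added to `tf.config.LogicalDeviceConfiguration` in the `context.py` change above.
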