From 5a96e4e672109b29951d2f65fcf46b00d85e025a Mon Sep 17 00:00:00 2001
From: Vijay Vasudevan <vrv@google.com>
Date: Fri, 2 Sep 2016 19:32:03 -0800
Subject: [PATCH] Make CUDA host allocator fetch one of the available stream
 executors in the process, not the 0th one, which may not be visible to the
 process.

Fixes #1888 (for real this time?)
Change: 132128469
---
 .../core/common_runtime/gpu/process_state.cc  | 23 +++++++++++++++----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc
index f85b37cb8f4..60da115988e 100644
--- a/tensorflow/core/common_runtime/gpu/process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/process_state.cc
@@ -181,12 +181,25 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
   // different numa_nodes.  For now, just one.
   numa_node = 0;
   mutex_lock lock(mu_);
+
+  // Request CUDA host memory through the StreamExecutor of the first
+  // device index with a non-null entry in gpu_allocators_; any will work.
+  //
+  // TODO: This linear search is a stopgap; it would be nicer to use a
+  // better source of information about which executor to use.  For
+  // example, ProcessState could save the first stream executor it
+  // knows is valid.
+  gpu::StreamExecutor* se = nullptr;
+  for (size_t i = 0; i < gpu_allocators_.size(); ++i) {
+    if (gpu_allocators_[i] != nullptr) {
+      se = GPUMachineManager()->ExecutorForDevice(i).ValueOrDie();
+      break;
+    }
+  }
+
+  CHECK_NE(nullptr, se);
+
   while (static_cast<int>(cuda_host_allocators_.size()) <= numa_node) {
-    // CUDAHost alloc the same across all gpus, so just get the
-    // executor for the first device.
-    gpu::Platform* gpu_platform = GPUMachineManager();
-    gpu::StreamExecutor* se = gpu_platform->ExecutorForDevice(0).ValueOrDie();
-    CHECK(se);
     Allocator* allocator = nullptr;
     static constexpr bool kCudaHostMemoryUseBFC = true;
     if (kCudaHostMemoryUseBFC) {