[OpenCL] Extends matmul_benchmark.py to cover SYCL (#11697)

* [OpenCL] Extends matmul_benchmark.py to cover SYCL * Fixed typo * /gpu:0 -> /device:GPU:0 * Fixes control_flow_ops_py_test * /gpu: -> /device:GPU: * Fixes //tensorflow/python/profiler/internal:run_metadata_test * gpu: -> GPU: * Fixes tfprof_node * [OpenCL] Fixes device path to name with many colons (#123) The device path is constructed from a device name by replacing all colons with underscores. Some device names contain more than one colon, for example 'device:SYCL:0' which gives a path 'device_SYCL_0'. The previous code would not convert this back to the original device name, but rather to 'device:SYCL_0'. An alternative fix would be to convert all underscores to colons in the device name (i.e. remove the restriction inside `replace("_", ":", 1)`), however I'm not sure if there are any device names which contain underscores. * If no GPU device is available, fake one * gpu: -> device:GPU * Fixes profiler test * /gpu:x -> /device:GPU:x * Fixes debug_io_utils_test.cc test * Fixes device_name_utils_test.cc
2017-08-11 01:35:21 +01:00 · 2017-08-11 01:35:21 +01:00 · ab96f41fb4
commit ab96f41fb4
parent 35e7a36658
69 changed files with 286 additions and 285 deletions
--- a/tensorflow/cc/tutorials/example_trainer.cc
+++ b/tensorflow/cc/tutorials/example_trainer.cc
@ -101,7 +101,7 @@ void ConcurrentSteps(const Options* opts, int session_index) {
  std::unique_ptr<Session> session(NewSession(options));
  GraphDef def = CreateGraphDef();
  if (options.target.empty()) {
-    graph::SetDefaultDevice(opts->use_gpu ? "/gpu:0" : "/cpu:0", &def);
+    graph::SetDefaultDevice(opts->use_gpu ? "/device:GPU:0" : "/cpu:0", &def);
  }
  TF_CHECK_OK(session->Create(def));
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py
@ -93,7 +93,7 @@ class CudnnRNNBenchmark(test.Benchmark):
      batch_size = config["batch_size"]
      seq_length = config["seq_length"]
-      with ops.Graph().as_default(), ops.device("/gpu:0"):
+      with ops.Graph().as_default(), ops.device("/device:GPU:0"):
        model = cudnn_rnn_ops.CudnnLSTM(num_layers, num_units, num_units)
        params_size_t = model.params_size()
        input_data = variables.Variable(
@ -125,7 +125,7 @@ class CudnnRNNBenchmark(test.Benchmark):
      batch_size = config["batch_size"]
      seq_length = config["seq_length"]
-      with ops.Graph().as_default(), ops.device("/gpu:0"):
+      with ops.Graph().as_default(), ops.device("/device:GPU:0"):
        inputs = seq_length * [
            array_ops.zeros([batch_size, num_units], dtypes.float32)
        ]
@ -153,7 +153,7 @@ class CudnnRNNBenchmark(test.Benchmark):
      batch_size = config["batch_size"]
      seq_length = config["seq_length"]
-      with ops.Graph().as_default(), ops.device("/gpu:0"):
+      with ops.Graph().as_default(), ops.device("/device:GPU:0"):
        inputs = seq_length * [
            array_ops.zeros([batch_size, num_units], dtypes.float32)
        ]
--- a/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
@ -634,7 +634,7 @@ class MixtureBenchmark(test.Benchmark):
    np.random.seed(127)
    with session.Session(config=config, graph=ops.Graph()) as sess:
      random_seed.set_random_seed(0)
-      with ops.device("/gpu:0" if use_gpu else "/cpu:0"):
+      with ops.device("/device:GPU:0" if use_gpu else "/cpu:0"):
        mixture = create_distribution(
            num_components=num_components,
            batch_size=batch_size,
--- a/tensorflow/contrib/framework/python/ops/variables_test.py
+++ b/tensorflow/contrib/framework/python/ops/variables_test.py
@ -443,19 +443,19 @@ class VariablesTest(test.TestCase):
        e = variables_lib2.variable('e', initializer=e_init)
      # The values below highlight how the VariableDeviceChooser puts initial
      # values on the same device as the variable job.
-      self.assertDeviceEqual(a.device, '/gpu:0')
+      self.assertDeviceEqual(a.device, '/device:GPU:0')
      self.assertEqual(a.initial_value.op.colocation_groups(),
                       a.op.colocation_groups())
-      self.assertDeviceEqual(b.device, '/gpu:0')
+      self.assertDeviceEqual(b.device, '/device:GPU:0')
      self.assertEqual(b.initial_value.op.colocation_groups(),
                       b.op.colocation_groups())
      self.assertDeviceEqual(c.device, '/cpu:12')
      self.assertEqual(c.initial_value.op.colocation_groups(),
                       c.op.colocation_groups())
-      self.assertDeviceEqual(d.device, '/gpu:0')
+      self.assertDeviceEqual(d.device, '/device:GPU:0')
      self.assertEqual(d.initial_value.op.colocation_groups(),
                       d.op.colocation_groups())
-      self.assertDeviceEqual(e.device, '/gpu:0')
+      self.assertDeviceEqual(e.device, '/device:GPU:0')
      self.assertDeviceEqual(e.initial_value.device, '/cpu:99')
--- a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
+++ b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
@ -43,7 +43,7 @@ class AllReduceTest(test.TestCase):
        self._testSingleAllReduce(sess, dtype, nccl.all_max, np.maximum)
  def _testSingleAllReduce(self, sess, np_type, nccl_fn, numpy_accumulation_fn):
-    for devices in [['/gpu:0', '/gpu:0', '/gpu:0'], ['/gpu:0', '/gpu:0']]:
+    for devices in [['/device:GPU:0', '/device:GPU:0', '/device:GPU:0'], ['/device:GPU:0', '/device:GPU:0']]:
      shape = (3, 4)
      np_ans = None
      tensors = []
@ -84,7 +84,7 @@ class BroadcastTest(test.TestCase):
      # Create session inside outer loop to test use of
      # same communicator across multiple sessions.
      with self.test_session(use_gpu=True) as sess:
-        for devices in [['/gpu:0', '/gpu:0', '/gpu:0'], ['/gpu:0', '/gpu:0']]:
+        for devices in [['/device:GPU:0', '/device:GPU:0', '/device:GPU:0'], ['/device:GPU:0', '/device:GPU:0']]:
          shape = (3, 4)
          sender = np.random.randint(0, len(devices) - 1)
          with ops.device(devices[sender]):
@ -115,7 +115,7 @@ class CombinedTest(test.TestCase):
      # Create session inside outer loop to test use of
      # same communicator across multiple sessions.
      with self.test_session(use_gpu=True) as sess:
-        for devices in [['/gpu:0', '/gpu:0', '/gpu:0'], ['/gpu:0', '/gpu:0']]:
+        for devices in [['/device:GPU:0', '/device:GPU:0', '/device:GPU:0'], ['/device:GPU:0', '/device:GPU:0']]:
          shape = (3, 4)
          # all-reduce
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@ -446,12 +446,12 @@ class RNNCellTest(test.TestCase):
      # Can't perform this test w/o a GPU
      return
    gpu_dev = test.gpu_device_name()
    with self.test_session(use_gpu=True) as sess:
      with variable_scope.variable_scope(
          "root", initializer=init_ops.constant_initializer(0.5)):
        x = array_ops.zeros([1, 1, 3])
-        cell = rnn_cell_impl.DeviceWrapper(
+        cell = rnn_cell_impl.DeviceWrapper(rnn_cell_impl.GRUCell(3), gpu_dev)
-            rnn_cell_impl.GRUCell(3), test_util.gpu_device_name())
        with ops.device("/cpu:0"):
          outputs, _ = rnn.dynamic_rnn(
              cell=cell, inputs=x, dtype=dtypes.float32)
@ -463,8 +463,7 @@ class RNNCellTest(test.TestCase):
        _ = sess.run(outputs, options=opts, run_metadata=run_metadata)
      step_stats = run_metadata.step_stats
-      ix = 0 if (("gpu"  in step_stats.dev_stats[0].device) or
+      ix = 0 if gpu_dev in step_stats.dev_stats[0].device else 1
-                 ("sycl" in step_stats.dev_stats[0].device)) else 1
      gpu_stats = step_stats.dev_stats[ix].node_stats
      cpu_stats = step_stats.dev_stats[1 - ix].node_stats
      self.assertFalse([s for s in cpu_stats if "gru_cell" in s.node_name])
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
@ -42,7 +42,6 @@ from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.util import nest
-from tensorflow.python.framework import test_util
 class Plus1RNNCell(rnn_lib.RNNCell):
  """RNN Cell generating (output, new_state) = (input + 1, state + 1)."""
@ -2208,11 +2207,11 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase):
    if not test.is_gpu_available():
      return  # Test requires access to a GPU
    gpu_dev = test.gpu_device_name()
    run_metadata = self._execute_rnn_on(
-        rnn_device="/cpu:0", cell_device=test_util.gpu_device_name())
+        rnn_device="/cpu:0", cell_device=gpu_dev)
    step_stats = run_metadata.step_stats
-    ix = 0 if (("gpu" in step_stats.dev_stats[0].device) or
+    ix = 0 if (gpu_dev in step_stats.dev_stats[0].device) else 1
-    ("sycl" in step_stats.dev_stats[0].device)) else 1
    gpu_stats = step_stats.dev_stats[ix].node_stats
    cpu_stats = step_stats.dev_stats[1 - ix].node_stats
@ -2233,12 +2232,12 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase):
    if not test.is_gpu_available():
      return  # Test requires access to a GPU
    gpu_dev = test.gpu_device_name()
    run_metadata = self._execute_rnn_on(
        rnn_device="/cpu:0", cell_device="/cpu:0",
-        input_device=test_util.gpu_device_name())
+        input_device=gpu_dev)
    step_stats = run_metadata.step_stats
-    ix = 0 if (("gpu" in step_stats.dev_stats[0].device) or
+    ix = 0 if (gpu_dev in step_stats.dev_stats[0].device) else 1
-    ("sycl" in step_stats.dev_stats[0].device)) else 1
    gpu_stats = step_stats.dev_stats[ix].node_stats
    cpu_stats = step_stats.dev_stats[1 - ix].node_stats
@ -2253,11 +2252,11 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase):
    if not test.is_gpu_available():
      return  # Test requires access to a GPU
    gpu_dev = test.gpu_device_name()
    run_metadata = self._execute_rnn_on(
-        input_device=test_util.gpu_device_name())
+        input_device=gpu_dev)
    step_stats = run_metadata.step_stats
-    ix = 0 if (("gpu" in step_stats.dev_stats[0].device) or
+    ix = 0 if (gpu_dev in step_stats.dev_stats[0].device) else 1
-    ("sycl" in step_stats.dev_stats[0].device)) else 1
    gpu_stats = step_stats.dev_stats[ix].node_stats
    cpu_stats = step_stats.dev_stats[1 - ix].node_stats
--- a/tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py
@ -357,7 +357,7 @@ def training_gru_block_vs_gru_cell(batch_size,
  ops.reset_default_graph()
  with session.Session(graph=ops.Graph()) as sess:
    # Specify the device which is been used.
-    with ops.device("/cpu:0" if not use_gpu else "/gpu:0"):
+    with ops.device("/cpu:0" if not use_gpu else "/device:GPU:0"):
      # Random initializers.
      seed = 1994
@ -429,7 +429,7 @@ def inference_gru_block_vs_gru_cell(batch_size,
  """Benchmark inference speed between GRUBlockCell vs GRUCell."""
  ops.reset_default_graph()
  with session.Session(graph=ops.Graph()) as sess:
-    with ops.device("/cpu:0" if not use_gpu else "/gpu:0"):
+    with ops.device("/cpu:0" if not use_gpu else "/device:GPU:0"):
      # Random initializers.
      seed = 1994
@ -484,7 +484,7 @@ def single_bprop_step_gru_block_vs_gru_cell(batch_size,
  """Benchmark single bprop step speed between GRUBlockCell vs GRUCell."""
  ops.reset_default_graph()
  with session.Session(graph=ops.Graph()) as sess:
-    with ops.device("/cpu:0" if not use_gpu else "/gpu:0"):
+    with ops.device("/cpu:0" if not use_gpu else "/device:GPU:0"):
      initializer = init_ops.random_uniform_initializer(-1, 1, seed=1989)
      # Inputs
      x = vs.get_variable("x", [batch_size, input_size])
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
@ -78,7 +78,7 @@ class GatherTreeTest(test.TestCase):
    sequence_length = [[3, 3, 3]]
    expected_result = _transpose_batch_time(
        [[[2, -1, 2], [6, 5, 6], [7, 8, 9], [-1, -1, -1]]])
-    with ops.device("/gpu:0"):
+    with ops.device("/device:GPU:0"):
      beams = beam_search_ops.gather_tree(
          step_ids=step_ids, parent_ids=parent_ids,
          sequence_length=sequence_length)
--- a/tensorflow/core/common_runtime/device.h
+++ b/tensorflow/core/common_runtime/device.h
@ -22,7 +22,7 @@ limitations under the License.
 // Device names
 // * Every Device should have a unique name with the format:
 //     /job:___/replica:___/task:___/(gpu|cpu):___
-//   An example name would be "/job:train/replica:0/task:3/gpu:2".
+//   An example name would be "/job:train/replica:0/task:3/device:GPU:2".
 // * Task numbers are within the specified replica, so there are as
 //   many "task zeros" as replicas.
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@ -476,7 +476,7 @@ TEST(DirectSessionTest, PlacePrunedGraph) {
    vx.scalar<float>()() = 1.0;
    Node* x = test::graph::Constant(&g, vx);
    Node* y = test::graph::Unary(&g, "Darth", x);
-    y->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");
+    y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
    GraphDef def;
    test::graph::ToGraphDef(&g, &def);
@ -494,7 +494,7 @@ TEST(DirectSessionTest, PlacePrunedGraph) {
    vx.scalar<float>()() = 1.0;
    Node* x = test::graph::Constant(&g, vx);
    Node* y = test::graph::Unary(&g, "Darth", x);
-    y->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");
+    y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
    GraphDef def;
    test::graph::ToGraphDef(&g, &def);
--- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
@ -154,14 +154,14 @@ static void TestHWAccelerator(bool enableHWTrace) {
  Tensor x_tensor(DT_FLOAT, TensorShape({2, 1}));
  test::FillValues<float>(&x_tensor, {1, 1});
  Node* x = test::graph::Constant(&graph, x_tensor);
-  x->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");
+  x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
 #ifdef TENSORFLOW_USE_SYCL
  x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0");
 #endif // TENSORFLOW_USE_SYCL
  // y = A * x
  Node* y = test::graph::Matmul(&graph, a, x, false, false);
-  y->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");
+  y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
 #ifdef TENSORFLOW_USE_SYCL
 y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0");
 #endif // TENSORFLOW_USE_SYCL
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@ -588,7 +588,7 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
  for (int i = 0; i < n; i++) {
    BaseGPUDevice* gpu_device;
    TF_RETURN_IF_ERROR(CreateGPUDevice(options,
-                                       strings::StrCat(name_prefix, "/gpu:", i),
+                                       strings::StrCat(name_prefix, "/device:GPU:", i),
                                       valid_gpu_ids[i], &gpu_device));
    TF_RETURN_IF_ERROR(gpu_device->Init(options));
    devices->push_back(gpu_device);
@ -1049,7 +1049,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
    size_t new_id = ids->size();
    ids->push_back(visible_gpu_id);
-    LOG(INFO) << "Creating TensorFlow device (/gpu:" << new_id << ") -> "
+    LOG(INFO) << "Creating TensorFlow device (/device:GPU:" << new_id << ") -> "
              << "(" << GetShortDeviceDescription(visible_gpu_id, desc) << ")";
  }
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@ -141,7 +141,7 @@ class BaseGPUDeviceFactory : public DeviceFactory {
                                         Allocator* cpu_allocator) = 0;
  // Returns into 'ids' the list of valid GPU ids, in the order that
-  // they should map to logical gpu ids "/gpu:0", "/gpu:1", etc, based
+  // they should map to logical gpu ids "/device:GPU:0", "/device:GPU:1", etc, based
  // upon 'visible_device_list', a comma-separated list of 'visible
  // gpu ids'.
  Status GetValidDeviceIds(const string& visible_device_list,
--- a/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc
@ -106,9 +106,9 @@ TEST_F(GpuStreamUtilTest, SimpleGraphManyStreams) {
 TEST_F(GpuStreamUtilTest, StreamOverrides) {
  auto root = Scope::NewRootScope().ExitOnError();
  ops::_Recv(root.WithOpName("input"), DT_FLOAT, "input", "/cpu:0", 0,
-             "/gpu:0");
+             "/device:GPU:0");
  Output n = ops::MatMul(root, {}, {});
-  ops::_Send(root.WithOpName("output"), n, "output", "/gpu:0", 0, "/cpu:0");
+  ops::_Send(root.WithOpName("output"), n, "output", "/device:GPU:0", 0, "/cpu:0");
  Graph g(OpRegistry::Global());
  TF_ASSERT_OK(root.ToGraph(&g));
--- a/tensorflow/core/common_runtime/memory_types_test.cc
+++ b/tensorflow/core/common_runtime/memory_types_test.cc
@ -53,7 +53,7 @@ TEST(MemoryTypeChecker, Int32NotOk) {
  EXPECT_TRUE(errors::IsInternal(ValidateMemoryTypes(DEVICE_GPU, g)));
  // But we can insert _HostSend/_HostRecv to ensure the invariant.
-  TF_EXPECT_OK(EnsureMemoryTypes(DEVICE_GPU, "/gpu:0", g));
+  TF_EXPECT_OK(EnsureMemoryTypes(DEVICE_GPU, "/device:GPU:0", g));
  TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_GPU, g));
 #endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
--- a/tensorflow/core/debug/debug_gateway.cc
+++ b/tensorflow/core/debug/debug_gateway.cc
@ -86,7 +86,7 @@ void DebugGateway::CopyTensor(const string& node_name, const int output_slot,
    // Determine if the tensor is on device (GPU) or host (CPU).
    // The second part of the check is necessary because even an OpKernel on
    // may have output tensors allocated on CPU.
-    if ((device->name().find("gpu:") != string::npos || device->name().find("SYCL:") != string::npos) &&
+    if ((device->name().find("GPU:") != string::npos || device->name().find("SYCL:") != string::npos) &&
        !ctx->output_alloc_attr(output_slot).on_host()) {
      // GPU tensors: Copy it to host (CPU).
      DeviceContext* device_ctxt = ctx->op_device_context();
--- a/tensorflow/core/debug/debug_gateway_test.cc
+++ b/tensorflow/core/debug/debug_gateway_test.cc
@ -47,7 +47,7 @@ class SessionDebugMinusAXTest : public ::testing::Test {
    Graph graph(OpRegistry::Global());
 #if GOOGLE_CUDA
-    const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0";
+    const string kDeviceName = "/job:localhost/replica:0/task:0/device:GPU:0";
 #elif defined(TENSORFLOW_USE_SYCL)
    const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0";
 #else
@ -505,7 +505,7 @@ class SessionDebugOutputSlotWithoutOngoingEdgeTest : public ::testing::Test {
    Graph graph(OpRegistry::Global());
 #if GOOGLE_CUDA
-    const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0";
+    const string kDeviceName = "/job:localhost/replica:0/task:0/device:GPU:0";
 #elif defined(TENSORFLOW_USE_SYCL)
    const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0";
 #else
@ -607,7 +607,7 @@ class SessionDebugVariableTest : public ::testing::Test {
    Graph graph(OpRegistry::Global());
 #if GOOGLE_CUDA
-    const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0";
+    const string kDeviceName = "/job:localhost/replica:0/task:0/device:GPU:0";
 #elif defined(TENSORFLOW_USE_SYCL)
    const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0";
 #else
@ -879,7 +879,7 @@ class SessionDebugGPUSwitchTest : public ::testing::Test {
    Graph graph(OpRegistry::Global());
 #ifdef GOOGLE_CUDA
-    const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0";
+    const string kDeviceName = "/job:localhost/replica:0/task:0/device:GPU:0";
 #elif TENSORFLOW_USE_SYCL
    const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0";
 #endif
--- a/tensorflow/core/debug/debug_io_utils_test.cc
+++ b/tensorflow/core/debug/debug_io_utils_test.cc
@ -51,14 +51,14 @@ class DebugIOUtilsTest : public ::testing::Test {
 };
 TEST_F(DebugIOUtilsTest, ConstructDebugNodeKey) {
-  DebugNodeKey debug_node_key("/job:worker/replica:1/task:0/gpu:2",
+  DebugNodeKey debug_node_key("/job:worker/replica:1/task:0/device:GPU:2",
                              "hidden_1/MatMul", 0, "DebugIdentity");
-  EXPECT_EQ("/job:worker/replica:1/task:0/gpu:2", debug_node_key.device_name);
+  EXPECT_EQ("/job:worker/replica:1/task:0/device:GPU:2", debug_node_key.device_name);
  EXPECT_EQ("hidden_1/MatMul", debug_node_key.node_name);
  EXPECT_EQ(0, debug_node_key.output_slot);
  EXPECT_EQ("DebugIdentity", debug_node_key.debug_op);
  EXPECT_EQ("hidden_1/MatMul:0:DebugIdentity", debug_node_key.debug_node_name);
-  EXPECT_EQ("_tfdbg_device_,job_worker,replica_1,task_0,gpu_2",
+  EXPECT_EQ("_tfdbg_device_,job_worker,replica_1,task_0,device_GPU_2",
            debug_node_key.device_path);
 }
--- a/tensorflow/core/distributed_runtime/executor_test.cc
+++ b/tensorflow/core/distributed_runtime/executor_test.cc
@ -140,7 +140,7 @@ Rendezvous::ParsedKey Key(const string& sender, const uint64 incarnation,
 }
 #define ALICE "/job:j/replica:0/task:0/cpu:0"
-#define BOB "/job:j/replica:0/task:0/gpu:0"
+#define BOB "/job:j/replica:0/task:0/device:GPU:0"
 TEST_F(ExecutorTest, SimpleAdd) {
  // c = a + b
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc
@ -31,9 +31,9 @@ TEST(GrpcChannelTest, IsSameAddressSpace) {
  EXPECT_TRUE(IsSameAddrSp("/job:mnist/replica:10/task:10/cpu:0",
                           "/job:mnist/replica:10/task:10/cpu:1"));
  EXPECT_TRUE(IsSameAddrSp("/job:mnist/replica:10/task:10/cpu:0",
-                           "/job:mnist/replica:10/task:10/gpu:2"));
+                           "/job:mnist/replica:10/task:10/device:GPU:2"));
  EXPECT_TRUE(IsSameAddrSp("/job:mnist/replica:10/task:10",
-                           "/job:mnist/replica:10/task:10/gpu:2"));
+                           "/job:mnist/replica:10/task:10/device:GPU:2"));
  EXPECT_TRUE(IsSameAddrSp("/job:mnist/replica:10/task:10/cpu:1",
                           "/job:mnist/replica:10/task:10"));
--- a/tensorflow/core/framework/node_def.proto
+++ b/tensorflow/core/framework/node_def.proto
@ -38,8 +38,8 @@ message NodeDef {
  //              | ( ("gpu" | "cpu") ":" ([1-9][0-9]* | "*") )
  //
  // Valid values for this string include:
-  // * "/job:worker/replica:0/task:1/gpu:3"  (full specification)
+  // * "/job:worker/replica:0/task:1/device:GPU:3"  (full specification)
-  // * "/job:worker/gpu:3"                   (partial specification)
+  // * "/job:worker/device:GPU:3"                   (partial specification)
  // * ""                                    (no specification)
  //
  // If the constraints do not resolve to a single device (or if this
--- a/tensorflow/core/framework/rendezvous_test.cc
+++ b/tensorflow/core/framework/rendezvous_test.cc
@ -39,11 +39,11 @@ namespace {
 TEST(RendezvousTest, Key) {
  const string key = Rendezvous::CreateKey(
      "/job:mnist/replica:1/task:2/CPU:0", 7890,
-      "/job:mnist/replica:1/task:2/GPU:0", "var0", FrameAndIter(0, 0));
+      "/job:mnist/replica:1/task:2/device:GPU:0", "var0", FrameAndIter(0, 0));
  EXPECT_EQ(key,
            "/job:mnist/replica:1/task:2/CPU:0;"
            "0000000000001ed2;"  // 7890 = 0x1ed2
-            "/job:mnist/replica:1/task:2/GPU:0;"
+            "/job:mnist/replica:1/task:2/device:GPU:0;"
            "var0;"
            "0:0");
  Rendezvous::ParsedKey parsed;
@ -51,12 +51,12 @@ TEST(RendezvousTest, Key) {
  EXPECT_EQ(parsed.src_device, "/job:mnist/replica:1/task:2/CPU:0");
  EXPECT_EQ(parsed.src_incarnation, 7890);
  EXPECT_EQ(parsed.src.type, "CPU");
-  EXPECT_EQ(parsed.dst_device, "/job:mnist/replica:1/task:2/GPU:0");
+  EXPECT_EQ(parsed.dst_device, "/job:mnist/replica:1/task:2/device:GPU:0");
  EXPECT_EQ(parsed.dst.type, "GPU");
  EXPECT_FALSE(Rendezvous::ParseKey("foo;bar;baz", &parsed).ok());
  EXPECT_FALSE(Rendezvous::ParseKey("/job:mnist/replica:1/task:2/CPU:0;"
-                                    "/job:mnist/replica:1/task:2/GPU:0;",
+                                    "/job:mnist/replica:1/task:2/device:GPU:0;",
                                    &parsed)
                   .ok());
  EXPECT_FALSE(
@ -99,7 +99,7 @@ string V(const Tensor& tensor) {
 Rendezvous::ParsedKey MakeKey(const string& name) {
  string s = Rendezvous::CreateKey("/job:mnist/replica:1/task:2/CPU:0", 7890,
-                                   "/job:mnist/replica:1/task:2/GPU:0", name,
+                                   "/job:mnist/replica:1/task:2/device:GPU:0", name,
                                   FrameAndIter(0, 0));
  Rendezvous::ParsedKey k;
  TF_EXPECT_OK(Rendezvous::ParseKey(s, &k));
--- a/tensorflow/core/graph/graph_partition_test.cc
+++ b/tensorflow/core/graph/graph_partition_test.cc
@ -50,7 +50,7 @@ extern Status TopologicalSortNodesWithTimePriority(
 namespace {
-const char gpu_device[] = "/job:a/replica:0/task:0/gpu:0";
+const char gpu_device[] = "/job:a/replica:0/task:0/device:GPU:0";
 string SplitByDevice(const Node* node) { return node->assigned_device_name(); }
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@ -40,7 +40,7 @@ namespace tensorflow {
 namespace {
 const char kCPUDevice[] = "/job:a/replica:0/task:0/cpu:0";
-const char kGPUDevice[] = "/job:a/replica:0/task:0/gpu:0";
+const char kGPUDevice[] = "/job:a/replica:0/task:0/device:GPU:0";
 static void InitGraph(const string& s, Graph* graph,
                      const string& device = kCPUDevice) {
--- a/tensorflow/core/grappler/clusters/single_machine.cc
+++ b/tensorflow/core/grappler/clusters/single_machine.cc
@ -89,7 +89,7 @@ Status SingleMachine::Provision() {
  VLOG(1) << "Number of GPUs: " << num_gpus_;
  for (int i = 0; i < num_gpus_; ++i) {
    string device_name =
-        strings::StrCat("/job:localhost/replica:0/task:0/gpu:", i);
+        strings::StrCat("/job:localhost/replica:0/task:0/device:GPU:", i);
    VLOG(1) << "Adding GPU device " << device_name;
    devices_[device_name] = GetLocalGPUInfo(i);
  }
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
@ -42,7 +42,7 @@ class AnalyticalCostEstimatorTest : public ::testing::Test {
    gpu_device.set_frequency(1100);
    gpu_device.set_bandwidth(180 * 1024 * 1024);
    (*gpu_device.mutable_environment())["architecture"] = "6";
-    devices["/job:localhost/replica:0/task:0/gpu:0"] = gpu_device;
+    devices["/job:localhost/replica:0/task:0/device:GPU:0"] = gpu_device;
    cluster_.reset(new VirtualCluster(devices));
  }
--- a/tensorflow/core/grappler/costs/virtual_placer_test.cc
+++ b/tensorflow/core/grappler/costs/virtual_placer_test.cc
@ -30,14 +30,14 @@ TEST(VirtualPlacerTest, LocalDevices) {
  devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device;
  DeviceProperties gpu_device;
  gpu_device.set_type("GPU");
-  devices["/job:localhost/replica:0/task:0/gpu:0"] = gpu_device;
+  devices["/job:localhost/replica:0/task:0/device:GPU:0"] = gpu_device;
  VirtualCluster cluster(devices);
  VirtualPlacer placer(&cluster);
  NodeDef node;
  node.set_op("Conv2D");
  EXPECT_EQ("GPU", placer.get_device(node).type());
-  EXPECT_EQ("/job:localhost/replica:0/task:0/gpu:0",
+  EXPECT_EQ("/job:localhost/replica:0/task:0/device:GPU:0",
            placer.get_canonical_device_name(node));
  node.set_device("CPU");
@ -47,7 +47,7 @@ TEST(VirtualPlacerTest, LocalDevices) {
  node.set_device("GPU:0");
  EXPECT_EQ("GPU", placer.get_device(node).type());
-  EXPECT_EQ("/job:localhost/replica:0/task:0/gpu:0",
+  EXPECT_EQ("/job:localhost/replica:0/task:0/device:GPU:0",
            placer.get_canonical_device_name(node));
 }
@ -60,7 +60,7 @@ TEST(VirtualPlacerTest, EmptyJobBecomesLocalhost) {
  devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device;
  DeviceProperties gpu_device;
  gpu_device.set_type("GPU");
-  devices["/job:localhost/replica:0/task:0/gpu:0"] = gpu_device;
+  devices["/job:localhost/replica:0/task:0/device:GPU:0"] = gpu_device;
  VirtualCluster cluster(devices);
  VirtualPlacer placer(&cluster);
@ -70,7 +70,7 @@ TEST(VirtualPlacerTest, EmptyJobBecomesLocalhost) {
  EXPECT_EQ("/job:localhost/replica:0/task:0/cpu:0",
            placer.get_canonical_device_name(node));
  node.set_device("/device:GPU:0");
-  EXPECT_EQ("/job:localhost/replica:0/task:0/gpu:0",
+  EXPECT_EQ("/job:localhost/replica:0/task:0/device:GPU:0",
            placer.get_canonical_device_name(node));
 }
@ -113,7 +113,7 @@ TEST(VirtualPlacerTest, RemoteDevices) {
  devices["/job:my_job/replica:0/task:0/cpu:0"] = cpu_device;
  DeviceProperties gpu_device;
  gpu_device.set_type("GPU");
-  devices["/job:my_job/replica:0/task:0/gpu:0"] = gpu_device;
+  devices["/job:my_job/replica:0/task:0/device:GPU:0"] = gpu_device;
  VirtualCluster cluster(devices);
  VirtualPlacer placer(&cluster);
@ -122,7 +122,7 @@ TEST(VirtualPlacerTest, RemoteDevices) {
  // Device falls back to GPU.
  EXPECT_EQ("GPU", placer.get_device(node).type());
-  EXPECT_EQ("/job:my_job/replica:0/task:0/gpu:0",
+  EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0",
            placer.get_canonical_device_name(node));
  node.set_device("/job:my_job/replica:0/task:0/cpu:0");
@ -130,27 +130,27 @@ TEST(VirtualPlacerTest, RemoteDevices) {
  EXPECT_EQ("/job:my_job/replica:0/task:0/cpu:0",
            placer.get_canonical_device_name(node));
-  node.set_device("/job:my_job/replica:0/task:0/gpu:0");
+  node.set_device("/job:my_job/replica:0/task:0/device:GPU:0");
  EXPECT_EQ("GPU", placer.get_device(node).type());
-  EXPECT_EQ("/job:my_job/replica:0/task:0/gpu:0",
+  EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0",
            placer.get_canonical_device_name(node));
  // There is no local cpu available. Device falls back to GPU.
  node.set_device("CPU");
  EXPECT_EQ("GPU", placer.get_device(node).type());
-  EXPECT_EQ("/job:my_job/replica:0/task:0/gpu:0",
+  EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0",
            placer.get_canonical_device_name(node));
  node.set_device("GPU:0");
  // There is no local GPU available. Fall back to default GPU.
  EXPECT_EQ("GPU", placer.get_device(node).type());
-  EXPECT_EQ("/job:my_job/replica:0/task:0/gpu:0",
+  EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0",
            placer.get_canonical_device_name(node));
  // This isn't a valid name. Fall back to GPU.
  node.set_device("/job:my_job/replica:0/task:0");
  EXPECT_EQ("GPU", placer.get_device(node).type());
-  EXPECT_EQ("/job:my_job/replica:0/task:0/gpu:0",
+  EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0",
            placer.get_canonical_device_name(node));
 }
--- a/tensorflow/core/grappler/optimizers/model_pruner_test.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
@ -320,14 +320,14 @@ TEST_F(ModelPrunerTest, PruningPerservesCrossDeviceIdentity) {
  Output c = ops::Const(s.WithOpName("c").WithDevice("/cpu:0"), 0.0f, {10, 10});
  // Node i1 should be preserved.
-  Output i1 = ops::Identity(s.WithOpName("i1").WithDevice("/gpu:0"), c);
+  Output i1 = ops::Identity(s.WithOpName("i1").WithDevice("/device:GPU:0"), c);
-  Output a1 = ops::Sqrt(s.WithOpName("a1").WithDevice("/gpu:0"), {i1});
+  Output a1 = ops::Sqrt(s.WithOpName("a1").WithDevice("/device:GPU:0"), {i1});
-  Output a2 = ops::Sqrt(s.WithOpName("a2").WithDevice("/gpu:0"), {i1});
+  Output a2 = ops::Sqrt(s.WithOpName("a2").WithDevice("/device:GPU:0"), {i1});
  // Node i2 should be pruned since it resides on the sender's device.
  Output i2 = ops::Identity(s.WithOpName("i2").WithDevice("/cpu:0"), c);
-  Output a3 = ops::Sqrt(s.WithOpName("a3").WithDevice("/gpu:0"), {i2});
+  Output a3 = ops::Sqrt(s.WithOpName("a3").WithDevice("/device:GPU:0"), {i2});
-  Output a4 = ops::Sqrt(s.WithOpName("a4").WithDevice("/gpu:0"), {i2});
+  Output a4 = ops::Sqrt(s.WithOpName("a4").WithDevice("/device:GPU:0"), {i2});
  GrapplerItem item;
  TF_CHECK_OK(s.ToGraphDef(&item.graph));
--- a/tensorflow/core/platform/default/gpu_tracer.cc
+++ b/tensorflow/core/platform/default/gpu_tracer.cc
@ -579,8 +579,8 @@ Status GPUTracerImpl::Collect(StepStatsCollector *collector) {
  // TODO(pbar) Handle device IDs and prefix properly.
  const string prefix = "";
  const int id = 0;
-  const string stream_device = strings::StrCat(prefix, "/gpu:", id, "/stream:");
+  const string stream_device = strings::StrCat(prefix, "/device:GPU:", id, "/stream:");
-  const string memcpy_device = strings::StrCat(prefix, "/gpu:", id, "/memcpy");
+  const string memcpy_device = strings::StrCat(prefix, "/device:GPU:", id, "/memcpy");
  mutex_lock l2(trace_mu_);
  for (const auto &rec : kernel_records_) {
--- a/tensorflow/core/platform/gpu_tracer_test.cc
+++ b/tensorflow/core/platform/gpu_tracer_test.cc
@ -63,12 +63,12 @@ class GPUTracerTest : public ::testing::Test {
    Tensor x_tensor(DT_FLOAT, TensorShape({2, 1}));
    test::FillValues<float>(&x_tensor, {1, 1});
    Node* x = test::graph::Constant(&graph, x_tensor);
-    x->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");
+    x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
    x_ = x->name();
    // y = A * x
    Node* y = test::graph::Matmul(&graph, a, x, false, false);
-    y->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");
+    y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
    y_ = y->name();
    // Use an Identity op to force a memcpy to CPU and back to GPU.
@ -77,7 +77,7 @@ class GPUTracerTest : public ::testing::Test {
    Node* y_neg = test::graph::Unary(&graph, "Neg", i);
    y_neg_ = y_neg->name();
-    y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");
+    y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
    test::graph::ToGraphDef(&graph, &def_);
  }
--- a/tensorflow/core/profiler/README.md
+++ b/tensorflow/core/profiler/README.md
@ -127,10 +127,10 @@ tfprof> advise
 Not running under xxxx. Skip JobChecker.
 AcceleratorUtilizationChecker:
-device: /job:worker/replica:0/task:0/gpu:0 low utilization: 0.03
+device: /job:worker/replica:0/task:0/device:GPU:0 low utilization: 0.03
-device: /job:worker/replica:0/task:0/gpu:1 low utilization: 0.08
+device: /job:worker/replica:0/task:0/device:GPU:1 low utilization: 0.08
-device: /job:worker/replica:0/task:0/gpu:2 low utilization: 0.04
+device: /job:worker/replica:0/task:0/device:GPU:2 low utilization: 0.04
-device: /job:worker/replica:0/task:0/gpu:3 low utilization: 0.21
+device: /job:worker/replica:0/task:0/device:GPU:3 low utilization: 0.21
 OperationChecker:
 Found operation using NHWC data_format on GPU. Maybe NCHW is faster.
--- a/tensorflow/core/profiler/g3doc/advise.md
+++ b/tensorflow/core/profiler/g3doc/advise.md
@ -31,10 +31,10 @@ tfprof --graph_path=graph.pbtxt \
 tfprof> advise
 AcceleratorUtilizationChecker:
-device: /job:worker/replica:0/task:0/gpu:0 low utilization: 0.03
+device: /job:worker/replica:0/task:0/device:GPU:0 low utilization: 0.03
-device: /job:worker/replica:0/task:0/gpu:1 low utilization: 0.08
+device: /job:worker/replica:0/task:0/device:GPU:1 low utilization: 0.08
-device: /job:worker/replica:0/task:0/gpu:2 low utilization: 0.04
+device: /job:worker/replica:0/task:0/device:GPU:2 low utilization: 0.04
-device: /job:worker/replica:0/task:0/gpu:3 low utilization: 0.21
+device: /job:worker/replica:0/task:0/device:GPU:3 low utilization: 0.21
 OperationChecker:
 Found operation using NHWC data_format on GPU. Maybe NCHW is faster.
--- a/tensorflow/core/profiler/g3doc/profile_time.md
+++ b/tensorflow/core/profiler/g3doc/profile_time.md
@ -134,7 +134,7 @@ AddN                            50.10ms (17.33%, 1.34%),       5481
 tfprof> op -select micros,device -order_by micros
 node name | execution time | assigned devices
 SoftmaxCrossEntropyWithLogits     1.37sec (100.00%, 36.44%), /job:worker/replica:0/task:0/cpu:0
-MatMul                        618.97ms (63.56%, 16.51%), |/job:worker/replica:0/task:0/cpu:0|/job:worker/replica:0/task:0/gpu:0|/job:worker/replica:0/task:0/gpu:1|/job:worker/replica:0/task:0/gpu:2|/job:worker/replica:0/task:0/gpu:3
+MatMul                        618.97ms (63.56%, 16.51%), |/job:worker/replica:0/task:0/cpu:0|/job:worker/replica:0/task:0/device:GPU:0|/job:worker/replica:0/task:0/device:GPU:1|/job:worker/replica:0/task:0/device:GPU:2|/job:worker/replica:0/task:0/device:GPU:3
 ```
--- a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
+++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
@ -53,10 +53,10 @@ class TFProfAdvisorTest : public ::testing::Test {
    NodeExecStats node_stat;
    node_stat.set_all_start_micros(start_miros);
    node_stat.set_op_end_rel_micros(end_rel_micros);
-    node->AddStepStat(step, "/job:localhost/replica:0/task:0/gpu:0", node_stat);
+    node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0", node_stat);
-    node->AddStepStat(step, "/job:localhost/replica:0/task:0/gpu:0:stream:all",
+    node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0:stream:all",
                      node_stat);
-    node->AddStepStat(step, "/job:localhost/replica:0/task:0/gpu:0:stream:0",
+    node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0:stream:0",
                      node_stat);
    return node;
  }
--- a/tensorflow/core/profiler/internal/tfprof_node.cc
+++ b/tensorflow/core/profiler/internal/tfprof_node.cc
@ -25,7 +25,7 @@ bool CountAsAcceleratorTime(const string& device) {
 }
 bool CountAsCPUTime(const string& device) {
-  return RE2::FullMatch(device, ".*/(gpu|cpu|device:sycl):\\d+");
+  return RE2::FullMatch(device, ".*/(device:gpu|gpu|cpu|device:sycl):\\d+");
 }
 bool IsCanonicalDevice(const string& device) { return CountAsCPUTime(device); }
@ -143,7 +143,7 @@ void TFGraphNode::AddStepStat(int64 step, const string& device,
  // TODO(xpan): Make this more robust?
  // See run_metadata_test.py
-  // It can be /job:0/replica:0/xxxx/gpu:0, or simply /gpu:0.
+  // It can be /job:0/replica:0/xxxx/device:GPU:0, or simply /device:GPU:0.
  // It can have some ad-hoc suffix, such as /stream:xx or /memcpy:xx.
  if (IsCanonicalDevice(dev)) {
    if (!canonical_device_.empty()) {
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@ -42,7 +42,7 @@ message GPUOptions {
  // A comma-separated list of GPU ids that determines the 'visible'
  // to 'virtual' mapping of GPU devices.  For example, if TensorFlow
  // can see 8 GPU devices in the process, and one wanted to map
-  // visible GPU devices 5 and 3 as "/gpu:0", and "/gpu:1", then one
+  // visible GPU devices 5 and 3 as "/device:GPU:0", and "/device:GPU:1", then one
  // would specify this field as "5,3".  This field is similar in
  // spirit to the CUDA_VISIBLE_DEVICES environment variable, except
  // it applies to the visible GPU devices in the process.
--- a/tensorflow/core/util/device_name_utils_test.cc
+++ b/tensorflow/core/util/device_name_utils_test.cc
@ -76,21 +76,21 @@ TEST(DeviceNameUtilsTest, Basic) {
    DeviceNameUtils::ParsedName p;
    EXPECT_FALSE(DeviceNameUtils::ParseFullName("foobar", &p));
    EXPECT_FALSE(
-        DeviceNameUtils::ParseFullName("/job:123/replica:1/task:2/gpu:3", &p));
+        DeviceNameUtils::ParseFullName("/job:123/replica:1/task:2/device:GPU:3", &p));
    EXPECT_FALSE(
        DeviceNameUtils::ParseFullName("/job:123/replica:1/task:2/gpu:", &p));
    EXPECT_FALSE(DeviceNameUtils::ParseFullName(
        "/job:123/replica:1/task:2/device:gpu:", &p));
    EXPECT_FALSE(
-        DeviceNameUtils::ParseFullName("/job:foo/replica:-1/task:2/gpu:3", &p));
+        DeviceNameUtils::ParseFullName("/job:foo/replica:-1/task:2/device:GPU:3", &p));
    EXPECT_FALSE(
-        DeviceNameUtils::ParseFullName("/job:foo/replica:1/task:-2/gpu:3", &p));
+        DeviceNameUtils::ParseFullName("/job:foo/replica:1/task:-2/device:GPU:3", &p));
    EXPECT_FALSE(
        DeviceNameUtils::ParseFullName("/job:foo/replica:1/task:2/bar:3", &p));
    EXPECT_FALSE(DeviceNameUtils::ParseFullName(
-        "/job:foo/replica:1/task:2/gpu:3/extra", &p));
+        "/job:foo/replica:1/task:2/device:GPU:3/extra", &p));
    EXPECT_TRUE(
-        DeviceNameUtils::ParseFullName("/job:foo/replica:1/task:2/gpu:3", &p));
+        DeviceNameUtils::ParseFullName("/job:foo/replica:1/task:2/device:GPU:3", &p));
    EXPECT_TRUE(p.has_job);
    EXPECT_TRUE(p.has_replica);
    EXPECT_TRUE(p.has_task);
@ -106,7 +106,7 @@ TEST(DeviceNameUtilsTest, Basic) {
    // Allow _ in job names.
    DeviceNameUtils::ParsedName p;
    EXPECT_TRUE(DeviceNameUtils::ParseFullName(
-        "/job:foo_bar/replica:1/task:2/gpu:3", &p));
+        "/job:foo_bar/replica:1/task:2/device:GPU:3", &p));
    EXPECT_TRUE(p.has_job);
    EXPECT_TRUE(p.has_replica);
    EXPECT_TRUE(p.has_task);
@ -193,7 +193,7 @@ TEST(DeviceNameUtilsTest, Basic) {
  }
  {
    DeviceNameUtils::ParsedName p;
-    EXPECT_TRUE(DeviceNameUtils::ParseFullName("/job:*/replica:4/gpu:5", &p));
+    EXPECT_TRUE(DeviceNameUtils::ParseFullName("/job:*/replica:4/device:GPU:5", &p));
    EXPECT_FALSE(p.has_job);
    EXPECT_TRUE(p.has_replica);
    EXPECT_FALSE(p.has_task);
@ -216,13 +216,13 @@ TEST(DeviceNameUtilsTest, Basic) {
  }
  EXPECT_TRUE(DeviceNameUtils::IsSameAddressSpace(
-      "/job:foo/replica:1/task:2/cpu:3", "/job:foo/replica:1/task:2/gpu:4"));
+      "/job:foo/replica:1/task:2/cpu:3", "/job:foo/replica:1/task:2/device:GPU:4"));
  EXPECT_FALSE(DeviceNameUtils::IsSameAddressSpace(
-      "/job:foo/replica:1/task:2/cpu:3", "/job:foo/replica:1/task:3/gpu:4"));
+      "/job:foo/replica:1/task:2/cpu:3", "/job:foo/replica:1/task:3/device:GPU:4"));
  EXPECT_FALSE(DeviceNameUtils::IsSameAddressSpace(
-      "/job:foo/replica:1/task:2/cpu:3", "/job:foo/replica:10/task:2/gpu:4"));
+      "/job:foo/replica:1/task:2/cpu:3", "/job:foo/replica:10/task:2/device:GPU:4"));
  EXPECT_FALSE(DeviceNameUtils::IsSameAddressSpace(
-      "/job:foo/replica:1/task:2/cpu:3", "/job:bar/replica:1/task:2/gpu:4"));
+      "/job:foo/replica:1/task:2/cpu:3", "/job:bar/replica:1/task:2/device:GPU:4"));
  EXPECT_EQ(DeviceNameUtils::LocalName("CPU", 1), "CPU:1");
  EXPECT_EQ(DeviceNameUtils::LocalName("GPU", 2), "GPU:2");
@ -284,17 +284,17 @@ static bool IsCSHelper(StringPiece pattern, StringPiece actual) {
 }
 TEST(DeviceNameUtilsTest, IsCompleteSpecification) {
-  EXPECT_TRUE(IsCSHelper("/job:*", "/job:work/replica:1/task:2/gpu:3"));
+  EXPECT_TRUE(IsCSHelper("/job:*", "/job:work/replica:1/task:2/device:GPU:3"));
  EXPECT_TRUE(
-      IsCSHelper("/job:*/replica:*", "/job:work/replica:1/task:2/gpu:3"));
+      IsCSHelper("/job:*/replica:*", "/job:work/replica:1/task:2/device:GPU:3"));
-  EXPECT_TRUE(IsCSHelper("/job:*/task:*", "/job:work/replica:1/task:2/gpu:3"));
+  EXPECT_TRUE(IsCSHelper("/job:*/task:*", "/job:work/replica:1/task:2/device:GPU:3"));
  EXPECT_TRUE(IsCSHelper("/job:*/replica:*/task:*",
-                         "/job:work/replica:1/task:2/gpu:3"));
+                         "/job:work/replica:1/task:2/device:GPU:3"));
  EXPECT_TRUE(
-      IsCSHelper("/job:*/replica:*/gpu:*", "/job:work/replica:1/task:2/gpu:3"));
+      IsCSHelper("/job:*/replica:*/gpu:*", "/job:work/replica:1/task:2/device:GPU:3"));
-  EXPECT_FALSE(IsCSHelper("/cpu:*", "/job:worker/replica:1/task:2/gpu:3"));
+  EXPECT_FALSE(IsCSHelper("/cpu:*", "/job:worker/replica:1/task:2/device:GPU:3"));
-  EXPECT_FALSE(IsCSHelper("/gpu:2", "/job:worker/replica:1/task:2/gpu:1"));
+  EXPECT_FALSE(IsCSHelper("/device:GPU:2", "/job:worker/replica:1/task:2/device:GPU:1"));
-  EXPECT_TRUE(IsCSHelper("/gpu:*", "/job:worker/replica:1/task:2/gpu:3"));
+  EXPECT_TRUE(IsCSHelper("/gpu:*", "/job:worker/replica:1/task:2/device:GPU:3"));
 }
 static bool IsSpecHelper(StringPiece pattern, StringPiece actual) {
@ -305,36 +305,36 @@ static bool IsSpecHelper(StringPiece pattern, StringPiece actual) {
 }
 TEST(DeviceNameUtilsTest, IsSpecification) {
-  EXPECT_TRUE(IsSpecHelper("/job:*", "/job:work/replica:1/task:2/gpu:3"));
+  EXPECT_TRUE(IsSpecHelper("/job:*", "/job:work/replica:1/task:2/device:GPU:3"));
-  EXPECT_TRUE(IsSpecHelper("/job:*", "/job:work/replica:1/gpu:3"));
+  EXPECT_TRUE(IsSpecHelper("/job:*", "/job:work/replica:1/device:GPU:3"));
  EXPECT_TRUE(IsSpecHelper("/job:*", "/job:work/replica:1"));
  EXPECT_TRUE(IsSpecHelper("/job:*", "/replica:1"));
  EXPECT_TRUE(IsSpecHelper("/job:*", "/job:work"));
  EXPECT_TRUE(
-      IsSpecHelper("/job:*/replica:*", "/job:work/replica:1/task:2/gpu:3"));
+      IsSpecHelper("/job:*/replica:*", "/job:work/replica:1/task:2/device:GPU:3"));
  EXPECT_TRUE(IsSpecHelper("/job:work/replica:1/gpu:*",
-                           "/job:work/replica:1/task:2/gpu:3"));
+                           "/job:work/replica:1/task:2/device:GPU:3"));
-  EXPECT_TRUE(IsSpecHelper("/job:work/replica:1/gpu:3",
+  EXPECT_TRUE(IsSpecHelper("/job:work/replica:1/device:GPU:3",
-                           "/job:work/replica:1/task:2/gpu:3"));
+                           "/job:work/replica:1/task:2/device:GPU:3"));
  EXPECT_TRUE(IsSpecHelper("/job:work/replica:1/task:2",
-                           "/job:work/replica:1/task:2/gpu:3"));
+                           "/job:work/replica:1/task:2/device:GPU:3"));
  EXPECT_TRUE(IsSpecHelper("/job:work/replica:*/task:2",
-                           "/job:work/replica:1/task:2/gpu:3"));
+                           "/job:work/replica:1/task:2/device:GPU:3"));
-  EXPECT_TRUE(IsSpecHelper("/task:*", "/job:*/replica:1/task:2/gpu:3"));
+  EXPECT_TRUE(IsSpecHelper("/task:*", "/job:*/replica:1/task:2/device:GPU:3"));
-  EXPECT_TRUE(IsSpecHelper("/task:2", "/job:*/replica:1/task:2/gpu:3"));
+  EXPECT_TRUE(IsSpecHelper("/task:2", "/job:*/replica:1/task:2/device:GPU:3"));
  EXPECT_TRUE(IsSpecHelper("/cpu:*", "/job:*/replica:1/task:2/cpu:1"));
  EXPECT_TRUE(IsSpecHelper("/cpu:0", "/cpu:0"));
-  EXPECT_TRUE(IsSpecHelper("/gpu:*", "/job:worker/replica:1/task:2/gpu:3"));
+  EXPECT_TRUE(IsSpecHelper("/gpu:*", "/job:worker/replica:1/task:2/device:GPU:3"));
-  EXPECT_FALSE(IsSpecHelper("/job:worker/replica:1/task:2/gpu:3", "/gpu:*"));
+  EXPECT_FALSE(IsSpecHelper("/job:worker/replica:1/task:2/device:GPU:3", "/gpu:*"));
  EXPECT_FALSE(IsSpecHelper("/cpu:*", "/job:*/replica:1/task:2"));
-  EXPECT_FALSE(IsSpecHelper("/cpu:*", "/job:*/replica:1/task:2/gpu:1"));
+  EXPECT_FALSE(IsSpecHelper("/cpu:*", "/job:*/replica:1/task:2/device:GPU:1"));
-  EXPECT_FALSE(IsSpecHelper("/cpu:*", "/job:worker/replica:1/task:2/gpu:3"));
+  EXPECT_FALSE(IsSpecHelper("/cpu:*", "/job:worker/replica:1/task:2/device:GPU:3"));
-  EXPECT_FALSE(IsSpecHelper("/gpu:2", "/job:worker/replica:1/task:2/gpu:1"));
+  EXPECT_FALSE(IsSpecHelper("/device:GPU:2", "/job:worker/replica:1/task:2/device:GPU:1"));
  EXPECT_FALSE(IsSpecHelper("/job:work/replica:*/task:0",
-                            "/job:work/replica:1/task:2/gpu:3"));
+                            "/job:work/replica:1/task:2/device:GPU:3"));
  EXPECT_FALSE(IsSpecHelper("/job:work/replica:0/task:2",
-                            "/job:work/replica:*/task:2/gpu:3"));
+                            "/job:work/replica:*/task:2/device:GPU:3"));
 }
 TEST(DeviceNameUtilsTest, SplitDeviceName) {
@ -348,7 +348,7 @@ TEST(DeviceNameUtilsTest, SplitDeviceName) {
      "/job:foo/cpu:1/task:2/replica:1", &task, &device));
  EXPECT_EQ("/job:foo/replica:1/task:2", task);
  EXPECT_EQ("CPU:1", device);
-  EXPECT_TRUE(DeviceNameUtils::SplitDeviceName("/gpu:3", &task, &device));
+  EXPECT_TRUE(DeviceNameUtils::SplitDeviceName("/device:GPU:3", &task, &device));
  EXPECT_EQ("", task);
  EXPECT_EQ("GPU:3", device);
  EXPECT_FALSE(DeviceNameUtils::SplitDeviceName("gpu:3", &task, &device));
@ -413,11 +413,11 @@ TEST(DeviceNameUtilsTest, MergeDevNames) {
  MergeDevNamesHelper("", "/job:foo", "/job:foo");
  MergeDevNamesHelper("", "/replica:2", "/replica:2");
  MergeDevNamesHelper("", "/task:7", "/task:7");
-  // MergeDevNamesHelper("", "/gpu:1", "/gpu:1");
+  // MergeDevNamesHelper("", "/device:GPU:1", "/device:GPU:1");
  // Combining disjoint names.
  MergeDevNamesHelper("/job:foo", "/task:7", "/job:foo/task:7");
-  MergeDevNamesHelper("/job:foo", "/gpu:1", "/job:foo/gpu:1");
+  MergeDevNamesHelper("/job:foo", "/device:GPU:1", "/job:foo/device:GPU:1");
  // Combining overlapping names.
  MergeDevNamesHelper("/job:foo/replica:0", "/replica:0/task:1",
@ -426,25 +426,25 @@ TEST(DeviceNameUtilsTest, MergeDevNames) {
  // Wildcard tests.
  MergeDevNamesHelper("", "/gpu:*", "/gpu:*");
  MergeDevNamesHelper("/gpu:*", "/gpu:*", "/gpu:*");
-  MergeDevNamesHelper("/gpu:1", "/gpu:*", "/gpu:1");
+  MergeDevNamesHelper("/device:GPU:1", "/gpu:*", "/device:GPU:1");
  // Incompatible components.
  MergeDevNamesError("/job:foo", "/job:bar", "incompatible jobs");
  MergeDevNamesError("/replica:0", "/replica:1", "incompatible replicas");
  MergeDevNamesError("/task:0", "/task:1", "incompatible tasks");
  MergeDevNamesError("/gpu:*", "/cpu:*", "incompatible types");
-  MergeDevNamesError("/gpu:0", "/gpu:1", "incompatible ids");
+  MergeDevNamesError("/device:GPU:0", "/device:GPU:1", "incompatible ids");
 }
 TEST(DeviceNameUtilsTest, MergeDevNamesAllowSoftPlacement) {
  // Incompatible components with allow_soft_placement.
  MergeDevNamesHelperAllowSoftPlacement("/gpu:*", "/cpu:1", "");
-  MergeDevNamesHelperAllowSoftPlacement("/cpu:*", "/gpu:1", "");
+  MergeDevNamesHelperAllowSoftPlacement("/cpu:*", "/device:GPU:1", "");
-  MergeDevNamesHelperAllowSoftPlacement("/gpu:1", "/gpu:2", "/gpu:*");
+  MergeDevNamesHelperAllowSoftPlacement("/device:GPU:1", "/device:GPU:2", "/device:GPU:*");
 }
 TEST(DeviceNameUtilsTest, GetNamesForDeviceMappings) {
-  DeviceNameUtils::ParsedName p = Name("/job:foo/replica:10/task:0/gpu:1");
+  DeviceNameUtils::ParsedName p = Name("/job:foo/replica:10/task:0/device:GPU:1");
  EXPECT_EQ(str_util::Join(DeviceNameUtils::GetNamesForDeviceMappings(p), ","),
            "/job:foo/replica:10/task:0/device:GPU:1,"
            "/job:foo/replica:10/task:0/gpu:1");
--- a/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md
@ -73,12 +73,12 @@ other wrappers and the dynamic decoder described below.  For example, one can
 write:
 ```python
-cell = tf.contrib.rnn.DeviceWrapper(LSTMCell(512), "/gpu:0")
+cell = tf.contrib.rnn.DeviceWrapper(LSTMCell(512), "/device:GPU:0")
 attention_mechanism = tf.contrib.seq2seq.LuongAttention(512, encoder_outputs)
 attn_cell = tf.contrib.seq2seq.AttentionWrapper(
  cell, attention_mechanism, attention_size=256)
-attn_cell = tf.contrib.rnn.DeviceWrapper(attn_cell, "/gpu:1")
+attn_cell = tf.contrib.rnn.DeviceWrapper(attn_cell, "/device:GPU:1")
-top_cell = tf.contrib.rnn.DeviceWrapper(LSTMCell(512), "/gpu:1")
+top_cell = tf.contrib.rnn.DeviceWrapper(LSTMCell(512), "/device:GPU:1")
 multi_cell = MultiRNNCell([attn_cell, top_cell])
 ```
--- a/tensorflow/docs_src/programmers_guide/variables.md
+++ b/tensorflow/docs_src/programmers_guide/variables.md
@ -110,7 +110,7 @@ devices. For example, the following snippet creates a variable named `v` and
 places it on the second GPU device:
 ``` python
-with tf.device("/gpu:1"):
+with tf.device("/device:GPU:1"):
  v = tf.get_variable("v", [1])
 ```
--- a/tensorflow/docs_src/tutorials/deep_cnn.md
+++ b/tensorflow/docs_src/tutorials/deep_cnn.md
@ -411,7 +411,7 @@ the first tower are prepended with `tower_0`, e.g. `tower_0/conv1/Conv2D`.
 * A preferred hardware device to run the operation within a tower.
@{tf.device} specifies this. For
-instance, all operations in the first tower reside within `device('/gpu:0')`
+instance, all operations in the first tower reside within `device('/device:GPU:0')`
 scope indicating that they should be run on the first GPU.
 All variables are pinned to the CPU and accessed via
--- a/tensorflow/docs_src/tutorials/using_gpu.md
+++ b/tensorflow/docs_src/tutorials/using_gpu.md
@ -7,8 +7,8 @@ supported device types are `CPU` and `GPU`. They are represented as `strings`.
 For example:
 *   `"/cpu:0"`: The CPU of your machine.
-*   `"/gpu:0"`: The GPU of your machine, if you have one.
+*   `"/device:GPU:0"`: The GPU of your machine, if you have one.
-*   `"/gpu:1"`: The second GPU of your machine, etc.
+*   `"/device:GPU:1"`: The second GPU of your machine, etc.
 If a TensorFlow operation has both CPU and GPU implementations, the GPU devices
 will be given priority when the operation is assigned to a device. For example,
@ -35,11 +35,11 @@ You should see the following output:
 ```
 Device mapping:
-/job:localhost/replica:0/task:0/gpu:0 -> device: 0, name: Tesla K40c, pci bus
+/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla K40c, pci bus
 id: 0000:05:00.0
-b: /job:localhost/replica:0/task:0/gpu:0
+b: /job:localhost/replica:0/task:0/device:GPU:0
-a: /job:localhost/replica:0/task:0/gpu:0
+a: /job:localhost/replica:0/task:0/device:GPU:0
-MatMul: /job:localhost/replica:0/task:0/gpu:0
+MatMul: /job:localhost/replica:0/task:0/device:GPU:0
 [[ 22.  28.]
 [ 49.  64.]]
@ -71,11 +71,11 @@ example) and automatically copy tensors between devices if required.
 ```
 Device mapping:
-/job:localhost/replica:0/task:0/gpu:0 -> device: 0, name: Tesla K40c, pci bus
+/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla K40c, pci bus
 id: 0000:05:00.0
 b: /job:localhost/replica:0/task:0/cpu:0
 a: /job:localhost/replica:0/task:0/cpu:0
-MatMul: /job:localhost/replica:0/task:0/gpu:0
+MatMul: /job:localhost/replica:0/task:0/device:GPU:0
 [[ 22.  28.]
 [ 49.  64.]]
 ```
@ -127,7 +127,7 @@ to specify the preference explicitly:
 ```python
 # Creates a graph.
-with tf.device('/gpu:2'):
+with tf.device('/device:GPU:2'):
  a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
  b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
  c = tf.matmul(a, b)
@ -142,9 +142,9 @@ If the device you have specified does not exist, you will get
 ```
 InvalidArgumentError: Invalid argument: Cannot assign a device to node 'b':
-Could not satisfy explicit device specification '/gpu:2'
+Could not satisfy explicit device specification '/device:GPU:2'
   [[Node: b = Const[dtype=DT_FLOAT, value=Tensor<type: float shape: [3,2]
-   values: 1 2 3...>, _device="/gpu:2"]()]]
+   values: 1 2 3...>, _device="/device:GPU:2"]()]]
 ```
 If you would like TensorFlow to automatically choose an existing and supported
@ -154,7 +154,7 @@ the session.
 ```python
 # Creates a graph.
-with tf.device('/gpu:2'):
+with tf.device('/device:GPU:2'):
  a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
  b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
  c = tf.matmul(a, b)
@ -175,7 +175,7 @@ For example:
 ```
 # Creates a graph.
 c = []
-for d in ['/gpu:2', '/gpu:3']:
+for d in ['/device:GPU:2', '/device:GPU:3']:
  with tf.device(d):
    a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3])
    b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2])
@ -192,20 +192,20 @@ You will see the following output.
 ```
 Device mapping:
-/job:localhost/replica:0/task:0/gpu:0 -> device: 0, name: Tesla K20m, pci bus
+/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla K20m, pci bus
 id: 0000:02:00.0
-/job:localhost/replica:0/task:0/gpu:1 -> device: 1, name: Tesla K20m, pci bus
+/job:localhost/replica:0/task:0/device:GPU:1 -> device: 1, name: Tesla K20m, pci bus
 id: 0000:03:00.0
-/job:localhost/replica:0/task:0/gpu:2 -> device: 2, name: Tesla K20m, pci bus
+/job:localhost/replica:0/task:0/device:GPU:2 -> device: 2, name: Tesla K20m, pci bus
 id: 0000:83:00.0
-/job:localhost/replica:0/task:0/gpu:3 -> device: 3, name: Tesla K20m, pci bus
+/job:localhost/replica:0/task:0/device:GPU:3 -> device: 3, name: Tesla K20m, pci bus
 id: 0000:84:00.0
-Const_3: /job:localhost/replica:0/task:0/gpu:3
+Const_3: /job:localhost/replica:0/task:0/device:GPU:3
-Const_2: /job:localhost/replica:0/task:0/gpu:3
+Const_2: /job:localhost/replica:0/task:0/device:GPU:3
-MatMul_1: /job:localhost/replica:0/task:0/gpu:3
+MatMul_1: /job:localhost/replica:0/task:0/device:GPU:3
-Const_1: /job:localhost/replica:0/task:0/gpu:2
+Const_1: /job:localhost/replica:0/task:0/device:GPU:2
-Const: /job:localhost/replica:0/task:0/gpu:2
+Const: /job:localhost/replica:0/task:0/device:GPU:2
-MatMul: /job:localhost/replica:0/task:0/gpu:2
+MatMul: /job:localhost/replica:0/task:0/device:GPU:2
 AddN: /job:localhost/replica:0/task:0/cpu:0
 [[  44.   56.]
 [  98.  128.]]
--- a/tensorflow/examples/learn/multiple_gpu.py
+++ b/tensorflow/examples/learn/multiple_gpu.py
@ -47,12 +47,12 @@ def my_model(features, labels, mode):
  # Create three fully connected layers respectively of size 10, 20, and 10 with
  # each layer having a dropout probability of 0.1.
  net = features[X_FEATURE]
-  with tf.device('/gpu:1'):
+  with tf.device('/device:GPU:1'):
    for units in [10, 20, 10]:
      net = tf.layers.dense(net, units=units, activation=tf.nn.relu)
      net = tf.layers.dropout(net, rate=0.1)
-  with tf.device('/gpu:2'):
+  with tf.device('/device:GPU:2'):
    # Compute logits (1 per class).
    logits = tf.layers.dense(net, 3, activation=None)
--- a/tensorflow/python/client/session_clusterspec_prop_test.py
+++ b/tensorflow/python/client/session_clusterspec_prop_test.py
@ -173,7 +173,7 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
    #
    # W0718 17:14:41.521534  190121 device_mgr.cc:107] Unknown device:
    #     /job:worker/replica:0/task:0/device:CPU:0 all devices:
-    #     /job:local/replica:0/task:0/gpu:0,
+    #     /job:local/replica:0/task:0/device:GPU:0,
    #     /job:local/replica:0/task:0/device:GPU:0,
    #     /job:local/replica:0/task:0/cpu:1, CPU:0, GPU:0,
    #     /job:local/replica:0/task:0/device:CPU:1,
@ -198,7 +198,7 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
        sum1 = input1 + input2
      if test.is_gpu_available():
-        device_str = '/job:worker/task:0/gpu:0'
+        device_str = '/job:worker/task:0/device:GPU:0'
      else:
        device_str = '/job:worker/task:0/cpu:1'
      with ops.device(device_str):
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@ -1124,7 +1124,7 @@ class SessionTest(test_util.TensorFlowTestCase):
    # which is why placing this is invalid.  If at some point
    # GPU kernels are added to this test, some other different
    # op / device combo should be chosen.
-    with ops.device('/gpu:0'):
+    with ops.device('/device:GPU:0'):
      a = constant_op.constant(1.0, shape=[1, 2])
    b = constant_op.constant(1.0, shape=[1, 2])
@ -1145,7 +1145,7 @@ class SessionTest(test_util.TensorFlowTestCase):
    # which is why placing this is invalid.  If at some point
    # GPU kernels are added to this test, some other different
    # op / device combo should be chosen.
-    with ops.device('/gpu:0'):
+    with ops.device('/device:GPU:0'):
      _ = constant_op.constant(1.0, shape=[1, 2])
    b = constant_op.constant(1.0, shape=[1, 2])
@ -1494,7 +1494,7 @@ class SessionTest(test_util.TensorFlowTestCase):
        allow_soft_placement=True,
        graph_options=config_pb2.GraphOptions(build_cost_model=100))
    with session.Session(config=config) as sess:
-      with ops.device('/gpu:0'):
+      with ops.device('/device:GPU:0'):
        a = array_ops.placeholder(dtypes.float32, shape=[])
        b = math_ops.add(a, a)
        c = array_ops.identity(b)
--- a/tensorflow/python/client/timeline_test.py
+++ b/tensorflow/python/client/timeline_test.py
@ -100,8 +100,8 @@ class TimelineTest(test.TestCase):
    self.assertTrue(run_metadata.HasField('step_stats'))
    step_stats = run_metadata.step_stats
    devices = [d.device for d in step_stats.dev_stats]
-    self.assertTrue('/job:localhost/replica:0/task:0/gpu:0' in devices)
+    self.assertTrue('/job:localhost/replica:0/task:0/device:GPU:0' in devices)
-    self.assertTrue('/gpu:0/stream:all' in devices)
+    self.assertTrue('/device:GPU:0/stream:all' in devices)
    tl = timeline.Timeline(step_stats)
    ctf = tl.generate_chrome_trace_format()
    self._validateTrace(ctf)
--- a/tensorflow/python/debug/lib/debug_data.py
+++ b/tensorflow/python/debug/lib/debug_data.py
@ -380,7 +380,8 @@ def device_path_to_device_name(device_dir):
  path_items = os.path.basename(device_dir)[
      len(METADATA_FILE_PREFIX) + len(DEVICE_TAG):].split(",")
  return "/".join([
-      path_item.replace("_", ":", 1) for path_item in path_items])
+      path_item.replace("device_", "device:").replace("_", ":", 1)
      for path_item in path_items])
 class DebugTensorDatum(object):
--- a/tensorflow/python/debug/lib/debug_data_test.py
+++ b/tensorflow/python/debug/lib/debug_data_test.py
@ -237,11 +237,11 @@ class DebugDumpDirTest(test_util.TensorFlowTestCase):
    gpu_0_dir = os.path.join(
        self._dump_root,
        debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
-        ",job_localhost,replica_0,task_0,gpu_0")
+        ",job_localhost,replica_0,task_0,device_GPU_0")
    gpu_1_dir = os.path.join(
        self._dump_root,
        debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
-        ",job_localhost,replica_0,task_0,gpu_1")
+        ",job_localhost,replica_0,task_0,device_GPU_1")
    os.makedirs(cpu_0_dir)
    os.makedirs(gpu_0_dir)
    os.makedirs(gpu_1_dir)
@ -281,12 +281,12 @@ class DebugDumpDirTest(test_util.TensorFlowTestCase):
    node = graph_gpu_0.node.add()
    node.name = "node_foo_1"
    node.op = "FooOp"
-    node.device = "/job:localhost/replica:0/task:0/gpu:0"
+    node.device = "/job:localhost/replica:0/task:0/device:GPU:0"
    graph_gpu_1 = graph_pb2.GraphDef()
    node = graph_gpu_1.node.add()
    node.name = "node_foo_1"
    node.op = "FooOp"
-    node.device = "/job:localhost/replica:0/task:0/gpu:1"
+    node.device = "/job:localhost/replica:0/task:0/device:GPU:1"
    dump_dir = debug_data.DebugDumpDir(
        self._dump_root,
@ -294,14 +294,14 @@ class DebugDumpDirTest(test_util.TensorFlowTestCase):
    self.assertItemsEqual(
        ["/job:localhost/replica:0/task:0/cpu:0",
-         "/job:localhost/replica:0/task:0/gpu:0",
+         "/job:localhost/replica:0/task:0/device:GPU:0",
-         "/job:localhost/replica:0/task:0/gpu:1"], dump_dir.devices())
+         "/job:localhost/replica:0/task:0/device:GPU:1"], dump_dir.devices())
    self.assertEqual(1472563253536385, dump_dir.t0)
    self.assertEqual(3, dump_dir.size)
    with self.assertRaisesRegexp(
        ValueError, r"Invalid device name: "):
-      dump_dir.nodes("/job:localhost/replica:0/task:0/gpu:2")
+      dump_dir.nodes("/job:localhost/replica:0/task:0/device:GPU:2")
    self.assertItemsEqual(["node_foo_1", "node_foo_1", "node_foo_1"],
                          dump_dir.nodes())
    self.assertItemsEqual(
@ -319,16 +319,16 @@ class DebugDumpDirTest(test_util.TensorFlowTestCase):
    node = graph_gpu_0.node.add()
    node.name = "node_foo_1"
    node.op = "FooOp"
-    node.device = "/job:localhost/replica:0/task:0/gpu:0"
+    node.device = "/job:localhost/replica:0/task:0/device:GPU:0"
    graph_gpu_1 = graph_pb2.GraphDef()
    node = graph_gpu_1.node.add()
    node.name = "node_foo_1"
    node.op = "FooOp"
-    node.device = "/job:localhost/replica:0/task:0/gpu:1"
+    node.device = "/job:localhost/replica:0/task:0/device:GPU:1"
    node = graph_gpu_1.node.add()  # Here is the duplicate.
    node.name = "node_foo_1"
    node.op = "FooOp"
-    node.device = "/job:localhost/replica:0/task:0/gpu:1"
+    node.device = "/job:localhost/replica:0/task:0/device:GPU:1"
    with self.assertRaisesRegexp(
        ValueError, r"Duplicate node name on device "):
--- a/tensorflow/python/debug/lib/session_debug_testlib.py
+++ b/tensorflow/python/debug/lib/session_debug_testlib.py
@ -711,7 +711,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
    # Test node name list lookup of the DebugDumpDir object.
    if test_util.gpu_device_name():
      node_names = dump.nodes(
-          device_name="/job:localhost/replica:0/task:0/gpu:0")
+          device_name="/job:localhost/replica:0/task:0/device:GPU:0")
    else:
      node_names = dump.nodes()
    self.assertTrue(u_name in node_names)
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
@ -402,7 +402,7 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
  def testRuntimeErrorBeforeGraphExecutionIsRaised(self):
    # Use an impossible device name to cause an error before graph execution.
-    with ops.device("/gpu:1337"):
+    with ops.device("/device:GPU:1337"):
      w = variables.Variable([1.0] * 10, name="w")
    wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
--- a/tensorflow/python/framework/device_test.py
+++ b/tensorflow/python/framework/device_test.py
@ -79,17 +79,17 @@ class DeviceTest(test_util.TensorFlowTestCase):
    self.assertEquals("/replica:1/task:0/device:CPU:0", d.to_string())
    d.parse_from_string("/replica:1/task:0/device:CPU:0")
    self.assertEquals("/replica:1/task:0/device:CPU:0", d.to_string())
-    d.parse_from_string("/job:muu/gpu:2")
+    d.parse_from_string("/job:muu/device:GPU:2")
    self.assertEquals("/job:muu/device:GPU:2", d.to_string())
    with self.assertRaises(Exception) as e:
-      d.parse_from_string("/job:muu/gpu:2/cpu:0")
+      d.parse_from_string("/job:muu/device:GPU:2/cpu:0")
    self.assertTrue("Cannot specify multiple device" in str(e.exception))
  def testFromString(self):
    d = device.DeviceSpec.from_string("/job:foo/replica:0")
    self.assertEquals("/job:foo/replica:0", d.to_string())
    with self.assertRaises(Exception) as e:
-      d = device.DeviceSpec.from_string("/job:muu/gpu:2/cpu:0")
+      d = device.DeviceSpec.from_string("/job:muu/device:GPU:2/cpu:0")
    self.assertTrue("Cannot specify multiple device" in str(e.exception))
    d = device.DeviceSpec.from_string("/job:foo/replica:0/task:3/cpu:*")
@ -102,13 +102,13 @@ class DeviceTest(test_util.TensorFlowTestCase):
  def testMerge(self):
    d = device.DeviceSpec.from_string("/job:foo/replica:0")
    self.assertEquals("/job:foo/replica:0", d.to_string())
-    d.merge_from(device.DeviceSpec.from_string("/task:1/gpu:2"))
+    d.merge_from(device.DeviceSpec.from_string("/task:1/device:GPU:2"))
    self.assertEquals("/job:foo/replica:0/task:1/device:GPU:2", d.to_string())
    d = device.DeviceSpec()
    d.merge_from(device.DeviceSpec.from_string("/task:1/cpu:0"))
    self.assertEquals("/task:1/device:CPU:0", d.to_string())
-    d.merge_from(device.DeviceSpec.from_string("/job:boo/gpu:0"))
+    d.merge_from(device.DeviceSpec.from_string("/job:boo/device:GPU:0"))
    self.assertEquals("/job:boo/task:1/device:GPU:0", d.to_string())
    d.merge_from(device.DeviceSpec.from_string("/job:muu/cpu:2"))
    self.assertEquals("/job:muu/task:1/device:CPU:2", d.to_string())
@ -134,10 +134,10 @@ class DeviceTest(test_util.TensorFlowTestCase):
    self.assertEqual("/job:foo/replica:0/task:0/device:GPU:0",
                     device.canonical_name(
-                         "/job:foo/replica:0/task:0/gpu:0"))
+                         "/job:foo/replica:0/task:0/device:GPU:0"))
    self.assertEqual("/job:foo/replica:0/task:0/device:GPU:0",
                     device.canonical_name(
-                         "/gpu:0/task:0/replica:0/job:foo"))
+                         "/device:GPU:0/task:0/replica:0/job:foo"))
  def testCheckValid(self):
    device.check_valid("/job:foo/replica:0")
@ -155,7 +155,7 @@ class DeviceTest(test_util.TensorFlowTestCase):
    self.assertTrue("Unknown attribute: 'bar'" in str(e.exception))
    with self.assertRaises(Exception) as e:
-      device.check_valid("/cpu:0/gpu:2")
+      device.check_valid("/cpu:0/device:GPU:2")
    self.assertTrue("Cannot specify multiple device" in str(e.exception))
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@ -505,7 +505,7 @@ class FunctionTest(test.TestCase):
      _ = PlusOne(1, name="p1")
      with self.assertRaisesRegexp(ValueError, "Unknown keyword arguments"):
-        _ = PlusOne(1, device="/gpu:0")
+        _ = PlusOne(1, device="/device:GPU:0")
  def testFunctionDecorator(self):
--- a/tensorflow/python/framework/graph_util_test.py
+++ b/tensorflow/python/framework/graph_util_test.py
@ -106,9 +106,9 @@ class DeviceFunctionsTest(test.TestCase):
      var_0 = variables.Variable(0)
      with ops.device(test_device_func_pin_variable_to_cpu):
        var_1 = variables.Variable(1)
-        with ops.device(lambda op: "/gpu:0"):
+        with ops.device(lambda op: "/device:GPU:0"):
          var_2 = variables.Variable(2)
-        with ops.device("/gpu:0"):  # Implicit merging device function.
+        with ops.device("/device:GPU:0"):  # Implicit merging device function.
          var_3 = variables.Variable(3)
    self.assertDeviceEqual(var_0.device, None)
--- a/tensorflow/python/framework/importer_test.py
+++ b/tensorflow/python/framework/importer_test.py
@ -878,7 +878,7 @@ class ImportGraphDefTest(test.TestCase):
        self.assertEqual(c.device, c4.device)  # worker overrides ps.
    with ops.Graph().as_default():
-      with ops.device(device.merge_device("/gpu:0")):
+      with ops.device(device.merge_device("/device:GPU:0")):
        a5, b5, c5 = importer.import_graph_def(
            gdef, return_elements=["a", "b", "c"])
        self.assertEqual("/device:GPU:0", a5.device)
--- a/tensorflow/python/framework/meta_graph_test.py
+++ b/tensorflow/python/framework/meta_graph_test.py
@ -550,7 +550,7 @@ class ScopedMetaGraphTest(test.TestCase):
        a = variables.Variable(
            constant_op.constant(
                1.0, shape=[2, 2]), name="a")
-      with ops.device("/job:ps/replica:0/task:0/gpu:0"):
+      with ops.device("/job:ps/replica:0/task:0/device:GPU:0"):
        b = variables.Variable(
            constant_op.constant(
                2.0, shape=[2, 2]), name="b")
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@ -3342,7 +3342,7 @@ class Graph(object):
    For example:
    ```python
-    with g.device('/gpu:0'):
+    with g.device('/device:GPU:0'):
      # All operations constructed in this context will be placed
      # on GPU 0.
      with g.device(None):
@ -3352,7 +3352,7 @@ class Graph(object):
    # Defines a function from `Operation` to device string.
    def matmul_on_gpu(n):
      if n.type == "MatMul":
-        return "/gpu:0"
+        return "/device:GPU:0"
      else:
        return "/cpu:0"
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@ -1555,26 +1555,26 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
  def testColocationDeviceInteraction(self):
    with ops.device("/cpu:0"):
-      with ops.device("/gpu:0"):
+      with ops.device("/device:GPU:0"):
        a = constant_op.constant([2.0], name="a")
      with ops.colocate_with(a.op):
        # 'b' is created in the scope of /cpu:0, but it is
-        # colocated with 'a', which is on '/gpu:0'.  colocate_with
+        # colocated with 'a', which is on '/device:GPU:0'.  colocate_with
        # overrides devices because it is a stronger constraint.
        b = constant_op.constant(3.0)
    self.assertEqual([b"loc:@a"], b.op.colocation_groups())
    self.assertEqual(a.op.device, b.op.device)
  def testColocationCanonicalization(self):
-    with ops.device("/gpu:0"):
+    with ops.device("/device:GPU:0"):
      _ = constant_op.constant(2.0)
-    with ops.device(lambda op: "/gpu:0"):
+    with ops.device(lambda op: "/device:GPU:0"):
      b = constant_op.constant(3.0)
    with ops.get_default_graph().colocate_with(b):
-      with ops.device("/gpu:0"):
+      with ops.device("/device:GPU:0"):
        c = constant_op.constant(4.0)
-    # A's device will be /gpu:0
+    # A's device will be /device:GPU:0
    # B's device will be /device:GPU:0
    # C's device will be /device:GPU:0 because it
    # inherits B's device name, after canonicalizing the names.
@ -1582,10 +1582,10 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
  def testLocationOverrides(self):
    with ops.device("/cpu:0"):
-      with ops.device("/gpu:0"):
+      with ops.device("/device:GPU:0"):
        a = constant_op.constant([2.0], name="a")
        # Note that this colocation is "redundant", since we are
-        # within the scope of "/gpu:0".  However, we would like to
+        # within the scope of "/device:GPU:0".  However, we would like to
        # preserve in the GraphDef that these two ops should be
        # colocated in a portable way.
        with ops.colocate_with(a.op):
@ -1652,7 +1652,7 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
    self.assertEqual([b"loc:@a"], b.op.colocation_groups())
  def testInconsistentDeviceWithinColocate(self):
-    with ops.device("/gpu:0"):
+    with ops.device("/device:GPU:0"):
      a = constant_op.constant([2.0], name="a")
      with ops.colocate_with(a.op):
        # This is allowed due to legacy but clearly wrong, since we
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@ -405,7 +405,7 @@ class TensorFlowTestCase(googletest.TestCase):
    trigger the creation of a new session.
    Use the `use_gpu` and `force_gpu` options to control where ops are run. If
-    `force_gpu` is True, all ops are pinned to `/gpu:0`. Otherwise, if `use_gpu`
+    `force_gpu` is True, all ops are pinned to `/device:GPU:0`. Otherwise, if `use_gpu`
    is True, TensorFlow tries to run as many ops on the GPU as possible. If both
    `force_gpu and `use_gpu` are False, all ops are pinned to the CPU.
@ -427,7 +427,7 @@ class TensorFlowTestCase(googletest.TestCase):
      config: An optional config_pb2.ConfigProto to use to configure the
        session.
      use_gpu: If True, attempt to run as many ops as possible on GPU.
-      force_gpu: If True, pin all ops to `/gpu:0`.
+      force_gpu: If True, pin all ops to `/device:GPU:0`.
    Returns:
      A Session object that should be used as a context manager to surround
@ -466,11 +466,11 @@ class TensorFlowTestCase(googletest.TestCase):
      sess = self._cached_session
      with sess.graph.as_default(), sess.as_default():
        if force_gpu:
-          # Use the name of an actual device if one is detected, or '/gpu:0'
+          # Use the name of an actual device if one is detected, or '/device:GPU:0'
          # otherwise
          gpu_name = gpu_device_name()
          if not gpu_name:
-            gpu_name = "/gpu:0"
+            gpu_name = "/device:GPU:0"
          with sess.graph.device(gpu_name):
            yield sess
        elif use_gpu:
@ -481,11 +481,11 @@ class TensorFlowTestCase(googletest.TestCase):
    else:
      with session.Session(graph=graph, config=prepare_config(config)) as sess:
        if force_gpu:
-          # Use the name of an actual device if one is detected, or '/gpu:0'
+          # Use the name of an actual device if one is detected, or '/device:GPU:0'
          # otherwise
          gpu_name = gpu_device_name()
          if not gpu_name:
-            gpu_name = "/gpu:0"
+            gpu_name = "/device:GPU:0"
          with sess.graph.device(gpu_name):
            yield sess
        elif use_gpu:
--- a/tensorflow/python/kernel_tests/basic_gpu_test.py
+++ b/tensorflow/python/kernel_tests/basic_gpu_test.py
@ -238,7 +238,7 @@ class GpuMultiSessionMemoryTest(test_util.TensorFlowTestCase):
    n_iterations = 500
    with session as s:
      data = variables.Variable(1.0)
-      with ops.device('/gpu:0'):
+      with ops.device('/device:GPU:0'):
        random_seed.set_random_seed(1)
        matrix1 = variables.Variable(
            random_ops.truncated_normal([1024, 1]), name='matrix1')
--- a/tensorflow/python/kernel_tests/cholesky_op_test.py
+++ b/tensorflow/python/kernel_tests/cholesky_op_test.py
@ -311,7 +311,7 @@ class CholeskyBenchmark(test.Benchmark):
      if test.is_gpu_available(True):
        with ops.Graph().as_default(), \
            session.Session() as sess, \
-            ops.device("/gpu:0"):
+            ops.device("/device:GPU:0"):
          l = linalg_ops.cholesky(data)
          self.run_op_benchmark(
              sess,
@ -338,11 +338,11 @@ class CholeskyBenchmark(test.Benchmark):
    if test.is_gpu_available(True):
      _BenchmarkGrad(
-          MatrixInverseCompositeGrad, "composite_matrix_inverse", "/gpu:0")
+          MatrixInverseCompositeGrad, "composite_matrix_inverse", "/device:GPU:0")
      _BenchmarkGrad(
-          TriAngInvCompositeGrad, "composite_tri_ang_inverse", "/gpu:0")
+          TriAngInvCompositeGrad, "composite_tri_ang_inverse", "/device:GPU:0")
      _BenchmarkGrad(
-          TriAngSolveCompositeGrad, "composite_triangular_solve", "/gpu:0")
+          TriAngSolveCompositeGrad, "composite_triangular_solve", "/device:GPU:0")
    _BenchmarkGrad(
        MatrixInverseCompositeGrad, "composite_matrix_inverse", "/cpu:0")
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@ -1423,9 +1423,8 @@ class ControlFlowTest(test.TestCase):
      self.assertEqual(45, rx.eval())
  def _testWhileGrad_ColocateGradients(self, colocate):
-    gpu_dev_name = test.gpu_device_name().lower() if test.is_gpu_available(
+    gpu_dev_name = test.gpu_device_name() if test.is_gpu_available(
-    ) else "/gpu:0"
+    ) else "/device:GPU:0"
    gpu_short_name = gpu_dev_name.split("/")[-1]
    with self.test_session(graph=ops.Graph()) as sess:
      v = constant_op.constant(2.0, name="v")
@ -1439,19 +1438,19 @@ class ControlFlowTest(test.TestCase):
      r = gradients_impl.gradients(
          loop, v, colocate_gradients_with_ops=colocate)[0]
    r_ops = r.graph.get_operations()
-    r_devices = [(op.name, op.device.lower()) for op in r_ops]
+    r_devices = [(op.name, op.device) for op in r_ops]
    self.assertTrue(any("Square" in op.name for op in r_ops))
    for (name, dev) in r_devices:
      if not colocate and name.endswith("Square"):
        # Only forward graph contain gpu in Square device
-        self.assertTrue(gpu_short_name in dev)
+        self.assertTrue(gpu_dev_name in dev)
      elif colocate and "Square" in name:
        # Forward and backward graphs contain gpu in Square/Square_grad devices
-        self.assertTrue(gpu_short_name in dev)
+        self.assertTrue(gpu_dev_name in dev)
      else:
-        self.assertFalse(gpu_short_name in dev)
+        self.assertFalse(gpu_dev_name in dev)
    self.assertAllClose(1024.0, sess.run(r))
  def testWhileGrad_ColocateGradients(self):
@ -2426,7 +2425,7 @@ class ControlFlowTest(test.TestCase):
      # device set on tensor, default device on graph => default device on dep.
      vdef = variables.Variable([0.0], name="vdef")
-      with ops.device("/job:worker/gpu:1"):
+      with ops.device("/job:worker/device:GPU:1"):
        with_vdef_dep = control_flow_ops.with_dependencies([vdef.initializer],
                                                           vdef)
        # The device is empty, but the colocation constraint is set.
--- a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
@ -347,7 +347,7 @@ def sparse_tensor_dense_vs_dense_matmul_benchmark(thresh,
          ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_dense(
              x_t, y_t, adjoint_a, adjoint_b)
      else:
-        with ops.device("/gpu:0"):
+        with ops.device("/device:GPU:0"):
          x_t = constant_op.constant(x)
          y_t = constant_op.constant(y)
          ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_dense(
@ -365,7 +365,7 @@ def sparse_tensor_dense_vs_dense_matmul_benchmark(thresh,
        ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_sparse(
            x_ind, x_val, x_shape, y_t, adjoint_a, adjoint_b)
    else:
-      with ops.device("/gpu:0"):
+      with ops.device("/device:GPU:0"):
        x_ind = constant_op.constant(np.vstack(np.where(x)).astype(np.int64).T)
        x_val = constant_op.constant(x[np.where(x)])
        x_shape = constant_op.constant(np.array(x.shape).astype(np.int64))
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@ -722,7 +722,7 @@ class VariableScopeTest(test.TestCase):
    def device_func(op):
      if op.type in ["Variable", "VariableV2", "VarHandleOp"]:
        varname_type.append((op.name, op.get_attr("dtype")))
-      return "/gpu:0"
+      return "/device:GPU:0"
    with g.as_default():
      with ops.device(device_func):
--- a/tensorflow/python/ops/gradients_test.py
+++ b/tensorflow/python/ops/gradients_test.py
@ -163,20 +163,20 @@ class GradientsTest(test_util.TensorFlowTestCase):
    with ops.Graph().as_default() as g:
      w = constant(1.0, shape=[1, 1])
      x = constant(1.0, shape=[1, 2])
-      with g.device("/gpu:0"):
+      with g.device("/device:GPU:0"):
        wx = math_ops.matmul(w, x)
      gw = gradients.gradients(wx, [w], colocate_gradients_with_ops=True)[0]
    self.assertEqual(gw.op.colocation_groups(), wx.op.colocation_groups())
  def testColocateGradientsWithAggregation(self):
    with ops.Graph().as_default() as g:
-      with g.device("/gpu:1"):
+      with g.device("/device:GPU:1"):
        w = constant(1.0, shape=[1, 1])
      x = constant(1.0, shape=[1, 2])
      y = constant(1.0, shape=[1, 2])
      wx = math_ops.matmul(w, x)
      wy = math_ops.matmul(w, y)
-      with g.device("/gpu:0"):
+      with g.device("/device:GPU:0"):
        z = wx + wy
      gw1 = gradients.gradients(z, [w], colocate_gradients_with_ops=True)[0]
@ -187,7 +187,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
  def testColocateGradientsWithAggregationInMultipleDevices(self):
    with ops.Graph().as_default() as g:
-      with g.device("/gpu:1"):
+      with g.device("/device:GPU:1"):
        w = constant(1.0, shape=[1, 1])
      x = constant(1.0, shape=[1, 2])
      y = constant(1.0, shape=[1, 2])
@ -195,7 +195,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
        wx = math_ops.matmul(w, x)
      with g.device("/task:2"):
        wy = math_ops.matmul(w, y)
-      with g.device("/gpu:0"):
+      with g.device("/device:GPU:0"):
        z = wx + wy
      gw1 = gradients.gradients(z, [w], colocate_gradients_with_ops=True)[0]
--- a/tensorflow/python/ops/matmul_benchmark.py
+++ b/tensorflow/python/ops/matmul_benchmark.py
@ -47,7 +47,7 @@ def build_graph(device, n, m, k, transpose_a, transpose_b, dtype):
  Returns:
    A matmul operation to run()
  """
-  with ops.device('/%s:0' % device):
+  with ops.device('%s' % device):
    if not transpose_a:
      x = variables.Variable(random_ops.random_uniform([n, m], dtype=dtype))
    else:
@ -112,7 +112,7 @@ class MatmulBenchmark(test.Benchmark):
    return duration
  def run_test_gpu(self, n, m, k, transpose_a, transpose_b, dtype, num_iters):
-    self.run_graph('gpu', n, m, k, transpose_a, transpose_b, num_iters, dtype)
+    self.run_graph(test.gpu_device_name(), n, m, k, transpose_a, transpose_b, num_iters, dtype)
  def test_round(self, num_iters):
    dtypes = [np.float32, np.float64]
--- a/tensorflow/python/ops/matmul_benchmark_test.py
+++ b/tensorflow/python/ops/matmul_benchmark_test.py
@ -71,37 +71,39 @@ class MatmulBenchmarkTest(googletest.TestCase):
  def _VerifyBuildGraph(self, n, m, k, transpose_a, transpose_b, dtype):
    graph = ops.Graph()
    with graph.as_default():
-      matmul_benchmark.build_graph("gpu", n, m, k, transpose_a, transpose_b,
+      matmul_benchmark.build_graph(googletest.gpu_device_name(), n, m, k, transpose_a, transpose_b,
                                   dtype)
      gd = graph.as_graph_def()
-      self.assertProtoEquals("""
+      dev=googletest.gpu_device_name()
-      node { name: "random_uniform/shape" op: "Const" device: "/device:GPU:0" }
+      proto_expected = """
-      node { name: "random_uniform/min" op: "Const" device: "/device:GPU:0" }
+      node { name: "random_uniform/shape" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform/max" op: "Const" device: "/device:GPU:0" }
+      node { name: "random_uniform/min" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform/RandomUniform" op: "RandomUniform" input: "random_uniform/shape" device: "/device:GPU:0" }
+      node { name: "random_uniform/max" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform/sub" op: "Sub" input: "random_uniform/max" input: "random_uniform/min" device: "/device:GPU:0" }
+      node { name: "random_uniform/RandomUniform" op: "RandomUniform" input: "random_uniform/shape" device: \""""+ dev +"""\" }
-      node { name: "random_uniform/mul" op: "Mul" input: "random_uniform/RandomUniform" input: "random_uniform/sub" device: "/device:GPU:0" }
+      node { name: "random_uniform/sub" op: "Sub" input: "random_uniform/max" input: "random_uniform/min" device: \""""+ dev +"""\" }
-      node { name: "random_uniform" op: "Add" input: "random_uniform/mul" input: "random_uniform/min" device: "/device:GPU:0" }
+      node { name: "random_uniform/mul" op: "Mul" input: "random_uniform/RandomUniform" input: "random_uniform/sub" device: \""""+ dev +"""\" }
-      node { name: "Variable" op: "VariableV2" device: "/device:GPU:0" }
+      node { name: "random_uniform" op: "Add" input: "random_uniform/mul" input: "random_uniform/min" device: \""""+ dev +"""\" }
-      node { name: "Variable/Assign" op: "Assign" input: "Variable" input: "random_uniform" device: "/device:GPU:0" }
+      node { name: "Variable" op: "VariableV2" device: \""""+ dev +"""\" }
-      node { name: "Variable/read" op: "Identity" input: "Variable" device: "/device:GPU:0" }
+      node { name: "Variable/Assign" op: "Assign" input: "Variable" input: "random_uniform" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/shape" op: "Const" device: "/device:GPU:0" }
+      node { name: "Variable/read" op: "Identity" input: "Variable" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/min" op: "Const" device: "/device:GPU:0" }
+      node { name: "random_uniform_1/shape" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/max" op: "Const" device: "/device:GPU:0" }
+      node { name: "random_uniform_1/min" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/RandomUniform" op: "RandomUniform" input: "random_uniform_1/shape" device: "/device:GPU:0" }
+      node { name: "random_uniform_1/max" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/sub" op: "Sub" input: "random_uniform_1/max" input: "random_uniform_1/min" device: "/device:GPU:0" }
+      node { name: "random_uniform_1/RandomUniform" op: "RandomUniform" input: "random_uniform_1/shape" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/mul" op: "Mul" input: "random_uniform_1/RandomUniform" input: "random_uniform_1/sub" device: "/device:GPU:0" }
+      node { name: "random_uniform_1/sub" op: "Sub" input: "random_uniform_1/max" input: "random_uniform_1/min" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1" op: "Add" input: "random_uniform_1/mul" input: "random_uniform_1/min" device: "/device:GPU:0" }
+      node { name: "random_uniform_1/mul" op: "Mul" input: "random_uniform_1/RandomUniform" input: "random_uniform_1/sub" device: \""""+ dev +"""\" }
-      node { name: "Variable_1" op: "VariableV2" device: "/device:GPU:0" }
+      node { name: "random_uniform_1" op: "Add" input: "random_uniform_1/mul" input: "random_uniform_1/min" device: \""""+ dev +"""\" }
-      node { name: "Variable_1/Assign" op: "Assign" input: "Variable_1" input: "random_uniform_1" device: "/device:GPU:0" }
+      node { name: "Variable_1" op: "VariableV2" device: \""""+ dev +"""\" }
-      node { name: "Variable_1/read" op: "Identity" input: "Variable_1" device: "/device:GPU:0" }
+      node { name: "Variable_1/Assign" op: "Assign" input: "Variable_1" input: "random_uniform_1" device: \""""+ dev +"""\" }
-      node { name: "MatMul" op: "MatMul" input: "Variable/read" input: "Variable_1/read" device: "/device:GPU:0" }
+      node { name: "Variable_1/read" op: "Identity" input: "Variable_1" device: \""""+ dev +"""\" }
-      node { name: "group_deps" op: "NoOp" input: "^MatMul" device: "/device:GPU:0" }
+      node { name: "MatMul" op: "MatMul" input: "Variable/read" input: "Variable_1/read" device: \""""+ dev +"""\" }
-                             """, self._StripGraph(gd))
+      node { name: "group_deps" op: "NoOp" input: "^MatMul" device: \""""+ dev +"""\" }
                       """
      self.assertProtoEquals(str(proto_expected), self._StripGraph(gd))
  def _VerifyRunGraph(self, n, m, k, transpose_a, transpose_b, dtype):
    benchmark_instance = matmul_benchmark.MatmulBenchmark()
-    duration = benchmark_instance.run_graph("gpu", n, m, k, transpose_a,
+    duration = benchmark_instance.run_graph(googletest.gpu_device_name(), n, m, k, transpose_a,
                                            transpose_b, 1, dtype)
    self.assertTrue(duration > 1e-6)
--- a/tensorflow/python/profiler/internal/run_metadata_test.py
+++ b/tensorflow/python/profiler/internal/run_metadata_test.py
@ -97,21 +97,22 @@ class RunMetadataTest(test.TestCase):
    if not test.is_gpu_available(cuda_only=True):
      return
    gpu_dev = test.gpu_device_name()
    ops.reset_default_graph()
-    with ops.device('/gpu:0'):
+    with ops.device(gpu_dev):
      tfprof_node, run_meta = _run_model()
      self.assertEqual(tfprof_node.children[0].name, 'MatMul')
      self.assertGreater(tfprof_node.children[0].exec_micros, 10)
    ret = _extract_node(run_meta, ['MatMul', 'MatMul:MatMul'])
    self.assertEqual(len(ret), 3)
-    self.assertTrue('/job:localhost/replica:0/task:0/gpu:0' in ret)
+    self.assertTrue('/job:localhost/replica:0/task:0' + gpu_dev in ret)
-    del ret['/job:localhost/replica:0/task:0/gpu:0']
+    del ret['/job:localhost/replica:0/task:0' + gpu_dev]
    has_all_stream = False
    for k, _ in six.iteritems(ret):
-      self.assertTrue('gpu:0/stream' in k)
+      self.assertTrue(gpu_dev + '/stream' in k)
-      if 'gpu:0/stream:all' in k:
+      if gpu_dev + '/stream:all' in k:
        has_all_stream = True
    self.assertTrue(has_all_stream)
@ -159,24 +160,24 @@ class RunMetadataTest(test.TestCase):
      return
    ops.reset_default_graph()
-    with ops.device('/gpu:0'):
+    with ops.device('/device:GPU:0'):
      tfprof_node, run_meta = _run_loop_model()
      # The while-loop caused a node to appear 4 times in scheduling.
      ret = _extract_node(run_meta,
                          'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul')
-      self.assertEqual(len(ret['/job:localhost/replica:0/task:0/gpu:0']), 4)
+      self.assertEqual(len(ret['/job:localhost/replica:0/task:0/device:GPU:0']), 4)
      total_cpu_execs = 0
-      for node in ret['/job:localhost/replica:0/task:0/gpu:0']:
+      for node in ret['/job:localhost/replica:0/task:0/device:GPU:0']:
        total_cpu_execs += node.op_end_rel_micros
      ret = _extract_node(
          run_meta,
          'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul:MatMul')
-      self.assertGreaterEqual(len(ret['/gpu:0/stream:all']), 4)
+      self.assertGreaterEqual(len(ret['/device:GPU:0/stream:all']), 4)
      total_accelerator_execs = 0
-      for node in ret['/gpu:0/stream:all']:
+      for node in ret['/device:GPU:0/stream:all']:
        total_accelerator_execs += node.op_end_rel_micros
      mm_node = lib.SearchTFProfNode(
--- a/tensorflow/python/profiler/option_builder.py
+++ b/tensorflow/python/profiler/option_builder.py
@ -315,7 +315,7 @@ class ProfileOptionBuilder(object):
    """Selectively counting statistics based on node types.
    Here, 'types' means the profiler nodes' properties. Profiler by default
-    consider device name (e.g. /job:xx/.../gpu:0) and operation type
+    consider device name (e.g. /job:xx/.../device:GPU:0) and operation type
    (e.g. MatMul) as profiler nodes' properties. User can also associate
    customized 'types' to profiler nodes through OpLogProto proto.
--- a/tensorflow/tools/graph_transforms/remove_device_test.cc
+++ b/tensorflow/tools/graph_transforms/remove_device_test.cc
@ -50,7 +50,7 @@ class RemoveDeviceTest : public ::testing::Test {
    add_node2->set_op("Add");
    add_node2->add_input("const_node1");
    add_node2->add_input("const_node2");
-    add_node2->set_device("//gpu:1");
+    add_node2->set_device("//device:GPU:1");
    NodeDef* add_node3 = graph_def.add_node();
    add_node3->set_name("add_node3");