From ab96f41fb4c4f17e96fd6177aa589c19df580456 Mon Sep 17 00:00:00 2001
From: Luke Iwanski
Date: Fri, 11 Aug 2017 01:35:21 +0100
Subject: [PATCH] [OpenCL] Extends matmul_benchmark.py to cover SYCL (#11697)

* [OpenCL] Extends matmul_benchmark.py to cover SYCL

* Fixed typo

* /gpu:0 -> /device:GPU:0

* Fixes control_flow_ops_py_test

* /gpu: -> /device:GPU:

* Fixes //tensorflow/python/profiler/internal:run_metadata_test

* gpu: -> GPU:

* Fixes tfprof_node

* [OpenCL] Fixes device-path-to-name conversion for device names with multiple colons (#123)

  The device path is constructed from a device name by replacing all colons
  with underscores. Some device names contain more than one colon, for
  example 'device:SYCL:0', which gives the path 'device_SYCL_0'. The
  previous code would not convert this back to the original device name,
  but rather to 'device:SYCL_0'. (A minimal sketch of this round trip
  appears after the device.h hunk below.)

  An alternative fix would be to convert all underscores back to colons in
  the device name (i.e. remove the count restriction in
  `replace("_", ":", 1)`); however, I'm not sure whether any device names
  contain underscores.

* If no GPU device is available, fake one

* gpu: -> device:GPU

* Fixes profiler test

* /gpu:x -> /device:GPU:x

* Fixes debug_io_utils_test.cc test

* Fixes device_name_utils_test.cc
---
 tensorflow/cc/tutorials/example_trainer.cc    |  2 +-
 .../kernel_tests/cudnn_rnn_ops_benchmark.py   |  6 +-
 .../python/kernel_tests/mixture_test.py       |  2 +-
 .../framework/python/ops/variables_test.py    |  8 +-
 .../contrib/nccl/python/ops/nccl_ops_test.py  |  6 +-
 .../python/kernel_tests/core_rnn_cell_test.py |  7 +-
 .../rnn/python/kernel_tests/core_rnn_test.py  | 19 ++--
 .../rnn/python/kernel_tests/gru_ops_test.py   |  6 +-
 .../kernel_tests/beam_search_ops_test.py      |  2 +-
 tensorflow/core/common_runtime/device.h       |  2 +-
 .../common_runtime/direct_session_test.cc     |  4 +-
 ...direct_session_with_tracking_alloc_test.cc |  4 +-
 .../core/common_runtime/gpu/gpu_device.cc     |  4 +-
 .../core/common_runtime/gpu/gpu_device.h      |  2 +-
 .../gpu/gpu_stream_util_test.cc               |  4 +-
 .../core/common_runtime/memory_types_test.cc  |  2 +-
 tensorflow/core/debug/debug_gateway.cc        |  2 +-
 tensorflow/core/debug/debug_gateway_test.cc   |  8 +-
 tensorflow/core/debug/debug_io_utils_test.cc  |  6 +-
 .../core/distributed_runtime/executor_test.cc |  2 +-
 .../rpc/grpc_channel_test.cc                  |  4 +-
 tensorflow/core/framework/node_def.proto      |  4 +-
 tensorflow/core/framework/rendezvous_test.cc  | 10 +--
 tensorflow/core/graph/graph_partition_test.cc |  2 +-
 tensorflow/core/graph/mkl_layout_pass_test.cc |  2 +-
 .../core/grappler/clusters/single_machine.cc  |  2 +-
 .../costs/analytical_cost_estimator_test.cc   |  2 +-
 .../grappler/costs/virtual_placer_test.cc     | 24 ++--
 .../grappler/optimizers/model_pruner_test.cc  | 10 +--
 .../core/platform/default/gpu_tracer.cc       |  4 +-
 tensorflow/core/platform/gpu_tracer_test.cc   |  6 +-
 tensorflow/core/profiler/README.md            |  8 +-
 tensorflow/core/profiler/g3doc/advise.md      |  8 +-
 .../core/profiler/g3doc/profile_time.md       |  2 +-
 .../internal/advisor/tfprof_advisor_test.cc   |  6 +-
 .../core/profiler/internal/tfprof_node.cc     |  4 +-
 tensorflow/core/protobuf/config.proto         |  2 +-
 .../core/util/device_name_utils_test.cc       | 88 +++++++++----------
 .../api_guides/python/contrib.seq2seq.md      |  6 +-
 .../docs_src/programmers_guide/variables.md   |  2 +-
 tensorflow/docs_src/tutorials/deep_cnn.md     |  2 +-
 tensorflow/docs_src/tutorials/using_gpu.md    | 46 +++++-----
 tensorflow/examples/learn/multiple_gpu.py     |  4 +-
 .../client/session_clusterspec_prop_test.py   |  4 +-
 tensorflow/python/client/session_test.py      |  6 +-
 tensorflow/python/client/timeline_test.py     |  4 +-
tensorflow/python/debug/lib/debug_data.py | 3 +- .../python/debug/lib/debug_data_test.py | 20 ++--- .../python/debug/lib/session_debug_testlib.py | 2 +- .../debug/wrappers/local_cli_wrapper_test.py | 2 +- tensorflow/python/framework/device_test.py | 16 ++-- tensorflow/python/framework/function_test.py | 2 +- .../python/framework/graph_util_test.py | 4 +- tensorflow/python/framework/importer_test.py | 2 +- .../python/framework/meta_graph_test.py | 2 +- tensorflow/python/framework/ops.py | 4 +- tensorflow/python/framework/ops_test.py | 18 ++-- tensorflow/python/framework/test_util.py | 12 +-- .../python/kernel_tests/basic_gpu_test.py | 2 +- .../python/kernel_tests/cholesky_op_test.py | 8 +- .../kernel_tests/control_flow_ops_py_test.py | 15 ++-- .../sparse_tensor_dense_matmul_op_test.py | 4 +- .../kernel_tests/variable_scope_test.py | 2 +- tensorflow/python/ops/gradients_test.py | 10 +-- tensorflow/python/ops/matmul_benchmark.py | 4 +- .../python/ops/matmul_benchmark_test.py | 54 ++++++------ .../profiler/internal/run_metadata_test.py | 21 ++--- tensorflow/python/profiler/option_builder.py | 2 +- .../graph_transforms/remove_device_test.cc | 2 +- 69 files changed, 286 insertions(+), 285 deletions(-) diff --git a/tensorflow/cc/tutorials/example_trainer.cc b/tensorflow/cc/tutorials/example_trainer.cc index 49d3cca3a4e..3675d72ee35 100644 --- a/tensorflow/cc/tutorials/example_trainer.cc +++ b/tensorflow/cc/tutorials/example_trainer.cc @@ -101,7 +101,7 @@ void ConcurrentSteps(const Options* opts, int session_index) { std::unique_ptr session(NewSession(options)); GraphDef def = CreateGraphDef(); if (options.target.empty()) { - graph::SetDefaultDevice(opts->use_gpu ? "/gpu:0" : "/cpu:0", &def); + graph::SetDefaultDevice(opts->use_gpu ? "/device:GPU:0" : "/cpu:0", &def); } TF_CHECK_OK(session->Create(def)); diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py index 6ca38c2e479..ff409ac7182 100644 --- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py +++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py @@ -93,7 +93,7 @@ class CudnnRNNBenchmark(test.Benchmark): batch_size = config["batch_size"] seq_length = config["seq_length"] - with ops.Graph().as_default(), ops.device("/gpu:0"): + with ops.Graph().as_default(), ops.device("/device:GPU:0"): model = cudnn_rnn_ops.CudnnLSTM(num_layers, num_units, num_units) params_size_t = model.params_size() input_data = variables.Variable( @@ -125,7 +125,7 @@ class CudnnRNNBenchmark(test.Benchmark): batch_size = config["batch_size"] seq_length = config["seq_length"] - with ops.Graph().as_default(), ops.device("/gpu:0"): + with ops.Graph().as_default(), ops.device("/device:GPU:0"): inputs = seq_length * [ array_ops.zeros([batch_size, num_units], dtypes.float32) ] @@ -153,7 +153,7 @@ class CudnnRNNBenchmark(test.Benchmark): batch_size = config["batch_size"] seq_length = config["seq_length"] - with ops.Graph().as_default(), ops.device("/gpu:0"): + with ops.Graph().as_default(), ops.device("/device:GPU:0"): inputs = seq_length * [ array_ops.zeros([batch_size, num_units], dtypes.float32) ] diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py index aa523a95118..2705b96f271 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py +++ 
b/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py @@ -634,7 +634,7 @@ class MixtureBenchmark(test.Benchmark): np.random.seed(127) with session.Session(config=config, graph=ops.Graph()) as sess: random_seed.set_random_seed(0) - with ops.device("/gpu:0" if use_gpu else "/cpu:0"): + with ops.device("/device:GPU:0" if use_gpu else "/cpu:0"): mixture = create_distribution( num_components=num_components, batch_size=batch_size, diff --git a/tensorflow/contrib/framework/python/ops/variables_test.py b/tensorflow/contrib/framework/python/ops/variables_test.py index cb278707202..6a74e4e8666 100644 --- a/tensorflow/contrib/framework/python/ops/variables_test.py +++ b/tensorflow/contrib/framework/python/ops/variables_test.py @@ -443,19 +443,19 @@ class VariablesTest(test.TestCase): e = variables_lib2.variable('e', initializer=e_init) # The values below highlight how the VariableDeviceChooser puts initial # values on the same device as the variable job. - self.assertDeviceEqual(a.device, '/gpu:0') + self.assertDeviceEqual(a.device, '/device:GPU:0') self.assertEqual(a.initial_value.op.colocation_groups(), a.op.colocation_groups()) - self.assertDeviceEqual(b.device, '/gpu:0') + self.assertDeviceEqual(b.device, '/device:GPU:0') self.assertEqual(b.initial_value.op.colocation_groups(), b.op.colocation_groups()) self.assertDeviceEqual(c.device, '/cpu:12') self.assertEqual(c.initial_value.op.colocation_groups(), c.op.colocation_groups()) - self.assertDeviceEqual(d.device, '/gpu:0') + self.assertDeviceEqual(d.device, '/device:GPU:0') self.assertEqual(d.initial_value.op.colocation_groups(), d.op.colocation_groups()) - self.assertDeviceEqual(e.device, '/gpu:0') + self.assertDeviceEqual(e.device, '/device:GPU:0') self.assertDeviceEqual(e.initial_value.device, '/cpu:99') diff --git a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py index 130cb4ca12c..ae658e73227 100644 --- a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py +++ b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py @@ -43,7 +43,7 @@ class AllReduceTest(test.TestCase): self._testSingleAllReduce(sess, dtype, nccl.all_max, np.maximum) def _testSingleAllReduce(self, sess, np_type, nccl_fn, numpy_accumulation_fn): - for devices in [['/gpu:0', '/gpu:0', '/gpu:0'], ['/gpu:0', '/gpu:0']]: + for devices in [['/device:GPU:0', '/device:GPU:0', '/device:GPU:0'], ['/device:GPU:0', '/device:GPU:0']]: shape = (3, 4) np_ans = None tensors = [] @@ -84,7 +84,7 @@ class BroadcastTest(test.TestCase): # Create session inside outer loop to test use of # same communicator across multiple sessions. with self.test_session(use_gpu=True) as sess: - for devices in [['/gpu:0', '/gpu:0', '/gpu:0'], ['/gpu:0', '/gpu:0']]: + for devices in [['/device:GPU:0', '/device:GPU:0', '/device:GPU:0'], ['/device:GPU:0', '/device:GPU:0']]: shape = (3, 4) sender = np.random.randint(0, len(devices) - 1) with ops.device(devices[sender]): @@ -115,7 +115,7 @@ class CombinedTest(test.TestCase): # Create session inside outer loop to test use of # same communicator across multiple sessions. 
with self.test_session(use_gpu=True) as sess: - for devices in [['/gpu:0', '/gpu:0', '/gpu:0'], ['/gpu:0', '/gpu:0']]: + for devices in [['/device:GPU:0', '/device:GPU:0', '/device:GPU:0'], ['/device:GPU:0', '/device:GPU:0']]: shape = (3, 4) # all-reduce diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py index faab0992498..a77097e0c3a 100644 --- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py +++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py @@ -446,12 +446,12 @@ class RNNCellTest(test.TestCase): # Can't perform this test w/o a GPU return + gpu_dev = test.gpu_device_name() with self.test_session(use_gpu=True) as sess: with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): x = array_ops.zeros([1, 1, 3]) - cell = rnn_cell_impl.DeviceWrapper( - rnn_cell_impl.GRUCell(3), test_util.gpu_device_name()) + cell = rnn_cell_impl.DeviceWrapper(rnn_cell_impl.GRUCell(3), gpu_dev) with ops.device("/cpu:0"): outputs, _ = rnn.dynamic_rnn( cell=cell, inputs=x, dtype=dtypes.float32) @@ -463,8 +463,7 @@ class RNNCellTest(test.TestCase): _ = sess.run(outputs, options=opts, run_metadata=run_metadata) step_stats = run_metadata.step_stats - ix = 0 if (("gpu" in step_stats.dev_stats[0].device) or - ("sycl" in step_stats.dev_stats[0].device)) else 1 + ix = 0 if gpu_dev in step_stats.dev_stats[0].device else 1 gpu_stats = step_stats.dev_stats[ix].node_stats cpu_stats = step_stats.dev_stats[1 - ix].node_stats self.assertFalse([s for s in cpu_stats if "gru_cell" in s.node_name]) diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py index 701590a8feb..40a3fb2fb0b 100644 --- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py +++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py @@ -42,7 +42,6 @@ from tensorflow.python.ops import variables as variables_lib from tensorflow.python.platform import test from tensorflow.python.platform import tf_logging from tensorflow.python.util import nest -from tensorflow.python.framework import test_util class Plus1RNNCell(rnn_lib.RNNCell): """RNN Cell generating (output, new_state) = (input + 1, state + 1).""" @@ -2208,11 +2207,11 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase): if not test.is_gpu_available(): return # Test requires access to a GPU + gpu_dev = test.gpu_device_name() run_metadata = self._execute_rnn_on( - rnn_device="/cpu:0", cell_device=test_util.gpu_device_name()) + rnn_device="/cpu:0", cell_device=gpu_dev) step_stats = run_metadata.step_stats - ix = 0 if (("gpu" in step_stats.dev_stats[0].device) or - ("sycl" in step_stats.dev_stats[0].device)) else 1 + ix = 0 if (gpu_dev in step_stats.dev_stats[0].device) else 1 gpu_stats = step_stats.dev_stats[ix].node_stats cpu_stats = step_stats.dev_stats[1 - ix].node_stats @@ -2233,12 +2232,12 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase): if not test.is_gpu_available(): return # Test requires access to a GPU + gpu_dev = test.gpu_device_name() run_metadata = self._execute_rnn_on( rnn_device="/cpu:0", cell_device="/cpu:0", - input_device=test_util.gpu_device_name()) + input_device=gpu_dev) step_stats = run_metadata.step_stats - ix = 0 if (("gpu" in step_stats.dev_stats[0].device) or - ("sycl" in step_stats.dev_stats[0].device)) else 1 + ix = 0 if (gpu_dev in step_stats.dev_stats[0].device) else 1 gpu_stats = step_stats.dev_stats[ix].node_stats 
cpu_stats = step_stats.dev_stats[1 - ix].node_stats @@ -2253,11 +2252,11 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase): if not test.is_gpu_available(): return # Test requires access to a GPU + gpu_dev = test.gpu_device_name() run_metadata = self._execute_rnn_on( - input_device=test_util.gpu_device_name()) + input_device=gpu_dev) step_stats = run_metadata.step_stats - ix = 0 if (("gpu" in step_stats.dev_stats[0].device) or - ("sycl" in step_stats.dev_stats[0].device)) else 1 + ix = 0 if (gpu_dev in step_stats.dev_stats[0].device) else 1 gpu_stats = step_stats.dev_stats[ix].node_stats cpu_stats = step_stats.dev_stats[1 - ix].node_stats diff --git a/tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py b/tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py index baf17431f35..4239e32ab93 100644 --- a/tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py +++ b/tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py @@ -357,7 +357,7 @@ def training_gru_block_vs_gru_cell(batch_size, ops.reset_default_graph() with session.Session(graph=ops.Graph()) as sess: # Specify the device which is been used. - with ops.device("/cpu:0" if not use_gpu else "/gpu:0"): + with ops.device("/cpu:0" if not use_gpu else "/device:GPU:0"): # Random initializers. seed = 1994 @@ -429,7 +429,7 @@ def inference_gru_block_vs_gru_cell(batch_size, """Benchmark inference speed between GRUBlockCell vs GRUCell.""" ops.reset_default_graph() with session.Session(graph=ops.Graph()) as sess: - with ops.device("/cpu:0" if not use_gpu else "/gpu:0"): + with ops.device("/cpu:0" if not use_gpu else "/device:GPU:0"): # Random initializers. seed = 1994 @@ -484,7 +484,7 @@ def single_bprop_step_gru_block_vs_gru_cell(batch_size, """Benchmark single bprop step speed between GRUBlockCell vs GRUCell.""" ops.reset_default_graph() with session.Session(graph=ops.Graph()) as sess: - with ops.device("/cpu:0" if not use_gpu else "/gpu:0"): + with ops.device("/cpu:0" if not use_gpu else "/device:GPU:0"): initializer = init_ops.random_uniform_initializer(-1, 1, seed=1989) # Inputs x = vs.get_variable("x", [batch_size, input_size]) diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py index 3496b355b4b..50cccf392fd 100644 --- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py +++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py @@ -78,7 +78,7 @@ class GatherTreeTest(test.TestCase): sequence_length = [[3, 3, 3]] expected_result = _transpose_batch_time( [[[2, -1, 2], [6, 5, 6], [7, 8, 9], [-1, -1, -1]]]) - with ops.device("/gpu:0"): + with ops.device("/device:GPU:0"): beams = beam_search_ops.gather_tree( step_ids=step_ids, parent_ids=parent_ids, sequence_length=sequence_length) diff --git a/tensorflow/core/common_runtime/device.h b/tensorflow/core/common_runtime/device.h index ded7e383d17..1d450aad7ff 100644 --- a/tensorflow/core/common_runtime/device.h +++ b/tensorflow/core/common_runtime/device.h @@ -22,7 +22,7 @@ limitations under the License. // Device names // * Every Device should have a unique name with the format: // /job:___/replica:___/task:___/(gpu|cpu):___ -// An example name would be "/job:train/replica:0/task:3/gpu:2". +// An example name would be "/job:train/replica:0/task:3/device:GPU:2". // * Task numbers are within the specified replica, so there are as // many "task zeros" as replicas. 
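The round trip described in the commit message (device name -> path -> device name) can be sketched in a few lines of Python. This is an illustration only; `name_to_path` and the two `path_to_name_*` helpers are hypothetical, not TensorFlow APIs:

```python
# Hypothetical helpers illustrating the commit message; not TensorFlow APIs.

def name_to_path(name):
    # A device path is the device name with every colon replaced by an
    # underscore: 'device:SYCL:0' -> 'device_SYCL_0'.
    return name.replace(":", "_")

def path_to_name_buggy(path):
    # The behaviour the commit message describes: only the first underscore
    # is restored, so 'device_SYCL_0' comes back as 'device:SYCL_0'.
    return path.replace("_", ":", 1)

def path_to_name_alternative(path):
    # The alternative fix mentioned in the commit message: restore every
    # underscore. Correct only if no device name itself contains '_'.
    return path.replace("_", ":")

assert path_to_name_buggy(name_to_path("device:SYCL:0")) == "device:SYCL_0"
assert path_to_name_alternative(name_to_path("device:SYCL:0")) == "device:SYCL:0"
```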
diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc index 097dab8406f..05f683f6082 100644 --- a/tensorflow/core/common_runtime/direct_session_test.cc +++ b/tensorflow/core/common_runtime/direct_session_test.cc @@ -476,7 +476,7 @@ TEST(DirectSessionTest, PlacePrunedGraph) { vx.scalar()() = 1.0; Node* x = test::graph::Constant(&g, vx); Node* y = test::graph::Unary(&g, "Darth", x); - y->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0"); + y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0"); GraphDef def; test::graph::ToGraphDef(&g, &def); @@ -494,7 +494,7 @@ TEST(DirectSessionTest, PlacePrunedGraph) { vx.scalar()() = 1.0; Node* x = test::graph::Constant(&g, vx); Node* y = test::graph::Unary(&g, "Darth", x); - y->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0"); + y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0"); GraphDef def; test::graph::ToGraphDef(&g, &def); diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc index da76ac83db7..459c20ef20b 100644 --- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc +++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc @@ -154,14 +154,14 @@ static void TestHWAccelerator(bool enableHWTrace) { Tensor x_tensor(DT_FLOAT, TensorShape({2, 1})); test::FillValues(&x_tensor, {1, 1}); Node* x = test::graph::Constant(&graph, x_tensor); - x->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0"); + x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0"); #ifdef TENSORFLOW_USE_SYCL x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0"); #endif // TENSORFLOW_USE_SYCL // y = A * x Node* y = test::graph::Matmul(&graph, a, x, false, false); - y->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0"); + y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0"); #ifdef TENSORFLOW_USE_SYCL y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0"); #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc index fbc2be18ccd..63956afce25 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc @@ -588,7 +588,7 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options, for (int i = 0; i < n; i++) { BaseGPUDevice* gpu_device; TF_RETURN_IF_ERROR(CreateGPUDevice(options, - strings::StrCat(name_prefix, "/gpu:", i), + strings::StrCat(name_prefix, "/device:GPU:", i), valid_gpu_ids[i], &gpu_device)); TF_RETURN_IF_ERROR(gpu_device->Init(options)); devices->push_back(gpu_device); @@ -1049,7 +1049,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds( size_t new_id = ids->size(); ids->push_back(visible_gpu_id); - LOG(INFO) << "Creating TensorFlow device (/gpu:" << new_id << ") -> " + LOG(INFO) << "Creating TensorFlow device (/device:GPU:" << new_id << ") -> " << "(" << GetShortDeviceDescription(visible_gpu_id, desc) << ")"; } diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h index 08c58867eed..a7e078e97cc 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.h +++ b/tensorflow/core/common_runtime/gpu/gpu_device.h @@ -141,7 +141,7 @@ class 
BaseGPUDeviceFactory : public DeviceFactory { Allocator* cpu_allocator) = 0; // Returns into 'ids' the list of valid GPU ids, in the order that - // they should map to logical gpu ids "/gpu:0", "/gpu:1", etc, based + // they should map to logical gpu ids "/device:GPU:0", "/device:GPU:1", etc, based // upon 'visible_device_list', a comma-separated list of 'visible // gpu ids'. Status GetValidDeviceIds(const string& visible_device_list, diff --git a/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc b/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc index a8bad5b94dc..003e416bbe6 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc @@ -106,9 +106,9 @@ TEST_F(GpuStreamUtilTest, SimpleGraphManyStreams) { TEST_F(GpuStreamUtilTest, StreamOverrides) { auto root = Scope::NewRootScope().ExitOnError(); ops::_Recv(root.WithOpName("input"), DT_FLOAT, "input", "/cpu:0", 0, - "/gpu:0"); + "/device:GPU:0"); Output n = ops::MatMul(root, {}, {}); - ops::_Send(root.WithOpName("output"), n, "output", "/gpu:0", 0, "/cpu:0"); + ops::_Send(root.WithOpName("output"), n, "output", "/device:GPU:0", 0, "/cpu:0"); Graph g(OpRegistry::Global()); TF_ASSERT_OK(root.ToGraph(&g)); diff --git a/tensorflow/core/common_runtime/memory_types_test.cc b/tensorflow/core/common_runtime/memory_types_test.cc index b3a43d35046..2a834ddca42 100644 --- a/tensorflow/core/common_runtime/memory_types_test.cc +++ b/tensorflow/core/common_runtime/memory_types_test.cc @@ -53,7 +53,7 @@ TEST(MemoryTypeChecker, Int32NotOk) { EXPECT_TRUE(errors::IsInternal(ValidateMemoryTypes(DEVICE_GPU, g))); // But we can insert _HostSend/_HostRecv to ensure the invariant. - TF_EXPECT_OK(EnsureMemoryTypes(DEVICE_GPU, "/gpu:0", g)); + TF_EXPECT_OK(EnsureMemoryTypes(DEVICE_GPU, "/device:GPU:0", g)); TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_GPU, g)); #endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/debug/debug_gateway.cc b/tensorflow/core/debug/debug_gateway.cc index 2aaed9563a6..616ced3d0f3 100644 --- a/tensorflow/core/debug/debug_gateway.cc +++ b/tensorflow/core/debug/debug_gateway.cc @@ -86,7 +86,7 @@ void DebugGateway::CopyTensor(const string& node_name, const int output_slot, // Determine if the tensor is on device (GPU) or host (CPU). // The second part of the check is necessary because even an OpKernel on // may have output tensors allocated on CPU. - if ((device->name().find("gpu:") != string::npos || device->name().find("SYCL:") != string::npos) && + if ((device->name().find("GPU:") != string::npos || device->name().find("SYCL:") != string::npos) && !ctx->output_alloc_attr(output_slot).on_host()) { // GPU tensors: Copy it to host (CPU). 
DeviceContext* device_ctxt = ctx->op_device_context(); diff --git a/tensorflow/core/debug/debug_gateway_test.cc b/tensorflow/core/debug/debug_gateway_test.cc index f25d91a3c27..9a74a4bb4cf 100644 --- a/tensorflow/core/debug/debug_gateway_test.cc +++ b/tensorflow/core/debug/debug_gateway_test.cc @@ -47,7 +47,7 @@ class SessionDebugMinusAXTest : public ::testing::Test { Graph graph(OpRegistry::Global()); #if GOOGLE_CUDA - const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0"; + const string kDeviceName = "/job:localhost/replica:0/task:0/device:GPU:0"; #elif defined(TENSORFLOW_USE_SYCL) const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0"; #else @@ -505,7 +505,7 @@ class SessionDebugOutputSlotWithoutOngoingEdgeTest : public ::testing::Test { Graph graph(OpRegistry::Global()); #if GOOGLE_CUDA - const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0"; + const string kDeviceName = "/job:localhost/replica:0/task:0/device:GPU:0"; #elif defined(TENSORFLOW_USE_SYCL) const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0"; #else @@ -607,7 +607,7 @@ class SessionDebugVariableTest : public ::testing::Test { Graph graph(OpRegistry::Global()); #if GOOGLE_CUDA - const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0"; + const string kDeviceName = "/job:localhost/replica:0/task:0/device:GPU:0"; #elif defined(TENSORFLOW_USE_SYCL) const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0"; #else @@ -879,7 +879,7 @@ class SessionDebugGPUSwitchTest : public ::testing::Test { Graph graph(OpRegistry::Global()); #ifdef GOOGLE_CUDA - const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0"; + const string kDeviceName = "/job:localhost/replica:0/task:0/device:GPU:0"; #elif TENSORFLOW_USE_SYCL const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0"; #endif diff --git a/tensorflow/core/debug/debug_io_utils_test.cc b/tensorflow/core/debug/debug_io_utils_test.cc index df6fb1d2fe1..0aef15e4853 100644 --- a/tensorflow/core/debug/debug_io_utils_test.cc +++ b/tensorflow/core/debug/debug_io_utils_test.cc @@ -51,14 +51,14 @@ class DebugIOUtilsTest : public ::testing::Test { }; TEST_F(DebugIOUtilsTest, ConstructDebugNodeKey) { - DebugNodeKey debug_node_key("/job:worker/replica:1/task:0/gpu:2", + DebugNodeKey debug_node_key("/job:worker/replica:1/task:0/device:GPU:2", "hidden_1/MatMul", 0, "DebugIdentity"); - EXPECT_EQ("/job:worker/replica:1/task:0/gpu:2", debug_node_key.device_name); + EXPECT_EQ("/job:worker/replica:1/task:0/device:GPU:2", debug_node_key.device_name); EXPECT_EQ("hidden_1/MatMul", debug_node_key.node_name); EXPECT_EQ(0, debug_node_key.output_slot); EXPECT_EQ("DebugIdentity", debug_node_key.debug_op); EXPECT_EQ("hidden_1/MatMul:0:DebugIdentity", debug_node_key.debug_node_name); - EXPECT_EQ("_tfdbg_device_,job_worker,replica_1,task_0,gpu_2", + EXPECT_EQ("_tfdbg_device_,job_worker,replica_1,task_0,device_GPU_2", debug_node_key.device_path); } diff --git a/tensorflow/core/distributed_runtime/executor_test.cc b/tensorflow/core/distributed_runtime/executor_test.cc index 1a4980a61b2..5b115f9a4d4 100644 --- a/tensorflow/core/distributed_runtime/executor_test.cc +++ b/tensorflow/core/distributed_runtime/executor_test.cc @@ -140,7 +140,7 @@ Rendezvous::ParsedKey Key(const string& sender, const uint64 incarnation, } #define ALICE "/job:j/replica:0/task:0/cpu:0" -#define BOB "/job:j/replica:0/task:0/gpu:0" +#define BOB "/job:j/replica:0/task:0/device:GPU:0" TEST_F(ExecutorTest, SimpleAdd) { // c = a + b 
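The substring checks above (for example the debug_gateway.cc change from "gpu:" to "GPU:") depend on device-name matching being case-sensitive once names use the canonical `/device:GPU:N` spelling. A minimal sketch, using device-name strings taken from hunks in this patch:

```python
# Device names taken from the hunks in this patch; matching is case-sensitive.
old_style = "/job:localhost/replica:0/task:0/gpu:0"
new_style = "/job:localhost/replica:0/task:0/device:GPU:0"

assert "gpu:" in old_style       # the old lower-case check matched here
assert "gpu:" not in new_style   # ...but misses the canonical spelling
assert "GPU:" in new_style       # hence the switch to the upper-case check
```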
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc b/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc index c975563a21f..a17acc85b38 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc @@ -31,9 +31,9 @@ TEST(GrpcChannelTest, IsSameAddressSpace) { EXPECT_TRUE(IsSameAddrSp("/job:mnist/replica:10/task:10/cpu:0", "/job:mnist/replica:10/task:10/cpu:1")); EXPECT_TRUE(IsSameAddrSp("/job:mnist/replica:10/task:10/cpu:0", - "/job:mnist/replica:10/task:10/gpu:2")); + "/job:mnist/replica:10/task:10/device:GPU:2")); EXPECT_TRUE(IsSameAddrSp("/job:mnist/replica:10/task:10", - "/job:mnist/replica:10/task:10/gpu:2")); + "/job:mnist/replica:10/task:10/device:GPU:2")); EXPECT_TRUE(IsSameAddrSp("/job:mnist/replica:10/task:10/cpu:1", "/job:mnist/replica:10/task:10")); diff --git a/tensorflow/core/framework/node_def.proto b/tensorflow/core/framework/node_def.proto index d145fac8c14..53aa03108ab 100644 --- a/tensorflow/core/framework/node_def.proto +++ b/tensorflow/core/framework/node_def.proto @@ -38,8 +38,8 @@ message NodeDef { // | ( ("gpu" | "cpu") ":" ([1-9][0-9]* | "*") ) // // Valid values for this string include: - // * "/job:worker/replica:0/task:1/gpu:3" (full specification) - // * "/job:worker/gpu:3" (partial specification) + // * "/job:worker/replica:0/task:1/device:GPU:3" (full specification) + // * "/job:worker/device:GPU:3" (partial specification) // * "" (no specification) // // If the constraints do not resolve to a single device (or if this diff --git a/tensorflow/core/framework/rendezvous_test.cc b/tensorflow/core/framework/rendezvous_test.cc index fe37b16bb6c..32b8ad784d5 100644 --- a/tensorflow/core/framework/rendezvous_test.cc +++ b/tensorflow/core/framework/rendezvous_test.cc @@ -39,11 +39,11 @@ namespace { TEST(RendezvousTest, Key) { const string key = Rendezvous::CreateKey( "/job:mnist/replica:1/task:2/CPU:0", 7890, - "/job:mnist/replica:1/task:2/GPU:0", "var0", FrameAndIter(0, 0)); + "/job:mnist/replica:1/task:2/device:GPU:0", "var0", FrameAndIter(0, 0)); EXPECT_EQ(key, "/job:mnist/replica:1/task:2/CPU:0;" "0000000000001ed2;" // 7890 = 0x1ed2 - "/job:mnist/replica:1/task:2/GPU:0;" + "/job:mnist/replica:1/task:2/device:GPU:0;" "var0;" "0:0"); Rendezvous::ParsedKey parsed; @@ -51,12 +51,12 @@ TEST(RendezvousTest, Key) { EXPECT_EQ(parsed.src_device, "/job:mnist/replica:1/task:2/CPU:0"); EXPECT_EQ(parsed.src_incarnation, 7890); EXPECT_EQ(parsed.src.type, "CPU"); - EXPECT_EQ(parsed.dst_device, "/job:mnist/replica:1/task:2/GPU:0"); + EXPECT_EQ(parsed.dst_device, "/job:mnist/replica:1/task:2/device:GPU:0"); EXPECT_EQ(parsed.dst.type, "GPU"); EXPECT_FALSE(Rendezvous::ParseKey("foo;bar;baz", &parsed).ok()); EXPECT_FALSE(Rendezvous::ParseKey("/job:mnist/replica:1/task:2/CPU:0;" - "/job:mnist/replica:1/task:2/GPU:0;", + "/job:mnist/replica:1/task:2/device:GPU:0;", &parsed) .ok()); EXPECT_FALSE( @@ -99,7 +99,7 @@ string V(const Tensor& tensor) { Rendezvous::ParsedKey MakeKey(const string& name) { string s = Rendezvous::CreateKey("/job:mnist/replica:1/task:2/CPU:0", 7890, - "/job:mnist/replica:1/task:2/GPU:0", name, + "/job:mnist/replica:1/task:2/device:GPU:0", name, FrameAndIter(0, 0)); Rendezvous::ParsedKey k; TF_EXPECT_OK(Rendezvous::ParseKey(s, &k)); diff --git a/tensorflow/core/graph/graph_partition_test.cc b/tensorflow/core/graph/graph_partition_test.cc index 3c12ed2689e..d84c62d4546 100644 --- a/tensorflow/core/graph/graph_partition_test.cc +++ 
b/tensorflow/core/graph/graph_partition_test.cc @@ -50,7 +50,7 @@ extern Status TopologicalSortNodesWithTimePriority( namespace { -const char gpu_device[] = "/job:a/replica:0/task:0/gpu:0"; +const char gpu_device[] = "/job:a/replica:0/task:0/device:GPU:0"; string SplitByDevice(const Node* node) { return node->assigned_device_name(); } diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index efbe2134e0f..482e339802f 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -40,7 +40,7 @@ namespace tensorflow { namespace { const char kCPUDevice[] = "/job:a/replica:0/task:0/cpu:0"; -const char kGPUDevice[] = "/job:a/replica:0/task:0/gpu:0"; +const char kGPUDevice[] = "/job:a/replica:0/task:0/device:GPU:0"; static void InitGraph(const string& s, Graph* graph, const string& device = kCPUDevice) { diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc index 3481b2b158d..1f95a9aa88c 100644 --- a/tensorflow/core/grappler/clusters/single_machine.cc +++ b/tensorflow/core/grappler/clusters/single_machine.cc @@ -89,7 +89,7 @@ Status SingleMachine::Provision() { VLOG(1) << "Number of GPUs: " << num_gpus_; for (int i = 0; i < num_gpus_; ++i) { string device_name = - strings::StrCat("/job:localhost/replica:0/task:0/gpu:", i); + strings::StrCat("/job:localhost/replica:0/task:0/device:GPU:", i); VLOG(1) << "Adding GPU device " << device_name; devices_[device_name] = GetLocalGPUInfo(i); } diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc index 02156fbf580..d1f3e36aa81 100644 --- a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc +++ b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc @@ -42,7 +42,7 @@ class AnalyticalCostEstimatorTest : public ::testing::Test { gpu_device.set_frequency(1100); gpu_device.set_bandwidth(180 * 1024 * 1024); (*gpu_device.mutable_environment())["architecture"] = "6"; - devices["/job:localhost/replica:0/task:0/gpu:0"] = gpu_device; + devices["/job:localhost/replica:0/task:0/device:GPU:0"] = gpu_device; cluster_.reset(new VirtualCluster(devices)); } diff --git a/tensorflow/core/grappler/costs/virtual_placer_test.cc b/tensorflow/core/grappler/costs/virtual_placer_test.cc index 65a03fb5575..a16455cb703 100644 --- a/tensorflow/core/grappler/costs/virtual_placer_test.cc +++ b/tensorflow/core/grappler/costs/virtual_placer_test.cc @@ -30,14 +30,14 @@ TEST(VirtualPlacerTest, LocalDevices) { devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device; DeviceProperties gpu_device; gpu_device.set_type("GPU"); - devices["/job:localhost/replica:0/task:0/gpu:0"] = gpu_device; + devices["/job:localhost/replica:0/task:0/device:GPU:0"] = gpu_device; VirtualCluster cluster(devices); VirtualPlacer placer(&cluster); NodeDef node; node.set_op("Conv2D"); EXPECT_EQ("GPU", placer.get_device(node).type()); - EXPECT_EQ("/job:localhost/replica:0/task:0/gpu:0", + EXPECT_EQ("/job:localhost/replica:0/task:0/device:GPU:0", placer.get_canonical_device_name(node)); node.set_device("CPU"); @@ -47,7 +47,7 @@ TEST(VirtualPlacerTest, LocalDevices) { node.set_device("GPU:0"); EXPECT_EQ("GPU", placer.get_device(node).type()); - EXPECT_EQ("/job:localhost/replica:0/task:0/gpu:0", + EXPECT_EQ("/job:localhost/replica:0/task:0/device:GPU:0", placer.get_canonical_device_name(node)); } @@ -60,7 +60,7 @@ TEST(VirtualPlacerTest, 
EmptyJobBecomesLocalhost) { devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device; DeviceProperties gpu_device; gpu_device.set_type("GPU"); - devices["/job:localhost/replica:0/task:0/gpu:0"] = gpu_device; + devices["/job:localhost/replica:0/task:0/device:GPU:0"] = gpu_device; VirtualCluster cluster(devices); VirtualPlacer placer(&cluster); @@ -70,7 +70,7 @@ TEST(VirtualPlacerTest, EmptyJobBecomesLocalhost) { EXPECT_EQ("/job:localhost/replica:0/task:0/cpu:0", placer.get_canonical_device_name(node)); node.set_device("/device:GPU:0"); - EXPECT_EQ("/job:localhost/replica:0/task:0/gpu:0", + EXPECT_EQ("/job:localhost/replica:0/task:0/device:GPU:0", placer.get_canonical_device_name(node)); } @@ -113,7 +113,7 @@ TEST(VirtualPlacerTest, RemoteDevices) { devices["/job:my_job/replica:0/task:0/cpu:0"] = cpu_device; DeviceProperties gpu_device; gpu_device.set_type("GPU"); - devices["/job:my_job/replica:0/task:0/gpu:0"] = gpu_device; + devices["/job:my_job/replica:0/task:0/device:GPU:0"] = gpu_device; VirtualCluster cluster(devices); VirtualPlacer placer(&cluster); @@ -122,7 +122,7 @@ TEST(VirtualPlacerTest, RemoteDevices) { // Device falls back to GPU. EXPECT_EQ("GPU", placer.get_device(node).type()); - EXPECT_EQ("/job:my_job/replica:0/task:0/gpu:0", + EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0", placer.get_canonical_device_name(node)); node.set_device("/job:my_job/replica:0/task:0/cpu:0"); @@ -130,27 +130,27 @@ TEST(VirtualPlacerTest, RemoteDevices) { EXPECT_EQ("/job:my_job/replica:0/task:0/cpu:0", placer.get_canonical_device_name(node)); - node.set_device("/job:my_job/replica:0/task:0/gpu:0"); + node.set_device("/job:my_job/replica:0/task:0/device:GPU:0"); EXPECT_EQ("GPU", placer.get_device(node).type()); - EXPECT_EQ("/job:my_job/replica:0/task:0/gpu:0", + EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0", placer.get_canonical_device_name(node)); // There is no local cpu available. Device falls back to GPU. node.set_device("CPU"); EXPECT_EQ("GPU", placer.get_device(node).type()); - EXPECT_EQ("/job:my_job/replica:0/task:0/gpu:0", + EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0", placer.get_canonical_device_name(node)); node.set_device("GPU:0"); // There is no local GPU available. Fall back to default GPU. EXPECT_EQ("GPU", placer.get_device(node).type()); - EXPECT_EQ("/job:my_job/replica:0/task:0/gpu:0", + EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0", placer.get_canonical_device_name(node)); // This isn't a valid name. Fall back to GPU. node.set_device("/job:my_job/replica:0/task:0"); EXPECT_EQ("GPU", placer.get_device(node).type()); - EXPECT_EQ("/job:my_job/replica:0/task:0/gpu:0", + EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0", placer.get_canonical_device_name(node)); } diff --git a/tensorflow/core/grappler/optimizers/model_pruner_test.cc b/tensorflow/core/grappler/optimizers/model_pruner_test.cc index aea1fcd7c93..ee722f311ed 100644 --- a/tensorflow/core/grappler/optimizers/model_pruner_test.cc +++ b/tensorflow/core/grappler/optimizers/model_pruner_test.cc @@ -320,14 +320,14 @@ TEST_F(ModelPrunerTest, PruningPerservesCrossDeviceIdentity) { Output c = ops::Const(s.WithOpName("c").WithDevice("/cpu:0"), 0.0f, {10, 10}); // Node i1 should be preserved. 
- Output i1 = ops::Identity(s.WithOpName("i1").WithDevice("/gpu:0"), c); - Output a1 = ops::Sqrt(s.WithOpName("a1").WithDevice("/gpu:0"), {i1}); - Output a2 = ops::Sqrt(s.WithOpName("a2").WithDevice("/gpu:0"), {i1}); + Output i1 = ops::Identity(s.WithOpName("i1").WithDevice("/device:GPU:0"), c); + Output a1 = ops::Sqrt(s.WithOpName("a1").WithDevice("/device:GPU:0"), {i1}); + Output a2 = ops::Sqrt(s.WithOpName("a2").WithDevice("/device:GPU:0"), {i1}); // Node i2 should be pruned since it resides on the sender's device. Output i2 = ops::Identity(s.WithOpName("i2").WithDevice("/cpu:0"), c); - Output a3 = ops::Sqrt(s.WithOpName("a3").WithDevice("/gpu:0"), {i2}); - Output a4 = ops::Sqrt(s.WithOpName("a4").WithDevice("/gpu:0"), {i2}); + Output a3 = ops::Sqrt(s.WithOpName("a3").WithDevice("/device:GPU:0"), {i2}); + Output a4 = ops::Sqrt(s.WithOpName("a4").WithDevice("/device:GPU:0"), {i2}); GrapplerItem item; TF_CHECK_OK(s.ToGraphDef(&item.graph)); diff --git a/tensorflow/core/platform/default/gpu_tracer.cc b/tensorflow/core/platform/default/gpu_tracer.cc index 50c27b3cf6b..3f855461276 100644 --- a/tensorflow/core/platform/default/gpu_tracer.cc +++ b/tensorflow/core/platform/default/gpu_tracer.cc @@ -579,8 +579,8 @@ Status GPUTracerImpl::Collect(StepStatsCollector *collector) { // TODO(pbar) Handle device IDs and prefix properly. const string prefix = ""; const int id = 0; - const string stream_device = strings::StrCat(prefix, "/gpu:", id, "/stream:"); - const string memcpy_device = strings::StrCat(prefix, "/gpu:", id, "/memcpy"); + const string stream_device = strings::StrCat(prefix, "/device:GPU:", id, "/stream:"); + const string memcpy_device = strings::StrCat(prefix, "/device:GPU:", id, "/memcpy"); mutex_lock l2(trace_mu_); for (const auto &rec : kernel_records_) { diff --git a/tensorflow/core/platform/gpu_tracer_test.cc b/tensorflow/core/platform/gpu_tracer_test.cc index 713282c1fd8..f6c2c6cb379 100644 --- a/tensorflow/core/platform/gpu_tracer_test.cc +++ b/tensorflow/core/platform/gpu_tracer_test.cc @@ -63,12 +63,12 @@ class GPUTracerTest : public ::testing::Test { Tensor x_tensor(DT_FLOAT, TensorShape({2, 1})); test::FillValues(&x_tensor, {1, 1}); Node* x = test::graph::Constant(&graph, x_tensor); - x->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0"); + x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0"); x_ = x->name(); // y = A * x Node* y = test::graph::Matmul(&graph, a, x, false, false); - y->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0"); + y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0"); y_ = y->name(); // Use an Identity op to force a memcpy to CPU and back to GPU. @@ -77,7 +77,7 @@ class GPUTracerTest : public ::testing::Test { Node* y_neg = test::graph::Unary(&graph, "Neg", i); y_neg_ = y_neg->name(); - y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0"); + y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0"); test::graph::ToGraphDef(&graph, &def_); } diff --git a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md index 6db38a59aef..06118e6eb21 100644 --- a/tensorflow/core/profiler/README.md +++ b/tensorflow/core/profiler/README.md @@ -127,10 +127,10 @@ tfprof> advise Not running under xxxx. Skip JobChecker. 
AcceleratorUtilizationChecker: -device: /job:worker/replica:0/task:0/gpu:0 low utilization: 0.03 -device: /job:worker/replica:0/task:0/gpu:1 low utilization: 0.08 -device: /job:worker/replica:0/task:0/gpu:2 low utilization: 0.04 -device: /job:worker/replica:0/task:0/gpu:3 low utilization: 0.21 +device: /job:worker/replica:0/task:0/device:GPU:0 low utilization: 0.03 +device: /job:worker/replica:0/task:0/device:GPU:1 low utilization: 0.08 +device: /job:worker/replica:0/task:0/device:GPU:2 low utilization: 0.04 +device: /job:worker/replica:0/task:0/device:GPU:3 low utilization: 0.21 OperationChecker: Found operation using NHWC data_format on GPU. Maybe NCHW is faster. diff --git a/tensorflow/core/profiler/g3doc/advise.md b/tensorflow/core/profiler/g3doc/advise.md index cc16c8fdffd..d87b0d8603d 100644 --- a/tensorflow/core/profiler/g3doc/advise.md +++ b/tensorflow/core/profiler/g3doc/advise.md @@ -31,10 +31,10 @@ tfprof --graph_path=graph.pbtxt \ tfprof> advise AcceleratorUtilizationChecker: -device: /job:worker/replica:0/task:0/gpu:0 low utilization: 0.03 -device: /job:worker/replica:0/task:0/gpu:1 low utilization: 0.08 -device: /job:worker/replica:0/task:0/gpu:2 low utilization: 0.04 -device: /job:worker/replica:0/task:0/gpu:3 low utilization: 0.21 +device: /job:worker/replica:0/task:0/device:GPU:0 low utilization: 0.03 +device: /job:worker/replica:0/task:0/device:GPU:1 low utilization: 0.08 +device: /job:worker/replica:0/task:0/device:GPU:2 low utilization: 0.04 +device: /job:worker/replica:0/task:0/device:GPU:3 low utilization: 0.21 OperationChecker: Found operation using NHWC data_format on GPU. Maybe NCHW is faster. diff --git a/tensorflow/core/profiler/g3doc/profile_time.md b/tensorflow/core/profiler/g3doc/profile_time.md index db555b36174..e11a75553b2 100644 --- a/tensorflow/core/profiler/g3doc/profile_time.md +++ b/tensorflow/core/profiler/g3doc/profile_time.md @@ -134,7 +134,7 @@ AddN 50.10ms (17.33%, 1.34%), 5481 tfprof> op -select micros,device -order_by micros node name | execution time | assigned devices SoftmaxCrossEntropyWithLogits 1.37sec (100.00%, 36.44%), /job:worker/replica:0/task:0/cpu:0 -MatMul 618.97ms (63.56%, 16.51%), |/job:worker/replica:0/task:0/cpu:0|/job:worker/replica:0/task:0/gpu:0|/job:worker/replica:0/task:0/gpu:1|/job:worker/replica:0/task:0/gpu:2|/job:worker/replica:0/task:0/gpu:3 +MatMul 618.97ms (63.56%, 16.51%), |/job:worker/replica:0/task:0/cpu:0|/job:worker/replica:0/task:0/device:GPU:0|/job:worker/replica:0/task:0/device:GPU:1|/job:worker/replica:0/task:0/device:GPU:2|/job:worker/replica:0/task:0/device:GPU:3 ``` diff --git a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc index 096c1d915ca..23ed287f7bb 100644 --- a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc +++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc @@ -53,10 +53,10 @@ class TFProfAdvisorTest : public ::testing::Test { NodeExecStats node_stat; node_stat.set_all_start_micros(start_miros); node_stat.set_op_end_rel_micros(end_rel_micros); - node->AddStepStat(step, "/job:localhost/replica:0/task:0/gpu:0", node_stat); - node->AddStepStat(step, "/job:localhost/replica:0/task:0/gpu:0:stream:all", + node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0", node_stat); + node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0:stream:all", node_stat); - node->AddStepStat(step, "/job:localhost/replica:0/task:0/gpu:0:stream:0", + node->AddStepStat(step, 
"/job:localhost/replica:0/task:0/device:GPU:0:stream:0", node_stat); return node; } diff --git a/tensorflow/core/profiler/internal/tfprof_node.cc b/tensorflow/core/profiler/internal/tfprof_node.cc index 70b91c37e4b..d4a784ffaa6 100644 --- a/tensorflow/core/profiler/internal/tfprof_node.cc +++ b/tensorflow/core/profiler/internal/tfprof_node.cc @@ -25,7 +25,7 @@ bool CountAsAcceleratorTime(const string& device) { } bool CountAsCPUTime(const string& device) { - return RE2::FullMatch(device, ".*/(gpu|cpu|device:sycl):\\d+"); + return RE2::FullMatch(device, ".*/(device:gpu|gpu|cpu|device:sycl):\\d+"); } bool IsCanonicalDevice(const string& device) { return CountAsCPUTime(device); } @@ -143,7 +143,7 @@ void TFGraphNode::AddStepStat(int64 step, const string& device, // TODO(xpan): Make this more robust? // See run_metadata_test.py - // It can be /job:0/replica:0/xxxx/gpu:0, or simply /gpu:0. + // It can be /job:0/replica:0/xxxx/device:GPU:0, or simply /device:GPU:0. // It can has some ad-hoc suffix, such as /stream:xx or /memcpy:xx. if (IsCanonicalDevice(dev)) { if (!canonical_device_.empty()) { diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto index 69311e3a7f3..56bb709e119 100644 --- a/tensorflow/core/protobuf/config.proto +++ b/tensorflow/core/protobuf/config.proto @@ -42,7 +42,7 @@ message GPUOptions { // A comma-separated list of GPU ids that determines the 'visible' // to 'virtual' mapping of GPU devices. For example, if TensorFlow // can see 8 GPU devices in the process, and one wanted to map - // visible GPU devices 5 and 3 as "/gpu:0", and "/gpu:1", then one + // visible GPU devices 5 and 3 as "/device:GPU:0", and "/device:GPU:1", then one // would specify this field as "5,3". This field is similar in // spirit to the CUDA_VISIBLE_DEVICES environment variable, except // it applies to the visible GPU devices in the process. diff --git a/tensorflow/core/util/device_name_utils_test.cc b/tensorflow/core/util/device_name_utils_test.cc index 008100aa446..9a3f8849a65 100644 --- a/tensorflow/core/util/device_name_utils_test.cc +++ b/tensorflow/core/util/device_name_utils_test.cc @@ -76,21 +76,21 @@ TEST(DeviceNameUtilsTest, Basic) { DeviceNameUtils::ParsedName p; EXPECT_FALSE(DeviceNameUtils::ParseFullName("foobar", &p)); EXPECT_FALSE( - DeviceNameUtils::ParseFullName("/job:123/replica:1/task:2/gpu:3", &p)); + DeviceNameUtils::ParseFullName("/job:123/replica:1/task:2/device:GPU:3", &p)); EXPECT_FALSE( DeviceNameUtils::ParseFullName("/job:123/replica:1/task:2/gpu:", &p)); EXPECT_FALSE(DeviceNameUtils::ParseFullName( "/job:123/replica:1/task:2/device:gpu:", &p)); EXPECT_FALSE( - DeviceNameUtils::ParseFullName("/job:foo/replica:-1/task:2/gpu:3", &p)); + DeviceNameUtils::ParseFullName("/job:foo/replica:-1/task:2/device:GPU:3", &p)); EXPECT_FALSE( - DeviceNameUtils::ParseFullName("/job:foo/replica:1/task:-2/gpu:3", &p)); + DeviceNameUtils::ParseFullName("/job:foo/replica:1/task:-2/device:GPU:3", &p)); EXPECT_FALSE( DeviceNameUtils::ParseFullName("/job:foo/replica:1/task:2/bar:3", &p)); EXPECT_FALSE(DeviceNameUtils::ParseFullName( - "/job:foo/replica:1/task:2/gpu:3/extra", &p)); + "/job:foo/replica:1/task:2/device:GPU:3/extra", &p)); EXPECT_TRUE( - DeviceNameUtils::ParseFullName("/job:foo/replica:1/task:2/gpu:3", &p)); + DeviceNameUtils::ParseFullName("/job:foo/replica:1/task:2/device:GPU:3", &p)); EXPECT_TRUE(p.has_job); EXPECT_TRUE(p.has_replica); EXPECT_TRUE(p.has_task); @@ -106,7 +106,7 @@ TEST(DeviceNameUtilsTest, Basic) { // Allow _ in job names. 
DeviceNameUtils::ParsedName p; EXPECT_TRUE(DeviceNameUtils::ParseFullName( - "/job:foo_bar/replica:1/task:2/gpu:3", &p)); + "/job:foo_bar/replica:1/task:2/device:GPU:3", &p)); EXPECT_TRUE(p.has_job); EXPECT_TRUE(p.has_replica); EXPECT_TRUE(p.has_task); @@ -193,7 +193,7 @@ TEST(DeviceNameUtilsTest, Basic) { } { DeviceNameUtils::ParsedName p; - EXPECT_TRUE(DeviceNameUtils::ParseFullName("/job:*/replica:4/gpu:5", &p)); + EXPECT_TRUE(DeviceNameUtils::ParseFullName("/job:*/replica:4/device:GPU:5", &p)); EXPECT_FALSE(p.has_job); EXPECT_TRUE(p.has_replica); EXPECT_FALSE(p.has_task); @@ -216,13 +216,13 @@ TEST(DeviceNameUtilsTest, Basic) { } EXPECT_TRUE(DeviceNameUtils::IsSameAddressSpace( - "/job:foo/replica:1/task:2/cpu:3", "/job:foo/replica:1/task:2/gpu:4")); + "/job:foo/replica:1/task:2/cpu:3", "/job:foo/replica:1/task:2/device:GPU:4")); EXPECT_FALSE(DeviceNameUtils::IsSameAddressSpace( - "/job:foo/replica:1/task:2/cpu:3", "/job:foo/replica:1/task:3/gpu:4")); + "/job:foo/replica:1/task:2/cpu:3", "/job:foo/replica:1/task:3/device:GPU:4")); EXPECT_FALSE(DeviceNameUtils::IsSameAddressSpace( - "/job:foo/replica:1/task:2/cpu:3", "/job:foo/replica:10/task:2/gpu:4")); + "/job:foo/replica:1/task:2/cpu:3", "/job:foo/replica:10/task:2/device:GPU:4")); EXPECT_FALSE(DeviceNameUtils::IsSameAddressSpace( - "/job:foo/replica:1/task:2/cpu:3", "/job:bar/replica:1/task:2/gpu:4")); + "/job:foo/replica:1/task:2/cpu:3", "/job:bar/replica:1/task:2/device:GPU:4")); EXPECT_EQ(DeviceNameUtils::LocalName("CPU", 1), "CPU:1"); EXPECT_EQ(DeviceNameUtils::LocalName("GPU", 2), "GPU:2"); @@ -284,17 +284,17 @@ static bool IsCSHelper(StringPiece pattern, StringPiece actual) { } TEST(DeviceNameUtilsTest, IsCompleteSpecification) { - EXPECT_TRUE(IsCSHelper("/job:*", "/job:work/replica:1/task:2/gpu:3")); + EXPECT_TRUE(IsCSHelper("/job:*", "/job:work/replica:1/task:2/device:GPU:3")); EXPECT_TRUE( - IsCSHelper("/job:*/replica:*", "/job:work/replica:1/task:2/gpu:3")); - EXPECT_TRUE(IsCSHelper("/job:*/task:*", "/job:work/replica:1/task:2/gpu:3")); + IsCSHelper("/job:*/replica:*", "/job:work/replica:1/task:2/device:GPU:3")); + EXPECT_TRUE(IsCSHelper("/job:*/task:*", "/job:work/replica:1/task:2/device:GPU:3")); EXPECT_TRUE(IsCSHelper("/job:*/replica:*/task:*", - "/job:work/replica:1/task:2/gpu:3")); + "/job:work/replica:1/task:2/device:GPU:3")); EXPECT_TRUE( - IsCSHelper("/job:*/replica:*/gpu:*", "/job:work/replica:1/task:2/gpu:3")); - EXPECT_FALSE(IsCSHelper("/cpu:*", "/job:worker/replica:1/task:2/gpu:3")); - EXPECT_FALSE(IsCSHelper("/gpu:2", "/job:worker/replica:1/task:2/gpu:1")); - EXPECT_TRUE(IsCSHelper("/gpu:*", "/job:worker/replica:1/task:2/gpu:3")); + IsCSHelper("/job:*/replica:*/gpu:*", "/job:work/replica:1/task:2/device:GPU:3")); + EXPECT_FALSE(IsCSHelper("/cpu:*", "/job:worker/replica:1/task:2/device:GPU:3")); + EXPECT_FALSE(IsCSHelper("/device:GPU:2", "/job:worker/replica:1/task:2/device:GPU:1")); + EXPECT_TRUE(IsCSHelper("/gpu:*", "/job:worker/replica:1/task:2/device:GPU:3")); } static bool IsSpecHelper(StringPiece pattern, StringPiece actual) { @@ -305,36 +305,36 @@ static bool IsSpecHelper(StringPiece pattern, StringPiece actual) { } TEST(DeviceNameUtilsTest, IsSpecification) { - EXPECT_TRUE(IsSpecHelper("/job:*", "/job:work/replica:1/task:2/gpu:3")); - EXPECT_TRUE(IsSpecHelper("/job:*", "/job:work/replica:1/gpu:3")); + EXPECT_TRUE(IsSpecHelper("/job:*", "/job:work/replica:1/task:2/device:GPU:3")); + EXPECT_TRUE(IsSpecHelper("/job:*", "/job:work/replica:1/device:GPU:3")); EXPECT_TRUE(IsSpecHelper("/job:*", 
"/job:work/replica:1")); EXPECT_TRUE(IsSpecHelper("/job:*", "/replica:1")); EXPECT_TRUE(IsSpecHelper("/job:*", "/job:work")); EXPECT_TRUE( - IsSpecHelper("/job:*/replica:*", "/job:work/replica:1/task:2/gpu:3")); + IsSpecHelper("/job:*/replica:*", "/job:work/replica:1/task:2/device:GPU:3")); EXPECT_TRUE(IsSpecHelper("/job:work/replica:1/gpu:*", - "/job:work/replica:1/task:2/gpu:3")); - EXPECT_TRUE(IsSpecHelper("/job:work/replica:1/gpu:3", - "/job:work/replica:1/task:2/gpu:3")); + "/job:work/replica:1/task:2/device:GPU:3")); + EXPECT_TRUE(IsSpecHelper("/job:work/replica:1/device:GPU:3", + "/job:work/replica:1/task:2/device:GPU:3")); EXPECT_TRUE(IsSpecHelper("/job:work/replica:1/task:2", - "/job:work/replica:1/task:2/gpu:3")); + "/job:work/replica:1/task:2/device:GPU:3")); EXPECT_TRUE(IsSpecHelper("/job:work/replica:*/task:2", - "/job:work/replica:1/task:2/gpu:3")); - EXPECT_TRUE(IsSpecHelper("/task:*", "/job:*/replica:1/task:2/gpu:3")); - EXPECT_TRUE(IsSpecHelper("/task:2", "/job:*/replica:1/task:2/gpu:3")); + "/job:work/replica:1/task:2/device:GPU:3")); + EXPECT_TRUE(IsSpecHelper("/task:*", "/job:*/replica:1/task:2/device:GPU:3")); + EXPECT_TRUE(IsSpecHelper("/task:2", "/job:*/replica:1/task:2/device:GPU:3")); EXPECT_TRUE(IsSpecHelper("/cpu:*", "/job:*/replica:1/task:2/cpu:1")); EXPECT_TRUE(IsSpecHelper("/cpu:0", "/cpu:0")); - EXPECT_TRUE(IsSpecHelper("/gpu:*", "/job:worker/replica:1/task:2/gpu:3")); + EXPECT_TRUE(IsSpecHelper("/gpu:*", "/job:worker/replica:1/task:2/device:GPU:3")); - EXPECT_FALSE(IsSpecHelper("/job:worker/replica:1/task:2/gpu:3", "/gpu:*")); + EXPECT_FALSE(IsSpecHelper("/job:worker/replica:1/task:2/device:GPU:3", "/gpu:*")); EXPECT_FALSE(IsSpecHelper("/cpu:*", "/job:*/replica:1/task:2")); - EXPECT_FALSE(IsSpecHelper("/cpu:*", "/job:*/replica:1/task:2/gpu:1")); - EXPECT_FALSE(IsSpecHelper("/cpu:*", "/job:worker/replica:1/task:2/gpu:3")); - EXPECT_FALSE(IsSpecHelper("/gpu:2", "/job:worker/replica:1/task:2/gpu:1")); + EXPECT_FALSE(IsSpecHelper("/cpu:*", "/job:*/replica:1/task:2/device:GPU:1")); + EXPECT_FALSE(IsSpecHelper("/cpu:*", "/job:worker/replica:1/task:2/device:GPU:3")); + EXPECT_FALSE(IsSpecHelper("/device:GPU:2", "/job:worker/replica:1/task:2/device:GPU:1")); EXPECT_FALSE(IsSpecHelper("/job:work/replica:*/task:0", - "/job:work/replica:1/task:2/gpu:3")); + "/job:work/replica:1/task:2/device:GPU:3")); EXPECT_FALSE(IsSpecHelper("/job:work/replica:0/task:2", - "/job:work/replica:*/task:2/gpu:3")); + "/job:work/replica:*/task:2/device:GPU:3")); } TEST(DeviceNameUtilsTest, SplitDeviceName) { @@ -348,7 +348,7 @@ TEST(DeviceNameUtilsTest, SplitDeviceName) { "/job:foo/cpu:1/task:2/replica:1", &task, &device)); EXPECT_EQ("/job:foo/replica:1/task:2", task); EXPECT_EQ("CPU:1", device); - EXPECT_TRUE(DeviceNameUtils::SplitDeviceName("/gpu:3", &task, &device)); + EXPECT_TRUE(DeviceNameUtils::SplitDeviceName("/device:GPU:3", &task, &device)); EXPECT_EQ("", task); EXPECT_EQ("GPU:3", device); EXPECT_FALSE(DeviceNameUtils::SplitDeviceName("gpu:3", &task, &device)); @@ -413,11 +413,11 @@ TEST(DeviceNameUtilsTest, MergeDevNames) { MergeDevNamesHelper("", "/job:foo", "/job:foo"); MergeDevNamesHelper("", "/replica:2", "/replica:2"); MergeDevNamesHelper("", "/task:7", "/task:7"); - // MergeDevNamesHelper("", "/gpu:1", "/gpu:1"); + // MergeDevNamesHelper("", "/device:GPU:1", "/device:GPU:1"); // Combining disjoint names. 
MergeDevNamesHelper("/job:foo", "/task:7", "/job:foo/task:7"); - MergeDevNamesHelper("/job:foo", "/gpu:1", "/job:foo/gpu:1"); + MergeDevNamesHelper("/job:foo", "/device:GPU:1", "/job:foo/device:GPU:1"); // Combining overlapping names. MergeDevNamesHelper("/job:foo/replica:0", "/replica:0/task:1", @@ -426,25 +426,25 @@ TEST(DeviceNameUtilsTest, MergeDevNames) { // Wildcard tests. MergeDevNamesHelper("", "/gpu:*", "/gpu:*"); MergeDevNamesHelper("/gpu:*", "/gpu:*", "/gpu:*"); - MergeDevNamesHelper("/gpu:1", "/gpu:*", "/gpu:1"); + MergeDevNamesHelper("/device:GPU:1", "/gpu:*", "/device:GPU:1"); // Incompatible components. MergeDevNamesError("/job:foo", "/job:bar", "incompatible jobs"); MergeDevNamesError("/replica:0", "/replica:1", "incompatible replicas"); MergeDevNamesError("/task:0", "/task:1", "incompatible tasks"); MergeDevNamesError("/gpu:*", "/cpu:*", "incompatible types"); - MergeDevNamesError("/gpu:0", "/gpu:1", "incompatible ids"); + MergeDevNamesError("/device:GPU:0", "/device:GPU:1", "incompatible ids"); } TEST(DeviceNameUtilsTest, MergeDevNamesAllowSoftPlacement) { // Incompatible components with allow_soft_placement. MergeDevNamesHelperAllowSoftPlacement("/gpu:*", "/cpu:1", ""); - MergeDevNamesHelperAllowSoftPlacement("/cpu:*", "/gpu:1", ""); - MergeDevNamesHelperAllowSoftPlacement("/gpu:1", "/gpu:2", "/gpu:*"); + MergeDevNamesHelperAllowSoftPlacement("/cpu:*", "/device:GPU:1", ""); + MergeDevNamesHelperAllowSoftPlacement("/device:GPU:1", "/device:GPU:2", "/device:GPU:*"); } TEST(DeviceNameUtilsTest, GetNamesForDeviceMappings) { - DeviceNameUtils::ParsedName p = Name("/job:foo/replica:10/task:0/gpu:1"); + DeviceNameUtils::ParsedName p = Name("/job:foo/replica:10/task:0/device:GPU:1"); EXPECT_EQ(str_util::Join(DeviceNameUtils::GetNamesForDeviceMappings(p), ","), "/job:foo/replica:10/task:0/device:GPU:1," "/job:foo/replica:10/task:0/gpu:1"); diff --git a/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md b/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md index b56a4884b4c..496d43dfd7e 100644 --- a/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md +++ b/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md @@ -73,12 +73,12 @@ other wrappers and the dynamic decoder described below. For example, one can write: ```python -cell = tf.contrib.rnn.DeviceWrapper(LSTMCell(512), "/gpu:0") +cell = tf.contrib.rnn.DeviceWrapper(LSTMCell(512), "/device:GPU:0") attention_mechanism = tf.contrib.seq2seq.LuongAttention(512, encoder_outputs) attn_cell = tf.contrib.seq2seq.AttentionWrapper( cell, attention_mechanism, attention_size=256) -attn_cell = tf.contrib.rnn.DeviceWrapper(attn_cell, "/gpu:1") -top_cell = tf.contrib.rnn.DeviceWrapper(LSTMCell(512), "/gpu:1") +attn_cell = tf.contrib.rnn.DeviceWrapper(attn_cell, "/device:GPU:1") +top_cell = tf.contrib.rnn.DeviceWrapper(LSTMCell(512), "/device:GPU:1") multi_cell = MultiRNNCell([attn_cell, top_cell]) ``` diff --git a/tensorflow/docs_src/programmers_guide/variables.md b/tensorflow/docs_src/programmers_guide/variables.md index dd18760e1dd..b265dbbe3e1 100644 --- a/tensorflow/docs_src/programmers_guide/variables.md +++ b/tensorflow/docs_src/programmers_guide/variables.md @@ -110,7 +110,7 @@ devices. 
For example, the following snippet creates a variable named `v` and places it on the second GPU device: ``` python -with tf.device("/gpu:1"): +with tf.device("/device:GPU:1"): v = tf.get_variable("v", [1]) ``` diff --git a/tensorflow/docs_src/tutorials/deep_cnn.md b/tensorflow/docs_src/tutorials/deep_cnn.md index a9e9dda12b9..de3f8338d30 100644 --- a/tensorflow/docs_src/tutorials/deep_cnn.md +++ b/tensorflow/docs_src/tutorials/deep_cnn.md @@ -411,7 +411,7 @@ the first tower are prepended with `tower_0`, e.g. `tower_0/conv1/Conv2D`. * A preferred hardware device to run the operation within a tower. @{tf.device} specifies this. For -instance, all operations in the first tower reside within `device('/gpu:0')` +instance, all operations in the first tower reside within `device('/device:GPU:0')` scope indicating that they should be run on the first GPU. All variables are pinned to the CPU and accessed via diff --git a/tensorflow/docs_src/tutorials/using_gpu.md b/tensorflow/docs_src/tutorials/using_gpu.md index dcec62d2749..b6edbe33451 100644 --- a/tensorflow/docs_src/tutorials/using_gpu.md +++ b/tensorflow/docs_src/tutorials/using_gpu.md @@ -7,8 +7,8 @@ supported device types are `CPU` and `GPU`. They are represented as `strings`. For example: * `"/cpu:0"`: The CPU of your machine. -* `"/gpu:0"`: The GPU of your machine, if you have one. -* `"/gpu:1"`: The second GPU of your machine, etc. +* `"/device:GPU:0"`: The GPU of your machine, if you have one. +* `"/device:GPU:1"`: The second GPU of your machine, etc. If a TensorFlow operation has both CPU and GPU implementations, the GPU devices will be given priority when the operation is assigned to a device. For example, @@ -35,11 +35,11 @@ You should see the following output: ``` Device mapping: -/job:localhost/replica:0/task:0/gpu:0 -> device: 0, name: Tesla K40c, pci bus +/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla K40c, pci bus id: 0000:05:00.0 -b: /job:localhost/replica:0/task:0/gpu:0 -a: /job:localhost/replica:0/task:0/gpu:0 -MatMul: /job:localhost/replica:0/task:0/gpu:0 +b: /job:localhost/replica:0/task:0/device:GPU:0 +a: /job:localhost/replica:0/task:0/device:GPU:0 +MatMul: /job:localhost/replica:0/task:0/device:GPU:0 [[ 22. 28.] [ 49. 64.]] @@ -71,11 +71,11 @@ example) and automatically copy tensors between devices if required. ``` Device mapping: -/job:localhost/replica:0/task:0/gpu:0 -> device: 0, name: Tesla K40c, pci bus +/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla K40c, pci bus id: 0000:05:00.0 b: /job:localhost/replica:0/task:0/cpu:0 a: /job:localhost/replica:0/task:0/cpu:0 -MatMul: /job:localhost/replica:0/task:0/gpu:0 +MatMul: /job:localhost/replica:0/task:0/device:GPU:0 [[ 22. 28.] [ 49. 64.]] ``` @@ -127,7 +127,7 @@ to specify the preference explicitly: ```python # Creates a graph. 
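# GPU ids are zero-based, so '/device:GPU:2' names the machine's third GPU;
# the placement below is only satisfiable on a machine with at least three GPUs.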
-with tf.device('/gpu:2'): +with tf.device('/device:GPU:2'): a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a') b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b') c = tf.matmul(a, b) @@ -142,9 +142,9 @@ If the device you have specified does not exist, you will get ``` InvalidArgumentError: Invalid argument: Cannot assign a device to node 'b': -Could not satisfy explicit device specification '/gpu:2' +Could not satisfy explicit device specification '/device:GPU:2' [[Node: b = Const[dtype=DT_FLOAT, value=Tensor<type: float shape: [3,2] - values: 1 2 3...>, _device="/gpu:2"]()]] + values: 1 2 3...>, _device="/device:GPU:2"]()]] ``` If you would like TensorFlow to automatically choose an existing and supported @@ -154,7 +154,7 @@ the session. ```python # Creates a graph. -with tf.device('/gpu:2'): +with tf.device('/device:GPU:2'): a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a') b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b') c = tf.matmul(a, b) @@ -175,7 +175,7 @@ For example: ``` # Creates a graph. c = [] -for d in ['/gpu:2', '/gpu:3']: +for d in ['/device:GPU:2', '/device:GPU:3']: with tf.device(d): a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3]) b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2]) @@ -192,20 +192,20 @@ You will see the following output. ``` Device mapping: -/job:localhost/replica:0/task:0/gpu:0 -> device: 0, name: Tesla K20m, pci bus +/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla K20m, pci bus id: 0000:02:00.0 -/job:localhost/replica:0/task:0/gpu:1 -> device: 1, name: Tesla K20m, pci bus +/job:localhost/replica:0/task:0/device:GPU:1 -> device: 1, name: Tesla K20m, pci bus id: 0000:03:00.0 -/job:localhost/replica:0/task:0/gpu:2 -> device: 2, name: Tesla K20m, pci bus +/job:localhost/replica:0/task:0/device:GPU:2 -> device: 2, name: Tesla K20m, pci bus id: 0000:83:00.0 -/job:localhost/replica:0/task:0/gpu:3 -> device: 3, name: Tesla K20m, pci bus +/job:localhost/replica:0/task:0/device:GPU:3 -> device: 3, name: Tesla K20m, pci bus id: 0000:84:00.0 -Const_3: /job:localhost/replica:0/task:0/gpu:3 -Const_2: /job:localhost/replica:0/task:0/gpu:3 -MatMul_1: /job:localhost/replica:0/task:0/gpu:3 -Const_1: /job:localhost/replica:0/task:0/gpu:2 -Const: /job:localhost/replica:0/task:0/gpu:2 -MatMul: /job:localhost/replica:0/task:0/gpu:2 +Const_3: /job:localhost/replica:0/task:0/device:GPU:3 +Const_2: /job:localhost/replica:0/task:0/device:GPU:3 +MatMul_1: /job:localhost/replica:0/task:0/device:GPU:3 +Const_1: /job:localhost/replica:0/task:0/device:GPU:2 +Const: /job:localhost/replica:0/task:0/device:GPU:2 +MatMul: /job:localhost/replica:0/task:0/device:GPU:2 AddN: /job:localhost/replica:0/task:0/cpu:0 [[ 44. 56.] [ 98. 128.]] diff --git a/tensorflow/examples/learn/multiple_gpu.py b/tensorflow/examples/learn/multiple_gpu.py index c7364d1f720..a294950a386 100644 --- a/tensorflow/examples/learn/multiple_gpu.py +++ b/tensorflow/examples/learn/multiple_gpu.py @@ -47,12 +47,12 @@ def my_model(features, labels, mode): # Create three fully connected layers respectively of size 10, 20, and 10 with # each layer having a dropout probability of 0.1. net = features[X_FEATURE] - with tf.device('/gpu:1'): + with tf.device('/device:GPU:1'): for units in [10, 20, 10]: net = tf.layers.dense(net, units=units, activation=tf.nn.relu) net = tf.layers.dropout(net, rate=0.1) - with tf.device('/gpu:2'): + with tf.device('/device:GPU:2'): # Compute logits (1 per class).
logits = tf.layers.dense(net, 3, activation=None) diff --git a/tensorflow/python/client/session_clusterspec_prop_test.py b/tensorflow/python/client/session_clusterspec_prop_test.py index 6a89755bbda..b77912b4f74 100644 --- a/tensorflow/python/client/session_clusterspec_prop_test.py +++ b/tensorflow/python/client/session_clusterspec_prop_test.py @@ -173,7 +173,7 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase): # # W0718 17:14:41.521534 190121 device_mgr.cc:107] Unknown device: # /job:worker/replica:0/task:0/device:CPU:0 all devices: - # /job:local/replica:0/task:0/gpu:0, + # /job:local/replica:0/task:0/device:GPU:0, # /job:local/replica:0/task:0/device:GPU:0, # /job:local/replica:0/task:0/cpu:1, CPU:0, GPU:0, # /job:local/replica:0/task:0/device:CPU:1, @@ -198,7 +198,7 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase): sum1 = input1 + input2 if test.is_gpu_available(): - device_str = '/job:worker/task:0/gpu:0' + device_str = '/job:worker/task:0/device:GPU:0' else: device_str = '/job:worker/task:0/cpu:1' with ops.device(device_str): diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py index 15e7ae18bb0..b4f0fd6f404 100644 --- a/tensorflow/python/client/session_test.py +++ b/tensorflow/python/client/session_test.py @@ -1124,7 +1124,7 @@ class SessionTest(test_util.TensorFlowTestCase): # which is why placing this is invalid. If at some point # GPU kernels are added to this test, some other different # op / device combo should be chosen. - with ops.device('/gpu:0'): + with ops.device('/device:GPU:0'): a = constant_op.constant(1.0, shape=[1, 2]) b = constant_op.constant(1.0, shape=[1, 2]) @@ -1145,7 +1145,7 @@ class SessionTest(test_util.TensorFlowTestCase): # which is why placing this is invalid. If at some point # GPU kernels are added to this test, some other different # op / device combo should be chosen. 
- with ops.device('/gpu:0'): + with ops.device('/device:GPU:0'): _ = constant_op.constant(1.0, shape=[1, 2]) b = constant_op.constant(1.0, shape=[1, 2]) @@ -1494,7 +1494,7 @@ class SessionTest(test_util.TensorFlowTestCase): allow_soft_placement=True, graph_options=config_pb2.GraphOptions(build_cost_model=100)) with session.Session(config=config) as sess: - with ops.device('/gpu:0'): + with ops.device('/device:GPU:0'): a = array_ops.placeholder(dtypes.float32, shape=[]) b = math_ops.add(a, a) c = array_ops.identity(b) diff --git a/tensorflow/python/client/timeline_test.py b/tensorflow/python/client/timeline_test.py index e8797712e91..8396df5f400 100644 --- a/tensorflow/python/client/timeline_test.py +++ b/tensorflow/python/client/timeline_test.py @@ -100,8 +100,8 @@ class TimelineTest(test.TestCase): self.assertTrue(run_metadata.HasField('step_stats')) step_stats = run_metadata.step_stats devices = [d.device for d in step_stats.dev_stats] - self.assertTrue('/job:localhost/replica:0/task:0/gpu:0' in devices) - self.assertTrue('/gpu:0/stream:all' in devices) + self.assertTrue('/job:localhost/replica:0/task:0/device:GPU:0' in devices) + self.assertTrue('/device:GPU:0/stream:all' in devices) tl = timeline.Timeline(step_stats) ctf = tl.generate_chrome_trace_format() self._validateTrace(ctf) diff --git a/tensorflow/python/debug/lib/debug_data.py b/tensorflow/python/debug/lib/debug_data.py index 044a91a7ce6..b2b3ec5d470 100644 --- a/tensorflow/python/debug/lib/debug_data.py +++ b/tensorflow/python/debug/lib/debug_data.py @@ -380,7 +380,8 @@ def device_path_to_device_name(device_dir): path_items = os.path.basename(device_dir)[ len(METADATA_FILE_PREFIX) + len(DEVICE_TAG):].split(",") return "/".join([ - path_item.replace("_", ":", 1) for path_item in path_items]) + path_item.replace("device_", "device:").replace("_", ":", 1) + for path_item in path_items]) class DebugTensorDatum(object): diff --git a/tensorflow/python/debug/lib/debug_data_test.py b/tensorflow/python/debug/lib/debug_data_test.py index eff70b662bd..694010a23cd 100644 --- a/tensorflow/python/debug/lib/debug_data_test.py +++ b/tensorflow/python/debug/lib/debug_data_test.py @@ -237,11 +237,11 @@ class DebugDumpDirTest(test_util.TensorFlowTestCase): gpu_0_dir = os.path.join( self._dump_root, debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG + - ",job_localhost,replica_0,task_0,gpu_0") + ",job_localhost,replica_0,task_0,device_GPU_0") gpu_1_dir = os.path.join( self._dump_root, debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG + - ",job_localhost,replica_0,task_0,gpu_1") + ",job_localhost,replica_0,task_0,device_GPU_1") os.makedirs(cpu_0_dir) os.makedirs(gpu_0_dir) os.makedirs(gpu_1_dir) @@ -281,12 +281,12 @@ class DebugDumpDirTest(test_util.TensorFlowTestCase): node = graph_gpu_0.node.add() node.name = "node_foo_1" node.op = "FooOp" - node.device = "/job:localhost/replica:0/task:0/gpu:0" + node.device = "/job:localhost/replica:0/task:0/device:GPU:0" graph_gpu_1 = graph_pb2.GraphDef() node = graph_gpu_1.node.add() node.name = "node_foo_1" node.op = "FooOp" - node.device = "/job:localhost/replica:0/task:0/gpu:1" + node.device = "/job:localhost/replica:0/task:0/device:GPU:1" dump_dir = debug_data.DebugDumpDir( self._dump_root, @@ -294,14 +294,14 @@ class DebugDumpDirTest(test_util.TensorFlowTestCase): self.assertItemsEqual( ["/job:localhost/replica:0/task:0/cpu:0", - "/job:localhost/replica:0/task:0/gpu:0", - "/job:localhost/replica:0/task:0/gpu:1"], dump_dir.devices()) + "/job:localhost/replica:0/task:0/device:GPU:0", + 
"/job:localhost/replica:0/task:0/device:GPU:1"], dump_dir.devices()) self.assertEqual(1472563253536385, dump_dir.t0) self.assertEqual(3, dump_dir.size) with self.assertRaisesRegexp( ValueError, r"Invalid device name: "): - dump_dir.nodes("/job:localhost/replica:0/task:0/gpu:2") + dump_dir.nodes("/job:localhost/replica:0/task:0/device:GPU:2") self.assertItemsEqual(["node_foo_1", "node_foo_1", "node_foo_1"], dump_dir.nodes()) self.assertItemsEqual( @@ -319,16 +319,16 @@ class DebugDumpDirTest(test_util.TensorFlowTestCase): node = graph_gpu_0.node.add() node.name = "node_foo_1" node.op = "FooOp" - node.device = "/job:localhost/replica:0/task:0/gpu:0" + node.device = "/job:localhost/replica:0/task:0/device:GPU:0" graph_gpu_1 = graph_pb2.GraphDef() node = graph_gpu_1.node.add() node.name = "node_foo_1" node.op = "FooOp" - node.device = "/job:localhost/replica:0/task:0/gpu:1" + node.device = "/job:localhost/replica:0/task:0/device:GPU:1" node = graph_gpu_1.node.add() # Here is the duplicate. node.name = "node_foo_1" node.op = "FooOp" - node.device = "/job:localhost/replica:0/task:0/gpu:1" + node.device = "/job:localhost/replica:0/task:0/device:GPU:1" with self.assertRaisesRegexp( ValueError, r"Duplicate node name on device "): diff --git a/tensorflow/python/debug/lib/session_debug_testlib.py b/tensorflow/python/debug/lib/session_debug_testlib.py index e54590adfea..08b3e75e7c8 100644 --- a/tensorflow/python/debug/lib/session_debug_testlib.py +++ b/tensorflow/python/debug/lib/session_debug_testlib.py @@ -711,7 +711,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase): # Test node name list lookup of the DebugDumpDir object. if test_util.gpu_device_name(): node_names = dump.nodes( - device_name="/job:localhost/replica:0/task:0/gpu:0") + device_name="/job:localhost/replica:0/task:0/device:GPU:0") else: node_names = dump.nodes() self.assertTrue(u_name in node_names) diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py index 575d74bbf09..3d18d7727ab 100644 --- a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py +++ b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py @@ -402,7 +402,7 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase): def testRuntimeErrorBeforeGraphExecutionIsRaised(self): # Use an impossible device name to cause an error before graph execution. 
- with ops.device("/gpu:1337"): + with ops.device("/device:GPU:1337"): w = variables.Variable([1.0] * 10, name="w") wrapped_sess = LocalCLIDebuggerWrapperSessionForTest( diff --git a/tensorflow/python/framework/device_test.py b/tensorflow/python/framework/device_test.py index e6dc3c80637..0859e956ffd 100644 --- a/tensorflow/python/framework/device_test.py +++ b/tensorflow/python/framework/device_test.py @@ -79,17 +79,17 @@ class DeviceTest(test_util.TensorFlowTestCase): self.assertEquals("/replica:1/task:0/device:CPU:0", d.to_string()) d.parse_from_string("/replica:1/task:0/device:CPU:0") self.assertEquals("/replica:1/task:0/device:CPU:0", d.to_string()) - d.parse_from_string("/job:muu/gpu:2") + d.parse_from_string("/job:muu/device:GPU:2") self.assertEquals("/job:muu/device:GPU:2", d.to_string()) with self.assertRaises(Exception) as e: - d.parse_from_string("/job:muu/gpu:2/cpu:0") + d.parse_from_string("/job:muu/device:GPU:2/cpu:0") self.assertTrue("Cannot specify multiple device" in str(e.exception)) def testFromString(self): d = device.DeviceSpec.from_string("/job:foo/replica:0") self.assertEquals("/job:foo/replica:0", d.to_string()) with self.assertRaises(Exception) as e: - d = device.DeviceSpec.from_string("/job:muu/gpu:2/cpu:0") + d = device.DeviceSpec.from_string("/job:muu/device:GPU:2/cpu:0") self.assertTrue("Cannot specify multiple device" in str(e.exception)) d = device.DeviceSpec.from_string("/job:foo/replica:0/task:3/cpu:*") @@ -102,13 +102,13 @@ class DeviceTest(test_util.TensorFlowTestCase): def testMerge(self): d = device.DeviceSpec.from_string("/job:foo/replica:0") self.assertEquals("/job:foo/replica:0", d.to_string()) - d.merge_from(device.DeviceSpec.from_string("/task:1/gpu:2")) + d.merge_from(device.DeviceSpec.from_string("/task:1/device:GPU:2")) self.assertEquals("/job:foo/replica:0/task:1/device:GPU:2", d.to_string()) d = device.DeviceSpec() d.merge_from(device.DeviceSpec.from_string("/task:1/cpu:0")) self.assertEquals("/task:1/device:CPU:0", d.to_string()) - d.merge_from(device.DeviceSpec.from_string("/job:boo/gpu:0")) + d.merge_from(device.DeviceSpec.from_string("/job:boo/device:GPU:0")) self.assertEquals("/job:boo/task:1/device:GPU:0", d.to_string()) d.merge_from(device.DeviceSpec.from_string("/job:muu/cpu:2")) self.assertEquals("/job:muu/task:1/device:CPU:2", d.to_string()) @@ -134,10 +134,10 @@ class DeviceTest(test_util.TensorFlowTestCase): self.assertEqual("/job:foo/replica:0/task:0/device:GPU:0", device.canonical_name( - "/job:foo/replica:0/task:0/gpu:0")) + "/job:foo/replica:0/task:0/device:GPU:0")) self.assertEqual("/job:foo/replica:0/task:0/device:GPU:0", device.canonical_name( - "/gpu:0/task:0/replica:0/job:foo")) + "/device:GPU:0/task:0/replica:0/job:foo")) def testCheckValid(self): device.check_valid("/job:foo/replica:0") @@ -155,7 +155,7 @@ class DeviceTest(test_util.TensorFlowTestCase): self.assertTrue("Unknown attribute: 'bar'" in str(e.exception)) with self.assertRaises(Exception) as e: - device.check_valid("/cpu:0/gpu:2") + device.check_valid("/cpu:0/device:GPU:2") self.assertTrue("Cannot specify multiple device" in str(e.exception)) diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py index c94e05c4ee9..589db9ef4dc 100644 --- a/tensorflow/python/framework/function_test.py +++ b/tensorflow/python/framework/function_test.py @@ -505,7 +505,7 @@ class FunctionTest(test.TestCase): _ = PlusOne(1, name="p1") with self.assertRaisesRegexp(ValueError, "Unknown keyword arguments"): - _ = PlusOne(1, 
device="/gpu:0") + _ = PlusOne(1, device="/device:GPU:0") def testFunctionDecorator(self): diff --git a/tensorflow/python/framework/graph_util_test.py b/tensorflow/python/framework/graph_util_test.py index f6e9bc9dad3..647ed1583a0 100644 --- a/tensorflow/python/framework/graph_util_test.py +++ b/tensorflow/python/framework/graph_util_test.py @@ -106,9 +106,9 @@ class DeviceFunctionsTest(test.TestCase): var_0 = variables.Variable(0) with ops.device(test_device_func_pin_variable_to_cpu): var_1 = variables.Variable(1) - with ops.device(lambda op: "/gpu:0"): + with ops.device(lambda op: "/device:GPU:0"): var_2 = variables.Variable(2) - with ops.device("/gpu:0"): # Implicit merging device function. + with ops.device("/device:GPU:0"): # Implicit merging device function. var_3 = variables.Variable(3) self.assertDeviceEqual(var_0.device, None) diff --git a/tensorflow/python/framework/importer_test.py b/tensorflow/python/framework/importer_test.py index cfba6af5232..8ce8e76629d 100644 --- a/tensorflow/python/framework/importer_test.py +++ b/tensorflow/python/framework/importer_test.py @@ -878,7 +878,7 @@ class ImportGraphDefTest(test.TestCase): self.assertEqual(c.device, c4.device) # worker overrides ps. with ops.Graph().as_default(): - with ops.device(device.merge_device("/gpu:0")): + with ops.device(device.merge_device("/device:GPU:0")): a5, b5, c5 = importer.import_graph_def( gdef, return_elements=["a", "b", "c"]) self.assertEqual("/device:GPU:0", a5.device) diff --git a/tensorflow/python/framework/meta_graph_test.py b/tensorflow/python/framework/meta_graph_test.py index 13a92c3c7ec..65abb695991 100644 --- a/tensorflow/python/framework/meta_graph_test.py +++ b/tensorflow/python/framework/meta_graph_test.py @@ -550,7 +550,7 @@ class ScopedMetaGraphTest(test.TestCase): a = variables.Variable( constant_op.constant( 1.0, shape=[2, 2]), name="a") - with ops.device("/job:ps/replica:0/task:0/gpu:0"): + with ops.device("/job:ps/replica:0/task:0/device:GPU:0"): b = variables.Variable( constant_op.constant( 2.0, shape=[2, 2]), name="b") diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index ee7f77d5527..9a7f76cb588 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -3342,7 +3342,7 @@ class Graph(object): For example: ```python - with g.device('/gpu:0'): + with g.device('/device:GPU:0'): # All operations constructed in this context will be placed # on GPU 0. with g.device(None): @@ -3352,7 +3352,7 @@ class Graph(object): # Defines a function from `Operation` to device string. def matmul_on_gpu(n): if n.type == "MatMul": - return "/gpu:0" + return "/device:GPU:0" else: return "/cpu:0" diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py index 4a8e30f4cbc..5507585a663 100644 --- a/tensorflow/python/framework/ops_test.py +++ b/tensorflow/python/framework/ops_test.py @@ -1555,26 +1555,26 @@ class ColocationGroupTest(test_util.TensorFlowTestCase): def testColocationDeviceInteraction(self): with ops.device("/cpu:0"): - with ops.device("/gpu:0"): + with ops.device("/device:GPU:0"): a = constant_op.constant([2.0], name="a") with ops.colocate_with(a.op): # 'b' is created in the scope of /cpu:0, but it is - # colocated with 'a', which is on '/gpu:0'. colocate_with + # colocated with 'a', which is on '/device:GPU:0'. colocate_with # overrides devices because it is a stronger constraint. 
b = constant_op.constant(3.0) self.assertEqual([b"loc:@a"], b.op.colocation_groups()) self.assertEqual(a.op.device, b.op.device) def testColocationCanonicalization(self): - with ops.device("/gpu:0"): + with ops.device("/device:GPU:0"): _ = constant_op.constant(2.0) - with ops.device(lambda op: "/gpu:0"): + with ops.device(lambda op: "/device:GPU:0"): b = constant_op.constant(3.0) with ops.get_default_graph().colocate_with(b): - with ops.device("/gpu:0"): + with ops.device("/device:GPU:0"): c = constant_op.constant(4.0) - # A's device will be /gpu:0 + # A's device will be /device:GPU:0 # B's device will be /device:GPU:0 # C's device will be /device:GPU:0 because it # inherits B's device name, after canonicalizing the names. @@ -1582,10 +1582,10 @@ def testLocationOverrides(self): with ops.device("/cpu:0"): - with ops.device("/gpu:0"): + with ops.device("/device:GPU:0"): a = constant_op.constant([2.0], name="a") # Note that this colocation is "redundant", since we are - # within the scope of "/gpu:0". However, we would like to + # within the scope of "/device:GPU:0". However, we would like to # preserve in the GraphDef that these two ops should be # colocated in a portable way. with ops.colocate_with(a.op): @@ -1652,7 +1652,7 @@ self.assertEqual([b"loc:@a"], b.op.colocation_groups()) def testInconsistentDeviceWithinColocate(self): - with ops.device("/gpu:0"): + with ops.device("/device:GPU:0"): a = constant_op.constant([2.0], name="a") with ops.colocate_with(a.op): # This is allowed due to legacy but clearly wrong, since we diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index d9e507d23ce..e159cfa44bd 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -405,7 +405,7 @@ class TensorFlowTestCase(googletest.TestCase): trigger the creation of a new session. Use the `use_gpu` and `force_gpu` options to control where ops are run. If - `force_gpu` is True, all ops are pinned to `/gpu:0`. Otherwise, if `use_gpu` + `force_gpu` is True, all ops are pinned to `/device:GPU:0`. Otherwise, if `use_gpu` is True, TensorFlow tries to run as many ops on the GPU as possible. If both `force_gpu` and `use_gpu` are False, all ops are pinned to the CPU. @@ -427,7 +427,7 @@ config: An optional config_pb2.ConfigProto to use to configure the session. use_gpu: If True, attempt to run as many ops as possible on GPU. - force_gpu: If True, pin all ops to `/gpu:0`. + force_gpu: If True, pin all ops to `/device:GPU:0`.
Returns: A Session object that should be used as a context manager to surround @@ -466,11 +466,11 @@ class TensorFlowTestCase(googletest.TestCase): sess = self._cached_session with sess.graph.as_default(), sess.as_default(): if force_gpu: - # Use the name of an actual device if one is detected, or '/gpu:0' + # Use the name of an actual device if one is detected, or '/device:GPU:0' # otherwise gpu_name = gpu_device_name() if not gpu_name: - gpu_name = "/gpu:0" + gpu_name = "/device:GPU:0" with sess.graph.device(gpu_name): yield sess elif use_gpu: @@ -481,11 +481,11 @@ class TensorFlowTestCase(googletest.TestCase): else: with session.Session(graph=graph, config=prepare_config(config)) as sess: if force_gpu: - # Use the name of an actual device if one is detected, or '/gpu:0' + # Use the name of an actual device if one is detected, or '/device:GPU:0' # otherwise gpu_name = gpu_device_name() if not gpu_name: - gpu_name = "/gpu:0" + gpu_name = "/device:GPU:0" with sess.graph.device(gpu_name): yield sess elif use_gpu: diff --git a/tensorflow/python/kernel_tests/basic_gpu_test.py b/tensorflow/python/kernel_tests/basic_gpu_test.py index 155aad8bd9a..405651e8ae9 100644 --- a/tensorflow/python/kernel_tests/basic_gpu_test.py +++ b/tensorflow/python/kernel_tests/basic_gpu_test.py @@ -238,7 +238,7 @@ class GpuMultiSessionMemoryTest(test_util.TensorFlowTestCase): n_iterations = 500 with session as s: data = variables.Variable(1.0) - with ops.device('/gpu:0'): + with ops.device('/device:GPU:0'): random_seed.set_random_seed(1) matrix1 = variables.Variable( random_ops.truncated_normal([1024, 1]), name='matrix1') diff --git a/tensorflow/python/kernel_tests/cholesky_op_test.py b/tensorflow/python/kernel_tests/cholesky_op_test.py index 5369d2d5c49..d783522e820 100644 --- a/tensorflow/python/kernel_tests/cholesky_op_test.py +++ b/tensorflow/python/kernel_tests/cholesky_op_test.py @@ -311,7 +311,7 @@ class CholeskyBenchmark(test.Benchmark): if test.is_gpu_available(True): with ops.Graph().as_default(), \ session.Session() as sess, \ - ops.device("/gpu:0"): + ops.device("/device:GPU:0"): l = linalg_ops.cholesky(data) self.run_op_benchmark( sess, @@ -338,11 +338,11 @@ class CholeskyBenchmark(test.Benchmark): if test.is_gpu_available(True): _BenchmarkGrad( - MatrixInverseCompositeGrad, "composite_matrix_inverse", "/gpu:0") + MatrixInverseCompositeGrad, "composite_matrix_inverse", "/device:GPU:0") _BenchmarkGrad( - TriAngInvCompositeGrad, "composite_tri_ang_inverse", "/gpu:0") + TriAngInvCompositeGrad, "composite_tri_ang_inverse", "/device:GPU:0") _BenchmarkGrad( - TriAngSolveCompositeGrad, "composite_triangular_solve", "/gpu:0") + TriAngSolveCompositeGrad, "composite_triangular_solve", "/device:GPU:0") _BenchmarkGrad( MatrixInverseCompositeGrad, "composite_matrix_inverse", "/cpu:0") diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py index 91694cd0b25..e3aac5019c1 100644 --- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py +++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py @@ -1423,9 +1423,8 @@ class ControlFlowTest(test.TestCase): self.assertEqual(45, rx.eval()) def _testWhileGrad_ColocateGradients(self, colocate): - gpu_dev_name = test.gpu_device_name().lower() if test.is_gpu_available( - ) else "/gpu:0" - gpu_short_name = gpu_dev_name.split("/")[-1] + gpu_dev_name = test.gpu_device_name() if test.is_gpu_available( + ) else "/device:GPU:0" with self.test_session(graph=ops.Graph()) as sess: v = 
constant_op.constant(2.0, name="v") @@ -1439,19 +1438,19 @@ class ControlFlowTest(test.TestCase): r = gradients_impl.gradients( loop, v, colocate_gradients_with_ops=colocate)[0] r_ops = r.graph.get_operations() - r_devices = [(op.name, op.device.lower()) for op in r_ops] + r_devices = [(op.name, op.device) for op in r_ops] self.assertTrue(any("Square" in op.name for op in r_ops)) for (name, dev) in r_devices: if not colocate and name.endswith("Square"): # Only forward graph contain gpu in Square device - self.assertTrue(gpu_short_name in dev) + self.assertTrue(gpu_dev_name in dev) elif colocate and "Square" in name: # Forward and backward graphs contain gpu in Square/Square_grad devices - self.assertTrue(gpu_short_name in dev) + self.assertTrue(gpu_dev_name in dev) else: - self.assertFalse(gpu_short_name in dev) + self.assertFalse(gpu_dev_name in dev) self.assertAllClose(1024.0, sess.run(r)) def testWhileGrad_ColocateGradients(self): @@ -2426,7 +2425,7 @@ class ControlFlowTest(test.TestCase): # device set on tensor, default device on graph => default device on dep. vdef = variables.Variable([0.0], name="vdef") - with ops.device("/job:worker/gpu:1"): + with ops.device("/job:worker/device:GPU:1"): with_vdef_dep = control_flow_ops.with_dependencies([vdef.initializer], vdef) # The device is empty, but the colocation constraint is set. diff --git a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py index a0bd178e247..e20c6992525 100644 --- a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py +++ b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py @@ -347,7 +347,7 @@ def sparse_tensor_dense_vs_dense_matmul_benchmark(thresh, ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_dense( x_t, y_t, adjoint_a, adjoint_b) else: - with ops.device("/gpu:0"): + with ops.device("/device:GPU:0"): x_t = constant_op.constant(x) y_t = constant_op.constant(y) ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_dense( @@ -365,7 +365,7 @@ def sparse_tensor_dense_vs_dense_matmul_benchmark(thresh, ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_sparse( x_ind, x_val, x_shape, y_t, adjoint_a, adjoint_b) else: - with ops.device("/gpu:0"): + with ops.device("/device:GPU:0"): x_ind = constant_op.constant(np.vstack(np.where(x)).astype(np.int64).T) x_val = constant_op.constant(x[np.where(x)]) x_shape = constant_op.constant(np.array(x.shape).astype(np.int64)) diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py index c8dafa4c3dd..b1a9458f7a2 100644 --- a/tensorflow/python/kernel_tests/variable_scope_test.py +++ b/tensorflow/python/kernel_tests/variable_scope_test.py @@ -722,7 +722,7 @@ class VariableScopeTest(test.TestCase): def device_func(op): if op.type in ["Variable", "VariableV2", "VarHandleOp"]: varname_type.append((op.name, op.get_attr("dtype"))) - return "/gpu:0" + return "/device:GPU:0" with g.as_default(): with ops.device(device_func): diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py index aefed34d744..11c204b5b7f 100644 --- a/tensorflow/python/ops/gradients_test.py +++ b/tensorflow/python/ops/gradients_test.py @@ -163,20 +163,20 @@ class GradientsTest(test_util.TensorFlowTestCase): with ops.Graph().as_default() as g: w = constant(1.0, shape=[1, 1]) x = constant(1.0, shape=[1, 2]) - with g.device("/gpu:0"): + with g.device("/device:GPU:0"): wx = 
math_ops.matmul(w, x) gw = gradients.gradients(wx, [w], colocate_gradients_with_ops=True)[0] self.assertEqual(gw.op.colocation_groups(), wx.op.colocation_groups()) def testColocateGradientsWithAggregation(self): with ops.Graph().as_default() as g: - with g.device("/gpu:1"): + with g.device("/device:GPU:1"): w = constant(1.0, shape=[1, 1]) x = constant(1.0, shape=[1, 2]) y = constant(1.0, shape=[1, 2]) wx = math_ops.matmul(w, x) wy = math_ops.matmul(w, y) - with g.device("/gpu:0"): + with g.device("/device:GPU:0"): z = wx + wy gw1 = gradients.gradients(z, [w], colocate_gradients_with_ops=True)[0] @@ -187,7 +187,7 @@ class GradientsTest(test_util.TensorFlowTestCase): def testColocateGradientsWithAggregationInMultipleDevices(self): with ops.Graph().as_default() as g: - with g.device("/gpu:1"): + with g.device("/device:GPU:1"): w = constant(1.0, shape=[1, 1]) x = constant(1.0, shape=[1, 2]) y = constant(1.0, shape=[1, 2]) @@ -195,7 +195,7 @@ class GradientsTest(test_util.TensorFlowTestCase): wx = math_ops.matmul(w, x) with g.device("/task:2"): wy = math_ops.matmul(w, y) - with g.device("/gpu:0"): + with g.device("/device:GPU:0"): z = wx + wy gw1 = gradients.gradients(z, [w], colocate_gradients_with_ops=True)[0] diff --git a/tensorflow/python/ops/matmul_benchmark.py b/tensorflow/python/ops/matmul_benchmark.py index b777ace9d0c..f95cf08de1a 100644 --- a/tensorflow/python/ops/matmul_benchmark.py +++ b/tensorflow/python/ops/matmul_benchmark.py @@ -47,7 +47,7 @@ def build_graph(device, n, m, k, transpose_a, transpose_b, dtype): Returns: A matmul operation to run() """ - with ops.device('/%s:0' % device): + with ops.device('%s' % device): if not transpose_a: x = variables.Variable(random_ops.random_uniform([n, m], dtype=dtype)) else: @@ -112,7 +112,7 @@ class MatmulBenchmark(test.Benchmark): return duration def run_test_gpu(self, n, m, k, transpose_a, transpose_b, dtype, num_iters): - self.run_graph('gpu', n, m, k, transpose_a, transpose_b, num_iters, dtype) + self.run_graph(test.gpu_device_name(), n, m, k, transpose_a, transpose_b, num_iters, dtype) def test_round(self, num_iters): dtypes = [np.float32, np.float64] diff --git a/tensorflow/python/ops/matmul_benchmark_test.py b/tensorflow/python/ops/matmul_benchmark_test.py index a7914dba787..5a9c0a7a495 100644 --- a/tensorflow/python/ops/matmul_benchmark_test.py +++ b/tensorflow/python/ops/matmul_benchmark_test.py @@ -71,37 +71,39 @@ class MatmulBenchmarkTest(googletest.TestCase): def _VerifyBuildGraph(self, n, m, k, transpose_a, transpose_b, dtype): graph = ops.Graph() with graph.as_default(): - matmul_benchmark.build_graph("gpu", n, m, k, transpose_a, transpose_b, + matmul_benchmark.build_graph(googletest.gpu_device_name(), n, m, k, transpose_a, transpose_b, dtype) gd = graph.as_graph_def() - self.assertProtoEquals(""" - node { name: "random_uniform/shape" op: "Const" device: "/device:GPU:0" } - node { name: "random_uniform/min" op: "Const" device: "/device:GPU:0" } - node { name: "random_uniform/max" op: "Const" device: "/device:GPU:0" } - node { name: "random_uniform/RandomUniform" op: "RandomUniform" input: "random_uniform/shape" device: "/device:GPU:0" } - node { name: "random_uniform/sub" op: "Sub" input: "random_uniform/max" input: "random_uniform/min" device: "/device:GPU:0" } - node { name: "random_uniform/mul" op: "Mul" input: "random_uniform/RandomUniform" input: "random_uniform/sub" device: "/device:GPU:0" } - node { name: "random_uniform" op: "Add" input: "random_uniform/mul" input: "random_uniform/min" device: "/device:GPU:0" } - node 
{ name: "Variable" op: "VariableV2" device: "/device:GPU:0" } - node { name: "Variable/Assign" op: "Assign" input: "Variable" input: "random_uniform" device: "/device:GPU:0" } - node { name: "Variable/read" op: "Identity" input: "Variable" device: "/device:GPU:0" } - node { name: "random_uniform_1/shape" op: "Const" device: "/device:GPU:0" } - node { name: "random_uniform_1/min" op: "Const" device: "/device:GPU:0" } - node { name: "random_uniform_1/max" op: "Const" device: "/device:GPU:0" } - node { name: "random_uniform_1/RandomUniform" op: "RandomUniform" input: "random_uniform_1/shape" device: "/device:GPU:0" } - node { name: "random_uniform_1/sub" op: "Sub" input: "random_uniform_1/max" input: "random_uniform_1/min" device: "/device:GPU:0" } - node { name: "random_uniform_1/mul" op: "Mul" input: "random_uniform_1/RandomUniform" input: "random_uniform_1/sub" device: "/device:GPU:0" } - node { name: "random_uniform_1" op: "Add" input: "random_uniform_1/mul" input: "random_uniform_1/min" device: "/device:GPU:0" } - node { name: "Variable_1" op: "VariableV2" device: "/device:GPU:0" } - node { name: "Variable_1/Assign" op: "Assign" input: "Variable_1" input: "random_uniform_1" device: "/device:GPU:0" } - node { name: "Variable_1/read" op: "Identity" input: "Variable_1" device: "/device:GPU:0" } - node { name: "MatMul" op: "MatMul" input: "Variable/read" input: "Variable_1/read" device: "/device:GPU:0" } - node { name: "group_deps" op: "NoOp" input: "^MatMul" device: "/device:GPU:0" } - """, self._StripGraph(gd)) + dev=googletest.gpu_device_name() + proto_expected = """ + node { name: "random_uniform/shape" op: "Const" device: \""""+ dev +"""\" } + node { name: "random_uniform/min" op: "Const" device: \""""+ dev +"""\" } + node { name: "random_uniform/max" op: "Const" device: \""""+ dev +"""\" } + node { name: "random_uniform/RandomUniform" op: "RandomUniform" input: "random_uniform/shape" device: \""""+ dev +"""\" } + node { name: "random_uniform/sub" op: "Sub" input: "random_uniform/max" input: "random_uniform/min" device: \""""+ dev +"""\" } + node { name: "random_uniform/mul" op: "Mul" input: "random_uniform/RandomUniform" input: "random_uniform/sub" device: \""""+ dev +"""\" } + node { name: "random_uniform" op: "Add" input: "random_uniform/mul" input: "random_uniform/min" device: \""""+ dev +"""\" } + node { name: "Variable" op: "VariableV2" device: \""""+ dev +"""\" } + node { name: "Variable/Assign" op: "Assign" input: "Variable" input: "random_uniform" device: \""""+ dev +"""\" } + node { name: "Variable/read" op: "Identity" input: "Variable" device: \""""+ dev +"""\" } + node { name: "random_uniform_1/shape" op: "Const" device: \""""+ dev +"""\" } + node { name: "random_uniform_1/min" op: "Const" device: \""""+ dev +"""\" } + node { name: "random_uniform_1/max" op: "Const" device: \""""+ dev +"""\" } + node { name: "random_uniform_1/RandomUniform" op: "RandomUniform" input: "random_uniform_1/shape" device: \""""+ dev +"""\" } + node { name: "random_uniform_1/sub" op: "Sub" input: "random_uniform_1/max" input: "random_uniform_1/min" device: \""""+ dev +"""\" } + node { name: "random_uniform_1/mul" op: "Mul" input: "random_uniform_1/RandomUniform" input: "random_uniform_1/sub" device: \""""+ dev +"""\" } + node { name: "random_uniform_1" op: "Add" input: "random_uniform_1/mul" input: "random_uniform_1/min" device: \""""+ dev +"""\" } + node { name: "Variable_1" op: "VariableV2" device: \""""+ dev +"""\" } + node { name: "Variable_1/Assign" op: "Assign" input: "Variable_1" input: 
"random_uniform_1" device: \""""+ dev +"""\" } + node { name: "Variable_1/read" op: "Identity" input: "Variable_1" device: \""""+ dev +"""\" } + node { name: "MatMul" op: "MatMul" input: "Variable/read" input: "Variable_1/read" device: \""""+ dev +"""\" } + node { name: "group_deps" op: "NoOp" input: "^MatMul" device: \""""+ dev +"""\" } + """ + self.assertProtoEquals(str(proto_expected), self._StripGraph(gd)) def _VerifyRunGraph(self, n, m, k, transpose_a, transpose_b, dtype): benchmark_instance = matmul_benchmark.MatmulBenchmark() - duration = benchmark_instance.run_graph("gpu", n, m, k, transpose_a, + duration = benchmark_instance.run_graph(googletest.gpu_device_name(), n, m, k, transpose_a, transpose_b, 1, dtype) self.assertTrue(duration > 1e-6) diff --git a/tensorflow/python/profiler/internal/run_metadata_test.py b/tensorflow/python/profiler/internal/run_metadata_test.py index 62b2314aea0..b758edf87ef 100644 --- a/tensorflow/python/profiler/internal/run_metadata_test.py +++ b/tensorflow/python/profiler/internal/run_metadata_test.py @@ -97,21 +97,22 @@ class RunMetadataTest(test.TestCase): if not test.is_gpu_available(cuda_only=True): return + gpu_dev = test.gpu_device_name() ops.reset_default_graph() - with ops.device('/gpu:0'): + with ops.device(gpu_dev): tfprof_node, run_meta = _run_model() self.assertEqual(tfprof_node.children[0].name, 'MatMul') self.assertGreater(tfprof_node.children[0].exec_micros, 10) ret = _extract_node(run_meta, ['MatMul', 'MatMul:MatMul']) self.assertEqual(len(ret), 3) - self.assertTrue('/job:localhost/replica:0/task:0/gpu:0' in ret) - del ret['/job:localhost/replica:0/task:0/gpu:0'] + self.assertTrue('/job:localhost/replica:0/task:0' + gpu_dev in ret) + del ret['/job:localhost/replica:0/task:0' + gpu_dev] has_all_stream = False for k, _ in six.iteritems(ret): - self.assertTrue('gpu:0/stream' in k) - if 'gpu:0/stream:all' in k: + self.assertTrue(gpu_dev + '/stream' in k) + if gpu_dev + '/stream:all' in k: has_all_stream = True self.assertTrue(has_all_stream) @@ -159,24 +160,24 @@ class RunMetadataTest(test.TestCase): return ops.reset_default_graph() - with ops.device('/gpu:0'): + with ops.device('/device:GPU:0'): tfprof_node, run_meta = _run_loop_model() # The while-loop caused a node to appear 4 times in scheduling. ret = _extract_node(run_meta, 'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul') - self.assertEqual(len(ret['/job:localhost/replica:0/task:0/gpu:0']), 4) + self.assertEqual(len(ret['/job:localhost/replica:0/task:0/device:GPU:0']), 4) total_cpu_execs = 0 - for node in ret['/job:localhost/replica:0/task:0/gpu:0']: + for node in ret['/job:localhost/replica:0/task:0/device:GPU:0']: total_cpu_execs += node.op_end_rel_micros ret = _extract_node( run_meta, 'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul:MatMul') - self.assertGreaterEqual(len(ret['/gpu:0/stream:all']), 4) + self.assertGreaterEqual(len(ret['/device:GPU:0/stream:all']), 4) total_accelerator_execs = 0 - for node in ret['/gpu:0/stream:all']: + for node in ret['/device:GPU:0/stream:all']: total_accelerator_execs += node.op_end_rel_micros mm_node = lib.SearchTFProfNode( diff --git a/tensorflow/python/profiler/option_builder.py b/tensorflow/python/profiler/option_builder.py index e2e022425dd..502fc49bb62 100644 --- a/tensorflow/python/profiler/option_builder.py +++ b/tensorflow/python/profiler/option_builder.py @@ -315,7 +315,7 @@ class ProfileOptionBuilder(object): """Selectively counting statistics based on node types. Here, 'types' means the profiler nodes' properties. 
Profiler by default - consider device name (e.g. /job:xx/.../gpu:0) and operation type + considers device name (e.g. /job:xx/.../device:GPU:0) and operation type (e.g. MatMul) as profiler nodes' properties. User can also associate customized 'types' to profiler nodes through OpLogProto proto. diff --git a/tensorflow/tools/graph_transforms/remove_device_test.cc b/tensorflow/tools/graph_transforms/remove_device_test.cc index 554c5e35952..17a87cd2366 100644 --- a/tensorflow/tools/graph_transforms/remove_device_test.cc +++ b/tensorflow/tools/graph_transforms/remove_device_test.cc @@ -50,7 +50,7 @@ class RemoveDeviceTest : public ::testing::Test { add_node2->set_op("Add"); add_node2->add_input("const_node1"); add_node2->add_input("const_node2"); - add_node2->set_device("//gpu:1"); + add_node2->set_device("//device:GPU:1"); NodeDef* add_node3 = graph_def.add_node(); add_node3->set_name("add_node3");
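For reference, the dump-directory naming scheme that `device_path_to_device_name` in `tensorflow/python/debug/lib/debug_data.py` (patched above) has to invert replaces every colon in a device name with an underscore, so a two-colon component such as `device:GPU:1` becomes `device_GPU_1`. Restoring only the first underscore would yield the wrong name `device:GPU_1`, which is why the `device:` prefix is put back before the remaining underscore is converted. A minimal sketch of the round trip (helper names are illustrative, not part of the patch):

```python
def device_name_to_path_item(device_name):
    # Dump-time renaming: every colon becomes an underscore,
    # e.g. "device:GPU:1" -> "device_GPU_1".
    return device_name.replace(":", "_")


def path_item_to_device_name(path_item):
    # Load-time inverse, mirroring device_path_to_device_name above:
    # restore the "device:" prefix first so two-colon components survive,
    # then restore the single remaining colon.
    return path_item.replace("device_", "device:").replace("_", ":", 1)


# Round trip for a two-colon and a one-colon component.
assert path_item_to_device_name(device_name_to_path_item("device:GPU:1")) == "device:GPU:1"
assert path_item_to_device_name(device_name_to_path_item("job:localhost")) == "job:localhost"
```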