diff --git a/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py b/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py
index 215ea97f36d..1088e903109 100644
--- a/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py
+++ b/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -31,15 +30,14 @@ class CompareAndBitpackTest(test.TestCase):
                              x, threshold,
                              truth,
                              expected_err_re=None):
-    with test_util.use_gpu():
-      ans = math_ops.compare_and_bitpack(x, threshold)
-      if expected_err_re is None:
-        tf_ans = self.evaluate(ans)
-        self.assertShapeEqual(truth, ans)
-        self.assertAllEqual(tf_ans, truth)
-      else:
-        with self.assertRaisesOpError(expected_err_re):
-          self.evaluate(ans)
+    ans = math_ops.compare_and_bitpack(x, threshold)
+    if expected_err_re is None:
+      tf_ans = self.evaluate(ans)
+      self.assertShapeEqual(truth, ans)
+      self.assertAllEqual(tf_ans, truth)
+    else:
+      with self.assertRaisesOpError(expected_err_re):
+        self.evaluate(ans)
 
   def _testBasic(self, dtype):
     rows = 371
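
As background for `_testBitpackBase`: `compare_and_bitpack` compares every element of `x` against `threshold` and packs each group of 8 consecutive boolean results along the innermost dimension into one `uint8`, most significant bit first. A minimal NumPy sketch of that contract (illustrative only; it assumes `np.packbits`' default big bit order matches the op's packing):

    import numpy as np

    def compare_and_bitpack_ref(x, threshold):
        # Innermost dimension must be a multiple of 8; each group of 8
        # comparisons becomes one uint8, MSB first (np.packbits default).
        return np.packbits(x > threshold, axis=-1)

    # compare_and_bitpack_ref(np.arange(8), 3) -> array([15], dtype=uint8)
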
diff --git a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
index 4f338880aa3..4d57c1b264a 100644
--- a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
+++ b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
@@ -63,104 +63,99 @@ class DynamicStitchTestBase(object):
         self.assertEqual([None], stitched_t.get_shape().as_list())
 
   def testSimpleOneDimensional(self):
-    with test_util.use_gpu():
-      # Test various datatypes in the simple case to ensure that the op was
-      # registered under those types.
-      dtypes_to_test = [
-          dtypes.float32, dtypes.qint8, dtypes.quint8, dtypes.qint32
+    # Test various datatypes in the simple case to ensure that the op was
+    # registered under those types.
+    dtypes_to_test = [
+        dtypes.float32, dtypes.qint8, dtypes.quint8, dtypes.qint32
+    ]
+    for dtype in dtypes_to_test:
+      indices = [
+          constant_op.constant([0, 4, 7]),
+          constant_op.constant([1, 6, 2, 3, 5])
+      ]
+      data = [
+          math_ops.cast(constant_op.constant([0, 40, 70]), dtype=dtype),
+          math_ops.cast(
+              constant_op.constant([10, 60, 20, 30, 50]), dtype=dtype)
       ]
-      for dtype in dtypes_to_test:
-        indices = [
-            constant_op.constant([0, 4, 7]),
-            constant_op.constant([1, 6, 2, 3, 5])
-        ]
-        data = [
-            math_ops.cast(constant_op.constant([0, 40, 70]), dtype=dtype),
-            math_ops.cast(
-                constant_op.constant([10, 60, 20, 30, 50]), dtype=dtype)
-        ]
-        stitched_t = self.stitch_op(indices, data)
-        stitched_val = self.evaluate(stitched_t)
-        self.assertAllEqual([0, 10, 20, 30, 40, 50, 60, 70], stitched_val)
-        # Dimension 0 is max(flatten(indices))+1.
-        self.assertEqual([8], stitched_t.get_shape().as_list())
-
-  def testOneListOneDimensional(self):
-    with test_util.use_gpu():
-      indices = [constant_op.constant([1, 6, 2, 3, 5, 0, 4, 7])]
-      data = [constant_op.constant([10, 60, 20, 30, 50, 0, 40, 70])]
       stitched_t = self.stitch_op(indices, data)
       stitched_val = self.evaluate(stitched_t)
       self.assertAllEqual([0, 10, 20, 30, 40, 50, 60, 70], stitched_val)
       # Dimension 0 is max(flatten(indices))+1.
       self.assertEqual([8], stitched_t.get_shape().as_list())
 
+  def testOneListOneDimensional(self):
+    indices = [constant_op.constant([1, 6, 2, 3, 5, 0, 4, 7])]
+    data = [constant_op.constant([10, 60, 20, 30, 50, 0, 40, 70])]
+    stitched_t = self.stitch_op(indices, data)
+    stitched_val = self.evaluate(stitched_t)
+    self.assertAllEqual([0, 10, 20, 30, 40, 50, 60, 70], stitched_val)
+    # Dimension 0 is max(flatten(indices))+1.
+    self.assertEqual([8], stitched_t.get_shape().as_list())
+
   def testSimpleTwoDimensional(self):
-    with test_util.use_gpu():
-      indices = [
-          constant_op.constant([0, 4, 7]),
-          constant_op.constant([1, 6]),
-          constant_op.constant([2, 3, 5])
-      ]
-      data = [
-          constant_op.constant([[0, 1], [40, 41], [70, 71]]),
-          constant_op.constant([[10, 11], [60, 61]]),
-          constant_op.constant([[20, 21], [30, 31], [50, 51]])
-      ]
-      stitched_t = self.stitch_op(indices, data)
-      stitched_val = self.evaluate(stitched_t)
-      self.assertAllEqual([[0, 1], [10, 11], [20, 21], [30, 31], [40, 41],
-                           [50, 51], [60, 61], [70, 71]], stitched_val)
-      # Dimension 0 is max(flatten(indices))+1.
-      self.assertEqual([8, 2], stitched_t.get_shape().as_list())
+    indices = [
+        constant_op.constant([0, 4, 7]),
+        constant_op.constant([1, 6]),
+        constant_op.constant([2, 3, 5])
+    ]
+    data = [
+        constant_op.constant([[0, 1], [40, 41], [70, 71]]),
+        constant_op.constant([[10, 11], [60, 61]]),
+        constant_op.constant([[20, 21], [30, 31], [50, 51]])
+    ]
+    stitched_t = self.stitch_op(indices, data)
+    stitched_val = self.evaluate(stitched_t)
+    self.assertAllEqual([[0, 1], [10, 11], [20, 21], [30, 31], [40, 41],
+                         [50, 51], [60, 61], [70, 71]], stitched_val)
+    # Dimension 0 is max(flatten(indices))+1.
+    self.assertEqual([8, 2], stitched_t.get_shape().as_list())
 
   def testZeroSizeTensor(self):
-    with test_util.use_gpu():
-      indices = [
-          constant_op.constant([0, 4, 7]),
-          constant_op.constant([1, 6]),
-          constant_op.constant([2, 3, 5]),
-          array_ops.zeros([0], dtype=dtypes.int32)
-      ]
-      data = [
-          constant_op.constant([[0, 1], [40, 41], [70, 71]]),
-          constant_op.constant([[10, 11], [60, 61]]),
-          constant_op.constant([[20, 21], [30, 31], [50, 51]]),
-          array_ops.zeros([0, 2], dtype=dtypes.int32)
-      ]
-      stitched_t = self.stitch_op(indices, data)
-      stitched_val = self.evaluate(stitched_t)
-      self.assertAllEqual([[0, 1], [10, 11], [20, 21], [30, 31], [40, 41],
-                           [50, 51], [60, 61], [70, 71]], stitched_val)
-      # Dimension 0 is max(flatten(indices))+1.
-      self.assertEqual([8, 2], stitched_t.get_shape().as_list())
+    indices = [
+        constant_op.constant([0, 4, 7]),
+        constant_op.constant([1, 6]),
+        constant_op.constant([2, 3, 5]),
+        array_ops.zeros([0], dtype=dtypes.int32)
+    ]
+    data = [
+        constant_op.constant([[0, 1], [40, 41], [70, 71]]),
+        constant_op.constant([[10, 11], [60, 61]]),
+        constant_op.constant([[20, 21], [30, 31], [50, 51]]),
+        array_ops.zeros([0, 2], dtype=dtypes.int32)
+    ]
+    stitched_t = self.stitch_op(indices, data)
+    stitched_val = self.evaluate(stitched_t)
+    self.assertAllEqual([[0, 1], [10, 11], [20, 21], [30, 31], [40, 41],
+                         [50, 51], [60, 61], [70, 71]], stitched_val)
+    # Dimension 0 is max(flatten(indices))+1.
+    self.assertEqual([8, 2], stitched_t.get_shape().as_list())
 
   @test_util.run_deprecated_v1
   def testHigherRank(self):
-    with self.session(use_gpu=True) as sess:
-      indices = [
-          constant_op.constant(6),
-          constant_op.constant([4, 1]),
-          constant_op.constant([[5, 2], [0, 3]])
-      ]
-      data = [
-          constant_op.constant([61., 62.]),
-          constant_op.constant([[41., 42.], [11., 12.]]),
-          constant_op.constant([[[51., 52.], [21., 22.]],
-                                [[1., 2.], [31., 32.]]])
-      ]
-      stitched_t = self.stitch_op(indices, data)
-      stitched_val = self.evaluate(stitched_t)
-      correct = 10. * np.arange(7)[:, None] + [1., 2.]
-      self.assertAllEqual(correct, stitched_val)
-      self.assertEqual([7, 2], stitched_t.get_shape().as_list())
-      # Test gradients
-      stitched_grad = 7. * stitched_val
-      grads = gradients_impl.gradients(stitched_t, indices + data,
-                                       stitched_grad)
-      self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
-      for datum, grad in zip(data, sess.run(grads[3:])):
-        self.assertAllEqual(7. * self.evaluate(datum), grad)
+    indices = [
+        constant_op.constant(6),
+        constant_op.constant([4, 1]),
+        constant_op.constant([[5, 2], [0, 3]])
+    ]
+    data = [
+        constant_op.constant([61., 62.]),
+        constant_op.constant([[41., 42.], [11., 12.]]),
+        constant_op.constant([[[51., 52.], [21., 22.]],
+                              [[1., 2.], [31., 32.]]])
+    ]
+    stitched_t = self.stitch_op(indices, data)
+    stitched_val = self.evaluate(stitched_t)
+    correct = 10. * np.arange(7)[:, None] + [1., 2.]
+    self.assertAllEqual(correct, stitched_val)
+    self.assertEqual([7, 2], stitched_t.get_shape().as_list())
+    # Test gradients
+    stitched_grad = 7. * stitched_val
+    grads = gradients_impl.gradients(stitched_t, indices + data,
+                                     stitched_grad)
+    self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
+    for datum, grad in zip(data, self.evaluate(grads[3:])):
+      self.assertAllEqual(7. * self.evaluate(datum), grad)
 
   @test_util.run_deprecated_v1
   def testErrorIndicesMultiDimensional(self):
@@ -241,69 +236,66 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
 
   @test_util.run_deprecated_v1
   def testHigherRank(self):
-    with self.session(use_gpu=True) as sess:
-      indices = [
-          constant_op.constant(6),
-          constant_op.constant([4, 1]),
-          constant_op.constant([[5, 2], [0, 3]])
-      ]
-      data = [
-          constant_op.constant([61, 62], dtype=dtypes.float32),
-          constant_op.constant([[41, 42], [11, 12]], dtype=dtypes.float32),
-          constant_op.constant(
-              [[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32)
-      ]
-      stitched_t = data_flow_ops.dynamic_stitch(indices, data)
-      stitched_val = self.evaluate(stitched_t)
-      correct = 10 * np.arange(7)[:, None] + [1.0, 2.0]
-      self.assertAllEqual(correct, stitched_val)
-      self.assertEqual([7, 2], stitched_t.get_shape().as_list())
-      # Test gradients
-      stitched_grad = 7 * stitched_val
-      grads = gradients_impl.gradients(stitched_t, indices + data,
-                                       stitched_grad)
-      self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
-      for datum, grad in zip(data, sess.run(grads[3:])):
-        self.assertAllEqual(7.0 * self.evaluate(datum), grad)
+    indices = [
+        constant_op.constant(6),
+        constant_op.constant([4, 1]),
+        constant_op.constant([[5, 2], [0, 3]])
+    ]
+    data = [
+        constant_op.constant([61, 62], dtype=dtypes.float32),
+        constant_op.constant([[41, 42], [11, 12]], dtype=dtypes.float32),
+        constant_op.constant(
+            [[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32)
+    ]
+    stitched_t = data_flow_ops.dynamic_stitch(indices, data)
+    stitched_val = self.evaluate(stitched_t)
+    correct = 10 * np.arange(7)[:, None] + [1.0, 2.0]
+    self.assertAllEqual(correct, stitched_val)
+    self.assertEqual([7, 2], stitched_t.get_shape().as_list())
+    # Test gradients
+    stitched_grad = 7 * stitched_val
+    grads = gradients_impl.gradients(stitched_t, indices + data,
+                                     stitched_grad)
+    self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
+    for datum, grad in zip(data, self.evaluate(grads[3:])):
+      self.assertAllEqual(7.0 * self.evaluate(datum), grad)
 
   # GPU version unit tests
   def testScalarGPU(self):
-    with self.cached_session():
-      indices = [constant_op.constant(0), constant_op.constant(1)]
-      data = [constant_op.constant(40.0), constant_op.constant(60.0)]
-      for step in -1, 1:
-        stitched_t = data_flow_ops.dynamic_stitch(indices[::step], data)
-        stitched_val = self.evaluate(stitched_t)
-        self.assertAllEqual([40.0, 60.0][::step], stitched_val)
-        # Dimension 0 is max(flatten(indices))+1.
-        self.assertEqual([2], stitched_t.get_shape().as_list())
+    indices = [constant_op.constant(0), constant_op.constant(1)]
+    data = [constant_op.constant(40.0), constant_op.constant(60.0)]
+    for step in -1, 1:
+      stitched_t = data_flow_ops.dynamic_stitch(indices[::step], data)
+      stitched_val = self.evaluate(stitched_t)
+      self.assertAllEqual([40.0, 60.0][::step], stitched_val)
+      # Dimension 0 is max(flatten(indices))+1.
+      self.assertEqual([2], stitched_t.get_shape().as_list())
 
   @test_util.run_deprecated_v1
   def testHigherRankGPU(self):
-    with self.cached_session() as sess:
-      indices = [
-          constant_op.constant(6),
-          constant_op.constant([4, 1]),
-          constant_op.constant([[5, 2], [0, 3]])
-      ]
-      data = [
-          constant_op.constant([61, 62], dtype=dtypes.float32),
-          constant_op.constant([[41, 42], [11, 12]], dtype=dtypes.float32),
-          constant_op.constant(
-              [[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32)
-      ]
-      stitched_t = data_flow_ops.dynamic_stitch(indices, data)
-      stitched_val = self.evaluate(stitched_t)
-      correct = 10 * np.arange(7)[:, None] + [1.0, 2.0]
-      self.assertAllEqual(correct, stitched_val)
-      self.assertEqual([7, 2], stitched_t.get_shape().as_list())
-      # Test gradients
-      stitched_grad = 7 * stitched_val
-      grads = gradients_impl.gradients(stitched_t, indices + data,
-                                       stitched_grad)
-      self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
-      for datum, grad in zip(data, sess.run(grads[3:])):
-        self.assertAllEqual(7.0 * self.evaluate(datum), grad)
+    indices = [
+        constant_op.constant(6),
+        constant_op.constant([4, 1]),
+        constant_op.constant([[5, 2], [0, 3]])
+    ]
+    data = [
+        constant_op.constant([61, 62], dtype=dtypes.float32),
+        constant_op.constant([[41, 42], [11, 12]], dtype=dtypes.float32),
+        constant_op.constant(
+            [[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32)
+    ]
+    stitched_t = data_flow_ops.dynamic_stitch(indices, data)
+    stitched_val = self.evaluate(stitched_t)
+    correct = 10 * np.arange(7)[:, None] + [1.0, 2.0]
+    self.assertAllEqual(correct, stitched_val)
+    self.assertEqual([7, 2], stitched_t.get_shape().as_list())
+    # Test gradients
+    stitched_grad = 7 * stitched_val
+    grads = gradients_impl.gradients(stitched_t, indices + data,
+                                     stitched_grad)
+    self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
+    for datum, grad in zip(data, self.evaluate(grads[3:])):
+      self.assertAllEqual(7.0 * self.evaluate(datum), grad)
 
 
 if __name__ == "__main__":
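
For readers new to the op under test: `dynamic_stitch` interleaves slices from several `data` tensors into one output, with `indices[i]` naming the destination row of each slice of `data[i]`. A rough NumPy sketch of the semantics these tests rely on (an illustration, not the kernel; for `DynamicStitch`, later entries win on duplicate indices, which plain fancy assignment mirrors in practice):

    import numpy as np

    def dynamic_stitch_ref(indices, data):
        indices = [np.asarray(i) for i in indices]
        data = [np.asarray(d) for d in data]
        # Each data[i] has shape indices[i].shape + slice_shape.
        slice_shape = data[0].shape[indices[0].ndim:]
        flat_idx = np.concatenate([i.ravel() for i in indices])
        flat_data = np.concatenate(
            [d.reshape((-1,) + slice_shape) for d in data])
        # Dimension 0 is max(flatten(indices)) + 1, as the tests note.
        out = np.zeros((flat_idx.max() + 1,) + slice_shape, flat_data.dtype)
        out[flat_idx] = flat_data
        return out

    # Matches testSimpleOneDimensional:
    # dynamic_stitch_ref([[0, 4, 7], [1, 6, 2, 3, 5]],
    #                    [[0, 40, 70], [10, 60, 20, 30, 50]])
    # -> array([ 0, 10, 20, 30, 40, 50, 60, 70])
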
diff --git a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
index bb3c0ae8069..2e4244e94a2 100644
--- a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
+++ b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -44,15 +43,14 @@ class ExtractImagePatches(test.TestCase):
     strides = [1] + strides + [1]
     rates = [1] + rates + [1]
 
-    with test_util.use_gpu():
-      out_tensor = array_ops.extract_image_patches(
-          constant_op.constant(image),
-          ksizes=ksizes,
-          strides=strides,
-          rates=rates,
-          padding=padding,
-          name="im2col")
-      self.assertAllClose(patches, self.evaluate(out_tensor))
+    out_tensor = array_ops.extract_image_patches(
+        constant_op.constant(image),
+        ksizes=ksizes,
+        strides=strides,
+        rates=rates,
+        padding=padding,
+        name="im2col")
+    self.assertAllClose(patches, self.evaluate(out_tensor))
 
   def testKsize1x1Stride1x1Rate1x1(self):
     """Verifies that for 1x1 kernel the output equals the input."""
diff --git a/tensorflow/python/kernel_tests/extract_volume_patches_op_test.py b/tensorflow/python/kernel_tests/extract_volume_patches_op_test.py
index 88f7df8fbb6..7a63e590cf3 100644
--- a/tensorflow/python/kernel_tests/extract_volume_patches_op_test.py
+++ b/tensorflow/python/kernel_tests/extract_volume_patches_op_test.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -46,14 +45,13 @@ class ExtractVolumePatches(test.TestCase):
     ksizes = [1] + ksizes + [1]
     strides = [1] + strides + [1]
 
-    with test_util.use_gpu():
-      out_tensor = array_ops.extract_volume_patches(
-          constant_op.constant(image),
-          ksizes=ksizes,
-          strides=strides,
-          padding=padding,
-          name="im2col_3d")
-      self.assertAllClose(patches, self.evaluate(out_tensor))
+    out_tensor = array_ops.extract_volume_patches(
+        constant_op.constant(image),
+        ksizes=ksizes,
+        strides=strides,
+        padding=padding,
+        name="im2col_3d")
+    self.assertAllClose(patches, self.evaluate(out_tensor))
 
   # pylint: disable=bad-whitespace
   def testKsize1x1x1Stride1x1x1(self):
diff --git a/tensorflow/python/kernel_tests/lu_op_test.py b/tensorflow/python/kernel_tests/lu_op_test.py
index 951af020fe7..1c0280c3ce6 100644
--- a/tensorflow/python/kernel_tests/lu_op_test.py
+++ b/tensorflow/python/kernel_tests/lu_op_test.py
@@ -66,64 +66,62 @@ class LuOpTest(test.TestCase):
 
   def _verifyLu(self, x, output_idx_type=dtypes.int64):
     # Verify that Px = LU.
-    with test_util.use_gpu():
+    lu, perm = linalg_ops.lu(x, output_idx_type=output_idx_type)
 
-      lu, perm = linalg_ops.lu(x, output_idx_type=output_idx_type)
+    # Prepare the lower factor of shape num_rows x num_rows
+    lu_shape = np.array(lu.shape.as_list())
+    batch_shape = lu_shape[:-2]
+    num_rows = lu_shape[-2]
+    num_cols = lu_shape[-1]
 
-      # Prepare the lower factor of shape num_rows x num_rows
-      lu_shape = np.array(lu.shape.as_list())
-      batch_shape = lu_shape[:-2]
-      num_rows = lu_shape[-2]
-      num_cols = lu_shape[-1]
+    lower = array_ops.matrix_band_part(lu, -1, 0)
 
-      lower = array_ops.matrix_band_part(lu, -1, 0)
+    if num_rows > num_cols:
+      eye = linalg_ops.eye(
+          num_rows, batch_shape=batch_shape, dtype=lower.dtype)
+      lower = array_ops.concat([lower, eye[..., num_cols:]], axis=-1)
+    elif num_rows < num_cols:
+      lower = lower[..., :num_rows]
 
-      if num_rows > num_cols:
-        eye = linalg_ops.eye(
-            num_rows, batch_shape=batch_shape, dtype=lower.dtype)
-        lower = array_ops.concat([lower, eye[..., num_cols:]], axis=-1)
-      elif num_rows < num_cols:
-        lower = lower[..., :num_rows]
+    # Fill the diagonal with ones.
+    ones_diag = array_ops.ones(
+        np.append(batch_shape, num_rows), dtype=lower.dtype)
+    lower = array_ops.matrix_set_diag(lower, ones_diag)
 
-      # Fill the diagonal with ones.
-      ones_diag = array_ops.ones(
-          np.append(batch_shape, num_rows), dtype=lower.dtype)
-      lower = array_ops.matrix_set_diag(lower, ones_diag)
+    # Prepare the upper factor.
+    upper = array_ops.matrix_band_part(lu, 0, -1)
 
-      # Prepare the upper factor.
-      upper = array_ops.matrix_band_part(lu, 0, -1)
+    verification = math_ops.matmul(lower, upper)
 
-      verification = math_ops.matmul(lower, upper)
+    # Permute the rows of the product of the triangular factors.
+    if num_rows > 0:
+      # Reshape the product of the triangular factors and permutation indices
+      # to a single batch dimension. This makes it easy to apply
+      # invert_permutation and gather_nd ops.
+      perm_reshaped = array_ops.reshape(perm, [-1, num_rows])
+      verification_reshaped = array_ops.reshape(verification,
+                                                [-1, num_rows, num_cols])
+      # Invert the permutation in each batch.
+      inv_perm_reshaped = map_fn.map_fn(array_ops.invert_permutation,
+                                        perm_reshaped)
+      batch_size = perm_reshaped.shape.as_list()[0]
+      # Prepare the batch indices with the same shape as the permutation.
+      # The corresponding batch index is paired with each of the `num_rows`
+      # permutation indices.
+      batch_indices = math_ops.cast(
+          array_ops.broadcast_to(
+              math_ops.range(batch_size)[:, None], perm_reshaped.shape),
+          dtype=output_idx_type)
+      permuted_verification_reshaped = array_ops.gather_nd(
+          verification_reshaped,
+          array_ops.stack([batch_indices, inv_perm_reshaped], axis=-1))
 
-      # Permute the rows of product of the Cholesky factors.
-      if num_rows > 0:
-        # Reshape the product of the triangular factors and permutation indices
-        # to a single batch dimension. This makes it easy to apply
-        # invert_permutation and gather_nd ops.
-        perm_reshaped = array_ops.reshape(perm, [-1, num_rows])
-        verification_reshaped = array_ops.reshape(verification,
-                                                  [-1, num_rows, num_cols])
-        # Invert the permutation in each batch.
-        inv_perm_reshaped = map_fn.map_fn(array_ops.invert_permutation,
-                                          perm_reshaped)
-        batch_size = perm_reshaped.shape.as_list()[0]
-        # Prepare the batch indices with the same shape as the permutation.
-        # The corresponding batch index is paired with each of the `num_rows`
-        # permutation indices.
-        batch_indices = math_ops.cast(
-            array_ops.broadcast_to(
-                math_ops.range(batch_size)[:, None], perm_reshaped.shape),
-            dtype=output_idx_type)
-        permuted_verification_reshaped = array_ops.gather_nd(
-            verification_reshaped,
-            array_ops.stack([batch_indices, inv_perm_reshaped], axis=-1))
+      # Reshape the verification matrix back to the original shape.
+      verification = array_ops.reshape(permuted_verification_reshaped,
+                                       lu_shape)
 
-        # Reshape the verification matrix back to the original shape.
-        verification = array_ops.reshape(permuted_verification_reshaped,
-                                         lu_shape)
-
-      self._verifyLuBase(x, lower, upper, perm, verification,
-                         output_idx_type)
+    self._verifyLuBase(x, lower, upper, perm, verification,
+                       output_idx_type)
 
   def testBasic(self):
     data = np.array([[4., -1., 2.], [-1., 6., 0], [10., 0., 5.]])
@@ -140,46 +138,44 @@ class LuOpTest(test.TestCase):
         self._verifyLu(complex_data, output_idx_type=output_idx_type)
 
   def testPivoting(self):
-    with test_util.use_gpu():
-      # This matrix triggers partial pivoting because the first diagonal entry
-      # is small.
-      data = np.array([[1e-9, 1., 0.], [1., 0., 0], [0., 1., 5]])
-      self._verifyLu(data.astype(np.float32))
+    # This matrix triggers partial pivoting because the first diagonal entry
+    # is small.
+    data = np.array([[1e-9, 1., 0.], [1., 0., 0], [0., 1., 5]])
+    self._verifyLu(data.astype(np.float32))
 
-      for dtype in (np.float32, np.float64):
-        self._verifyLu(data.astype(dtype))
-        _, p = linalg_ops.lu(data)
-        p_val = self.evaluate([p])
-        # Make sure p_val is not the identity permutation.
-        self.assertNotAllClose(np.arange(3), p_val)
+    for dtype in (np.float32, np.float64):
+      self._verifyLu(data.astype(dtype))
+      _, p = linalg_ops.lu(data)
+      p_val = self.evaluate([p])
+      # Make sure p_val is not the identity permutation.
+      self.assertNotAllClose(np.arange(3), p_val)
 
-      for dtype in (np.complex64, np.complex128):
-        complex_data = np.tril(1j * data, -1).astype(dtype)
-        complex_data += np.triu(-1j * data, 1).astype(dtype)
-        complex_data += data
-        self._verifyLu(complex_data)
-        _, p = linalg_ops.lu(data)
-        p_val = self.evaluate([p])
-        # Make sure p_val is not the identity permutation.
-        self.assertNotAllClose(np.arange(3), p_val)
+    for dtype in (np.complex64, np.complex128):
+      complex_data = np.tril(1j * data, -1).astype(dtype)
+      complex_data += np.triu(-1j * data, 1).astype(dtype)
+      complex_data += data
+      self._verifyLu(complex_data)
+      _, p = linalg_ops.lu(data)
+      p_val = self.evaluate([p])
+      # Make sure p_val is not the identity permutation.
+      self.assertNotAllClose(np.arange(3), p_val)
 
   def testInvalidMatrix(self):
     # LU factorization gives an error when the input is singular.
     # Note: A singular matrix may return without error but it won't be a valid
     # factorization.
-    with test_util.use_gpu():
-      for dtype in self.float_types:
-        with self.assertRaises(errors.InvalidArgumentError):
-          self.evaluate(
-              linalg_ops.lu(
-                  np.array([[1., 2., 3.], [2., 4., 6.], [2., 3., 4.]],
-                           dtype=dtype)))
-        with self.assertRaises(errors.InvalidArgumentError):
-          self.evaluate(
-              linalg_ops.lu(
-                  np.array([[[1., 2., 3.], [2., 4., 6.], [1., 2., 3.]],
-                            [[1., 2., 3.], [3., 4., 5.], [5., 6., 7.]]],
-                           dtype=dtype)))
+    for dtype in self.float_types:
+      with self.assertRaises(errors.InvalidArgumentError):
+        self.evaluate(
+            linalg_ops.lu(
+                np.array([[1., 2., 3.], [2., 4., 6.], [2., 3., 4.]],
+                         dtype=dtype)))
+      with self.assertRaises(errors.InvalidArgumentError):
+        self.evaluate(
+            linalg_ops.lu(
+                np.array([[[1., 2., 3.], [2., 4., 6.], [1., 2., 3.]],
+                          [[1., 2., 3.], [3., 4., 5.], [5., 6., 7.]]],
+                         dtype=dtype)))
 
   def testBatch(self):
     simple_array = np.array([[[1., -1.], [2., 5.]]])  # shape (1, 2, 2)
@@ -220,14 +216,13 @@ class LuOpTest(test.TestCase):
 
   @test_util.run_deprecated_v1
   def testConcurrentExecutesWithoutError(self):
-    with test_util.use_gpu():
-      matrix1 = random_ops.random_normal([5, 5], seed=42)
-      matrix2 = random_ops.random_normal([5, 5], seed=42)
-      lu1, p1 = linalg_ops.lu(matrix1)
-      lu2, p2 = linalg_ops.lu(matrix2)
-      lu1_val, p1_val, lu2_val, p2_val = self.evaluate([lu1, p1, lu2, p2])
-      self.assertAllEqual(lu1_val, lu2_val)
-      self.assertAllEqual(p1_val, p2_val)
+    matrix1 = random_ops.random_normal([5, 5], seed=42)
+    matrix2 = random_ops.random_normal([5, 5], seed=42)
+    lu1, p1 = linalg_ops.lu(matrix1)
+    lu2, p2 = linalg_ops.lu(matrix2)
+    lu1_val, p1_val, lu2_val, p2_val = self.evaluate([lu1, p1, lu2, p2])
+    self.assertAllEqual(lu1_val, lu2_val)
+    self.assertAllEqual(p1_val, p2_val)
 
 
 class LuBenchmark(test.Benchmark):
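
The check in `_verifyLu` reduces to one identity: `tf.linalg.lu` returns a packed `lu` (unit-diagonal L strictly below the diagonal, U on and above it) and a permutation `perm`, and the row-permuted input should equal the product of the factors. A single-matrix NumPy sketch of that identity (the test additionally handles batches and rectangular inputs; the `x[perm] == L @ U` convention is inferred from the test's use of `invert_permutation`, so treat it as an assumption):

    import numpy as np

    def check_lu(x, lu, perm):
        n = lu.shape[-1]
        lower = np.tril(lu, -1) + np.eye(n, dtype=lu.dtype)  # unit diagonal
        upper = np.triu(lu)
        # Px = LU, with perm encoding the row permutation P.
        np.testing.assert_allclose(x[perm], lower @ upper,
                                   rtol=1e-5, atol=1e-5)
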
diff --git a/tensorflow/python/kernel_tests/matmul_op_test.py b/tensorflow/python/kernel_tests/matmul_op_test.py
index 022ed496968..66125c17f21 100644
--- a/tensorflow/python/kernel_tests/matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/matmul_op_test.py
@@ -127,7 +127,7 @@ def _GetMatMulGradientTest(a_np_, b_np_, use_static_shape_, **kwargs_):
     epsilon = np.finfo(a_np_.dtype).eps
     delta = epsilon**(1.0 / 3.0)
     tol = 20 * delta
-    with self.session(), test_util.use_gpu():
+    with self.session():
       theoretical, numerical = gradient_checker_v2.compute_gradient(
           lambda x: math_ops.matmul(x, effective_b_np, **kwargs_),
           [effective_a_np],
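
The `delta = epsilon**(1.0 / 3.0)` above is the textbook step size for central differences: truncation error scales like `delta**2` and round-off error like `epsilon / delta`, and their sum is minimized near `epsilon**(1/3)`. A scalar-output sketch of the numerical half of `compute_gradient` (illustrative; the real checker also evaluates the theoretical Jacobian and handles tensor-valued outputs):

    import numpy as np

    def numeric_grad(f, x, delta):
        # Central-difference gradient of a scalar-valued f at x; each
        # entry is perturbed in place and restored after evaluation.
        grad = np.zeros_like(x)
        it = np.nditer(x, flags=["multi_index"])
        while not it.finished:
            idx = it.multi_index
            orig = x[idx]
            x[idx] = orig + delta
            f_plus = f(x)
            x[idx] = orig - delta
            f_minus = f(x)
            x[idx] = orig
            grad[idx] = (f_plus - f_minus) / (2.0 * delta)
            it.iternext()
        return grad
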
diff --git a/tensorflow/python/kernel_tests/matrix_square_root_op_test.py b/tensorflow/python/kernel_tests/matrix_square_root_op_test.py
index e09530b8915..51a90e8f337 100644
--- a/tensorflow/python/kernel_tests/matrix_square_root_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_square_root_op_test.py
@@ -32,12 +32,12 @@ class SquareRootOpTest(test.TestCase):
 
   def _verifySquareRoot(self, matrix, np_type):
     matrix = matrix.astype(np_type)
-    with test_util.use_gpu():
-      # Verify that matmul(sqrtm(A), sqrtm(A)) = A
-      sqrt = gen_linalg_ops.matrix_square_root(matrix)
-      square = math_ops.matmul(sqrt, sqrt)
-      self.assertShapeEqual(matrix, square)
-      self.assertAllClose(matrix, square, rtol=1e-4, atol=1e-3)
+
+    # Verify that matmul(sqrtm(A), sqrtm(A)) = A
+    sqrt = gen_linalg_ops.matrix_square_root(matrix)
+    square = math_ops.matmul(sqrt, sqrt)
+    self.assertShapeEqual(matrix, square)
+    self.assertAllClose(matrix, square, rtol=1e-4, atol=1e-3)
 
   def _verifySquareRootReal(self, x):
     for np_type in [np.float32, np.float64]:
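
`_verifySquareRoot` only checks the defining property `sqrt @ sqrt == A`. For a diagonalizable matrix with no nonpositive real eigenvalues, the principal square root can be written via an eigendecomposition; a NumPy sketch under that assumption (not how the kernel computes it):

    import numpy as np

    def sqrtm_ref(a):
        # A = V diag(w) V^-1  =>  sqrt(A) = V diag(sqrt(w)) V^-1.
        w, v = np.linalg.eig(a)
        return (v * np.sqrt(w.astype(complex))) @ np.linalg.inv(v)

    a = np.array([[4., 1.], [1., 3.]])
    s = sqrtm_ref(a).real
    np.testing.assert_allclose(s @ s, a, rtol=1e-4, atol=1e-3)
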
diff --git a/tensorflow/python/kernel_tests/unstack_op_test.py b/tensorflow/python/kernel_tests/unstack_op_test.py
index f5ba475e7ad..bdf8dc1c83f 100644
--- a/tensorflow/python/kernel_tests/unstack_op_test.py
+++ b/tensorflow/python/kernel_tests/unstack_op_test.py
@@ -41,20 +41,19 @@ class UnstackOpTest(test.TestCase):
 
   def testSimple(self):
     np.random.seed(7)
-    with test_util.use_gpu():
-      for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
-        for dtype in [
-            np.bool, np.float16, np.float32, np.float64, np.int32, np.int64
-        ]:
-          data = np.random.randn(*shape).astype(dtype)
-          # Convert data to a single tensorflow tensor
-          x = constant_op.constant(data)
-          # Unstack into a list of tensors
-          cs = array_ops.unstack(x, num=shape[0])
-          self.assertEqual(type(cs), list)
-          self.assertEqual(len(cs), shape[0])
-          cs = [self.evaluate(c) for c in cs]
-          self.assertAllEqual(cs, data)
+    for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
+      for dtype in [
+          np.bool, np.float16, np.float32, np.float64, np.int32, np.int64
+      ]:
+        data = np.random.randn(*shape).astype(dtype)
+        # Convert data to a single tensorflow tensor
+        x = constant_op.constant(data)
+        # Unstack into a list of tensors
+        cs = array_ops.unstack(x, num=shape[0])
+        self.assertEqual(type(cs), list)
+        self.assertEqual(len(cs), shape[0])
+        cs = [self.evaluate(c) for c in cs]
+        self.assertAllEqual(cs, data)
 
   def testSimpleGpu(self):
     if not test_util.is_gpu_available():
@@ -80,7 +79,7 @@ class UnstackOpTest(test.TestCase):
       data = np.random.randn(*shape)
       shapes = [shape[1:]] * shape[0]
       for i in xrange(shape[0]):
-        with self.cached_session(use_gpu=True):
+        with self.cached_session():
           x = constant_op.constant(data)
           cs = array_ops.unstack(x, num=shape[0])
           err = gradient_checker.compute_gradient_error(x, shape, cs[i],
@@ -94,7 +93,7 @@ class UnstackOpTest(test.TestCase):
       out_shape = list(shape)
       del out_shape[1]
       for i in xrange(shape[1]):
-        with self.cached_session(use_gpu=True):
+        with self.cached_session():
           x = constant_op.constant(data)
           cs = array_ops.unstack(x, num=shape[1], axis=1)
           err = gradient_checker.compute_gradient_error(x, shape, cs[i],
@@ -103,12 +102,11 @@ class UnstackOpTest(test.TestCase):
 
   @test_util.run_deprecated_v1
   def testInferNum(self):
-    with self.cached_session():
-      for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
-        x = array_ops.placeholder(np.float32, shape=shape)
-        cs = array_ops.unstack(x)
-        self.assertEqual(type(cs), list)
-        self.assertEqual(len(cs), shape[0])
+    for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
+      x = array_ops.placeholder(np.float32, shape=shape)
+      cs = array_ops.unstack(x)
+      self.assertEqual(type(cs), list)
+      self.assertEqual(len(cs), shape[0])
 
   @test_util.run_deprecated_v1
   def testCannotInferNumFromUnknownShape(self):
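
For reference, the contract `testSimple` and `testInferNum` exercise: `tf.unstack` splits a tensor into a Python list along an axis, removing that axis from each piece (so `num` can be inferred from a static shape, as `testInferNum` checks). A NumPy equivalent of the semantics (a sketch, not TensorFlow code):

    import numpy as np

    def unstack_ref(x, axis=0):
        # One array per position along `axis`, with that axis squeezed out.
        return [np.squeeze(s, axis=axis)
                for s in np.split(x, x.shape[axis], axis=axis)]

    # unstack_ref(np.arange(6).reshape(2, 3))
    # -> [array([0, 1, 2]), array([3, 4, 5])]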