diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc
index f6420aaad8c..ce57f61d436 100644
--- a/tensorflow/core/kernels/matmul_op.cc
+++ b/tensorflow/core/kernels/matmul_op.cc
@@ -210,7 +210,7 @@ REGISTER_CPU(complex64);
 REGISTER_CPU(complex128);
 #if GOOGLE_CUDA
 REGISTER_GPU(float);
-// REGISTER_GPU(double);
+REGISTER_GPU(double);
 #if CUDA_VERSION >= 7050
 REGISTER_GPU(Eigen::half);
 #endif
diff --git a/tensorflow/python/kernel_tests/matmul_op_test.py b/tensorflow/python/kernel_tests/matmul_op_test.py
index 0e7f2efe61f..595f3f41204 100644
--- a/tensorflow/python/kernel_tests/matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/matmul_op_test.py
@@ -95,6 +95,7 @@ class MatMulTest(tf.test.TestCase):
     x = np.arange(1., 5.).reshape([4, 1]).astype(np.float64)
     y = np.arange(1., 3.).reshape([1, 2]).astype(np.float64)
     self._testCpuMatmul(x, y)
+    self._testGpuMatmul(x, y)
 
   def testHalfBasic(self):
     x = np.arange(1., 5.).reshape([4, 1]).astype(np.float16)
@@ -135,6 +136,7 @@ class MatMulTest(tf.test.TestCase):
       x = self._randMatrix(n, k, np.float64)
       y = self._randMatrix(k, m, np.float64)
       self._testCpuMatmul(x, y)
+      self._testGpuMatmul(x, y)
 
   def testHalfRandom(self):
     for _ in range(10):
@@ -185,6 +187,7 @@ class MatMulTest(tf.test.TestCase):
       x = self._randMatrix(k, n, np.float64)
       y = self._randMatrix(m, k, np.float64)
       self._testCpuMatmul(x, y, True, True)
+      self._testGpuMatmul(x, y, True, True)
 
   def testHalfRandomTransposeBoth(self):
     for _ in range(10):
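
For reference, a minimal sketch (not part of the patch) of how the newly registered float64 GPU kernel could be exercised by hand, assuming a CUDA-enabled build of this era with the change applied; with log_device_placement enabled, the MatMul op should be reported on /gpu:0:

    # Hypothetical manual check, not included in the patch.
    import numpy as np
    import tensorflow as tf

    x = np.arange(1., 5.).reshape([4, 1]).astype(np.float64)
    y = np.arange(1., 3.).reshape([1, 2]).astype(np.float64)

    with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
      # Pin the op to the GPU; before this change, registering a float64
      # MatMul on /gpu:0 would fail with "no kernel registered".
      with tf.device("/gpu:0"):
        product = tf.matmul(x, y)
      result = sess.run(product)

    # Compare against the NumPy reference result.
    print(np.allclose(result, np.dot(x, y)))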