diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_test.py
index 791ff3e03ab..1b4b95d5d01 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_test.py
@@ -193,7 +193,7 @@ class MultivariateNormalCholeskyTest(tf.test.TestCase):
     mat = self._rng.rand(*shape)
     chol = distributions.matrix_diag_transform(mat, transform=tf.nn.softplus)
     chol = tf.matrix_band_part(chol, -1, 0)
-    sigma = tf.batch_matmul(chol, chol, adj_y=True)
+    sigma = tf.matmul(chol, chol, adjoint_b=True)
     return chol.eval(), sigma.eval()
 
   def testNonmatchingMuSigmaFailsStatic(self):
@@ -391,7 +391,7 @@ class MultivariateNormalFullTest(tf.test.TestCase):
     # This ensures sigma is positive def.
     mat_shape = batch_shape + event_shape + event_shape
     mat = self._rng.randn(*mat_shape)
-    sigma = tf.batch_matmul(mat, mat, adj_y=True).eval()
+    sigma = tf.matmul(mat, mat, adjoint_b=True).eval()
 
     mu_shape = batch_shape + event_shape
     mu = self._rng.randn(*mu_shape)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_cholesky_test.py b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_cholesky_test.py
index 4da3577afaa..d318f34291c 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_cholesky_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_cholesky_test.py
@@ -84,7 +84,7 @@ class OperatorPDCholeskyTest(tf.test.TestCase):
         operator = operator_pd_cholesky.OperatorPDCholesky(chol)
 
         sqrt_operator_times_x = operator.sqrt_matmul(x)
-        expected = tf.batch_matmul(chol, x)
+        expected = tf.matmul(chol, x)
 
         self.assertEqual(expected.get_shape(),
                          sqrt_operator_times_x.get_shape())
@@ -102,7 +102,7 @@ class OperatorPDCholeskyTest(tf.test.TestCase):
         operator = operator_pd_cholesky.OperatorPDCholesky(chol)
 
         sqrt_operator_times_x = operator.sqrt_matmul(x)
-        expected = tf.batch_matmul(chol, x)
+        expected = tf.matmul(chol, x)
 
         self.assertEqual(expected.get_shape(),
                          sqrt_operator_times_x.get_shape())
@@ -121,7 +121,7 @@ class OperatorPDCholeskyTest(tf.test.TestCase):
 
         sqrt_operator_times_x = operator.sqrt_matmul(x, transpose_x=True)
         # tf.batch_matmul is defined x * y, so "y" is on the right, not "x".
-        expected = tf.batch_matmul(chol, x, adj_y=True)
+        expected = tf.matmul(chol, x, adjoint_b=True)
 
         self.assertEqual(expected.get_shape(),
                          sqrt_operator_times_x.get_shape())
@@ -135,11 +135,11 @@ class OperatorPDCholeskyTest(tf.test.TestCase):
         x = self._rng.rand(*x_shape)
         chol_shape = batch_shape + (k, k)
         chol = self._random_cholesky_array(chol_shape)
-        matrix = tf.batch_matmul(chol, chol, adj_y=True)
+        matrix = tf.matmul(chol, chol, adjoint_b=True)
 
         operator = operator_pd_cholesky.OperatorPDCholesky(chol)
 
-        expected = tf.batch_matmul(matrix, x)
+        expected = tf.matmul(matrix, x)
 
         self.assertEqual(expected.get_shape(), operator.matmul(x).get_shape())
         self.assertAllClose(expected.eval(), operator.matmul(x).eval())
@@ -152,11 +152,11 @@ class OperatorPDCholeskyTest(tf.test.TestCase):
         x = self._rng.rand(*x_shape)
         chol_shape = batch_shape + (k, k)
         chol = self._random_cholesky_array(chol_shape)
-        matrix = tf.batch_matmul(chol, chol, adj_y=True)
+        matrix = tf.matmul(chol, chol, adjoint_b=True)
 
         operator = operator_pd_cholesky.OperatorPDCholesky(chol)
 
-        expected = tf.batch_matmul(matrix, x)
+        expected = tf.matmul(matrix, x)
 
         self.assertEqual(expected.get_shape(), operator.matmul(x).get_shape())
         self.assertAllClose(expected.eval(), operator.matmul(x).eval())
@@ -169,13 +169,13 @@ class OperatorPDCholeskyTest(tf.test.TestCase):
         x = self._rng.rand(*x_shape)
         chol_shape = batch_shape + (k, k)
         chol = self._random_cholesky_array(chol_shape)
-        matrix = tf.batch_matmul(chol, chol, adj_y=True)
+        matrix = tf.matmul(chol, chol, adjoint_b=True)
 
         operator = operator_pd_cholesky.OperatorPDCholesky(chol)
         operator_times_x = operator.matmul(x, transpose_x=True)
 
         # tf.batch_matmul is defined x * y, so "y" is on the right, not "x".
-        expected = tf.batch_matmul(matrix, x, adj_y=True)
+        expected = tf.matmul(matrix, x, adjoint_b=True)
 
         self.assertEqual(expected.get_shape(), operator_times_x.get_shape())
         self.assertAllClose(expected.eval(), operator_times_x.eval())
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_full_test.py b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_full_test.py
index 188df296219..34b4cf6961a 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_full_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_full_test.py
@@ -32,7 +32,7 @@ class OperatorPDFullTest(tf.test.TestCase):
 
   def _random_positive_def_array(self, *shape):
     matrix = self._rng.rand(*shape)
-    return tf.batch_matmul(matrix, matrix, adj_y=True).eval()
+    return tf.matmul(matrix, matrix, adjoint_b=True).eval()
 
   def testPositiveDefiniteMatrixDoesntRaise(self):
     with self.test_session():
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_test.py b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_test.py
index 6514b9f1fc3..3beac24e79a 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_test.py
@@ -75,7 +75,7 @@ class OperatorSolve(OperatorShape):
   """Operator implements .solve."""
 
   def __init__(self, chol):
-    self._pos_def_matrix = tf.batch_matmul(chol, chol, adj_y=True)
+    self._pos_def_matrix = tf.matmul(chol, chol, adjoint_b=True)
     super(OperatorSolve, self).__init__(chol.shape)
 
   def _solve(self, rhs):
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_vdvt_update_test.py b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_vdvt_update_test.py
index b85fb0ec7bb..1bd38a644a6 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_vdvt_update_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_vdvt_update_test.py
@@ -38,7 +38,7 @@ class OperatorPDSqrtVDVTUpdateTest(
   def _random_pd_matrix(self, shape):
     # With probability 1 this is positive definite.
     sqrt = self._rng.randn(*shape)
-    mat = tf.batch_matmul(sqrt, sqrt, adj_y=True)
+    mat = tf.matmul(sqrt, sqrt, adjoint_b=True)
     return mat.eval()
 
   def _random_v_and_diag(self, mat_shape, v_matrix_rank):
@@ -67,11 +67,11 @@ class OperatorPDSqrtVDVTUpdateTest(
       diag_vt = tf.matrix_transpose(v)
     else:
       diag_mat = tf.matrix_diag(diag)
-      diag_vt = tf.batch_matmul(diag_mat, v, adj_y=True)
+      diag_vt = tf.matmul(diag_mat, v, adjoint_b=True)
 
-    v_diag_vt = tf.batch_matmul(v, diag_vt)
+    v_diag_vt = tf.matmul(v, diag_vt)
     sqrt = mat + v_diag_vt
-    a = tf.batch_matmul(sqrt, sqrt, adj_y=True)
+    a = tf.matmul(sqrt, sqrt, adjoint_b=True)
     return a.eval()
 
   def _build_operator_and_mat(self, batch_shape, k, dtype=np.float64):
diff --git a/tensorflow/contrib/distributions/python/ops/bijector.py b/tensorflow/contrib/distributions/python/ops/bijector.py
index a8b552bb404..1fc9e6eda77 100644
--- a/tensorflow/contrib/distributions/python/ops/bijector.py
+++ b/tensorflow/contrib/distributions/python/ops/bijector.py
@@ -1394,7 +1394,7 @@ class ScaleAndShift(Bijector):
 
   def _forward(self, x):
     x, sample_shape = self.shaper.make_batch_of_event_sample_matrices(x)
-    x = math_ops.batch_matmul(self.scale, x)
+    x = math_ops.matmul(self.scale, x)
     x = self.shaper.undo_make_batch_of_event_sample_matrices(x, sample_shape)
     x += self.shift
     return x
@@ -1776,7 +1776,7 @@ class CholeskyOuterProduct(Bijector):
       x = control_flow_ops.with_dependencies([is_matrix, is_square], x)
     # For safety, explicitly zero-out the upper triangular part.
     x = array_ops.matrix_band_part(x, -1, 0)
-    return math_ops.batch_matmul(x, x, adj_y=True)
+    return math_ops.matmul(x, x, adjoint_b=True)
 
   def _inverse_and_inverse_log_det_jacobian(self, y):
     x = (math_ops.sqrt(y) if self._static_event_ndims == 0
@@ -1855,8 +1855,7 @@ class CholeskyOuterProduct(Bijector):
         dim=1)
 
     sum_weighted_log_diag = array_ops.squeeze(
-        math_ops.batch_matmul(math_ops.log(diag), exponents),
-        squeeze_dims=-1)
+        math_ops.matmul(math_ops.log(diag), exponents), squeeze_dims=-1)
     fldj = p * math.log(2.) + sum_weighted_log_diag
 
     if x.get_shape().ndims is not None:
diff --git a/tensorflow/contrib/distributions/python/ops/dirichlet.py b/tensorflow/contrib/distributions/python/ops/dirichlet.py
index a20fdb7b771..2a2ea4ec260 100644
--- a/tensorflow/contrib/distributions/python/ops/dirichlet.py
+++ b/tensorflow/contrib/distributions/python/ops/dirichlet.py
@@ -216,9 +216,11 @@ class Dirichlet(distribution.Distribution):
   def _variance(self):
     scale = self.alpha_sum * math_ops.sqrt(1. + self.alpha_sum)
     alpha = self.alpha / scale
-    outer_prod = -math_ops.batch_matmul(
-        array_ops.expand_dims(alpha, dim=-1),  # column
-        array_ops.expand_dims(alpha, dim=-2))  # row
+    outer_prod = -math_ops.matmul(
+        array_ops.expand_dims(
+            alpha, dim=-1),  # column
+        array_ops.expand_dims(
+            alpha, dim=-2))  # row
     return array_ops.matrix_set_diag(outer_prod,
                                      alpha * (self.alpha_sum / scale - alpha))
 
diff --git a/tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py b/tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py
index e55125b9289..970cabe21f0 100644
--- a/tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py
+++ b/tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py
@@ -245,7 +245,7 @@ class DirichletMultinomial(distribution.Distribution):
   def _variance(self):
     alpha_sum = array_ops.expand_dims(self.alpha_sum, -1)
     normalized_alpha = self.alpha / alpha_sum
-    variance = -math_ops.batch_matmul(
+    variance = -math_ops.matmul(
         array_ops.expand_dims(normalized_alpha, -1),
         array_ops.expand_dims(normalized_alpha, -2))
     variance = array_ops.matrix_set_diag(variance, normalized_alpha *
diff --git a/tensorflow/contrib/distributions/python/ops/multinomial.py b/tensorflow/contrib/distributions/python/ops/multinomial.py
index 91d6435645f..845e74b8a96 100644
--- a/tensorflow/contrib/distributions/python/ops/multinomial.py
+++ b/tensorflow/contrib/distributions/python/ops/multinomial.py
@@ -223,9 +223,8 @@ class Multinomial(distribution.Distribution):
 
   def _variance(self):
     p = self.p * array_ops.expand_dims(array_ops.ones_like(self.n), -1)
-    outer_prod = math_ops.batch_matmul(
-        array_ops.expand_dims(self._mean_val, -1),
-        array_ops.expand_dims(p, -2))
+    outer_prod = math_ops.matmul(
+        array_ops.expand_dims(self._mean_val, -1), array_ops.expand_dims(p, -2))
     return array_ops.matrix_set_diag(-outer_prod,
                                      self._mean_val - self._mean_val * p)
 
diff --git a/tensorflow/contrib/distributions/python/ops/operator_pd.py b/tensorflow/contrib/distributions/python/ops/operator_pd.py
index 313fec3753a..c679104c085 100644
--- a/tensorflow/contrib/distributions/python/ops/operator_pd.py
+++ b/tensorflow/contrib/distributions/python/ops/operator_pd.py
@@ -437,7 +437,7 @@ class OperatorPDBase(object):
       name:  A name to give this `Op`.
 
     Returns:
-      A result equivalent to `tf.batch_matmul(self.to_dense(), x)`.
+      A result equivalent to `tf.matmul(self.to_dense(), x)`.
     """
     with ops.name_scope(self.name):
       with ops.name_scope(name, values=[x] + self.inputs):
@@ -471,7 +471,7 @@ class OperatorPDBase(object):
       name:  A name scope to use for ops added by this method.
 
     Returns:
-      A result equivalent to `tf.batch_matmul(self.sqrt_to_dense(), x)`.
+      A result equivalent to `tf.matmul(self.sqrt_to_dense(), x)`.
     """
     with ops.name_scope(self.name):
       with ops.name_scope(name, values=[x] + self.inputs):
diff --git a/tensorflow/contrib/distributions/python/ops/operator_pd_cholesky.py b/tensorflow/contrib/distributions/python/ops/operator_pd_cholesky.py
index dce536e9e17..2f3ff0be2e9 100644
--- a/tensorflow/contrib/distributions/python/ops/operator_pd_cholesky.py
+++ b/tensorflow/contrib/distributions/python/ops/operator_pd_cholesky.py
@@ -127,21 +127,21 @@ class OperatorPDCholesky(operator_pd.OperatorPDBase):
     return math_ops.matmul(chol, chol_times_x)
 
   def _batch_matmul(self, x, transpose_x=False):
-    # tf.batch_matmul is defined x * y, so "y" is on the right, not "x".
+    # tf.matmul is defined x * y, so "y" is on the right, not "x".
     chol = array_ops.matrix_band_part(self._chol, -1, 0)
-    chol_times_x = math_ops.batch_matmul(
-        chol, x, adj_x=True, adj_y=transpose_x)
-    return math_ops.batch_matmul(chol, chol_times_x)
+    chol_times_x = math_ops.matmul(
+        chol, x, adjoint_a=True, adjoint_b=transpose_x)
+    return math_ops.matmul(chol, chol_times_x)
 
   def _sqrt_matmul(self, x, transpose_x=False):
     chol = array_ops.matrix_band_part(self._chol, -1, 0)
     # tf.matmul is defined a * b
-    return math_ops.matmul(chol, x, transpose_b=transpose_x)
+    return math_ops.matmul(chol, x, adjoint_b=transpose_x)
 
   def _batch_sqrt_matmul(self, x, transpose_x=False):
     chol = array_ops.matrix_band_part(self._chol, -1, 0)
     # tf.batch_matmul is defined x * y, so "y" is on the right, not "x".
-    return math_ops.batch_matmul(chol, x, adj_y=transpose_x)
+    return math_ops.matmul(chol, x, adjoint_b=transpose_x)
 
   def _batch_solve(self, rhs):
     return linalg_ops.cholesky_solve(self._chol, rhs)
@@ -181,4 +181,4 @@ class OperatorPDCholesky(operator_pd.OperatorPDBase):
 
   def _to_dense(self):
     chol = array_ops.matrix_band_part(self._chol, -1, 0)
-    return math_ops.batch_matmul(chol, chol, adj_y=True)
+    return math_ops.matmul(chol, chol, adjoint_b=True)
diff --git a/tensorflow/contrib/distributions/python/ops/operator_pd_vdvt_update.py b/tensorflow/contrib/distributions/python/ops/operator_pd_vdvt_update.py
index e858315f3f1..eff3133c448 100644
--- a/tensorflow/contrib/distributions/python/ops/operator_pd_vdvt_update.py
+++ b/tensorflow/contrib/distributions/python/ops/operator_pd_vdvt_update.py
@@ -319,10 +319,7 @@ class OperatorPDSqrtVDVTUpdate(operator_pd.OperatorPDBase):
     # M^{-1} V
     minv_v = self._operator.solve(self._v)
     # V^T M^{-1} V
-    if batch_mode:
-      vt_minv_v = math_ops.batch_matmul(self._v, minv_v, adj_x=True)
-    else:
-      vt_minv_v = math_ops.matmul(self._v, minv_v, transpose_a=True)
+    vt_minv_v = math_ops.matmul(self._v, minv_v, adjoint_a=True)
 
     # D^{-1} + V^T M^{-1} V
     capacitance = self._diag_inv_operator.add_to_tensor(vt_minv_v)
@@ -360,14 +357,14 @@ class OperatorPDSqrtVDVTUpdate(operator_pd.OperatorPDBase):
     v = self._v
     m = self._operator
     d = self._diag_operator
-    # The operators call the appropriate matmul/batch_matmul automatically.  We
-    # cannot override.
-    # batch_matmul is defined as:  x * y, so adj_x and adj_y are the ways to
-    # transpose the left and right.
+    # The operators call the appropriate matmul/batch_matmul automatically.
+    # We cannot override.
+    # batch_matmul is defined as:  x * y, so adjoint_a and adjoint_b are the
+    # ways to transpose the left and right.
     mx = m.matmul(x, transpose_x=transpose_x)
-    vt_x = math_ops.batch_matmul(v, x, adj_x=True, adj_y=transpose_x)
+    vt_x = math_ops.matmul(v, x, adjoint_a=True, adjoint_b=transpose_x)
     d_vt_x = d.matmul(vt_x)
-    v_d_vt_x = math_ops.batch_matmul(v, d_vt_x)
+    v_d_vt_x = math_ops.matmul(v, d_vt_x)
 
     return mx + v_d_vt_x
 
@@ -444,11 +441,11 @@ class OperatorPDSqrtVDVTUpdate(operator_pd.OperatorPDBase):
     # M^{-1} rhs
     minv_rhs = m.solve(rhs)
     # V^T M^{-1} rhs
-    vt_minv_rhs = math_ops.batch_matmul(v, minv_rhs, adj_x=True)
+    vt_minv_rhs = math_ops.matmul(v, minv_rhs, adjoint_a=True)
     # C^{-1} V^T M^{-1} rhs
     cinv_vt_minv_rhs = linalg_ops.cholesky_solve(cchol, vt_minv_rhs)
     # V C^{-1} V^T M^{-1} rhs
-    v_cinv_vt_minv_rhs = math_ops.batch_matmul(v, cinv_vt_minv_rhs)
+    v_cinv_vt_minv_rhs = math_ops.matmul(v, cinv_vt_minv_rhs)
     # M^{-1} V C^{-1} V^T M^{-1} rhs
     minv_v_cinv_vt_minv_rhs = m.solve(v_cinv_vt_minv_rhs)
 
@@ -457,7 +454,7 @@ class OperatorPDSqrtVDVTUpdate(operator_pd.OperatorPDBase):
 
   def _to_dense(self):
     sqrt = self.sqrt_to_dense()
-    return math_ops.batch_matmul(sqrt, sqrt, adj_y=True)
+    return math_ops.matmul(sqrt, sqrt, adjoint_b=True)
 
   def _sqrt_to_dense(self):
     v = self._v
@@ -467,6 +464,6 @@ class OperatorPDSqrtVDVTUpdate(operator_pd.OperatorPDBase):
     d_vt = d.matmul(v, transpose_x=True)
     # Batch op won't be efficient for singletons.  Currently we don't break
     # to_dense into batch/singleton methods.
-    v_d_vt = math_ops.batch_matmul(v, d_vt)
+    v_d_vt = math_ops.matmul(v, d_vt)
     m_plus_v_d_vt = m.to_dense() + v_d_vt
     return m_plus_v_d_vt
diff --git a/tensorflow/contrib/distributions/python/ops/operator_test_util.py b/tensorflow/contrib/distributions/python/ops/operator_test_util.py
index 06b27c9702a..aeb6950b838 100644
--- a/tensorflow/contrib/distributions/python/ops/operator_test_util.py
+++ b/tensorflow/contrib/distributions/python/ops/operator_test_util.py
@@ -87,9 +87,7 @@ class OperatorPDDerivedClassTest(tf.test.TestCase):
           self.assertEqual(mat.shape, sqrt.get_shape())
           # Square roots are not unique, but SS^T should equal mat.  In this
           # case however, we should have S = S^T.
-          self._compare_results(
-              expected=mat,
-              actual=tf.batch_matmul(sqrt, sqrt))
+          self._compare_results(expected=mat, actual=tf.matmul(sqrt, sqrt))
 
   def testDeterminants(self):
     with self.test_session():
@@ -111,8 +109,7 @@ class OperatorPDDerivedClassTest(tf.test.TestCase):
           x = self._rng.randn(*(batch_shape + (k, 5)))
 
           self._compare_results(
-              expected=tf.batch_matmul(mat, x).eval(),
-              actual=operator.matmul(x))
+              expected=tf.matmul(mat, x).eval(), actual=operator.matmul(x))
 
   def testSqrtMatmul(self):
     # Square roots are not unique, but we should have SS^T x = Ax, and in our
@@ -126,7 +123,7 @@ class OperatorPDDerivedClassTest(tf.test.TestCase):
           x = self._rng.randn(*(batch_shape + (k, 5)))
 
           self._compare_results(
-              expected=tf.batch_matmul(mat, x).eval(),
+              expected=tf.matmul(mat, x).eval(),
               actual=operator.sqrt_matmul(operator.sqrt_matmul(x)))
 
   def testSolve(self):
diff --git a/tensorflow/contrib/distributions/python/ops/wishart.py b/tensorflow/contrib/distributions/python/ops/wishart.py
index 3cc82389f5e..288d958ca2e 100644
--- a/tensorflow/contrib/distributions/python/ops/wishart.py
+++ b/tensorflow/contrib/distributions/python/ops/wishart.py
@@ -245,7 +245,7 @@ class _WishartOperatorPD(distribution.Distribution):
 
     if not self.cholesky_input_output_matrices:
       # Complexity: O(nbk^3)
-      x = math_ops.batch_matmul(x, x, adj_y=True)
+      x = math_ops.matmul(x, x, adjoint_b=True)
 
     return x
 
@@ -353,7 +353,7 @@ class _WishartOperatorPD(distribution.Distribution):
   def _variance(self):
     x = math_ops.sqrt(self.df) * self.scale_operator_pd.to_dense()
     d = array_ops.expand_dims(array_ops.matrix_diag_part(x), -1)
-    v = math_ops.square(x) + math_ops.batch_matmul(d, d, adj_y=True)
+    v = math_ops.square(x) + math_ops.matmul(d, d, adjoint_b=True)
     if self.cholesky_input_output_matrices:
       return linalg_ops.cholesky(v)
     return v