[TF:XLA] Re-implement MatrixSolve in terms of QR decomposition.

This implementation is effectively just a MatrixInverse followed by a batched
dot, so all of the same disclaimers apply as for MatrixInverse (QR decomposition
likely to be slower than LU, doesn't support complex numbers). This is a
short-term solution to avoid having to use workarounds for tf.linalg.solve on
TPU. Performance can be improved in future (e.g. when LU is implemented).

This functionality was added previously but had to be rolled back due to slow
tests. This version has faster tests.

PiperOrigin-RevId: 277120825
Change-Id: I0a1e274738d9a9393b061f6b793180e1f20dacf7
This commit is contained in:
A. Unique TensorFlower 2019-10-28 12:37:34 -07:00 committed by TensorFlower Gardener
parent ad6be7820b
commit 269e4abc11
5 changed files with 171 additions and 0 deletions

View File

@ -300,12 +300,14 @@ bool RecursiveCompilabilityChecker::OpIsInaccurate(const Node& node) const {
bool RecursiveCompilabilityChecker::OpIsSlow(const Node& node) const {
// b/128001705: SelfAdjointEigV2 and Svd performance issues.
// b/135640736: MatrixInverse performance issues.
// b/111271662: MatrixSolve performance issues.
// https://github.com/tensorflow/tensorflow/pull/31012:
// ResizeNearestNeighbor, ResizeBilinear, and ResizeBilinearGrad sometimes
// create convolutions too large for CuDNN to handle.
return node.type_string() == "SelfAdjointEigV2" ||
node.type_string() == "Svd" || node.type_string() == "Qr" ||
node.type_string() == "MatrixInverse" ||
node.type_string() == "MatrixSolve" ||
node.type_string() == "ResizeNearestNeighbor" ||
node.type_string() == "ResizeBilinear" ||
node.type_string() == "ResizeBilinearGrad";

View File

@ -311,6 +311,20 @@ tf_xla_py_test(
],
)
# XLA compiler test for tf.linalg.solve (MatrixSolve); exercises the
# QR-decomposition-based implementation. See b/111271662.
tf_xla_py_test(
name = "matrix_solve_op_test",
size = "small",
timeout = "moderate",
srcs = ["matrix_solve_op_test.py"],
deps = [
":xla_test",
"//tensorflow/python:linalg_ops",
"//tensorflow/python:platform_test",
"//tensorflow/python:random_ops",
"@absl_py//absl/testing:parameterized",
],
)
tf_xla_py_test(
name = "matrix_triangular_solve_op_test",
size = "small",

View File

@ -0,0 +1,78 @@
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for XLA implementation of tf.linalg.solve."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl.testing import parameterized
import numpy as np
from tensorflow.compiler.tests import xla_test
from tensorflow.python.ops import linalg_ops
from tensorflow.python.ops import random_ops
from tensorflow.python.platform import googletest
class MatrixSolveOpTest(xla_test.XLATestCase, parameterized.TestCase):
  """Compares the XLA lowering of tf.linalg.solve against numpy.linalg.solve."""

  def _verifySolve(self, x, y, adjoint):
    """Checks matrix_solve(x, y, adjoint) against numpy for every float type."""
    for dtype in self.float_types & {np.float32, np.float64}:
      tolerance = 1e-4 if dtype == np.float32 else 1e-12
      lhs = x.astype(dtype)
      rhs = y.astype(dtype)
      # numpy has no adjoint flag, so transpose the coefficient matrix by hand.
      reference_lhs = np.swapaxes(lhs, -2, -1) if adjoint else lhs
      expected = np.linalg.solve(reference_lhs, rhs)
      with self.session() as sess:
        with self.test_scope():
          solve_op = linalg_ops.matrix_solve(lhs, rhs, adjoint=adjoint)
        actual = sess.run(solve_op)
        self.assertEqual(solve_op.shape, actual.shape)
        self.assertEqual(expected.shape, actual.shape)
        self.assertAllClose(expected, actual, atol=tolerance, rtol=tolerance)

  @parameterized.named_parameters(
      ("Scalar", 1, 1, [], [], False),
      ("Vector", 5, 1, [], [], False),
      ("MultipleRHS", 5, 4, [], [], False),
      ("Adjoint", 5, 4, [], [], True),
      ("BatchedScalar", 1, 4, [2], [2], False),
      ("BatchedVector", 5, 4, [2], [2], False),
      ("BatchedRank2", 5, 4, [7, 4], [7, 4], False),
      ("BatchedAdjoint", 5, 4, [7, 4], [7, 4], True),
  )
  def testSolve(self, n, nrhs, batch_dims, rhs_batch_dims, adjoint):
    """Solves random systems across scalar, vector, and batched shapes."""
    coefficients = np.random.normal(-5.0, 5.0, batch_dims + [n, n])
    right_hand_sides = np.random.normal(-5.0, 5.0, rhs_batch_dims + [n, nrhs])
    self._verifySolve(coefficients, right_hand_sides, adjoint=adjoint)

  @parameterized.named_parameters(
      ("Simple", False),
      ("Adjoint", True),
  )
  def testConcurrent(self, adjoint):
    """Two solves in one graph must not interfere with each other."""
    with self.session() as sess:
      # Identical seeds make both problems equal, so the results must match.
      lhs_a = random_ops.random_normal([3, 3], seed=42)
      lhs_b = random_ops.random_normal([3, 3], seed=42)
      rhs_a = random_ops.random_normal([3, 3], seed=42)
      rhs_b = random_ops.random_normal([3, 3], seed=42)
      with self.test_scope():
        solution_a = linalg_ops.matrix_solve(lhs_a, rhs_a, adjoint=adjoint)
        solution_b = linalg_ops.matrix_solve(lhs_b, rhs_b, adjoint=adjoint)
      result_a, result_b = sess.run([solution_a, solution_b])
      self.assertAllEqual(result_a, result_b)
# Test entry point: discovers and runs all test cases in this module.
if __name__ == "__main__":
  googletest.main()

View File

@ -57,6 +57,7 @@ tf_kernel_library(
"matrix_band_part_op.cc",
"matrix_diag_ops.cc",
"matrix_inverse_op.cc",
"matrix_solve_op.cc",
"matrix_triangular_solve_op.cc",
"mirror_pad_op.cc",
"next_after_op.cc",

View File

@ -0,0 +1,76 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/xla/client/lib/matrix.h"
#include "tensorflow/compiler/xla/client/lib/qr.h"
#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
namespace tensorflow {
namespace {
class MatrixSolveOp : public XlaOpKernel {
public:
explicit MatrixSolveOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
OP_REQUIRES_OK(ctx, ctx->GetAttr("adjoint", &adjoint_));
}
void Compile(XlaOpKernelContext* ctx) override {
const TensorShape matrix_shape = ctx->InputShape(0);
int64 matrix_ndims = matrix_shape.dims();
OP_REQUIRES(ctx, matrix_ndims >= 2,
errors::InvalidArgument(
"Input matrix must have rank >= 2, got ", matrix_ndims));
OP_REQUIRES(ctx,
matrix_shape.dim_size(matrix_ndims - 2) ==
matrix_shape.dim_size(matrix_ndims - 1),
errors::InvalidArgument(
"Input matrices must be square, got",
matrix_shape.dim_size(matrix_ndims - 2),
" != ", matrix_shape.dim_size(matrix_ndims - 1)));
xla::XlaOp matrix = ctx->Input(0);
xla::XlaOp rhs = ctx->Input(1);
// TODO(b/111271662): Using LU decomposition instead of QR should be faster.
auto qr = xla::QRDecomposition(matrix, /*full_matrices=*/false);
OP_REQUIRES_OK(ctx, qr.status());
xla::XlaOp inv = xla::TriangularSolve(
qr.ValueOrDie().r, xla::TransposeInMinorDims(qr.ValueOrDie().q),
/*left_side=*/true,
/*lower=*/false, /*unit_diagonal=*/false,
/*transpose_a=*/
xla::TriangularSolveOptions::NO_TRANSPOSE);
xla::XlaOp output =
xla::BatchDot(inv, adjoint_, rhs,
/*transpose_y=*/false, xla::PrecisionConfig::HIGHEST);
ctx->SetOutput(0, output);
}
private:
bool adjoint_;
TF_DISALLOW_COPY_AND_ASSIGN(MatrixSolveOp);
};
// TODO(b/111271662): Support integer and complex types.
REGISTER_XLA_OP(Name("MatrixSolve").TypeConstraint("T", kFloatTypes),
MatrixSolveOp);
} // namespace
} // namespace tensorflow