From 269e4abc1127aa8eecfc6031bd96f7ececfa771e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 28 Oct 2019 12:37:34 -0700
Subject: [PATCH] [TF:XLA] Re-implement MatrixSolve in terms of QR
 decomposition.

This implementation is effectively just a MatrixInverse followed by a batched
dot, so all of the same disclaimers apply as for MatrixInverse (QR decomposition
likely to be slower than LU, doesn't support complex numbers). This is a
short-term solution to avoid having to use workarounds for tf.linalg.solve on
TPU. Performance can be improved in future (e.g. when LU is implemented).

This functionality was added previously but had to be rolled-back due to slow
tests. This version has faster tests.

PiperOrigin-RevId: 277120825
Change-Id: I0a1e274738d9a9393b061f6b793180e1f20dacf7
---
 .../compiler/jit/compilability_check_util.cc  |  2 +
 tensorflow/compiler/tests/BUILD               | 14 ++++
 .../compiler/tests/matrix_solve_op_test.py    | 78 +++++++++++++++++++
 tensorflow/compiler/tf2xla/kernels/BUILD      |  1 +
 .../tf2xla/kernels/matrix_solve_op.cc         | 76 ++++++++++++++++++
 5 files changed, 171 insertions(+)
 create mode 100644 tensorflow/compiler/tests/matrix_solve_op_test.py
 create mode 100644 tensorflow/compiler/tf2xla/kernels/matrix_solve_op.cc

diff --git a/tensorflow/compiler/jit/compilability_check_util.cc b/tensorflow/compiler/jit/compilability_check_util.cc
index b0c78469118..b8f04f7d791 100644
--- a/tensorflow/compiler/jit/compilability_check_util.cc
+++ b/tensorflow/compiler/jit/compilability_check_util.cc
@@ -300,12 +300,14 @@ bool RecursiveCompilabilityChecker::OpIsInaccurate(const Node& node) const {
 bool RecursiveCompilabilityChecker::OpIsSlow(const Node& node) const {
   // b/128001705: SelfAdjointEigV2 and Svd performance issues.
   // b/135640736: MatrixInverse performance issues.
+  // b/111271662: MatrixSolve performance issues.
   // https://github.com/tensorflow/tensorflow/pull/31012:
   //    ResizeNearestNeighbor, ResizeBilinear, and ResizeBilinearGrad sometimes
   //    create convolutions too large for CuDNN to handle.
   return node.type_string() == "SelfAdjointEigV2" ||
          node.type_string() == "Svd" || node.type_string() == "Qr" ||
          node.type_string() == "MatrixInverse" ||
+         node.type_string() == "MatrixSolve" ||
          node.type_string() == "ResizeNearestNeighbor" ||
          node.type_string() == "ResizeBilinear" ||
          node.type_string() == "ResizeBilinearGrad";
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 66cc89f10eb..dd9cf615e4d 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -311,6 +311,20 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "matrix_solve_op_test",
+    size = "small",
+    timeout = "moderate",
+    srcs = ["matrix_solve_op_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:random_ops",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 tf_xla_py_test(
     name = "matrix_triangular_solve_op_test",
     size = "small",
diff --git a/tensorflow/compiler/tests/matrix_solve_op_test.py b/tensorflow/compiler/tests/matrix_solve_op_test.py
new file mode 100644
index 00000000000..fb12c6c453e
--- /dev/null
+++ b/tensorflow/compiler/tests/matrix_solve_op_test.py
@@ -0,0 +1,78 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for XLA implementation of tf.linalg.solve."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import googletest
+
+
+class MatrixSolveOpTest(xla_test.XLATestCase, parameterized.TestCase):
+
+  def _verifySolve(self, x, y, adjoint):
+    for np_type in self.float_types & {np.float32, np.float64}:
+      tol = 1e-4 if np_type == np.float32 else 1e-12
+      a = x.astype(np_type)
+      b = y.astype(np_type)
+      np_ans = np.linalg.solve(np.swapaxes(a, -2, -1) if adjoint else a, b)
+      with self.session() as sess:
+        with self.test_scope():
+          tf_ans = linalg_ops.matrix_solve(a, b, adjoint=adjoint)
+        out = sess.run(tf_ans)
+        self.assertEqual(tf_ans.shape, out.shape)
+        self.assertEqual(np_ans.shape, out.shape)
+        self.assertAllClose(np_ans, out, atol=tol, rtol=tol)
+
+  @parameterized.named_parameters(
+      ("Scalar", 1, 1, [], [], False),
+      ("Vector", 5, 1, [], [], False),
+      ("MultipleRHS", 5, 4, [], [], False),
+      ("Adjoint", 5, 4, [], [], True),
+      ("BatchedScalar", 1, 4, [2], [2], False),
+      ("BatchedVector", 5, 4, [2], [2], False),
+      ("BatchedRank2", 5, 4, [7, 4], [7, 4], False),
+      ("BatchedAdjoint", 5, 4, [7, 4], [7, 4], True),
+  )
+  def testSolve(self, n, nrhs, batch_dims, rhs_batch_dims, adjoint):
+    matrix = np.random.normal(-5.0, 5.0, batch_dims + [n, n])
+    rhs = np.random.normal(-5.0, 5.0, rhs_batch_dims + [n, nrhs])
+    self._verifySolve(matrix, rhs, adjoint=adjoint)
+
+  @parameterized.named_parameters(
+      ("Simple", False),
+      ("Adjoint", True),
+  )
+  def testConcurrent(self, adjoint):
+    with self.session() as sess:
+      lhs1 = random_ops.random_normal([3, 3], seed=42)
+      lhs2 = random_ops.random_normal([3, 3], seed=42)
+      rhs1 = random_ops.random_normal([3, 3], seed=42)
+      rhs2 = random_ops.random_normal([3, 3], seed=42)
+      with self.test_scope():
+        s1 = linalg_ops.matrix_solve(lhs1, rhs1, adjoint=adjoint)
+        s2 = linalg_ops.matrix_solve(lhs2, rhs2, adjoint=adjoint)
+      self.assertAllEqual(*sess.run([s1, s2]))
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index a3ae36ce0d5..b6e89c6540b 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -57,6 +57,7 @@ tf_kernel_library(
         "matrix_band_part_op.cc",
         "matrix_diag_ops.cc",
         "matrix_inverse_op.cc",
+        "matrix_solve_op.cc",
         "matrix_triangular_solve_op.cc",
         "mirror_pad_op.cc",
         "next_after_op.cc",
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_solve_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_solve_op.cc
new file mode 100644
index 00000000000..8a4e71068b8
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_solve_op.cc
@@ -0,0 +1,76 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/lib/qr.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace tensorflow {
+namespace {
+
+class MatrixSolveOp : public XlaOpKernel {
+ public:
+  explicit MatrixSolveOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("adjoint", &adjoint_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    const TensorShape matrix_shape = ctx->InputShape(0);
+    int64 matrix_ndims = matrix_shape.dims();
+    OP_REQUIRES(ctx, matrix_ndims >= 2,
+                errors::InvalidArgument(
+                    "Input matrix must have rank >= 2, got ", matrix_ndims));
+    OP_REQUIRES(ctx,
+                matrix_shape.dim_size(matrix_ndims - 2) ==
+                    matrix_shape.dim_size(matrix_ndims - 1),
+                errors::InvalidArgument(
+                    "Input matrices must be square, got",
+                    matrix_shape.dim_size(matrix_ndims - 2),
+                    " != ", matrix_shape.dim_size(matrix_ndims - 1)));
+
+    xla::XlaOp matrix = ctx->Input(0);
+    xla::XlaOp rhs = ctx->Input(1);
+
+    // TODO(b/111271662): Using LU decomposition instead of QR should be faster.
+    auto qr = xla::QRDecomposition(matrix, /*full_matrices=*/false);
+    OP_REQUIRES_OK(ctx, qr.status());
+
+    xla::XlaOp inv = xla::TriangularSolve(
+        qr.ValueOrDie().r, xla::TransposeInMinorDims(qr.ValueOrDie().q),
+        /*left_side=*/true,
+        /*lower=*/false, /*unit_diagonal=*/false,
+        /*transpose_a=*/
+        xla::TriangularSolveOptions::NO_TRANSPOSE);
+
+    xla::XlaOp output =
+        xla::BatchDot(inv, adjoint_, rhs,
+                      /*transpose_y=*/false, xla::PrecisionConfig::HIGHEST);
+    ctx->SetOutput(0, output);
+  }
+
+ private:
+  bool adjoint_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(MatrixSolveOp);
+};
+
+// TODO(b/111271662): Support integer and complex types.
+REGISTER_XLA_OP(Name("MatrixSolve").TypeConstraint("T", kFloatTypes),
+                MatrixSolveOp);
+
+}  // namespace
+}  // namespace tensorflow