Parallelize BitonicSort on GPU.

We now emit O(log^2 n) kernel thunks. Each thunk is responsible for looping over
the other dimensions and then doing a comparison loop through the dimension
that should be sorted.

PiperOrigin-RevId: 205791397
Adrian Kuegel authored 2018-07-24 01:13:09 -07:00; committed by TensorFlower Gardener
parent fca1561b9d
commit 33035bb79b
6 changed files with 97 additions and 99 deletions
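
For orientation, here is a standalone, host-side C++ sketch (not part of this change) of the bitonic sorting network that the emitted kernels follow. Each call to ComparePass below corresponds to one kernel thunk in the new code; the loop over i inside it is what a single GPU kernel runs in parallel. The names ComparePass and BitonicSortSketch are illustrative only.

#include <algorithm>
#include <cstdint>
#include <vector>

// One compare/swap pass of the sorting network for a fixed xor mask.
// In the emitted code this pass is a single kernel launch, parallel over i.
void ComparePass(std::vector<int64_t>& keys, int64_t xor_mask) {
  const int64_t bound = static_cast<int64_t>(keys.size());
  for (int64_t i = 0; i < bound; ++i) {
    int64_t j = i ^ xor_mask;
    if (i < j && j < bound && keys[j] < keys[i]) {
      std::swap(keys[i], keys[j]);
    }
  }
}

// Runs Log2Ceiling(n) stages; stage s issues s + 1 passes, so the total
// number of passes (and hence kernel thunks) is O(log^2 n).
void BitonicSortSketch(std::vector<int64_t>& keys) {
  const int64_t bound = static_cast<int64_t>(keys.size());
  int64_t num_stages = 0;
  while ((int64_t{1} << num_stages) < bound) ++num_stages;
  for (int64_t stage = 0; stage < num_stages; ++stage) {
    ComparePass(keys, (int64_t{1} << (stage + 1)) - 1);
    for (int64_t mask = stage - 1; mask >= 0; --mask) {
      ComparePass(keys, int64_t{1} << mask);
    }
  }
}

Because partners with j >= bound are simply skipped, this decomposition also handles non-power-of-two dimension bounds, matching the Log2Ceiling-based stage count used in the change below.
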

View File

@@ -37,7 +37,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h"
#include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
#include "tensorflow/compiler/xla/service/name_uniquer.h"
#include "tensorflow/compiler/xla/shape_util.h"
@@ -123,17 +122,6 @@ Status IrEmitter::HandleGetTupleElement(HloInstruction* get_tuple_element) {
return Status::OK();
}
Status IrEmitter::HandleSort(HloInstruction* sort) {
auto values = sort->operand_count() > 1 ? sort->operand(1) : nullptr;
if (values != nullptr) {
// TODO(b/26783907): Also sort the values by their corresponding key.
return Unimplemented("Key/Value Sort is not implemented on GPU");
}
int dimension_to_sort = sort->dimensions(0);
return llvm_ir::EmitSortInPlace(dimension_to_sort, GetIrArray(*sort, *sort),
IrName(sort), &b_);
}
Status IrEmitter::HandleSend(HloInstruction*) {
return Unimplemented("Send is not implemented on GPU");
}

View File

@@ -79,7 +79,6 @@ class IrEmitter : public DfsHloVisitorWithDefault {
Status HandleCrossReplicaSum(HloInstruction* crs) override;
Status HandleInfeed(HloInstruction* infeed) override;
Status HandleOutfeed(HloInstruction* outfeed) override;
Status HandleSort(HloInstruction* sort) override;
Status HandleSend(HloInstruction* send) override;
Status HandleSendDone(HloInstruction* send_done) override;
Status HandleRecv(HloInstruction* recv) override;

View File

@@ -63,6 +63,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
#include "tensorflow/compiler/xla/service/name_uniquer.h"
#include "tensorflow/compiler/xla/shape_util.h"
@@ -71,6 +72,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/compiler/xla/window_util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/logging.h"
@@ -2036,11 +2038,51 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
/*mem_size=*/ShapeUtil::ByteSizeOf(sort->shape()), sort));
}
thunks.push_back(
BuildKernelThunk(sort, /*implements_whole_instruction=*/false));
int64 dimension_to_sort = sort->dimensions(0);
int64 dimension_to_sort_bound = sort->shape().dimensions(dimension_to_sort);
int64 num_stages = tensorflow::Log2Ceiling(dimension_to_sort_bound);
auto index_type = b_.getInt64Ty();
// Naive C++ code for the outer loops:
//
// for (int64 stage = 0; stage < Log2Ceiling(dimension_to_sort_bound);
// ++stage) {
// int64 first_xor_mask = (1LL << (stage + 1)) - 1;
// SortInPlace(first_xor_mask);
// for (int64 mask = stage - 1; mask >= 0; --mask) {
// int64 later_xor_mask = 1LL << mask;
// SortInPlace(later_xor_mask);
// }
// }
//
// This follows the algorithm described on Wikipedia:
// https://en.wikipedia.org/wiki/Bitonic_sorter
for (int64 stage = 0; stage < num_stages; ++stage) {
for (int64 mask = stage; mask >= 0; --mask) {
thunks.push_back(
BuildKernelThunk(sort, /*implements_whole_instruction=*/false));
LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
sort->shape(), ir_emitter_context_->device_description());
UpdateLaunchDimensions(launch_dimensions, thunks.back().get(),
ir_emitter_context_->llvm_module());
llvm::Value* xor_mask;
if (mask == stage) {
xor_mask = llvm::ConstantInt::get(index_type, (1LL << (stage + 1)) - 1);
} else {
xor_mask = llvm::ConstantInt::get(index_type, 1LL << mask);
}
TF_RETURN_IF_ERROR(llvm_ir::EmitSortInPlace(
dimension_to_sort, GetIrArray(*sort, *sort), IrName(sort), xor_mask,
&b_, &launch_dimensions));
}
}
thunk_sequence_->emplace_back(
MakeUnique<SequentialThunk>(std::move(thunks), sort));
return IrEmitter::HandleSort(sort);
return Status::OK();
}
Status IrEmitterUnnested::HandleTupleSelect(HloInstruction* tuple_select) {

View File

@@ -188,7 +188,10 @@ cc_library(
":ir_array",
":llvm_loop",
":llvm_util",
":loop_emitter",
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla/service/gpu:parallel_loop_emitter",
"//tensorflow/compiler/xla/service/gpu:partition_assignment",
"//tensorflow/core:lib",
"@llvm//:core",
],

View File

@@ -19,12 +19,15 @@ limitations under the License.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Value.h"
#include "tensorflow/compiler/xla/primitive_util.h"
#include "tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h"
#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/platform/types.h"
@@ -73,7 +76,9 @@ void EmitCompareLoop(int64 dimension_to_sort,
} // namespace
Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
tensorflow::StringPiece name, llvm::IRBuilder<>* b) {
tensorflow::StringPiece name, llvm::Value* xor_mask,
llvm::IRBuilder<>* b,
const gpu::LaunchDimensions* launch_dimensions) {
const Shape& keys_shape = keys_array.GetShape();
// TODO(b/26783907): This case can probably be avoided with the Algebraic
@@ -83,11 +88,13 @@ Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
}
// Create loop nests which loop through the operand dimensions. The sort
// dimension is handled in three separate innermost loops which perform the
// sorting.
// dimension is handled in the innermost loop which performs the sorting.
ForLoopNest loop_nest(name, b);
IrArray::Index keys_index =
loop_nest.EmitOperandArrayLoopNest(keys_array, dimension_to_sort, "keys");
if (loop_nest.GetInnerLoopBodyBasicBlock() != nullptr) {
SetToFirstInsertPoint(loop_nest.GetInnerLoopBodyBasicBlock(), b);
}
// 'compare_keys_index' is the index of the element that 'keys_index' should
// be compared to.
@@ -100,89 +107,42 @@ Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
}
}
// Create the sorting loops which do the sorting.
int64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort);
std::unique_ptr<ForLoop> stages_loop = loop_nest.AddLoop(
/*start_index=*/0,
/*end_index=*/
tensorflow::Log2Ceiling64(dimension_to_sort_bound),
/*suffix=*/"sort_stages");
std::unique_ptr<ForLoop> mask_loop = loop_nest.AddLoop(
/*suffix=*/"mask",
/*start_index=*/keys_index.GetConstantWithIndexType(0),
/*end_index=*/stages_loop->GetIndVarValue());
std::unique_ptr<ForLoop> compare_loop = loop_nest.AddLoop(
/*start_index=*/0,
/*end_index=*/dimension_to_sort_bound,
/*suffix=*/"compare");
// Naive C++ code for the inner loops (without parallelization):
// Naive C++ code for the inner compare loop:
//
// for (int64 stage = 0; stage < Log2Ceiling(dimension_to_sort_bound);
// ++stage) {
// int64 first_xor_mask = (1LL << (stage + 1)) - 1;
// for (int64 i = 0; i < dimension_to_sort_bound; ++i) {
// int64 j = i ^ first_xor_mask;
// if (i < j && j < dimension_to_sort_bound) {
// int64 min_key = std::min(keys[i], keys[j]);
// keys[j] = std::max(keys[i], keys[j]);
// keys[i] = min_key;
// }
// }
// for (int64 mask = 0; mask < stage; ++mask) {
// int64 later_xor_mask = (1LL << (stage - (mask + 1));
// for (int64 i = 0; i < dimension_to_sort_bound; ++i) {
// int64 j = i ^ later_xor_mask;
// if (i < j && j < dimension_to_sort_bound) {
// int64 min_key = std::min(keys[i], keys[j]);
// keys[j] = std::max(keys[i], keys[j]);
// keys[i] = min_key;
// }
// }
// for (int64 i = 0; i < dimension_to_sort_bound; ++i) {
// int64 j = i ^ xor_mask;
// if (i < j && j < dimension_to_sort_bound) {
// int64 min_key = std::min(keys[i], keys[j]);
// keys[j] = std::max(keys[i], keys[j]);
// keys[i] = min_key;
// }
// }
//
// This follows the algorithm described on Wikipedia:
// https://en.wikipedia.org/wiki/Bitonic_sorter
SetToFirstInsertPoint(stages_loop->GetBodyBasicBlock(), b);
// The first xor mask of a stage is 2^(stage + 1) - 1.
auto first_xor_mask = b->CreateSub(
b->CreateShl(keys_index.GetConstantWithIndexType(1),
b->CreateAdd(stages_loop->GetIndVarValue(),
keys_index.GetConstantWithIndexType(1))),
keys_index.GetConstantWithIndexType(1));
std::unique_ptr<ForLoop> first_compare_loop = ForLoop::EmitForLoop(
/*prefix=*/"first_compare",
/*start_index=*/keys_index.GetConstantWithIndexType(0),
/*end_index=*/
keys_index.GetConstantWithIndexType(dimension_to_sort_bound),
/*step=*/keys_index.GetConstantWithIndexType(1),
/*b=*/b);
SetToFirstInsertPoint(first_compare_loop->GetBodyBasicBlock(), b);
// 'first_compare_loop' iterates through the 'dimension_to_sort'.
keys_index[dimension_to_sort] = first_compare_loop->GetIndVarValue();
compare_keys_index[dimension_to_sort] =
b->CreateXor(first_compare_loop->GetIndVarValue(), first_xor_mask);
EmitCompareLoop(dimension_to_sort, keys_index, compare_keys_index, keys_array,
b);
SetToFirstInsertPoint(compare_loop->GetPreheaderBasicBlock(), b);
// The later masks of a stage are 2^(stage - (mask_loop_ind_var + 1)).
auto later_xor_mask = b->CreateShl(
keys_index.GetConstantWithIndexType(1),
b->CreateSub(stages_loop->GetIndVarValue(),
b->CreateAdd(mask_loop->GetIndVarValue(),
keys_index.GetConstantWithIndexType(1))));
SetToFirstInsertPoint(compare_loop->GetBodyBasicBlock(), b);
// 'compare_loop' iterates through the 'dimension_to_sort'.
keys_index[dimension_to_sort] = compare_loop->GetIndVarValue();
compare_keys_index[dimension_to_sort] =
b->CreateXor(compare_loop->GetIndVarValue(), later_xor_mask);
EmitCompareLoop(dimension_to_sort, keys_index, compare_keys_index, keys_array,
b);
int64 dimension_to_sort_bound =
keys_array.GetShape().dimensions(dimension_to_sort);
Shape compare_shape = ShapeUtil::MakeShape(keys_shape.element_type(),
{dimension_to_sort_bound});
auto compare_loop_body_emitter =
[&](const IrArray::Index& compare_index) -> Status {
keys_index[dimension_to_sort] = compare_index[0];
compare_keys_index[dimension_to_sort] =
b->CreateXor(compare_index[0], xor_mask);
EmitCompareLoop(dimension_to_sort, keys_index, compare_keys_index,
keys_array, b);
return Status::OK();
};
if (launch_dimensions != nullptr) {
TF_RETURN_IF_ERROR(gpu::ParallelLoopEmitter(compare_loop_body_emitter,
compare_shape,
*launch_dimensions, b)
.EmitLoop(name));
} else {
TF_RETURN_IF_ERROR(LoopEmitter(compare_loop_body_emitter, compare_shape, b)
.EmitLoop(name));
}
// Set the IR builder insert point to the exit basic block of the outer most
// loop. This ensures later instructions are inserted after this loop nest.

View File

@@ -16,6 +16,8 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_
#include "llvm/IR/Value.h"
#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/stringpiece.h"
@@ -23,10 +25,14 @@ limitations under the License.
namespace xla {
namespace llvm_ir {
// Emits llvm IR to sort the 'dimension_to_sort' dimension of 'keys_array' into
// ascending order.
// Emits llvm IR to do pairwise comparisons/swaps in the 'dimension_to_sort'
// dimension of 'keys_array'. All other dimensions are kept as-is. This
// implements the inner loop of BitonicSort. If 'launch_dimensions' is nullptr,
// the inner compare loop will not be parallelized.
Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
tensorflow::StringPiece name, llvm::IRBuilder<>* b);
tensorflow::StringPiece name, llvm::Value* xor_mask,
llvm::IRBuilder<>* b,
const gpu::LaunchDimensions* launch_dimensions);
} // namespace llvm_ir
} // namespace xla
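
To make the contract in the new header comment concrete, the following is a hypothetical host-side model (not the emitted LLVM IR) of what a single EmitSortInPlace call now does for a rank-2 array sorted along dimension 1: one compare/swap pass for a fixed xor_mask, applied independently at every index of the non-sorted dimension. The helper name SortInPlacePassModel is illustrative only.

#include <algorithm>
#include <cstdint>
#include <vector>

// Model of one EmitSortInPlace invocation for a rank-2 array sorted along
// dimension 1. The outer loop plays the role of the loop nest over the other
// dimensions; the inner loop is the compare loop that may be parallelized
// when launch_dimensions is non-null.
void SortInPlacePassModel(std::vector<std::vector<int64_t>>& keys,
                          int64_t xor_mask) {
  for (std::vector<int64_t>& row : keys) {
    const int64_t bound = static_cast<int64_t>(row.size());
    for (int64_t i = 0; i < bound; ++i) {
      const int64_t j = i ^ xor_mask;
      if (i < j && j < bound && row[j] < row[i]) {
        std::swap(row[i], row[j]);
      }
    }
  }
}

The caller (HandleSort in ir_emitter_unnested.cc above) issues one such pass per (stage, mask) pair with the corresponding xor mask, which is what lets every pass run as its own kernel.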