- Fix PjRt GPU Client to allow num_partitions > 1

- Fix XLA GPU backend to allow num_partitions > 1

PiperOrigin-RevId: 354998179
Change-Id: Ief04993252d80e3edf04a91f4523edd25d8b3102
Rahul Joshi, 2021-02-01 12:18:30 -08:00 (committed by TensorFlower Gardener)
parent c455215395
commit 761dc221ce

4 changed files with 20 additions and 14 deletions
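Taken together, the two fixes let a PjRt GPU client request device assignments for computations with more than one partition. A minimal sketch of the now-permitted call, assuming a client obtained from the usual GPU client factory and enough attached devices (the 2x2 replica/partition counts are illustrative, not from the commit):

#include "tensorflow/compiler/xla/pjrt/pjrt_client.h"

// Sketch: ask the client for a default 2x2 (replica x partition) device
// assignment. Before this commit the GPU client failed a TF_RET_CHECK for
// any num_partitions != 1; now the request can be served by the base
// PjRtStreamExecutorClient implementation.
xla::StatusOr<xla::DeviceAssignment> GetPartitionedAssignment(
    const xla::PjRtClient& client) {
  return client.GetDefaultDeviceAssignment(/*num_replicas=*/2,
                                           /*num_partitions=*/2);
}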

tensorflow/compiler/xla/pjrt/gpu_device.cc

@@ -46,9 +46,7 @@ class GpuClient : public xla::PjRtStreamExecutorClient {
 xla::StatusOr<xla::DeviceAssignment> GpuClient::GetDefaultDeviceAssignment(
     int num_replicas, int num_partitions) const {
-  // XLA:GPU does not support multiple partitions yet.
-  TF_RET_CHECK(num_partitions == 1) << num_partitions;
-  if (num_replicas <= addressable_devices().size()) {
+  if (num_partitions == 1 && num_replicas <= addressable_devices().size()) {
     xla::DeviceAssignment assignment(num_replicas, 1);
     for (int i = 0; i < num_replicas; ++i) {
       assignment(i, 0) = addressable_devices().at(i)->id();
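Below the truncated hunk, the method keeps the fast path that pins each replica to one addressable device; in every other case, including num_partitions > 1, it falls through to PjRtStreamExecutorClient::GetDefaultDeviceAssignment rather than failing the deleted TF_RET_CHECK.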

tensorflow/compiler/xla/service/cpu/cpu_compiler.cc

@@ -104,6 +104,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h"
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
 #include "tensorflow/compiler/xla/service/indexed_array_analysis.h"
+#include "tensorflow/compiler/xla/service/llvm_compiler.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/service/logistic_expander.h"
 #include "tensorflow/compiler/xla/service/map_inliner.h"
@@ -186,6 +187,19 @@ CpuCompiler::CpuCompiler() {
   (void)llvm_initialized;
 }
 
+StatusOr<std::vector<std::unique_ptr<Executable>>> CpuCompiler::Compile(
+    std::unique_ptr<HloModuleGroup> module_group,
+    std::vector<std::vector<se::StreamExecutor*>> stream_execs,
+    const CompileOptions& options) {
+  for (const std::vector<se::StreamExecutor*>& se_vector : stream_execs) {
+    if (se_vector.size() != 1) {
+      return Unimplemented(
+          "Model partitioning not implemented for the CPU compiler");
+    }
+  }
+  return LLVMCompiler::Compile(std::move(module_group), stream_execs, options);
+}
+
 /* static */ void CpuCompiler::InitializeLLVMTarget() {
   // Initialize LLVM's MC layer for the native target.
   llvm::InitializeNativeTarget();
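This override reinstates, for the CPU backend only, the restriction being deleted from the shared base class below: each entry of stream_execs holds the executors for one module's partitions, so any entry of size other than 1 implies a partitioned module and compilation bails out with Unimplemented. Unpartitioned module groups are forwarded unchanged to LLVMCompiler::Compile.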

tensorflow/compiler/xla/service/cpu/cpu_compiler.h

@@ -125,12 +125,10 @@ class CpuCompiler : public LLVMCompiler {
   CpuCompiler();
   ~CpuCompiler() override {}
 
-  // Bring in
-  // StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
-  //     std::vector<std::unique_ptr<HloModule>> modules,
-  //     std::vector<std::vector<se::StreamExecutor*>>
-  //     stream_execs)
-  using LLVMCompiler::Compile;
+  StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
+      std::unique_ptr<HloModuleGroup> module_group,
+      std::vector<std::vector<se::StreamExecutor*>> stream_execs,
+      const CompileOptions& options) override;
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,

tensorflow/compiler/xla/service/llvm_compiler.cc

@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/llvm_compiler.h"
+#include "tensorflow/core/platform/denormal.h"
 
 #ifdef __FAST_MATH__
@@ -41,11 +42,6 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile(
   std::vector<std::unique_ptr<HloModule>> modules =
       module_group->ConsumeModules();
   for (size_t i = 0; i < modules.size(); i++) {
-    if (stream_execs[i].size() != 1) {
-      return Unimplemented(
-          "Model partitioning not implemented for the CPU/GPU compilers!");
-    }
-
     TF_ASSIGN_OR_RETURN(modules[i],
                         RunHloPasses(std::move(modules[i]), stream_execs[i][0],
                                      options.device_allocator));
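With the check deleted from the shared LLVMCompiler::Compile path, the GPU compiler (which also derives from LLVMCompiler) no longer rejects partitioned module groups, while CpuCompiler keeps the restriction via its own override above. Together with the GpuClient change, this is what allows num_partitions > 1 on the GPU backend.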