* Move implicit platform initialization logic in PlatformUtil to MultiPlatformManager so that the latter is the only implicit initialization site.
* Remove unused methods.

PiperOrigin-RevId: 279422441
Change-Id: I37161feb4b96f839438e95a3b04f118279a77e8b
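For context, a minimal sketch of a typical caller after this change. The GetDefaultGpuExecutors wrapper below is hypothetical and for illustration only; platform lookup goes through se::MultiPlatformManager::PlatformWithName (called inside PlatformUtil::GetPlatform), which is now the single implicit initialization site, so the caller does not initialize the platform itself.

#include <vector>

#include "absl/types/optional.h"
#include "tensorflow/compiler/xla/service/platform_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/statusor.h"

namespace xla {

// Hypothetical helper: resolve the "gpu" platform (canonicalized to "cuda" or
// "rocm" depending on the build) and create a StreamExecutor for every
// visible device on it.
StatusOr<std::vector<se::StreamExecutor*>> GetDefaultGpuExecutors() {
  // No explicit platform initialization here: PlatformWithName initializes
  // the platform implicitly as of this change.
  TF_ASSIGN_OR_RETURN(se::Platform * platform,
                      PlatformUtil::GetPlatform("gpu"));
  return PlatformUtil::GetStreamExecutors(platform,
                                          /*allowed_devices=*/absl::nullopt);
}

}  // namespace xla

A caller with no platform preference can use PlatformUtil::GetDefaultPlatform() instead of GetPlatform("gpu").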
235 lines | 8.8 KiB | C++
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/platform_util.h"

#include <algorithm>
#include <string>
#include <utility>

#include "absl/strings/ascii.h"
#include "absl/strings/str_join.h"
#include "tensorflow/compiler/xla/debug_options_flags.h"
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace xla {

// Minimum supported CUDA compute capability is 3.5.
constexpr int kMinCudaComputeCapabilityMajor = 3;
constexpr int kMinCudaComputeCapabilityMinor = 5;

// Minimum supported AMDGPU ISA version is 803.
constexpr int kMinAMDGPUISAVersion = 803;

// The name of the interpreter platform.
constexpr char kInterpreter[] = "interpreter";

namespace {

string CanonicalPlatformName(const string& platform_name) {
  string lowercase_platform_name = absl::AsciiStrToLower(platform_name);
  // "cpu" and "host" mean the same thing.
  if (lowercase_platform_name == "cpu") {
    return "host";
  }
  // When configured on CUDA, "gpu" and "cuda" mean the same thing.
  // When configured on ROCm, "gpu" and "rocm" mean the same thing.
  if (lowercase_platform_name == "gpu") {
#if TENSORFLOW_USE_ROCM
    return "rocm";
#else
    return "cuda";
#endif
  }
  return lowercase_platform_name;
}

StatusOr<std::vector<se::Platform*>> GetSupportedPlatforms() {
  return se::MultiPlatformManager::PlatformsWithFilter(
      [](const se::Platform* platform) {
        auto compiler_status = Compiler::GetForPlatform(platform);
        bool supported = compiler_status.ok();
        if (!supported) {
          LOG(INFO) << "platform " << platform->Name() << " present but no "
                    << "XLA compiler available: "
                    << compiler_status.status().error_message();
        }
        return supported;
      });
}

}  // namespace

/* static */ StatusOr<std::vector<se::Platform*>>
PlatformUtil::GetSupportedPlatforms() {
  // Gather all platforms which have an XLA compiler.
  return xla::GetSupportedPlatforms();
}

/* static */ StatusOr<se::Platform*> PlatformUtil::GetDefaultPlatform() {
  TF_ASSIGN_OR_RETURN(auto platforms, GetSupportedPlatforms());

  se::Platform* platform = nullptr;
  if (platforms.empty()) {
    return NotFound("no platforms found");
  } else if (platforms.size() == 1) {
    platform = platforms[0];
  } else if (platforms.size() == 2) {
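    // With exactly two platforms, one of which is the interpreter, prefer the
    // non-interpreter platform as the default.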
    for (int i = 0; i < 2; i++) {
      if (absl::AsciiStrToLower(platforms[i]->Name()) == kInterpreter &&
          absl::AsciiStrToLower(platforms[1 - i]->Name()) != kInterpreter) {
        platform = platforms[1 - i];
        break;
      }
    }
  }
  if (platform != nullptr) {
    return platform;
  }

  // Multiple platforms present and we can't pick a reasonable default.
  string platforms_string = absl::StrJoin(
      platforms, ", ",
      [](string* out, const se::Platform* p) { out->append(p->Name()); });
  return InvalidArgument(
      "must specify platform because more than one platform (except for the "
      "interpreter platform) found: %s",
      platforms_string);
}

/*static*/ StatusOr<se::Platform*> PlatformUtil::GetPlatform(
    const string& platform_name) {
  TF_ASSIGN_OR_RETURN(se::Platform * platform,
                      se::MultiPlatformManager::PlatformWithName(
                          CanonicalPlatformName(platform_name)));
  TF_RETURN_IF_ERROR(Compiler::GetForPlatform(platform).status());
  return platform;
}

// Returns whether the device underlying the given StreamExecutor is supported
// by XLA.
static bool IsDeviceSupported(se::StreamExecutor* executor) {
  const auto& description = executor->GetDeviceDescription();
  if (executor->platform()->id() == se::cuda::kCudaPlatformId) {
    // CUDA devices must have a minimum compute capability.
    int major_version, minor_version;
    if (description.cuda_compute_capability(&major_version, &minor_version)) {
      if (major_version < kMinCudaComputeCapabilityMajor ||
          (major_version == kMinCudaComputeCapabilityMajor &&
           minor_version < kMinCudaComputeCapabilityMinor)) {
        LOG(INFO) << "StreamExecutor cuda device ("
                  << executor->device_ordinal() << ") is of "
                  << "insufficient compute capability: "
                  << kMinCudaComputeCapabilityMajor << "."
                  << kMinCudaComputeCapabilityMinor << " required, "
                  << "device is " << major_version << "." << minor_version;
        return false;
      }
    }
  } else if (executor->platform()->id() == se::rocm::kROCmPlatformId) {
    int isa_version = 0;
    if (description.rocm_amdgpu_isa_version(&isa_version)) {
      if (isa_version < kMinAMDGPUISAVersion) {
        LOG(INFO) << "StreamExecutor ROCM device ("
                  << executor->device_ordinal() << ") is of "
                  << "obsolete AMDGPU ISA version: "
                  << "gfx" << kMinAMDGPUISAVersion << " required, "
                  << "device is gfx" << isa_version;
        return false;
      }
    }
  }
  return true;
}

/* static */ StatusOr<std::vector<se::StreamExecutor*>>
PlatformUtil::GetStreamExecutors(
    se::Platform* platform,
    const absl::optional<std::set<int>>& allowed_devices) {
  int device_count = platform->VisibleDeviceCount();
  if (device_count <= 0) {
    return NotFound("no %s devices found", platform->Name());
  }
  if (platform->id() == se::host::kHostPlatformId) {
    // On host "devices", StreamExecutor exports a device for each hardware
    // thread. Because we parallelize a single computation across threads, it
    // doesn't make sense to expose these as separate devices, so by default we
    // fix the number of devices to one.  However we do let the user override
    // this behavior to help run tests on the host that run models in parallel
    // across multiple devices.
    device_count =
        GetDebugOptionsFromFlags().xla_force_host_platform_device_count();
  }
  std::vector<se::StreamExecutor*> stream_executors(device_count, nullptr);
  VLOG(1) << "Initializing devices";
  {
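    // Initialize devices in parallel: bringing up a device (e.g. creating a
    // CUDA context) can be slow, so fan the work out across a thread pool.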
    tensorflow::thread::ThreadPool thread_pool(
        tensorflow::Env::Default(), "device_initialization", device_count);
    for (int i = 0; i < device_count; ++i) {
      // Once a stream executor is instantiated it will cause allocations on
      // the device; for example, for GPUs a CUDA context, cuDNN handles, etc.
      // will be constructed. By constructing stream executors only for the
      // allowed_devices, we don't make any allocations on other devices. This
      // helps in multi-process executions on the same host (e.g. with Horovod
      // or on shared hosts).
      if (allowed_devices && allowed_devices->count(i) == 0) {
        VLOG(1) << "Not initializing StreamExecutor for device " << i
                << " since it is not in the visible device list";
        continue;
      }
      thread_pool.Schedule([platform, i, &stream_executors]() {
        VLOG(1) << "Started device init " << i;
        se::StreamExecutorConfig config;
        config.ordinal = i;
        auto executor_status = platform->GetExecutor(config);
        if (executor_status.ok()) {
          se::StreamExecutor* executor = executor_status.ValueOrDie();
          if (IsDeviceSupported(executor)) {
            stream_executors[i] = executor;
          }
        } else {
          LOG(WARNING) << "unable to create StreamExecutor for "
                       << platform->Name() << ":" << i << ": "
                       << executor_status.status().error_message();
        }
        VLOG(1) << "Finished device init " << i;
      });
    }
    // Block here in the thread_pool destructor until all devices are
    // initialized.
  }
  VLOG(1) << "Device initialization complete";

  std::vector<se::StreamExecutor*> out;
  for (se::StreamExecutor* executor : stream_executors) {
    if (executor != nullptr) {
      out.push_back(executor);
    }
  }
  if (out.empty()) {
    return InternalError("no supported devices found for platform %s",
                         platform->Name());
  }
  return out;
}

}  // namespace xla