Note: This is a different PR from #28189 (same symptom, different cause and fix).
The `--config=rocm` build was broken by the following commit.
9b1b3df00e
The changes made by the above commit were missing a couple of changes for the ROCm platform, which was leading to the build failure. This PR adds those changes to make the `--config=rocm` build work again.
241 lines
8.6 KiB
C++
241 lines
8.6 KiB
C++
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
|
|
|
|
#include "tensorflow/stream_executor/rocm/rocm_diagnostics.h"
|
|
|
|
#include <dirent.h>
|
|
#include <limits.h>
|
|
#include <link.h>
|
|
#include <stddef.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <sys/sysmacros.h>
|
|
#include <unistd.h>
|
|
|
|
#include <algorithm>
|
|
#include <memory>
|
|
#include <vector>
|
|
|
|
#include "absl/container/inlined_vector.h"
|
|
#include "absl/strings/str_cat.h"
|
|
#include "absl/strings/str_format.h"
|
|
#include "absl/strings/str_split.h"
|
|
#include "absl/strings/strip.h"
|
|
#include "tensorflow/stream_executor/lib/error.h"
|
|
#include "tensorflow/stream_executor/lib/numbers.h"
|
|
#include "tensorflow/stream_executor/lib/process_state.h"
|
|
#include "tensorflow/stream_executor/lib/status.h"
|
|
#include "tensorflow/stream_executor/platform/logging.h"
|
|
|
|
namespace stream_executor {
|
|
namespace rocm {
|
|
|
|
// Formats the three components of `version` as a dot-separated string,
// e.g. "1.2.3".
string DriverVersionToString(DriverVersion version) {
  const auto first = std::get<0>(version);
  const auto second = std::get<1>(version);
  const auto third = std::get<2>(version);
  return absl::StrFormat("%d.%d.%d", first, second, third);
}
|
|
|
|
// Produces a printable form of a version-or-error value: the dotted version
// string when `version` holds a value, otherwise the text of its error status.
string DriverVersionStatusToString(port::StatusOr<DriverVersion> version) {
  if (version.ok()) {
    return DriverVersionToString(version.ValueOrDie());
  }
  return version.status().ToString();
}
|
|
|
|
// Parses a dot-separated version string with two or three components
// ("major.minor" or "major.minor.patch") into a DriverVersion. A missing patch
// component is treated as 0.
//
// Returns an INVALID_ARGUMENT status when the number of components is not two
// or three, or when any component is rejected by port::safe_strto32.
port::StatusOr<DriverVersion> StringToDriverVersion(const string& value) {
  const std::vector<string> parts = absl::StrSplit(value, '.');
  const size_t part_count = parts.size();
  if (part_count < 2 || part_count > 3) {
    return port::Status{
        port::error::INVALID_ARGUMENT,
        absl::StrFormat("expected %%d.%%d or %%d.%%d.%%d form "
                        "for driver version; got \"%s\"",
                        value.c_str())};
  }

  // Components are parsed strictly in order so that the first malformed one
  // determines the reported error.
  int major_number;
  if (!port::safe_strto32(parts[0], &major_number)) {
    return port::Status{
        port::error::INVALID_ARGUMENT,
        absl::StrFormat("could not parse major version number \"%s\" as an "
                        "integer from string \"%s\"",
                        parts[0].c_str(), value.c_str())};
  }

  int minor_number;
  if (!port::safe_strto32(parts[1], &minor_number)) {
    return port::Status{
        port::error::INVALID_ARGUMENT,
        absl::StrFormat("could not parse minor version number \"%s\" as an "
                        "integer from string \"%s\"",
                        parts[1].c_str(), value.c_str())};
  }

  // The patch component is optional; it stays 0 for two-component input.
  int patch_number = 0;
  if (part_count == 3 && !port::safe_strto32(parts[2], &patch_number)) {
    return port::Status{
        port::error::INVALID_ARGUMENT,
        absl::StrFormat("could not parse patch version number \"%s\" as an "
                        "integer from string \"%s\"",
                        parts[2].c_str(), value.c_str())};
  }

  DriverVersion parsed{major_number, minor_number, patch_number};
  VLOG(2) << "version string \"" << value << "\" made value "
          << DriverVersionToString(parsed);
  return parsed;
}
|
|
|
|
} // namespace rocm
|
|
} // namespace stream_executor
|
|
|
|
namespace stream_executor {
|
|
namespace gpu {
|
|
|
|
// -- class Diagnostician
|
|
|
|
// Builds the device node path for the given ordinal by appending the ordinal
// to "/dev/kfd".
string Diagnostician::GetDevNodePath(int dev_node_ordinal) {
  string node_path = absl::StrCat("/dev/kfd", dev_node_ordinal);
  return node_path;
}
|
|
|
|
// Entry point for diagnostics logging: announces the host being inspected and
// then logs the driver version details.
void Diagnostician::LogDiagnosticInformation() {
  LOG(INFO) << "retrieving ROCM diagnostic information for host: "
            << port::Hostname();
  LogDriverVersionInformation();
}
|
|
|
|
/* static */ void Diagnostician::LogDriverVersionInformation() {
|
|
LOG(INFO) << "hostname: " << port::Hostname();
|
|
if (VLOG_IS_ON(1)) {
|
|
const char* value = getenv("LD_LIBRARY_PATH");
|
|
string library_path = value == nullptr ? "" : value;
|
|
VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\"";
|
|
|
|
std::vector<string> pieces = absl::StrSplit(library_path, ':');
|
|
for (const auto& piece : pieces) {
|
|
if (piece.empty()) {
|
|
continue;
|
|
}
|
|
DIR* dir = opendir(piece.c_str());
|
|
if (dir == nullptr) {
|
|
VLOG(1) << "could not open \"" << piece << "\"";
|
|
continue;
|
|
}
|
|
while (dirent* entity = readdir(dir)) {
|
|
VLOG(1) << piece << " :: " << entity->d_name;
|
|
}
|
|
closedir(dir);
|
|
}
|
|
}
|
|
port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
|
|
LOG(INFO) << "librocm reported version is: "
|
|
<< rocm::DriverVersionStatusToString(dso_version);
|
|
|
|
port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
|
|
LOG(INFO) << "kernel reported version is: "
|
|
<< rocm::DriverVersionStatusToString(kernel_version);
|
|
|
|
if (kernel_version.ok() && dso_version.ok()) {
|
|
WarnOnDsoKernelMismatch(dso_version, kernel_version);
|
|
}
|
|
}
|
|
|
|
// Iterates through loaded DSOs with DlIteratePhdrCallback to find the
|
|
// driver-interfacing DSO version number. Returns it as a string.
|
|
port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
|
|
port::StatusOr<DriverVersion> result{port::Status{
|
|
port::error::NOT_FOUND,
|
|
"was unable to find librocm.so DSO loaded into this program"}};
|
|
|
|
// Callback used when iterating through DSOs. Looks for the driver-interfacing
|
|
// DSO and yields its version number into the callback data, when found.
|
|
auto iterate_phdr = [](struct dl_phdr_info* info, size_t size,
|
|
void* data) -> int {
|
|
if (strstr(info->dlpi_name, "librocm.so.1")) {
|
|
VLOG(1) << "found DLL info with name: " << info->dlpi_name;
|
|
char resolved_path[PATH_MAX] = {0};
|
|
if (realpath(info->dlpi_name, resolved_path) == nullptr) {
|
|
return 0;
|
|
}
|
|
VLOG(1) << "found DLL info with resolved path: " << resolved_path;
|
|
const char* slash = rindex(resolved_path, '/');
|
|
if (slash == nullptr) {
|
|
return 0;
|
|
}
|
|
const char* so_suffix = ".so.";
|
|
const char* dot = strstr(slash, so_suffix);
|
|
if (dot == nullptr) {
|
|
return 0;
|
|
}
|
|
string dso_version = dot + strlen(so_suffix);
|
|
// TODO(b/22689637): Eliminate the explicit namespace if possible.
|
|
auto stripped_dso_version = absl::StripSuffix(dso_version, ".ld64");
|
|
auto result = static_cast<port::StatusOr<DriverVersion>*>(data);
|
|
*result = rocm::StringToDriverVersion(string(stripped_dso_version));
|
|
return 1;
|
|
}
|
|
return 0;
|
|
};
|
|
|
|
dl_iterate_phdr(iterate_phdr, &result);
|
|
|
|
return result;
|
|
}
|
|
|
|
// Extracts the driver version from the text of a driver version file. The
// version is the token that follows the first occurrence of "Kernel Module "
// and runs up to the next space (or to the end of the text if there is none);
// a trailing ".ld64" is removed before parsing.
//
// Returns NOT_FOUND when the "Kernel Module " marker is absent; otherwise
// returns whatever rocm::StringToDriverVersion yields for the extracted token.
port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion(
    const string& driver_version_file_contents) {
  static const char* kDriverFilePrelude = "Kernel Module ";

  const size_t prelude_pos =
      driver_version_file_contents.find(kDriverFilePrelude);
  if (prelude_pos == string::npos) {
    return port::Status{
        port::error::NOT_FOUND,
        absl::StrCat("could not find kernel module information in "
                     "driver version file contents: \"",
                     driver_version_file_contents, "\"")};
  }

  // The version token starts right after the prelude and ends at the first
  // following space, or at the end of the contents when no space follows.
  const size_t version_begin = prelude_pos + strlen(kDriverFilePrelude);
  const size_t version_end =
      driver_version_file_contents.find(' ', version_begin);
  const size_t version_length =
      version_end == string::npos ? string::npos : version_end - version_begin;
  const auto raw_version =
      driver_version_file_contents.substr(version_begin, version_length);

  // TODO(b/22689637): Eliminate the explicit namespace if possible.
  auto trimmed_version = absl::StripSuffix(raw_version, ".ld64");
  return rocm::StringToDriverVersion(string(trimmed_version));
}
|
|
|
|
// Compares the DSO-reported and kernel-reported driver versions. Logs an INFO
// message when both are available and equal; otherwise logs an ERROR that
// includes the printable form of each (version or error status).
void Diagnostician::WarnOnDsoKernelMismatch(
    port::StatusOr<DriverVersion> dso_version,
    port::StatusOr<DriverVersion> kernel_version) {
  // The values are only dereferenced after both statuses are known to be OK.
  const bool versions_match =
      kernel_version.ok() && dso_version.ok() &&
      dso_version.ValueOrDie() == kernel_version.ValueOrDie();

  if (!versions_match) {
    LOG(ERROR) << "kernel version "
               << rocm::DriverVersionStatusToString(kernel_version)
               << " does not match DSO version "
               << rocm::DriverVersionStatusToString(dso_version)
               << " -- cannot find working devices in this configuration";
    return;
  }

  LOG(INFO) << "kernel version seems to match DSO: "
            << rocm::DriverVersionToString(kernel_version.ValueOrDie());
}
|
|
|
|
// Kernel driver version lookup is not implemented here; this always returns an
// UNIMPLEMENTED status.
port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
  return port::Status{port::error::UNIMPLEMENTED,
                      "kernel reported driver version not implemented"};
}
|
|
|
|
} // namespace gpu
|
|
} // namespace stream_executor
|