[SE] [XLA:GPU] Inform --xla_hlo_profile of the GPU's memory bandwidth.

Add a memory_bandwidth() property to StreamExecutor's DeviceDescription,
and use this in the GPU's --xla_hlo_profile.

PiperOrigin-RevId: 189157407
This commit is contained in:
Justin Lebar 2018-03-15 02:22:17 -07:00 committed by TensorFlower Gardener
parent 9037e241de
commit b08c542710
6 changed files with 48 additions and 1 deletions

View File

@ -671,6 +671,8 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
if (module->config().hlo_profiling_enabled()) {
HloCostAnalysis cost_analysis(ShapeSizeBytesFunction());
cost_analysis.set_bytes_per_second(
stream_exec->GetDeviceDescription().memory_bandwidth());
TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis));
profile_index_map = MakeUnique<HloProfileIndexMap>(*module);
profile_printer =

View File

@ -1503,6 +1503,19 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return true;
}
// Queries a single integer-valued attribute of `device` through
// cuDeviceGetAttribute. On success the attribute's value is returned;
// otherwise an INTERNAL status is returned whose message names the attribute,
// the device, and the driver's result.
/* static */ port::StatusOr<int> CUDADriver::GetDeviceAttribute(
    CUdevice_attribute attribute, CUdevice device) {
  int attribute_value;
  const CUresult query_result =
      cuDeviceGetAttribute(&attribute_value, attribute, device);
  if (query_result == CUDA_SUCCESS) {
    return attribute_value;
  }
  // The driver rejected the query; surface its result in the error message.
  return port::Status{
      port::error::INTERNAL,
      port::Printf("failed to get device attribute %d for device %d: %s",
                   attribute, device, ToString(query_result).c_str())};
}
/* static */ bool CUDADriver::IsEccEnabled(CUdevice device, bool *result) {
int value = -1;
CUresult res =

View File

@ -400,12 +400,20 @@ class CUDADriver {
// Returns a grab-bag of device properties in a caller-owned device_properties
// structure for device_ordinal via cuDeviceGetProperties.
//
// This call is deprecated in the NVIDIA driver API; its replacement is
// GetDeviceAttribute.
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1g65a5b4e25186bd257df80b98c98cffe6
static bool GetDeviceProperties(CUdevprop *device_properties,
int device_ordinal);
// Gets a specific integer-valued property about the given device.
// Returns the attribute's value, or an error status if the underlying
// cuDeviceGetAttribute call does not succeed.
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
static port::StatusOr<int> GetDeviceAttribute(CUdevice_attribute attribute,
CUdevice device);
// Returns whether ECC is enabled for the given CUdevice via
// cuDeviceGetAttribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266

View File

@ -1103,6 +1103,18 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
builder.set_device_memory_size(device_memory_size);
}
// Derive the device memory bandwidth from the memory clock rate and the
// global memory bus width. If either attribute query fails, the bandwidth is
// simply left unset on the builder.
port::StatusOr<int> mem_clock_khz = CUDADriver::GetDeviceAttribute(
    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device_ordinal_);
port::StatusOr<int> mem_bus_width_bits = CUDADriver::GetDeviceAttribute(
    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device_ordinal_);
if (mem_clock_khz.ok() && mem_bus_width_bits.ok()) {
  // Widen to 64 bits before multiplying so the product is computed in int64.
  const int64_t clock_khz = int64_t{mem_clock_khz.ValueOrDie()};
  const int64_t bus_width_bits = int64_t{mem_bus_width_bits.ValueOrDie()};
  // The factor of 2 is there because HBM is DDR memory: it gets two data bits
  // per data lane. The factor of 1000 converts kHz to Hz.
  const int64_t bits_per_second = 2 * clock_khz * 1000 * bus_width_bits;
  // Eight bits per byte: the builder takes bytes per second.
  builder.set_memory_bandwidth(bits_per_second / 8);
}
{
BlockDim block_dim_limit;
FillBlockDimLimit(&block_dim_limit);

View File

@ -50,6 +50,7 @@ DeviceDescription::DeviceDescription()
shared_memory_alloc_granularity_(1),
device_address_bits_(kUninitializedUint64),
device_memory_size_(kUninitializedUint64),
memory_bandwidth_(kUninitializedUint64),
shared_memory_per_core_(kUninitializedUint64),
shared_memory_per_block_(kUninitializedUint64),
clock_rate_ghz_(-1.0),
@ -85,6 +86,8 @@ std::unique_ptr<std::map<string, string>> DeviceDescription::ToMap() const {
result["Device Address Bits"] = port::StrCat(device_address_bits());
result["Device Memory Size"] =
port::HumanReadableNumBytes::ToString(device_memory_size());
result["Memory Bandwidth"] = port::StrCat(
port::HumanReadableNumBytes::ToString(memory_bandwidth_), "/s");
result["Shared Memory Per Core"] =
port::HumanReadableNumBytes::ToString(shared_memory_per_core_);

View File

@ -140,6 +140,11 @@ class DeviceDescription {
// Returns the device memory size in bytes. The field is initialized to
// kUninitializedUint64 by the DeviceDescription constructor and keeps that
// value until one is set (e.g. through
// DeviceDescriptionBuilder::set_device_memory_size).
uint64 device_memory_size() const { return device_memory_size_; }
// Returns the device's memory bandwidth in bytes/sec. (This is for
// reads/writes to/from the device's own memory, not for transfers between the
// host and device.)
//
// The field is initialized to kUninitializedUint64 by the DeviceDescription
// constructor and keeps that value until one is set (e.g. through
// DeviceDescriptionBuilder::set_memory_bandwidth), so callers may observe the
// sentinel rather than a real bandwidth.
uint64 memory_bandwidth() const { return memory_bandwidth_; }
// Returns the device's core clock rate in GHz. The field is initialized to
// -1.0 by the DeviceDescription constructor.
float clock_rate_ghz() const { return clock_rate_ghz_; }
@ -212,6 +217,7 @@ class DeviceDescription {
uint64 device_address_bits_;
uint64 device_memory_size_;
uint64 memory_bandwidth_;
// Shared memory limits on a given device.
uint64 shared_memory_per_core_;
@ -305,6 +311,9 @@ class DeviceDescriptionBuilder {
// Stores `value` as the description's device memory size (documented on the
// accessor as being in bytes).
void set_device_memory_size(uint64 value) {
device_description_->device_memory_size_ = value;
}
// Stores `value` as the description's memory bandwidth (documented on the
// accessor as being in bytes/sec).
void set_memory_bandwidth(uint64 value) {
device_description_->memory_bandwidth_ = value;
}
void set_shared_memory_per_core(int64 value) {
device_description_->shared_memory_per_core_ = value;