Move DeviceMemoryAllocator and OwningDeviceMemory from XLA to StreamExecutor.

This change achieves three goals:

1. There are currently three different allocator abstractions in three
   different places: XLA, StreamExecutor, and TensorFlow. This change shrinks
   the number of packages with an allocator abstraction to two.
2. Moving the allocator enables unifying ScopedDeviceMemory and
   OwningDeviceMemory, which both have "owning pointer" semantics but slightly
   different APIs.
3. Moving the allocator enables moving RedzoneAllocator into StreamExecutor,
   which we would like to use in TensorFlow to catch out-of-bounds writes in
   cuDNN convolutions during autotuning.

PiperOrigin-RevId: 247211996
commit 0410cff073
parent 3ae5e7d3b9
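
For readers skimming the diff below: the change is almost entirely a mechanical
namespace move. The allocator types keep their names and member functions but
now live in stream_executor (spelled se:: under TensorFlow's usual alias)
rather than xla::, and their headers move from
tensorflow/compiler/xla/service/ to tensorflow/stream_executor/. A minimal
caller-side sketch, assuming the common "namespace se = stream_executor;"
alias; the AllocateScratch helper is hypothetical and not part of this commit,
but the Allocate signature and TF_ASSIGN_OR_RETURN usage match the code in the
diff:

    // Post-move spelling of an allocation. Before this commit the types were
    // xla::DeviceMemoryAllocator and xla::OwningDeviceMemory.
    #include "tensorflow/compiler/xla/status_macros.h"
    #include "tensorflow/compiler/xla/statusor.h"
    #include "tensorflow/stream_executor/device_memory_allocator.h"
    #include "tensorflow/stream_executor/owning_device_memory.h"

    namespace se = stream_executor;

    // Hypothetical helper, for illustration only.
    xla::StatusOr<se::OwningDeviceMemory> AllocateScratch(
        se::DeviceMemoryAllocator* allocator, int device_ordinal, uint64 size) {
      TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory buffer,
                          allocator->Allocate(device_ordinal, size,
                                              /*retry_on_failure=*/false));
      // OwningDeviceMemory has "owning pointer" semantics: if `buffer` goes
      // out of scope without being released, the memory is returned to
      // `allocator`.
      return std::move(buffer);
    }
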
@@ -262,7 +262,6 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
@@ -270,6 +269,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/memory",
@@ -62,7 +62,7 @@ XlaPlatformInfo PlatformInfoFromContext(OpKernelConstruction* ctx) {
   se::Platform::Id platform_id = nullptr;
   const XlaDevice::Metadata* xla_device_metadata = nullptr;
   std::unique_ptr<XlaAllocator> xla_allocator;
-  xla::DeviceMemoryAllocator* device_allocator = nullptr;
+  se::DeviceMemoryAllocator* device_allocator = nullptr;

   if (ctx->device_type() == DeviceType(DEVICE_CPU)) {
     platform_id = se::host::kHostPlatformId;
@@ -40,7 +40,7 @@ class XlaPlatformInfo {
                   se::Platform::Id platform_id,
                   const XlaDevice::Metadata* xla_device_metadata,
                   std::unique_ptr<XlaAllocator> xla_allocator,
-                  xla::DeviceMemoryAllocator* device_allocator)
+                  se::DeviceMemoryAllocator* device_allocator)
       : device_type_(device_type),
         platform_id_(platform_id),
         xla_device_metadata_(xla_device_metadata),
@@ -55,7 +55,7 @@ class XlaPlatformInfo {
     return xla_device_metadata_ && xla_device_metadata_->UseMultipleStreams();
   }

-  xla::DeviceMemoryAllocator* allocator() const {
+  se::DeviceMemoryAllocator* allocator() const {
     return device_allocator_ ? device_allocator_ : xla_allocator_.get();
   }
   DeviceType device_type() const { return device_type_; }
@@ -86,7 +86,7 @@ class XlaPlatformInfo {
   // then device_allocator_ is null and xla_allocator_ points to an appropriate
   // XlaAllocator instance.
   std::unique_ptr<XlaAllocator> xla_allocator_;
-  xla::DeviceMemoryAllocator* device_allocator_;
+  se::DeviceMemoryAllocator* device_allocator_;

   TF_DISALLOW_COPY_AND_ASSIGN(XlaPlatformInfo);
 };
@@ -168,11 +168,11 @@ Status SnapshotResourceVariables(OpKernelContext* ctx,
 }

 XlaAllocator::XlaAllocator(const se::Platform* platform, Allocator* wrapped)
-    : xla::DeviceMemoryAllocator(platform), wrapped_(wrapped) {}
+    : se::DeviceMemoryAllocator(platform), wrapped_(wrapped) {}

 XlaAllocator::~XlaAllocator() {}

-xla::StatusOr<xla::OwningDeviceMemory> XlaAllocator::Allocate(
+xla::StatusOr<se::OwningDeviceMemory> XlaAllocator::Allocate(
     int device_ordinal, uint64 size, bool retry_on_failure) {
   AllocationAttributes attrs;
   attrs.no_retry_on_failure = !retry_on_failure;
@@ -184,7 +184,7 @@ xla::StatusOr<xla::OwningDeviceMemory> XlaAllocator::Allocate(
           "Out of memory while trying to allocate ", size, " bytes.");
     }
   }
-  return xla::OwningDeviceMemory(se::DeviceMemoryBase(data, size),
+  return se::OwningDeviceMemory(se::DeviceMemoryBase(data, size),
                                 device_ordinal, this);
 }

@@ -194,7 +194,7 @@ Status XlaAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) {
 }

 XlaComputationLaunchContext::XlaComputationLaunchContext(
-    xla::LocalClient* client, xla::DeviceMemoryAllocator* xla_allocator,
+    xla::LocalClient* client, se::DeviceMemoryAllocator* xla_allocator,
     bool allocate_xla_tensors, bool use_multiple_streams)
     : client_(client),
       xla_allocator_(xla_allocator),
@@ -374,7 +374,7 @@ Status XlaComputationLaunchContext::PopulateOutputs(
     } else {
       Tensor output_tensor = XlaTensorBuffer::MakeTensor(
           ctx->expected_output_dtype(i), shape, buffer, allocator);
-      output.set_buffer(xla::OwningDeviceMemory(), {output_num});
+      output.set_buffer(se::OwningDeviceMemory(), {output_num});
       ctx->set_output(i, output_tensor);
     }
     ++output_num;
@@ -435,7 +435,7 @@ Status XlaComputationLaunchContext::PopulateOutputs(
       *variable_infos[i].var()->tensor() = output_tensor;
     } else {
       se::DeviceMemoryBase buffer = output.buffer({output_num});
-      output.set_buffer(xla::OwningDeviceMemory(), {output_num});
+      output.set_buffer(se::OwningDeviceMemory(), {output_num});
       Tensor output_tensor = XlaTensorBuffer::MakeTensor(
           write.type, write.shape, buffer, allocator);
       *variable_infos[i].var()->tensor() = output_tensor;
@@ -23,14 +23,14 @@ limitations under the License.
 #include "tensorflow/compiler/jit/xla_tensor.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
-#include "tensorflow/compiler/xla/service/owning_device_memory.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/resource_var.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"

 namespace tensorflow {
 class XlaAllocator;
@@ -108,11 +108,11 @@ Status LockVariables(absl::Span<VariableInfo> variables)
 // Adapter class that wraps a Tensorflow allocator as an XLA allocator.
 // Assumes that the Tensorflow allocator permits asynchronous deallocation:
 // see comment on `AllowsAsynchronousDeallocation()`.
-class XlaAllocator : public xla::DeviceMemoryAllocator {
+class XlaAllocator : public se::DeviceMemoryAllocator {
  public:
   XlaAllocator(const se::Platform* platform, Allocator* wrapped);
   ~XlaAllocator() override;
-  xla::StatusOr<xla::OwningDeviceMemory> Allocate(
+  xla::StatusOr<se::OwningDeviceMemory> Allocate(
       int device_ordinal, uint64 size, bool retry_on_failure) override;
   Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override;

@@ -142,7 +142,7 @@ class XlaComputationLaunchContext {
   // because we track inter-stream dependencies through events inside XlaTensor
   // objects.
   XlaComputationLaunchContext(xla::LocalClient* client,
-                              xla::DeviceMemoryAllocator* xla_allocator,
+                              se::DeviceMemoryAllocator* xla_allocator,
                               bool allocate_xla_tensors,
                               bool use_multiple_streams);

@@ -186,7 +186,7 @@ class XlaComputationLaunchContext {

  private:
   xla::LocalClient* client_;
-  xla::DeviceMemoryAllocator* xla_allocator_;
+  se::DeviceMemoryAllocator* xla_allocator_;
   bool allocate_xla_tensors_;
   bool use_multiple_streams_;
   std::vector<std::unique_ptr<xla::ShapedBuffer>> arg_buffers_;
@@ -59,7 +59,7 @@ Status XlaTensor::AllocateShapedBuffer(DataType dtype,
         xla::ShapeUtil::GetSubshape(on_device_shape, index_to_buffer.first);
     uint64 size =
         client->backend().transfer_manager()->GetByteSizeRequirement(subshape);
-    TF_ASSIGN_OR_RETURN(xla::OwningDeviceMemory buffer,
+    TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory buffer,
                         client->backend().memory_allocator()->Allocate(
                             device_ordinal, size, /*retry_on_failure=*/false));
     // Move our buffer into shaped_buffer, which takes ownership of it.
@@ -339,7 +339,7 @@ class XlaCompiler {
     // here, but on some devices (notably, GPUs), TensorFlow tends to eagerly
     // allocate most or all available memory on the device, leaving none for the
     // compiler to access, unless it can use TensorFlow's allocator.
-    xla::DeviceMemoryAllocator* device_allocator = nullptr;
+    se::DeviceMemoryAllocator* device_allocator = nullptr;
   };

   explicit XlaCompiler(Options options);
@@ -96,7 +96,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla:xla_proto",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:optional",
@@ -117,7 +117,6 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:backend",
         "//tensorflow/compiler/xla/service:compiler",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:dump",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:hlo_proto",
@@ -126,6 +125,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:source_map_util",
         "//tensorflow/compiler/xla/service:stream_pool",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:span",
         "@llvm//:support",
@@ -165,11 +165,11 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:backend",
         "//tensorflow/compiler/xla/service:compile_only_service",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:optional",
     ],
@@ -31,7 +31,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/compile_only_client.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/service/compile_only_service.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -39,6 +38,7 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"

 namespace xla {

@@ -22,12 +22,12 @@ limitations under the License.
 namespace xla {

 ExecutableBuildOptions& ExecutableBuildOptions::set_device_allocator(
-    DeviceMemoryAllocator* allocator) {
+    se::DeviceMemoryAllocator* allocator) {
   device_allocator_ = allocator;
   return *this;
 }

-DeviceMemoryAllocator* ExecutableBuildOptions::device_allocator() const {
+se::DeviceMemoryAllocator* ExecutableBuildOptions::device_allocator() const {
   return device_allocator_;
 }

@@ -18,11 +18,11 @@ limitations under the License.

 #include "absl/strings/string_view.h"
 #include "absl/types/optional.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla.pb.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"

 namespace xla {

@@ -57,11 +57,11 @@ class ExecutableBuildOptions {
   // want to run various algorithms on the device and pick the fastest one -- it
   // might allocate buffers for use by these algorithms using this allocator.
   //
-  // This does not need to be the same as the DeviceMemoryAllocator passed when
-  // running the executable.
+  // This does not need to be the same as the se::DeviceMemoryAllocator passed
+  // when running the executable.
   ExecutableBuildOptions& set_device_allocator(
-      DeviceMemoryAllocator* allocator);
-  DeviceMemoryAllocator* device_allocator() const;
+      se::DeviceMemoryAllocator* allocator);
+  se::DeviceMemoryAllocator* device_allocator() const;

   // Returns a string representation of the build options, suitable for
   // debugging.
@@ -77,7 +77,7 @@ class ExecutableBuildOptions {
   Shape result_layout_;
   bool result_layout_set_ = false;
   absl::optional<DebugOptions> debug_options_;
-  DeviceMemoryAllocator* device_allocator_ = nullptr;
+  se::DeviceMemoryAllocator* device_allocator_ = nullptr;
   int num_replicas_ = 1;
 };

@@ -279,7 +279,7 @@ StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Compile(

 StatusOr<ScopedShapedBuffer> LocalClient::LiteralToShapedBuffer(
     const LiteralSlice& literal, int device_ordinal,
-    DeviceMemoryAllocator* allocator) {
+    se::DeviceMemoryAllocator* allocator) {
   if (allocator == nullptr) {
     allocator = backend().memory_allocator();
   }
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
@@ -32,6 +31,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"

 namespace xla {

@@ -137,7 +137,7 @@ class LocalClient : public Client {
   // device is used.
   StatusOr<ScopedShapedBuffer> LiteralToShapedBuffer(
       const LiteralSlice& literal, int device_ordinal,
-      DeviceMemoryAllocator* allocator = nullptr);
+      se::DeviceMemoryAllocator* allocator = nullptr);

   // Transfer the BorrowingLiteral to the device with the given ordinal.
   StatusOr<TransferToServerResponse> TransferToLocalServer(
@@ -26,12 +26,13 @@ ExecutableRunOptions& ExecutableRunOptions::set_device_ordinal(
 int ExecutableRunOptions::device_ordinal() const { return device_ordinal_; }

 ExecutableRunOptions& ExecutableRunOptions::set_allocator(
-    DeviceMemoryAllocator* allocator) {
+    stream_executor::DeviceMemoryAllocator* allocator) {
   allocator_ = allocator;
   return *this;
 }

-DeviceMemoryAllocator* ExecutableRunOptions::allocator() const {
+stream_executor::DeviceMemoryAllocator* ExecutableRunOptions::allocator()
+    const {
   return allocator_;
 }

@@ -23,6 +23,7 @@ limitations under the License.
 namespace stream_executor {
 class Stream;
 class Platform;
+class DeviceMemoryAllocator;
 }  // namespace stream_executor

 namespace Eigen {
@@ -31,7 +32,6 @@ struct ThreadPoolDevice;

 namespace xla {

-class DeviceMemoryAllocator;
 class DeviceAssignment;
 class ExecutionProfile;

@@ -39,8 +39,9 @@ class ExecutionProfile;
 class ExecutableRunOptions {
  public:
   // Specifies the allocator to use during execution.
-  ExecutableRunOptions& set_allocator(DeviceMemoryAllocator* allocator);
-  DeviceMemoryAllocator* allocator() const;
+  ExecutableRunOptions& set_allocator(
+      stream_executor::DeviceMemoryAllocator* allocator);
+  stream_executor::DeviceMemoryAllocator* allocator() const;

   // If set, this is the device to run the computation on. Valid device_ordinal
   // values are: 0 to # of devices - 1. These values are identical to the device
@@ -87,7 +88,7 @@ class ExecutableRunOptions {
   int rng_seed() const;

  private:
-  DeviceMemoryAllocator* allocator_ = nullptr;
+  stream_executor::DeviceMemoryAllocator* allocator_ = nullptr;
   int device_ordinal_ = -1;
   const DeviceAssignment* device_assignment_ = nullptr;
   stream_executor::Stream* stream_ = nullptr;
@@ -67,8 +67,8 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/core:lib",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/types:optional",
         "@pybind11",
@@ -109,9 +109,9 @@ cc_library(
     hdrs = ["shared_device_buffer.h"],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/compiler/xla/service:transfer_manager",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
@@ -178,7 +178,7 @@ tf_pybind_extension(
         "//tensorflow/compiler/xla/client/lib:self_adjoint_eig",
         "//tensorflow/compiler/xla/client/lib:svd",
         "//tensorflow/compiler/xla/service:computation_placer",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_graph_dumper",
         "//tensorflow/compiler/xla/service:name_uniquer",
@@ -212,7 +212,7 @@ StatusOr<pybind11::object> PyLocalClient::TransferFromOutfeed(
 static StatusOr<PyLocalBuffer> TransferHostToDeviceAsync(
     const PythonBufferTree& tree, int device_ordinal, PyLocalClient* client,
     const Device& device) {
-  DeviceMemoryAllocator* allocator =
+  se::DeviceMemoryAllocator* allocator =
       client->client()->backend().memory_allocator();
   TransferManager* transfer_manager =
       client->client()->backend().transfer_manager();
@@ -367,7 +367,7 @@ PyLocalBuffer::FromPythonValues(
     host_shapes.push_back(buffer.on_host_shape());
     device_buffers.push_back(buffer.device_buffer());
   }
-  DeviceMemoryAllocator* allocator =
+  se::DeviceMemoryAllocator* allocator =
       client->client()->backend().memory_allocator();
   TransferManager* transfer_manager =
       client->client()->backend().transfer_manager();
@@ -15,7 +15,7 @@ limitations under the License.

 #include "tensorflow/compiler/xla/python/shared_device_buffer.h"

-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"

 namespace xla {

@@ -47,13 +47,13 @@ void BufferDefinitionEvent::WaitForEventOnStream(se::Stream* stream) {
 static std::shared_ptr<PySharedDeviceBuffer>
 BufferFromScopedShapedBufferIterator(
     const Shape& on_device_shape, int device_ordinal,
-    DeviceMemoryAllocator* allocator,
+    se::DeviceMemoryAllocator* allocator,
     ShapeTree<se::DeviceMemoryBase>::iterator* iterator,
     const ShapeTree<se::DeviceMemoryBase>::iterator& end,
     const std::shared_ptr<BufferDefinitionEvent>& definition_event) {
   CHECK(*iterator != end);

-  OwningDeviceMemory device_memory((*iterator)->second, device_ordinal,
+  se::OwningDeviceMemory device_memory((*iterator)->second, device_ordinal,
                                        allocator);
   (*iterator)->second = se::DeviceMemoryBase();
   ++*iterator;
@@ -90,7 +90,7 @@ PySharedDeviceBuffer::FromScopedShapedBuffer(
 /* static */ StatusOr<std::shared_ptr<PySharedDeviceBuffer>>
 PySharedDeviceBuffer::MakeTuple(
     std::vector<std::shared_ptr<PySharedDeviceBuffer>> children,
-    TransferManager* transfer_manager, DeviceMemoryAllocator* allocator,
+    TransferManager* transfer_manager, se::DeviceMemoryAllocator* allocator,
     int device_ordinal,
     std::shared_ptr<BufferDefinitionEvent> definition_event) {
   std::vector<Shape> child_shapes;
@@ -102,7 +102,7 @@ PySharedDeviceBuffer::MakeTuple(

   Shape shape = ShapeUtil::MakeTupleShape(child_shapes);
   TF_ASSIGN_OR_RETURN(
-      OwningDeviceMemory device_memory,
+      se::OwningDeviceMemory device_memory,
       allocator->Allocate(device_ordinal,
                           transfer_manager->GetByteSizeRequirement(shape)));
   return std::make_shared<PySharedDeviceBuffer>(
@@ -113,10 +113,10 @@ PySharedDeviceBuffer::MakeTuple(
 /* static */ StatusOr<std::shared_ptr<PySharedDeviceBuffer>>
 PySharedDeviceBuffer::MakeArray(
     Shape on_device_shape, TransferManager* transfer_manager,
-    DeviceMemoryAllocator* allocator, int device_ordinal,
+    se::DeviceMemoryAllocator* allocator, int device_ordinal,
     std::shared_ptr<BufferDefinitionEvent> definition_event) {
   TF_ASSIGN_OR_RETURN(
-      OwningDeviceMemory device_memory,
+      se::OwningDeviceMemory device_memory,
       allocator->Allocate(
           device_ordinal,
           transfer_manager->GetByteSizeRequirement(on_device_shape)));
@@ -153,7 +153,7 @@ ShapedBuffer PySharedDeviceBuffer::AsShapedBuffer(
 }

 PySharedDeviceBuffer::PySharedDeviceBuffer(
-    Shape on_device_shape, OwningDeviceMemory device_memory,
+    Shape on_device_shape, se::OwningDeviceMemory device_memory,
     std::vector<std::shared_ptr<PySharedDeviceBuffer>> children,
     std::shared_ptr<BufferDefinitionEvent> definition_event)
     : on_device_shape_(std::move(on_device_shape)),
@@ -17,11 +17,11 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_PYTHON_SHARED_DEVICE_BUFFER_H_

 #include "absl/container/flat_hash_set.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
-#include "tensorflow/compiler/xla/service/owning_device_memory.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/shape.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"

 namespace xla {

@@ -93,14 +93,14 @@ class PySharedDeviceBuffer {
   // Makes a tuple buffer. Does not initialize the tuple table.
   static StatusOr<std::shared_ptr<PySharedDeviceBuffer>> MakeTuple(
       std::vector<std::shared_ptr<PySharedDeviceBuffer>> children,
-      TransferManager* transfer_manager, DeviceMemoryAllocator* allocator,
+      TransferManager* transfer_manager, se::DeviceMemoryAllocator* allocator,
       int device_ordinal,
       std::shared_ptr<BufferDefinitionEvent> definition_event);

   // Makes an uninitialized array buffer.
   static StatusOr<std::shared_ptr<PySharedDeviceBuffer>> MakeArray(
       Shape on_device_shape, TransferManager* transfer_manager,
-      DeviceMemoryAllocator* allocator, int device_ordinal,
+      se::DeviceMemoryAllocator* allocator, int device_ordinal,
       std::shared_ptr<BufferDefinitionEvent> definition_event);

   // Builds a ShapedBuffer view onto the buffers of 'tree'. Since
@@ -113,7 +113,7 @@ class PySharedDeviceBuffer {
   const std::vector<std::shared_ptr<PySharedDeviceBuffer>>& children() const {
     return children_;
   }
-  const OwningDeviceMemory& device_memory() const { return device_memory_; }
+  const se::OwningDeviceMemory& device_memory() const { return device_memory_; }
   int device_ordinal() const { return device_memory_.device_ordinal(); }
   const std::shared_ptr<BufferDefinitionEvent> definition_event() const {
     return definition_event_;
@@ -121,7 +121,7 @@ class PySharedDeviceBuffer {

   PySharedDeviceBuffer() = default;
   PySharedDeviceBuffer(
-      Shape on_device_shape, OwningDeviceMemory device_memory,
+      Shape on_device_shape, se::OwningDeviceMemory device_memory,
       std::vector<std::shared_ptr<PySharedDeviceBuffer>> children,
       std::shared_ptr<BufferDefinitionEvent> definition_event);

@@ -130,7 +130,7 @@ class PySharedDeviceBuffer {
   // one-to-one with the tree of device buffers, so to avoid representational
   // awkwardness we maintain on-host shapes separately.
   Shape on_device_shape_;
-  OwningDeviceMemory device_memory_;
+  se::OwningDeviceMemory device_memory_;
   std::vector<std::shared_ptr<PySharedDeviceBuffer>> children_;

   // An event that is triggered when the content of one or more buffers is
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/python/types.h"

 #include "absl/container/flat_hash_map.h"
-#include "tensorflow/compiler/xla/service/owning_device_memory.h"
 #include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"

 namespace xla {

@@ -437,10 +437,10 @@ tf_cc_test(
     srcs = ["pattern_matcher_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_parser",
         ":pattern_matcher",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
         "@com_google_absl//absl/strings",
@@ -508,8 +508,8 @@ cc_library(
     hdrs = ["hlo_matchers.h"],
     deps = [
         ":hlo",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
@@ -552,13 +552,13 @@ tf_cc_test(
     srcs = ["hlo_sharding_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:protobuf_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
@@ -586,6 +586,7 @@ tf_cc_test(
     srcs = ["call_graph_test.cc"],
     deps = [
         ":call_graph",
+        ":hlo",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -593,7 +594,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -656,6 +656,7 @@ tf_cc_test(
     deps = [
         ":call_graph",
         ":flatten_call_graph",
+        ":hlo",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -663,7 +664,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -694,7 +694,6 @@ cc_library(
     deps = [
         ":compiler",
         ":computation_placer",
-        ":device_memory_allocator",
         ":platform_util",
         ":stream_pool",
         ":transfer_manager",
@@ -704,6 +703,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "//third_party/eigen3",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
@@ -724,7 +724,6 @@ cc_library(
         ":compiler",
         ":computation_layout",
         ":computation_placer",
-        ":device_memory_allocator",
         ":dump",
         ":dynamic_dimension_inference",
         ":executable",
@@ -754,6 +753,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:ptr_util",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -770,7 +770,6 @@ cc_library(
         ":backend",
         ":compiler",
         ":computation_layout",
-        ":device_memory_allocator",
         ":executable",
         ":hlo",
         ":hlo_execution_profile",
@@ -790,6 +789,7 @@ cc_library(
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -858,7 +858,6 @@ cc_library(
     srcs = ["shaped_buffer.cc"],
     hdrs = ["shaped_buffer.h"],
     deps = [
-        ":device_memory_allocator",
         "//tensorflow/compiler/xla:shape_tree",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -868,6 +867,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -881,7 +881,6 @@ tf_cc_test(
     srcs = ["shaped_buffer_test.cc"],
    deps = [
         ":cpu_plugin",
-        ":device_memory_allocator",
         ":platform_util",
         ":shaped_buffer",
         "//tensorflow/compiler/xla:shape_util",
@@ -891,6 +890,7 @@ tf_cc_test(
         "//tensorflow/core:ptr_util",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -904,7 +904,6 @@ cc_library(
     ],
     deps = [
         ":computation_layout",
-        ":device_memory_allocator",
         ":dump",
         ":hlo",
         ":hlo_execution_profile",
@@ -925,6 +924,7 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:span",
@@ -991,7 +991,6 @@ cc_library(
     hdrs = ["allocation_tracker.h"],
     deps = [
         ":backend",
-        ":device_memory_allocator",
         ":transfer_manager",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -1000,6 +999,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -1159,6 +1159,7 @@ tf_cc_test(
         ":hlo",
         ":hlo_memory_scheduler",
         ":hlo_ordering",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
@@ -1166,7 +1167,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -1208,10 +1208,10 @@ tf_cc_test(
         ":hlo_dataflow_analysis",
         ":hlo_memory_scheduler",
         ":hlo_ordering",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -1458,8 +1458,8 @@ tf_cc_test(
     srcs = ["instruction_fusion_test.cc"],
     deps = [
         ":hlo_matchers",
+        ":hlo_parser",
         ":instruction_fusion",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
@@ -1470,11 +1470,11 @@ cc_library(
     srcs = ["multi_output_fusion.cc"],
     hdrs = ["multi_output_fusion.h"],
     deps = [
+        ":hlo",
+        ":hlo_pass",
         ":hlo_reachability",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
@@ -1791,8 +1791,8 @@ tf_cc_test(
     srcs = ["gather_expander_test.cc"],
     deps = [
         ":gather_expander",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:test_macros_header",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
     ],
@@ -1890,9 +1890,9 @@ tf_cc_test(
     name = "while_loop_analysis_test",
     srcs = ["while_loop_analysis_test.cc"],
     deps = [
+        ":hlo_parser",
         ":while_loop_analysis",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -2297,7 +2297,7 @@ tf_cc_test(
         ":cpu_plugin",
         ":hlo_cost_analysis",
         ":hlo_execution_profile",
-        "//tensorflow/compiler/xla/service:hlo_parser",
+        ":hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -2310,14 +2310,14 @@ tf_cc_test(
     srcs = ["hlo_computation_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_matchers",
+        ":hlo_parser",
         ":pattern_matcher",
         ":pattern_matcher_gmock",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
-        "//tensorflow/compiler/xla/service:hlo_matchers",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -2522,13 +2522,13 @@ tf_cc_test(
     deps = [
         ":hlo",
         ":hlo_liveness_analysis",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
|
"//tensorflow/compiler/xla:test_helpers",
|
||||||
"//tensorflow/compiler/xla:xla_data_proto",
|
"//tensorflow/compiler/xla:xla_data_proto",
|
||||||
"//tensorflow/compiler/xla/service:hlo_parser",
|
|
||||||
"//tensorflow/compiler/xla/tests:hlo_test_base",
|
"//tensorflow/compiler/xla/tests:hlo_test_base",
|
||||||
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
|
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
|
||||||
"//tensorflow/core:lib",
|
"//tensorflow/core:lib",
|
||||||
@ -2912,12 +2912,12 @@ tf_cc_test(
|
|||||||
deps = [
|
deps = [
|
||||||
":hlo",
|
":hlo",
|
||||||
":hlo_module_dce",
|
":hlo_module_dce",
|
||||||
|
":hlo_parser",
|
||||||
"//tensorflow/compiler/xla:literal",
|
"//tensorflow/compiler/xla:literal",
|
||||||
"//tensorflow/compiler/xla:shape_util",
|
"//tensorflow/compiler/xla:shape_util",
|
||||||
"//tensorflow/compiler/xla:types",
|
"//tensorflow/compiler/xla:types",
|
||||||
"//tensorflow/compiler/xla:util",
|
"//tensorflow/compiler/xla:util",
|
||||||
"//tensorflow/compiler/xla:xla_data_proto",
|
"//tensorflow/compiler/xla:xla_data_proto",
|
||||||
"//tensorflow/compiler/xla/service:hlo_parser",
|
|
||||||
"//tensorflow/compiler/xla/tests:hlo_test_base",
|
"//tensorflow/compiler/xla/tests:hlo_test_base",
|
||||||
"//tensorflow/compiler/xla/tests:literal_test_util",
|
"//tensorflow/compiler/xla/tests:literal_test_util",
|
||||||
"//tensorflow/compiler/xla/tests:test_utils",
|
"//tensorflow/compiler/xla/tests:test_utils",
|
||||||
@ -3043,12 +3043,12 @@ tf_cc_test(
|
|||||||
":hlo",
|
":hlo",
|
||||||
":hlo_cse",
|
":hlo_cse",
|
||||||
":hlo_matchers",
|
":hlo_matchers",
|
||||||
|
":hlo_parser",
|
||||||
"//tensorflow/compiler/xla:literal",
|
"//tensorflow/compiler/xla:literal",
|
||||||
"//tensorflow/compiler/xla:shape_util",
|
"//tensorflow/compiler/xla:shape_util",
|
||||||
"//tensorflow/compiler/xla:types",
|
"//tensorflow/compiler/xla:types",
|
||||||
"//tensorflow/compiler/xla:util",
|
"//tensorflow/compiler/xla:util",
|
||||||
"//tensorflow/compiler/xla:xla_data_proto",
|
"//tensorflow/compiler/xla:xla_data_proto",
|
||||||
"//tensorflow/compiler/xla/service:hlo_parser",
|
|
||||||
"//tensorflow/compiler/xla/tests:hlo_test_base",
|
"//tensorflow/compiler/xla/tests:hlo_test_base",
|
||||||
"//tensorflow/compiler/xla/tests:literal_test_util",
|
"//tensorflow/compiler/xla/tests:literal_test_util",
|
||||||
"//tensorflow/compiler/xla/tests:test_utils",
|
"//tensorflow/compiler/xla/tests:test_utils",
|
||||||
@ -3232,27 +3232,6 @@ tf_cc_test(
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
cc_library(
|
|
||||||
name = "device_memory_allocator",
|
|
||||||
srcs = [
|
|
||||||
"device_memory_allocator.cc",
|
|
||||||
"owning_device_memory.cc",
|
|
||||||
],
|
|
||||||
hdrs = [
|
|
||||||
"device_memory_allocator.h",
|
|
||||||
"owning_device_memory.h",
|
|
||||||
],
|
|
||||||
deps = [
|
|
||||||
"//tensorflow/compiler/xla:status_macros",
|
|
||||||
"//tensorflow/compiler/xla:statusor",
|
|
||||||
"//tensorflow/compiler/xla:types",
|
|
||||||
"//tensorflow/compiler/xla:util",
|
|
||||||
"//tensorflow/core:lib",
|
|
||||||
"//tensorflow/core:stream_executor_no_cuda",
|
|
||||||
"@com_google_absl//absl/types:span",
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
cc_library(
|
cc_library(
|
||||||
name = "maybe_owning_device_memory",
|
name = "maybe_owning_device_memory",
|
||||||
srcs = [
|
srcs = [
|
||||||
@ -3262,7 +3241,7 @@ cc_library(
|
|||||||
"maybe_owning_device_memory.h",
|
"maybe_owning_device_memory.h",
|
||||||
],
|
],
|
||||||
deps = [
|
deps = [
|
||||||
":device_memory_allocator",
|
"//tensorflow/stream_executor:device_memory_allocator",
|
||||||
"@com_google_absl//absl/types:optional",
|
"@com_google_absl//absl/types:optional",
|
||||||
"@com_google_absl//absl/types:variant",
|
"@com_google_absl//absl/types:variant",
|
||||||
],
|
],
|
||||||
@ -3305,10 +3284,10 @@ xla_test(
|
|||||||
"gpu",
|
"gpu",
|
||||||
],
|
],
|
||||||
deps = [
|
deps = [
|
||||||
|
":hlo_parser",
|
||||||
"//tensorflow/compiler/xla:execution_options_util",
|
"//tensorflow/compiler/xla:execution_options_util",
|
||||||
"//tensorflow/compiler/xla:status_macros",
|
"//tensorflow/compiler/xla:status_macros",
|
||||||
"//tensorflow/compiler/xla:test",
|
"//tensorflow/compiler/xla:test",
|
||||||
"//tensorflow/compiler/xla/service:hlo_parser",
|
|
||||||
"//tensorflow/compiler/xla/tests:client_library_test_base",
|
"//tensorflow/compiler/xla/tests:client_library_test_base",
|
||||||
"//tensorflow/compiler/xla/tests:hlo_test_base",
|
"//tensorflow/compiler/xla/tests:hlo_test_base",
|
||||||
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
|
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
|
||||||
@ -3431,6 +3410,7 @@ tf_cc_test(
|
|||||||
deps = [
|
deps = [
|
||||||
":hlo",
|
":hlo",
|
||||||
":hlo_matchers",
|
":hlo_matchers",
|
||||||
|
":hlo_parser",
|
||||||
":shape_inference",
|
":shape_inference",
|
||||||
":transpose_folding",
|
":transpose_folding",
|
||||||
"//tensorflow/compiler/xla:literal",
|
"//tensorflow/compiler/xla:literal",
|
||||||
@ -3439,7 +3419,6 @@ tf_cc_test(
|
|||||||
"//tensorflow/compiler/xla:test_helpers",
|
"//tensorflow/compiler/xla:test_helpers",
|
||||||
"//tensorflow/compiler/xla:xla_data_proto",
|
"//tensorflow/compiler/xla:xla_data_proto",
|
||||||
"//tensorflow/compiler/xla/client:xla_builder",
|
"//tensorflow/compiler/xla/client:xla_builder",
|
||||||
"//tensorflow/compiler/xla/service:hlo_parser",
|
|
||||||
"//tensorflow/compiler/xla/service/gpu:ir_emission_utils",
|
"//tensorflow/compiler/xla/service/gpu:ir_emission_utils",
|
||||||
"//tensorflow/compiler/xla/tests:hlo_test_base",
|
"//tensorflow/compiler/xla/tests:hlo_test_base",
|
||||||
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
|
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
|
||||||
@ -3682,10 +3661,10 @@ tf_cc_test(
|
|||||||
name = "tuple_util_test",
|
name = "tuple_util_test",
|
||||||
srcs = ["tuple_util_test.cc"],
|
srcs = ["tuple_util_test.cc"],
|
||||||
deps = [
|
deps = [
|
||||||
|
":hlo_matchers",
|
||||||
|
":hlo_parser",
|
||||||
":tuple_util",
|
":tuple_util",
|
||||||
"//tensorflow/compiler/xla:test",
|
"//tensorflow/compiler/xla:test",
|
||||||
"//tensorflow/compiler/xla/service:hlo_matchers",
|
|
||||||
"//tensorflow/compiler/xla/service:hlo_parser",
|
|
||||||
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
|
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@ -3711,11 +3690,11 @@ tf_cc_test(
|
|||||||
name = "while_util_test",
|
name = "while_util_test",
|
||||||
srcs = ["while_util_test.cc"],
|
srcs = ["while_util_test.cc"],
|
||||||
deps = [
|
deps = [
|
||||||
|
":hlo_matchers",
|
||||||
|
":hlo_parser",
|
||||||
":while_util",
|
":while_util",
|
||||||
"//tensorflow/compiler/xla:test",
|
"//tensorflow/compiler/xla:test",
|
||||||
"//tensorflow/compiler/xla:util",
|
"//tensorflow/compiler/xla:util",
|
||||||
"//tensorflow/compiler/xla/service:hlo_matchers",
|
|
||||||
"//tensorflow/compiler/xla/service:hlo_parser",
|
|
||||||
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
|
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
|
||||||
"@com_google_absl//absl/algorithm:container",
|
"@com_google_absl//absl/algorithm:container",
|
||||||
],
|
],
|
||||||
@ -3746,9 +3725,9 @@ tf_cc_test(
|
|||||||
srcs = ["while_loop_invariant_code_motion_test.cc"],
|
srcs = ["while_loop_invariant_code_motion_test.cc"],
|
||||||
deps = [
|
deps = [
|
||||||
":hlo_matchers",
|
":hlo_matchers",
|
||||||
|
":hlo_parser",
|
||||||
":while_loop_invariant_code_motion",
|
":while_loop_invariant_code_motion",
|
||||||
"//tensorflow/compiler/xla:test",
|
"//tensorflow/compiler/xla:test",
|
||||||
"//tensorflow/compiler/xla/service:hlo_parser",
|
|
||||||
"//tensorflow/compiler/xla/tests:hlo_test_base",
|
"//tensorflow/compiler/xla/tests:hlo_test_base",
|
||||||
"//tensorflow/core:test",
|
"//tensorflow/core:test",
|
||||||
],
|
],
|
||||||
@ -3774,9 +3753,9 @@ tf_cc_test(
|
|||||||
srcs = ["while_loop_constant_sinking_test.cc"],
|
srcs = ["while_loop_constant_sinking_test.cc"],
|
||||||
deps = [
|
deps = [
|
||||||
":hlo_matchers",
|
":hlo_matchers",
|
||||||
|
":hlo_parser",
|
||||||
":while_loop_constant_sinking",
|
":while_loop_constant_sinking",
|
||||||
"//tensorflow/compiler/xla:test",
|
"//tensorflow/compiler/xla:test",
|
||||||
"//tensorflow/compiler/xla/service:hlo_parser",
|
|
||||||
"//tensorflow/compiler/xla/tests:hlo_test_base",
|
"//tensorflow/compiler/xla/tests:hlo_test_base",
|
||||||
"//tensorflow/core:test",
|
"//tensorflow/core:test",
|
||||||
],
|
],
|
||||||
@ -3976,6 +3955,8 @@ cc_library(
|
|||||||
hdrs = ["ar_crs_combiner.h"],
|
hdrs = ["ar_crs_combiner.h"],
|
||||||
deps = [
|
deps = [
|
||||||
":call_graph",
|
":call_graph",
|
||||||
|
":hlo",
|
||||||
|
":hlo_pass",
|
||||||
":pattern_matcher",
|
":pattern_matcher",
|
||||||
"//tensorflow/compiler/xla:literal",
|
"//tensorflow/compiler/xla:literal",
|
||||||
"//tensorflow/compiler/xla:literal_util",
|
"//tensorflow/compiler/xla:literal_util",
|
||||||
@ -3983,8 +3964,6 @@ cc_library(
|
|||||||
"//tensorflow/compiler/xla:status_macros",
|
"//tensorflow/compiler/xla:status_macros",
|
||||||
"//tensorflow/compiler/xla:statusor",
|
"//tensorflow/compiler/xla:statusor",
|
||||||
"//tensorflow/compiler/xla:types",
|
"//tensorflow/compiler/xla:types",
|
||||||
"//tensorflow/compiler/xla/service:hlo",
|
|
||||||
"//tensorflow/compiler/xla/service:hlo_pass",
|
|
||||||
"@com_google_absl//absl/container:flat_hash_map",
|
"@com_google_absl//absl/container:flat_hash_map",
|
||||||
"@com_google_absl//absl/strings",
|
"@com_google_absl//absl/strings",
|
||||||
],
|
],
|
||||||
@ -4008,11 +3987,11 @@ cc_library(
|
|||||||
srcs = ["dynamic_index_splitter.cc"],
|
srcs = ["dynamic_index_splitter.cc"],
|
||||||
hdrs = ["dynamic_index_splitter.h"],
|
hdrs = ["dynamic_index_splitter.h"],
|
||||||
deps = [
|
deps = [
|
||||||
|
":hlo",
|
||||||
":hlo_casting_utils",
|
":hlo_casting_utils",
|
||||||
|
":hlo_pass",
|
||||||
"//tensorflow/compiler/xla:shape_util",
|
"//tensorflow/compiler/xla:shape_util",
|
||||||
"//tensorflow/compiler/xla:statusor",
|
"//tensorflow/compiler/xla:statusor",
|
||||||
"//tensorflow/compiler/xla/service:hlo",
|
|
||||||
"//tensorflow/compiler/xla/service:hlo_pass",
|
|
||||||
"@com_google_absl//absl/container:flat_hash_map",
|
"@com_google_absl//absl/container:flat_hash_map",
|
||||||
"@com_google_absl//absl/container:flat_hash_set",
|
"@com_google_absl//absl/container:flat_hash_set",
|
||||||
"@com_google_absl//absl/container:inlined_vector",
|
"@com_google_absl//absl/container:inlined_vector",
|
||||||
|
@ -20,13 +20,13 @@ limitations under the License.
|
|||||||
#include "absl/memory/memory.h"
|
#include "absl/memory/memory.h"
|
||||||
#include "absl/strings/str_cat.h"
|
#include "absl/strings/str_cat.h"
|
||||||
#include "tensorflow/compiler/xla/map_util.h"
|
#include "tensorflow/compiler/xla/map_util.h"
|
||||||
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
|
|
||||||
#include "tensorflow/compiler/xla/service/transfer_manager.h"
|
#include "tensorflow/compiler/xla/service/transfer_manager.h"
|
||||||
#include "tensorflow/compiler/xla/shape_util.h"
|
#include "tensorflow/compiler/xla/shape_util.h"
|
||||||
#include "tensorflow/compiler/xla/status_macros.h"
|
#include "tensorflow/compiler/xla/status_macros.h"
|
||||||
#include "tensorflow/compiler/xla/types.h"
|
#include "tensorflow/compiler/xla/types.h"
|
||||||
#include "tensorflow/compiler/xla/util.h"
|
#include "tensorflow/compiler/xla/util.h"
|
||||||
#include "tensorflow/core/platform/logging.h"
|
#include "tensorflow/core/platform/logging.h"
|
||||||
|
#include "tensorflow/stream_executor/device_memory_allocator.h"
|
||||||
|
|
||||||
namespace xla {
|
namespace xla {
|
||||||
|
|
||||||
@ -221,7 +221,7 @@ void AllocationTracker::AddAllocationOrIncrementRefCount(
|
|||||||
auto it = allocation_map.find(device_memory.opaque());
|
auto it = allocation_map.find(device_memory.opaque());
|
||||||
if (it == allocation_map.end()) {
|
if (it == allocation_map.end()) {
|
||||||
allocation_map[device_memory.opaque()] = {
|
allocation_map[device_memory.opaque()] = {
|
||||||
OwningDeviceMemory(device_memory, device_ordinal,
|
se::OwningDeviceMemory(device_memory, device_ordinal,
|
||||||
backend_->memory_allocator()),
|
backend_->memory_allocator()),
|
||||||
/*ref_count=*/1};
|
/*ref_count=*/1};
|
||||||
} else {
|
} else {
|
||||||
|
@ -77,7 +77,7 @@ class AllocationTracker {
|
|||||||
// Data structure encapsulating single memory allocation on the device.
|
// Data structure encapsulating single memory allocation on the device.
|
||||||
struct Allocation {
|
struct Allocation {
|
||||||
// The pointer to this allocation.
|
// The pointer to this allocation.
|
||||||
OwningDeviceMemory device_memory;
|
se::OwningDeviceMemory device_memory;
|
||||||
|
|
||||||
// This is the number of times this memory allocation is referred to by
|
// This is the number of times this memory allocation is referred to by
|
||||||
// registered data handles.
|
// registered data handles.
|
||||||
|
@ -134,7 +134,7 @@ Backend::Backend(se::Platform* platform, Compiler* compiler,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Create a memory allocator for the valid stream executors.
|
// Create a memory allocator for the valid stream executors.
|
||||||
memory_allocator_ = absl::make_unique<StreamExecutorMemoryAllocator>(
|
memory_allocator_ = absl::make_unique<se::StreamExecutorMemoryAllocator>(
|
||||||
platform, stream_executors);
|
platform, stream_executors);
|
||||||
CHECK(!stream_executors_.empty())
|
CHECK(!stream_executors_.empty())
|
||||||
<< "Service found no devices for backend " << platform_->Name() << '.';
|
<< "Service found no devices for backend " << platform_->Name() << '.';
|
||||||
|
@ -27,7 +27,6 @@ limitations under the License.
|
|||||||
#include "absl/types/span.h"
|
#include "absl/types/span.h"
|
||||||
#include "tensorflow/compiler/xla/service/compiler.h"
|
#include "tensorflow/compiler/xla/service/compiler.h"
|
||||||
#include "tensorflow/compiler/xla/service/computation_placer.h"
|
#include "tensorflow/compiler/xla/service/computation_placer.h"
|
||||||
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
|
|
||||||
#include "tensorflow/compiler/xla/service/stream_pool.h"
|
#include "tensorflow/compiler/xla/service/stream_pool.h"
|
||||||
#include "tensorflow/compiler/xla/service/transfer_manager.h"
|
#include "tensorflow/compiler/xla/service/transfer_manager.h"
|
||||||
#include "tensorflow/compiler/xla/statusor.h"
|
#include "tensorflow/compiler/xla/statusor.h"
|
||||||
@ -35,6 +34,7 @@ limitations under the License.
|
|||||||
#include "tensorflow/core/platform/mutex.h"
|
#include "tensorflow/core/platform/mutex.h"
|
||||||
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
|
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
|
||||||
#include "tensorflow/core/platform/thread_annotations.h"
|
#include "tensorflow/core/platform/thread_annotations.h"
|
||||||
|
#include "tensorflow/stream_executor/device_memory_allocator.h"
|
||||||
|
|
||||||
namespace Eigen {
|
namespace Eigen {
|
||||||
struct ThreadPoolDevice;
|
struct ThreadPoolDevice;
|
||||||
@ -88,7 +88,7 @@ class Backend {
|
|||||||
// Accessors for the various objects.
|
// Accessors for the various objects.
|
||||||
se::Platform* platform() const { return platform_; }
|
se::Platform* platform() const { return platform_; }
|
||||||
Compiler* compiler() const { return compiler_; }
|
Compiler* compiler() const { return compiler_; }
|
||||||
DeviceMemoryAllocator* memory_allocator() const {
|
se::DeviceMemoryAllocator* memory_allocator() const {
|
||||||
return memory_allocator_.get();
|
return memory_allocator_.get();
|
||||||
}
|
}
|
||||||
TransferManager* transfer_manager() const { return transfer_manager_; }
|
TransferManager* transfer_manager() const { return transfer_manager_; }
|
||||||
@ -179,7 +179,7 @@ class Backend {
|
|||||||
stream_pools_ GUARDED_BY(mu_);
|
stream_pools_ GUARDED_BY(mu_);
|
||||||
|
|
||||||
// The default memory allocator to use.
|
// The default memory allocator to use.
|
||||||
std::unique_ptr<StreamExecutorMemoryAllocator> memory_allocator_;
|
std::unique_ptr<se::StreamExecutorMemoryAllocator> memory_allocator_;
|
||||||
|
|
||||||
// For the CPU backend, an Eigen threadpool device for use by Eigen code.
|
// For the CPU backend, an Eigen threadpool device for use by Eigen code.
|
||||||
struct IntraOpThreadPool;
|
struct IntraOpThreadPool;
|
||||||
|
@ -75,8 +75,10 @@ class AotCompilationOptions {
|
|||||||
|
|
||||||
// Optional allocator that may be used for allocating temp space on the device
|
// Optional allocator that may be used for allocating temp space on the device
|
||||||
// during compilation.
|
// during compilation.
|
||||||
DeviceMemoryAllocator* device_allocator() const { return device_allocator_; }
|
se::DeviceMemoryAllocator* device_allocator() const {
|
||||||
void set_device_allocator(DeviceMemoryAllocator* device_allocator) {
|
return device_allocator_;
|
||||||
|
}
|
||||||
|
void set_device_allocator(se::DeviceMemoryAllocator* device_allocator) {
|
||||||
device_allocator_ = device_allocator;
|
device_allocator_ = device_allocator;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -98,7 +100,7 @@ class AotCompilationOptions {
|
|||||||
AotCompilationOptions();
|
AotCompilationOptions();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DeviceMemoryAllocator* device_allocator_ = nullptr;
|
se::DeviceMemoryAllocator* device_allocator_ = nullptr;
|
||||||
DebugOptions debug_options_;
|
DebugOptions debug_options_;
|
||||||
absl::optional<DeviceAssignment> static_device_assignment_;
|
absl::optional<DeviceAssignment> static_device_assignment_;
|
||||||
};
|
};
|
||||||
@ -147,14 +149,14 @@ class Compiler {
|
|||||||
// allocated should be deallocated before this function returns.
|
// allocated should be deallocated before this function returns.
|
||||||
virtual StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
|
virtual StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
|
||||||
std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
|
std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
|
||||||
DeviceMemoryAllocator* device_allocator) = 0;
|
se::DeviceMemoryAllocator* device_allocator) = 0;
|
||||||
|
|
||||||
// Optimizes a HLO module group, a set of module which runs concurrently on
|
// Optimizes a HLO module group, a set of module which runs concurrently on
|
||||||
// multiple devices potentially communicating data between the modules.
|
// multiple devices potentially communicating data between the modules.
|
||||||
virtual Status RunHloPassesOnModuleGroup(
|
virtual Status RunHloPassesOnModuleGroup(
|
||||||
HloModuleGroup* module_group,
|
HloModuleGroup* module_group,
|
||||||
absl::Span<se::StreamExecutor* const> executors,
|
absl::Span<se::StreamExecutor* const> executors,
|
||||||
DeviceMemoryAllocator* device_allocator) = 0;
|
se::DeviceMemoryAllocator* device_allocator) = 0;
|
||||||
|
|
||||||
// Compiles the HLO module for execution on a device given by the executor,
|
// Compiles the HLO module for execution on a device given by the executor,
|
||||||
// and returns an executable object or an error status. No HLO passes are
|
// and returns an executable object or an error status. No HLO passes are
|
||||||
@ -168,7 +170,7 @@ class Compiler {
|
|||||||
// device_allocator is optional; see RunHloPasses.
|
// device_allocator is optional; see RunHloPasses.
|
||||||
virtual StatusOr<std::unique_ptr<Executable>> RunBackend(
|
virtual StatusOr<std::unique_ptr<Executable>> RunBackend(
|
||||||
std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
|
std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
|
||||||
DeviceMemoryAllocator* device_allocator) = 0;
|
se::DeviceMemoryAllocator* device_allocator) = 0;
|
||||||
|
|
||||||
// Compiles a set of HLO modules that can run in parallel, potentially
|
// Compiles a set of HLO modules that can run in parallel, potentially
|
||||||
// communicating data between the modules.
|
// communicating data between the modules.
|
||||||
@ -176,7 +178,7 @@ class Compiler {
|
|||||||
RunBackendOnModuleGroup(
|
RunBackendOnModuleGroup(
|
||||||
std::unique_ptr<HloModuleGroup> module_group,
|
std::unique_ptr<HloModuleGroup> module_group,
|
||||||
std::vector<std::vector<se::StreamExecutor*>> stream_exec,
|
std::vector<std::vector<se::StreamExecutor*>> stream_exec,
|
||||||
DeviceMemoryAllocator* device_allocator) = 0;
|
se::DeviceMemoryAllocator* device_allocator) = 0;
|
||||||
|
|
||||||
// Compiles a set of HLO modules that can run in parallel, potentially
|
// Compiles a set of HLO modules that can run in parallel, potentially
|
||||||
// communicating data between the modules, and returns a corresponding
|
// communicating data between the modules, and returns a corresponding
|
||||||
@ -189,7 +191,7 @@ class Compiler {
|
|||||||
virtual StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
|
virtual StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
|
||||||
std::unique_ptr<HloModuleGroup> module_group,
|
std::unique_ptr<HloModuleGroup> module_group,
|
||||||
std::vector<std::vector<se::StreamExecutor*>> stream_exec,
|
std::vector<std::vector<se::StreamExecutor*>> stream_exec,
|
||||||
DeviceMemoryAllocator* device_allocator) = 0;
|
se::DeviceMemoryAllocator* device_allocator) = 0;
|
||||||
|
|
||||||
// Returns the backend configurations that the backend will consider for the
|
// Returns the backend configurations that the backend will consider for the
|
||||||
// given HLO. Returns no configurations if the backend does not support
|
// given HLO. Returns no configurations if the backend does not support
|
||||||
|
@ -245,7 +245,6 @@ cc_library(
|
|||||||
"//tensorflow/compiler/xla:xla_data_proto",
|
"//tensorflow/compiler/xla:xla_data_proto",
|
||||||
"//tensorflow/compiler/xla/service:buffer_assignment",
|
"//tensorflow/compiler/xla/service:buffer_assignment",
|
||||||
"//tensorflow/compiler/xla/service:computation_layout",
|
"//tensorflow/compiler/xla/service:computation_layout",
|
||||||
"//tensorflow/compiler/xla/service:device_memory_allocator",
|
|
||||||
"//tensorflow/compiler/xla/service:executable",
|
"//tensorflow/compiler/xla/service:executable",
|
||||||
"//tensorflow/compiler/xla/service:hlo",
|
"//tensorflow/compiler/xla/service:hlo",
|
||||||
"//tensorflow/compiler/xla/service:hlo_execution_profile",
|
"//tensorflow/compiler/xla/service:hlo_execution_profile",
|
||||||
@ -255,6 +254,7 @@ cc_library(
|
|||||||
"//tensorflow/core:lib",
|
"//tensorflow/core:lib",
|
||||||
"//tensorflow/core:stream_executor_no_cuda",
|
"//tensorflow/core:stream_executor_no_cuda",
|
||||||
"//tensorflow/core/profiler/lib:traceme",
|
"//tensorflow/core/profiler/lib:traceme",
|
||||||
|
"//tensorflow/stream_executor:device_memory_allocator",
|
||||||
"//tensorflow/stream_executor/host:host_stream",
|
"//tensorflow/stream_executor/host:host_stream",
|
||||||
"@com_google_absl//absl/strings",
|
"@com_google_absl//absl/strings",
|
||||||
"@com_google_absl//absl/strings:str_format",
|
"@com_google_absl//absl/strings:str_format",
|
||||||
|
@ -537,7 +537,7 @@ Status CreateHloProfilingArtifacts(
|
|||||||
|
|
||||||
StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
|
StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
|
||||||
std::unique_ptr<HloModule> module, se::StreamExecutor* /*stream_exec*/,
|
std::unique_ptr<HloModule> module, se::StreamExecutor* /*stream_exec*/,
|
||||||
DeviceMemoryAllocator* /*device_allocator*/) {
|
se::DeviceMemoryAllocator* /*device_allocator*/) {
|
||||||
std::unique_ptr<llvm::TargetMachine> jit_target_machine =
|
std::unique_ptr<llvm::TargetMachine> jit_target_machine =
|
||||||
SimpleOrcJIT::InferTargetMachineForJIT(
|
SimpleOrcJIT::InferTargetMachineForJIT(
|
||||||
CompilerTargetOptions(module->config()),
|
CompilerTargetOptions(module->config()),
|
||||||
@ -597,7 +597,7 @@ struct OrcJITPostCompilationHook {
|
|||||||
|
|
||||||
StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
|
StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
|
||||||
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
|
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
|
||||||
DeviceMemoryAllocator* /*device_allocator*/) {
|
se::DeviceMemoryAllocator* /*device_allocator*/) {
|
||||||
VLOG(1) << "Compiling: " << module->name();
|
VLOG(1) << "Compiling: " << module->name();
|
||||||
XLA_SCOPED_LOGGING_TIMER(
|
XLA_SCOPED_LOGGING_TIMER(
|
||||||
absl::StrFormat("Compiling [%s] for CPU using JIT", module->name()));
|
absl::StrFormat("Compiling [%s] for CPU using JIT", module->name()));
|
||||||
|
@ -133,11 +133,11 @@ class CpuCompiler : public LLVMCompiler {
|
|||||||
|
|
||||||
StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
|
StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
|
||||||
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
|
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
|
||||||
DeviceMemoryAllocator* device_allocator) override;
|
se::DeviceMemoryAllocator* device_allocator) override;
|
||||||
|
|
||||||
StatusOr<std::unique_ptr<Executable>> RunBackend(
|
StatusOr<std::unique_ptr<Executable>> RunBackend(
|
||||||
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
|
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
|
||||||
DeviceMemoryAllocator* device_allocator) override;
|
se::DeviceMemoryAllocator* device_allocator) override;
|
||||||
|
|
||||||
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
|
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
|
||||||
CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
|
CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
|
||||||
|
@ -73,13 +73,13 @@ CpuExecutable::CpuExecutable(
|
|||||||
}
|
}
|
||||||
|
|
||||||
StatusOr<std::pair<std::vector<se::DeviceMemoryBase>,
|
StatusOr<std::pair<std::vector<se::DeviceMemoryBase>,
|
||||||
std::vector<OwningDeviceMemory>>>
|
std::vector<se::OwningDeviceMemory>>>
|
||||||
CpuExecutable::CreateBufferTable(
|
CpuExecutable::CreateBufferTable(
|
||||||
DeviceMemoryAllocator* memory_allocator, int device_ordinal,
|
se::DeviceMemoryAllocator* memory_allocator, int device_ordinal,
|
||||||
absl::Span<const ShapedBuffer* const> arguments) {
|
absl::Span<const ShapedBuffer* const> arguments) {
|
||||||
std::vector<se::DeviceMemoryBase> unowning_buffers(
|
std::vector<se::DeviceMemoryBase> unowning_buffers(
|
||||||
assignment_->Allocations().size());
|
assignment_->Allocations().size());
|
||||||
std::vector<OwningDeviceMemory> owning_buffers(
|
std::vector<se::OwningDeviceMemory> owning_buffers(
|
||||||
assignment_->Allocations().size());
|
assignment_->Allocations().size());
|
||||||
VLOG(3) << "Allocating " << assignment_->Allocations().size()
|
VLOG(3) << "Allocating " << assignment_->Allocations().size()
|
||||||
<< " allocations for module " << module().name();
|
<< " allocations for module " << module().name();
|
||||||
@ -207,7 +207,7 @@ Status CpuExecutable::ExecuteComputeFunction(
|
|||||||
|
|
||||||
StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
|
StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
|
||||||
const ServiceExecutableRunOptions* run_options,
|
const ServiceExecutableRunOptions* run_options,
|
||||||
absl::Span<OwningDeviceMemory> buffers) {
|
absl::Span<se::OwningDeviceMemory> buffers) {
|
||||||
se::Stream* stream = run_options->stream();
|
se::Stream* stream = run_options->stream();
|
||||||
ScopedShapedBuffer result_buffer(
|
ScopedShapedBuffer result_buffer(
|
||||||
/*on_host_shape=*/result_shape(),
|
/*on_host_shape=*/result_shape(),
|
||||||
@ -216,7 +216,7 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
|
|||||||
const HloInputOutputAliasConfig& input_output_alias =
|
const HloInputOutputAliasConfig& input_output_alias =
|
||||||
module().input_output_alias_config();
|
module().input_output_alias_config();
|
||||||
|
|
||||||
// Move OwningDeviceMemory values which contain the array(s) of the result
|
// Move se::OwningDeviceMemory values which contain the array(s) of the result
|
||||||
// into the respective location in ScopedShapedBuffer which is returned to the
|
// into the respective location in ScopedShapedBuffer which is returned to the
|
||||||
// caller.
|
// caller.
|
||||||
TF_RETURN_IF_ERROR(result_buffer.buffers().ForEachMutableElementWithStatus(
|
TF_RETURN_IF_ERROR(result_buffer.buffers().ForEachMutableElementWithStatus(
|
||||||
@ -235,7 +235,7 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
|
|||||||
const BufferAllocation::Slice slice,
|
const BufferAllocation::Slice slice,
|
||||||
this->assignment_->GetUniqueSlice(src, buffer_source->index()));
|
this->assignment_->GetUniqueSlice(src, buffer_source->index()));
|
||||||
const BufferAllocation::Index buffer_index = slice.index();
|
const BufferAllocation::Index buffer_index = slice.index();
|
||||||
OwningDeviceMemory& buffer = buffers[buffer_index];
|
se::OwningDeviceMemory& buffer = buffers[buffer_index];
|
||||||
if (!slice.allocation()->is_entry_computation_parameter()) {
|
if (!slice.allocation()->is_entry_computation_parameter()) {
|
||||||
// If the buffer coming out of the result is from a parameter, the
|
// If the buffer coming out of the result is from a parameter, the
|
||||||
// owning buffer will be null, and that means the caller aliased some
|
// owning buffer will be null, and that means the caller aliased some
|
||||||
@ -297,8 +297,8 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStreamImpl(
|
|||||||
auto* host_stream = dynamic_cast<se::host::HostStream*>(
|
auto* host_stream = dynamic_cast<se::host::HostStream*>(
|
||||||
run_options->stream()->implementation());
|
run_options->stream()->implementation());
|
||||||
se::Stream* stream = run_options->stream();
|
se::Stream* stream = run_options->stream();
|
||||||
DeviceMemoryAllocator* memory_allocator = run_options->allocator();
|
se::DeviceMemoryAllocator* memory_allocator = run_options->allocator();
|
||||||
std::vector<OwningDeviceMemory> owning_buffers;
|
std::vector<se::OwningDeviceMemory> owning_buffers;
|
||||||
std::vector<se::DeviceMemoryBase> unowning_buffers;
|
std::vector<se::DeviceMemoryBase> unowning_buffers;
|
||||||
TF_ASSIGN_OR_RETURN(
|
TF_ASSIGN_OR_RETURN(
|
||||||
std::tie(unowning_buffers, owning_buffers),
|
std::tie(unowning_buffers, owning_buffers),
|
||||||
@ -326,7 +326,7 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStreamImpl(
|
|||||||
CpuExecutable* executable;
|
CpuExecutable* executable;
|
||||||
ServiceExecutableRunOptions run_options;
|
ServiceExecutableRunOptions run_options;
|
||||||
std::vector<se::DeviceMemoryBase> unowning_buffers;
|
std::vector<se::DeviceMemoryBase> unowning_buffers;
|
||||||
std::shared_ptr<std::vector<OwningDeviceMemory>> buffers;
|
std::shared_ptr<std::vector<se::OwningDeviceMemory>> buffers;
|
||||||
HloExecutionProfile* hlo_execution_profile;
|
HloExecutionProfile* hlo_execution_profile;
|
||||||
|
|
||||||
void operator()() {
|
void operator()() {
|
||||||
@ -338,7 +338,7 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStreamImpl(
|
|||||||
};
|
};
|
||||||
host_stream->EnqueueTask(
|
host_stream->EnqueueTask(
|
||||||
AsyncRunTask{this, *run_options, std::move(unowning_buffers),
|
AsyncRunTask{this, *run_options, std::move(unowning_buffers),
|
||||||
std::make_shared<std::vector<OwningDeviceMemory>>(
|
std::make_shared<std::vector<se::OwningDeviceMemory>>(
|
||||||
std::move(owning_buffers)),
|
std::move(owning_buffers)),
|
||||||
hlo_execution_profile});
|
hlo_execution_profile});
|
||||||
|
|
||||||
|
@ -25,7 +25,6 @@ limitations under the License.
|
|||||||
#include "absl/types/span.h"
|
#include "absl/types/span.h"
|
||||||
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
|
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
|
||||||
#include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
|
#include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
|
||||||
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
|
|
||||||
#include "tensorflow/compiler/xla/service/executable.h"
|
#include "tensorflow/compiler/xla/service/executable.h"
|
||||||
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
|
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
|
||||||
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
|
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
|
||||||
@ -37,6 +36,7 @@ limitations under the License.
|
|||||||
#include "tensorflow/core/platform/macros.h"
|
#include "tensorflow/core/platform/macros.h"
|
||||||
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
|
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
|
||||||
#include "tensorflow/core/platform/types.h"
|
#include "tensorflow/core/platform/types.h"
|
||||||
|
#include "tensorflow/stream_executor/device_memory_allocator.h"
|
||||||
|
|
||||||
namespace xla {
|
namespace xla {
|
||||||
namespace cpu {
|
namespace cpu {
|
||||||
@ -111,8 +111,9 @@ class CpuExecutable : public Executable {
|
|||||||
// storage and the live-out buffer into which the computation writes it
|
// storage and the live-out buffer into which the computation writes it
|
||||||
// result.
|
// result.
|
||||||
StatusOr<std::pair<std::vector<se::DeviceMemoryBase>,
|
StatusOr<std::pair<std::vector<se::DeviceMemoryBase>,
|
||||||
std::vector<OwningDeviceMemory>>>
|
std::vector<se::OwningDeviceMemory>>>
|
||||||
CreateBufferTable(DeviceMemoryAllocator* memory_allocator, int device_ordinal,
|
CreateBufferTable(se::DeviceMemoryAllocator* memory_allocator,
|
||||||
|
int device_ordinal,
|
||||||
absl::Span<const ShapedBuffer* const> arguments);
|
absl::Span<const ShapedBuffer* const> arguments);
|
||||||
|
|
||||||
// Calls the generated function performing the computation with the given
|
// Calls the generated function performing the computation with the given
|
||||||
@ -126,7 +127,7 @@ class CpuExecutable : public Executable {
|
|||||||
// The addresses are set according to buffer assignment.
|
// The addresses are set according to buffer assignment.
|
||||||
StatusOr<ScopedShapedBuffer> CreateResultShapedBuffer(
|
StatusOr<ScopedShapedBuffer> CreateResultShapedBuffer(
|
||||||
const ServiceExecutableRunOptions* run_options,
|
const ServiceExecutableRunOptions* run_options,
|
||||||
absl::Span<OwningDeviceMemory> buffers);
|
absl::Span<se::OwningDeviceMemory> buffers);
|
||||||
|
|
||||||
// Returns the points-to set of the root instruction of the entry
|
// Returns the points-to set of the root instruction of the entry
|
||||||
// computation. Uses points-to analysis from buffer assignment.
|
// computation. Uses points-to analysis from buffer assignment.
|
||||||
|
@ -24,13 +24,11 @@ limitations under the License.
|
|||||||
#include "absl/types/variant.h"
|
#include "absl/types/variant.h"
|
||||||
#include "tensorflow/compiler/xla/debug_options_flags.h"
|
#include "tensorflow/compiler/xla/debug_options_flags.h"
|
||||||
#include "tensorflow/compiler/xla/service/computation_layout.h"
|
#include "tensorflow/compiler/xla/service/computation_layout.h"
|
||||||
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
|
|
||||||
#include "tensorflow/compiler/xla/service/hlo.pb.h"
|
#include "tensorflow/compiler/xla/service/hlo.pb.h"
|
||||||
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
|
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
|
||||||
#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
|
#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
|
||||||
#include "tensorflow/compiler/xla/service/hlo_module.h"
|
#include "tensorflow/compiler/xla/service/hlo_module.h"
|
||||||
#include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h"
|
#include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h"
|
||||||
#include "tensorflow/compiler/xla/service/owning_device_memory.h"
|
|
||||||
#include "tensorflow/compiler/xla/service/service_executable_run_options.h"
|
#include "tensorflow/compiler/xla/service/service_executable_run_options.h"
|
||||||
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
|
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
|
||||||
#include "tensorflow/compiler/xla/shape_tree.h"
|
#include "tensorflow/compiler/xla/shape_tree.h"
|
||||||
@ -40,6 +38,8 @@ limitations under the License.
|
|||||||
#include "tensorflow/core/platform/mutex.h"
|
#include "tensorflow/core/platform/mutex.h"
|
||||||
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
|
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
|
||||||
#include "tensorflow/core/platform/thread_annotations.h"
|
#include "tensorflow/core/platform/thread_annotations.h"
|
||||||
|
#include "tensorflow/stream_executor/device_memory_allocator.h"
|
||||||
|
#include "tensorflow/stream_executor/owning_device_memory.h"
|
||||||
|
|
||||||
namespace xla {
|
namespace xla {
|
||||||
|
|
||||||
@ -47,13 +47,13 @@ namespace xla {
|
|||||||
// leftover buffers to be released by the caller.
|
// leftover buffers to be released by the caller.
|
||||||
struct ExecutionOutput {
|
struct ExecutionOutput {
|
||||||
ExecutionOutput(ScopedShapedBuffer result,
|
ExecutionOutput(ScopedShapedBuffer result,
|
||||||
std::vector<OwningDeviceMemory> to_be_released)
|
std::vector<se::OwningDeviceMemory> to_be_released)
|
||||||
: result(std::move(result)), to_be_released(std::move(to_be_released)) {}
|
: result(std::move(result)), to_be_released(std::move(to_be_released)) {}
|
||||||
ScopedShapedBuffer result;
|
ScopedShapedBuffer result;
|
||||||
|
|
||||||
// Leftover buffers for the caller to release. Elements in this list are
|
// Leftover buffers for the caller to release. Elements in this list are
|
||||||
// donated input memory buffers that are not reused by XLA as outputs.
|
// donated input memory buffers that are not reused by XLA as outputs.
|
||||||
std::vector<OwningDeviceMemory> to_be_released;
|
std::vector<se::OwningDeviceMemory> to_be_released;
|
||||||
};
|
};
|
||||||
|
|
||||||
// A given platform's compiler will produce an Executable -- this is a uniform
|
// A given platform's compiler will produce an Executable -- this is a uniform
|
||||||
|
@ -282,10 +282,10 @@ cc_library(
|
|||||||
"//tensorflow/compiler/xla:types",
|
"//tensorflow/compiler/xla:types",
|
||||||
"//tensorflow/compiler/xla:util",
|
"//tensorflow/compiler/xla:util",
|
||||||
"//tensorflow/compiler/xla/service:buffer_assignment",
|
"//tensorflow/compiler/xla/service:buffer_assignment",
|
||||||
"//tensorflow/compiler/xla/service:device_memory_allocator",
|
|
||||||
"//tensorflow/core:lib",
|
"//tensorflow/core:lib",
|
||||||
"//tensorflow/core:lib_internal",
|
"//tensorflow/core:lib_internal",
|
||||||
"//tensorflow/core:stream_executor_no_cuda",
|
"//tensorflow/core:stream_executor_no_cuda",
|
||||||
|
"//tensorflow/stream_executor:device_memory_allocator",
|
||||||
"@com_google_absl//absl/container:flat_hash_map",
|
"@com_google_absl//absl/container:flat_hash_map",
|
||||||
"@com_google_absl//absl/memory",
|
"@com_google_absl//absl/memory",
|
||||||
"@com_google_absl//absl/types:span",
|
"@com_google_absl//absl/types:span",
|
||||||
@ -408,7 +408,6 @@ cc_library(
|
|||||||
"//tensorflow/compiler/xla:util",
|
"//tensorflow/compiler/xla:util",
|
||||||
"//tensorflow/compiler/xla:xla_data_proto",
|
"//tensorflow/compiler/xla:xla_data_proto",
|
||||||
"//tensorflow/compiler/xla/service:buffer_assignment",
|
"//tensorflow/compiler/xla/service:buffer_assignment",
|
||||||
"//tensorflow/compiler/xla/service:device_memory_allocator",
|
|
||||||
"//tensorflow/compiler/xla/service:executable",
|
"//tensorflow/compiler/xla/service:executable",
|
||||||
"//tensorflow/compiler/xla/service:hlo",
|
"//tensorflow/compiler/xla/service:hlo",
|
||||||
"//tensorflow/compiler/xla/service:hlo_execution_profile",
|
"//tensorflow/compiler/xla/service:hlo_execution_profile",
|
||||||
@ -428,6 +427,7 @@ cc_library(
|
|||||||
"//tensorflow/stream_executor",
|
"//tensorflow/stream_executor",
|
||||||
"//tensorflow/stream_executor:blas",
|
"//tensorflow/stream_executor:blas",
|
||||||
"//tensorflow/stream_executor:device_memory",
|
"//tensorflow/stream_executor:device_memory",
|
||||||
|
"//tensorflow/stream_executor:device_memory_allocator",
|
||||||
"//tensorflow/stream_executor:kernel",
|
"//tensorflow/stream_executor:kernel",
|
||||||
"@com_google_absl//absl/algorithm:container",
|
"@com_google_absl//absl/algorithm:container",
|
||||||
"@com_google_absl//absl/base:core_headers",
|
"@com_google_absl//absl/base:core_headers",
|
||||||
@ -476,7 +476,6 @@ cc_library(
|
|||||||
"//tensorflow/compiler/xla:status_macros",
|
"//tensorflow/compiler/xla:status_macros",
|
||||||
"//tensorflow/compiler/xla:util",
|
"//tensorflow/compiler/xla:util",
|
||||||
"//tensorflow/compiler/xla/service:compiler",
|
"//tensorflow/compiler/xla/service:compiler",
|
||||||
"//tensorflow/compiler/xla/service:device_memory_allocator",
|
|
||||||
"//tensorflow/compiler/xla/service:hlo",
|
"//tensorflow/compiler/xla/service:hlo",
|
||||||
"//tensorflow/compiler/xla/service:hlo_casting_utils",
|
"//tensorflow/compiler/xla/service:hlo_casting_utils",
|
||||||
"//tensorflow/compiler/xla/service:hlo_pass",
|
"//tensorflow/compiler/xla/service:hlo_pass",
|
||||||
@ -485,6 +484,7 @@ cc_library(
|
|||||||
"//tensorflow/core:logger",
|
"//tensorflow/core:logger",
|
||||||
"//tensorflow/core:stream_executor_no_cuda",
|
"//tensorflow/core:stream_executor_no_cuda",
|
||||||
"//tensorflow/core/util/proto:proto_utils",
|
"//tensorflow/core/util/proto:proto_utils",
|
||||||
|
"//tensorflow/stream_executor:device_memory_allocator",
|
||||||
"@com_google_absl//absl/algorithm:container",
|
"@com_google_absl//absl/algorithm:container",
|
||||||
"@com_google_absl//absl/strings",
|
"@com_google_absl//absl/strings",
|
||||||
"@com_google_absl//absl/strings:str_format",
|
"@com_google_absl//absl/strings:str_format",
|
||||||
@ -500,8 +500,8 @@ cc_library(
|
|||||||
deps = [
|
deps = [
|
||||||
"//tensorflow/compiler/xla:status_macros",
|
"//tensorflow/compiler/xla:status_macros",
|
||||||
"//tensorflow/compiler/xla:util",
|
"//tensorflow/compiler/xla:util",
|
||||||
"//tensorflow/compiler/xla/service:device_memory_allocator",
|
|
||||||
"//tensorflow/core:stream_executor_no_cuda",
|
"//tensorflow/core:stream_executor_no_cuda",
|
||||||
|
"//tensorflow/stream_executor:device_memory_allocator",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -517,12 +517,12 @@ cc_library(
|
|||||||
"//tensorflow/compiler/xla:shape_util",
|
"//tensorflow/compiler/xla:shape_util",
|
||||||
"//tensorflow/compiler/xla:status_macros",
|
"//tensorflow/compiler/xla:status_macros",
|
||||||
"//tensorflow/compiler/xla:util",
|
"//tensorflow/compiler/xla:util",
|
||||||
"//tensorflow/compiler/xla/service:device_memory_allocator",
|
|
||||||
"//tensorflow/compiler/xla/service:hlo_module_config",
|
"//tensorflow/compiler/xla/service:hlo_module_config",
|
||||||
"//tensorflow/compiler/xla/service:shaped_buffer",
|
"//tensorflow/compiler/xla/service:shaped_buffer",
|
||||||
"//tensorflow/core:lib",
|
"//tensorflow/core:lib",
|
||||||
"//tensorflow/core:stream_executor_no_cuda",
|
"//tensorflow/core:stream_executor_no_cuda",
|
||||||
"//tensorflow/stream_executor:device_memory",
|
"//tensorflow/stream_executor:device_memory",
|
||||||
|
"//tensorflow/stream_executor:device_memory_allocator",
|
||||||
"//tensorflow/stream_executor:stream_executor_headers",
|
"//tensorflow/stream_executor:stream_executor_headers",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@ -536,12 +536,12 @@ tf_cc_test(
|
|||||||
"//tensorflow/compiler/xla:status_macros",
|
"//tensorflow/compiler/xla:status_macros",
|
||||||
"//tensorflow/compiler/xla:test",
|
"//tensorflow/compiler/xla:test",
|
||||||
"//tensorflow/compiler/xla:util",
|
"//tensorflow/compiler/xla:util",
|
||||||
"//tensorflow/compiler/xla/service:device_memory_allocator",
|
|
||||||
"//tensorflow/compiler/xla/service:hlo_module_config",
|
"//tensorflow/compiler/xla/service:hlo_module_config",
|
||||||
"//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep
|
"//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep
|
||||||
"//tensorflow/core:stream_executor_no_cuda",
|
"//tensorflow/core:stream_executor_no_cuda",
|
||||||
"//tensorflow/core:test",
|
"//tensorflow/core:test",
|
||||||
"//tensorflow/core/platform/default/build_config:stream_executor_cuda",
|
"//tensorflow/core/platform/default/build_config:stream_executor_cuda",
|
||||||
|
"//tensorflow/stream_executor:device_memory_allocator",
|
||||||
"//tensorflow/stream_executor:event",
|
"//tensorflow/stream_executor:event",
|
||||||
"//tensorflow/stream_executor:kernel",
|
"//tensorflow/stream_executor:kernel",
|
||||||
"//tensorflow/stream_executor/cuda:cuda_activation",
|
"//tensorflow/stream_executor/cuda:cuda_activation",
|
||||||
@ -634,12 +634,12 @@ cc_library(
|
|||||||
"//tensorflow/compiler/xla:literal",
|
"//tensorflow/compiler/xla:literal",
|
||||||
"//tensorflow/compiler/xla:util",
|
"//tensorflow/compiler/xla:util",
|
||||||
"//tensorflow/compiler/xla:xla_data_proto",
|
"//tensorflow/compiler/xla:xla_data_proto",
|
||||||
"//tensorflow/compiler/xla/service:device_memory_allocator",
|
|
||||||
"//tensorflow/compiler/xla/service:hlo",
|
"//tensorflow/compiler/xla/service:hlo",
|
||||||
"//tensorflow/compiler/xla/service:hlo_pass",
|
"//tensorflow/compiler/xla/service:hlo_pass",
|
||||||
"//tensorflow/core:lib",
|
"//tensorflow/core:lib",
|
||||||
"//tensorflow/core:stream_executor_no_cuda",
|
"//tensorflow/core:stream_executor_no_cuda",
|
||||||
"//tensorflow/stream_executor:blas",
|
"//tensorflow/stream_executor:blas",
|
||||||
|
"//tensorflow/stream_executor:device_memory_allocator",
|
||||||
"@com_google_absl//absl/types:optional",
|
"@com_google_absl//absl/types:optional",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@ -1164,7 +1164,6 @@ cc_library(
|
|||||||
"//tensorflow/compiler/xla:shape_util",
|
"//tensorflow/compiler/xla:shape_util",
|
||||||
"//tensorflow/compiler/xla:status_macros",
|
"//tensorflow/compiler/xla:status_macros",
|
||||||
"//tensorflow/compiler/xla:util",
|
"//tensorflow/compiler/xla:util",
|
||||||
"//tensorflow/compiler/xla/service:device_memory_allocator",
|
|
||||||
"//tensorflow/compiler/xla/service:hlo_module_config",
|
"//tensorflow/compiler/xla/service:hlo_module_config",
|
||||||
"//tensorflow/core:stream_executor_no_cuda",
|
"//tensorflow/core:stream_executor_no_cuda",
|
||||||
"//tensorflow/stream_executor:stream_executor_headers",
|
"//tensorflow/stream_executor:stream_executor_headers",
|
||||||
|
@ -39,7 +39,7 @@ void BufferAllocations::Builder::RegisterBuffer(BufferAllocation::Index index,
|
|||||||
|
|
||||||
StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
|
StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
|
||||||
const BufferAssignment* buffer_assignment, int device_ordinal,
|
const BufferAssignment* buffer_assignment, int device_ordinal,
|
||||||
DeviceMemoryAllocator* memory_allocator) {
|
se::DeviceMemoryAllocator* memory_allocator) {
|
||||||
const int64 num_buffers = buffer_assignment->Allocations().size();
|
const int64 num_buffers = buffer_assignment->Allocations().size();
|
||||||
auto buffer_allocations = absl::WrapUnique(new BufferAllocations(
|
auto buffer_allocations = absl::WrapUnique(new BufferAllocations(
|
||||||
num_buffers, device_ordinal, memory_allocator, buffer_assignment));
|
num_buffers, device_ordinal, memory_allocator, buffer_assignment));
|
||||||
@ -77,7 +77,7 @@ StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
|
|||||||
const int64 buffer_size = allocation.size();
|
const int64 buffer_size = allocation.size();
|
||||||
se::DeviceMemoryBase buffer_address;
|
se::DeviceMemoryBase buffer_address;
|
||||||
if (buffer_size > 0) {
|
if (buffer_size > 0) {
|
||||||
OwningDeviceMemory buffer;
|
se::OwningDeviceMemory buffer;
|
||||||
TF_ASSIGN_OR_RETURN(
|
TF_ASSIGN_OR_RETURN(
|
||||||
buffer, memory_allocator->Allocate(device_ordinal, buffer_size));
|
buffer, memory_allocator->Allocate(device_ordinal, buffer_size));
|
||||||
if (reinterpret_cast<uintptr_t>(buffer.opaque()) % expected_alignment !=
|
if (reinterpret_cast<uintptr_t>(buffer.opaque()) % expected_alignment !=
|
||||||
|
@ -23,9 +23,9 @@ limitations under the License.
|
|||||||
#include "absl/container/flat_hash_map.h"
|
#include "absl/container/flat_hash_map.h"
|
||||||
#include "absl/types/span.h"
|
#include "absl/types/span.h"
|
||||||
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
|
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
|
||||||
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
|
|
||||||
#include "tensorflow/compiler/xla/statusor.h"
|
#include "tensorflow/compiler/xla/statusor.h"
|
||||||
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
|
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
|
||||||
|
#include "tensorflow/stream_executor/device_memory_allocator.h"
|
||||||
|
|
||||||
namespace xla {
|
namespace xla {
|
||||||
namespace gpu {
|
namespace gpu {
|
||||||
@ -50,7 +50,7 @@ class BufferAllocations {
|
|||||||
// memory on.
|
// memory on.
|
||||||
StatusOr<std::unique_ptr<BufferAllocations>> Build(
|
StatusOr<std::unique_ptr<BufferAllocations>> Build(
|
||||||
const BufferAssignment* buffer_assignment, int device_ordinal,
|
const BufferAssignment* buffer_assignment, int device_ordinal,
|
||||||
DeviceMemoryAllocator* memory_allocator);
|
se::DeviceMemoryAllocator* memory_allocator);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
absl::flat_hash_map<BufferAllocation::Index, se::DeviceMemoryBase>
|
absl::flat_hash_map<BufferAllocation::Index, se::DeviceMemoryBase>
|
||||||
@ -62,7 +62,9 @@ class BufferAllocations {
|
|||||||
BufferAllocations(const BufferAllocations&) = delete;
|
BufferAllocations(const BufferAllocations&) = delete;
|
||||||
BufferAllocations& operator=(const BufferAllocations&) = delete;
|
BufferAllocations& operator=(const BufferAllocations&) = delete;
|
||||||
|
|
||||||
DeviceMemoryAllocator* memory_allocator() const { return memory_allocator_; }
|
se::DeviceMemoryAllocator* memory_allocator() const {
|
||||||
|
return memory_allocator_;
|
||||||
|
}
|
||||||
int device_ordinal() const { return device_ordinal_; }
|
int device_ordinal() const { return device_ordinal_; }
|
||||||
|
|
||||||
// Returns the device address of buffer `buffer_index`. `buffer_index` must be
|
// Returns the device address of buffer `buffer_index`. `buffer_index` must be
|
||||||
@ -84,7 +86,7 @@ class BufferAllocations {
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
BufferAllocations(BufferAllocation::Index buffer_count, int device_ordinal,
|
BufferAllocations(BufferAllocation::Index buffer_count, int device_ordinal,
|
||||||
DeviceMemoryAllocator* memory_allocator,
|
se::DeviceMemoryAllocator* memory_allocator,
|
||||||
const BufferAssignment* buffer_assignment)
|
const BufferAssignment* buffer_assignment)
|
||||||
: buffers_(buffer_count),
|
: buffers_(buffer_count),
|
||||||
device_ordinal_(device_ordinal),
|
device_ordinal_(device_ordinal),
|
||||||
@ -104,7 +106,7 @@ class BufferAllocations {
|
|||||||
se::DeviceMemoryBase temp_buffer_base_;
|
se::DeviceMemoryBase temp_buffer_base_;
|
||||||
|
|
||||||
int device_ordinal_;
|
int device_ordinal_;
|
||||||
DeviceMemoryAllocator* memory_allocator_;
|
se::DeviceMemoryAllocator* memory_allocator_;
|
||||||
const BufferAssignment* buffer_assignment_;
|
const BufferAssignment* buffer_assignment_;
|
||||||
bool torn_down_ = false;
|
bool torn_down_ = false;
|
||||||
};
|
};
|
||||||
|
@ -256,9 +256,9 @@ StatusOr<AutotuneResult> CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache(
|
|||||||
const auto device_ordinal = stream_exec_->device_ordinal();
|
const auto device_ordinal = stream_exec_->device_ordinal();
|
||||||
|
|
||||||
// allocator either points to this->allocator_ or, if that's null, to a
|
// allocator either points to this->allocator_ or, if that's null, to a
|
||||||
// StreamExecutorMemoryAllocator for stream_exec_.
|
// se::StreamExecutorMemoryAllocator for stream_exec_.
|
||||||
DeviceMemoryAllocator* allocator;
|
se::DeviceMemoryAllocator* allocator;
|
||||||
optional<StreamExecutorMemoryAllocator> se_allocator;
|
optional<se::StreamExecutorMemoryAllocator> se_allocator;
|
||||||
if (allocator_ != nullptr) {
|
if (allocator_ != nullptr) {
|
||||||
allocator = allocator_;
|
allocator = allocator_;
|
||||||
} else {
|
} else {
|
||||||
|
@@ -19,13 +19,13 @@ limitations under the License.
 #include "absl/time/time.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/protobuf/autotuning.pb.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 namespace gpu {
@@ -38,7 +38,8 @@ class CudnnConvAlgorithmPicker : public HloModulePass {
   // memory while timing the various convolution algorithms. If it's null,
   // we'll use the default allocator on the StreamExecutor.
   CudnnConvAlgorithmPicker(se::StreamExecutor* stream_exec,
-                           DeviceMemoryAllocator* allocator, Compiler* compiler)
+                           se::DeviceMemoryAllocator* allocator,
+                           Compiler* compiler)
       : stream_exec_(stream_exec), allocator_(allocator), compiler_(compiler) {}
 
   absl::string_view name() const override {
@@ -56,7 +57,7 @@ class CudnnConvAlgorithmPicker : public HloModulePass {
       const HloCustomCallInstruction* instr);
 
   se::StreamExecutor* stream_exec_;       // never null
-  DeviceMemoryAllocator* allocator_;      // may be null
+  se::DeviceMemoryAllocator* allocator_;  // may be null
   Compiler* compiler_;
 };
 
@@ -174,9 +174,9 @@ StatusOr<bool> CusolverRewriter::RunOnComputation(HloComputation* computation) {
   const auto device_ordinal = stream_exec_->device_ordinal();
 
   // allocator either points to this->allocator_ or, if that's null, to a
-  // StreamExecutorMemoryAllocator for stream_exec_.
-  DeviceMemoryAllocator* allocator;
-  absl::optional<StreamExecutorMemoryAllocator> se_allocator;
+  // se::StreamExecutorMemoryAllocator for stream_exec_.
+  se::DeviceMemoryAllocator* allocator;
+  absl::optional<se::StreamExecutorMemoryAllocator> se_allocator;
   if (allocator_ != nullptr) {
     allocator = allocator_;
   } else {
@@ -200,7 +200,7 @@ StatusOr<bool> CusolverRewriter::RunOnComputation(HloComputation* computation) {
 }
 
 CusolverRewriter::CusolverRewriter(se::StreamExecutor* stream_exec,
-                                   DeviceMemoryAllocator* allocator)
+                                   se::DeviceMemoryAllocator* allocator)
     : stream_exec_(stream_exec), allocator_(allocator) {}
 
 StatusOr<bool> CusolverRewriter::Run(HloModule* module) {
@@ -16,12 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUSOLVER_REWRITER_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUSOLVER_REWRITER_H_
 
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/gpu/cusolver_context.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 namespace gpu {
@@ -30,7 +30,7 @@ namespace gpu {
 class CusolverRewriter : public HloModulePass {
  public:
   CusolverRewriter(se::StreamExecutor* stream_exec,
-                   DeviceMemoryAllocator* allocator);
+                   se::DeviceMemoryAllocator* allocator);
   absl::string_view name() const override { return "cusolver-rewriter"; }
 
   StatusOr<bool> Run(HloModule* module) override;
@@ -39,7 +39,7 @@ class CusolverRewriter : public HloModulePass {
   StatusOr<bool> RunOnComputation(HloComputation* computation);
 
   se::StreamExecutor* stream_exec_;       // never null
-  DeviceMemoryAllocator* allocator_;      // may be null
+  se::DeviceMemoryAllocator* allocator_;  // may be null
 };
 
 } // namespace gpu
@@ -29,7 +29,7 @@ namespace xla {
 namespace gpu {
 
 FftScratchAllocator::FftScratchAllocator(
-    int device_ordinal, DeviceMemoryAllocator* memory_allocator)
+    int device_ordinal, se::DeviceMemoryAllocator* memory_allocator)
     : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}
 
 int64 FftScratchAllocator::GetMemoryLimitInBytes(se::Stream* stream) {
@@ -48,7 +48,7 @@ StatusOr<se::DeviceMemory<uint8>> FftScratchAllocator::AllocateBytes(
         byte_size, GetMemoryLimitInBytes(stream)));
   }
 
-  TF_ASSIGN_OR_RETURN(OwningDeviceMemory allocated_buffer,
+  TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory allocated_buffer,
                       memory_allocator_->Allocate(device_ordinal_, byte_size,
                                                   /*retry_on_failure=*/false));
   total_allocated_bytes_ += byte_size;
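FftScratchAllocator::AllocateBytes (and the near-identical ScratchAllocator further down) follows one flow: reject requests over the memory limit, allocate without retry, stash the owning handle so it is freed when the allocator dies, and return a non-owning se::DeviceMemory<uint8> view. A condensed sketch of that flow, assuming the allocator API visible in this diff; kMemoryLimit, AllocateScratch, and the error text are illustrative:

    #include <utility>
    #include <vector>

    #include "tensorflow/compiler/xla/status_macros.h"
    #include "tensorflow/compiler/xla/statusor.h"
    #include "tensorflow/compiler/xla/types.h"
    #include "tensorflow/compiler/xla/util.h"
    #include "tensorflow/stream_executor/device_memory_allocator.h"

    namespace se = ::stream_executor;

    // Stand-in for the per-stream value GetMemoryLimitInBytes() computes.
    constexpr xla::int64 kMemoryLimit = xla::int64{1} << 32;

    xla::StatusOr<se::DeviceMemory<xla::uint8>> AllocateScratch(
        se::DeviceMemoryAllocator* memory_allocator, int device_ordinal,
        xla::int64 byte_size,
        std::vector<se::OwningDeviceMemory>* allocated_buffers,
        xla::int64* total_allocated_bytes) {
      if (byte_size > kMemoryLimit) {
        return xla::ResourceExhausted(
            "Requested %lld bytes, limit is %lld",
            static_cast<long long>(byte_size),
            static_cast<long long>(kMemoryLimit));
      }
      // No retry: scratch is best-effort, and callers can fall back to an
      // algorithm that needs less workspace when this fails.
      TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory allocated_buffer,
                          memory_allocator->Allocate(device_ordinal, byte_size,
                                                     /*retry_on_failure=*/false));
      *total_allocated_bytes += byte_size;
      // Return a non-owning view; the owning handle stays in allocated_buffers
      // so the memory is released when the scratch allocator is destroyed.
      se::DeviceMemoryBase base = allocated_buffer.AsDeviceMemoryBase();
      allocated_buffers->push_back(std::move(allocated_buffer));
      return se::DeviceMemory<xla::uint8>(base);
    }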
@@ -38,7 +38,7 @@ namespace gpu {
 class FftScratchAllocator : public se::ScratchAllocator {
  public:
   FftScratchAllocator(int device_ordinal,
-                      DeviceMemoryAllocator* memory_allocator);
+                      se::DeviceMemoryAllocator* memory_allocator);
 
   int64 GetMemoryLimitInBytes(se::Stream* stream) override;
 
@@ -49,8 +49,8 @@ class FftScratchAllocator : public se::ScratchAllocator {
 
  private:
   const int device_ordinal_;
-  DeviceMemoryAllocator* memory_allocator_;
-  std::vector<OwningDeviceMemory> allocated_buffers_;
+  se::DeviceMemoryAllocator* memory_allocator_;
+  std::vector<se::OwningDeviceMemory> allocated_buffers_;
   int64 total_allocated_bytes_ = 0;
 };
 
@@ -230,7 +230,7 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::Execute(
     const ServiceExecutableRunOptions* run_options,
     absl::Span<const ShapedBuffer* const> arguments,
     HloExecutionProfile* hlo_execution_profile, bool block_host_until_done) {
-  DeviceMemoryAllocator* memory_allocator = run_options->allocator();
+  se::DeviceMemoryAllocator* memory_allocator = run_options->allocator();
 
   if (GetRootPointsToSet().IsAmbiguous()) {
     return Unimplemented("Points-to set of root instruction is ambiguous");
@@ -348,7 +348,7 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream(
 StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     absl::Span<const ShapedBuffer* const> arguments) {
-  DeviceMemoryAllocator* memory_allocator = run_options->allocator();
+  se::DeviceMemoryAllocator* memory_allocator = run_options->allocator();
   // Force synchronous execution if the allocator requires it.
   bool block_host_until_done =
       !memory_allocator->AllowsAsynchronousDeallocation();
@@ -24,7 +24,6 @@ limitations under the License.
 #include "absl/types/optional.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
 #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
@@ -38,6 +37,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 namespace gpu {
@@ -164,7 +164,7 @@ string GetLibdeviceDir(const HloModuleConfig& hlo_module_config) {
 // It takes a compiler pointer, as passes may compile and execute HLOs on the
 // fly for cuDNN verification or other purposes.
 Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
-                         DeviceMemoryAllocator* device_allocator,
+                         se::DeviceMemoryAllocator* device_allocator,
                          Compiler* compiler) {
   {
     HloPassPipeline pipeline("optimization");
@@ -463,7 +463,7 @@ NVPTXCompiler::NVPTXCompiler()
 
 StatusOr<std::unique_ptr<HloModule>> NVPTXCompiler::RunHloPasses(
     std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
-    DeviceMemoryAllocator* device_allocator) {
+    se::DeviceMemoryAllocator* device_allocator) {
   // We dump the post-optimization HLO in RunBackend so no need to dump it here.
   XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunHloPasses");
   tensorflow::profiler::TraceMe activity(
@@ -479,7 +479,7 @@ StatusOr<std::unique_ptr<HloModule>> NVPTXCompiler::RunHloPasses(
 
 StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
     std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
-    DeviceMemoryAllocator* device_allocator) {
+    se::DeviceMemoryAllocator* device_allocator) {
   XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunBackend");
 
   TF_RET_CHECK(stream_exec != nullptr);
@@ -53,11 +53,11 @@ class NVPTXCompiler : public LLVMCompiler {
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
@@ -50,7 +50,7 @@ StatusOr<se::DeviceMemory<uint8>> RedzoneAllocator::AllocateBytes(
 
   int64 rhs_slop = RoundUpToNearest(byte_size, kRhsRedzoneAlign) - byte_size;
   TF_ASSIGN_OR_RETURN(
-      OwningDeviceMemory allocated_buffer,
+      se::OwningDeviceMemory allocated_buffer,
       memory_allocator_->Allocate(device_ordinal_,
                                   byte_size + 2 * redzone_size_ + rhs_slop,
                                   /*retry_on_failure=*/false));
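The size arithmetic above lays out one physical allocation as [LHS redzone | user buffer | slop | RHS redzone], where rhs_slop pads the user buffer up to kRhsRedzoneAlign so the right-hand redzone stays aligned. A worked sketch of that math; the 4-byte value of kRhsRedzoneAlign is an assumption taken from the "multiple of 4 bytes" comment in the header below:

    #include <cassert>
    #include <cstdint>

    // Illustrative layout math for a redzone'd allocation; the 4-byte
    // alignment is assumed here, not taken from the canonical constant.
    constexpr int64_t kRhsRedzoneAlign = 4;

    constexpr int64_t RoundUpToNearest(int64_t value, int64_t divisor) {
      return ((value + divisor - 1) / divisor) * divisor;
    }

    int64_t TotalAllocationSize(int64_t byte_size, int64_t redzone_size) {
      // Slop pads the user buffer so the RHS redzone starts aligned.
      int64_t rhs_slop = RoundUpToNearest(byte_size, kRhsRedzoneAlign) - byte_size;
      return byte_size + 2 * redzone_size + rhs_slop;
    }

    int main() {
      // byte_size = 77, redzone = 8 MiB per side:
      // slop = 80 - 77 = 3, total = 77 + 2 * (8 << 20) + 3.
      assert(TotalAllocationSize(77, 8 << 20) == 77 + 2 * (8 << 20) + 3);
      return 0;
    }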
@@ -18,12 +18,12 @@ limitations under the License.
 
 #include <vector>
 
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
-#include "tensorflow/compiler/xla/service/owning_device_memory.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"
 
 namespace xla {
 namespace gpu {
@@ -41,7 +41,8 @@ namespace gpu {
 // memory for cudnn convolutions.
 class RedzoneAllocator : public se::ScratchAllocator {
  public:
-  RedzoneAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator,
+  RedzoneAllocator(int device_ordinal,
+                   se::DeviceMemoryAllocator* memory_allocator,
                    const HloModuleConfig& hlo_module_config,
                    int64 redzone_size = 1 << 23,  // 8MiB per side, 16MiB total
                    uint8 redzone_pattern = -1)
@@ -76,14 +77,14 @@ class RedzoneAllocator : public se::ScratchAllocator {
   const int64 redzone_size_;
 
   const uint8 redzone_pattern_;
-  DeviceMemoryAllocator* memory_allocator_;
+  se::DeviceMemoryAllocator* memory_allocator_;
   const HloModuleConfig& hlo_module_config_;
 
   // The second element of the pair is the size of the user allocation. This
   // isn't necessarily just first.size() - 2 * redzone_size_ because when the
   // user allocation size is not a multiple of 4 bytes, we round up the size of
   // the RHS redzone.
-  std::vector<std::pair<OwningDeviceMemory, int64>> allocated_buffers_;
+  std::vector<std::pair<se::OwningDeviceMemory, int64>> allocated_buffers_;
 
   int64 allocated_bytes_excluding_redzones_ = 0;
 };
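For context on why the pattern byte and the per-buffer user size are stored: after the autotuned kernels run, both redzones are compared against redzone_pattern_, and any mismatch indicates an out-of-bounds write. A host-side sketch of that comparison, assuming the slop is initialized like the RHS redzone (which the allocation math above suggests); the real check operates on device buffers through the stream:

    #include <cstddef>
    #include <cstdint>

    // Host-side sketch of a redzone integrity check. `buf` holds
    // [lhs redzone | user data | slop + rhs redzone]; `total_size` is the
    // full allocation, `user_size` the recorded second element of the pair.
    bool RedzonesIntact(const uint8_t* buf, size_t total_size, size_t user_size,
                        size_t redzone_size, uint8_t redzone_pattern) {
      // LHS redzone occupies [0, redzone_size).
      for (size_t i = 0; i < redzone_size; ++i) {
        if (buf[i] != redzone_pattern) return false;
      }
      // Everything past the user data (slop plus RHS redzone) must also
      // hold the pattern, assuming the slop was filled like a redzone.
      for (size_t i = redzone_size + user_size; i < total_size; ++i) {
        if (buf[i] != redzone_pattern) return false;
      }
      return true;
    }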
@@ -15,13 +15,13 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/redzone_allocator.h"
 
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 #include "tensorflow/stream_executor/multi_platform_manager.h"
 #include "tensorflow/stream_executor/platform.h"
 
@@ -42,7 +42,7 @@ TEST(RedzoneAllocatorTest, WriteToRedzone) {
       se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
   se::StreamExecutor* stream_exec = platform->ExecutorForDevice(0).ValueOrDie();
   HloModuleConfig config;
-  StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec});
+  se::StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec});
   RedzoneAllocator allocator(/*device_ordinal=*/0, &se_allocator, config,
                              kRedzoneSize, kRedzonePattern);
 
@@ -118,7 +118,7 @@ TEST(RedzoneAllocatorTest, VeryLargeRedzone) {
       se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
   se::StreamExecutor* stream_exec = platform->ExecutorForDevice(0).ValueOrDie();
   HloModuleConfig config;
-  StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec});
+  se::StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec});
   RedzoneAllocator allocator(/*device_ordinal=*/0, &se_allocator, config,
                              kRedzoneSize, /*redzone_pattern=*/-1);
   se::Stream stream(stream_exec);
@@ -29,7 +29,7 @@ StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes(
         byte_size, GetMemoryLimitInBytes(stream)));
   }
 
-  TF_ASSIGN_OR_RETURN(OwningDeviceMemory allocated_buffer,
+  TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory allocated_buffer,
                       memory_allocator_->Allocate(device_ordinal_, byte_size,
                                                   /*retry_on_failure=*/false));
   total_allocated_bytes_ += byte_size;
@@ -18,18 +18,19 @@ limitations under the License.
 
 #include <vector>
 
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
-#include "tensorflow/compiler/xla/service/owning_device_memory.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"
 
 namespace xla {
 namespace gpu {
 
 class ScratchAllocator : public se::ScratchAllocator {
  public:
-  ScratchAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator)
+  ScratchAllocator(int device_ordinal,
+                   se::DeviceMemoryAllocator* memory_allocator)
       : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}
 
   int64 GetMemoryLimitInBytes(se::Stream* stream) override {
@@ -50,8 +51,8 @@ class ScratchAllocator : public se::ScratchAllocator {
 
  private:
   const int device_ordinal_;
-  DeviceMemoryAllocator* memory_allocator_;
-  std::vector<OwningDeviceMemory> allocated_buffers_;
+  se::DeviceMemoryAllocator* memory_allocator_;
+  std::vector<se::OwningDeviceMemory> allocated_buffers_;
   int64 total_allocated_bytes_ = 0;
 };
 
@@ -96,7 +96,7 @@ Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {
 
 StatusOr<std::unique_ptr<HloModule>> InterpreterCompiler::RunHloPasses(
     std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* /*stream_exec*/,
-    DeviceMemoryAllocator* /*device_allocator*/) {
+    se::DeviceMemoryAllocator* /*device_allocator*/) {
   VLOG(1) << "Run hlo passes on graph " << hlo_module->name();
   TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get()));
   return std::move(hlo_module);
@@ -105,13 +105,13 @@ StatusOr<std::unique_ptr<HloModule>> InterpreterCompiler::RunHloPasses(
 Status InterpreterCompiler::RunHloPassesOnModuleGroup(
     HloModuleGroup* module_group,
     absl::Span<se::StreamExecutor* const> executors,
-    DeviceMemoryAllocator* device_allocator) {
+    se::DeviceMemoryAllocator* device_allocator) {
   return Unimplemented("Module group compilation not supported on Interpreter");
 }
 
 StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::RunBackend(
     std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
-    DeviceMemoryAllocator* /*device_allocator*/) {
+    se::DeviceMemoryAllocator* /*device_allocator*/) {
   TF_RET_CHECK(stream_exec != nullptr);
 
   VLOG(1) << "Run backend " << hlo_module->name();
@@ -137,7 +137,7 @@ StatusOr<std::vector<std::unique_ptr<Executable>>>
 InterpreterCompiler::RunBackendOnModuleGroup(
     std::unique_ptr<HloModuleGroup> module_group,
     std::vector<std::vector<se::StreamExecutor*>> stream_exec,
-    DeviceMemoryAllocator* device_allocator) {
+    se::DeviceMemoryAllocator* device_allocator) {
   return Unimplemented(
       "Module group compilation is not supported on Interpreter.");
 }
@@ -145,7 +145,7 @@ InterpreterCompiler::RunBackendOnModuleGroup(
 StatusOr<std::vector<std::unique_ptr<Executable>>> InterpreterCompiler::Compile(
     std::unique_ptr<HloModuleGroup> module_group,
     std::vector<std::vector<se::StreamExecutor*>> stream_exec,
-    DeviceMemoryAllocator* device_allocator) {
+    se::DeviceMemoryAllocator* device_allocator) {
   if (module_group->empty()) {
     return std::vector<std::unique_ptr<Executable>>();
   }
@@ -45,24 +45,24 @@ class InterpreterCompiler : public Compiler {
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
   Status RunHloPassesOnModuleGroup(
       HloModuleGroup* module_group,
       absl::Span<se::StreamExecutor* const> executors,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
   StatusOr<std::vector<std::unique_ptr<Executable>>> RunBackendOnModuleGroup(
       std::unique_ptr<HloModuleGroup> module_group,
      std::vector<std::vector<se::StreamExecutor*>> stream_exec,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::unique_ptr<HloModuleGroup> module_group,
       std::vector<std::vector<se::StreamExecutor*>> stream_exec,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
@@ -24,7 +24,7 @@ namespace xla {
 Status LLVMCompiler::RunHloPassesOnModuleGroup(
     HloModuleGroup* module_group,
     absl::Span<se::StreamExecutor* const> executors,
-    DeviceMemoryAllocator* device_allocator) {
+    se::DeviceMemoryAllocator* device_allocator) {
   return Unimplemented(
       "Model partitioning not implemented for the CPU/GPU compilers!");
 }
@@ -33,7 +33,7 @@ StatusOr<std::vector<std::unique_ptr<Executable>>>
 LLVMCompiler::RunBackendOnModuleGroup(
     std::unique_ptr<HloModuleGroup> module_group,
     std::vector<std::vector<se::StreamExecutor*>> stream_exec,
-    DeviceMemoryAllocator* device_allocator) {
+    se::DeviceMemoryAllocator* device_allocator) {
   return Unimplemented(
       "Model partitioning not implemented for the CPU/GPU compilers!");
 }
@@ -41,7 +41,7 @@ LLVMCompiler::RunBackendOnModuleGroup(
 StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile(
     std::unique_ptr<HloModuleGroup> module_group,
     std::vector<std::vector<se::StreamExecutor*>> stream_execs,
-    DeviceMemoryAllocator* device_allocator) {
+    se::DeviceMemoryAllocator* device_allocator) {
   // Tensorflow tries to enable the following behaviors in all its threads:
   //
   // - Denormals are zero (DAZ): roughly, operations treat denormal floats as
@@ -61,28 +61,28 @@ class LLVMCompiler : public Compiler {
   //   StatusOr<std::unique_ptr<Executable>> RunBackend(
   //       std::unique_ptr<HloModule> module,
   //       se::StreamExecutor* stream_exec,
-  //       DeviceMemoryAllocator* device_allocator)
+  //       se::DeviceMemoryAllocator* device_allocator)
   //   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
   //       std::unique_ptr<HloModule> module,
   //       se::StreamExecutor* stream_exec,
-  //       DeviceMemoryAllocator* device_allocator)
+  //       se::DeviceMemoryAllocator* device_allocator)
   using Compiler::RunBackend;
   using Compiler::RunHloPasses;
 
   Status RunHloPassesOnModuleGroup(
       HloModuleGroup* module_group,
       absl::Span<se::StreamExecutor* const> executors,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> RunBackendOnModuleGroup(
       std::unique_ptr<HloModuleGroup> module_group,
      std::vector<std::vector<se::StreamExecutor*>> stream_exec,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
      std::unique_ptr<HloModuleGroup> module_group,
      std::vector<std::vector<se::StreamExecutor*>> stream_execs,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
 
  protected:
   ModuleHook user_pre_optimization_hook_;
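The two using-declarations above exist because declaring any overload of an inherited name hides the entire base-class overload set; `using Compiler::RunBackend;` and `using Compiler::RunHloPasses;` pull the hidden overloads back into scope. A standalone illustration of that C++ name-hiding rule (all names here are made up):

    #include <iostream>
    #include <string>

    struct Base {
      void Run(int x) { std::cout << "Base::Run(int) " << x << "\n"; }
      void Run(const std::string& s) { std::cout << "Base::Run(string) " << s << "\n"; }
    };

    struct Derived : Base {
      using Base::Run;  // Without this, Run(const std::string&) is hidden.
      void Run(int x) { std::cout << "Derived::Run(int) " << x << "\n"; }
    };

    int main() {
      Derived d;
      d.Run(42);       // Derived::Run(int)
      d.Run("hello");  // Base::Run(string); ill-formed without the using-decl.
      return 0;
    }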
@@ -23,13 +23,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/service.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 
@@ -17,25 +17,29 @@ limitations under the License.
 #include "absl/types/variant.h"
 namespace xla {
 
-se::DeviceMemoryBase MaybeOwningDeviceMemory::AsDeviceMemoryBase() {
+tensorflow::se::DeviceMemoryBase MaybeOwningDeviceMemory::AsDeviceMemoryBase() {
   if (HasOwnership()) {
-    return absl::get<OwningDeviceMemory>(mem_).AsDeviceMemoryBase();
+    return absl::get<tensorflow::se::OwningDeviceMemory>(mem_)
+        .AsDeviceMemoryBase();
   } else {
-    return absl::get<se::DeviceMemoryBase>(mem_);
+    return absl::get<tensorflow::se::DeviceMemoryBase>(mem_);
   }
 }
 
 bool MaybeOwningDeviceMemory::HasOwnership() const {
-  return absl::holds_alternative<OwningDeviceMemory>(mem_);
+  return absl::holds_alternative<tensorflow::se::OwningDeviceMemory>(mem_);
 }
 
-absl::optional<OwningDeviceMemory> MaybeOwningDeviceMemory::Release() {
+absl::optional<tensorflow::se::OwningDeviceMemory>
+MaybeOwningDeviceMemory::Release() {
   if (!HasOwnership()) {
     return {};
   }
-  OwningDeviceMemory result = std::move(absl::get<OwningDeviceMemory>(mem_));
+  tensorflow::se::OwningDeviceMemory result =
+      std::move(absl::get<tensorflow::se::OwningDeviceMemory>(mem_));
   mem_ = result.AsDeviceMemoryBase();
-  return absl::make_optional<OwningDeviceMemory>(std::move(result));
+  return absl::make_optional<tensorflow::se::OwningDeviceMemory>(
+      std::move(result));
 }
 
 } // namespace xla
@@ -18,30 +18,30 @@ limitations under the License.
 
 #include "absl/types/optional.h"
 #include "absl/types/variant.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
-#include "tensorflow/compiler/xla/service/owning_device_memory.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"
 
 namespace xla {
 
 // MaybeOwningDeviceMemory represents either an owned or unowned device memory.
-// Like std::variant<OwningDeviceMemory, DeviceMemory>. When the object goes
+// Like std::variant<se::OwningDeviceMemory, DeviceMemory>. When the object goes
 // output of scope, it will free the underlying memory if it owns it.
 class MaybeOwningDeviceMemory {
  public:
   MaybeOwningDeviceMemory() = default;
-  explicit MaybeOwningDeviceMemory(OwningDeviceMemory owned)
+  explicit MaybeOwningDeviceMemory(tensorflow::se::OwningDeviceMemory owned)
       : mem_(std::move(owned)) {}
-  explicit MaybeOwningDeviceMemory(se::DeviceMemoryBase unowned)
+  explicit MaybeOwningDeviceMemory(tensorflow::se::DeviceMemoryBase unowned)
       : mem_(unowned) {}
   MaybeOwningDeviceMemory(MaybeOwningDeviceMemory&&) = default;
   ~MaybeOwningDeviceMemory() = default;
 
-  MaybeOwningDeviceMemory& operator=(se::DeviceMemoryBase unowned) {
+  MaybeOwningDeviceMemory& operator=(tensorflow::se::DeviceMemoryBase unowned) {
     mem_ = unowned;
     return *this;
   }
 
-  MaybeOwningDeviceMemory& operator=(OwningDeviceMemory owned) {
+  MaybeOwningDeviceMemory& operator=(tensorflow::se::OwningDeviceMemory owned) {
     mem_ = std::move(owned);
     return *this;
   }
@@ -50,19 +50,21 @@ class MaybeOwningDeviceMemory {
 
   // Fetches the underlying DeviceMemoryBase from a MaybeOwningDeviceMemory. The
   // caller of this function is *not* responsible for freeing the memory.
-  se::DeviceMemoryBase AsDeviceMemoryBase();
+  tensorflow::se::DeviceMemoryBase AsDeviceMemoryBase();
 
-  // Release the OwningDeviceMemory without freeing it, and moves the ownership
-  // of the memory buffer from the object to the caller.
+  // Release the tensorflow::se::OwningDeviceMemory without freeing it, and
+  // moves the ownership of the memory buffer from the object to the caller.
   //
   // A nullopt is returned if the HasOwnership() == false;
-  absl::optional<OwningDeviceMemory> Release();
+  absl::optional<tensorflow::se::OwningDeviceMemory> Release();
 
   // Returns true if the device_memory has ownership over underlying memory.
   bool HasOwnership() const;
 
  private:
-  absl::variant<OwningDeviceMemory, se::DeviceMemoryBase> mem_;
+  absl::variant<tensorflow::se::OwningDeviceMemory,
+                tensorflow::se::DeviceMemoryBase>
+      mem_;
 };
 
 } // namespace xla
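MaybeOwningDeviceMemory is a thin wrapper over absl::variant: ownership is encoded in which alternative is active, HasOwnership() is a holds_alternative query, and Release() moves the owning alternative out while downgrading the stored value to an unowned view. A self-contained sketch of the same idiom over plain host memory; MaybeOwningBuffer and its helper types are invented for illustration:

    #include <cstdlib>
    #include <iostream>
    #include <memory>
    #include <utility>
    #include <variant>

    // Unowned view: a raw pointer we must not free.
    struct BufferView { void* ptr = nullptr; };

    // Owning handle: frees on destruction, like se::OwningDeviceMemory.
    using OwningBuffer = std::unique_ptr<void, decltype(&std::free)>;

    class MaybeOwningBuffer {
     public:
      explicit MaybeOwningBuffer(OwningBuffer owned) : mem_(std::move(owned)) {}
      explicit MaybeOwningBuffer(BufferView unowned) : mem_(unowned) {}

      bool HasOwnership() const {
        return std::holds_alternative<OwningBuffer>(mem_);
      }

      // Moves the owning handle to the caller and keeps an unowned view,
      // mirroring MaybeOwningDeviceMemory::Release(). Only call when owning.
      OwningBuffer Release() {
        OwningBuffer result = std::move(std::get<OwningBuffer>(mem_));
        mem_ = BufferView{result.get()};
        return result;
      }

     private:
      std::variant<OwningBuffer, BufferView> mem_;
    };

    int main() {
      MaybeOwningBuffer buf(OwningBuffer(std::malloc(16), &std::free));
      std::cout << buf.HasOwnership() << "\n";  // 1
      OwningBuffer taken = buf.Release();       // ownership moves to caller
      std::cout << buf.HasOwnership() << "\n";  // 0: only a view remains
      return 0;
    }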
@@ -29,7 +29,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/computation_placer.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/dump.h"
 #include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
 #include "tensorflow/compiler/xla/service/executable.h"
@@ -58,6 +57,7 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/ptr_util.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 namespace {
@@ -347,7 +347,7 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
     const std::vector<const HloModuleProto*>& module_protos,
     std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
     Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
-    DeviceMemoryAllocator* device_allocator) {
+    se::DeviceMemoryAllocator* device_allocator) {
   VLOG(1) << StrFormat("BuildExecutable on service %p", this);
 
   // Dump computation proto state if flag is set.
@@ -783,7 +783,7 @@ Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg,
 StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
     const HloModuleProto& module_proto,
     std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
-    se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator) {
+    se::StreamExecutor* executor, se::DeviceMemoryAllocator* device_allocator) {
   VLOG(1) << StrFormat(
       "BuildExecutable on service %p with serialized module proto: %s", this,
       module_proto.name());
@@ -29,7 +29,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/channel_tracker.h"
 #include "tensorflow/compiler/xla/service/compilation_cache.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/execution_tracker.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
@@ -43,6 +42,7 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 
@@ -234,7 +234,7 @@ class Service : public ServiceInterface {
       const HloModuleProto& module_proto,
       std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
       se::StreamExecutor* executor,
-      DeviceMemoryAllocator* device_allocator = nullptr);
+      se::DeviceMemoryAllocator* device_allocator = nullptr);
 
   // Same as BuildExecutable() above, but builds a list of Executables for the
   // given computations that may interact with each other.
@@ -242,7 +242,7 @@ class Service : public ServiceInterface {
       const std::vector<const HloModuleProto*>& module_protos,
       std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
       Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
-      DeviceMemoryAllocator* device_allocator);
+      se::DeviceMemoryAllocator* device_allocator);
 
   // Runs the given executable with the given arguments and register the result
   // in the allocation tracker. The handle of the result from the tracker is
@@ -43,7 +43,9 @@ class ServiceExecutableRunOptions {
 
   // Delegate to `ExecutableRunOptions` member.
   se::Stream* stream() const { return run_options_.stream(); }
-  DeviceMemoryAllocator* allocator() const { return run_options_.allocator(); }
+  se::DeviceMemoryAllocator* allocator() const {
+    return run_options_.allocator();
+  }
   int device_ordinal() const { return run_options_.device_ordinal(); }
 
   // Borrows a stream and returns a smart pointer which returns the stream on
@@ -119,14 +119,14 @@ std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer) {
 
 ScopedShapedBuffer::ScopedShapedBuffer(const Shape& on_host_shape,
                                        const Shape& on_device_shape,
-                                       DeviceMemoryAllocator* allocator,
+                                       se::DeviceMemoryAllocator* allocator,
                                        int device_ordinal)
     : ShapedBuffer(on_host_shape, on_device_shape, allocator->platform(),
                    device_ordinal),
       allocator_(allocator) {}
 
 ScopedShapedBuffer::ScopedShapedBuffer(ShapedBuffer shaped_buffer,
-                                       DeviceMemoryAllocator* allocator)
+                                       se::DeviceMemoryAllocator* allocator)
     : ShapedBuffer(std::move(shaped_buffer)), allocator_(allocator) {}
 
 ScopedShapedBuffer::ScopedShapedBuffer(ScopedShapedBuffer&& s)
@@ -21,12 +21,12 @@ limitations under the License.
 #include <string>
 
 #include "absl/types/span.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/shape_tree.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 
@@ -138,13 +138,13 @@ class ScopedShapedBuffer : public ShapedBuffer {
   // Creates a ScopedShapedBuffer with null DeviceMemoryBases at each index.
   explicit ScopedShapedBuffer(const Shape& on_host_shape,
                               const Shape& on_device_shape,
-                              DeviceMemoryAllocator* allocator,
+                              se::DeviceMemoryAllocator* allocator,
                               int device_ordinal);
 
   // Create a ScopedShapedBuffer by taking over the memory from the incoming
   // ShapedBuffer.
   explicit ScopedShapedBuffer(ShapedBuffer shaped_buffer,
-                              DeviceMemoryAllocator* allocator);
+                              se::DeviceMemoryAllocator* allocator);
 
   // Movable, but not copyable.
   ScopedShapedBuffer(ScopedShapedBuffer&& s);
@@ -157,13 +157,13 @@ class ScopedShapedBuffer : public ShapedBuffer {
 
   // Return the allocator used to allocate the device memory held in this
   // ScopedShapedBuffer.
-  DeviceMemoryAllocator* memory_allocator() const { return allocator_; }
+  se::DeviceMemoryAllocator* memory_allocator() const { return allocator_; }
 
   // Sets the device memory buffer at the given index.
   //
   // If the given buffer's device memory is non-null, its device_ordinal and
   // allocator must match those in `this`.
-  void set_buffer(OwningDeviceMemory buffer, const ShapeIndex& index) {
+  void set_buffer(se::OwningDeviceMemory buffer, const ShapeIndex& index) {
     if (!buffer.is_null()) {
      CHECK_EQ(buffer.device_ordinal(), device_ordinal());
      CHECK_EQ(buffer.allocator(), allocator_);
@@ -187,7 +187,7 @@ class ScopedShapedBuffer : public ShapedBuffer {
 protected:
   void Deallocate();
 
-  DeviceMemoryAllocator* allocator_;
+  se::DeviceMemoryAllocator* allocator_;
 };
 
 } // namespace xla
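set_buffer() above shows the owning-handle handoff this commit is unifying: the se::OwningDeviceMemory is consumed by value, validated against the holder's ordinal and allocator, and from then on released by ~ScopedShapedBuffer. A hypothetical usage sketch; FillRootSlot and its parameters are illustrative, and the two-arg Allocate overload is the one pulled in by the test file below:

    #include <utility>

    #include "tensorflow/compiler/xla/service/shaped_buffer.h"
    #include "tensorflow/compiler/xla/status_macros.h"
    #include "tensorflow/compiler/xla/types.h"
    #include "tensorflow/stream_executor/device_memory_allocator.h"
    #include "tensorflow/stream_executor/owning_device_memory.h"

    namespace se = ::stream_executor;

    // Hypothetical helper: allocate one buffer and hand ownership to the
    // root slot of a ScopedShapedBuffer. Ordinal and allocator must match
    // the holder, as the CHECKs in set_buffer() enforce.
    xla::Status FillRootSlot(se::DeviceMemoryAllocator* allocator,
                             xla::ScopedShapedBuffer* scoped,
                             xla::int64 size) {
      TF_ASSIGN_OR_RETURN(
          se::OwningDeviceMemory mem,
          allocator->Allocate(scoped->device_ordinal(), size));
      // Ownership moves into the buffer tree; Deallocate() in
      // ~ScopedShapedBuffer releases it with every other non-null leaf.
      scoped->set_buffer(std::move(mem), /*index=*/{});
      return xla::Status::OK();
    }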
@@ -16,13 +16,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 
 #include "absl/memory/memory.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/util/ptr_util.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 namespace {
@@ -34,7 +34,7 @@ TEST(ShapedBufferTest, ScopedShapeBufferAsShapedBufferB71629047) {
   auto* platform = platforms[0];
   TF_ASSERT_OK_AND_ASSIGN(auto executors,
                           xla::PlatformUtil::GetStreamExecutors(platform));
-  xla::StreamExecutorMemoryAllocator allocator(platform, executors);
+  xla::se::StreamExecutorMemoryAllocator allocator(platform, executors);
   const xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {});
   const int kDeviceOrdinal = 0;
   auto scoped_buffer = absl::make_unique<xla::ScopedShapedBuffer>(
@@ -43,11 +43,11 @@ TEST(ShapedBufferTest, ScopedShapeBufferAsShapedBufferB71629047) {
   buffer = nullptr;
 }
 
-class TestAllocator : public DeviceMemoryAllocator {
+class TestAllocator : public se::DeviceMemoryAllocator {
 public:
   TestAllocator()
-      : DeviceMemoryAllocator(PlatformUtil::GetDefaultPlatform().ValueOrDie()) {
-  }
+      : se::DeviceMemoryAllocator(
+            PlatformUtil::GetDefaultPlatform().ValueOrDie()) {}
 
   ~TestAllocator() override {
     if (!allocations_.empty()) {
@ -56,18 +56,18 @@ class TestAllocator : public DeviceMemoryAllocator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Pull in two-arg overload of Allocate.
|
// Pull in two-arg overload of Allocate.
|
||||||
using DeviceMemoryAllocator::Allocate;
|
using se::DeviceMemoryAllocator::Allocate;
|
||||||
|
|
||||||
StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
|
StatusOr<se::OwningDeviceMemory> Allocate(
|
||||||
bool /*retry_on_failure*/) override {
|
int device_ordinal, uint64 size, bool /*retry_on_failure*/) override {
|
||||||
// By contract, we must return null if size == 0.
|
// By contract, we must return null if size == 0.
|
||||||
if (size == 0) {
|
if (size == 0) {
|
||||||
return OwningDeviceMemory();
|
return se::OwningDeviceMemory();
|
||||||
}
|
}
|
||||||
void* buf = malloc(size);
|
void* buf = malloc(size);
|
||||||
allocations_.insert({device_ordinal, buf});
|
allocations_.insert({device_ordinal, buf});
|
||||||
return OwningDeviceMemory(se::DeviceMemoryBase(buf, size), device_ordinal,
|
return se::OwningDeviceMemory(se::DeviceMemoryBase(buf, size),
|
||||||
this);
|
device_ordinal, this);
|
||||||
}
|
}
|
||||||
|
|
||||||
Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override {
|
Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override {
|
||||||
@ -120,7 +120,7 @@ TEST(ScopedShapedBufferTest, TestTakeSubTree) {
|
|||||||
sb.buffers().ForEachMutableElement(
|
sb.buffers().ForEachMutableElement(
|
||||||
[&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) {
|
[&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) {
|
||||||
TF_ASSERT_OK_AND_ASSIGN(
|
TF_ASSERT_OK_AND_ASSIGN(
|
||||||
OwningDeviceMemory m,
|
se::OwningDeviceMemory m,
|
||||||
allocator.Allocate(/*device_ordinal=*/0, /*size=*/77));
|
allocator.Allocate(/*device_ordinal=*/0, /*size=*/77));
|
||||||
*buffer = m.Forget();
|
*buffer = m.Forget();
|
||||||
});
|
});
|
||||||
@ -158,7 +158,7 @@ TEST(ScopedShapedBufferTest, TestSubShapeTree) {
|
|||||||
sb.buffers().ForEachMutableElement(
|
sb.buffers().ForEachMutableElement(
|
||||||
[&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) {
|
[&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) {
|
||||||
TF_ASSERT_OK_AND_ASSIGN(
|
TF_ASSERT_OK_AND_ASSIGN(
|
||||||
OwningDeviceMemory m,
|
se::OwningDeviceMemory m,
|
||||||
allocator.Allocate(/*device_ordinal=*/0, /*size=*/32));
|
allocator.Allocate(/*device_ordinal=*/0, /*size=*/32));
|
||||||
*buffer = m.Forget();
|
*buffer = m.Forget();
|
||||||
});
|
});
|
||||||
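The rewritten TestAllocator pins down the behavior that matters across the move: the two-arg Allocate() pulled in by the `using` declaration defaults retry_on_failure to true, and a zero-byte request must come back null. A test fragment one might add to exercise exactly that (hypothetical, not part of the commit):

```cpp
TEST(TestAllocatorTest, ZeroSizeIsNullNonzeroIsOwned) {
  TestAllocator allocator;
  // Two-arg overload from `using se::DeviceMemoryAllocator::Allocate`.
  TF_ASSERT_OK_AND_ASSIGN(
      se::OwningDeviceMemory zero,
      allocator.Allocate(/*device_ordinal=*/0, /*size=*/0));
  EXPECT_TRUE(zero.is_null());  // By contract, size == 0 yields null.

  TF_ASSERT_OK_AND_ASSIGN(
      se::OwningDeviceMemory mem,
      allocator.Allocate(/*device_ordinal=*/0, /*size=*/64));
  EXPECT_FALSE(mem.is_null());
  // ~OwningDeviceMemory hands the block back to `allocator` automatically,
  // so the TestAllocator destructor's leak check stays quiet.
}
```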
@@ -308,7 +308,7 @@ Status TransferManager::TransferBufferToDevice(
 }
 
 StatusOr<ScopedShapedBuffer> TransferManager::AllocateScopedShapedBuffer(
-    const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
+    const Shape& on_host_shape, se::DeviceMemoryAllocator* allocator,
     int device_ordinal) {
   if (!LayoutUtil::HasLayout(on_host_shape)) {
     return InvalidArgument("Shape must have a layout: %s",

@@ -229,7 +229,7 @@ class TransferManager {
   // shape. The on-device shape may be different as indicated by
   // HostShapeToDeviceShape.
   StatusOr<ScopedShapedBuffer> AllocateScopedShapedBuffer(
-      const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
+      const Shape& on_host_shape, se::DeviceMemoryAllocator* allocator,
      int device_ordinal);
 
   // The given ShapedBuffer holds a handle to allocated memory, but it is not
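For TransferManager callers only the parameter's type changes, not the call pattern. A hedged sketch of a call site after the rename, assuming an xla::Backend* obtained elsewhere (for example from a LocalService); the helper name is illustrative:

```cpp
#include "tensorflow/compiler/xla/service/backend.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/service/transfer_manager.h"
#include "tensorflow/compiler/xla/statusor.h"

namespace se = ::stream_executor;

xla::StatusOr<xla::ScopedShapedBuffer> AllocateForHostShape(
    xla::Backend* backend, const xla::Shape& host_shape) {
  // memory_allocator() now hands back an se::DeviceMemoryAllocator*.
  se::DeviceMemoryAllocator* allocator = backend->memory_allocator();
  return backend->transfer_manager()->AllocateScopedShapedBuffer(
      host_shape, allocator, /*device_ordinal=*/0);
}
```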
@@ -259,7 +259,6 @@ cc_library(
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:computation_placer",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/service:shaped_buffer",
@@ -268,6 +267,7 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "//third_party/eigen3",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:span",
@@ -1172,7 +1172,6 @@ xla_test(
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/service:computation_placer",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/service:shaped_buffer",
@@ -1183,6 +1182,7 @@ xla_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
+        "//tensorflow/stream_executor:device_memory_allocator",
     ],
 )
 
@@ -2078,7 +2078,6 @@ xla_test(
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/service:shaped_buffer",
@@ -2090,6 +2089,7 @@ xla_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
+        "//tensorflow/stream_executor:device_memory_allocator",
     ],
 )
 
@@ -2206,13 +2206,13 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:generic_transfer_manager",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/compiler/xla/service:stream_pool",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
+        "//tensorflow/stream_executor:device_memory_allocator",
     ],
 )
 
@@ -1521,7 +1521,7 @@ void DOT_ReorderContracting(int num_iters) {
 
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
   auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
-  StreamExecutorMemoryAllocator allocator(platform, executors);
+  se::StreamExecutorMemoryAllocator allocator(platform, executors);
 
   xla::LocalClientOptions client_options;
   client_options.set_platform(platform);

@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/reference_util.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
@@ -34,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 namespace {
@@ -736,7 +736,7 @@ void BM_DynamicSlice(int num_iters) {
 
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
   auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
-  StreamExecutorMemoryAllocator allocator(platform, executors);
+  se::StreamExecutorMemoryAllocator allocator(platform, executors);
   LocalClient* client =
       ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie();
   auto* transfer_manager =
@@ -829,7 +829,7 @@ void BM_ParallelFusion(int num_iters) {
 
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
   auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
-  StreamExecutorMemoryAllocator allocator(platform, executors);
+  se::StreamExecutorMemoryAllocator allocator(platform, executors);
 
   const int64 intra_op_parallelism_threads = 24;
   xla::LocalClientOptions client_options;

@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
@@ -41,6 +40,7 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 namespace {
@@ -902,7 +902,7 @@ void BM_LocalClientOverhead(int num_iters) {
 
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
   auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
-  StreamExecutorMemoryAllocator allocator(platform, executors);
+  se::StreamExecutorMemoryAllocator allocator(platform, executors);
   LocalClient* client =
       ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie();
   auto* transfer_manager =
@@ -35,16 +35,15 @@ namespace xla {
 
 /* static */ TestAllocator* LocalClientTestBase::allocator_;
 
-StatusOr<OwningDeviceMemory> TestAllocator::Allocate(int device_ordinal,
-                                                     uint64 size,
-                                                     bool retry_on_failure) {
+StatusOr<se::OwningDeviceMemory> TestAllocator::Allocate(
+    int device_ordinal, uint64 size, bool retry_on_failure) {
   VLOG(2) << "Allocate(" << device_ordinal << ", " << size << ")";
   {
     tensorflow::mutex_lock lock(count_mutex_);
     allocation_count_++;
     device_allocation_count_[device_ordinal]++;
   }
-  return StreamExecutorMemoryAllocator::Allocate(device_ordinal, size,
+  return se::StreamExecutorMemoryAllocator::Allocate(device_ordinal, size,
                                                  retry_on_failure);
 }
 
@@ -55,7 +54,7 @@ Status TestAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) {
     deallocation_count_++;
     device_deallocation_count_[device_ordinal]++;
   }
-  return StreamExecutorMemoryAllocator::Deallocate(device_ordinal, mem);
+  return se::StreamExecutorMemoryAllocator::Deallocate(device_ordinal, mem);
 }
 
 int64 TestAllocator::allocation_count() const {

@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
@@ -36,17 +35,18 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 
-class TestAllocator : public StreamExecutorMemoryAllocator {
+class TestAllocator : public se::StreamExecutorMemoryAllocator {
  public:
   explicit TestAllocator(se::Platform* platform)
-      : StreamExecutorMemoryAllocator(
+      : se::StreamExecutorMemoryAllocator(
            platform, PlatformUtil::GetStreamExecutors(platform).ValueOrDie()) {
   }
 
-  StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
+  StatusOr<se::OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
                                         bool retry_on_failure) override;
   Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override;
 
@@ -19,7 +19,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/generic_transfer_manager.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/service/stream_pool.h"
@@ -34,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 namespace {

@@ -1265,7 +1265,7 @@ void BM_WhileLoop(int num_iters) {
 
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
   auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
-  StreamExecutorMemoryAllocator allocator(platform, executors);
+  se::StreamExecutorMemoryAllocator allocator(platform, executors);
   LocalClient* client =
       ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie();
 

@@ -135,7 +135,7 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
   LocalService* service = ClientLibrary::GetXlaService(client->platform());
   Backend* backend = service->mutable_backend();
   se::StreamExecutor* executor = backend->default_stream_executor();
-  DeviceMemoryAllocator* allocator = backend->memory_allocator();
+  se::DeviceMemoryAllocator* allocator = backend->memory_allocator();
   auto* transfer_manager = backend->transfer_manager();
   TF_ASSERT_OK_AND_ASSIGN(
       StreamPool::Ptr stream_ptr,
@@ -271,7 +271,7 @@ StatusOr<Literal> ReplayComputation(const HloSnapshot& module,
   // Run the computation num_runs times, and return the result from the last
   // execution.
   const bool xla_hlo_profile = GetDebugOptionsFromFlags().xla_hlo_profile();
-  StreamExecutorMemoryAllocator allocator(
+  se::StreamExecutorMemoryAllocator allocator(
       client->platform(),
       {client->platform()->ExecutorForDevice(0).ValueOrDie()});
   absl::optional<ScopedShapedBuffer> final_result;

@@ -67,13 +67,13 @@ cc_library(
         "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service:backend",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
@@ -117,7 +117,7 @@ Status AllocateScopedShapedBuffer(
         xla::ShapeUtil::GetSubshape(on_device_shape, index_to_buffer.first);
     uint64 size = transfer_manager->GetByteSizeRequirement(subshape);
     TF_ASSIGN_OR_RETURN(
-        xla::OwningDeviceMemory buffer,
+        se::OwningDeviceMemory buffer,
         allocator->Allocate(device_ordinal, size, /*retry_on_failure=*/false));
     // Move our buffer into shaped_buffer, which takes ownership of it.
     index_to_buffer.second = buffer.Forget();
@@ -135,7 +135,7 @@ Status AllocateScopedShapedBuffer(
 
 XRTBufferAllocation::XRTBufferAllocation(const se::DeviceMemoryBase& allocation,
                                          int device_ordinal,
-                                         xla::DeviceMemoryAllocator* allocator)
+                                         se::DeviceMemoryAllocator* allocator)
     : size_(allocation.size()),
       allocation_(allocation),
       device_ordinal_(device_ordinal),
@@ -169,7 +169,7 @@ void XRTBufferAllocation::DiscardAllocation() {
 }
 
 XRTTupleAllocation::XRTTupleAllocation(int device_ordinal,
-                                       xla::DeviceMemoryAllocator* allocator,
+                                       se::DeviceMemoryAllocator* allocator,
                                        const xla::Shape& on_host_shape,
                                        const xla::Shape& on_device_shape)
     : device_ordinal_(device_ordinal),
@@ -342,7 +342,7 @@ typedef XRTBufferAllocation* XRTBufferAllocationPtr;
 
 /* static */ Status XRTTupleAllocation::ExpandTreeOfTuples(
     const xla::ShapeTree<ExpandedTupleInput>& elements, int device_ordinal,
-    xla::DeviceMemoryAllocator* allocator, xla::Shape* host_shape,
+    se::DeviceMemoryAllocator* allocator, xla::Shape* host_shape,
     xla::Shape* device_shape) {
   // Initialize both host and device shape to be the 'spine' of the new tuple
   // shape, given by the shape of the tree of tuples.
@@ -415,7 +415,7 @@ typedef XRTBufferAllocation* XRTBufferAllocationPtr;
       xla::Shape subshape =
           xla::ShapeUtil::GetSubshape(device_shape, index);
       uint64 size = transfer_manager->GetByteSizeRequirement(subshape);
-      TF_ASSIGN_OR_RETURN(xla::OwningDeviceMemory buffer,
+      TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory buffer,
                           allocator->Allocate(device_ordinal, size,
                                               /*retry_on_failure=*/false));
       VLOG(2) << "Allocated buffer at " << buffer.opaque() << " index "
@@ -502,7 +502,7 @@ bool XRTTupleAllocation::IsExclusiveOwner() {
 
 void XRTTupleAllocation::InitializeFromShapedBuffer(
     const xla::ShapedBuffer& shaped_buffer,
-    xla::DeviceMemoryAllocator* allocator, int device_ordinal) {
+    se::DeviceMemoryAllocator* allocator, int device_ordinal) {
   for (auto& buffer : buffers_) {
     // Make a reference-counted version of the allocated buffer.
     buffer.second = new XRTBufferAllocation(shaped_buffer.buffer(buffer.first),
@@ -549,7 +549,7 @@ XRTTupleAllocation::ToDeviceMemoryTree(
     if (!release_checker(buffer.first)) {
       *shaped_tree.mutable_element(buffer.first) = buffer.second->allocation();
     } else {
-      *shaped_tree.mutable_element(buffer.first) = xla::OwningDeviceMemory(
+      *shaped_tree.mutable_element(buffer.first) = se::OwningDeviceMemory(
           buffer.second->allocation(), device_ordinal_, allocator_);
       DiscardAllocation(buffer.first);
     }
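The XRT changes above repeat one handoff pattern: allocate an owning handle, then Forget() it into a raw DeviceMemoryBase slot whose lifetime is tracked by the surrounding structure. Isolated as a sketch, assuming the caller later deallocates whatever owns the slot (names are illustrative):

```cpp
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace se = ::stream_executor;

// Fill one raw slot from an owning allocation, as AllocateScopedShapedBuffer
// does for each ShapedBuffer index.
tensorflow::Status FillSlot(se::DeviceMemoryAllocator* allocator,
                            int device_ordinal, tensorflow::uint64 size,
                            se::DeviceMemoryBase* slot) {
  TF_ASSIGN_OR_RETURN(
      se::OwningDeviceMemory buffer,
      allocator->Allocate(device_ordinal, size, /*retry_on_failure=*/false));
  // Forget() releases ownership without freeing; whoever owns *slot is now
  // responsible for returning the region to `allocator`.
  *slot = buffer.Forget();
  return tensorflow::Status::OK();
}
```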
@@ -25,7 +25,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/backend.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -34,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 
 namespace tensorflow {
@@ -45,8 +45,7 @@ namespace tensorflow {
 class XRTBufferAllocation : public core::RefCounted {
  public:
   XRTBufferAllocation(const se::DeviceMemoryBase& allocation,
-                      int device_ordinal,
-                      xla::DeviceMemoryAllocator* allocator);
+                      int device_ordinal, se::DeviceMemoryAllocator* allocator);
   ~XRTBufferAllocation() override;
 
   // The region of device memory being wrapped.
@@ -69,7 +68,7 @@ class XRTBufferAllocation : public core::RefCounted {
   uint64 size_ = 0;
   se::DeviceMemoryBase allocation_;
   int device_ordinal_;
-  xla::DeviceMemoryAllocator* allocator_;
+  se::DeviceMemoryAllocator* allocator_;
 };
 
 // Entry in the resource manager corresponding to an allocation handle returned
@@ -197,14 +196,14 @@ class XRTTupleAllocation : public ResourceBase {
 
  private:
   // Creates a new handle with (tuple) shape.
-  XRTTupleAllocation(int device_ordinal, xla::DeviceMemoryAllocator* allocator,
+  XRTTupleAllocation(int device_ordinal, se::DeviceMemoryAllocator* allocator,
                      const xla::Shape& on_host_shape,
                      const xla::Shape& on_device_shape);
 
   // Inherits the allocations represented in buffer, which must have the same
   // shape as buffers_.
   void InitializeFromShapedBuffer(const xla::ShapedBuffer& shaped_buffer,
-                                  xla::DeviceMemoryAllocator* allocator,
+                                  se::DeviceMemoryAllocator* allocator,
                                   int device_ordinal);
 
   // Takes a tree 'elements' where each leaf is an allocation, validates that
@@ -214,12 +213,12 @@ class XRTTupleAllocation : public ResourceBase {
   // grafted on.
   static Status ExpandTreeOfTuples(
       const xla::ShapeTree<ExpandedTupleInput>& elements, int device_ordinal,
-      xla::DeviceMemoryAllocator* allocator, xla::Shape* host_shape,
+      se::DeviceMemoryAllocator* allocator, xla::Shape* host_shape,
       xla::Shape* device_shape);
 
   // Location of the memory that is being managed.
   int device_ordinal_;
-  xla::DeviceMemoryAllocator* allocator_;
+  se::DeviceMemoryAllocator* allocator_;
 
   // The shape that the caller thinks the tuple has.
   const xla::Shape on_host_shape_;
@@ -666,6 +666,27 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "device_memory_allocator",
+    srcs = [
+        "device_memory_allocator.cc",
+        "owning_device_memory.cc",
+    ],
+    hdrs = [
+        "device_memory_allocator.h",
+        "owning_device_memory.h",
+    ],
+    deps = [
+        ":platform",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor/lib",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
 tf_cc_test(
     name = "stream_test",
     size = "small",
@@ -13,30 +13,31 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 #include <string>
 
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/util.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 
-namespace xla {
+namespace stream_executor {
 
 StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator(
-    const se::Platform* platform,
-    absl::Span<se::StreamExecutor* const> stream_executors)
+    const Platform* platform,
+    absl::Span<StreamExecutor* const> stream_executors)
     : DeviceMemoryAllocator(platform),
       stream_executors_(stream_executors.begin(), stream_executors.end()) {}
 
-StatusOr<OwningDeviceMemory> StreamExecutorMemoryAllocator::Allocate(
+port::StatusOr<OwningDeviceMemory> StreamExecutorMemoryAllocator::Allocate(
     int device_ordinal, uint64 size, bool retry_on_failure) {
-  TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
-                      GetStreamExecutor(device_ordinal));
-  se::DeviceMemoryBase result = stream_executor->AllocateArray<uint8>(size);
+  port::StatusOr<StreamExecutor*> stream_executor_or =
+      GetStreamExecutor(device_ordinal);
+  TF_RETURN_IF_ERROR(stream_executor_or.status());
+  DeviceMemoryBase result =
+      stream_executor_or.ValueOrDie()->AllocateArray<uint8>(size);
   if (size > 0 && result == nullptr) {
-    return ResourceExhausted(
+    return tensorflow::errors::ResourceExhausted(
         "Failed to allocate request for %s (%uB) on device ordinal %d",
         tensorflow::strings::HumanReadableNumBytes(size), size, device_ordinal);
   }
@@ -47,32 +48,34 @@ StatusOr<OwningDeviceMemory> StreamExecutorMemoryAllocator::Allocate(
   return OwningDeviceMemory(result, device_ordinal, this);
 }
 
-Status StreamExecutorMemoryAllocator::Deallocate(int device_ordinal,
-                                                 se::DeviceMemoryBase mem) {
+port::Status StreamExecutorMemoryAllocator::Deallocate(int device_ordinal,
+                                                       DeviceMemoryBase mem) {
   if (!mem.is_null()) {
-    TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
-                        GetStreamExecutor(device_ordinal));
+    port::StatusOr<StreamExecutor*> stream_executor_or =
+        GetStreamExecutor(device_ordinal);
+    TF_RETURN_IF_ERROR(stream_executor_or.status());
     VLOG(3) << absl::StreamFormat("Freeing %p on device ordinal %d",
                                   mem.opaque(), device_ordinal);
-    stream_executor->Deallocate(&mem);
+    stream_executor_or.ValueOrDie()->Deallocate(&mem);
   }
-  return Status::OK();
+  return port::Status::OK();
 }
 
-StatusOr<se::StreamExecutor*> StreamExecutorMemoryAllocator::GetStreamExecutor(
-    int device_ordinal) {
+port::StatusOr<StreamExecutor*>
+StreamExecutorMemoryAllocator::GetStreamExecutor(int device_ordinal) {
   if (device_ordinal < 0) {
-    return InvalidArgument("device ordinal value (%d) must be non-negative",
-                           device_ordinal);
+    return tensorflow::errors::InvalidArgument(
+        "device ordinal value (%d) must be non-negative", device_ordinal);
   }
   if (device_ordinal >= stream_executors_.size()) {
-    return InvalidArgument(
+    return tensorflow::errors::InvalidArgument(
        "device ordinal value (%d) >= number of devices (%u)", device_ordinal,
        stream_executors_.size());
   }
   if (stream_executors_[device_ordinal] == nullptr) {
-    return NotFound("Device %s:%d present but not supported",
-                    platform()->Name(), device_ordinal);
+    return tensorflow::errors::NotFound(
+        absl::StrFormat("Device %s:%d present but not supported",
+                        platform()->Name(), device_ordinal));
   }
   return stream_executors_[device_ordinal];
 }
@@ -81,4 +84,4 @@ bool StreamExecutorMemoryAllocator::AllowsAsynchronousDeallocation() const {
   return false;
 }
 
-}  // namespace xla
+}  // namespace stream_executor
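Most of the .cc rewrite is namespace and Status spelling, but one structural change recurs: TF_ASSIGN_OR_RETURN is replaced by an explicit StatusOr unpack, presumably because the XLA-side macro is not available in this package. The idiom, isolated as a sketch of what the code above now does:

```cpp
// Unpack a port::StatusOr by hand: propagate failure, then read the value.
port::StatusOr<StreamExecutor*> stream_executor_or =
    GetStreamExecutor(device_ordinal);
TF_RETURN_IF_ERROR(stream_executor_or.status());   // early-out on error
StreamExecutor* stream_executor = stream_executor_or.ValueOrDie();  // safe now
```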
@@ -19,13 +19,13 @@ limitations under the License.
 #include <vector>
 
 #include "absl/types/span.h"
-#include "tensorflow/compiler/xla/service/owning_device_memory.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"
+#include "tensorflow/stream_executor/platform.h"
 
-namespace xla {
+namespace stream_executor {
 
 // Interface for device memory allocators used within the XLA service. An
 // allocator is responsible for allocating memory on all devices of a particular
@@ -34,7 +34,7 @@ class DeviceMemoryAllocator {
  public:
   // Parameter platform indicates which platform the allocator allocates memory
   // on. Must be non-null.
-  explicit DeviceMemoryAllocator(const se::Platform* platform)
+  explicit DeviceMemoryAllocator(const Platform* platform)
       : platform_(platform) {}
   virtual ~DeviceMemoryAllocator() {}
 
@@ -47,23 +47,23 @@ class DeviceMemoryAllocator {
   // fails, the allocation should return immediately without retrying. An
   // example use case is optional scratch spaces where a failure has only
   // performance impact.
-  virtual StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
-                                                bool retry_on_failure) = 0;
+  virtual port::StatusOr<OwningDeviceMemory> Allocate(
+      int device_ordinal, uint64 size, bool retry_on_failure) = 0;
 
   // Two-arg version of Allocate(), which sets retry-on-failure to true.
   //
   // (We don't simply use a default argument on the virtual Allocate function
   // because default args on virtual functions are disallowed by the Google
   // style guide.)
-  StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size) {
+  port::StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size) {
     return Allocate(device_ordinal, size, /*retry_on_failure=*/true);
   }
 
   // Must be a nop for null pointers.
-  virtual Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) = 0;
+  virtual port::Status Deallocate(int device_ordinal, DeviceMemoryBase mem) = 0;
 
   // Return the platform that the allocator allocates memory on.
-  const se::Platform* platform() const { return platform_; }
+  const Platform* platform() const { return platform_; }
 
   // Can we call Deallocate() as soon as a computation has been scheduled on
   // a stream, or do we have to wait for the computation to complete first?
@@ -71,7 +71,7 @@ class DeviceMemoryAllocator {
 
  protected:
   friend class OwningDeviceMemory;
-  const se::Platform* platform_;
+  const Platform* platform_;
 };
 
 // Default memory allocator for a platform which uses
@@ -79,28 +79,28 @@ class DeviceMemoryAllocator {
 class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator {
  public:
   StreamExecutorMemoryAllocator(
-      const se::Platform* platform,
-      absl::Span<se::StreamExecutor* const> stream_executors);
+      const Platform* platform,
+      absl::Span<StreamExecutor* const> stream_executors);
 
-  StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
+  port::StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
                                         bool retry_on_failure) override;
 
   // Pull in two-arg overload that sets retry_on_failure to true.
   using DeviceMemoryAllocator::Allocate;
 
-  Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override;
+  port::Status Deallocate(int device_ordinal, DeviceMemoryBase mem) override;
 
   bool AllowsAsynchronousDeallocation() const override;
 
  private:
-  StatusOr<se::StreamExecutor*> GetStreamExecutor(int device_ordinal);
+  port::StatusOr<StreamExecutor*> GetStreamExecutor(int device_ordinal);
 
   // A vector indexed by device ordinal of StreamExecutors for each device of
   // the allocator's platform type. If an element is nullptr, then the device
   // with the respective device ordinal is not supported by XLA.
-  std::vector<se::StreamExecutor*> stream_executors_;
+  std::vector<StreamExecutor*> stream_executors_;
 };
 
-}  // namespace xla
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DEVICE_MEMORY_ALLOCATOR_H_
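Code that supplied its own allocator now subclasses the same interface at its new location. A minimal sketch of a delegating allocator against the relocated header; LoggingAllocator and its behavior are hypothetical, not part of this change:

```cpp
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace se = ::stream_executor;

// Hypothetical wrapper that logs every allocation before delegating.
class LoggingAllocator : public se::DeviceMemoryAllocator {
 public:
  LoggingAllocator(const se::Platform* platform,
                   se::DeviceMemoryAllocator* wrapped)
      : se::DeviceMemoryAllocator(platform), wrapped_(wrapped) {}

  // Keep the two-arg convenience overload visible.
  using se::DeviceMemoryAllocator::Allocate;

  se::port::StatusOr<se::OwningDeviceMemory> Allocate(
      int device_ordinal, tensorflow::uint64 size,
      bool retry_on_failure) override {
    LOG(INFO) << "Allocating " << size << "B on ordinal " << device_ordinal;
    // Memory handed out below records `wrapped_` as its allocator, so
    // frees are routed there, not to this wrapper.
    return wrapped_->Allocate(device_ordinal, size, retry_on_failure);
  }

  se::port::Status Deallocate(int device_ordinal,
                              se::DeviceMemoryBase mem) override {
    return wrapped_->Deallocate(device_ordinal, mem);
  }

  bool AllowsAsynchronousDeallocation() const override {
    return wrapped_->AllowsAsynchronousDeallocation();
  }

 private:
  se::DeviceMemoryAllocator* wrapped_;  // Not owned.
};
```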
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/owning_device_memory.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"
 
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
-namespace xla {
+namespace stream_executor {
 
 void OwningDeviceMemory::Free() {
   CHECK(allocator_ != nullptr)
@@ -29,7 +29,7 @@ void OwningDeviceMemory::Free() {
   }
 
   allocator_ = nullptr;
-  mem_ = se::DeviceMemoryBase();
+  mem_ = DeviceMemoryBase();
 }
 
-}  // namespace xla
+}  // namespace stream_executor
@@ -16,12 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_OWNING_DEVICE_MEMORY_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_OWNING_DEVICE_MEMORY_H_
 
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-namespace xla {
+namespace stream_executor {
 
 // Break circular dependency between this file and device_memory_allocator.h.
 class DeviceMemoryAllocator;
@@ -43,7 +41,7 @@ class OwningDeviceMemory {
  public:
   OwningDeviceMemory() : device_ordinal_(-1), allocator_(nullptr) {}
 
-  explicit OwningDeviceMemory(se::DeviceMemoryBase mem, int device_ordinal,
+  explicit OwningDeviceMemory(DeviceMemoryBase mem, int device_ordinal,
                               DeviceMemoryAllocator* allocator)
      : mem_(mem), device_ordinal_(device_ordinal), allocator_(allocator) {
     CHECK(allocator != nullptr) << "allocator cannot be null.";
@@ -53,7 +51,7 @@ class OwningDeviceMemory {
       : mem_(other.mem_),
         device_ordinal_(other.device_ordinal_),
         allocator_(other.allocator_) {
-    other.mem_ = se::DeviceMemoryBase();
+    other.mem_ = DeviceMemoryBase();
     other.allocator_ = nullptr;
   }
 
@@ -65,7 +63,7 @@ class OwningDeviceMemory {
     device_ordinal_ = other.device_ordinal_;
     allocator_ = other.allocator_;
 
-    other.mem_ = se::DeviceMemoryBase();
+    other.mem_ = DeviceMemoryBase();
     other.allocator_ = nullptr;
     return *this;
   }
@@ -100,25 +98,25 @@ class OwningDeviceMemory {
   // !is_null() is sufficient but not necessary to imply `this` is active.
   bool is_null() const { return mem_.is_null(); }
 
-  se::DeviceMemoryBase AsDeviceMemoryBase() const {
+  DeviceMemoryBase AsDeviceMemoryBase() const {
     // This const_cast is necessary because DeviceMemoryBase's constructor
     // doesn't accept a const void*. This isn't ideal, but it's better than the
     // alternative of making a AsDeviceMemoryBase non-const member function.
     //
     // This is safe (i.e. not UB) because the casted pointer is derived from a
     // non-const pointer, namely mem_.opaque().
-    return se::DeviceMemoryBase(const_cast<void*>(opaque()), size());
+    return DeviceMemoryBase(const_cast<void*>(opaque()), size());
  }
 
   // Returns the wrapped DeviceMemoryBase without freeing it, and deactivates
   // this object. Precondition: `this` is active.
-  TF_MUST_USE_RESULT se::DeviceMemoryBase Forget() {
+  TF_MUST_USE_RESULT DeviceMemoryBase Forget() {
     CHECK(allocator_ != nullptr)
         << "Can't call Forget() on an inactive (i.e. moved from, Forget()'ten, "
           "or Free()'ed) instance.";
     allocator_ = nullptr;
-    se::DeviceMemoryBase mem(mem_);
-    mem_ = se::DeviceMemoryBase();
+    DeviceMemoryBase mem(mem_);
+    mem_ = DeviceMemoryBase();
     return mem;
   }
 
@@ -127,11 +125,11 @@ class OwningDeviceMemory {
   void Free();
 
  private:
-  se::DeviceMemoryBase mem_;
+  DeviceMemoryBase mem_;
   int device_ordinal_;
   DeviceMemoryAllocator* allocator_;  // Null if this object is inactive.
 };
 
-}  // namespace xla
+}  // namespace stream_executor
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_OWNING_DEVICE_MEMORY_H_