StreamExecutor: Optimize kernel argument packing
Create a single class to hold all kernel arguments and optimize how they are
added into this class.

Change: 140556725
parent 347d3ef2a8
commit bada4a5339
tensorflow/stream_executor/cuda/cuda_gpu_executor.cc

@@ -349,31 +349,12 @@ bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel,
 bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
                           const BlockDim &block_dims, const KernelBase &kernel,
-                          const std::vector<KernelArg> &args) {
-  CHECK_EQ(kernel.Arity(), args.size());
+                          const KernelArgsArrayBase &args) {
+  CHECK_EQ(kernel.Arity(), args.number_of_arguments());
   CUstream custream = AsCUDAStreamValue(stream);
   const CUDAKernel *cuda_kernel = AsCUDAKernel(&kernel);
   CUfunction cufunc = cuda_kernel->AsCUDAFunctionValue();
 
-  std::vector<void *> addrs;
-  addrs.reserve(args.size());
-  int shmem_bytes = 0;
-  for (size_t i = 0; i < args.size(); i++) {
-    switch (args[i].type) {
-      case KernelArg::kNormal:
-        addrs.push_back(const_cast<void *>(
-            static_cast<const void *>(args[i].data.begin())));
-        break;
-      case KernelArg::kSharedMemory:
-        shmem_bytes += args[i].bytes;
-        break;
-      default:
-        LOG(ERROR) << "Invalid kernel arg type passed (" << args[i].type
-                   << ") for arg " << i;
-        return false;
-    }
-  }
-
   // Only perform/print the occupancy check 1x.
   launched_kernels_mu_.lock();
   if (launched_kernels_.find(cufunc) == launched_kernels_.end()) {
@@ -389,11 +370,15 @@ bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
     CUDADriver::FuncSetCacheConfig(cufunc, cuda_kernel->GetCUDACacheConfig());
   }
 
-  if (!CUDADriver::LaunchKernel(
-          GetCudaContext(stream), cufunc, block_dims.x, block_dims.y,
-          block_dims.z, thread_dims.x, thread_dims.y, thread_dims.z,
-          shmem_bytes, custream, addrs.data(), nullptr /* = extra */)) {
-    LOG(ERROR) << "failed to launch CUDA kernel with args: " << args.size()
+  void **kernel_params = const_cast<void **>(args.argument_addresses().data());
+
+  if (!CUDADriver::LaunchKernel(GetCudaContext(stream), cufunc, block_dims.x,
+                                block_dims.y, block_dims.z, thread_dims.x,
+                                thread_dims.y, thread_dims.z,
+                                args.number_of_shared_bytes(), custream,
+                                kernel_params, nullptr /* = extra */)) {
+    LOG(ERROR) << "failed to launch CUDA kernel with args: "
+               << args.number_of_arguments()
                << "; thread dim: " << thread_dims.ToString()
                << "; block dim: " << block_dims.ToString();
     return false;
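The new path hands the CUDA driver a precomputed array of argument addresses and one shared-memory byte total, instead of repacking per launch. A minimal standalone sketch of that contract (plain C++, hypothetical values, not part of this commit):

// Sketch: the launcher only needs one pointer per argument value, in
// declaration order, which is what argument_addresses() provides.
#include <array>
#include <cstdio>

int main() {
  int n = 3;
  float alpha = 2.0f;
  void *device_ptr = nullptr;  // stands in for a DeviceMemoryBase opaque pointer

  // The shape a cuLaunchKernel-style consumer expects: each entry points at
  // the storage holding one argument's value.
  std::array<void *, 3> kernel_params = {{&n, &alpha, &device_ptr}};

  // A driver-side consumer reads each value back through its slot:
  std::printf("n=%d alpha=%f\n", *static_cast<int *>(kernel_params[0]),
              *static_cast<float *>(kernel_params[1]));
  return 0;
}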
@@ -849,18 +834,6 @@ bool CUDAExecutor::FillBlockDimLimit(BlockDim *block_dim_limit) const {
   return true;
 }
 
-KernelArg CUDAExecutor::DeviceMemoryToKernelArg(
-    const DeviceMemoryBase &gpu_mem) const {
-  const void* arg = gpu_mem.opaque();
-  const uint8 *arg_ptr = reinterpret_cast<const uint8 *>(&arg);
-
-  KernelArg kernel_arg;
-  kernel_arg.type = KernelArg::kNormal;
-  kernel_arg.data = port::InlinedVector<uint8, 4>(arg_ptr, arg_ptr + sizeof(arg));
-  kernel_arg.bytes = sizeof(arg);
-  return kernel_arg;
-}
-
 bool CUDAExecutor::SupportsBlas() const { return true; }
 
 bool CUDAExecutor::SupportsFft() const { return true; }
tensorflow/stream_executor/cuda/cuda_gpu_executor.h

@@ -76,7 +76,7 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
 
   bool Launch(Stream *stream, const ThreadDim &thread_dims,
               const BlockDim &block_dims, const KernelBase &k,
-              const std::vector<KernelArg> &args) override;
+              const KernelArgsArrayBase &args) override;
 
   void *Allocate(uint64 size) override;
 
@@ -186,9 +186,6 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
   // will be only partially populated as a result, and an error will be logged.
   bool FillBlockDimLimit(BlockDim *block_dim_limit) const;
 
-  KernelArg DeviceMemoryToKernelArg(
-      const DeviceMemoryBase &gpu_mem) const override;
-
   bool SupportsBlas() const override;
 
   blas::BlasSupport *CreateBlas() override;
tensorflow/stream_executor/kernel.h

@@ -76,9 +76,10 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/kernel_cache_config.h"
+#include "tensorflow/stream_executor/lib/array_slice.h"
+#include "tensorflow/stream_executor/lib/inlined_vector.h"
 #include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/port.h"
-#include "tensorflow/stream_executor/lib/inlined_vector.h"
 
 namespace perftools {
 namespace gputools {
@@ -265,24 +266,220 @@ struct IsSharedDeviceMemory<SharedDeviceMemory<U>> {
   static constexpr bool value = true;
 };
 
-// KernelArg encapsulates the information necessary for a back-end executor to
-// configure a kernel to launch using the given argument.
-struct KernelArg {
-  // Indicates the type of an argument: normal, to be passed to the kernel
-  // in the standard manner, or shared memory, which has distinct
-  // rules for specification per backend.
-  enum Type {
-    kNormal,
-    kSharedMemory,
-  } type;
-
-  // The data to pass to the kernel - either a pointer to device memory, or the
-  // argument value. compact_array is used to prevent smaller args (ex. u8, u64)
-  // from requiring heap allocation.
-  port::InlinedVector<uint8, 4> data;
-
-  // The size of this argument in bytes.
-  uint64 bytes;
-};
+// Basic data about a kernel argument.
+struct KernelArg {
+  bool is_shared;
+  const void *address;
+  size_t size;
+};
+
+// An iterator for traversing all the arguments of a KernelArgsArray.
+class KernelArgIterator {
+ public:
+  KernelArgIterator(int number_of_argument_addresses,
+                    int number_of_shared_memory_arguments,
+                    const void *const *arg_addresses_data,
+                    const size_t *arg_sizes_data,
+                    const size_t *shmem_bytes_data,
+                    const size_t *shmem_indices_data)
+      : arg_index_(0),
+        number_of_arguments_(number_of_argument_addresses +
+                             number_of_shared_memory_arguments),
+        arg_address_iter_(arg_addresses_data),
+        arg_size_iter_(arg_sizes_data),
+        shmem_bytes_iter_(shmem_bytes_data),
+        shmem_indices_iter_(shmem_indices_data),
+        shmem_indices_end_(shmem_indices_data +
+                           number_of_shared_memory_arguments) {}
+
+  // Returns true if another argument is present in the iterator.
+  bool has_next() { return arg_index_ < number_of_arguments_; }
+
+  // Returns the next argument in the iterator.
+  //
+  // Returns a default-constructed KernelArg if there is no next argument.
+  KernelArg next() {
+    KernelArg result;
+    if (!has_next()) {
+      return result;
+    } else if ((shmem_indices_iter_ != shmem_indices_end_) &&
+               (arg_index_ == *shmem_indices_iter_)) {
+      result.is_shared = true;
+      result.address = nullptr;
+      result.size = *shmem_bytes_iter_;
+      ++shmem_indices_iter_;
+      ++shmem_bytes_iter_;
+    } else {
+      result.is_shared = false;
+      result.address = *arg_address_iter_;
+      result.size = *arg_size_iter_;
+      ++arg_address_iter_;
+      ++arg_size_iter_;
+    }
+    ++arg_index_;
+    return result;
+  }
+
+ private:
+  int arg_index_;
+  int number_of_arguments_;
+  const void *const *arg_address_iter_;
+  const size_t *arg_size_iter_;
+  const size_t *shmem_bytes_iter_;
+  const size_t *shmem_indices_iter_;
+  const size_t *const shmem_indices_end_;
+};
+
+// Base class for KernelArgsArray.
+//
+// Supports all the getter methods that do not depend on the compile-time
+// number of arguments template parameter.
+//
+// This class exists as a way to pass kernel arguments to
+// StreamExecutorInterface::Launch. That Launch method is virtual, so it can't
+// be templated to accept any KernelArgsArray type, therefore a reference to
+// this base type is passed instead.
+//
+// Performance is not a concern here because each of these methods will be
+// called at most once per kernel launch. Past performance concerns with
+// KernelArgsArray have been in reference to the argument packing routines
+// which are called once per kernel argument. Those packing routines are now
+// handled by the templated KernelArgsArray subclass of this class where they
+// can take advantage of compile-time knowledge of the number of arguments in
+// order to be very efficient.
+class KernelArgsArrayBase {
+ public:
+  virtual ~KernelArgsArrayBase() = default;
+
+  // Gets the number of arguments added so far, including shared memory
+  // arguments.
+  virtual size_t number_of_arguments() const = 0;
+
+  // Gets the total number of shared memory bytes added so far.
+  virtual uint64 number_of_shared_bytes() const = 0;
+
+  // Gets the list of argument addresses.
+  virtual port::ArraySlice<const void *> argument_addresses() const = 0;
+
+  // Gets an iterator to the arguments in the array.
+  virtual KernelArgIterator arg_iterator() const = 0;
+};
+
+// A list of arguments for a kernel call.
+//
+// The template parameter kNumArgs is the maximum number of arguments which can
+// be stored in the list.
+//
+// Contains a list of addresses for non-shared-memory arguments and a list of
+// sizes for shared-memory arguments. Since the shared-memory arguments may be
+// interspersed with the non-shared-memory arguments, it also stores a list of
+// the indices at which the shared-memory arguments appeared.
+//
+// For example, if the argument address list contains {a, b, c, d, e}, the
+// shared-memory arguments list contains the sizes of {A, B, C}, and the
+// shared-memory indices list contains {0, 3, 5}, then the original list of
+// arguments was {A, a, b, B, c, C, d, e}.
+//
+// This way of storing the arguments makes CUDA kernel calls efficient because
+// they only require the argument address list and the total number of shared
+// bytes, but it also makes OpenCL kernel calls possible because they depend on
+// the location of each shared-memory argument and its size.
+//
+// Note that the code for adding arguments has been identified as a performance
+// hotspot in some real-world applications, so this structure has been
+// optimized for the performance of argument adding.
+template <size_t kNumArgs>
+class KernelArgsArray : public KernelArgsArrayBase {
+ public:
+  explicit KernelArgsArray()
+      : total_shared_memory_bytes_(0),
+        number_of_argument_addresses_(0),
+        number_of_shared_memory_arguments_(0) {}
+
+  // Adds an argument to the list.
+  //
+  // Note that the address of the argument is stored, so the input must not go
+  // out of scope before the instance of this class that calls this method
+  // does.
+  template <typename T>
+  void add_argument(const T &arg) {
+    argument_addresses_[number_of_argument_addresses_] =
+        static_cast<const void *>(&arg);
+    argument_sizes_[number_of_argument_addresses_] = sizeof(arg);
+    ++number_of_argument_addresses_;
+  }
+
+  // Adds a device memory argument to the list.
+  void add_device_memory_argument(const DeviceMemoryBase &arg) {
+    const void **copy_ptr =
+        &device_memory_opaque_pointers_[number_of_argument_addresses_];
+    *copy_ptr = arg.opaque();
+    argument_addresses_[number_of_argument_addresses_] = copy_ptr;
+    argument_sizes_[number_of_argument_addresses_] = sizeof(void *);
+    ++number_of_argument_addresses_;
+  }
+
+  // Adds a shared memory argument to the list.
+  //
+  // The only significant information about a shared argument is its size, so
+  // that is the only parameter in this function.
+  void add_shared_bytes(size_t number_of_bytes) {
+    shared_memory_indices_[number_of_shared_memory_arguments_] =
+        number_of_argument_addresses_ + number_of_shared_memory_arguments_;
+    shared_memory_bytes_[number_of_shared_memory_arguments_] = number_of_bytes;
+    ++number_of_shared_memory_arguments_;
+    total_shared_memory_bytes_ += number_of_bytes;
+  }
+
+  // Gets the number of arguments added so far, including shared memory
+  // arguments.
+  size_t number_of_arguments() const override {
+    return number_of_argument_addresses_ + number_of_shared_memory_arguments_;
+  }
+
+  // Gets the total number of shared memory bytes added so far.
+  uint64 number_of_shared_bytes() const override {
+    return total_shared_memory_bytes_;
+  }
+
+  // Gets the list of argument addresses.
+  port::ArraySlice<const void *> argument_addresses() const override {
+    return port::ArraySlice<const void *>(argument_addresses_.data(),
+                                          number_of_argument_addresses_);
+  }
+
+  // Gets an iterator to the arguments in the array.
+  KernelArgIterator arg_iterator() const override {
+    return KernelArgIterator(
+        number_of_argument_addresses_, number_of_shared_memory_arguments_,
+        argument_addresses_.data(), argument_sizes_.data(),
+        shared_memory_bytes_.data(), shared_memory_indices_.data());
+  }
+
+ private:
+  // A place to store copies of opaque pointers from device memory arguments.
+  std::array<const void *, kNumArgs> device_memory_opaque_pointers_;
+
+  // Addresses for non-shared-memory arguments.
+  std::array<const void *, kNumArgs> argument_addresses_;
+
+  // Sizes for non-shared-memory arguments.
+  std::array<size_t, kNumArgs> argument_sizes_;
+
+  // Size in bytes for each shared memory argument.
+  std::array<size_t, kNumArgs> shared_memory_bytes_;
+
+  // Indices in the arguments array for shared memory arguments.
+  std::array<size_t, kNumArgs> shared_memory_indices_;
+
+  // Total of all shared memory sizes.
+  size_t total_shared_memory_bytes_;
+
+  // Number of significant entries in argument_addresses_ and argument_sizes_.
+  size_t number_of_argument_addresses_;
+
+  // Number of significant entries in shared_memory_bytes_ and
+  // shared_memory_indices_.
+  size_t number_of_shared_memory_arguments_;
+};
 
 // Typed variant of KernelBase, like a typed device function pointer. See the
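The index-list encoding described in the KernelArgsArray comment can be replayed in a few lines. A standalone sketch (hypothetical sizes, not SE code) that reconstructs the interleaved order {A, a, b, B, c, C, d, e} from the example's three lists:

// Sketch: replays KernelArgIterator's merge of normal-argument addresses
// with shared-memory sizes, using the {0, 3, 5} indices from the comment.
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<const char *> addresses = {"a", "b", "c", "d", "e"};
  const std::vector<size_t> shmem_bytes = {16, 32, 64};  // sizes of A, B, C
  const std::vector<size_t> shmem_indices = {0, 3, 5};

  size_t addr_i = 0, shmem_i = 0;
  const size_t total = addresses.size() + shmem_bytes.size();
  for (size_t arg_index = 0; arg_index < total; ++arg_index) {
    if (shmem_i < shmem_indices.size() && arg_index == shmem_indices[shmem_i]) {
      std::printf("arg %zu: shared, %zu bytes\n", arg_index,
                  shmem_bytes[shmem_i++]);
    } else {
      std::printf("arg %zu: normal, address of %s\n", arg_index,
                  addresses[addr_i++]);
    }
  }
  return 0;
}

Printed in order, this yields shared A, then a, b, shared B, c, shared C, d, e, matching the comment's example.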
@@ -298,6 +495,8 @@ struct KernelArg {
 template <typename... Params>
 class TypedKernel : public KernelBase {
  public:
+  static constexpr size_t kNumberOfParameters = sizeof...(Params);
+
   // Delegates to KernelBase::KernelBase(), see that constructor.
   explicit TypedKernel(StreamExecutor *parent) : KernelBase(parent) {}
 
@@ -318,13 +517,19 @@ class TypedKernel : public KernelBase {
   //
   // Const refs are taken as parameters on all of the handlers to avoid
   // implicit type promotion of integers.
-  void PackParams(std::vector<KernelArg> *args, Params... params) const {
+  //
+  // WARNING: as a performance optimization this method may store pointers to
+  // some of the input parameters in the kernel args structure, so any params
+  // passed into this method must live at least as long as the kernel args
+  // structure.
+  void PackParams(KernelArgsArray<kNumberOfParameters> *args,
+                  Params &... params) const {
     PackOneParam(args, params...);
   }
 
   template <typename T, typename... RestOfParams>
-  void PackOneParam(std::vector<KernelArg> *args, const T &arg,
-                    const RestOfParams... rest) const {
+  void PackOneParam(KernelArgsArray<kNumberOfParameters> *args, const T &arg,
+                    const RestOfParams &... rest) const {
     PackOneParam(args, arg);
     PackOneParam(args, rest...);
   }
@@ -334,7 +539,7 @@ class TypedKernel : public KernelBase {
   // separate implementation below.
   template <typename T>
   void PackOneParam(
-      std::vector<KernelArg> *args, const T &arg,
+      KernelArgsArray<kNumberOfParameters> *args, const T &arg,
       typename std::enable_if<!IsDeviceMemoryValueLike<T>::value &&
                               !IsDeviceMemoryPointer<T>::value &&
                               !IsSharedDeviceMemory<T>::value>::type * =
@@ -343,44 +548,40 @@ class TypedKernel : public KernelBase {
                   "cannot pass raw pointer to the device");
     static_assert(!std::is_convertible<T, DeviceMemoryBase>::value,
                   "cannot pass device memory as a normal value");
-    const uint8 *arg_ptr = reinterpret_cast<const uint8 *>(&arg);
-    args->emplace_back(KernelArg{
-        KernelArg::kNormal,
-        port::InlinedVector<uint8, 4>{arg_ptr, arg_ptr + sizeof(arg)}, sizeof(arg)});
+    args->add_argument(arg);
   }
 
   // DeviceMemoryBase family reference override.
   template <typename T>
   void PackOneParam(
-      std::vector<KernelArg> *args, const T &arg,
+      KernelArgsArray<kNumberOfParameters> *args, const T &arg,
       typename std::enable_if<IsDeviceMemoryValueLike<T>::value>::type * =
           nullptr) const {
-    args->emplace_back(parent()->DeviceMemoryToKernelArg(arg));
+    args->add_device_memory_argument(arg);
   }
 
   // DeviceMemoryBase family pointer override.
   template <typename T>
   void PackOneParam(
-      std::vector<KernelArg> *args, T arg,
+      KernelArgsArray<kNumberOfParameters> *args, T arg,
      typename std::enable_if<IsDeviceMemoryPointer<T>::value>::type * =
          nullptr) const {
     DeviceMemoryBase *ptr = static_cast<DeviceMemoryBase *>(arg);
-    args->emplace_back(parent()->DeviceMemoryToKernelArg(*ptr));
+    args->add_device_memory_argument(*ptr);
   }
 
   // Dynamic shared device memory has a size, but no associated allocation on
   // the host; internally, the device will allocate storage.
   template <typename T>
   void PackOneParam(
-      std::vector<KernelArg> *args, T arg,
+      KernelArgsArray<kNumberOfParameters> *args, T arg,
       typename std::enable_if<IsSharedDeviceMemory<T>::value>::type * =
           nullptr) const {
-    args->emplace_back(KernelArg{KernelArg::kSharedMemory,
-                                 port::InlinedVector<uint8, 4>(), arg.size()});
+    args->add_shared_bytes(arg.size());
   }
 
   // Base case for variadic template expansion - nothing to do!
-  void PackOneParam(std::vector<KernelArg> *args) const {}
+  void PackOneParam(KernelArgsArray<kNumberOfParameters> *args) const {}
 
   SE_DISALLOW_COPY_AND_ASSIGN(TypedKernel);
 };
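The overload set above relies on enable_if in a defaulted function parameter to pick one packer per argument category. A self-contained sketch of the same dispatch pattern (simplified stand-in types, not this header's traits):

// Sketch: SFINAE-dispatched PackOneParam overloads plus a variadic
// PackParams that peels one argument at a time.
#include <cstddef>
#include <cstdio>
#include <type_traits>

// Stand-in for SharedDeviceMemory<T>: only its size matters when packing.
struct Shared {
  size_t size;
};

// Normal-value overload: selected when T is not Shared.
template <typename T>
void PackOneParam(
    const T &arg,
    typename std::enable_if<!std::is_same<T, Shared>::value>::type * =
        nullptr) {
  std::printf("normal arg: %zu bytes\n", sizeof(arg));
}

// Shared-memory overload: selected when T is Shared.
template <typename T>
void PackOneParam(
    const T &arg,
    typename std::enable_if<std::is_same<T, Shared>::value>::type * =
        nullptr) {
  std::printf("shared arg: %zu bytes\n", arg.size);
}

// Base case for variadic expansion - nothing left to pack.
void PackParams() {}

// Peels one parameter at a time, mirroring the recursion above.
template <typename T, typename... Rest>
void PackParams(const T &arg, const Rest &... rest) {
  PackOneParam(arg);
  PackParams(rest...);
}

int main() {
  int n = 42;
  float alpha = 1.5f;
  PackParams(n, alpha, Shared{256});
  return 0;
}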
tensorflow/stream_executor/stream_executor_internal.h

@@ -184,7 +184,7 @@ class StreamExecutorInterface {
   }
   virtual bool Launch(Stream *stream, const ThreadDim &thread_dims,
                       const BlockDim &block_dims, const KernelBase &k,
-                      const std::vector<KernelArg> &args) {
+                      const KernelArgsArrayBase &args) {
     return false;
   }
   virtual void *Allocate(uint64 size) = 0;
@@ -258,9 +258,6 @@ class StreamExecutorInterface {
   // caller.
   virtual DeviceDescription *PopulateDeviceDescription() const = 0;
 
-  virtual KernelArg DeviceMemoryToKernelArg(
-      const DeviceMemoryBase &gpu_mem) const = 0;
-
   // Attempts to register the provided TraceListener with the device-specific
   // Executor implementation. When this is called, the PIMPL interface has
   // already taken ownership of the object and is managing the generic tracing
tensorflow/stream_executor/stream_executor_pimpl.cc

@@ -394,7 +394,7 @@ rng::RngSupport *StreamExecutor::AsRng() {
 bool StreamExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
                             const BlockDim &block_dims,
                             const KernelBase &kernel,
-                            const std::vector<KernelArg> &args) {
+                            const KernelArgsArrayBase &args) {
   SubmitTrace(&TraceListener::LaunchSubmit, stream, thread_dims, block_dims,
               kernel, args);
 
@@ -659,11 +659,6 @@ bool StreamExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const {
   return implementation_->DeviceMemoryUsage(free, total);
 }
 
-KernelArg StreamExecutor::DeviceMemoryToKernelArg(
-    const DeviceMemoryBase &gpu_mem) const {
-  return implementation_->DeviceMemoryToKernelArg(gpu_mem);
-}
-
 void StreamExecutor::EnqueueOnBackgroundThread(std::function<void()> task) {
   background_threads_->Schedule(task);
 }
tensorflow/stream_executor/stream_executor_pimpl.h

@@ -392,7 +392,7 @@ class StreamExecutor {
   // implementation in StreamExecutorInterface::Launch().
   bool Launch(Stream *stream, const ThreadDim &thread_dims,
               const BlockDim &block_dims, const KernelBase &kernel,
-              const std::vector<KernelArg> &args);
+              const KernelArgsArrayBase &args);
 
   // Gets-or-creates (creates with memoization) a FftSupport datatype that can
   // be used to execute FFT routines on the current platform.
@@ -427,10 +427,6 @@ class StreamExecutor {
   // previously registered.
   bool UnregisterTraceListener(TraceListener* listener);
 
-  // Converts a DeviceMemory object into a KernelArg object for passing to the
-  // device driver for kernel launch.
-  KernelArg DeviceMemoryToKernelArg(const DeviceMemoryBase &gpu_mem) const;
-
  private:
   template <typename BeginCallT, typename CompleteCallT,
             typename ReturnT, typename... BeginArgsT>
@@ -758,9 +754,9 @@ inline Stream &Stream::ThenLaunch(ThreadDim thread_dims, BlockDim block_dims,
   // we pack the variadic parameters passed as ...args into the desired
   // tuple form and pass that packed form to the StreamExecutor::Launch()
   // implementation.
-  std::vector<KernelArg> kernel_args;
-  kernel_args.reserve(kernel.Arity());
+  KernelArgsArray<sizeof...(args)> kernel_args;
   kernel.PackParams(&kernel_args, args...);
   DCHECK(parent_ != nullptr);
   bool ok =
       parent_->Launch(this, thread_dims, block_dims, kernel, kernel_args);
   if (!ok) {
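At the call site, sizeof...(args) fixes the array capacity at compile time, which is what removes the per-launch heap allocation that kernel_args.reserve() used to perform. A standalone sketch of that shape (hypothetical names, not SE code):

// Sketch: a fixed-capacity packer whose size comes from the parameter pack,
// so add() writes into preallocated std::array slots with no heap growth.
#include <array>
#include <cstddef>
#include <cstdio>

template <size_t kNumArgs>
class ArgPacker {
 public:
  template <typename T>
  void add(const T &arg) {  // stores the address, like add_argument()
    addresses_[count_] = &arg;
    sizes_[count_] = sizeof(arg);
    ++count_;
  }
  size_t count() const { return count_; }

 private:
  std::array<const void *, kNumArgs> addresses_{};
  std::array<size_t, kNumArgs> sizes_{};
  size_t count_ = 0;
};

// Mirrors the ThenLaunch shape: sizeof...(Args) sets capacity at compile time.
template <typename... Args>
void LaunchSketch(const Args &... args) {
  ArgPacker<sizeof...(Args)> packed;
  int expand[] = {(packed.add(args), 0)...};  // pack one argument at a time
  (void)expand;
  std::printf("packed %zu args with no heap allocation\n", packed.count());
}

int main() {
  int n = 8;
  float alpha = 0.5f;
  LaunchSketch(n, alpha);
  return 0;
}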
tensorflow/stream_executor/trace_listener.h

@@ -50,7 +50,7 @@ class TraceListener {
   virtual void LaunchSubmit(Stream* stream, const ThreadDim& thread_dims,
                             const BlockDim& block_dims,
                             const KernelBase& kernel,
-                            const std::vector<KernelArg>& args) {}
+                            const KernelArgsArrayBase& args) {}
 
   virtual void SynchronousMemcpyH2DBegin(int64 correlation_id,
                                          const void* host_src, int64 size,