From bada4a5339d4567419e993b7736eb23a6f3535c4 Mon Sep 17 00:00:00 2001
From: Peter Hawkins
Date: Tue, 29 Nov 2016 18:55:46 -0800
Subject: [PATCH] StreamExecutor: Optimize kernel argument packing

Create a single class to hold all kernel arguments and optimize how they are
added into this class.
Change: 140556725
---
 .../stream_executor/cuda/cuda_gpu_executor.cc |  49 +---
 .../stream_executor/cuda/cuda_gpu_executor.h  |   5 +-
 tensorflow/stream_executor/kernel.h           | 265 +++++++++++++++---
 .../stream_executor_internal.h                |   5 +-
 .../stream_executor/stream_executor_pimpl.cc  |   7 +-
 .../stream_executor/stream_executor_pimpl.h   |  10 +-
 tensorflow/stream_executor/trace_listener.h   |   2 +-
 7 files changed, 251 insertions(+), 92 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 322d33ae269..6b313256942 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -349,31 +349,12 @@ bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel,
 bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
                           const BlockDim &block_dims, const KernelBase &kernel,
-                          const std::vector<KernelArg> &args) {
-  CHECK_EQ(kernel.Arity(), args.size());
+                          const KernelArgsArrayBase &args) {
+  CHECK_EQ(kernel.Arity(), args.number_of_arguments());
   CUstream custream = AsCUDAStreamValue(stream);
   const CUDAKernel *cuda_kernel = AsCUDAKernel(&kernel);
   CUfunction cufunc = cuda_kernel->AsCUDAFunctionValue();
-  std::vector<void *> addrs;
-  addrs.reserve(args.size());
-  int shmem_bytes = 0;
-  for (size_t i = 0; i < args.size(); i++) {
-    switch (args[i].type) {
-      case KernelArg::kNormal:
-        addrs.push_back(const_cast<void *>(
-            static_cast<const void *>(args[i].data.begin())));
-        break;
-      case KernelArg::kSharedMemory:
-        shmem_bytes += args[i].bytes;
-        break;
-      default:
-        LOG(ERROR) << "Invalid kernel arg type passed (" << args[i].type
-                   << ") for arg " << i;
-        return false;
-    }
-  }
-
   // Only perform/print the occupancy check 1x.
   launched_kernels_mu_.lock();
   if (launched_kernels_.find(cufunc) == launched_kernels_.end()) {
@@ -389,11 +370,15 @@ bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
     CUDADriver::FuncSetCacheConfig(cufunc, cuda_kernel->GetCUDACacheConfig());
   }
 
-  if (!CUDADriver::LaunchKernel(
-          GetCudaContext(stream), cufunc, block_dims.x, block_dims.y,
-          block_dims.z, thread_dims.x, thread_dims.y, thread_dims.z,
-          shmem_bytes, custream, addrs.data(), nullptr /* = extra */)) {
-    LOG(ERROR) << "failed to launch CUDA kernel with args: " << args.size()
+  void **kernel_params = const_cast<void **>(args.argument_addresses().data());
+
+  if (!CUDADriver::LaunchKernel(GetCudaContext(stream), cufunc, block_dims.x,
+                                block_dims.y, block_dims.z, thread_dims.x,
+                                thread_dims.y, thread_dims.z,
+                                args.number_of_shared_bytes(), custream,
+                                kernel_params, nullptr /* = extra */)) {
+    LOG(ERROR) << "failed to launch CUDA kernel with args: "
+               << args.number_of_arguments()
                << "; thread dim: " << thread_dims.ToString()
                << "; block dim: " << block_dims.ToString();
     return false;
@@ -849,18 +834,6 @@ bool CUDAExecutor::FillBlockDimLimit(BlockDim *block_dim_limit) const {
   return true;
 }
 
-KernelArg CUDAExecutor::DeviceMemoryToKernelArg(
-    const DeviceMemoryBase &gpu_mem) const {
-  const void* arg = gpu_mem.opaque();
-  const uint8 *arg_ptr = reinterpret_cast<const uint8 *>(&arg);
-
-  KernelArg kernel_arg;
-  kernel_arg.type = KernelArg::kNormal;
-  kernel_arg.data = port::InlinedVector<uint8, 4>(arg_ptr, arg_ptr + sizeof(arg));
-  kernel_arg.bytes = sizeof(arg);
-  return kernel_arg;
-}
-
 bool CUDAExecutor::SupportsBlas() const { return true; }
 
 bool CUDAExecutor::SupportsFft() const { return true; }
 
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index 9e01f48781d..3959d04439c 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -76,7 +76,7 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
 
   bool Launch(Stream *stream, const ThreadDim &thread_dims,
               const BlockDim &block_dims, const KernelBase &k,
-              const std::vector<KernelArg> &args) override;
+              const KernelArgsArrayBase &args) override;
 
   void *Allocate(uint64 size) override;
 
@@ -186,9 +186,6 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
   // will be only partially populated as a result, and an error will be logged.
   bool FillBlockDimLimit(BlockDim *block_dim_limit) const;
 
-  KernelArg DeviceMemoryToKernelArg(
-      const DeviceMemoryBase &gpu_mem) const override;
-
   bool SupportsBlas() const override;
 
   blas::BlasSupport *CreateBlas() override;
 
diff --git a/tensorflow/stream_executor/kernel.h b/tensorflow/stream_executor/kernel.h
index 7742e066c70..3e5453e4c9c 100644
--- a/tensorflow/stream_executor/kernel.h
+++ b/tensorflow/stream_executor/kernel.h
@@ -76,9 +76,10 @@ limitations under the License.
#include "tensorflow/stream_executor/device_memory.h" #include "tensorflow/stream_executor/kernel_cache_config.h" +#include "tensorflow/stream_executor/lib/array_slice.h" +#include "tensorflow/stream_executor/lib/inlined_vector.h" #include "tensorflow/stream_executor/lib/stringpiece.h" #include "tensorflow/stream_executor/platform/port.h" -#include "tensorflow/stream_executor/lib/inlined_vector.h" namespace perftools { namespace gputools { @@ -265,24 +266,220 @@ struct IsSharedDeviceMemory> { static constexpr bool value = true; }; -// KernelArg encapsulates the information necessary for a back-end executor to -// configure a kernel to launch using the given argument. +// Basic data about a kernel argument. struct KernelArg { - // Indicates the type of an argument: normal, to be passed to the kernel - // in the standard manner, or shared memory, which has distinct - // rules for specification per backend. - enum Type { - kNormal, - kSharedMemory, - } type; + bool is_shared; + const void *address; + size_t size; +}; - // The data to pass to the kernel - either a pointer to device memory, or the - // argument value. compact_array is used to prevent smaller args (ex. u8, u64) - // from requiring heap allocation. - port::InlinedVector data; +// An iterator for traversing all the arguments of a KernelArgsArray. +class KernelArgIterator { + public: + KernelArgIterator(int number_of_argument_addresses, + int number_of_shared_memory_arguments, + const void *const *arg_addresses_data, + const size_t *arg_sizes_data, + const size_t *shmem_bytes_data, + const size_t *shmem_indices_data) + : arg_index_(0), + number_of_arguments_(number_of_argument_addresses + + number_of_shared_memory_arguments), + arg_address_iter_(arg_addresses_data), + arg_size_iter_(arg_sizes_data), + shmem_bytes_iter_(shmem_bytes_data), + shmem_indices_iter_(shmem_indices_data), + shmem_indices_end_(shmem_indices_data + + number_of_shared_memory_arguments) {} - // The size of this argument in bytes. - uint64 bytes; + // Returns true if another argument is present in the iterator. + bool has_next() { return arg_index_ < number_of_arguments_; } + + // Returns the next argument in the iterator. + // + // Returns a default-constructed KernelArg if there is no next argument. + KernelArg next() { + KernelArg result; + if (!has_next()) { + return result; + } else if ((shmem_indices_iter_ != shmem_indices_end_) && + (arg_index_ == *shmem_indices_iter_)) { + result.is_shared = true; + result.address = nullptr; + result.size = *shmem_bytes_iter_; + ++shmem_indices_iter_; + ++shmem_bytes_iter_; + } else { + result.is_shared = false; + result.address = *arg_address_iter_; + result.size = *arg_size_iter_; + ++arg_address_iter_; + ++arg_size_iter_; + } + ++arg_index_; + return result; + } + + private: + int arg_index_; + int number_of_arguments_; + const void *const *arg_address_iter_; + const size_t *arg_size_iter_; + const size_t *shmem_bytes_iter_; + const size_t *shmem_indices_iter_; + const size_t *const shmem_indices_end_; +}; + +// Base class for KernelArgsArray. +// +// Supports all the getter methods that do not depend on the compile-time number +// of arguments template parameter. +// +// This class exists as a way to pass kernel arguments to +// StreamExecutorInterface::Launch. That Launch method is virtual, so it can't +// be templated to accept any KernelArgsArray type, therfore a reference to this +// base type is passed instead. 
+//
+// Performance is not a concern here because each of these methods will be
+// called at most once per kernel launch. Past performance concerns with
+// KernelArgsArray have centered on the argument packing routines, which are
+// called once per kernel argument. Those packing routines are now handled by
+// the templated KernelArgsArray subclass of this class, where they can take
+// advantage of compile-time knowledge of the number of arguments in order to
+// be very efficient.
+class KernelArgsArrayBase {
+ public:
+  virtual ~KernelArgsArrayBase() = default;
+
+  // Gets the number of arguments added so far, including shared memory
+  // arguments.
+  virtual size_t number_of_arguments() const = 0;
+
+  // Gets the total number of shared memory bytes added so far.
+  virtual uint64 number_of_shared_bytes() const = 0;
+
+  // Gets the list of argument addresses.
+  virtual port::ArraySlice<const void *> argument_addresses() const = 0;
+
+  // Gets an iterator to the arguments in the array.
+  virtual KernelArgIterator arg_iterator() const = 0;
+};
+
+// A list of arguments for a kernel call.
+//
+// The template parameter kNumArgs is the maximum number of arguments that can
+// be stored in the list.
+//
+// Contains a list of addresses for non-shared-memory arguments and a list of
+// sizes for shared-memory arguments. Since the shared-memory arguments may be
+// interspersed with the non-shared-memory arguments, it also stores a list of
+// the indices at which the shared-memory arguments appeared.
+//
+// For example, if the argument address list contains {a, b, c, d, e}, the
+// shared-memory arguments list contains the sizes of {A, B, C}, and the
+// shared-memory indices list contains {0, 3, 5}, then the original list of
+// arguments was {A, a, b, B, c, C, d, e}.
+//
+// This way of storing the arguments makes CUDA kernel calls efficient because
+// they only require the argument address list and the total number of shared
+// bytes, while still retaining the information OpenCL kernel calls need: the
+// location and size of each shared-memory argument.
+//
+// Note that the code for adding arguments has been identified as a
+// performance hotspot in some real-world applications, so this structure has
+// been optimized for fast argument adding.
+template <size_t kNumArgs>
+class KernelArgsArray : public KernelArgsArrayBase {
+ public:
+  explicit KernelArgsArray()
+      : total_shared_memory_bytes_(0),
+        number_of_argument_addresses_(0),
+        number_of_shared_memory_arguments_(0) {}
+
+  // Adds an argument to the list.
+  //
+  // Note that the address of the argument is stored, so the input must not go
+  // out of scope before this KernelArgsArray does.
+  template <typename T>
+  void add_argument(const T &arg) {
+    argument_addresses_[number_of_argument_addresses_] =
+        static_cast<const void *>(&arg);
+    argument_sizes_[number_of_argument_addresses_] = sizeof(arg);
+    ++number_of_argument_addresses_;
+  }
+
+  // Adds a device memory argument to the list.
+  void add_device_memory_argument(const DeviceMemoryBase &arg) {
+    const void **copy_ptr =
+        &device_memory_opaque_pointers_[number_of_argument_addresses_];
+    *copy_ptr = arg.opaque();
+    argument_addresses_[number_of_argument_addresses_] = copy_ptr;
+    argument_sizes_[number_of_argument_addresses_] = sizeof(void *);
+    ++number_of_argument_addresses_;
+  }
+
+  // Adds a shared memory argument to the list.
+  //
+  // The only significant information about a shared argument is its size, so
+  // that is the only parameter in this function.
+  void add_shared_bytes(size_t number_of_bytes) {
+    shared_memory_indices_[number_of_shared_memory_arguments_] =
+        number_of_argument_addresses_ + number_of_shared_memory_arguments_;
+    shared_memory_bytes_[number_of_shared_memory_arguments_] = number_of_bytes;
+    ++number_of_shared_memory_arguments_;
+    total_shared_memory_bytes_ += number_of_bytes;
+  }
+
+  // Gets the number of arguments added so far, including shared memory
+  // arguments.
+  size_t number_of_arguments() const override {
+    return number_of_argument_addresses_ + number_of_shared_memory_arguments_;
+  }
+
+  // Gets the total number of shared memory bytes added so far.
+  uint64 number_of_shared_bytes() const override {
+    return total_shared_memory_bytes_;
+  }
+
+  // Gets the list of argument addresses.
+  port::ArraySlice<const void *> argument_addresses() const override {
+    return port::ArraySlice<const void *>(argument_addresses_.data(),
+                                          number_of_argument_addresses_);
+  }
+
+  // Gets an iterator to the arguments in the array.
+  KernelArgIterator arg_iterator() const override {
+    return KernelArgIterator(
+        number_of_argument_addresses_, number_of_shared_memory_arguments_,
+        argument_addresses_.data(), argument_sizes_.data(),
+        shared_memory_bytes_.data(), shared_memory_indices_.data());
+  }
+
+ private:
+  // A place to store copies of opaque pointers from device memory arguments.
+  std::array<const void *, kNumArgs> device_memory_opaque_pointers_;
+
+  // Addresses for non-shared-memory arguments.
+  std::array<const void *, kNumArgs> argument_addresses_;
+
+  // Sizes for non-shared-memory arguments.
+  std::array<size_t, kNumArgs> argument_sizes_;
+
+  // Size in bytes for each shared memory argument.
+  std::array<size_t, kNumArgs> shared_memory_bytes_;
+
+  // Indices in the arguments array for shared memory arguments.
+  std::array<size_t, kNumArgs> shared_memory_indices_;
+
+  // Total of all shared memory sizes.
+  size_t total_shared_memory_bytes_;
+
+  // Number of significant entries in argument_addresses_ and argument_sizes_.
+  size_t number_of_argument_addresses_;
+
+  // Number of significant entries in shared_memory_bytes_ and
+  // shared_memory_indices_.
+  size_t number_of_shared_memory_arguments_;
 };
 
 // Typed variant of KernelBase, like a typed device function pointer. See the
@@ -298,6 +495,8 @@ struct KernelArg {
 template <typename... Params>
 class TypedKernel : public KernelBase {
  public:
+  static constexpr size_t kNumberOfParameters = sizeof...(Params);
+
   // Delegates to KernelBase::KernelBase(), see that constructor.
   explicit TypedKernel(StreamExecutor *parent) : KernelBase(parent) {}
 
@@ -318,13 +517,19 @@ class TypedKernel : public KernelBase {
   //
   // Const refs are taken as parameters on all of the handlers to avoid
   // implicit type promotion of integers.
-  void PackParams(std::vector<KernelArg> *args, Params... params) const {
+  //
+  // WARNING: as a performance optimization this method may store pointers to
+  // some of the input parameters in the kernel args structure, so any params
+  // passed into this method must live at least as long as the kernel args
+  // structure.
+  void PackParams(KernelArgsArray<kNumberOfParameters> *args,
+                  Params &... params) const {
     PackOneParam(args, params...);
   }
 
   template <typename T, typename... RestOfParams>
-  void PackOneParam(std::vector<KernelArg> *args, const T &arg,
-                    const RestOfParams... rest) const {
+  void PackOneParam(KernelArgsArray<kNumberOfParameters> *args, const T &arg,
+                    const RestOfParams &... rest) const {
     PackOneParam(args, arg);
     PackOneParam(args, rest...);
   }
@@ -334,7 +539,7 @@ class TypedKernel : public KernelBase {
   // separate implementation below.
   template <typename T>
   void PackOneParam(
-      std::vector<KernelArg> *args, const T &arg,
+      KernelArgsArray<kNumberOfParameters> *args, const T &arg,
       typename std::enable_if<!IsDeviceMemoryValueLike<T>::value &&
                               !IsDeviceMemoryPointer<T>::value &&
                               !IsSharedDeviceMemory<T>::value>::type * =
@@ -343,44 +548,40 @@ class TypedKernel : public KernelBase {
                   "cannot pass raw pointer to the device");
     static_assert(!std::is_convertible<T, DeviceMemoryBase>::value,
                   "cannot pass device memory as a normal value");
-    const uint8 *arg_ptr = reinterpret_cast<const uint8 *>(&arg);
-    args->emplace_back(KernelArg{
-        KernelArg::kNormal,
-        port::InlinedVector<uint8, 4>{arg_ptr, arg_ptr + sizeof(arg)}, sizeof(arg)});
+    args->add_argument(arg);
   }
 
   // DeviceMemoryBase family reference override.
   template <typename T>
   void PackOneParam(
-      std::vector<KernelArg> *args, const T &arg,
+      KernelArgsArray<kNumberOfParameters> *args, const T &arg,
       typename std::enable_if<IsDeviceMemoryValueLike<T>::value>::type * =
           nullptr) const {
-    args->emplace_back(parent()->DeviceMemoryToKernelArg(arg));
+    args->add_device_memory_argument(arg);
  }
 
   // DeviceMemoryBase family pointer override.
   template <typename T>
   void PackOneParam(
-      std::vector<KernelArg> *args, T arg,
+      KernelArgsArray<kNumberOfParameters> *args, T arg,
       typename std::enable_if<IsDeviceMemoryPointer<T>::value>::type * =
           nullptr) const {
     DeviceMemoryBase *ptr = static_cast<DeviceMemoryBase *>(arg);
-    args->emplace_back(parent()->DeviceMemoryToKernelArg(*ptr));
+    args->add_device_memory_argument(*ptr);
   }
 
   // Dynamic shared device memory has a size, but no associated allocation on
   // the host; internally, the device will allocate storage.
   template <typename T>
   void PackOneParam(
-      std::vector<KernelArg> *args, T arg,
+      KernelArgsArray<kNumberOfParameters> *args, T arg,
       typename std::enable_if<IsSharedDeviceMemory<T>::value>::type * =
          nullptr) const {
-    args->emplace_back(KernelArg{KernelArg::kSharedMemory,
-                                 port::InlinedVector<uint8, 4>(), arg.size()});
+    args->add_shared_bytes(arg.size());
   }
 
   // Base case for variadic template expansion - nothing to do!
-  void PackOneParam(std::vector<KernelArg> *args) const {}
+  void PackOneParam(KernelArgsArray<kNumberOfParameters> *args) const {}
 
   SE_DISALLOW_COPY_AND_ASSIGN(TypedKernel);
 };
 
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index acdbb07cb76..57db7775a63 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -184,7 +184,7 @@ class StreamExecutorInterface {
   }
   virtual bool Launch(Stream *stream, const ThreadDim &thread_dims,
                       const BlockDim &block_dims, const KernelBase &k,
-                      const std::vector<KernelArg> &args) {
+                      const KernelArgsArrayBase &args) {
     return false;
   }
   virtual void *Allocate(uint64 size) = 0;
@@ -258,9 +258,6 @@ class StreamExecutorInterface {
   // caller.
   virtual DeviceDescription *PopulateDeviceDescription() const = 0;
 
-  virtual KernelArg DeviceMemoryToKernelArg(
-      const DeviceMemoryBase &gpu_mem) const = 0;
-
   // Attempts to register the provided TraceListener with the device-specific
   // Executor implementation. When this is called, the PIMPL interface has
   // already taken ownership of the object and is managing the generic tracing
 
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index 2fdd1e4b49e..7739d316626 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -394,7 +394,7 @@ rng::RngSupport *StreamExecutor::AsRng() {
 bool StreamExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
                             const BlockDim &block_dims,
                             const KernelBase &kernel,
-                            const std::vector<KernelArg> &args) {
+                            const KernelArgsArrayBase &args) {
   SubmitTrace(&TraceListener::LaunchSubmit, stream, thread_dims, block_dims,
               kernel, args);
 
@@ -659,11 +659,6 @@ bool StreamExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const {
   return implementation_->DeviceMemoryUsage(free, total);
 }
 
-KernelArg StreamExecutor::DeviceMemoryToKernelArg(
-    const DeviceMemoryBase &gpu_mem) const {
-  return implementation_->DeviceMemoryToKernelArg(gpu_mem);
-}
-
 void StreamExecutor::EnqueueOnBackgroundThread(std::function<void()> task) {
   background_threads_->Schedule(task);
 }
 
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index 2b5a70f8073..83fd27599e6 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -392,7 +392,7 @@ class StreamExecutor {
   // implementation in StreamExecutorInterface::Launch().
   bool Launch(Stream *stream, const ThreadDim &thread_dims,
               const BlockDim &block_dims, const KernelBase &kernel,
-              const std::vector<KernelArg> &args);
+              const KernelArgsArrayBase &args);
 
   // Gets-or-creates (creates with memoization) a FftSupport datatype that can
   // be used to execute FFT routines on the current platform.
@@ -427,10 +427,6 @@ class StreamExecutor {
   // previously registered.
   bool UnregisterTraceListener(TraceListener* listener);
 
-  // Converts a DeviceMemory object into a KernelArg object for passing to the
-  // device driver for kernel launch.
-  KernelArg DeviceMemoryToKernelArg(const DeviceMemoryBase &gpu_mem) const;
-
  private:
   template
@@ -758,9 +754,9 @@ inline Stream &Stream::ThenLaunch(ThreadDim thread_dims, BlockDim block_dims,
   // we pack the variadic parameters passed as ...args into the desired
   // tuple form and pass that packed form to the StreamExecutor::Launch()
   // implementation.
-  std::vector<KernelArg> kernel_args;
-  kernel_args.reserve(kernel.Arity());
+  KernelArgsArray<sizeof...(Params)> kernel_args;
   kernel.PackParams(&kernel_args, args...);
+  DCHECK(parent_ != nullptr);
   bool ok =
       parent_->Launch(this, thread_dims, block_dims, kernel, kernel_args);
   if (!ok) {
diff --git a/tensorflow/stream_executor/trace_listener.h b/tensorflow/stream_executor/trace_listener.h
index 804c6ee8faf..88c54f982b3 100644
--- a/tensorflow/stream_executor/trace_listener.h
+++ b/tensorflow/stream_executor/trace_listener.h
@@ -50,7 +50,7 @@ class TraceListener {
   virtual void LaunchSubmit(Stream* stream, const ThreadDim& thread_dims,
                             const BlockDim& block_dims,
                             const KernelBase& kernel,
-                            const std::vector<KernelArg>& args) {}
+                            const KernelArgsArrayBase& args) {}
 
   virtual void SynchronousMemcpyH2DBegin(int64 correlation_id,
                                          const void* host_src, int64 size,
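
Reviewer note (not part of the patch): the sketch below shows how the new
argument-packing API from kernel.h above might be exercised directly. It is an
illustration only; in the patch itself TypedKernel::PackParams and
Stream::ThenLaunch perform the equivalent packing. The DeviceMemoryBase value
and the capacity of 4 are assumptions made up for this example.

#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/kernel.h"

namespace gpu = perftools::gputools;

// Packs a float scalar, a device pointer, and a dynamic shared-memory size,
// mirroring what TypedKernel::PackParams does for a three-parameter kernel.
void PackExampleArgs(const gpu::DeviceMemoryBase &device_buffer) {
  float scale = 2.0f;  // Must outlive `args`; only its address is stored.

  gpu::KernelArgsArray<4> args;  // Capacity for up to 4 arguments.
  args.add_argument(scale);                        // Plain value: address + size.
  args.add_device_memory_argument(device_buffer);  // Copies the opaque pointer.
  args.add_shared_bytes(1024);                     // Dynamic shared memory request.

  // Here number_of_arguments() == 3 and number_of_shared_bytes() == 1024.
  // argument_addresses() contains only the two non-shared-memory addresses,
  // which is exactly what CUDADriver::LaunchKernel consumes.
  gpu::KernelArgIterator it = args.arg_iterator();
  while (it.has_next()) {
    gpu::KernelArg arg = it.next();
    // arg.is_shared, arg.address, and arg.size describe each argument in the
    // original interleaved order, which an OpenCL-style backend would need.
    (void)arg;
  }
}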