From bada4a5339d4567419e993b7736eb23a6f3535c4 Mon Sep 17 00:00:00 2001
From: Peter Hawkins
Date: Tue, 29 Nov 2016 18:55:46 -0800
Subject: [PATCH] StreamExecutor: Optimize kernel argument packing

Create a single class to hold all kernel arguments and optimize how they are
added into this class.
Change: 140556725
---
 .../stream_executor/cuda/cuda_gpu_executor.cc |  49 +---
 .../stream_executor/cuda/cuda_gpu_executor.h  |   5 +-
 tensorflow/stream_executor/kernel.h           | 265 +++++++++++++++---
 .../stream_executor_internal.h                |   5 +-
 .../stream_executor/stream_executor_pimpl.cc  |   7 +-
 .../stream_executor/stream_executor_pimpl.h   |  10 +-
 tensorflow/stream_executor/trace_listener.h   |   2 +-
 7 files changed, 251 insertions(+), 92 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 322d33ae269..6b313256942 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -349,31 +349,12 @@ bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel,
 bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
                           const BlockDim &block_dims, const KernelBase &kernel,
-                          const std::vector<KernelArg> &args) {
-  CHECK_EQ(kernel.Arity(), args.size());
+                          const KernelArgsArrayBase &args) {
+  CHECK_EQ(kernel.Arity(), args.number_of_arguments());
   CUstream custream = AsCUDAStreamValue(stream);
   const CUDAKernel *cuda_kernel = AsCUDAKernel(&kernel);
   CUfunction cufunc = cuda_kernel->AsCUDAFunctionValue();
-  std::vector<void *> addrs;
-  addrs.reserve(args.size());
-  int shmem_bytes = 0;
-  for (size_t i = 0; i < args.size(); i++) {
-    switch (args[i].type) {
-      case KernelArg::kNormal:
-        addrs.push_back(const_cast<void *>(
-            static_cast<const void *>(args[i].data.begin())));
-        break;
-      case KernelArg::kSharedMemory:
-        shmem_bytes += args[i].bytes;
-        break;
-      default:
-        LOG(ERROR) << "Invalid kernel arg type passed (" << args[i].type
-                   << ") for arg " << i;
-        return false;
-    }
-  }
-
   // Only perform/print the occupancy check 1x.
   launched_kernels_mu_.lock();
   if (launched_kernels_.find(cufunc) == launched_kernels_.end()) {
@@ -389,11 +370,15 @@ bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
     CUDADriver::FuncSetCacheConfig(cufunc, cuda_kernel->GetCUDACacheConfig());
   }
 
-  if (!CUDADriver::LaunchKernel(
-          GetCudaContext(stream), cufunc, block_dims.x, block_dims.y,
-          block_dims.z, thread_dims.x, thread_dims.y, thread_dims.z,
-          shmem_bytes, custream, addrs.data(), nullptr /* = extra */)) {
-    LOG(ERROR) << "failed to launch CUDA kernel with args: " << args.size()
+  void **kernel_params = const_cast<void **>(args.argument_addresses().data());
+
+  if (!CUDADriver::LaunchKernel(GetCudaContext(stream), cufunc, block_dims.x,
+                                block_dims.y, block_dims.z, thread_dims.x,
+                                thread_dims.y, thread_dims.z,
+                                args.number_of_shared_bytes(), custream,
+                                kernel_params, nullptr /* = extra */)) {
+    LOG(ERROR) << "failed to launch CUDA kernel with args: "
+               << args.number_of_arguments()
                << "; thread dim: " << thread_dims.ToString()
                << "; block dim: " << block_dims.ToString();
     return false;
@@ -849,18 +834,6 @@ bool CUDAExecutor::FillBlockDimLimit(BlockDim *block_dim_limit) const {
   return true;
 }
 
-KernelArg CUDAExecutor::DeviceMemoryToKernelArg(
-    const DeviceMemoryBase &gpu_mem) const {
-  const void* arg = gpu_mem.opaque();
-  const uint8 *arg_ptr = reinterpret_cast<const uint8 *>(&arg);
-
-  KernelArg kernel_arg;
-  kernel_arg.type = KernelArg::kNormal;
-  kernel_arg.data = port::InlinedVector<uint8, 4>(arg_ptr, arg_ptr + sizeof(arg));
-  kernel_arg.bytes = sizeof(arg);
-  return kernel_arg;
-}
-
 bool CUDAExecutor::SupportsBlas() const { return true; }
 
 bool CUDAExecutor::SupportsFft() const { return true; }
 
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index 9e01f48781d..3959d04439c 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -76,7 +76,7 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
 
   bool Launch(Stream *stream, const ThreadDim &thread_dims,
               const BlockDim &block_dims, const KernelBase &k,
-              const std::vector<KernelArg> &args) override;
+              const KernelArgsArrayBase &args) override;
 
   void *Allocate(uint64 size) override;
 
@@ -186,9 +186,6 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
   // will be only partially populated as a result, and an error will be logged.
   bool FillBlockDimLimit(BlockDim *block_dim_limit) const;
 
-  KernelArg DeviceMemoryToKernelArg(
-      const DeviceMemoryBase &gpu_mem) const override;
-
   bool SupportsBlas() const override;
 
   blas::BlasSupport *CreateBlas() override;
 
diff --git a/tensorflow/stream_executor/kernel.h b/tensorflow/stream_executor/kernel.h
index 7742e066c70..3e5453e4c9c 100644
--- a/tensorflow/stream_executor/kernel.h
+++ b/tensorflow/stream_executor/kernel.h
@@ -76,9 +76,10 @@ limitations under the License.
#include "tensorflow/stream_executor/device_memory.h" #include "tensorflow/stream_executor/kernel_cache_config.h" +#include "tensorflow/stream_executor/lib/array_slice.h" +#include "tensorflow/stream_executor/lib/inlined_vector.h" #include "tensorflow/stream_executor/lib/stringpiece.h" #include "tensorflow/stream_executor/platform/port.h" -#include "tensorflow/stream_executor/lib/inlined_vector.h" namespace perftools { namespace gputools { @@ -265,24 +266,220 @@ struct IsSharedDeviceMemory> { static constexpr bool value = true; }; -// KernelArg encapsulates the information necessary for a back-end executor to -// configure a kernel to launch using the given argument. +// Basic data about a kernel argument. struct KernelArg { - // Indicates the type of an argument: normal, to be passed to the kernel - // in the standard manner, or shared memory, which has distinct - // rules for specification per backend. - enum Type { - kNormal, - kSharedMemory, - } type; + bool is_shared; + const void *address; + size_t size; +}; - // The data to pass to the kernel - either a pointer to device memory, or the - // argument value. compact_array is used to prevent smaller args (ex. u8, u64) - // from requiring heap allocation. - port::InlinedVector data; +// An iterator for traversing all the arguments of a KernelArgsArray. +class KernelArgIterator { + public: + KernelArgIterator(int number_of_argument_addresses, + int number_of_shared_memory_arguments, + const void *const *arg_addresses_data, + const size_t *arg_sizes_data, + const size_t *shmem_bytes_data, + const size_t *shmem_indices_data) + : arg_index_(0), + number_of_arguments_(number_of_argument_addresses + + number_of_shared_memory_arguments), + arg_address_iter_(arg_addresses_data), + arg_size_iter_(arg_sizes_data), + shmem_bytes_iter_(shmem_bytes_data), + shmem_indices_iter_(shmem_indices_data), + shmem_indices_end_(shmem_indices_data + + number_of_shared_memory_arguments) {} - // The size of this argument in bytes. - uint64 bytes; + // Returns true if another argument is present in the iterator. + bool has_next() { return arg_index_ < number_of_arguments_; } + + // Returns the next argument in the iterator. + // + // Returns a default-constructed KernelArg if there is no next argument. + KernelArg next() { + KernelArg result; + if (!has_next()) { + return result; + } else if ((shmem_indices_iter_ != shmem_indices_end_) && + (arg_index_ == *shmem_indices_iter_)) { + result.is_shared = true; + result.address = nullptr; + result.size = *shmem_bytes_iter_; + ++shmem_indices_iter_; + ++shmem_bytes_iter_; + } else { + result.is_shared = false; + result.address = *arg_address_iter_; + result.size = *arg_size_iter_; + ++arg_address_iter_; + ++arg_size_iter_; + } + ++arg_index_; + return result; + } + + private: + int arg_index_; + int number_of_arguments_; + const void *const *arg_address_iter_; + const size_t *arg_size_iter_; + const size_t *shmem_bytes_iter_; + const size_t *shmem_indices_iter_; + const size_t *const shmem_indices_end_; +}; + +// Base class for KernelArgsArray. +// +// Supports all the getter methods that do not depend on the compile-time number +// of arguments template parameter. +// +// This class exists as a way to pass kernel arguments to +// StreamExecutorInterface::Launch. That Launch method is virtual, so it can't +// be templated to accept any KernelArgsArray type, therfore a reference to this +// base type is passed instead. 
+//
+// Performance is not a concern here because each of these methods will be
+// called at most once per kernel launch. Past performance concerns with
+// KernelArgsArray have centered on the argument packing routines, which are
+// called once per kernel argument. Those packing routines are now handled by
+// the templated KernelArgsArray subclass of this class, where they can take
+// advantage of compile-time knowledge of the number of arguments in order to
+// be very efficient.
+class KernelArgsArrayBase {
+ public:
+  virtual ~KernelArgsArrayBase() = default;
+
+  // Gets the number of arguments added so far, including shared memory
+  // arguments.
+  virtual size_t number_of_arguments() const = 0;
+
+  // Gets the total number of shared memory bytes added so far.
+  virtual uint64 number_of_shared_bytes() const = 0;
+
+  // Gets the list of argument addresses.
+  virtual port::ArraySlice<const void *> argument_addresses() const = 0;
+
+  // Gets an iterator to the arguments in the array.
+  virtual KernelArgIterator arg_iterator() const = 0;
+};
+
+// A list of arguments for a kernel call.
+//
+// The template parameter kNumArgs is the maximum number of arguments that can
+// be stored in the list.
+//
+// Contains a list of addresses for non-shared-memory arguments and a list of
+// sizes for shared-memory arguments. Since the shared-memory arguments may be
+// interspersed with the non-shared-memory arguments, it also stores a list of
+// the indices at which the shared-memory arguments appeared.
+//
+// For example, if the argument address list contains {a, b, c, d, e}, the
+// shared-memory arguments list contains the sizes of {A, B, C}, and the
+// shared-memory indices list contains {0, 3, 5}, then the original list of
+// arguments was {A, a, b, B, c, C, d, e}.
+//
+// This way of storing the arguments makes CUDA kernel calls efficient because
+// they only require the argument address list and the total number of shared
+// bytes, while still retaining the information OpenCL kernel calls need: the
+// location and size of each shared-memory argument.
+//
+// Note that the code for adding arguments has been identified as a
+// performance hotspot in some real-world applications, so this structure has
+// been optimized for fast argument adding.
+template <size_t kNumArgs>
+class KernelArgsArray : public KernelArgsArrayBase {
+ public:
+  explicit KernelArgsArray()
+      : total_shared_memory_bytes_(0),
+        number_of_argument_addresses_(0),
+        number_of_shared_memory_arguments_(0) {}
+
+  // Adds an argument to the list.
+  //
+  // Note that the address of the argument is stored, so the input must not go
+  // out of scope before this KernelArgsArray does.
+  template <typename T>
+  void add_argument(const T &arg) {
+    argument_addresses_[number_of_argument_addresses_] =
+        static_cast<const void *>(&arg);
+    argument_sizes_[number_of_argument_addresses_] = sizeof(arg);
+    ++number_of_argument_addresses_;
+  }
+
+  // Adds a device memory argument to the list.
+  void add_device_memory_argument(const DeviceMemoryBase &arg) {
+    const void **copy_ptr =
+        &device_memory_opaque_pointers_[number_of_argument_addresses_];
+    *copy_ptr = arg.opaque();
+    argument_addresses_[number_of_argument_addresses_] = copy_ptr;
+    argument_sizes_[number_of_argument_addresses_] = sizeof(void *);
+    ++number_of_argument_addresses_;
+  }
+
+  // Adds a shared memory argument to the list.
+  //
+  // The only significant information about a shared argument is its size, so
+  // that is the only parameter in this function.
+  void add_shared_bytes(size_t number_of_bytes) {
+    shared_memory_indices_[number_of_shared_memory_arguments_] =
+        number_of_argument_addresses_ + number_of_shared_memory_arguments_;
+    shared_memory_bytes_[number_of_shared_memory_arguments_] = number_of_bytes;
+    ++number_of_shared_memory_arguments_;
+    total_shared_memory_bytes_ += number_of_bytes;
+  }
+
+  // Gets the number of arguments added so far, including shared memory
+  // arguments.
+  size_t number_of_arguments() const override {
+    return number_of_argument_addresses_ + number_of_shared_memory_arguments_;
+  }
+
+  // Gets the total number of shared memory bytes added so far.
+  uint64 number_of_shared_bytes() const override {
+    return total_shared_memory_bytes_;
+  }
+
+  // Gets the list of argument addresses.
+  port::ArraySlice<const void *> argument_addresses() const override {
+    return port::ArraySlice<const void *>(argument_addresses_.data(),
+                                          number_of_argument_addresses_);
+  }
+
+  // Gets an iterator to the arguments in the array.
+  KernelArgIterator arg_iterator() const override {
+    return KernelArgIterator(
+        number_of_argument_addresses_, number_of_shared_memory_arguments_,
+        argument_addresses_.data(), argument_sizes_.data(),
+        shared_memory_bytes_.data(), shared_memory_indices_.data());
+  }
+
+ private:
+  // A place to store copies of opaque pointers from device memory arguments.
+  std::array<const void *, kNumArgs> device_memory_opaque_pointers_;
+
+  // Addresses for non-shared-memory arguments.
+  std::array<const void *, kNumArgs> argument_addresses_;
+
+  // Sizes for non-shared-memory arguments.
+  std::array<size_t, kNumArgs> argument_sizes_;
+
+  // Size in bytes for each shared memory argument.
+  std::array<size_t, kNumArgs> shared_memory_bytes_;
+
+  // Indices in the arguments array for shared memory arguments.
+  std::array<size_t, kNumArgs> shared_memory_indices_;
+
+  // Total of all shared memory sizes.
+  size_t total_shared_memory_bytes_;
+
+  // Number of significant entries in argument_addresses_ and argument_sizes_.
+  size_t number_of_argument_addresses_;
+
+  // Number of significant entries in shared_memory_bytes_ and
+  // shared_memory_indices_.
+  size_t number_of_shared_memory_arguments_;
 };
 
 // Typed variant of KernelBase, like a typed device function pointer. See the
@@ -298,6 +495,8 @@ struct KernelArg {
 template <typename... Params>
 class TypedKernel : public KernelBase {
  public:
+  static constexpr size_t kNumberOfParameters = sizeof...(Params);
+
   // Delegates to KernelBase::KernelBase(), see that constructor.
   explicit TypedKernel(StreamExecutor *parent) : KernelBase(parent) {}
 
@@ -318,13 +517,19 @@ class TypedKernel : public KernelBase {
   //
   // Const refs are taken as parameters on all of the handlers to avoid
   // implicit type promotion of integers.
-  void PackParams(std::vector<KernelArg> *args, Params... params) const {
+  //
+  // WARNING: as a performance optimization this method may store pointers to
+  // some of the input parameters in the kernel args structure, so any params
+  // passed into this method must live at least as long as the kernel args
+  // structure.
+  void PackParams(KernelArgsArray<kNumberOfParameters> *args,
+                  Params &... params) const {
     PackOneParam(args, params...);
   }
 
   template <typename T, typename... RestOfParams>
-  void PackOneParam(std::vector<KernelArg> *args, const T &arg,
-                    const RestOfParams... rest) const {
+  void PackOneParam(KernelArgsArray<kNumberOfParameters> *args, const T &arg,
+                    const RestOfParams &... rest) const {
     PackOneParam(args, arg);
     PackOneParam(args, rest...);
   }
@@ -334,7 +539,7 @@ class TypedKernel : public KernelBase {
   // separate implementation below.
   template <typename T>
   void PackOneParam(
-      std::vector<KernelArg> *args, const T &arg,
+      KernelArgsArray<kNumberOfParameters> *args, const T &arg,
       typename std::enable_if<!IsDeviceMemoryValueLike<T>::value &&
                               !IsDeviceMemoryPointer<T>::value &&
                               !IsSharedDeviceMemory<T>::value>::type * =
@@ -343,44 +548,40 @@ class TypedKernel : public KernelBase {
                   "cannot pass raw pointer to the device");
     static_assert(!std::is_convertible<T, DeviceMemoryBase>::value,
                   "cannot pass device memory as a normal value");
-    const uint8 *arg_ptr = reinterpret_cast<const uint8 *>(&arg);
-    args->emplace_back(KernelArg{
-        KernelArg::kNormal,
-        port::InlinedVector<uint8, 4>{arg_ptr, arg_ptr + sizeof(arg)}, sizeof(arg)});
+    args->add_argument(arg);
   }
 
   // DeviceMemoryBase family reference override.
   template <typename T>
   void PackOneParam(
-      std::vector<KernelArg> *args, const T &arg,
+      KernelArgsArray<kNumberOfParameters> *args, const T &arg,
       typename std::enable_if<IsDeviceMemoryValueLike<T>::value>::type * =
           nullptr) const {
-    args->emplace_back(parent()->DeviceMemoryToKernelArg(arg));
+    args->add_device_memory_argument(arg);
  }
 
   // DeviceMemoryBase family pointer override.
   template <typename T>
   void PackOneParam(
-      std::vector<KernelArg> *args, T arg,
+      KernelArgsArray<kNumberOfParameters> *args, T arg,
       typename std::enable_if<IsDeviceMemoryPointer<T>::value>::type * =
           nullptr) const {
     DeviceMemoryBase *ptr = static_cast<DeviceMemoryBase *>(arg);
-    args->emplace_back(parent()->DeviceMemoryToKernelArg(*ptr));
+    args->add_device_memory_argument(*ptr);
   }
 
   // Dynamic shared device memory has a size, but no associated allocation on
   // the host; internally, the device will allocate storage.
   template <typename T>
   void PackOneParam(
-      std::vector<KernelArg> *args, T arg,
+      KernelArgsArray<kNumberOfParameters> *args, T arg,
       typename std::enable_if<IsSharedDeviceMemory<T>::value>::type * =
          nullptr) const {
-    args->emplace_back(KernelArg{KernelArg::kSharedMemory,
-                                 port::InlinedVector<uint8, 4>(), arg.size()});
+    args->add_shared_bytes(arg.size());
   }
 
   // Base case for variadic template expansion - nothing to do!
-  void PackOneParam(std::vector<KernelArg> *args) const {}
+  void PackOneParam(KernelArgsArray<kNumberOfParameters> *args) const {}
 
   SE_DISALLOW_COPY_AND_ASSIGN(TypedKernel);
 };
 
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index acdbb07cb76..57db7775a63 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -184,7 +184,7 @@ class StreamExecutorInterface {
   }
   virtual bool Launch(Stream *stream, const ThreadDim &thread_dims,
                       const BlockDim &block_dims, const KernelBase &k,
-                      const std::vector<KernelArg> &args) {
+                      const KernelArgsArrayBase &args) {
     return false;
   }
   virtual void *Allocate(uint64 size) = 0;
@@ -258,9 +258,6 @@ class StreamExecutorInterface {
   // caller.
   virtual DeviceDescription *PopulateDeviceDescription() const = 0;
 
-  virtual KernelArg DeviceMemoryToKernelArg(
-      const DeviceMemoryBase &gpu_mem) const = 0;
-
   // Attempts to register the provided TraceListener with the device-specific
   // Executor implementation. When this is called, the PIMPL interface has
   // already taken ownership of the object and is managing the generic tracing
 
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index 2fdd1e4b49e..7739d316626 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -394,7 +394,7 @@ rng::RngSupport *StreamExecutor::AsRng() {
 bool StreamExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
                             const BlockDim &block_dims,
                             const KernelBase &kernel,
-                            const std::vector<KernelArg> &args) {
+                            const KernelArgsArrayBase &args) {
   SubmitTrace(&TraceListener::LaunchSubmit, stream, thread_dims, block_dims,
               kernel, args);
 
@@ -659,11 +659,6 @@ bool StreamExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const {
   return implementation_->DeviceMemoryUsage(free, total);
 }
 
-KernelArg StreamExecutor::DeviceMemoryToKernelArg(
-    const DeviceMemoryBase &gpu_mem) const {
-  return implementation_->DeviceMemoryToKernelArg(gpu_mem);
-}
-
 void StreamExecutor::EnqueueOnBackgroundThread(std::function<void()> task) {
   background_threads_->Schedule(task);
 }
 
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index 2b5a70f8073..83fd27599e6 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -392,7 +392,7 @@ class StreamExecutor {
   // implementation in StreamExecutorInterface::Launch().
   bool Launch(Stream *stream, const ThreadDim &thread_dims,
               const BlockDim &block_dims, const KernelBase &kernel,
-              const std::vector<KernelArg> &args);
+              const KernelArgsArrayBase &args);
 
   // Gets-or-creates (creates with memoization) a FftSupport datatype that can
   // be used to execute FFT routines on the current platform.
@@ -427,10 +427,6 @@ class StreamExecutor {
   // previously registered.
   bool UnregisterTraceListener(TraceListener* listener);
 
-  // Converts a DeviceMemory object into a KernelArg object for passing to the
-  // device driver for kernel launch.
-  KernelArg DeviceMemoryToKernelArg(const DeviceMemoryBase &gpu_mem) const;
-
  private:
   template
@@ -758,9 +754,9 @@ inline Stream &Stream::ThenLaunch(ThreadDim thread_dims, BlockDim block_dims,
   // we pack the variadic parameters passed as ...args into the desired
   // tuple form and pass that packed form to the StreamExecutor::Launch()
   // implementation.
-  std::vector<KernelArg> kernel_args;
-  kernel_args.reserve(kernel.Arity());
+  KernelArgsArray<sizeof...(Params)> kernel_args;
   kernel.PackParams(&kernel_args, args...);
+  DCHECK(parent_ != nullptr);
   bool ok =
       parent_->Launch(this, thread_dims, block_dims, kernel, kernel_args);
   if (!ok) {
diff --git a/tensorflow/stream_executor/trace_listener.h b/tensorflow/stream_executor/trace_listener.h
index 804c6ee8faf..88c54f982b3 100644
--- a/tensorflow/stream_executor/trace_listener.h
+++ b/tensorflow/stream_executor/trace_listener.h
@@ -50,7 +50,7 @@ class TraceListener {
   virtual void LaunchSubmit(Stream* stream, const ThreadDim& thread_dims,
                             const BlockDim& block_dims,
                             const KernelBase& kernel,
-                            const std::vector<KernelArg>& args) {}
+                            const KernelArgsArrayBase& args) {}
 
   virtual void SynchronousMemcpyH2DBegin(int64 correlation_id,
                                          const void* host_src, int64 size,
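
Reviewer note (not part of the patch): the sketch below shows how the new
argument-packing API from kernel.h above might be exercised directly. It is an
illustration only; in the patch itself TypedKernel::PackParams and
Stream::ThenLaunch perform the equivalent packing. The DeviceMemoryBase value
and the capacity of 4 are assumptions made up for this example.

#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/kernel.h"

namespace gpu = perftools::gputools;

// Packs a float scalar, a device pointer, and a dynamic shared-memory size,
// mirroring what TypedKernel::PackParams does for a three-parameter kernel.
void PackExampleArgs(const gpu::DeviceMemoryBase &device_buffer) {
  float scale = 2.0f;  // Must outlive `args`; only its address is stored.

  gpu::KernelArgsArray<4> args;  // Capacity for up to 4 arguments.
  args.add_argument(scale);                        // Plain value: address + size.
  args.add_device_memory_argument(device_buffer);  // Copies the opaque pointer.
  args.add_shared_bytes(1024);                     // Dynamic shared memory request.

  // Here number_of_arguments() == 3 and number_of_shared_bytes() == 1024.
  // argument_addresses() contains only the two non-shared-memory addresses,
  // which is exactly what CUDADriver::LaunchKernel consumes.
  gpu::KernelArgIterator it = args.arg_iterator();
  while (it.has_next()) {
    gpu::KernelArg arg = it.next();
    // arg.is_shared, arg.address, and arg.size describe each argument in the
    // original interleaved order, which an OpenCL-style backend would need.
    (void)arg;
  }
}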