STT-tensorflow/tensorflow/stream_executor/gpu/gpu_kernel.h

/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// The CUDA implementation of the StreamExecutorInterface functionality.
// CUDA inclusions are ideally confined to this implementation file.
//
// The notions from the StreamExecutor basically correspond to the CUDA streams
// programming model provided by the libcuda.so driver APIs, so we don't have
// to do much more than wrap the calls to the libraries appropriately.
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_KERNEL_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_KERNEL_H_

#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"

namespace stream_executor {
namespace gpu {

// Wraps a GpuFunctionHandle to implement the platform-independent
// KernelInterface.
class GpuKernel : public internal::KernelInterface {
 public:
  GpuKernel()
      : gpu_function_(nullptr),
        arity_(0),
        preferred_cache_config_(KernelCacheConfig::kNoPreference) {}

  // Note that the function is unloaded when the module is unloaded, and the
  // module that the function is contained in is owned by the GpuExecutor.
  ~GpuKernel() override {}

  // As arity cannot be reflected upon using the CUDA API, the arity is
  // explicitly set during the GpuExecutor::GetKernel initialization process.
  void set_arity(unsigned arity) { arity_ = arity; }
  unsigned Arity() const override { return arity_; }

  // Returns the GpuFunctionHandle value for passing to the CUDA API.
  GpuFunctionHandle AsGpuFunctionHandle() const {
    DCHECK(gpu_function_ != nullptr);
    return const_cast<GpuFunctionHandle>(gpu_function_);
  }

  // Returns the slot that the GpuFunctionHandle is stored within for this
  // object, for the CUDA API which wants to load into a GpuFunctionHandle*.
  GpuFunctionHandle* gpu_function_ptr() { return &gpu_function_; }

  // CUDA supports setting the preferred cache configuration of a
  // GpuFunctionHandle (more-or-less equivalent to a GpuKernel). We support this
  // via the below functions; users can set a preference, and that is applied
  // when the kernel is [lazy-]loaded (in GpuExecutor::Launch). The alternative
  // would be to load the kernel & set the preference when the user calls the
  // setter below; either approach is valid. Sets the current kernel cache
  // configuration preference.
  void SetPreferredCacheConfig(KernelCacheConfig config) override {
    preferred_cache_config_ = config;
  }

  // Returns the current kernel cache configuration preference.
  KernelCacheConfig GetPreferredCacheConfig() const override {
    return preferred_cache_config_;
  }

  // Returns the current kernel cache configuration preference as a
  // CUfunc_cache.
  GpuFuncCachePreference GetGpuCacheConfig() const;

 private:
  GpuFunctionHandle gpu_function_;  // Wrapped CUDA kernel handle.
  unsigned arity_;  // Number of formal parameters the kernel takes.

  // Preferred (but not required) cache configuration for this kernel.
  KernelCacheConfig preferred_cache_config_;
};

// Given a platform-independent kernel datatype, returns the (const) internal
// CUDA platform implementation pointer.
inline const GpuKernel* AsGpuKernel(const KernelBase* kernel) {
  return static_cast<const GpuKernel*>(kernel->implementation());
}

// Given a platform-independent kernel datatype, returns the (non-const)
// internal CUDA platform implementation pointer.
inline GpuKernel* AsGpuKernel(KernelBase* kernel) {
  return static_cast<GpuKernel*>(kernel->implementation());
}

}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_KERNEL_H_