/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_API_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_API_H_

// Usage example:
//
//   // Builder is created from a model using GPU-specific parameters.
//   std::unique_ptr<InferenceBuilder> builder = ...;
//
//   // Input data comes from a texture; output data goes to CPU memory.
//   builder->SetInputObjectDef(0, {DataType::FLOAT16, DataLayout::DHWC4,
//                                  ObjectType::OPENGL_TEXTURE, true});
//   builder->SetOutputObjectDef(0, {DataType::FLOAT32, DataLayout::BHWC,
//                                   ObjectType::CPU_MEMORY, false});
//   std::unique_ptr<InferenceRunner> runner;
//   RETURN_IF_ERROR(builder->Build(&runner));  // may take significant time.
//   RETURN_IF_ERROR(
//       runner->SetInputObject(0, OpenGlTexture{texture_id, texture_format}));
//   RETURN_IF_ERROR(runner->Run());

#include <cstdint>
#include <memory>
#include <vector>

#include <CL/cl.h>
#include <vulkan/vulkan.h>

#include "absl/types/span.h"
#include "absl/types/variant.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/gl/portable_gl31.h"

namespace tflite {
namespace gpu {

// Common abbreviations:
//   B  - batch
//   H  - height
//   W  - width
//   C  - channels
//   D  - depth := DivideRoundUp(C, 4), e.g. C = 10 gives D = 3
//   C4 - a constant equal to 4
enum class DataLayout {
  UNKNOWN,
  BHWC,
  DHWC4,
  HWDC4,
  HDWC4,
};

enum class ObjectType {
  UNKNOWN,
  OPENGL_SSBO,
  OPENGL_TEXTURE,
  CPU_MEMORY,
  OPENCL_TEXTURE,
  OPENCL_BUFFER,
  VULKAN_BUFFER,
  VULKAN_TEXTURE
};

struct OpenGlBuffer {
  OpenGlBuffer() = default;
  explicit OpenGlBuffer(GLuint new_id) : id(new_id) {}

  GLuint id = GL_INVALID_INDEX;
};

struct OpenGlTexture {
  OpenGlTexture() = default;
  OpenGlTexture(GLuint new_id, GLenum new_format)
      : id(new_id), format(new_format) {}

  GLuint id = GL_INVALID_INDEX;
  GLenum format = GL_INVALID_ENUM;
};

struct OpenClBuffer {
  OpenClBuffer() = default;
  explicit OpenClBuffer(cl_mem new_memobj) : memobj(new_memobj) {}

  cl_mem memobj = nullptr;
};

struct OpenClTexture {
  OpenClTexture() = default;
  explicit OpenClTexture(cl_mem new_memobj) : memobj(new_memobj) {}

  cl_mem memobj = nullptr;
  // TODO(akulik): should it specify texture format?
};

// Wrappers for externally managed Vulkan objects. Members default to
// null/zero so default-constructed instances are well defined.
struct VulkanBuffer {
  VulkanBuffer() = default;
  explicit VulkanBuffer(VkBuffer buffer_, VkDeviceSize size_,
                        VkDeviceMemory memory_, VkDeviceSize offset_)
      : buffer(buffer_), size(size_), memory(memory_), offset(offset_) {}

  VkBuffer buffer = VK_NULL_HANDLE;
  VkDeviceSize size = 0;
  VkDeviceMemory memory = VK_NULL_HANDLE;
  VkDeviceSize offset = 0;
};

struct VulkanTexture {
  VulkanTexture() = default;
  explicit VulkanTexture(VkDeviceMemory new_memory) : memory(new_memory) {}

  VkImage image = VK_NULL_HANDLE;
  VkImageView image_view = VK_NULL_HANDLE;
  VkFormat format = VK_FORMAT_UNDEFINED;
  VkExtent3D extent = {};
  VkDeviceMemory memory = VK_NULL_HANDLE;
  VkDeviceSize offset = 0;
};

struct VulkanMemory {
  VulkanMemory() = default;
  explicit VulkanMemory(VkDeviceMemory new_memory) : memory(new_memory) {}

  VkDeviceMemory memory = VK_NULL_HANDLE;
  VkDeviceSize size = 0;
  VkDeviceSize offset = 0;
};
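
// A minimal sketch (illustrative): wrapping an existing Vulkan allocation.
// `buffer`, `size`, and `memory` are assumed to come from the app's own
// Vulkan setup.
//
//   VulkanBuffer vk_buffer(buffer, size, memory, /*offset_=*/0);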

struct CpuMemory {
  CpuMemory() = default;
  CpuMemory(void* new_data, size_t new_size_bytes)
      : data(new_data), size_bytes(new_size_bytes) {}

  void* data = nullptr;
  size_t size_bytes = 0;
};

template <typename T>
inline CpuMemory MakeCpuMemory(absl::Span<T> t) {
  CpuMemory m;
  m.data = t.data();
  m.size_bytes = t.size() * sizeof(T);
  return m;
}

template <typename T>
inline CpuMemory MakeReadableCpuMemory(absl::Span<const T> t) {
  CpuMemory m;
  m.data = const_cast<T*>(t.data());
  m.size_bytes = t.size() * sizeof(T);
  return m;
}
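
// A minimal usage sketch (illustrative, not part of the API): wrapping a
// caller-owned buffer so it can later be bound as a user-provided CPU object.
//
//   std::vector<float> input(batch * height * width * channels);
//   CpuMemory mem = MakeCpuMemory(absl::MakeSpan(input));
//   // mem.data == input.data(); mem.size_bytes == input.size() * sizeof(float)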

// Defines an object representation.
struct ObjectDef {
  DataType data_type = DataType::UNKNOWN;
  DataLayout data_layout = DataLayout::UNKNOWN;
  ObjectType object_type = ObjectType::UNKNOWN;

  // If true, the object is managed externally and needs to be provided to the
  // InferenceRunner by a user before running inference.
  //
  // User-provided objects will not be reused internally for any purpose,
  // e.g. to lower overall memory usage.
  bool user_provided = false;

  bool operator==(const ObjectDef& other) const {
    return data_type == other.data_type && data_layout == other.data_layout &&
           object_type == other.object_type &&
           user_provided == other.user_provided;
  }
};

bool IsValid(const ObjectDef& def);

struct Dimensions {
  Dimensions() : b(1), h(1), w(1), c(1) {}

  Dimensions(int32_t batch, int32_t height, int32_t width, int32_t channels)
      : b(batch), h(height), w(width), c(channels) {}

  int32_t d() const { return DivideRoundUp(c, 4); }

  int32_t product() const { return b * h * w * c; }

  bool operator==(const Dimensions& other) const {
    return b == other.b && h == other.h && w == other.w && c == other.c;
  }

  int32_t b;
  int32_t h;
  int32_t w;
  int32_t c;
};
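
// For example (illustrative): Dimensions(1, 224, 224, 3) has
// d() == DivideRoundUp(3, 4) == 1 and product() == 1 * 224 * 224 * 3 == 150528.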

// Connects a tensor shape with the corresponding object definition.
struct TensorObjectDef {
  // The semantics of dimensions is defined by the corresponding DataLayout.
  Dimensions dimensions;
  ObjectDef object_def;

  bool operator==(const TensorObjectDef& other) const {
    return dimensions == other.dimensions && object_def == other.object_def;
  }
};

// @return true if tensor object def is defined.
bool IsValid(const TensorObjectDef& def);

// @return the number of elements in a tensor object.
uint32_t NumElements(const TensorObjectDef& def);

using TensorObject =
    absl::variant<absl::monostate, OpenGlBuffer, OpenGlTexture, CpuMemory,
                  OpenClBuffer, OpenClTexture, VulkanBuffer, VulkanTexture>;

// @return true if the object is set and the corresponding values are defined.
bool IsValid(const TensorObjectDef& def, const TensorObject& object);

ObjectType GetType(const TensorObject& object);

// @return true if the corresponding object is set for the given type.
bool IsObjectPresent(ObjectType type, const TensorObject& obj);
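
// A minimal sketch (illustrative) of how a TensorObject can be inspected:
//
//   std::vector<float> buf(64);
//   TensorObject obj = MakeCpuMemory(absl::MakeSpan(buf));
//   // GetType(obj) == ObjectType::CPU_MEMORY
//   // IsObjectPresent(ObjectType::CPU_MEMORY, obj) == true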

class InferenceRunner;

// Allows inspecting and changing input and output definitions before a graph
// is prepared for inference.
class InferenceBuilder {
 public:
  virtual ~InferenceBuilder() {}

  // Returns inference graph inputs and outputs definitions.
  virtual std::vector<TensorObjectDef> inputs() const = 0;
  virtual std::vector<TensorObjectDef> outputs() const = 0;

  // Sets a new shape for the input if the underlying implementation and graph
  // structure allow dynamic tensors.
  virtual absl::Status SetInputShape(int index,
                                     const Dimensions& dimensions) = 0;

  // Updates the object definition for the given index. An implementation may
  // allow different layouts and/or data type conversions between objects
  // defined in the graph and the given objects. For example: input '0' is
  // DataType::FLOAT32, DataLayout::BHWC, while a user has an input in
  // DataType::FLOAT16, DataLayout::DHWC4. An implementation may let this
  // transformation happen automatically under the hood.
  virtual absl::Status SetInputObjectDef(int index, ObjectDef def) = 0;
  virtual absl::Status SetOutputObjectDef(int index, ObjectDef def) = 0;
  virtual absl::Status SetAllInputObjectDefsTo(ObjectDef def) {
    auto input_defs = inputs();
    for (int i = 0; i < input_defs.size(); ++i) {
      RETURN_IF_ERROR(SetInputObjectDef(i, def));
    }
    return absl::OkStatus();
  }
  virtual absl::Status SetAllOutputObjectDefsTo(ObjectDef def) {
    auto output_defs = outputs();
    for (int i = 0; i < output_defs.size(); ++i) {
      RETURN_IF_ERROR(SetOutputObjectDef(i, def));
    }
    return absl::OkStatus();
  }

  // Creates a new instance of the inference runner. The InferenceBuilder
  // stays valid and can be used to create another inference runner if needed.
  //
  // This method may take significant time to prepare a new inference runner.
  // For example, it may require compiling OpenGL shaders.
  virtual absl::Status Build(std::unique_ptr<InferenceRunner>* runner) = 0;
};
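
// A minimal sketch (illustrative): forcing every input and output to be a
// user-provided CPU object before building.
//
//   ObjectDef cpu_def{DataType::FLOAT32, DataLayout::BHWC,
//                     ObjectType::CPU_MEMORY, /*user_provided=*/true};
//   RETURN_IF_ERROR(builder->SetAllInputObjectDefsTo(cpu_def));
//   RETURN_IF_ERROR(builder->SetAllOutputObjectDefsTo(cpu_def));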

// Runs prepared inference. Every object marked as external needs to be set
// prior to calling the Run method.
class InferenceRunner {
 public:
  virtual ~InferenceRunner() {}

  // Returns inference graph inputs and outputs definitions.
  virtual std::vector<TensorObjectDef> inputs() const = 0;
  virtual std::vector<TensorObjectDef> outputs() const = 0;

  // Getters provide access to underlying objects for the given index.
  // Setters allow setting or changing an external object for the given index.
  // Note that the object needs to match the object definition set earlier via
  // InferenceBuilder.

  virtual absl::Status GetInputObject(int index, TensorObject* object) = 0;
  virtual absl::Status GetOutputObject(int index, TensorObject* object) = 0;
  virtual absl::Status SetInputObject(int index, TensorObject object) = 0;
  virtual absl::Status SetOutputObject(int index, TensorObject object) = 0;

  virtual absl::Status Run() = 0;
};
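
// A minimal sketch (illustrative): reading back a CPU output after Run().
//
//   TensorObject out;
//   RETURN_IF_ERROR(runner->GetOutputObject(0, &out));
//   auto* cpu = absl::get_if<CpuMemory>(&out);  // null if not CPU memory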

// Encapsulates compilation/runtime tradeoffs.
enum class InferenceUsage {
  UNKNOWN,

  // InferenceRunner will be used only once. Therefore, it is important to
  // minimize bootstrap time as well.
  FAST_SINGLE_ANSWER,

  // Prefer maximizing throughput. The same inference runner will be used
  // repeatedly on different inputs.
  SUSTAINED_SPEED,
};

// Defines aspects to control while instantiating a runner.
enum class InferencePriority {
  UNKNOWN,

  AUTO,

  MIN_LATENCY,

  MAX_PRECISION,

  MIN_MEMORY_USAGE,
};

struct InferenceOptions {
  InferenceUsage usage = InferenceUsage::SUSTAINED_SPEED;

  // Ordered priorities provide a better understanding of the desired
  // semantics, where priority(n) is more important than priority(n+1).
  // AUTO priority is needed when a single priority is the most important
  // factor. For example, priority1 = InferencePriority::MIN_LATENCY and
  // leaving everything else to AUTO would result in a configuration that
  // achieves maximum performance.
  //
  // AUTO priority can only be used when higher priorities are fully
  // specified. For example:
  //   VALID:   priority1 = MIN_LATENCY, priority2 = AUTO, priority3 = AUTO
  //   VALID:   priority1 = MIN_LATENCY, priority2 = MAX_PRECISION,
  //            priority3 = AUTO
  //   INVALID: priority1 = AUTO, priority2 = MIN_LATENCY, priority3 = AUTO
  //   INVALID: priority1 = MIN_LATENCY, priority2 = AUTO,
  //            priority3 = MAX_PRECISION
  // Invalid priorities will result in an error.
  InferencePriority priority1 = InferencePriority::MAX_PRECISION;

  InferencePriority priority2 = InferencePriority::AUTO;

  InferencePriority priority3 = InferencePriority::AUTO;
};
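
// A minimal sketch (illustrative): prefer lowest latency, then lowest memory
// usage, leaving the last slot to AUTO.
//
//   InferenceOptions options;
//   options.usage = InferenceUsage::SUSTAINED_SPEED;
//   options.priority1 = InferencePriority::MIN_LATENCY;
//   options.priority2 = InferencePriority::MIN_MEMORY_USAGE;
//   options.priority3 = InferencePriority::AUTO;
//   // IsValid(options) == true; ResolveAutoPriority(&options) makes
//   // priority3 explicit.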

// Returns a position number for the priority. If the priority is missing,
// returns 'max num priorities + 1'.
int GetPosition(const InferenceOptions& options, InferencePriority p);

// Returns true if options are valid.
bool IsValid(const InferenceOptions& options);

// Resolves AUTO priorities and specifies them explicitly.
// Note that no one should assume these mappings will not change.
// Technically, this function is declared here for code reuse purposes, and by
// no means should it be treated as the canonical way to resolve AUTO.
void ResolveAutoPriority(InferenceOptions* options);

enum class PriorityImportance {
  UNKNOWN,
  HIGHER,
  LOWER,
};

// If neither p1 nor p2 is present in options, returns UNKNOWN.
// If p1 is present but p2 is not, returns HIGHER.
// If p2 is present but p1 is not, returns LOWER.
// If both are present and p1 is more important, returns HIGHER; otherwise,
// returns LOWER.
PriorityImportance GetRelativeImportance(const InferenceOptions& options,
                                         InferencePriority p1,
                                         InferencePriority p2);

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_API_H_