minor spelling tweaks
parent 092ae742c3
commit 51d76d6f72
@@ -38,7 +38,7 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteDelegate* delegate,
 }

 // There is no easy way to pass a parameter into the TfLiteDelegate's
-// 'prepare' function, so we keep a global map for testing purpused.
+// 'prepare' function, so we keep a global map for testing purposed.
 // To avoid collisions use: GetPrepareFunction<__LINE__>().
 std::map<int, std::vector<int>>* GetGlobalOpLists() {
 static auto* op_list = new std::map<int, std::vector<int>>;
@@ -113,7 +113,7 @@ const TfLiteGpuDelegateOptionsV2 kDefaultOptions =
 TfLiteGpuDelegateOptionsV2Default();
 ```

-Similar for `NewTfLiteMetalDelgate()`:
+Similar for `NewTfLiteMetalDelegate()`:

 ```c++
 const TfLiteMetalDelegateOptions kDefaultOptions = {
@@ -124,9 +124,9 @@ class ProfilingCommandQueue : public CLCommandQueue {
 double GetQueueExecutionTimeMs() const;

 // Difference from GetQueueExecutionTimeMs is that this number doesn't include
-// time between kernels(kernels launchs or preparing) on GPU. Usually, this
+// time between kernels(kernels launches or preparing) on GPU. Usually, this
 // time should be 5-10% better than GetQueueExecutionTimeMs, because 5-10%
-// spend on something else(maybe kernels launchs or preparing)
+// spend on something else(maybe kernels launches or preparing)
 double GetSumOfEventsTimeMs() const;

 // This label will be used for all subsequent dispatches.
@@ -64,7 +64,7 @@ class CLProgram {

 // Return the cl_device_id associated with the program object.
 // This can be the device associated with context on which the program object
-// has been created or can be device that was specified when a progam object
+// has been created or can be device that was specified when a program object
 // was created using clCreateProgramWithBinary.
 cl_device_id GetDeviceId() const { return device_id_; }

@@ -46,7 +46,7 @@ Status CreateEglSyncFromClEvent(cl_event event, EGLDisplay display,
 bool IsEglSyncFromClEventSupported();

 // Creates CL event from EGL sync.
-// Created event could only be comsumed by AcquiredGlObject::Acquire call as
+// Created event could only be consumed by AcquiredGlObject::Acquire call as
 // a 'wait_event'.
 Status CreateClEventFromEglSync(cl_context context, const EglSync& egl_sync,
 CLEvent* event);
@@ -47,7 +47,7 @@ struct CLNode {
 // for every operation.
 std::vector<int2> ranges;

-// Mostly for debug purposess.
+// Mostly for debug purposes.
 std::string name;

 CLNode() = default;
@@ -129,8 +129,8 @@ class InferenceContext {
 CalculationsPrecision precision_;
 TensorStorageType storage_type_;

-// Directly mapped nodes from graph, but some of them "inactiv" due
-// to fusion (inactiv = fused).
+// Directly mapped nodes from graph, but some of them "inactive" due
+// to fusion (inactive = fused).
 // Memory is allocated only once, in ConvertOperations, and is not modified
 // anywhere.
 std::vector<CLNode> nodes_;
@@ -29,7 +29,7 @@ namespace {
 // vec mat mult) on 4 parts to create more threads
 // tid.y thread process every 4-th element in vec vec dot
 // Good results for ~1024 x 1024 sizes, for other can be written more
-// otimized shaders
+// optimized shaders

 std::string GetFullyConnectedKernelCode(
 const OperationDef& op_def, const LinearStorage& biases,
@@ -25,7 +25,7 @@ namespace gpu {
 namespace cl {
 namespace {

-std::string GetMaxUnoolingKernelCode(
+std::string GetMaxUnpoolingKernelCode(
 const OperationDef& op_def, const CLDevice& device,
 const std::vector<ElementwiseOperation*>& linked_operations) {
 TensorCodeGenerator src("src_data",
@@ -102,7 +102,7 @@ std::string GetMaxUnoolingKernelCode(
 return c;
 }

-std::string GetMaxUnooling3DKernelCode(
+std::string GetMaxUnpooling3DKernelCode(
 const OperationDef& op_def, const CLDevice& device,
 const std::vector<ElementwiseOperation*>& linked_operations) {
 TensorCodeGenerator src(
@@ -219,7 +219,7 @@ MaxUnpooling& MaxUnpooling::operator=(MaxUnpooling&& kernel) {
 }

 Status MaxUnpooling::Compile(const CreationContext& creation_context) {
-const auto code = GetMaxUnoolingKernelCode(
+const auto code = GetMaxUnpoolingKernelCode(
 definition_, *creation_context.device, linked_operations_);
 return creation_context.cache->GetOrCreateCLKernel(
 code, "main_function", *creation_context.context,
@@ -292,7 +292,7 @@ MaxUnpooling3D& MaxUnpooling3D::operator=(MaxUnpooling3D&& kernel) {
 }

 Status MaxUnpooling3D::Compile(const CreationContext& creation_context) {
-const auto code = GetMaxUnooling3DKernelCode(
+const auto code = GetMaxUnpooling3DKernelCode(
 definition_, *creation_context.device, linked_operations_);
 return creation_context.cache->GetOrCreateCLKernel(
 code, "main_function", *creation_context.context,
@@ -95,7 +95,7 @@ std::string GetStridedSliceCode(
 return c;
 }

-bool Is4Alighed(const SliceAttributes& attr) {
+bool Is4Aligned(const SliceAttributes& attr) {
 return attr.strides.c == 1 && attr.starts.c % 4 == 0;
 }

@@ -129,7 +129,7 @@ int4 GetOffset(const SliceAttributes& attr, int src_width, int src_height,
 offset.z = src_channels + attr.ends.c;
 }
 }
-if (Is4Alighed(attr)) {
+if (Is4Aligned(attr)) {
 offset.z /= 4;
 }
 if (attr.strides.b > 0) {
@@ -167,7 +167,7 @@ StridedSlice& StridedSlice::operator=(StridedSlice&& operation) {
 }

 Status StridedSlice::Compile(const CreationContext& creation_context) {
-const auto code = GetStridedSliceCode(definition_, Is4Alighed(attributes_),
+const auto code = GetStridedSliceCode(definition_, Is4Aligned(attributes_),
 linked_operations_);
 return creation_context.cache->GetOrCreateCLKernel(
 code, "main_function", *creation_context.context,
@@ -640,7 +640,7 @@ extern PFN_clCreateFromEGLImageKHR clCreateFromEGLImageKHR;
 extern PFN_clEnqueueAcquireEGLObjectsKHR clEnqueueAcquireEGLObjectsKHR;
 extern PFN_clEnqueueReleaseEGLObjectsKHR clEnqueueReleaseEGLObjectsKHR;

-// For convinient image creation
+// For convenient image creation
 // It uses clCreateImage if it available (clCreateImage available since cl 1.2)
 // otherwise it will use legacy clCreateImage2D
 cl_mem CreateImage2DLegacy(cl_context context, cl_mem_flags flags,
@@ -30,7 +30,7 @@ enum class CalculationsPrecision { F32, F32_F16, F16 };
 // F32_F16 - as F16, but some operations (Convolution,
 // DepthWiseConvolution, FullyConnected, ConvolutionTransposed)
 // have accumulator in F32 and usually it calculates 4 mads in F16, sum them,
-// than converts this partial sum to F32 and add to acumulator.
+// than converts this partial sum to F32 and add to accumulator.

 DataType DeduceDataTypeFromPrecision(CalculationsPrecision precision);

@@ -475,7 +475,7 @@ Status AllocateTensorMemory(const CLContext& context, const CLDevice& device,
 case TensorStorageType::SINGLE_TEXTURE_2D: {
 if (slices != 1) {
 return InvalidArgumentError(absl::StrCat(
-"SINGLE_TEXTURE_2D support only cnannels in range [1-4], but ",
+"SINGLE_TEXTURE_2D support only channels in range [1-4], but ",
 shape.c, "was provided"));
 }
 cl_image_desc desc;
@@ -82,7 +82,7 @@ enum class MemoryStrategy {
 Status BestGreedy(const std::vector<TensorUsageRecord<size_t>>& usage_records,
 ObjectsAssignment<size_t>* assignment);

-// Calculates the assignement of shared objects to given tensors, including
+// Calculates the assignment of shared objects to given tensors, including
 // objects' sizes. Below there are specializations for different types, that
 // support more memory strategies.
 // If reallocation_graph is provided, assignment of shared objects support
@@ -130,7 +130,7 @@ Status AssignObjectsToTensors(
 MemoryStrategy strategy, ObjectsAssignment<uint3>* assignment,
 const UsageGraph* reallocation_graph);

-// Calculates the assignement of tensors to offsets, considering those tensors
+// Calculates the assignment of tensors to offsets, considering those tensors
 // are going to be allocated in one continuous memory block.
 Status AssignOffsetsToTensors(
 const std::vector<TensorUsageRecord<size_t>>& usage_records,
@@ -67,7 +67,7 @@ Status GreedyBySizeAssignment(
 assignment->offsets.resize(num_tensors);
 assignment->total_size = 0;

-// Ordered records are to be sorted by size of corrseponding tensor.
+// Ordered records are to be sorted by size of corresponding tensor.
 std::vector<TensorUsageWithIndex<size_t>> ordered_records;
 for (size_t i = 0; i < num_tensors; ++i) {
 ordered_records.emplace_back(&usage_records[i], i);
@@ -133,7 +133,7 @@ Status GreedyBySizeAssignment(
 // - We have tensor usage records of all intermideate tensors as an input. Each
 // record consists of tensor size, first and last tasks, that use it. Let's call
 // [first_task..last_task] a tensor usage interval;
-// - Distance between two usage intervals is the absoulte difference between
+// - Distance between two usage intervals is the absolute difference between
 // closest tasks in their intervals. If two usage intervals don't intersect,
 // than the distance between them is positive;
 // - Calculate positional maximums vector, e.g. the vector of lower bounds on
@@ -36,7 +36,7 @@ namespace gpu {
 // gap;
 // - If such a gap has been found, current tensor should be allocated into this
 // gap. Otherwise we can allocate it after the rightmost tensor, which usage
-// interval intersects with usage inteval of current tensor. So we assign
+// interval intersects with usage interval of current tensor. So we assign
 // corresponding offset to current tensor and the tensor becomes assigned.
 Status GreedyBySizeAssignment(
 const std::vector<TensorUsageRecord<size_t>>& usage_records,
@@ -47,7 +47,7 @@ Status GreedyBySizeAssignment(
 // - We have tensor usage records of all intermideate tensors as an input. Each
 // record consists of tensor size, first and last tasks, that use it. Let's call
 // [first_task..last_task] a tensor usage interval;
-// - Distance between two usage intervals is the absoulte difference between
+// - Distance between two usage intervals is the absolute difference between
 // closest tasks in their intervals. If two usage intervals don't intersect,
 // than the distance between them is positive;
 // - Calculate positional maximums vector, e.g. the vector of lower bounds on
@@ -46,7 +46,7 @@ bool CompareBySize(const TensorUsageWithIndex<size_t>& first,
 const TensorUsageWithIndex<size_t>& second);

 // TaskProfile is a vector with information about all intermediate tensors, that
-// should exist in memory during the executon of the task. Elements of the
+// should exist in memory during the execution of the task. Elements of the
 // vector must be sorted in non-increasing order of corresponding tensors sizes.
 using TaskProfile = std::vector<TensorUsageWithIndex<size_t>>;

@@ -29,7 +29,7 @@ namespace gpu {
 namespace testing {

 // Runs Tensorflow Lite model using Tensorflow Lite with a delegate and
-// an appropriate operations resolver. If delegate is nullptr, infererence will
+// an appropriate operations resolver. If delegate is nullptr, inference will
 // be done only on CPU.
 Status InterpreterInvokeWithOpResolver(const ::tflite::Model* model,
 TfLiteDelegate* delegate,
@@ -38,7 +38,7 @@ Status InterpreterInvokeWithOpResolver(const ::tflite::Model* model,
 std::vector<TensorFloat32>* outputs);

 // Runs Tensorflow Lite model using Tensorflow Lite with a delegate and
-// builtin operations resolver. If delegate is nullptr, infererence will
+// builtin operations resolver. If delegate is nullptr, inference will
 // be done only on CPU.
 Status InterpreterInvoke(const ::tflite::Model* model, TfLiteDelegate* delegate,
 const std::vector<TensorFloat32>& inputs,
@@ -126,7 +126,7 @@ std::vector<int> GetDivisorsForRange(int number, int range) {
 std::vector<int> GetPossibleSizes(int number,
 WorkGroupSizeAlignment z_alignment) {
 if (z_alignment == WorkGroupSizeAlignment::PRECISE) {
-// we will use for potential sizes, sizes that cover grid preciselly
+// we will use for potential sizes, sizes that cover grid precisely
 // work group size * k (k is integer) == grid_size
 return GetDivisors(number);
 } else {
@@ -79,7 +79,7 @@ typedef struct {
 // each time inference engine needs to make a decision, it uses
 // ordered priorities to do so.
 // For example:
-// MAX_PRECISION at priority1 would not allow to decrease presision,
+// MAX_PRECISION at priority1 would not allow to decrease precision,
 // but moving it to priority2 or priority3 would result in F16 calculation.
 //
 // Priority is defined in TfLiteGpuInferencePriority.
@@ -60,7 +60,7 @@ class VariableAccessor : public InlineRewrite {
 // Returns const variables that need to be inlined in the a shader's code.
 std::string GetConstDeclarations() const;

-// Returns shared varaible declarations that need to be inlined.
+// Returns shared variable declarations that need to be inlined.
 std::string GetSharedVariableDeclarations() const;

 // Returns uniform parameter declarations that need to be inlined.
@@ -131,7 +131,7 @@ Status GetEglError() {
 case EGL_CONTEXT_LOST:
 return InternalError(
 "A power management event has occurred. The application must destroy "
-"all contexts and reinitialise OpenGL ES state and objects to "
+"all contexts and reinitialize OpenGL ES state and objects to "
 "continue rendering.");
 }
 return UnknownError("EGL error: " + std::to_string(error));
@@ -77,7 +77,7 @@ class GlSync {
 // Waits until GPU is done with processing.
 Status GlSyncWait();

-// Waits until all comands are flushed and then performs active waiting by
+// Waits until all commands are flushed and then performs active waiting by
 // spinning a thread and checking sync status. It leads to shorter wait time
 // (up to tens of ms) but consumes more CPU.
 Status GlActiveSyncWait();
@@ -74,7 +74,7 @@ TEST(AddTest, InputTensorAndScalar) {
 Pointwise(FloatNear(1e-6), {-1.9, 0.3, 0.8, 0.9, 1.2, 2.1}));
 }

-TEST(AddTest, InputTensorWithConstandBroadcast) {
+TEST(AddTest, InputTensorWithConstantBroadcast) {
 TensorRef<BHWC> input;
 input.type = DataType::FLOAT32;
 input.ref = 0;
@@ -33,8 +33,8 @@ namespace {
 // (b/117291356).

 // Describes the ideal convolution for the specific operation case
-// Case here means specific "kernel + strides" conbination for specific
-// operatoins type, not sizes of input and output tensors, they can be any.
+// Case here means specific "kernel + strides" combination for specific
+// operations type, not sizes of input and output tensors, they can be any.
 struct IdealByCase {
 bool ParamsAccepted(OperationType in_op_type, HW in_kernel,
 HW in_strides) const {
@@ -129,7 +129,7 @@ uint32_t BufferUseCount(ValueId id,
 }

 // Examines if the second operation can be linked to the first one. Linking may
-// be skipped in the situation when conflic may happen: if first operation's
+// be skipped in the situation when conflict may happen: if first operation's
 // output is used by more than 1 other operation.
 bool CanFuseOperations(const ComputeTaskDescriptorPtr first,
 const ComputeTaskDescriptorPtr second,
@@ -444,9 +444,9 @@ ComputeTaskDescriptorPtr NonLinkableStub(int operation_id, ValueId input_id,
 }

 ComputeTaskDescriptorPtr FuseChain(const FusionSequence& chain) {
-auto fused_desciptor = std::make_shared<ComputeTaskDescriptor>();
+auto fused_descriptor = std::make_shared<ComputeTaskDescriptor>();
 // The id of fused descriptor is the id of the first descriptor in the list.
-fused_desciptor->id = chain.front()->id;
+fused_descriptor->id = chain.front()->id;
 FusionSequence sequence;
 if (chain.front()->is_linkable) {
 // The first task is linkable so it contains only linkable code. Insert
@@ -503,7 +503,7 @@ ComputeTaskDescriptorPtr FuseChain(const FusionSequence& chain) {
 buffer.declaration + name + "[[buffer(" + index + ")]],\n";
 call_arguments += ", buffer" + index;
 input_index++;
-fused_desciptor->input_buffers.push_back({buffer.id, ""});
+fused_descriptor->input_buffers.push_back({buffer.id, ""});
 }
 }
 // We have an output id that is the input for the next task.
@@ -517,7 +517,7 @@ ComputeTaskDescriptorPtr FuseChain(const FusionSequence& chain) {
 buffer.declaration + name + "[[buffer(" + index + ")]],\n";
 call_arguments += ", buffer" + index;
 immutable_index++;
-fused_desciptor->immutable_buffers.push_back(buffer);
+fused_descriptor->immutable_buffers.push_back(buffer);
 }

 for (auto buffer : desc->uniform_buffers) {
@@ -527,7 +527,7 @@ ComputeTaskDescriptorPtr FuseChain(const FusionSequence& chain) {
 buffer.declaration + name + "[[buffer(" + index + ")]],\n";
 call_arguments += ", buffer" + index;
 uniform_index++;
-fused_desciptor->uniform_buffers.push_back({"", buffer.data_function});
+fused_descriptor->uniform_buffers.push_back({"", buffer.data_function});
 }

 if (desc->is_linkable) {
@@ -539,7 +539,7 @@ ComputeTaskDescriptorPtr FuseChain(const FusionSequence& chain) {
 }

 ComputeTaskDescriptorPtr non_linkable = sequence.front();
-fused_desciptor->shader_source =
+fused_descriptor->shader_source =
 absl::Substitute(non_linkable->shader_source, function_code,
 buffer_declarations, call_code);
 std::vector<ValueId> alias;
@@ -547,13 +547,13 @@ ComputeTaskDescriptorPtr FuseChain(const FusionSequence& chain) {
 for (int i = 0; i < chain.size() - 1; i++) {
 alias.push_back(chain[i]->output_buffer.id);
 }
-fused_desciptor->output_buffer = {
+fused_descriptor->output_buffer = {
 fused_id, "", non_linkable->output_buffer.dimensions_function, alias};
-fused_desciptor->resize_function = non_linkable->resize_function;
+fused_descriptor->resize_function = non_linkable->resize_function;
 for (const auto& desc : sequence) {
-fused_desciptor->description += desc->description + "_";
+fused_descriptor->description += desc->description + "_";
 }
-return fused_desciptor;
+return fused_descriptor;
 }

 } // namespace
@@ -35,7 +35,7 @@ limitations under the License.
 /// 2. Model compilation. Global list of ComputeTaskDescriptors is transformed
 /// into the sorted list of sets of descriptors. A set can be transformed
 /// later into a single GPU task.
-/// 3. GPU compute tasks generation. Shader code generation happes here.
+/// 3. GPU compute tasks generation. Shader code generation happens here.
 /// 4. Intermediate resource allocation.
 /// Inference.
 @interface TFLInferenceContext : NSObject
@@ -72,11 +72,11 @@ limitations under the License.
 /// Inserts all GPU compute tasks into the command encoder.
 /// @param inputOutputBuffers Must be created and passed into the method with pairs ID:buffer
 /// @param encoderBlock User-defined block to take control over command encoder. Can be nil.
-/// The block can be used, for example, for fine-graned benchmarking where end encoding
+/// The block can be used, for example, for fine-grained benchmarking where end encoding
 /// is performed and command buffer is committed with completion block. A new command
 /// buffer must be created and new command encoder must be returned by the block.
 /// The block is called after every dispatch encoding.
-/// @discussion No GPU sychronization functions are used inside. All GPU resources must be created
+/// @discussion No GPU synchronization functions are used inside. All GPU resources must be created
 /// with the same device which has been used in compileModelWithDevice() method.
 - (void)encodeWithEncoder:(id<MTLComputeCommandEncoder>)commandEncoder
 inputOutputBuffers:(const std::map<::tflite::gpu::ValueId, id<MTLBuffer>>&)inputOutputBuffers
@@ -90,7 +90,7 @@ using ::tflite::gpu::metal::SingleOpModel;
 XCTAssertTrue(status.ok(), @"%s", status.error_message().c_str());
 }

-- (void)testInputTensorWithConstandBroadcast {
+- (void)testInputTensorWithConstantBroadcast {
 TensorRef<BHWC> input;
 input.type = DataType::FLOAT32;
 input.ref = 0;
@@ -24,7 +24,7 @@ const constexpr char* NnapiAccelerationTestParams::kAccelerationTestConfig =
 #
 # The test_id is test_suite_name / test_name, this differs from the
 # name used by the build because of the / separator instead of .
-# Parametrised tests names are composed by the base test name / test / ordinal
+# Parameterized tests names are composed by the base test name / test / ordinal
 # the ordinal is the position in the list of parameters generated by the
 # cardinal product of all the different parameter sets

@@ -39,7 +39,7 @@ const constexpr char* NnapiAccelerationTestParams::kAccelerationTestConfig =

 ## Test Arguments
 #
-# The test can be parametrised with the minimum Android SDK version
+# The test can be parameterized with the minimum Android SDK version
 # to apply the acceleration validation for.
 # If omitted will use 27

@@ -155,7 +155,7 @@ bool IsScalarInputSupported(int builtin_code) {
 }
 }

-// Check if the operation requires explict conversion from int8 to uint8 values.
+// Check if the operation requires explicit conversion from int8 to uint8 values.
 bool NeedInt8Conversion(const TfLiteContext* context, int builtin_code,
 const TfLiteNode* node) {
 const int input_id = node->inputs->data[0];
@@ -172,7 +172,7 @@ class StatefulNnApiDelegate : public TfLiteDelegate {
 bool disallow_nnapi_cpu;
 // Tensor to ANeuralNetworksMemory mapping.
 std::vector<MemoryRegistration> tensor_memory_map;
-// Constains a non zero value if any NNAPI method call
+// Contains a non zero value if any NNAPI method call
 // operation returned a non zero result code.
 int nnapi_errno;
 // Cache of kernels already built in StatefulNnApiDelegate::DoPrepare
@@ -4811,17 +4811,17 @@ class PadV2OpConstModel : public PadOpModel<T1> {
 };

 // Test case where paddings is a non-const tensor.
-template <typename RegularInputOuput>
-class PadV2OpDynamicModel : public PadOpModel<RegularInputOuput> {
+template <typename RegularInputOutput>
+class PadV2OpDynamicModel : public PadOpModel<RegularInputOutput> {
 public:
 PadV2OpDynamicModel(const TensorData& input,
 std::initializer_list<int> paddings_shape,
-RegularInputOuput constant_values,
+RegularInputOutput constant_values,
 const TensorData& output) {
 this->input_ = this->AddInput(input);
 this->paddings_ = this->AddInput(TensorType_INT32);
 this->constant_values_ = this->AddConstInput(
-GetTensorType<RegularInputOuput>(), {constant_values}, {1});
+GetTensorType<RegularInputOutput>(), {constant_values}, {1});
 this->output_ = this->AddOutput(output);

 this->SetBuiltinOp(BuiltinOperator_PADV2, BuiltinOptions_PadV2Options,