Open-source OpenCL-based backend.

The next step is to wire it up with the OpenGL-based backend and provide a single GPU delegate.

PiperOrigin-RevId: 263822202
Authored by A. Unique TensorFlower on 2019-08-16 12:18:22 -07:00; committed by TensorFlower Gardener
parent 3a73493dfe
commit f1b58a9c2c
165 changed files with 24255 additions and 0 deletions


tensorflow/lite/delegates/gpu/cl/BUILD
@@ -0,0 +1,423 @@
load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library")
load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
package(
default_visibility = ["//visibility:public"],
licenses = ["notice"], # Apache 2.0
)
cc_library(
name = "opencl_wrapper",
srcs = ["opencl_wrapper.cc"],
hdrs = ["opencl_wrapper.h"],
linkopts = select({
"//tensorflow:android": [
"-ldl", # opencl_wrapper calls dlopen()
"-lm",
],
"//conditions:default": ["-ldl"], # opencl_wrapper calls dlopen()
}),
deps = [
"//tensorflow/lite/delegates/gpu/common:status",
"@com_google_absl//absl/strings",
"@opencl_headers",
],
)
cc_library(
name = "cl_device",
srcs = ["cl_device.cc"],
hdrs = ["cl_device.h"],
deps = [
":opencl_wrapper",
":util",
"//tensorflow/lite/delegates/gpu/common:status",
"@com_google_absl//absl/strings",
],
)
cc_library(
name = "cl_event",
srcs = ["cl_event.cc"],
hdrs = ["cl_event.h"],
deps = [
":opencl_wrapper",
],
)
cc_library(
name = "cl_context",
srcs = ["cl_context.cc"],
hdrs = ["cl_context.h"],
deps = [
":cl_device",
":cl_image_format",
":opencl_wrapper",
":util",
"//tensorflow/lite/delegates/gpu/common:data_type",
"//tensorflow/lite/delegates/gpu/common:status",
"@com_google_absl//absl/strings",
],
)
cc_library(
name = "cl_memory",
srcs = ["cl_memory.cc"],
hdrs = ["cl_memory.h"],
deps = [
":opencl_wrapper",
"//tensorflow/lite/delegates/gpu/common:access_type",
"//tensorflow/lite/delegates/gpu/common:status",
],
)
cc_library(
name = "cl_command_queue",
srcs = ["cl_command_queue.cc"],
hdrs = ["cl_command_queue.h"],
deps = [
":cl_context",
":cl_device",
":cl_event",
":cl_kernel",
":opencl_wrapper",
":util",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:types",
"@com_google_absl//absl/strings",
],
)
cc_library(
name = "cl_image_format",
srcs = ["cl_image_format.cc"],
hdrs = ["cl_image_format.h"],
deps = [
":opencl_wrapper",
"//tensorflow/lite/delegates/gpu/common:data_type",
],
)
cc_library(
name = "util",
srcs = ["util.cc"],
hdrs = ["util.h"],
deps = [
":opencl_wrapper",
"//tensorflow/lite/delegates/gpu/common:data_type",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:tensor",
"//tensorflow/lite/delegates/gpu/common:util",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/types:span",
],
)
cc_library(
name = "cl_errors",
hdrs = ["cl_errors.h"],
deps = [
":util",
"//tensorflow/lite/delegates/gpu/common:status",
],
)
cc_library(
name = "cl_program",
srcs = ["cl_program.cc"],
hdrs = ["cl_program.h"],
deps = [
":cl_context",
":cl_device",
":opencl_wrapper",
":util",
"//tensorflow/lite/delegates/gpu/common:status",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/types:span",
],
)
cc_library(
name = "gl_interop",
srcs = ["gl_interop.cc"],
hdrs = ["gl_interop.h"],
deps = [
":cl_context",
":cl_device",
":cl_errors",
":cl_event",
":cl_memory",
":egl_sync",
":environment",
":opencl_wrapper",
"//tensorflow/lite/delegates/gpu/common:access_type",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/gl:gl_call",
"//tensorflow/lite/delegates/gpu/gl:gl_sync",
"//tensorflow/lite/delegates/gpu/gl:portable",
"@com_google_absl//absl/strings",
],
)
cc_library(
name = "program_cache",
srcs = ["program_cache.cc"],
hdrs = ["program_cache.h"],
deps = [
":cl_context",
":cl_device",
":cl_kernel",
":cl_program",
":compiled_program_cache_cc_fbs",
":util",
"//tensorflow/lite/delegates/gpu/common:status",
"@com_google_absl//absl/types:span",
"@farmhash_archive//:farmhash",
"@flatbuffers",
],
)
cc_library(
name = "precision",
srcs = ["precision.cc"],
hdrs = ["precision.h"],
deps = [
"//tensorflow/lite/delegates/gpu/common:data_type",
],
)
cc_library(
name = "cl_kernel",
srcs = ["cl_kernel.cc"],
hdrs = ["cl_kernel.h"],
deps = [
":cl_context",
":cl_device",
":cl_program",
":opencl_wrapper",
":util",
"//tensorflow/lite/delegates/gpu/cl/kernels:flt_type",
"//tensorflow/lite/delegates/gpu/common:status",
"@com_google_absl//absl/strings",
],
)
cc_library(
name = "buffer",
srcs = ["buffer.cc"],
hdrs = ["buffer.h"],
deps = [
":cl_command_queue",
":cl_context",
":opencl_wrapper",
":util",
"//tensorflow/lite/delegates/gpu/common:status",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/types:span",
],
)
cc_library(
name = "texture2d",
srcs = ["texture2d.cc"],
hdrs = ["texture2d.h"],
deps = [
":cl_command_queue",
":cl_context",
":opencl_wrapper",
":tensor_type",
":util",
"//tensorflow/lite/delegates/gpu/common:data_type",
"//tensorflow/lite/delegates/gpu/common:status",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/types:span",
],
)
cc_library(
name = "tensor",
srcs = ["tensor.cc"],
hdrs = ["tensor.h"],
deps = [
":cl_command_queue",
":cl_context",
":cl_device",
":cl_image_format",
":cl_memory",
":tensor_type",
":util",
"//tensorflow/lite/delegates/gpu/common:data_type",
"//tensorflow/lite/delegates/gpu/common:shape",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:tensor",
"//tensorflow/lite/delegates/gpu/common:types",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/types:span",
],
)
cc_library(
name = "tensor_type",
srcs = ["tensor_type.cc"],
hdrs = ["tensor_type.h"],
deps = [
"//tensorflow/lite/delegates/gpu/common:data_type",
],
)
cc_library(
name = "tensor_type_util",
srcs = ["tensor_type_util.cc"],
hdrs = ["tensor_type_util.h"],
deps = [
":tensor_type",
"//tensorflow/lite/delegates/gpu:api",
],
)
cc_library(
name = "environment",
srcs = ["environment.cc"],
hdrs = ["environment.h"],
deps = [
":cl_command_queue",
":cl_context",
":cl_device",
":cl_kernel",
":precision",
":program_cache",
":tensor",
":tensor_type",
":util",
"//tensorflow/lite/delegates/gpu/common:data_type",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:tensor",
],
)
cc_library(
name = "inference_context",
srcs = ["inference_context.cc"],
hdrs = ["inference_context.h"],
deps = [
":cl_command_queue",
":cl_device",
":environment",
":model_hints",
":opencl_wrapper",
":precision",
":tensor_type",
"//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation",
"//tensorflow/lite/delegates/gpu/cl/selectors:operation_selector",
"//tensorflow/lite/delegates/gpu/common:memory_management",
"//tensorflow/lite/delegates/gpu/common:model",
"//tensorflow/lite/delegates/gpu/common:model_transformer",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:tensor",
"//tensorflow/lite/delegates/gpu/common:types",
"//tensorflow/lite/delegates/gpu/common/transformations:add_bias",
"//tensorflow/lite/delegates/gpu/common/transformations:merge_padding_with",
],
)
cc_library(
name = "linear_storage",
srcs = ["linear_storage.cc"],
hdrs = ["linear_storage.h"],
deps = [
":buffer",
":opencl_wrapper",
":tensor_type",
":texture2d",
":util",
"//tensorflow/lite/delegates/gpu/common:data_type",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:types",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/types:span",
],
)
cc_library(
name = "model_hints",
hdrs = ["model_hints.h"],
)
cc_library(
name = "egl_sync",
srcs = ["egl_sync.cc"],
hdrs = ["egl_sync.h"],
defines = [
"EGL_EGLEXT_PROTOTYPES",
],
deps = [
":cl_device",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/gl:gl_call",
],
)
cc_library(
name = "api",
srcs = ["api.cc"],
hdrs = ["api.h"],
deps = [
":cl_command_queue",
":cl_errors",
":cl_event",
":egl_sync",
":environment",
":gl_interop",
":inference_context",
":opencl_wrapper",
":precision",
":tensor",
":tensor_type",
":tensor_type_util",
"//tensorflow/lite/delegates/gpu:api",
"//tensorflow/lite/delegates/gpu/cl/kernels:converter",
"//tensorflow/lite/delegates/gpu/common:data_type",
"//tensorflow/lite/delegates/gpu/common:model",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:tensor",
"@com_google_absl//absl/memory",
"@com_google_absl//absl/types:span",
],
)
cc_library(
name = "gpu_api_delegate",
srcs = ["gpu_api_delegate.cc"],
hdrs = ["gpu_api_delegate.h"],
linkopts = select({
"//tensorflow:android": [
"-lEGL",
"-lGLESv3",
],
"//conditions:default": [],
}),
deps = [
":api",
":opencl_wrapper",
":tensor_type_util",
"//tensorflow/lite:kernel_api",
"//tensorflow/lite/c:c_api_internal",
"//tensorflow/lite/delegates/gpu:api",
"//tensorflow/lite/delegates/gpu/common:model",
"//tensorflow/lite/delegates/gpu/common:model_builder",
"//tensorflow/lite/delegates/gpu/common:model_transformer",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common/transformations:general_transformations",
"@com_google_absl//absl/types:span",
],
)
flatbuffer_cc_library(
name = "compiled_program_cache_cc_fbs",
srcs = ["compiled_program_cache.fbs"],
flatc_args = [
"--scoped-enums",
],
)
tflite_portable_test_suite()


tensorflow/lite/delegates/gpu/cl/api.cc
@@ -0,0 +1,790 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/api.h"
#include <algorithm>
#include <cstring>
#include <EGL/eglext.h>
#include "absl/memory/memory.h"
#include "absl/types/span.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_errors.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
#include "tensorflow/lite/delegates/gpu/cl/egl_sync.h"
#include "tensorflow/lite/delegates/gpu/cl/environment.h"
#include "tensorflow/lite/delegates/gpu/cl/gl_interop.h"
#include "tensorflow/lite/delegates/gpu/cl/inference_context.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/converter.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type_util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
// Connects tensor definition provided by a user (external) with tensor
// definition used by the inference engine (internal).
struct TensorTieDef {
ValueId id;
AccessType access_type;
TensorObjectDef internal_def;
TensorObjectDef external_def;
};
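// Illustration (not in the original code): if a caller supplies CPU memory in
// BHWC layout while the engine stores the tensor as an OpenCL texture, the
// external_def carries the CPU layout and the internal_def carries the texture
// layout; the TensorTie classes below bridge the two.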
// Connects an external tensor object to an internal tensor object and provides
// functionality to copy data between them.
class TensorTie {
public:
explicit TensorTie(const TensorTieDef& def) : def_(def) {}
virtual ~TensorTie() {}
virtual Status SetExternalObject(TensorObject obj) {
return InvalidArgumentError("Tensor object is readonly.");
}
virtual TensorObject GetExternalObject() = 0;
virtual Status CopyToExternalObject() = 0;
virtual Status CopyFromExternalObject() = 0;
const TensorTieDef& def() const { return def_; }
private:
const TensorTieDef def_;
};
// Both internal and external defs are identical, therefore nothing to connect
// here.
class NoopTensorTie : public TensorTie {
public:
NoopTensorTie(const TensorTieDef& def, TensorObject obj)
: TensorTie(def), obj_(obj) {}
static bool IsSupported(const TensorTieDef& def) {
return def.external_def == def.internal_def;
}
static Status New(const TensorTieDef& def, TensorObject internal_object,
std::unique_ptr<TensorTie>* tie) {
*tie = absl::make_unique<NoopTensorTie>(def, internal_object);
return OkStatus();
}
TensorObject GetExternalObject() final { return obj_; }
Status CopyToExternalObject() final { return OkStatus(); }
Status CopyFromExternalObject() final { return OkStatus(); }
private:
TensorObject obj_;
};
// Does one-step conversion between internal and external objects.
// It may also allocate external objects if requested.
class DefaultTensorTie : public TensorTie {
public:
DefaultTensorTie(const TensorTieDef& def, TensorObject internal_obj)
: TensorTie(def), internal_obj_(internal_obj) {}
static bool IsSupported(const TensorTieDef& def,
TensorObjectConverterBuilder* converter_builder) {
auto object_type = def.external_def.object_def.object_type;
return (object_type == ObjectType::OPENCL_BUFFER ||
object_type == ObjectType::OPENCL_TEXTURE ||
object_type == ObjectType::CPU_MEMORY) &&
converter_builder->IsSupported(def.internal_def, def.external_def) &&
converter_builder->IsSupported(def.external_def, def.internal_def);
}
static Status New(const TensorTieDef& def, TensorObject internal_object,
TensorObjectConverterBuilder* converter_builder,
Environment* env, std::unique_ptr<TensorTie>* tie) {
auto tie_impl = absl::make_unique<DefaultTensorTie>(def, internal_object);
RETURN_IF_ERROR(tie_impl->Init(converter_builder, env));
*tie = std::move(tie_impl);
return OkStatus();
}
Status CopyToExternalObject() final {
if (!converter_to_) {
return UnavailableError("Conversion is not available");
}
return converter_to_->Convert(internal_obj_, GetExternalObject());
}
Status CopyFromExternalObject() final {
if (!converter_from_) {
return UnavailableError("Conversion is not available");
}
return converter_from_->Convert(GetExternalObject(), internal_obj_);
}
Status SetExternalObject(TensorObject obj) final {
if (!def().external_def.object_def.user_provided) {
return InvalidArgumentError("External object is read-only");
}
if (!IsValid(def().external_def, obj)) {
return InvalidArgumentError("Given object is not valid");
}
external_obj_ = obj;
return OkStatus();
}
TensorObject GetExternalObject() final { return external_obj_; }
private:
Status Init(TensorObjectConverterBuilder* converter_builder,
Environment* env) {
RETURN_IF_ERROR(converter_builder->MakeConverter(
def().internal_def, def().external_def, &converter_to_));
RETURN_IF_ERROR(converter_builder->MakeConverter(
def().external_def, def().internal_def, &converter_from_));
return MaybeAllocateExternalObject(env);
}
Status MaybeAllocateExternalObject(Environment* env) {
const TensorObjectDef& d = def().external_def;
if (d.object_def.user_provided) {
return OkStatus();
}
switch (d.object_def.object_type) {
case ObjectType::CPU_MEMORY: {
size_t bytes_size =
d.dimensions.product() * SizeOf(d.object_def.data_type);
cpu_memory_.resize(bytes_size);
external_obj_ = CpuMemory{cpu_memory_.data(), cpu_memory_.size()};
break;
}
case ObjectType::OPENCL_TEXTURE:
case ObjectType::OPENCL_BUFFER: {
auto& dims = d.dimensions;
RETURN_IF_ERROR(
AllocateTensorMemory(env->context(), env->device(), dims.w, dims.h,
dims.c, d.object_def.data_type,
ToTensorStorageType(d.object_def.object_type,
d.object_def.data_layout),
&cl_memory_));
if (d.object_def.object_type == ObjectType::OPENCL_TEXTURE) {
external_obj_ = OpenClTexture{cl_memory_.memory()};
} else {
external_obj_ = OpenClBuffer{cl_memory_.memory()};
}
break;
}
default:
return InternalError("Unexpected object type");
}
return OkStatus();
}
const TensorObject internal_obj_;
TensorObject external_obj_;
CLMemory cl_memory_;
std::vector<uint8_t> cpu_memory_;
std::unique_ptr<TensorObjectConverter> converter_to_;
std::unique_ptr<TensorObjectConverter> converter_from_;
};
// Copies data to an intermediate OpenCL buffer and then does a two-step
// conversion. It handles cases where one-step conversion is not supported,
// for example:
// - CPU BHWC -> CL buffer BHWC -> CL texture DHWC4.
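// Internally the tie is split into two DefaultTensorTies: an outer tie between
// the external object and an intermediate OpenCL buffer, and an inner tie
// between that buffer and the internal object (see MakeOuterInnerDefs below).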
class TwoStepTensorTie : public TensorTie {
public:
explicit TwoStepTensorTie(const TensorTieDef& def) : TensorTie(def) {}
static bool IsSupported(const TensorTieDef& def,
TensorObjectConverterBuilder* converter_builder) {
auto defs = MakeOuterInnerDefs(def);
return DefaultTensorTie::IsSupported(defs.first, converter_builder) &&
DefaultTensorTie::IsSupported(defs.second, converter_builder);
}
static Status New(const TensorTieDef& def, TensorObject internal_object,
TensorObjectConverterBuilder* converter_builder,
Environment* env, std::unique_ptr<TensorTie>* tie) {
auto tie_impl = absl::make_unique<TwoStepTensorTie>(def);
RETURN_IF_ERROR(tie_impl->Init(internal_object, converter_builder, env));
*tie = std::move(tie_impl);
return OkStatus();
}
Status CopyToExternalObject() final {
RETURN_IF_ERROR(inner_tie_->CopyToExternalObject());
return outer_tie_->CopyToExternalObject();
}
Status CopyFromExternalObject() final {
RETURN_IF_ERROR(outer_tie_->CopyFromExternalObject());
return inner_tie_->CopyFromExternalObject();
}
Status SetExternalObject(TensorObject obj) final {
return outer_tie_->SetExternalObject(obj);
}
TensorObject GetExternalObject() final {
return outer_tie_->GetExternalObject();
}
private:
static std::pair<TensorTieDef, TensorTieDef> MakeOuterInnerDefs(
const TensorTieDef& def) {
TensorTieDef outer_def;
outer_def.external_def = def.external_def;
outer_def.internal_def = def.external_def;
outer_def.internal_def.object_def.object_type = ObjectType::OPENCL_BUFFER;
outer_def.internal_def.object_def.user_provided = true;
TensorTieDef inner_def;
inner_def.external_def = outer_def.internal_def;
inner_def.external_def.object_def.user_provided = false;
inner_def.internal_def = def.internal_def;
return std::make_pair(outer_def, inner_def);
}
Status Init(TensorObject internal_object,
TensorObjectConverterBuilder* converter_builder,
Environment* env) {
auto defs = MakeOuterInnerDefs(def());
RETURN_IF_ERROR(DefaultTensorTie::New(defs.second, internal_object,
converter_builder, env, &inner_tie_));
return DefaultTensorTie::New(defs.first, inner_tie_->GetExternalObject(),
converter_builder, env, &outer_tie_);
}
std::unique_ptr<TensorTie> inner_tie_;
std::unique_ptr<TensorTie> outer_tie_;
};
// Captures GL object into CL context before performing a conversion.
class GlBufferHolder : public TensorTie {
public:
GlBufferHolder(const TensorTieDef& def, GlInteropFabric* gl_interop_fabric,
Environment* env)
: TensorTie(def),
gl_interop_fabric_(gl_interop_fabric),
environment_(env) {}
static bool IsSupported(const TensorTieDef& def,
TensorObjectConverterBuilder* converter_builder) {
if (!def.external_def.object_def.user_provided ||
def.external_def.object_def.object_type != ObjectType::OPENGL_SSBO) {
return false;
}
return DefaultTensorTie::IsSupported(MakeClDef(def), converter_builder);
}
static Status New(const TensorTieDef& def, TensorObject internal_object,
TensorObjectConverterBuilder* converter_builder,
GlInteropFabric* gl_interop_fabric, Environment* env,
std::unique_ptr<TensorTie>* tie) {
auto tie_impl =
absl::make_unique<GlBufferHolder>(def, gl_interop_fabric, env);
RETURN_IF_ERROR(DefaultTensorTie::New(MakeClDef(def), internal_object,
converter_builder, env,
&tie_impl->tie_));
*tie = std::move(tie_impl);
return OkStatus();
}
Status SetExternalObject(TensorObject obj) final {
auto ssbo = absl::get_if<OpenGlBuffer>(&obj);
if (!ssbo) {
return InvalidArgumentError("Missing OpenGL SSBO");
}
auto old_ssbo = absl::get_if<OpenGlBuffer>(&external_obj_);
if (old_ssbo && ssbo->id == old_ssbo->id) {
return OkStatus();
}
if (cl_object_.memory()) {
gl_interop_fabric_->UnregisterMemory(cl_object_.memory());
}
RETURN_IF_ERROR(CreateClMemoryFromGlBuffer(
ssbo->id, def().access_type, &environment_->context(), &cl_object_));
external_obj_ = obj;
RETURN_IF_ERROR(tie_->SetExternalObject(OpenClBuffer{cl_object_.memory()}));
gl_interop_fabric_->RegisterMemory(cl_object_.memory());
return OkStatus();
}
TensorObject GetExternalObject() final { return external_obj_; }
Status CopyFromExternalObject() final {
return tie_->CopyFromExternalObject();
}
Status CopyToExternalObject() final { return tie_->CopyToExternalObject(); }
private:
static TensorTieDef MakeClDef(const TensorTieDef& def) {
auto cl_def = def;
cl_def.external_def.object_def.object_type = ObjectType::OPENCL_BUFFER;
cl_def.external_def.object_def.user_provided = true;
return cl_def;
}
CLMemory cl_object_;
GlInteropFabric* gl_interop_fabric_;
Environment* environment_;
std::unique_ptr<TensorTie> tie_;
TensorObject external_obj_;
};
TensorObject TensorToObj(const Tensor& tensor) {
if (tensor.StorageType() == TensorStorageType::BUFFER) {
return OpenClBuffer{tensor.GetMemoryPtr()};
}
return OpenClTexture{tensor.GetMemoryPtr()};
}
// Responsible for creating new tensor objects.
class TensorTieFactory {
public:
TensorTieFactory(Environment* env, InferenceContext* context,
GlInteropFabric* gl_interop_fabric)
: env_(*env),
context_(*context),
gl_interop_fabric_(gl_interop_fabric),
converter_builder_(NewConverterBuilder(env)) {}
bool IsSupported(const TensorTieDef& def) const {
auto converter = converter_builder_.get();
return IsValid(def.external_def.object_def) &&
(NoopTensorTie::IsSupported(def) ||
DefaultTensorTie::IsSupported(def, converter) ||
GlBufferHolder::IsSupported(def, converter) ||
TwoStepTensorTie::IsSupported(def, converter));
}
Status NewTensorTie(const TensorTieDef& def,
std::unique_ptr<TensorTie>* tie) {
TensorObject internal_object = TensorToObj(*context_.GetTensor(def.id));
auto converter = converter_builder_.get();
if (NoopTensorTie::IsSupported(def)) {
return NoopTensorTie::New(def, internal_object, tie);
}
if (DefaultTensorTie::IsSupported(def, converter)) {
return DefaultTensorTie::New(def, internal_object, converter, &env_, tie);
}
if (GlBufferHolder::IsSupported(def, converter)) {
if (!gl_interop_fabric_) {
return InvalidArgumentError(
"GL object is used but InferenceEnvironmentOptions does not have "
"EGL display and context set.");
}
return GlBufferHolder::New(def, internal_object, converter,
gl_interop_fabric_, &env_, tie);
}
if (TwoStepTensorTie::IsSupported(def, converter)) {
return TwoStepTensorTie::New(def, internal_object, converter, &env_, tie);
}
return UnimplementedError("Unsupported tensor tie definition.");
}
private:
Environment& env_;
InferenceContext& context_;
GlInteropFabric* gl_interop_fabric_;
std::unique_ptr<TensorObjectConverterBuilder> converter_builder_;
};
class InferenceRunnerImpl : public InferenceRunner {
public:
InferenceRunnerImpl(const InferenceEnvironmentOptions& env_options,
Environment* environment,
std::unique_ptr<InferenceContext> context,
std::unique_ptr<GlInteropFabric> gl_interop_fabric)
: env_options_(env_options),
environment_(environment),
context_(std::move(context)),
gl_interop_fabric_(std::move(gl_interop_fabric)) {}
Status Initialize(const std::vector<TensorTieDef>& inputs,
const std::vector<TensorTieDef>& outputs,
TensorTieFactory* factory) {
RETURN_IF_ERROR(LinkTensors(inputs, factory, &inputs_));
return LinkTensors(outputs, factory, &outputs_);
}
std::vector<TensorObjectDef> inputs() const override {
return GetExternalDefinitions(inputs_);
}
std::vector<TensorObjectDef> outputs() const override {
return GetExternalDefinitions(outputs_);
}
Status GetInputObject(int index, TensorObject* object) override {
if (index < 0 || index >= inputs_.size()) {
return OutOfRangeError("Index is out of range");
}
*object = inputs_[index]->GetExternalObject();
return OkStatus();
}
Status GetOutputObject(int index, TensorObject* object) override {
if (index < 0 || index >= outputs_.size()) {
return OutOfRangeError("Index is out of range");
}
*object = outputs_[index]->GetExternalObject();
return OkStatus();
}
Status SetInputObject(int index, TensorObject object) override {
if (index < 0 || index >= inputs_.size()) {
return OutOfRangeError("Index is out of range");
}
return inputs_[index]->SetExternalObject(object);
}
Status SetOutputObject(int index, TensorObject object) override {
if (index < 0 || index >= outputs_.size()) {
return OutOfRangeError("Index is out of range");
}
return outputs_[index]->SetExternalObject(object);
}
Status Run() override {
if (gl_interop_fabric_) {
RETURN_IF_ERROR(gl_interop_fabric_->Start());
}
for (auto& obj : inputs_) {
RETURN_IF_ERROR(obj->CopyFromExternalObject());
}
RETURN_IF_ERROR(context_->AddToQueue(environment_->queue()));
clFlush(environment_->queue()->queue());
for (auto& obj : outputs_) {
RETURN_IF_ERROR(obj->CopyToExternalObject());
}
if (gl_interop_fabric_) {
RETURN_IF_ERROR(gl_interop_fabric_->Finish());
}
return OkStatus();
}
private:
static Status LinkTensors(const std::vector<TensorTieDef>& defs,
TensorTieFactory* factory,
std::vector<std::unique_ptr<TensorTie>>* objects) {
objects->reserve(defs.size());
for (auto& def : defs) {
std::unique_ptr<TensorTie> object;
RETURN_IF_ERROR(factory->NewTensorTie(def, &object));
objects->push_back(std::move(object));
}
return OkStatus();
}
static std::vector<TensorObjectDef> GetExternalDefinitions(
const std::vector<std::unique_ptr<TensorTie>>& objects) {
std::vector<TensorObjectDef> defs;
defs.reserve(objects.size());
for (auto& obj : objects) {
defs.push_back(obj->def().external_def);
}
return defs;
}
const InferenceEnvironmentOptions env_options_;
Environment* environment_;
std::unique_ptr<InferenceContext> context_;
std::unique_ptr<GlInteropFabric> gl_interop_fabric_;
std::vector<std::unique_ptr<TensorTie>> inputs_;
std::vector<std::unique_ptr<TensorTie>> outputs_;
};
TensorObjectDef TensorToDef(const Tensor& tensor) {
TensorObjectDef def;
def.dimensions.b = 1;
def.dimensions.h = tensor.Height();
def.dimensions.w = tensor.Width();
def.dimensions.c = tensor.Channels();
def.object_def.data_layout = ToDataLayout(tensor.StorageType());
def.object_def.data_type = tensor.DataType();
def.object_def.object_type = ToObjectType(tensor.StorageType());
def.object_def.user_provided = false;
return def;
}
class InferenceBuilderImpl : public InferenceBuilder {
public:
InferenceBuilderImpl(const InferenceOptions& options,
const InferenceEnvironmentOptions env_options,
const InferenceEnvironmentProperties properties,
Environment* environment,
std::unique_ptr<GraphFloat32> graph)
: options_(options),
env_options_(env_options),
properties_(properties),
environment_(environment),
graph_(std::move(graph)) {}
Status Initialize() {
// Select precision based on given options.
CalculationsPrecision precision = CalculationsPrecision::F32;
if (options_.allow_precision_loss) {
precision = options_.priority == InferencePriority::MAX_PRECISION
? CalculationsPrecision::F32_F16
: CalculationsPrecision::F16;
}
// Increase precision if not supported.
if (!environment_->IsSupported(precision)) {
precision = CalculationsPrecision::F32_F16;
if (!environment_->IsSupported(precision)) {
precision = CalculationsPrecision::F32;
}
}
context_ = absl::make_unique<InferenceContext>();
InferenceContext::CreateInferenceInfo create_info;
create_info.precision = precision;
create_info.storage_type = GetOptimalStorageType(environment_->device());
create_info.hints.Add(ModelHints::kReduceKernelsCount);
// TODO(sorokin) temporary hack to speed up init time in some cases.
// TODO(sorokin): move this check to the place where hint is applied.
if ((precision == CalculationsPrecision::F16 ||
precision == CalculationsPrecision::F32_F16) &&
create_info.storage_type == TensorStorageType::TEXTURE_ARRAY &&
environment_->device().IsAdreno6xxOrHigher()) {
create_info.hints.Add(ModelHints::kFastTuning);
}
RETURN_IF_ERROR(
context_->InitFromGraph(create_info, *graph_, environment_));
if (env_options_.IsGlAware()) {
gl_interop_fabric_ = absl::make_unique<GlInteropFabric>(
env_options_.egl_display, environment_);
}
tie_factory_ = absl::make_unique<TensorTieFactory>(
environment_, context_.get(), gl_interop_fabric_.get());
inputs_ = LinkTensors(graph_->inputs());
outputs_ = LinkTensors(graph_->outputs());
return OkStatus();
}
std::vector<TensorObjectDef> inputs() const override {
return GetExternalDefinitions(inputs_);
}
std::vector<TensorObjectDef> outputs() const override {
return GetExternalDefinitions(outputs_);
}
Status SetInputShape(int index, const Dimensions& dimensions) override {
if (index < 0 || index >= inputs_.size()) {
return OutOfRangeError("Index is out of range");
}
return UnimplementedError("Changing input shapes is not supported");
}
Status SetInputObjectDef(int index, ObjectDef new_def) override {
if (index < 0 || index >= inputs_.size()) {
return OutOfRangeError("Index is out of range");
}
auto def = inputs_[index];
def.external_def.object_def = new_def;
if (!tie_factory_->IsSupported(def)) {
return InvalidArgumentError("New object definition is not supported.");
}
inputs_[index] = def;
return OkStatus();
}
Status SetOutputObjectDef(int index, ObjectDef new_def) override {
if (index < 0 || index >= outputs_.size()) {
return OutOfRangeError("Index is out of range");
}
auto def = outputs_[index];
def.external_def.object_def = new_def;
if (!tie_factory_->IsSupported(def)) {
return InvalidArgumentError("New object definition is not supported.");
}
outputs_[index] = def;
return OkStatus();
}
Status Build(std::unique_ptr<InferenceRunner>* runner) override {
if (gl_interop_fabric_ && !HasGlObjects()) {
// destroy interop layer when there are no GL objects to avoid
// extra synchronization cost.
gl_interop_fabric_.reset(nullptr);
}
auto runner_impl = absl::make_unique<InferenceRunnerImpl>(
env_options_, environment_, std::move(context_),
std::move(gl_interop_fabric_));
RETURN_IF_ERROR(
runner_impl->Initialize(inputs_, outputs_, tie_factory_.get()));
*runner = std::move(runner_impl);
return OkStatus();
}
private:
// Links internal tensors with external user-facing objects.
std::vector<TensorTieDef> LinkTensors(
const std::vector<Value<TensorRef<BHWC>>*>& values) {
std::vector<TensorTieDef> links;
links.reserve(values.size());
for (const auto& value : values) {
TensorObjectDef def = TensorToDef(*context_->GetTensor(value->id));
AccessType access = graph_->IsGraphInput(value->id) ? AccessType::READ
: AccessType::WRITE;
links.push_back({value->id, access, def, def});
}
return links;
}
bool HasGlObjects() const {
auto is_gl = [](ObjectType t) {
return t == ObjectType::OPENGL_SSBO || t == ObjectType::OPENGL_TEXTURE;
};
for (const TensorTieDef& def : inputs_) {
if (is_gl(def.external_def.object_def.object_type)) {
return true;
}
}
for (const TensorTieDef& def : outputs_) {
if (is_gl(def.external_def.object_def.object_type)) {
return true;
}
}
return false;
}
static std::vector<TensorObjectDef> GetExternalDefinitions(
const std::vector<TensorTieDef>& links) {
std::vector<TensorObjectDef> defs;
defs.reserve(links.size());
for (auto& desc : links) {
defs.push_back(desc.external_def);
}
return defs;
}
const InferenceOptions options_;
const InferenceEnvironmentOptions env_options_;
const InferenceEnvironmentProperties properties_;
std::unique_ptr<InferenceContext> context_;
std::unique_ptr<GlInteropFabric> gl_interop_fabric_;
Environment* environment_;
std::unique_ptr<GraphFloat32> graph_;
std::vector<TensorTieDef> inputs_;
std::vector<TensorTieDef> outputs_;
std::unique_ptr<TensorTieFactory> tie_factory_;
};
class InferenceEnvironmentImpl : public InferenceEnvironment {
public:
explicit InferenceEnvironmentImpl(const InferenceEnvironmentOptions& options)
: options_(options) {}
Status Init() {
RETURN_IF_ERROR(LoadOpenCL());
properties_.is_opencl_available = true;
if (options_.IsGlAware()) {
RETURN_IF_ERROR(CreateGLCompatibleEnvironment(
reinterpret_cast<cl_context_properties>(options_.egl_context),
reinterpret_cast<cl_context_properties>(options_.egl_display),
&environment_));
} else {
RETURN_IF_ERROR(CreateEnvironment(&environment_));
}
auto& device = environment_.device();
properties_.is_gl_sharing_supported = IsGlSharingSupported(device);
properties_.is_gl_to_cl_fast_sync_supported =
IsClEventFromEglSyncSupported(device);
properties_.is_cl_to_gl_fast_sync_supported =
IsEglSyncFromClEventSupported();
if (options_.IsGlAware() && !properties_.is_gl_sharing_supported) {
return UnavailableError("GL sharing is not supported");
}
return OkStatus();
}
Status NewInferenceBuilder(const InferenceOptions& options,
const GraphFloat32& model,
std::unique_ptr<InferenceBuilder>* builder) final {
if (environment_.program_cache() &&
!options_.serialized_binary_cache.empty()) {
// Ignore returned error. Cache is discarded.
environment_.program_cache()
->AddSerializedCache(environment_.context(), environment_.device(),
options_.serialized_binary_cache)
.IgnoreError();
}
auto cl_graph = absl::make_unique<GraphFloat32>();
RETURN_IF_ERROR(model.MakeExactCopy(cl_graph.get()));
RETURN_IF_ERROR(RunGraphTransforms(cl_graph.get()));
auto builder_impl = absl::make_unique<InferenceBuilderImpl>(
options, options_, properties_, &environment_, std::move(cl_graph));
RETURN_IF_ERROR(builder_impl->Initialize());
*builder = std::move(builder_impl);
return OkStatus();
}
std::vector<uint8_t> GetSerializedBinaryCache() const final {
std::vector<uint8_t> data;
// If there was a problem, the data will be empty.
environment_.program_cache()
->GetSerializedCache(environment_.device(), &data)
.IgnoreError();
return data;
}
const InferenceEnvironmentProperties& properties() const {
return properties_;
}
private:
const InferenceEnvironmentOptions options_;
Environment environment_;
InferenceEnvironmentProperties properties_;
};
} // namespace
Status NewInferenceEnvironment(
const InferenceEnvironmentOptions& options,
std::unique_ptr<InferenceEnvironment>* environment,
InferenceEnvironmentProperties* properties) {
auto env_impl = absl::make_unique<InferenceEnvironmentImpl>(options);
Status status = env_impl->Init();
if (properties) {
*properties = env_impl->properties();
}
RETURN_IF_ERROR(status);
*environment = std::move(env_impl);
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite


tensorflow/lite/delegates/gpu/cl/api.h
@@ -0,0 +1,125 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_API_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_API_H_
#include <cstdint>
#include <memory>
#include <EGL/egl.h>
#include "absl/types/span.h"
#include "tensorflow/lite/delegates/gpu/api.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
// Usage example:
//
// std::unique_ptr<InferenceEnvironment> env;
// RETURN_IF_ERROR(NewInferenceEnvironment(env_options, &env, /*properties=*/nullptr));
//
// InferenceOptions options;
//
// std::unique_ptr<InferenceBuilder> builder;
// RETURN_IF_ERROR(env->NewInferenceBuilder(options, model, &builder));
// // now builder is ready to prepare inference runner.
//
// -----------------
// Supported formats
// -----------------
//
// OpenCL implementation uses 2D textures as the primary format.
// Tensor in HWDC4 layout is {TEXTURE_2D, RGBA, width := W*D, height := H}.
//
namespace tflite {
namespace gpu {
namespace cl {
enum class InferencePriority {
MIN_LATENCY,
MAX_PRECISION,
};
struct InferenceOptions {
bool allow_precision_loss = false;
InferencePriority priority = InferencePriority::MAX_PRECISION;
};
// Properties of the created OpenCL inference environment.
struct InferenceEnvironmentProperties {
bool is_opencl_available = false;
// GL objects (buffers and textures) could be shared with CL context.
bool is_gl_sharing_supported = false;
// Indicates whether fast GL->CL synchronization is supported.
bool is_gl_to_cl_fast_sync_supported = false;
// Indicates whether fast CL->GL synchronization is supported.
bool is_cl_to_gl_fast_sync_supported = false;
};
// Environment manages all resources that need to stay alive as long as any
// inference using the OpenCL backend is running.
class InferenceEnvironment {
public:
virtual ~InferenceEnvironment() {}
virtual Status NewInferenceBuilder(
const InferenceOptions& options, const GraphFloat32& model,
std::unique_ptr<InferenceBuilder>* builder) = 0;
// Returns an opaque binary blob that contains a collection of already compiled
// OpenCL kernels present in the cache. The returned data can be reused later
// to speed up compilation when a new environment is created for the same set
// of models.
// The data is valid only when used on the same device; otherwise it will not
// be compatible and will be discarded.
virtual std::vector<uint8_t> GetSerializedBinaryCache() const = 0;
};
struct InferenceEnvironmentOptions {
// Whenever the input and/or output is a GL object, the EGL display and context
// must be set to create a GL-aware OpenCL context. Do not set these variables
// when GL interoperability is not needed.
EGLDisplay egl_display = EGL_NO_DISPLAY;
EGLContext egl_context = EGL_NO_CONTEXT;
// Should contain data returned from the
// InferenceEnvironment::GetSerializedBinaryCache method.
// Invalid or incompatible data will be discarded. A compiled binary may become
// incompatible when the GPU driver is updated.
absl::Span<const uint8_t> serialized_binary_cache;
bool IsGlAware() const {
return egl_context != EGL_NO_CONTEXT && egl_display != EGL_NO_DISPLAY;
}
};
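// Illustrative round trip for the binary cache (not part of the original
// header): persist the blob returned by GetSerializedBinaryCache() after the
// first run and feed it back on the next start-up. SaveToDisk and LoadFromDisk
// are hypothetical helpers.
//
//   std::vector<uint8_t> blob = env->GetSerializedBinaryCache();
//   SaveToDisk(blob);
//   ...
//   std::vector<uint8_t> cached = LoadFromDisk();
//   InferenceEnvironmentOptions options;
//   options.serialized_binary_cache = absl::MakeConstSpan(cached);
//   RETURN_IF_ERROR(NewInferenceEnvironment(options, &env, &properties));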
// Creates new OpenCL environment that needs to stay around until all inference
// runners are destroyed.
Status NewInferenceEnvironment(
const InferenceEnvironmentOptions& options,
std::unique_ptr<InferenceEnvironment>* environment,
InferenceEnvironmentProperties* properties /* optional */);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_API_H_
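
For reference, a minimal usage sketch that ties the API above together. It is not part of the commit: it assumes a populated GraphFloat32 named model, a surrounding function that returns Status (so RETURN_IF_ERROR applies), and placeholder input_data/input_size_bytes for a caller-owned CPU buffer.

std::unique_ptr<InferenceEnvironment> env;
InferenceEnvironmentProperties properties;
InferenceEnvironmentOptions env_options;  // no GL interop, no binary cache
RETURN_IF_ERROR(NewInferenceEnvironment(env_options, &env, &properties));
InferenceOptions options;
options.allow_precision_loss = true;
options.priority = InferencePriority::MIN_LATENCY;
std::unique_ptr<InferenceBuilder> builder;
RETURN_IF_ERROR(env->NewInferenceBuilder(options, model, &builder));
// Request caller-provided CPU memory for input 0; outputs are configured
// symmetrically via SetOutputObjectDef.
ObjectDef cpu_def = builder->inputs()[0].object_def;
cpu_def.object_type = ObjectType::CPU_MEMORY;
cpu_def.user_provided = true;
RETURN_IF_ERROR(builder->SetInputObjectDef(0, cpu_def));
std::unique_ptr<InferenceRunner> runner;
RETURN_IF_ERROR(builder->Build(&runner));
RETURN_IF_ERROR(runner->SetInputObject(0, CpuMemory{input_data, input_size_bytes}));
RETURN_IF_ERROR(runner->Run());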


tensorflow/lite/delegates/gpu/cl/buffer.cc
@@ -0,0 +1,89 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
Status CreateBuffer(size_t size_in_bytes, bool gpu_read_only, const void* data,
CLContext* context, Buffer* result) {
cl_mem_flags flags = gpu_read_only ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE;
if (data != nullptr) {
flags |= CL_MEM_COPY_HOST_PTR;
}
cl_int error_code;
cl_mem buffer = clCreateBuffer(context->context(), flags, size_in_bytes,
const_cast<void*>(data), &error_code);
if (!buffer) {
return UnknownError(
absl::StrCat("Failed to allocate device memory with clCreateBuffer",
CLErrorCodeToString(error_code)));
}
*result = Buffer(buffer, size_in_bytes);
return OkStatus();
}
} // namespace
Buffer::Buffer(cl_mem buffer, size_t size_in_bytes)
: buffer_(buffer), size_(size_in_bytes) {}
Buffer::Buffer(Buffer&& buffer) : buffer_(buffer.buffer_), size_(buffer.size_) {
buffer.buffer_ = nullptr;
buffer.size_ = 0;
}
Buffer& Buffer::operator=(Buffer&& buffer) {
if (this != &buffer) {
Release();
std::swap(size_, buffer.size_);
std::swap(buffer_, buffer.buffer_);
}
return *this;
}
Buffer::~Buffer() { Release(); }
void Buffer::Release() {
if (buffer_) {
clReleaseMemObject(buffer_);
buffer_ = nullptr;
size_ = 0;
}
}
Status CreateReadOnlyBuffer(size_t size_in_bytes, CLContext* context,
Buffer* result) {
return CreateBuffer(size_in_bytes, true, nullptr, context, result);
}
Status CreateReadOnlyBuffer(size_t size_in_bytes, const void* data,
CLContext* context, Buffer* result) {
return CreateBuffer(size_in_bytes, true, data, context, result);
}
Status CreateReadWriteBuffer(size_t size_in_bytes, CLContext* context,
Buffer* result) {
return CreateBuffer(size_in_bytes, false, nullptr, context, result);
}
} // namespace cl
} // namespace gpu
} // namespace tflite


tensorflow/lite/delegates/gpu/cl/buffer.h
@@ -0,0 +1,99 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_BUFFER_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_BUFFER_H_
#include "absl/strings/str_cat.h"
#include "absl/types/span.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
// Buffer represents linear GPU data storage with an arbitrary data format.
// Buffer is movable but not copyable.
class Buffer {
public:
Buffer() {}  // Needed only to use Buffer as a class member.
Buffer(cl_mem buffer, size_t size_in_bytes);
// Move only
Buffer(Buffer&& buffer);
Buffer& operator=(Buffer&& buffer);
Buffer(const Buffer&) = delete;
Buffer& operator=(const Buffer&) = delete;
~Buffer();
cl_mem GetMemoryPtr() const { return buffer_; }
// Writes data to the buffer. The data must cover exactly size_in_bytes bytes
// (the constructor parameter).
template <typename T>
Status WriteData(CLCommandQueue* queue, const absl::Span<T> data);
// Reads data from Buffer into CPU memory.
template <typename T>
Status ReadData(CLCommandQueue* queue, std::vector<T>* result) const;
private:
void Release();
cl_mem buffer_ = nullptr;
size_t size_ = 0;
};
Status CreateReadOnlyBuffer(size_t size_in_bytes, CLContext* context,
Buffer* result);
Status CreateReadOnlyBuffer(size_t size_in_bytes, const void* data,
CLContext* context, Buffer* result);
Status CreateReadWriteBuffer(size_t size_in_bytes, CLContext* context,
Buffer* result);
template <typename T>
Status Buffer::WriteData(CLCommandQueue* queue, const absl::Span<T> data) {
if (size_ != sizeof(T) * data.size()) {
return InvalidArgumentError(
"absl::Span<T> data size is different from buffer allocated size.");
}
RETURN_IF_ERROR(queue->EnqueueWriteBuffer(buffer_, size_, data.data()));
return OkStatus();
}
template <typename T>
Status Buffer::ReadData(CLCommandQueue* queue, std::vector<T>* result) const {
if (size_ % sizeof(T) != 0) {
return UnknownError("Wrong element size(typename T is not correct?");
}
const int elements_count = size_ / sizeof(T);
result->resize(elements_count);
return queue->EnqueueReadBuffer(buffer_, size_, result->data());
}
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_BUFFER_H_
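
A short illustrative fragment (not part of the commit) showing the Buffer helpers above. It assumes a CLContext* context and a CLCommandQueue* queue are already available (for example from an Environment) and a surrounding function that returns Status.

std::vector<float> host_data = {1.0f, 2.0f, 3.0f, 4.0f};
Buffer buffer;
// Uploads host_data at creation time via CL_MEM_COPY_HOST_PTR.
RETURN_IF_ERROR(CreateReadOnlyBuffer(host_data.size() * sizeof(float),
                                     host_data.data(), context, &buffer));
// Reads the contents back; the element type must evenly divide the buffer size.
std::vector<float> readback;
RETURN_IF_ERROR(buffer.ReadData(queue, &readback));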


tensorflow/lite/delegates/gpu/cl/cl_command_queue.cc
@@ -0,0 +1,326 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include <vector>
#include "absl/strings/str_cat.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
CLCommandQueue::CLCommandQueue(cl_command_queue queue) : queue_(queue) {}
CLCommandQueue::CLCommandQueue(CLCommandQueue&& queue) : queue_(queue.queue_) {
queue.queue_ = nullptr;
}
CLCommandQueue& CLCommandQueue::operator=(CLCommandQueue&& queue) {
if (this != &queue) {
Release();
std::swap(queue_, queue.queue_);
}
return *this;
}
CLCommandQueue::~CLCommandQueue() { Release(); }
void CLCommandQueue::Release() {
if (queue_) {
clReleaseCommandQueue(queue_);
queue_ = nullptr;
}
}
Status CLCommandQueue::DispatchImplicit(const CLKernel& kernel, int3 grid,
int3 work_group_size, CLEvent* event) {
std::vector<size_t> local(3);
std::vector<size_t> global(3);
for (int i = 0; i < 3; ++i) {
local[i] = work_group_size[i];
global[i] = AlignByN(grid[i], work_group_size[i]);
}
cl_event resulting_event;
const int error_code =
clEnqueueNDRangeKernel(queue_, kernel.kernel(), 3, nullptr, global.data(),
local.data(), 0, nullptr, &resulting_event);
*event = CLEvent(resulting_event);
if (error_code != CL_SUCCESS) {
return UnknownError(absl::StrCat("Failed to clEnqueueNDRangeKernel - ",
CLErrorCodeToString(error_code)));
}
return OkStatus();
}
Status CLCommandQueue::DispatchImplicit(const CLKernel& kernel, int3 grid,
int3 work_group_size) {
std::vector<size_t> local(3);
std::vector<size_t> global(3);
for (int i = 0; i < 3; ++i) {
local[i] = work_group_size[i];
global[i] = AlignByN(grid[i], work_group_size[i]);
}
const int error_code =
clEnqueueNDRangeKernel(queue_, kernel.kernel(), 3, nullptr, global.data(),
local.data(), 0, nullptr, nullptr);
if (error_code != CL_SUCCESS) {
return UnknownError(absl::StrCat("Failed to clEnqueueNDRangeKernel - ",
CLErrorCodeToString(error_code)));
}
return OkStatus();
}
Status CLCommandQueue::EnqueueEvent(CLEvent* event) {
cl_event resulting_event;
const int error_code = clEnqueueMarker(queue_, &resulting_event);
*event = CLEvent(resulting_event);
if (error_code != CL_SUCCESS) {
return UnknownError(absl::StrCat("Failed to clEnqueueMarker - ",
CLErrorCodeToString(error_code)));
}
return OkStatus();
}
Status CLCommandQueue::EnqueueWriteImage(cl_mem memory, int3 region,
const void* data) {
const size_t origin[] = {0, 0, 0};
const size_t r[] = {static_cast<size_t>(region.x),
static_cast<size_t>(region.y),
static_cast<size_t>(region.z)};
auto error_code = clEnqueueWriteImage(queue_, memory, CL_TRUE, origin, r, 0,
0, data, 0, nullptr, nullptr);
if (error_code != CL_SUCCESS) {
return UnknownError(
absl::StrCat("Failed to upload data to GPU (clEnqueueWriteImage) - ",
CLErrorCodeToString(error_code)));
}
return OkStatus();
}
Status CLCommandQueue::EnqueueReadImage(cl_mem memory, int3 region,
void* data) {
const size_t origin[] = {0, 0, 0};
const size_t r[] = {static_cast<size_t>(region.x),
static_cast<size_t>(region.y),
static_cast<size_t>(region.z)};
auto error_code = clEnqueueReadImage(queue_, memory, CL_TRUE, origin, r, 0, 0,
data, 0, nullptr, nullptr);
if (error_code != CL_SUCCESS) {
return UnknownError(
absl::StrCat("Failed to read data from GPU (clEnqueueReadImage) - ",
CLErrorCodeToString(error_code)));
}
return OkStatus();
}
Status CLCommandQueue::EnqueueWriteBuffer(cl_mem memory, size_t size_in_bytes,
const void* data) {
auto error_code = clEnqueueWriteBuffer(
queue_, memory, CL_TRUE, 0, size_in_bytes, data, 0, nullptr, nullptr);
if (error_code != CL_SUCCESS) {
return UnknownError(
absl::StrCat("Failed to upload data to GPU (clEnqueueWriteBuffer) - ",
CLErrorCodeToString(error_code)));
}
return OkStatus();
}
Status CLCommandQueue::EnqueueReadBuffer(cl_mem memory, size_t size_in_bytes,
void* data) {
auto error_code = clEnqueueReadBuffer(
queue_, memory, CL_TRUE, 0, size_in_bytes, data, 0, nullptr, nullptr);
if (error_code != CL_SUCCESS) {
return UnknownError(
absl::StrCat("Failed to read data from GPU (clEnqueueReadBuffer) - ",
CLErrorCodeToString(error_code)));
}
return OkStatus();
}
Status CLCommandQueue::WaitForCompletion() {
auto error_code = clFinish(queue_);
if (error_code != CL_SUCCESS) {
return UnknownError(
absl::StrCat("Failed to clFinish - ", CLErrorCodeToString(error_code)));
}
return OkStatus();
}
ProfilingCommandQueue::ProfilingCommandQueue(cl_command_queue queue)
: CLCommandQueue(queue) {
events_.reserve(128);
}
ProfilingCommandQueue::ProfilingCommandQueue(ProfilingCommandQueue&& queue)
: CLCommandQueue(std::move(queue)),
events_(std::move(queue.events_)),
current_label_(std::move(queue.current_label_)) {}
ProfilingCommandQueue& ProfilingCommandQueue::operator=(
ProfilingCommandQueue&& queue) {
if (this != &queue) {
events_ = std::move(queue.events_);
current_label_ = std::move(queue.current_label_);
CLCommandQueue::operator=(std::move(queue));
}
return *this;
}
void ProfilingCommandQueue::SetEventsLabel(const std::string& name) {
current_label_ = name;
}
void ProfilingCommandQueue::ResetMeasurements() { events_.clear(); }
Status ProfilingCommandQueue::DispatchImplicit(const CLKernel& kernel,
int3 grid,
int3 work_group_size) {
events_.push_back(CLEvent());
RETURN_IF_ERROR(CLCommandQueue::DispatchImplicit(
kernel, grid, work_group_size, &events_[events_.size() - 1]));
events_.back().SetName(current_label_);
return OkStatus();
}
ProfilingInfo ProfilingCommandQueue::GetProfilingInfo() const {
ProfilingInfo result;
result.dispatches.resize(events_.size());
for (int i = 0; i < events_.size(); ++i) {
result.dispatches[i].label = events_[i].GetName();
result.dispatches[i].time_ns = events_[i].GetEventTimeNs();
}
return result;
}
Status ProfilingCommandQueue::GetBestWorkGroupIndex(
const CLKernel& kernel, const DeviceInfo& device_info, const int3& grid,
const std::vector<int3>& work_group_sizes, int* index) {
// Some Adreno 3xx GPUs can report wrong numbers for some events.
const bool possible_bug_with_events =
device_info.vendor == Vendor::QUALCOMM &&
device_info.adreno_info.gpu_version < 400;
events_.resize(work_group_sizes.size());
for (int i = 0; i < work_group_sizes.size(); ++i) {
RETURN_IF_ERROR(CLCommandQueue::DispatchImplicit(
kernel, grid, work_group_sizes[i], &events_[i]));
// Reduces the rate of a memory leak on Mali for some kernels.
if (device_info.vendor == Vendor::MALI && i % 8 == 7) {
events_[i - 7].Wait();
}
if (possible_bug_with_events) {
// Try to increase the probability of a correct result.
RETURN_IF_ERROR(WaitForCompletion());
}
}
RETURN_IF_ERROR(WaitForCompletion());
// To release memory of some kernel pool on Mali.
if (device_info.vendor == Vendor::MALI) {
RETURN_IF_ERROR(kernel.ReInit());
}
int minimum_index = 0;
double minimum_time = std::numeric_limits<double>::max();
if (possible_bug_with_events) { // we will try to cut out suspicious results
double average_time = 0.0;
int average_samples_count = 0;
for (int i = 0; i < work_group_sizes.size(); ++i) {
if (events_[i].GetEventTimeMs() < 100 * 1000) { // 100 sec
average_time += events_[i].GetEventTimeMs();
average_samples_count++;
}
}
average_time /= average_samples_count;
for (int i = 0; i < work_group_sizes.size(); ++i) {
double time = events_[i].GetEventTimeMs();
if (time < minimum_time && time >= 0.1 * average_time) {
minimum_index = i;
minimum_time = time;
}
}
} else {
for (int i = 0; i < work_group_sizes.size(); ++i) {
double time = events_[i].GetEventTimeMs();
if (time < minimum_time) {
minimum_index = i;
minimum_time = time;
}
}
}
*index = minimum_index;
return OkStatus();
}
Status CreateCLCommandQueue(const CLDevice& device, const CLContext& context,
CLCommandQueue* result) {
int error_code;
cl_command_queue queue =
clCreateCommandQueue(context.context(), device.id(), 0, &error_code);
if (!queue) {
return UnknownError(absl::StrCat("Failed to create a command queue - ",
CLErrorCodeToString(error_code)));
}
*result = CLCommandQueue(queue);
return OkStatus();
}
double ProfilingCommandQueue::GetQueueExecutionTimeMs() const {
const uint64_t start = events_.front().GetStartedTimeNs();
const uint64_t end = events_.back().GetFinishedTimeNs();
const uint64_t time_ns = (end - start);
return static_cast<double>(time_ns) / 1000000.0;
}
double ProfilingCommandQueue::GetSumOfEventsTimeMs() const {
double sum = 0.0;
for (int i = 0; i < events_.size(); ++i) {
sum += events_[i].GetEventTimeMs();
}
return sum;
}
Status CreateProfilingCommandQueue(const CLDevice& device,
const CLContext& context,
ProfilingCommandQueue* result) {
int error_code;
cl_command_queue queue = clCreateCommandQueue(
context.context(), device.id(), CL_QUEUE_PROFILING_ENABLE, &error_code);
if (!queue) {
return UnknownError(absl::StrCat("Failed to create a command queue - ",
CLErrorCodeToString(error_code)));
}
*result = ProfilingCommandQueue(queue);
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite


tensorflow/lite/delegates/gpu/cl/cl_command_queue.h
@@ -0,0 +1,136 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_COMMAND_QUEUE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_COMMAND_QUEUE_H_
#include <cstdint>
#include <string>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
struct ProfilingInfo {
struct DispatchInfo {
std::string label;
uint64_t time_ns;
double GetTimeMs() const { return static_cast<double>(time_ns) * 1e-6; }
};
std::vector<DispatchInfo> dispatches;
};
// A wrapper around an OpenCL command queue.
class CLCommandQueue {
public:
CLCommandQueue() {}
explicit CLCommandQueue(cl_command_queue queue);
// Move only
CLCommandQueue(CLCommandQueue&& queue);
CLCommandQueue& operator=(CLCommandQueue&& queue);
CLCommandQueue(const CLCommandQueue&) = delete;
CLCommandQueue& operator=(const CLCommandQueue&) = delete;
virtual ~CLCommandQueue();
cl_command_queue queue() const { return queue_; }
virtual Status DispatchImplicit(const CLKernel& kernel, int3 grid,
int3 work_group_size);
Status EnqueueEvent(CLEvent* event);
Status DispatchImplicit(const CLKernel& kernel, int3 grid,
int3 work_group_size, CLEvent* event);
Status EnqueueWriteImage(cl_mem memory, int3 region, const void* data);
Status EnqueueReadImage(cl_mem memory, int3 region, void* data);
Status EnqueueWriteBuffer(cl_mem memory, size_t size_in_bytes,
const void* data);
Status EnqueueReadBuffer(cl_mem memory, size_t size_in_bytes, void* data);
Status WaitForCompletion();
protected:
void Release();
cl_command_queue queue_ = nullptr;
};
class ProfilingCommandQueue : public CLCommandQueue {
public:
ProfilingCommandQueue() {}
explicit ProfilingCommandQueue(cl_command_queue queue);
// Move only
ProfilingCommandQueue(ProfilingCommandQueue&& queue);
ProfilingCommandQueue& operator=(ProfilingCommandQueue&& queue);
ProfilingCommandQueue(const ProfilingCommandQueue&) = delete;
ProfilingCommandQueue& operator=(const ProfilingCommandQueue&) = delete;
Status DispatchImplicit(const CLKernel& kernel, int3 grid,
int3 work_group_size) override;
// Writes the index of the fastest work group among work_group_sizes.
Status GetBestWorkGroupIndex(const CLKernel& kernel,
const DeviceInfo& device_info, const int3& grid,
const std::vector<int3>& work_group_sizes,
int* index);
// Call ResetMeasurements() to start a new series of measurements.
void ResetMeasurements();
double GetQueueExecutionTimeMs() const;
// Unlike GetQueueExecutionTimeMs, this value does not include the time spent
// between kernels on the GPU (kernel launches or preparation). It is usually
// 5-10% lower than GetQueueExecutionTimeMs, because that 5-10% is spent on
// something other than kernel execution (kernel launches or preparation).
double GetSumOfEventsTimeMs() const;
// This label will be used for all subsequent dispatches.
void SetEventsLabel(const std::string& name);
ProfilingInfo GetProfilingInfo() const;
private:
std::vector<CLEvent> events_;
std::string current_label_;
};
Status CreateCLCommandQueue(const CLDevice& device, const CLContext& context,
CLCommandQueue* result);
Status CreateProfilingCommandQueue(const CLDevice& device,
const CLContext& context,
ProfilingCommandQueue* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_COMMAND_QUEUE_H_
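
For clarity (not part of the change itself), here is a minimal usage sketch of the profiling queue declared above. The kernel and the grid/work-group sizes are placeholders, and the helpers are assumed to come from this backend, inside the tflite::gpu::cl namespace:

```cpp
// Sketch: timing one kernel dispatch with ProfilingCommandQueue.
Status ProfileKernel(const CLDevice& device, const CLContext& context,
                     const CLKernel& kernel, double* total_time_ms) {
  ProfilingCommandQueue queue;
  RETURN_IF_ERROR(CreateProfilingCommandQueue(device, context, &queue));
  queue.SetEventsLabel("my_kernel");  // label attached to subsequent dispatches
  // Placeholder grid and work-group sizes.
  RETURN_IF_ERROR(queue.DispatchImplicit(kernel, {256, 256, 1}, {8, 8, 1}));
  RETURN_IF_ERROR(queue.WaitForCompletion());
  // Per-dispatch timings are also available via queue.GetProfilingInfo().
  *total_time_ms = queue.GetQueueExecutionTimeMs();
  return OkStatus();
}
```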

View File

@ -0,0 +1,123 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "absl/strings/str_cat.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_image_format.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::vector<cl_image_format> GetSupportedImage2DFormats(cl_context context,
cl_mem_flags flags) {
cl_uint num_image_formats;
cl_int error = clGetSupportedImageFormats(
context, flags, CL_MEM_OBJECT_IMAGE2D, 0, nullptr, &num_image_formats);
if (error != CL_SUCCESS) {
return {};
}
std::vector<cl_image_format> result(num_image_formats);
error = clGetSupportedImageFormats(context, flags, CL_MEM_OBJECT_IMAGE2D,
num_image_formats, &result[0], nullptr);
if (error != CL_SUCCESS) {
return {};
}
return result;
}
Status CreateCLContext(const CLDevice& device,
cl_context_properties* properties, CLContext* result) {
int error_code;
cl_device_id device_id = device.id();
cl_context context =
clCreateContext(properties, 1, &device_id, nullptr, nullptr, &error_code);
if (!context) {
return UnknownError(absl::StrCat("Failed to create a compute context - ",
CLErrorCodeToString(error_code)));
}
*result = CLContext(context);
return OkStatus();
}
} // namespace
CLContext::CLContext(cl_context context) : context_(context) {}
CLContext::CLContext(CLContext&& context) : context_(context.context_) {
context.context_ = nullptr;
}
CLContext& CLContext::operator=(CLContext&& context) {
if (this != &context) {
Release();
std::swap(context_, context.context_);
}
return *this;
}
CLContext::~CLContext() { Release(); }
void CLContext::Release() {
if (context_) {
clReleaseContext(context_);
context_ = nullptr;
}
}
bool CLContext::IsFloatTexture2DSupported(int num_channels, DataType data_type,
cl_mem_flags flags) const {
auto supported_formats = GetSupportedImage2DFormats(context_, flags);
for (auto format : supported_formats) {
if (format.image_channel_data_type == ToImageChannelType(data_type) &&
format.image_channel_order == ToChannelOrder(num_channels)) {
return true;
}
}
return false;
}
Status CreateCLContext(const CLDevice& device, CLContext* result) {
return CreateCLContext(device, nullptr, result);
}
Status CreateCLGLContext(const CLDevice& device,
cl_context_properties egl_context,
cl_context_properties egl_display, CLContext* result) {
if (!device.SupportsExtension("cl_khr_gl_sharing")) {
return UnavailableError("Device doesn't support CL-GL sharing.");
}
cl_context_properties platform =
reinterpret_cast<cl_context_properties>(device.platform());
cl_context_properties props[] = {CL_GL_CONTEXT_KHR,
egl_context,
CL_EGL_DISPLAY_KHR,
egl_display,
CL_CONTEXT_PLATFORM,
platform,
0};
return CreateCLContext(device, props, result);
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,62 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_CONTEXT_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_CONTEXT_H_
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
// A RAII wrapper around an OpenCL context
class CLContext {
public:
CLContext() {}
explicit CLContext(cl_context context);
// Move only
CLContext(CLContext&& context);
CLContext& operator=(CLContext&& context);
CLContext(const CLContext&) = delete;
CLContext& operator=(const CLContext&) = delete;
~CLContext();
cl_context context() const { return context_; }
bool IsFloatTexture2DSupported(int num_channels, DataType data_type,
cl_mem_flags flags = CL_MEM_READ_WRITE) const;
private:
void Release();
cl_context context_ = nullptr;
};
Status CreateCLContext(const CLDevice& device, CLContext* result);
Status CreateCLGLContext(const CLDevice& device,
cl_context_properties egl_context,
cl_context_properties egl_display, CLContext* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_CONTEXT_H_
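
As an illustration only, the two creation paths above can be combined as in the following hedged sketch; the EGL handles are assumed to come from an already-initialized OpenGL ES environment:

```cpp
// Sketch: create a CL-GL shared context when the extension is available,
// otherwise fall back to a plain compute context.
Status CreateContextForDevice(const CLDevice& device,
                              cl_context_properties egl_context,
                              cl_context_properties egl_display,
                              CLContext* result) {
  if (device.SupportsExtension("cl_khr_gl_sharing")) {
    return CreateCLGLContext(device, egl_context, egl_display, result);
  }
  return CreateCLContext(device, result);
}
```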

View File

@ -0,0 +1,398 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include <algorithm>
#include <string>
#include <vector>
#include "absl/strings/numbers.h"
#include "absl/strings/str_split.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
template <>
std::string GetDeviceInfo<std::string>(cl_device_id id, cl_device_info info) {
size_t size;
cl_int error = clGetDeviceInfo(id, info, 0, nullptr, &size);
if (error != CL_SUCCESS) {
return "";
}
std::string result(size - 1, 0);
error = clGetDeviceInfo(id, info, size, &result[0], nullptr);
if (error != CL_SUCCESS) {
return "";
}
return result;
}
namespace {
template <typename T>
T GetPlatformInfo(cl_platform_id id, cl_platform_info info) {
T result;
cl_int error = clGetPlatformInfo(id, info, sizeof(T), &result, nullptr);
if (error != CL_SUCCESS) {
return -1;
}
return result;
}
std::string GetPlatformInfo(cl_platform_id id, cl_platform_info info) {
size_t size;
cl_int error = clGetPlatformInfo(id, info, 0, nullptr, &size);
if (error != CL_SUCCESS) {
return "";
}
std::string result(size - 1, 0);
error = clGetPlatformInfo(id, info, size, &result[0], nullptr);
if (error != CL_SUCCESS) {
return "";
}
return result;
}
void GetDeviceWorkDimsSizes(cl_device_id id, int* result) {
int dims_count =
GetDeviceInfo<cl_uint>(id, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS);
if (dims_count < 3) {
return;
}
std::vector<size_t> limits(dims_count);
cl_int error =
clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_ITEM_SIZES,
sizeof(size_t) * dims_count, limits.data(), nullptr);
if (error != CL_SUCCESS) {
return;
}
// dims_count must be at least 3 according to spec
result[0] = limits[0];
result[1] = limits[1];
result[2] = limits[2];
}
OpenCLVersion ParseCLVersion(const std::string& version) {
const auto first_dot_pos = version.find_first_of('.');
if (first_dot_pos == std::string::npos) {
return OpenCLVersion::CL_1_0;
}
const int major = version[first_dot_pos - 1] - '0';
const int minor = version[first_dot_pos + 1] - '0';
if (major == 1) {
if (minor == 2) {
return OpenCLVersion::CL_1_2;
} else if (minor == 1) {
return OpenCLVersion::CL_1_1;
} else {
return OpenCLVersion::CL_1_0;
}
} else {
return OpenCLVersion::CL_2_0;
}
}
Vendor ParseVendor(const std::string& device_name,
const std::string& vendor_name) {
std::string d_name = device_name;
std::string v_name = vendor_name;
std::transform(d_name.begin(), d_name.end(), d_name.begin(), ::tolower);
std::transform(v_name.begin(), v_name.end(), v_name.begin(), ::tolower);
if (d_name.find("qualcomm") != std::string::npos ||
v_name.find("qualcomm") != std::string::npos) {
return Vendor::QUALCOMM;
} else if (d_name.find("mali") != std::string::npos ||
v_name.find("mali") != std::string::npos) {
return Vendor::MALI;
} else if (d_name.find("power") != std::string::npos ||
v_name.find("power") != std::string::npos) {
return Vendor::POWERVR;
} else if (d_name.find("nvidia") != std::string::npos ||
v_name.find("nvidia") != std::string::npos) {
return Vendor::NVIDIA;
} else {
return Vendor::UNKNOWN;
}
}
// Checks that gpu_version belongs to the range [min_version, max_version);
// min_version is included and max_version is excluded.
bool isGPUVersionInRange(int gpu_version, int min_version, int max_version) {
return gpu_version >= min_version && gpu_version < max_version;
}
} // namespace
// There is no rule for gpu version encoding, but we found these samples:
// Version: OpenCL C 2.0 Adreno(TM) 540 // Pixel 2
// Version: OpenCL C 2.0 Adreno(TM) 630 // Sony Compact XZ2
// Version: OpenCL C 2.0 Adreno(TM) 630 // Pixel 3
// Version: OpenCL C 2.0 Adreno(TM) 540 // Samsung S8
// Version: OpenCL C 1.2 Adreno(TM) 430 // HTC One M9
// Version: OpenCL C 2.0 Adreno(TM) 530 // Samsung S7 Edge
// Version: OpenCL C 1.2 Adreno(TM) 405 // Motorola Moto G(4)
// After the version number, the string ends.
// It is assumed that the <vendor-specific information> for Adreno GPUs has
// the following format:
// <text?><space?>Adreno(TM)<space><text?><version>
// Returns -1 if the vendor-specific information cannot be parsed.
int GetAdrenoGPUVersion(const std::string& gpu_version) {
const std::string gpu = absl::AsciiStrToLower(gpu_version);
const std::vector<absl::string_view> words = absl::StrSplit(gpu, ' ');
int i = 0;
for (; i < words.size(); ++i) {
if (words[i].find("adreno") != words[i].npos) {
break;
}
}
i += 1;
for (; i < words.size(); ++i) {
int number;
bool is_number = absl::SimpleAtoi(words[i], &number);
// Adreno GPU numbers start from 2xx, but OpenCL support is expected only from 3xx.
if (is_number && number >= 300) {
return number;
}
}
return -1;
}
std::string VendorToString(Vendor v) {
switch (v) {
case Vendor::QUALCOMM:
return "Qualcomm";
case Vendor::MALI:
return "Mali";
case Vendor::POWERVR:
return "PowerVR";
case Vendor::NVIDIA:
return "NVIDIA";
case Vendor::UNKNOWN:
return "unknown vendor";
}
}
std::string OpenCLVersionToString(OpenCLVersion version) {
switch (version) {
case OpenCLVersion::CL_1_0:
return "1.0";
case OpenCLVersion::CL_1_1:
return "1.1";
case OpenCLVersion::CL_1_2:
return "1.2";
case OpenCLVersion::CL_2_0:
return "2.0";
}
}
AdrenoInfo::AdrenoInfo(const std::string& device_version)
: gpu_version(GetAdrenoGPUVersion(device_version)) {}
int AdrenoInfo::GetMaximumWavesCount() const {
if (gpu_version < 400) {
return -1; // Adreno 3xx does not support it currently
} else if (gpu_version >= 400 && gpu_version < 500) {
return -1; // Adreno 4xx does not support it currently
} else if (gpu_version >= 500 && gpu_version < 600) {
return -1; // Adreno 5xx does not support it currently
} else if (gpu_version >= 600 && gpu_version < 700) {
return gpu_version == 640 ? 30 : 16;
} else {
return -1;  // Adreno 7xx and higher do not exist yet
}
}
int AdrenoInfo::GetRegisterMemorySizePerComputeUnit() const {
if (gpu_version < 400) {
return -1; // Adreno 3xx does not support it currently
} else if (gpu_version >= 400 && gpu_version < 500) {
return -1; // Adreno 4xx does not support it currently
} else if (gpu_version >= 500 && gpu_version < 600) {
return -1; // Adreno 5xx does not support it currently
} else if (gpu_version >= 600 && gpu_version < 700) {
return gpu_version == 640 ? 128 * 144 * 16 : 128 * 96 * 16;
} else {
return -1;  // Adreno 7xx and higher do not exist yet
}
}
int AdrenoInfo::GetMaximumWavesCount(int register_footprint_per_tread,
bool full_wave) const {
const int register_usage_per_wave =
GetWaveSize(full_wave) * register_footprint_per_tread;
const int possible_waves_count =
GetRegisterMemorySizePerComputeUnit() / register_usage_per_wave;
return std::min(possible_waves_count, GetMaximumWavesCount());
}
int AdrenoInfo::GetWaveSize(bool full_wave) const {
if (gpu_version < 400) {
return -1; // Adreno 3xx does not support it currently
} else if (gpu_version < 600) {
return full_wave ? 64 : 32;
} else {
return full_wave ? 128 : 64;
}
}
DeviceInfo::DeviceInfo(cl_device_id id)
: adreno_info(GetDeviceInfo<std::string>(id, CL_DEVICE_OPENCL_C_VERSION)) {
const auto device_name = GetDeviceInfo<std::string>(id, CL_DEVICE_NAME);
const auto vendor_name = GetDeviceInfo<std::string>(id, CL_DEVICE_VENDOR);
vendor = ParseVendor(device_name, vendor_name);
cl_version = ParseCLVersion(
GetDeviceInfo<std::string>(id, CL_DEVICE_OPENCL_C_VERSION));
extensions =
absl::StrSplit(GetDeviceInfo<std::string>(id, CL_DEVICE_EXTENSIONS), ' ');
supports_fp16 = false;
for (const auto& ext : extensions) {
if (ext == "cl_khr_fp16") {
supports_fp16 = true;
}
}
compute_units_count = GetDeviceInfo<cl_uint>(id, CL_DEVICE_MAX_COMPUTE_UNITS);
image2d_max_width = GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE2D_MAX_WIDTH);
image2d_max_height = GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE2D_MAX_HEIGHT);
if (cl_version >= OpenCLVersion::CL_1_2) {
image_buffer_max_size =
GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE);
image_array_max_layers =
GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE_MAX_ARRAY_SIZE);
}
GetDeviceWorkDimsSizes(id, max_work_items_sizes);
}
bool DeviceInfo::SupportsTextureArray() const {
return cl_version >= OpenCLVersion::CL_1_2;
}
CLDevice::CLDevice(cl_device_id id, cl_platform_id platform_id)
: id_(id), platform_id_(platform_id), info_(id) {}
CLDevice::CLDevice(const CLDevice& device)
: id_(device.id_), platform_id_(device.platform_id_), info_(device.info_) {}
CLDevice& CLDevice::operator=(const CLDevice& device) {
if (this != &device) {
id_ = device.id_;
platform_id_ = device.platform_id_;
info_ = device.info_;
}
return *this;
}
CLDevice::CLDevice(CLDevice&& device)
: id_(device.id_),
platform_id_(device.platform_id_),
info_(std::move(device.info_)) {
device.id_ = nullptr;
device.platform_id_ = nullptr;
}
CLDevice& CLDevice::operator=(CLDevice&& device) {
if (this != &device) {
id_ = nullptr;
platform_id_ = nullptr;
std::swap(id_, device.id_);
std::swap(platform_id_, device.platform_id_);
info_ = std::move(device.info_);
}
return *this;
}
bool CLDevice::SupportsFP16() const { return info_.supports_fp16; }
bool CLDevice::SupportsExtension(const std::string& extension) const {
for (const auto& ext : info_.extensions) {
if (ext == extension) {
return true;
}
}
return false;
}
bool CLDevice::SupportsTextureArray() const {
return info_.SupportsTextureArray();
}
std::string CLDevice::GetPlatformVersion() const {
return GetPlatformInfo(platform_id_, CL_PLATFORM_VERSION);
}
bool CLDevice::IsAdreno() const { return info_.vendor == Vendor::QUALCOMM; }
bool CLDevice::IsAdreno3xx() const {
return IsAdreno() &&
isGPUVersionInRange(info_.adreno_info.gpu_version, 300, 400);
}
bool CLDevice::IsAdreno4xx() const {
return IsAdreno() &&
isGPUVersionInRange(info_.adreno_info.gpu_version, 400, 500);
}
bool CLDevice::IsAdreno5xx() const {
return IsAdreno() &&
isGPUVersionInRange(info_.adreno_info.gpu_version, 500, 600);
}
bool CLDevice::IsAdreno6xx() const {
return IsAdreno() &&
isGPUVersionInRange(info_.adreno_info.gpu_version, 600, 700);
}
bool CLDevice::IsAdreno6xxOrHigher() const {
return IsAdreno() && info_.adreno_info.gpu_version >= 600;
}
bool CLDevice::SupportsOneLayerTextureArray() const {
return !IsAdreno() || info_.adreno_info.support_one_layer_texture_array;
}
void CLDevice::DisableOneLayerTextureArray() {
info_.adreno_info.support_one_layer_texture_array = false;
}
Status CreateDefaultGPUDevice(CLDevice* result) {
cl_uint num_platforms;
clGetPlatformIDs(0, nullptr, &num_platforms);
if (num_platforms == 0) {
return UnknownError("No supported OpenCL platform.");
}
std::vector<cl_platform_id> platforms(num_platforms);
clGetPlatformIDs(num_platforms, platforms.data(), nullptr);
cl_uint num_devices;
clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, nullptr, &num_devices);
if (num_devices == 0) {
return UnknownError("No GPU on current platform.");
}
std::vector<cl_device_id> devices(num_devices);
clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, devices.data(),
nullptr);
*result = CLDevice(devices[0], platforms[0]);
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,140 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_DEVICE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_DEVICE_H_
#include <string>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
enum class Vendor { QUALCOMM, MALI, POWERVR, NVIDIA, UNKNOWN };
std::string VendorToString(Vendor v);
enum class OpenCLVersion { CL_1_0, CL_1_1, CL_1_2, CL_2_0 };
std::string OpenCLVersionToString(OpenCLVersion version);
// For use only in cl_device.cc, but placed here so that it can be tested.
int GetAdrenoGPUVersion(const std::string& gpu_version);
struct AdrenoInfo {
AdrenoInfo() = default;
explicit AdrenoInfo(const std::string& device_version);
int gpu_version = -1; // can be, for example, 405/430/540/530/630 etc.
// This function returns a physical parameter of Adreno 6xx GPUs that is not
// very well documented.
// We obtained it using the Snapdragon Profiler.
int GetMaximumWavesCount() const;
// Returns the amount of register memory per CU (Compute Unit) in bytes.
int GetRegisterMemorySizePerComputeUnit() const;
// Returns the maximum possible number of waves based on register usage.
int GetMaximumWavesCount(int register_footprint_per_tread,
bool full_wave = true) const;
int GetWaveSize(bool full_wave) const;
// Not supported on some Adreno devices with specific driver version.
// b/131099086
bool support_one_layer_texture_array = true;
};
struct DeviceInfo {
DeviceInfo() = default;
explicit DeviceInfo(cl_device_id id);
bool SupportsTextureArray() const;
std::vector<std::string> extensions;
bool supports_fp16;
Vendor vendor;
OpenCLVersion cl_version;
int compute_units_count;
int image2d_max_width;
int image2d_max_height;
int image_buffer_max_size;
int image_array_max_layers;
int max_work_items_sizes[3];
AdrenoInfo adreno_info;
};
// A wrapper around an OpenCL device id
class CLDevice {
public:
CLDevice() = default;
CLDevice(cl_device_id id, cl_platform_id platform_id);
CLDevice(CLDevice&& device);
CLDevice& operator=(CLDevice&& device);
CLDevice(const CLDevice&);
CLDevice& operator=(const CLDevice&);
~CLDevice() {}
cl_device_id id() const { return id_; }
cl_platform_id platform() const { return platform_id_; }
std::string GetPlatformVersion() const;
const DeviceInfo& GetInfo() const { return info_; }
const DeviceInfo* GetInfoPtr() const { return &info_; }
Vendor vendor() const { return info_.vendor; }
OpenCLVersion cl_version() const { return info_.cl_version; }
bool SupportsFP16() const;
bool SupportsTextureArray() const;
bool SupportsExtension(const std::string& extension) const;
bool IsAdreno() const;
bool IsAdreno3xx() const;
bool IsAdreno4xx() const;
bool IsAdreno5xx() const;
bool IsAdreno6xx() const;
bool IsAdreno6xxOrHigher() const;
// To track a bug on some Adreno devices. b/131099086
bool SupportsOneLayerTextureArray() const;
void DisableOneLayerTextureArray();
private:
cl_device_id id_ = nullptr;
cl_platform_id platform_id_ = nullptr;
DeviceInfo info_;
};
Status CreateDefaultGPUDevice(CLDevice* result);
template <typename T>
T GetDeviceInfo(cl_device_id id, cl_device_info info) {
T result;
cl_int error = clGetDeviceInfo(id, info, sizeof(T), &result, nullptr);
if (error != CL_SUCCESS) {
return -1;
}
return result;
}
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_DEVICE_H_
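
A short usage sketch of the device helpers above (illustrative only; the capability check at the end is an assumption about how a caller might use the information):

```cpp
// Sketch: pick the default GPU and inspect a few of its properties.
Status DescribeDefaultGPU() {
  CLDevice device;
  RETURN_IF_ERROR(CreateDefaultGPUDevice(&device));
  // Raw OpenCL queries are available through the template helper.
  const cl_ulong global_mem_bytes =
      GetDeviceInfo<cl_ulong>(device.id(), CL_DEVICE_GLOBAL_MEM_SIZE);
  if (device.IsAdreno6xxOrHigher() && device.SupportsFP16()) {
    // A caller might, for example, prefer FP16 precision here.
  }
  (void)global_mem_bytes;
  return OkStatus();
}
```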

View File

@ -0,0 +1,41 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_ERRORS_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_ERRORS_H_
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
// @return OkStatus if error_code is CL_SUCCESS; otherwise translates the
// error code into an error message.
inline Status GetOpenCLError(cl_int error_code) {
if (error_code == CL_SUCCESS) {
return OkStatus();
}
return InternalError("OpenCL error: " + CLErrorCodeToString(error_code));
}
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_ERRORS_H_
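
A typical use of this helper is to wrap a raw OpenCL call whose return code is the only error information; a minimal sketch with a hypothetical wrapper:

```cpp
// Sketch: turn the result of a raw OpenCL call into a Status.
Status Finish(cl_command_queue queue) {
  return GetOpenCLError(clFinish(queue));
}
```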

View File

@ -0,0 +1,81 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
namespace tflite {
namespace gpu {
namespace cl {
CLEvent::CLEvent(cl_event event) : event_(event) {}
CLEvent::CLEvent(CLEvent&& event)
: event_(event.event_), name_(std::move(event.name_)) {
event.event_ = nullptr;
}
CLEvent& CLEvent::operator=(CLEvent&& event) {
if (this != &event) {
Release();
std::swap(event_, event.event_);
name_ = std::move(event.name_);
}
return *this;
}
uint64_t CLEvent::GetStartedTimeNs() const {
cl_ulong time_ns;
clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, sizeof(cl_ulong),
&time_ns, nullptr);
return time_ns;
}
uint64_t CLEvent::GetFinishedTimeNs() const {
cl_ulong time_ns;
clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
&time_ns, nullptr);
return time_ns;
}
double CLEvent::GetEventTimeMs() const {
const uint64_t start = GetStartedTimeNs();
const uint64_t end = GetFinishedTimeNs();
const uint64_t time_ns = (end - start);
return static_cast<double>(time_ns) * 1e-6;
}
uint64_t CLEvent::GetEventTimeNs() const {
return GetFinishedTimeNs() - GetStartedTimeNs();
}
void CLEvent::SetName(const std::string& name) { name_ = name; }
void CLEvent::Wait() const { clWaitForEvents(1, &event_); }
CLEvent::~CLEvent() { Release(); }
void CLEvent::Release() {
if (event_) {
clReleaseEvent(event_);
event_ = nullptr;
}
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,69 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_EVENT_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_EVENT_H_
#include <cstdint>
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
namespace tflite {
namespace gpu {
namespace cl {
// A RAII wrapper around an OpenCL event
class CLEvent {
public:
CLEvent() {}
explicit CLEvent(cl_event event);
// Move only
CLEvent(CLEvent&& event);
CLEvent& operator=(CLEvent&& event);
CLEvent(const CLEvent&) = delete;
CLEvent& operator=(const CLEvent&) = delete;
~CLEvent();
uint64_t GetStartedTimeNs() const;
uint64_t GetFinishedTimeNs() const;
double GetEventTimeMs() const;
uint64_t GetEventTimeNs() const;
void Wait() const;
cl_event event() const { return event_; }
bool is_valid() const { return event_ != nullptr; }
void SetName(const std::string& name);
std::string GetName() const { return name_; }
private:
void Release();
cl_event event_ = nullptr;
std::string name_; // optional, for profiling mostly
};
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_EVENT_H_

View File

@ -0,0 +1,50 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/cl_image_format.h"
namespace tflite {
namespace gpu {
namespace cl {
cl_channel_order ToChannelOrder(int num_channels) {
switch (num_channels) {
case 1:
return CL_R;
case 2:
return CL_RG;
case 3:
return CL_RGB;
case 4:
return CL_RGBA;
default:
return -1;
}
}
cl_channel_type ToImageChannelType(DataType data_type) {
switch (data_type) {
case DataType::FLOAT32:
return CL_FLOAT;
case DataType::FLOAT16:
return CL_HALF_FLOAT;
default:
return -1;
}
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,34 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_IMAGE_FORMAT_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_IMAGE_FORMAT_H_
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
namespace tflite {
namespace gpu {
namespace cl {
cl_channel_order ToChannelOrder(int num_channels);
cl_channel_type ToImageChannelType(DataType data_type);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_IMAGE_FORMAT_H_
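
These converters are what CLContext::IsFloatTexture2DSupported matches against; a small illustrative sketch assembling a cl_image_format from them (the channel count and data type are placeholders):

```cpp
// Sketch: build an OpenCL image format for an RGBA FP16 texture.
cl_image_format MakeRGBAHalfFormat() {
  cl_image_format format;
  format.image_channel_order = ToChannelOrder(4);  // CL_RGBA
  format.image_channel_data_type =
      ToImageChannelType(DataType::FLOAT16);       // CL_HALF_FLOAT
  return format;
}
```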

View File

@ -0,0 +1,178 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
#include "absl/strings/str_cat.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_program.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
Status GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id,
int* result) {
size_t max_work_group_size;
cl_int error_code =
clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE,
sizeof(size_t), &max_work_group_size, nullptr);
if (error_code != CL_SUCCESS) {
return UnknownError(
absl::StrCat("Failed to get info CL_KERNEL_WORK_GROUP_SIZE ",
CLErrorCodeToString(error_code)));
}
*result = static_cast<int>(max_work_group_size);
return OkStatus();
}
Status GetKernelPrivateMemorySize(cl_kernel kernel, cl_device_id device_id,
int* result) {
cl_ulong private_mem_size;
cl_int error_code =
clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_PRIVATE_MEM_SIZE,
sizeof(cl_ulong), &private_mem_size, nullptr);
if (error_code != CL_SUCCESS) {
return UnknownError(
absl::StrCat("Failed to get info CL_KERNEL_PRIVATE_MEM_SIZE ",
CLErrorCodeToString(error_code)));
}
*result = static_cast<int>(private_mem_size);
return OkStatus();
}
} // namespace
CLKernel::CLKernel(CLKernel&& kernel)
: private_memory_size_(kernel.private_memory_size_),
max_work_group_size_(kernel.max_work_group_size_),
binding_counter_(kernel.binding_counter_),
function_name_(std::move(kernel.function_name_)),
program_(kernel.program_),
kernel_(kernel.kernel_) {
kernel.kernel_ = nullptr;
}
CLKernel& CLKernel::operator=(CLKernel&& kernel) {
if (this != &kernel) {
Release();
std::swap(private_memory_size_, kernel.private_memory_size_);
std::swap(max_work_group_size_, kernel.max_work_group_size_);
std::swap(binding_counter_, kernel.binding_counter_);
function_name_ = std::move(kernel.function_name_);
std::swap(program_, kernel.program_);
std::swap(kernel_, kernel.kernel_);
}
return *this;
}
CLKernel::~CLKernel() { Release(); }
Status CLKernel::ReInit() const {
clReleaseKernel(kernel_);
cl_kernel* kern_ptr = const_cast<cl_kernel*>(&kernel_);
int error_code;
*kern_ptr = clCreateKernel(program_, function_name_.c_str(), &error_code);
if (!kernel_ || error_code != CL_SUCCESS) {
*kern_ptr = nullptr;
return UnknownError(absl::StrCat("Failed to create ", function_name_,
CLErrorCodeToString(error_code)));
}
return OkStatus();
}
void CLKernel::Release() {
if (kernel_) {
clReleaseKernel(kernel_);
clReleaseProgram(program_);
kernel_ = nullptr;
}
}
Status CLKernel::CreateFromProgram(const CLProgram& program,
const std::string& function_name) {
int error_code;
function_name_ = function_name;
kernel_ =
clCreateKernel(program.program(), function_name.c_str(), &error_code);
if (!kernel_ || error_code != CL_SUCCESS) {
kernel_ = nullptr;
return UnknownError(absl::StrCat("Failed to create ", function_name,
CLErrorCodeToString(error_code)));
}
program_ = program.program();
clRetainProgram(program_);
RETURN_IF_ERROR(GetKernelPrivateMemorySize(kernel_, program.GetDeviceId(),
&private_memory_size_));
RETURN_IF_ERROR(GetKernelMaxWorkGroupSize(kernel_, program.GetDeviceId(),
&max_work_group_size_));
return OkStatus();
}
Status CLKernel::SetMemory(int index, cl_mem memory) {
return SetBytes(index, &memory, sizeof(cl_mem));
}
Status CLKernel::SetMemoryAuto(cl_mem memory) {
return SetBytesAuto(&memory, sizeof(cl_mem));
}
Status CLKernel::SetBytes(int index, const void* ptr, int length) const {
const int error_code = clSetKernelArg(kernel_, index, length, ptr);
if (error_code != CL_SUCCESS) {
return UnknownError(absl::StrCat("Failed to set kernel arguments - ",
CLErrorCodeToString(error_code)));
}
return OkStatus();
}
Status CLKernel::SetBytesAuto(const void* ptr, int length) {
const int error_code = clSetKernelArg(kernel_, binding_counter_, length, ptr);
if (error_code != CL_SUCCESS) {
return UnknownError(absl::StrCat("Failed to set kernel arguments - ",
CLErrorCodeToString(error_code),
"(at index - ", binding_counter_, ")"));
}
binding_counter_++;
return OkStatus();
}
template <>
Status CLKernel::SetBytes<FLT>(int index, const FLT& value) const {
return SetBytes(index, value.GetData(), value.GetSize());
}
template <>
Status CLKernel::SetBytes<FLT4>(int index, const FLT4& value) const {
return SetBytes(index, value.GetData(), value.GetSize());
}
template <>
Status CLKernel::SetBytesAuto<FLT>(const FLT& value) {
return SetBytesAuto(value.GetData(), value.GetSize());
}
template <>
Status CLKernel::SetBytesAuto<FLT4>(const FLT4& value) {
return SetBytesAuto(value.GetData(), value.GetSize());
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,105 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_KERNEL_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_KERNEL_H_
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_program.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/flt_type.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
// Argument binding to a CLKernel can be manual or automatic.
// In manual binding you specify the binding index explicitly.
// In automatic binding, the index is auto-incremented with every binding call.
// If you use automatic mode, you must call ResetBindingCounter before binding
// the parameters.
class CLKernel {
public:
CLKernel() {}
// Move only
CLKernel(CLKernel&& kernel);
CLKernel& operator=(CLKernel&& kernel);
CLKernel(const CLKernel&) = delete;
CLKernel& operator=(const CLKernel&) = delete;
~CLKernel();
cl_kernel kernel() const { return kernel_; }
Status CreateFromProgram(const CLProgram& program,
const std::string& function_name);
Status SetMemory(int index, cl_mem memory);
Status SetMemoryAuto(cl_mem memory);
template <typename T>
Status SetBytes(int index, const T& value) const {
return SetBytes(index, static_cast<const void*>(&value), sizeof(T));
}
template <typename T>
Status SetBytesAuto(const T& value) {
return SetBytesAuto(static_cast<const void*>(&value), sizeof(T));
}
int GetPrivateMemorySize() const { return private_memory_size_; }
int GetMaxWorkGroupSize() const { return max_work_group_size_; }
void ResetBindingCounter() { binding_counter_ = 0; }
// Do not use this function; it is a workaround for a memory leak on Mali.
Status ReInit() const;
private:
void Release();
Status SetBytes(int index, const void* ptr, int length) const;
Status SetBytesAuto(const void* ptr, int length);
int private_memory_size_;
int max_work_group_size_;
int binding_counter_ = -1;
std::string function_name_;
// Reference to the program from which the kernel was created.
cl_program program_ = nullptr;
cl_kernel kernel_ = nullptr;
};
template <>
Status CLKernel::SetBytes<FLT>(int index, const FLT& value) const;
template <>
Status CLKernel::SetBytes<FLT4>(int index, const FLT4& value) const;
template <>
Status CLKernel::SetBytesAuto<FLT>(const FLT& value);
template <>
Status CLKernel::SetBytesAuto<FLT4>(const FLT4& value);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_KERNEL_H_
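
To make the two binding modes concrete, here is a hedged sketch that binds the same arguments manually and then automatically (the argument layout is a placeholder):

```cpp
// Sketch: manual vs. automatic argument binding on a CLKernel.
Status BindArguments(CLKernel* kernel, cl_mem src, cl_mem dst, int size) {
  // Manual binding: indices are specified explicitly.
  RETURN_IF_ERROR(kernel->SetMemory(0, src));
  RETURN_IF_ERROR(kernel->SetMemory(1, dst));
  RETURN_IF_ERROR(kernel->SetBytes(2, size));

  // Automatic binding: reset the counter first, then bind in order.
  kernel->ResetBindingCounter();
  RETURN_IF_ERROR(kernel->SetMemoryAuto(src));
  RETURN_IF_ERROR(kernel->SetMemoryAuto(dst));
  RETURN_IF_ERROR(kernel->SetBytesAuto(size));
  return OkStatus();
}
```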

View File

@ -0,0 +1,37 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/cl_memory.h"
namespace tflite {
namespace gpu {
namespace cl {
cl_mem_flags ToClMemFlags(AccessType access_type) {
switch (access_type) {
case AccessType::READ:
return CL_MEM_READ_ONLY;
case AccessType::WRITE:
return CL_MEM_WRITE_ONLY;
case AccessType::READ_WRITE:
return CL_MEM_READ_WRITE;
}
return CL_MEM_READ_ONLY; // unreachable
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,89 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_MEMORY_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_MEMORY_H_
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/common/access_type.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
// RAII wrapper for OpenCL memory object.
//
// CLMemory is movable but not copyable.
class CLMemory {
public:
// Creates invalid object.
CLMemory() : CLMemory(nullptr, false) {}
CLMemory(cl_mem memory, bool has_ownership)
: memory_(memory), has_ownership_(has_ownership) {}
// Move-only
CLMemory(const CLMemory&) = delete;
CLMemory& operator=(const CLMemory&) = delete;
CLMemory(CLMemory&& image)
: memory_(image.memory_), has_ownership_(image.has_ownership_) {
image.memory_ = nullptr;
}
~CLMemory() { Invalidate(); }
CLMemory& operator=(CLMemory&& image) {
if (this != &image) {
Invalidate();
std::swap(memory_, image.memory_);
has_ownership_ = image.has_ownership_;
}
return *this;
}
cl_mem memory() const { return memory_; }
bool is_valid() const { return memory_ != nullptr; }
// @return true if this object actually owns the corresponding CL memory
// and manages its lifetime.
bool has_ownership() const { return has_ownership_; }
cl_mem Release() {
cl_mem to_return = memory_;
memory_ = nullptr;
return to_return;
}
private:
void Invalidate() {
if (memory_ && has_ownership_) {
clReleaseMemObject(memory_);
}
memory_ = nullptr;
}
cl_mem memory_ = nullptr;
bool has_ownership_ = false;
};
cl_mem_flags ToClMemFlags(AccessType access_type);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_MEMORY_H_
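
As an illustration of the ownership flag, a hedged sketch that creates a buffer and hands its lifetime to CLMemory; it assumes the util.h and absl string helpers used by the neighboring files are available:

```cpp
// Sketch: create a read-only device buffer and let CLMemory own it.
Status CreateReadOnlyBuffer(const CLContext& context, size_t size_in_bytes,
                            CLMemory* result) {
  cl_int error_code;
  cl_mem buffer =
      clCreateBuffer(context.context(), ToClMemFlags(AccessType::READ),
                     size_in_bytes, nullptr, &error_code);
  if (!buffer) {
    return UnknownError(absl::StrCat("Failed to create buffer - ",
                                     CLErrorCodeToString(error_code)));
  }
  // has_ownership == true: the wrapper will call clReleaseMemObject.
  *result = CLMemory(buffer, /*has_ownership=*/true);
  return OkStatus();
}
```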

View File

@ -0,0 +1,186 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/cl_program.h"
#include <cstdint>
#include <cstring>
#include <vector>
#include "absl/strings/str_cat.h"
#include "absl/types/span.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::string GetProgramBuildInfo(cl_program program, cl_device_id id,
cl_program_build_info info) {
size_t size;
cl_int error_code =
clGetProgramBuildInfo(program, id, info, 0, nullptr, &size);
if (error_code != CL_SUCCESS) {
return absl::StrCat("Failed to GetProgramBuildInfo - ",
CLErrorCodeToString(error_code));
}
std::string result(size - 1, 0);
error_code =
clGetProgramBuildInfo(program, id, info, size, &result[0], nullptr);
if (error_code != CL_SUCCESS) {
return absl::StrCat("Failed to GetProgramBuildInfo - ",
CLErrorCodeToString(error_code));
}
return result;
}
Status GetBinarySize(cl_program program, size_t* binary_size) {
cl_int error_code = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
sizeof(size_t), binary_size, nullptr);
if (error_code != CL_SUCCESS) {
return UnknownError(absl::StrCat("Failed to get program binary size - ",
CLErrorCodeToString(error_code)));
}
return OkStatus();
}
Status BuildProgram(cl_program program, const CLDevice& device,
const std::string& compiler_options) {
const int error_code = clBuildProgram(
program, 0, nullptr, compiler_options.c_str(), nullptr, nullptr);
if (error_code != CL_SUCCESS) {
return UnknownError(absl::StrCat(
"Failed to build program executable - ",
CLErrorCodeToString(error_code),
GetProgramBuildInfo(program, device.id(), CL_PROGRAM_BUILD_LOG)));
}
return OkStatus();
}
std::string CompilerOptionToString(const CLDevice& device,
CompilerOptions option) {
switch (option) {
case CompilerOptions::ADRENO_FULL_SIMD_LINE:
if (device.GetInfo().adreno_info.gpu_version < 500) {
return "-qcom-accelerate-16-bit";
} else {
return "-qcom-accelerate-16-bit=true";
}
}
}
} // namespace
std::string CompilerOptionsToString(
const CLDevice& device,
const std::vector<CompilerOptions>& compiler_options) {
std::string result;
for (auto option : compiler_options) {
absl::StrAppend(&result, CompilerOptionToString(device, option), " ");
}
return result;
}
CLProgram::CLProgram(cl_program program, cl_device_id device_id)
: program_(program), device_id_(device_id) {}
CLProgram::CLProgram(CLProgram&& program)
: program_(program.program_), device_id_(program.device_id_) {
program.program_ = nullptr;
}
CLProgram& CLProgram::operator=(CLProgram&& program) {
if (this != &program) {
Release();
std::swap(program_, program.program_);
std::swap(device_id_, program.device_id_);
}
return *this;
}
CLProgram::~CLProgram() { Release(); }
void CLProgram::Release() {
if (program_) {
clReleaseProgram(program_);
program_ = nullptr;
}
}
Status CLProgram::GetBinary(std::vector<uint8_t>* result) const {
size_t binary_size;
RETURN_IF_ERROR(GetBinarySize(program_, &binary_size));
result->resize(result->size() + binary_size);
uint8_t* binary_ptr = result->data() + result->size() - binary_size;
cl_int error_code = clGetProgramInfo(program_, CL_PROGRAM_BINARIES,
binary_size, &binary_ptr, nullptr);
if (error_code != CL_SUCCESS) {
return UnknownError(absl::StrCat("Failed to get program binary - ",
CLErrorCodeToString(error_code)));
}
return OkStatus();
}
Status CreateCLProgram(const std::string& code,
const std::string& compiler_options,
const CLContext& context, const CLDevice& device,
CLProgram* result) {
int error_code;
const char* source = code.c_str();
cl_program program = clCreateProgramWithSource(context.context(), 1, &source,
nullptr, &error_code);
if (!program || error_code != CL_SUCCESS) {
return UnknownError(absl::StrCat("Failed to create compute program - ",
CLErrorCodeToString(error_code)));
}
*result = CLProgram(program, device.id());
RETURN_IF_ERROR(BuildProgram(program, device, compiler_options));
return OkStatus();
}
Status CreateCLProgramFromBinary(const CLContext& context,
const CLDevice& device,
absl::Span<const uint8_t> binary,
CLProgram* result) {
cl_int binary_status;
cl_int error_code;
cl_device_id devices_list[] = {device.id()};
size_t binary_size = binary.size();
const uint8_t* binary_pointer = binary.data();
cl_program program = clCreateProgramWithBinary(
context.context(), 1, devices_list, &binary_size, &binary_pointer,
&binary_status, &error_code);
if (binary_status != CL_SUCCESS) {
return UnknownError(absl::StrCat(
"Something wrong with binary after clCreateProgramWithBinary - ",
binary_status));
}
if (error_code != CL_SUCCESS) {
return UnknownError(absl::StrCat("Failed to create program - ",
CLErrorCodeToString(error_code)));
}
*result = CLProgram(program, device.id());
return BuildProgram(program, device, "");
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,90 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_PROGRAM_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_PROGRAM_H_
#include <cstdint>
#include <vector>
#include "absl/types/span.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
// ADRENO_FULL_SIMD_LINE:
// Adreno can have two SIMD sizes.
// On Adreno 4xx/5xx it is 32/64, on Adreno 6xx it is 64/128.
// Some of our algorithms rely on the exact size, for example on the full
// SIMD size, so we need this option.
// The option is actually -qcom-accelerate-16-bit, but it controls the SIMD size.
enum class CompilerOptions { ADRENO_FULL_SIMD_LINE };
std::string CompilerOptionsToString(
const CLDevice& device,
const std::vector<CompilerOptions>& compiler_options);
class CLProgram {
public:
CLProgram() {}
CLProgram(cl_program program, cl_device_id device_id);
// Move only
CLProgram(CLProgram&& program);
CLProgram& operator=(CLProgram&& program);
CLProgram(const CLProgram&) = delete;
CLProgram& operator=(const CLProgram&) = delete;
~CLProgram();
cl_program program() const { return program_; }
// Returns the cl_device_id associated with the program object.
// This can be the device associated with the context on which the program
// object was created, or the device that was specified when the program
// object was created with clCreateProgramWithBinary.
cl_device_id GetDeviceId() const { return device_id_; }
Status GetBinary(std::vector<uint8_t>* result) const;
private:
void Release();
cl_program program_ = nullptr;
// reference
cl_device_id device_id_ = nullptr;
};
Status CreateCLProgram(const std::string& code,
const std::string& compiler_options,
const CLContext& context, const CLDevice& device,
CLProgram* result);
Status CreateCLProgramFromBinary(const CLContext& context,
const CLDevice& device,
absl::Span<const uint8_t> binary,
CLProgram* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_PROGRAM_H_
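
The binary API above is intended for caching compiled programs between runs; a hedged sketch of the round trip (where the binary is persisted is left out):

```cpp
// Sketch: compile once, extract the binary, and recreate the program from it.
Status RoundTripProgram(const CLContext& context, const CLDevice& device,
                        const std::string& code) {
  CLProgram program;
  RETURN_IF_ERROR(CreateCLProgram(code, /*compiler_options=*/"", context,
                                  device, &program));
  std::vector<uint8_t> binary;
  RETURN_IF_ERROR(program.GetBinary(&binary));
  // Later (e.g. on the next application start) the stored binary can be
  // reused, skipping source compilation.
  CLProgram from_binary;
  return CreateCLProgramFromBinary(context, device,
                                   absl::MakeConstSpan(binary), &from_binary);
}
```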

View File

@ -0,0 +1,17 @@
namespace tflite.gpu.cl.data;
file_identifier "AFCM";
file_extension "jetbin";
table Program {
fingerprint:uint64;
binary:[ubyte];
}
table CompiledCache {
driver_version:string;
programs:[Program];
}
root_type CompiledCache;

View File

@ -0,0 +1,71 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/egl_sync.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_call.h"
namespace tflite {
namespace gpu {
namespace cl {
Status EglSync::NewFence(EGLDisplay display, EglSync* sync) {
EGLSyncKHR egl_sync;
RETURN_IF_ERROR(TFLITE_GPU_CALL_EGL(eglCreateSyncKHR, &egl_sync, display,
EGL_SYNC_FENCE_KHR, nullptr));
if (egl_sync == EGL_NO_SYNC_KHR) {
return InternalError("Returned empty KHR EGL sync");
}
*sync = EglSync(display, egl_sync);
return OkStatus();
}
EglSync& EglSync::operator=(EglSync&& sync) {
if (this != &sync) {
Invalidate();
std::swap(sync_, sync.sync_);
display_ = sync.display_;
}
return *this;
}
void EglSync::Invalidate() {
if (sync_ != EGL_NO_SYNC_KHR) {
eglDestroySyncKHR(display_, sync_);
sync_ = EGL_NO_SYNC_KHR;
}
}
Status EglSync::ServerWait() {
EGLint result;
RETURN_IF_ERROR(
TFLITE_GPU_CALL_EGL(eglWaitSyncKHR, &result, display_, sync_, 0));
return result == EGL_TRUE ? OkStatus() : InternalError("eglWaitSync failed");
}
Status EglSync::ClientWait() {
EGLint result;
// TODO(akulik): make it active wait for better performance
RETURN_IF_ERROR(TFLITE_GPU_CALL_EGL(eglClientWaitSyncKHR, &result, display_,
sync_, EGL_SYNC_FLUSH_COMMANDS_BIT_KHR,
EGL_FOREVER_KHR));
return result == EGL_CONDITION_SATISFIED_KHR
? OkStatus()
: InternalError("eglClientWaitSync failed");
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,78 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_EGL_SYNC_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_EGL_SYNC_H_
#include <EGL/egl.h>
#include <EGL/eglext.h>
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
// RAII wrapper for EGL sync object.
// EglSync is moveable but not copyable.
class EglSync {
public:
// Creates a fence in OpenGL command stream. This sync is enqueued and *not*
// flushed.
//
// Depends on EGL_KHR_fence_sync extension.
static Status NewFence(EGLDisplay display, EglSync* sync);
// Creates invalid object.
EglSync() : EglSync(EGL_NO_DISPLAY, EGL_NO_SYNC_KHR) {}
EglSync(EGLDisplay display, EGLSyncKHR sync)
: display_(display), sync_(sync) {}
// Move-only
EglSync(EglSync&& sync);
EglSync& operator=(EglSync&& sync);
EglSync(const EglSync&) = delete;
EglSync& operator=(const EglSync&) = delete;
~EglSync() { Invalidate(); }
// Causes GPU to block and wait until this sync has been signaled.
// This call does not block and returns immediately.
Status ServerWait();
// Causes CPU to block and wait until this sync has been signaled.
Status ClientWait();
// Returns the EGLDisplay on which this instance was created.
EGLDisplay display() const { return display_; }
// Returns the EGLSyncKHR wrapped by this instance.
EGLSyncKHR sync() const { return sync_; }
// Returns true if this instance wraps a valid EGLSync object.
bool is_valid() const { return sync_ != nullptr; }
private:
void Invalidate();
EGLDisplay display_;
EGLSyncKHR sync_;
};
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_EGL_SYNC_H_
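
A typical CL-GL synchronization pattern with this wrapper looks roughly like the following sketch; the display handle is a placeholder obtained from the GL side:

```cpp
// Sketch: insert a GL fence and wait for it before touching shared data.
Status SyncGlBeforeCl(EGLDisplay display) {
  EglSync sync;
  RETURN_IF_ERROR(EglSync::NewFence(display, &sync));
  // Block the calling thread until all GL commands issued before the fence
  // have completed; ServerWait() would instead block the GPU stream.
  return sync.ClientWait();
}
```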

View File

@ -0,0 +1,240 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/environment.h"
#include <string>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
CalculationsPrecision GetPossiblePrecision(
const CLDevice& gpu, CalculationsPrecision desired_precision) {
if (!gpu.SupportsFP16() && desired_precision != CalculationsPrecision::F32) {
return CalculationsPrecision::F32;
}
return desired_precision;
}
std::string GetKernelOneLayerTextureArray() {
return R"(
__kernel void main_function(__write_only image2d_array_t dst) {
int X = (int)(get_global_id(0));
int Y = (int)(get_global_id(1));
write_imagef(dst, (int4)(X, Y, 0, 0), (float4)(2.0, 2.0, 2.0, 2.0));
}
)";
}
// Some Adreno < 600 devices have a bug with one-layer texture arrays. b/131099086
// If we have a one-layer texture array and write something from a kernel to
// this texture, we get zeroes instead of the actual values.
// The same kernel works if we use a texture array with more than one layer.
// This code lets us detect the bug.
Status CheckKernelSupportOfOneLayerTextureArray(Environment* env,
bool* result) {
// No bug on Adreno 6xx
if (env->device().GetInfo().adreno_info.gpu_version >= 600) {
*result = true;
return OkStatus();
}
CLKernel kernel;
RETURN_IF_ERROR(CreateKernel(GetKernelOneLayerTextureArray(), "main_function",
env, &kernel));
Tensor tensor;
RETURN_IF_ERROR(CreateTensor(env->context(), env->device(), 4, 4, 4,
DataType::FLOAT32,
TensorStorageType::TEXTURE_ARRAY, &tensor));
RETURN_IF_ERROR(kernel.SetMemory(0, tensor.GetMemoryPtr()));
RETURN_IF_ERROR(env->queue()->DispatchImplicit(kernel, {4, 4, 1}, {4, 4, 1}));
std::vector<float> cpu_data(64, 0.0f);
RETURN_IF_ERROR(tensor.ReadDataBHWC(absl::MakeSpan(cpu_data), env->queue()));
*result = true;
for (int i = 0; i < 64; ++i) {
if (cpu_data[i] != 2.0) {
*result = false;
break;
}
}
return OkStatus();
}
Status CreateEnvironment(Environment* result, bool shared,
cl_context_properties egl_context,
cl_context_properties egl_display) {
CLDevice gpu;
RETURN_IF_ERROR(CreateDefaultGPUDevice(&gpu));
CLContext context;
if (shared) {
RETURN_IF_ERROR(CreateCLGLContext(gpu, egl_context, egl_display, &context));
} else {
RETURN_IF_ERROR(CreateCLContext(gpu, &context));
}
CLCommandQueue queue;
RETURN_IF_ERROR(CreateCLCommandQueue(gpu, context, &queue));
ProfilingCommandQueue profiling_queue;
RETURN_IF_ERROR(CreateProfilingCommandQueue(gpu, context, &profiling_queue));
*result = Environment(std::move(gpu), std::move(context), std::move(queue),
std::move(profiling_queue));
if (result->device().IsAdreno() && result->device().SupportsTextureArray()) {
bool supports_one_layer;
RETURN_IF_ERROR(
CheckKernelSupportOfOneLayerTextureArray(result, &supports_one_layer));
if (!supports_one_layer) {
result->GetDevicePtr()->DisableOneLayerTextureArray();
}
}
return OkStatus();
}
} // namespace
Environment::Environment(CLDevice&& device, CLContext&& context,
CLCommandQueue&& queue,
ProfilingCommandQueue&& profiling_queue)
: device_(std::move(device)),
context_(std::move(context)),
queue_(std::move(queue)),
profiling_queue_(std::move(profiling_queue)) {}
Environment::Environment(Environment&& environment)
: device_(std::move(environment.device_)),
context_(std::move(environment.context_)),
queue_(std::move(environment.queue_)),
profiling_queue_(std::move(environment.profiling_queue_)),
program_cache_(std::move(environment.program_cache_)) {}
Environment& Environment::operator=(Environment&& environment) {
if (this != &environment) {
device_ = std::move(environment.device_);
context_ = std::move(environment.context_);
queue_ = std::move(environment.queue_);
profiling_queue_ = std::move(environment.profiling_queue_);
program_cache_ = std::move(environment.program_cache_);
}
return *this;
}
void Environment::SetHighPerformance() const {
// TODO(sorokin) use cl_perf_hint if available
}
void Environment::SetDefaultPerformance() const {
// TODO(sorokin) use cl_perf_hint if available
}
void Environment::SetLowPerformance() const {
// TODO(sorokin) use cl_perf_hint if available
}
std::vector<CalculationsPrecision> Environment::GetSupportedPrecisions() const {
std::vector<CalculationsPrecision> precisions;
for (CalculationsPrecision precision :
{CalculationsPrecision::F32, CalculationsPrecision::F32_F16,
CalculationsPrecision::F16}) {
if (IsSupported(precision)) {
precisions.push_back(precision);
}
}
return precisions;
}
bool Environment::IsSupported(CalculationsPrecision precision) const {
switch (precision) {
case CalculationsPrecision::F32_F16:
case CalculationsPrecision::F16:
return device_.SupportsFP16();
case CalculationsPrecision::F32:
return true;
}
}
std::vector<TensorStorageType> Environment::GetSupportedTextureStorages()
const {
std::vector<TensorStorageType> storage_types = {
TensorStorageType::TEXTURE_2D};
if (device_.SupportsTextureArray()) {
storage_types.push_back(TensorStorageType::TEXTURE_ARRAY);
}
return storage_types;
}
std::vector<TensorStorageType> Environment::GetSupportedStorages() const {
std::vector<TensorStorageType> storage_types = {TensorStorageType::TEXTURE_2D,
TensorStorageType::BUFFER};
if (device_.SupportsTextureArray()) {
storage_types.push_back(TensorStorageType::TEXTURE_ARRAY);
}
return storage_types;
}
TensorStorageType GetOptimalStorageType(const CLDevice& gpu) {
TensorStorageType storage_type;
if (gpu.vendor() != Vendor::QUALCOMM) {
storage_type = TensorStorageType::BUFFER;
} else {
if (gpu.IsAdreno6xxOrHigher()) {
storage_type = TensorStorageType::TEXTURE_ARRAY;
} else {
storage_type = TensorStorageType::TEXTURE_2D;
}
}
return storage_type;
}
Status CreateDefaultEnvironment(Environment* result) {
return CreateEnvironment(result, false, 0, 0);
}
Status CreateEnvironment(Environment* result) {
return CreateEnvironment(result, false, 0, 0);
}
Status CreateGLCompatibleEnvironment(cl_context_properties egl_context,
cl_context_properties egl_display,
Environment* result) {
return CreateEnvironment(result, true, egl_context, egl_display);
}
Status CreateKernel(const std::string& code, const std::string& function_name,
Environment* env, CLKernel* result) {
return CreateKernel(code, function_name, {}, env, result);
}
Status CreateKernel(const std::string& code, const std::string& function_name,
const std::vector<CompilerOptions>& compiler_options,
Environment* env, CLKernel* result) {
return env->program_cache()->GetOrCreateCLKernel(
code, function_name, compiler_options, env->context(), env->device(),
result);
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,93 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_ENVIRONMENT_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_ENVIRONMENT_H_
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/program_cache.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
namespace tflite {
namespace gpu {
namespace cl {
class Environment {
public:
Environment() = default;
explicit Environment(CLDevice&& device, CLContext&& context,
CLCommandQueue&& queue,
ProfilingCommandQueue&& profiling_queue);
// Move only
Environment(Environment&& environment);
Environment& operator=(Environment&& environment);
Environment(const Environment&) = delete;
Environment& operator=(const Environment&) = delete;
const CLDevice& device() const { return device_; }
CLDevice* GetDevicePtr() { return &device_; }
const CLDevice* GetDevicePtr() const { return &device_; }
CLContext& context() { return context_; }
CLCommandQueue* queue() { return &queue_; }
ProfilingCommandQueue* profiling_queue() { return &profiling_queue_; }
ProgramCache* program_cache() { return &program_cache_; }
const ProgramCache* program_cache() const { return &program_cache_; }
std::vector<CalculationsPrecision> GetSupportedPrecisions() const;
bool IsSupported(CalculationsPrecision precision) const;
std::vector<TensorStorageType> GetSupportedTextureStorages() const;
std::vector<TensorStorageType> GetSupportedStorages() const;
void SetHighPerformance() const;
void SetDefaultPerformance() const;
void SetLowPerformance() const; // for energy saving
private:
CLDevice device_;
CLContext context_;
CLCommandQueue queue_;
ProfilingCommandQueue profiling_queue_;
ProgramCache program_cache_;
};
TensorStorageType GetOptimalStorageType(const CLDevice& gpu);
Status CreateDefaultEnvironment(Environment* result);
Status CreateEnvironment(Environment* result);
Status CreateGLCompatibleEnvironment(cl_context_properties egl_context,
cl_context_properties egl_display,
Environment* result);
Status CreateKernel(const std::string& code, const std::string& function_name,
Environment* env, CLKernel* result);
Status CreateKernel(const std::string& code, const std::string& function_name,
const std::vector<CompilerOptions>& compiler_options,
Environment* env, CLKernel* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_ENVIRONMENT_H_
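A hedged usage sketch for the environment API above (illustrative only). It creates a default environment, checks FP16 support, and compiles a trivial kernel through the program cache; the kernel source and the "noop" name are assumptions, not part of the library.

#include <string>

#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
#include "tensorflow/lite/delegates/gpu/cl/environment.h"

namespace tflite {
namespace gpu {
namespace cl {

// Illustrative helper: compiles a do-nothing kernel via the program cache.
Status CompileNoopKernel(Environment* env, CLKernel* kernel) {
  // Pick a precision the device actually supports; this would normally feed
  // into an OperationDef or CreateInferenceInfo elsewhere.
  const CalculationsPrecision precision =
      env->IsSupported(CalculationsPrecision::F16)
          ? CalculationsPrecision::F16
          : CalculationsPrecision::F32;
  (void)precision;
  // Illustrative OpenCL C source; any valid kernel works here.
  const std::string code = "__kernel void noop() {}";
  return CreateKernel(code, "noop", env, kernel);
}

Status Example() {
  Environment env;
  RETURN_IF_ERROR(CreateEnvironment(&env));
  CLKernel kernel;
  return CompileNoopKernel(&env, &kernel);
}

}  // namespace cl
}  // namespace gpu
}  // namespace tflite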

View File

@ -0,0 +1,259 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/gl_interop.h"
#include "absl/strings/str_cat.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_errors.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_call.h"
#include "tensorflow/lite/delegates/gpu/gl/gl_sync.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
// TODO(b/131897059): replace with the 64 version when EGL 1.5 is available.
// It should use the KHR_cl_event2 extension. More details are in b/129974818.
using PFNEGLCREATESYNCPROC = EGLSync(EGLAPIENTRYP)(
EGLDisplay dpy, EGLenum type, const EGLAttrib* attrib_list);
PFNEGLCREATESYNCPROC g_eglCreateSync = nullptr;
} // namespace
Status CreateEglSyncFromClEvent(cl_event event, EGLDisplay display,
EglSync* sync) {
if (!IsEglSyncFromClEventSupported()) {
return UnimplementedError("CreateEglSyncFromClEvent is not supported");
}
EGLSync egl_sync;
const EGLAttrib attributes[] = {EGL_CL_EVENT_HANDLE,
reinterpret_cast<EGLAttrib>(event), EGL_NONE};
RETURN_IF_ERROR(TFLITE_GPU_CALL_EGL(g_eglCreateSync, &egl_sync, display,
EGL_SYNC_CL_EVENT, attributes));
if (egl_sync == EGL_NO_SYNC) {
return InternalError("Returned empty EGL sync");
}
*sync = EglSync(display, egl_sync);
return OkStatus();
}
bool IsEglSyncFromClEventSupported() {
// In C++11, static initializers are guaranteed to be evaluated only once.
static bool supported = []() -> bool {
// This function requires EGL 1.5 to work
g_eglCreateSync = reinterpret_cast<PFNEGLCREATESYNCPROC>(
eglGetProcAddress("eglCreateSync"));
// eglQueryString accepts EGL_NO_DISPLAY only starting EGL 1.5
if (!eglQueryString(EGL_NO_DISPLAY, EGL_EXTENSIONS)) {
g_eglCreateSync = nullptr;
}
return (g_eglCreateSync != nullptr);
}();
return supported;
}
Status CreateClEventFromEglSync(cl_context context, const EglSync& egl_sync,
CLEvent* event) {
cl_int error_code;
cl_event new_event = clCreateEventFromEGLSyncKHR(
context, egl_sync.sync(), egl_sync.display(), &error_code);
if (error_code != CL_SUCCESS) {
return InternalError(
absl::StrCat("Unable to create CL sync from EGL sync. ",
CLErrorCodeToString(error_code)));
}
*event = CLEvent(new_event);
return OkStatus();
}
bool IsClEventFromEglSyncSupported(const CLDevice& device) {
return device.SupportsExtension("cl_khr_egl_event");
}
Status CreateClMemoryFromGlBuffer(GLuint gl_ssbo_id, AccessType access_type,
CLContext* context, CLMemory* memory) {
cl_int error_code;
auto mem = clCreateFromGLBuffer(context->context(), ToClMemFlags(access_type),
gl_ssbo_id, &error_code);
if (error_code != CL_SUCCESS) {
return InternalError(
absl::StrCat("Unable to acquire CL buffer from GL buffer. ",
CLErrorCodeToString(error_code)));
}
*memory = CLMemory(mem, true);
return OkStatus();
}
Status CreateClMemoryFromGlTexture(GLenum texture_target, GLuint texture_id,
AccessType access_type, CLContext* context,
CLMemory* memory) {
cl_int error_code;
auto mem =
clCreateFromGLTexture(context->context(), ToClMemFlags(access_type),
texture_target, 0, texture_id, &error_code);
if (error_code != CL_SUCCESS) {
return InternalError(
absl::StrCat("Unable to create CL buffer from GL texture. ",
CLErrorCodeToString(error_code)));
}
*memory = CLMemory(mem, true);
return OkStatus();
}
bool IsGlSharingSupported(const CLDevice& device) {
return clCreateFromGLBuffer && clCreateFromGLTexture &&
device.SupportsExtension("cl_khr_gl_sharing");
}
AcquiredGlObjects::~AcquiredGlObjects() { Release({}, nullptr).IgnoreError(); }
Status AcquiredGlObjects::Acquire(const std::vector<cl_mem>& memory,
cl_command_queue queue,
const std::vector<cl_event>& wait_events,
CLEvent* acquire_event,
AcquiredGlObjects* objects) {
if (!memory.empty()) {
cl_event new_event;
cl_int error_code = clEnqueueAcquireGLObjects(
queue, memory.size(), memory.data(), wait_events.size(),
wait_events.data(), acquire_event ? &new_event : nullptr);
if (error_code != CL_SUCCESS) {
return InternalError(absl::StrCat("Unable to acquire GL object. ",
CLErrorCodeToString(error_code)));
}
if (acquire_event) {
*acquire_event = CLEvent(new_event);
}
clFlush(queue);
}
*objects = AcquiredGlObjects(memory, queue);
return OkStatus();
}
Status AcquiredGlObjects::Release(const std::vector<cl_event>& wait_events,
CLEvent* release_event) {
if (queue_ && !memory_.empty()) {
cl_event new_event;
cl_int error_code = clEnqueueReleaseGLObjects(
queue_, memory_.size(), memory_.data(), wait_events.size(),
wait_events.data(), release_event ? &new_event : nullptr);
if (error_code != CL_SUCCESS) {
return InternalError(absl::StrCat("Unable to release GL object. ",
CLErrorCodeToString(error_code)));
}
if (release_event) {
*release_event = CLEvent(new_event);
}
clFlush(queue_);
queue_ = nullptr;
}
return OkStatus();
}
GlInteropFabric::GlInteropFabric(EGLDisplay egl_display,
Environment* environment)
: is_egl_sync_supported_(true),
is_egl_to_cl_mapping_supported_(
IsClEventFromEglSyncSupported(environment->device())),
is_cl_to_egl_mapping_supported_(IsEglSyncFromClEventSupported()),
egl_display_(egl_display),
context_(environment->context().context()),
queue_(environment->queue()->queue()) {}
void GlInteropFabric::RegisterMemory(cl_mem memory) {
memory_.push_back(memory);
}
void GlInteropFabric::UnregisterMemory(cl_mem memory) {
auto it = std::find(memory_.begin(), memory_.end(), memory);
if (it != memory_.end()) {
memory_.erase(it);
}
}
Status GlInteropFabric::Start() {
if (!is_enabled()) {
return OkStatus();
}
// In GL-CL interoperability, we need to make sure GL has finished processing
// all commands that might affect GL objects. There are a few ways to do so:
//   a) glFinish
//      slow, but portable
//   b) EglSync + ClientWait
//      a faster alternative to glFinish, but still slow as it stalls the GPU
//      pipeline.
//   c) EglSync->CLEvent or GlSync->CLEvent mapping
//      fast, as it maps the sync to a CL event and uses it as a dependency
//      later without stalling the GPU pipeline.
if (is_egl_sync_supported_) {
EglSync sync;
RETURN_IF_ERROR(EglSync::NewFence(egl_display_, &sync));
if (is_egl_to_cl_mapping_supported_) {
// (c) EglSync->CLEvent or GlSync->CLEvent mapping
glFlush();
RETURN_IF_ERROR(
CreateClEventFromEglSync(context_, sync, &inbound_event_));
} else {
// (b) EglSync + ClientWait
RETURN_IF_ERROR(sync.ClientWait());
}
} else {
// (a) glFinish / GL fence sync
RETURN_IF_ERROR(gl::GlActiveSyncWait());
}
// Acquire all GL objects needed while processing.
auto make_acquire_wait = [&]() -> std::vector<cl_event> {
if (inbound_event_.is_valid()) {
return {inbound_event_.event()};
}
return {};
};
return AcquiredGlObjects::Acquire(memory_, queue_, make_acquire_wait(),
nullptr, &gl_objects_);
}
Status GlInteropFabric::Finish() {
if (!is_enabled()) {
return OkStatus();
}
RETURN_IF_ERROR(gl_objects_.Release({}, &outbound_event_));
// if (is_egl_sync_supported_ && is_cl_to_egl_mapping_supported_) {
// EglSync egl_outbound_sync;
// RETURN_IF_ERROR(CreateEglSyncFromClEvent(outbound_event_.event(),
// egl_display_,
// &egl_outbound_sync));
// // Instruct GL pipeline to wait until corresponding CL event is signaled.
// RETURN_IF_ERROR(egl_outbound_sync.ServerWait());
// glFlush();
// } else {
// // Slower option if proper sync is not supported. It is equivalent to
// // clFinish, but, hopefully, faster.
// outbound_event_.Wait();
// }
// This slow sync is the only working solution right now. We have to debug why
// the version above does not work fast and reliably.
outbound_event_.Wait();
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,144 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_GL_INTEROP_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_GL_INTEROP_H_
#include <vector>
#include <EGL/egl.h>
#include <EGL/eglext.h>
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_memory.h"
#include "tensorflow/lite/delegates/gpu/cl/egl_sync.h"
#include "tensorflow/lite/delegates/gpu/cl/environment.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/common/access_type.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/gl/portable_gl31.h"
namespace tflite {
namespace gpu {
namespace cl {
// Creates an EglSync from an OpenCL event. The source event does not need to
// outlive the returned sync and can be safely destroyed.
//
// Depends on EGL 1.5.
Status CreateEglSyncFromClEvent(cl_event event, EGLDisplay display,
EglSync* sync);
// Returns true if 'CreateEglSyncFromClEvent' is supported.
bool IsEglSyncFromClEventSupported();
// Creates CL event from EGL sync.
// The created event can only be consumed by an AcquiredGlObjects::Acquire call
// as a 'wait_event'.
Status CreateClEventFromEglSync(cl_context context, const EglSync& egl_sync,
CLEvent* event);
// Returns true if 'CreateClEventFromEglSync' is supported.
bool IsClEventFromEglSyncSupported(const CLDevice& device);
// Creates new CL memory object from OpenGL buffer.
Status CreateClMemoryFromGlBuffer(GLuint gl_ssbo_id, AccessType access_type,
CLContext* context, CLMemory* memory);
// Creates new CL memory object from OpenGL texture.
Status CreateClMemoryFromGlTexture(GLenum texture_target, GLuint texture_id,
AccessType access_type, CLContext* context,
CLMemory* memory);
// Returns true if GL objects could be shared with OpenCL context.
bool IsGlSharingSupported(const CLDevice& device);
// RAII-wrapper for GL objects acquired into CL context.
class AcquiredGlObjects {
public:
static bool IsSupported(const CLDevice& device);
AcquiredGlObjects() : AcquiredGlObjects({}, nullptr) {}
// Quietly releases OpenGL objects. It is recommended to call Release()
// explicitly to properly handle potential errors.
~AcquiredGlObjects();
// Acquires memory from the OpenGL context. Memory must be created by either
// CreateClMemoryFromGlBuffer or CreateClMemoryFromGlTexture calls.
// If 'acquire_event' is not nullptr, it will be signaled once acquisition is
// complete.
static Status Acquire(const std::vector<cl_mem>& memory,
cl_command_queue queue,
const std::vector<cl_event>& wait_events,
CLEvent* acquire_event /* optional */,
AcquiredGlObjects* objects);
// Releases OpenCL memory back to the OpenGL context. If 'release_event' is not
// nullptr, it will be signaled once the release is complete.
Status Release(const std::vector<cl_event>& wait_events,
CLEvent* release_event /* optional */);
private:
AcquiredGlObjects(const std::vector<cl_mem>& memory, cl_command_queue queue)
: memory_(memory), queue_(queue) {}
std::vector<cl_mem> memory_;
cl_command_queue queue_;
};
// Encapsulates all complicated GL-CL synchronization. It manages the lifetime
// of all appropriate events to ensure fast synchronization whenever possible.
class GlInteropFabric {
public:
GlInteropFabric(EGLDisplay egl_display, Environment* environment);
// Ensures proper GL->CL synchronization is in place before
// GL objects that are mapped to CL objects are used.
Status Start();
// Puts appropriate CL->GL synchronization after all work is complete.
Status Finish();
// Registers memory to be used from the GL context. Such a CL memory object
// must be created with a CreateClMemoryFromGlBuffer or
// CreateClMemoryFromGlTexture call.
void RegisterMemory(cl_mem memory);
// Unregisters memory registered with RegisterMemory call.
void UnregisterMemory(cl_mem memory);
private:
bool is_enabled() const { return egl_display_ && !memory_.empty(); }
bool is_egl_sync_supported_;
bool is_egl_to_cl_mapping_supported_;
bool is_cl_to_egl_mapping_supported_;
const EGLDisplay egl_display_;
cl_context context_;
cl_command_queue queue_;
CLEvent inbound_event_;
CLEvent outbound_event_;
std::vector<cl_mem> memory_;
AcquiredGlObjects gl_objects_; // transient during Start/Finish calls.
};
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_GL_INTEROP_H_
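A hedged sketch of the Start()/Finish() bracket implemented above (illustrative, not part of the original file). It assumes `env` was created with CreateGLCompatibleEnvironment and `shared_memory` is a raw cl_mem previously obtained through CreateClMemoryFromGlBuffer or CreateClMemoryFromGlTexture; the actual OpenCL work is elided.

#include "tensorflow/lite/delegates/gpu/cl/gl_interop.h"

namespace tflite {
namespace gpu {
namespace cl {

// Illustrative helper showing the intended bracket around GPU work.
Status RunWithSharedGlObject(EGLDisplay display, Environment* env,
                             cl_mem shared_memory) {
  GlInteropFabric fabric(display, env);
  fabric.RegisterMemory(shared_memory);
  // Puts GL->CL synchronization in place and acquires registered GL objects.
  RETURN_IF_ERROR(fabric.Start());
  // ... enqueue OpenCL work that reads/writes `shared_memory` here ...
  // Releases the objects back to GL and performs CL->GL synchronization.
  return fabric.Finish();
}

}  // namespace cl
}  // namespace gpu
}  // namespace tflite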

View File

@ -0,0 +1,367 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/gpu_api_delegate.h"
#include <cstdint>
#include "absl/types/span.h"
#include "tensorflow/lite/builtin_ops.h"
#include "tensorflow/lite/delegates/gpu/api.h"
#include "tensorflow/lite/delegates/gpu/cl/api.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type_util.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_builder.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/transformations/general_transformations.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
// Forward declarations.
TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate);
InferencePriority ToPriority(int32_t priority) {
switch (priority) {
case TfLiteGpuInferencePriority::
TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION:
return InferencePriority::MAX_PRECISION;
case TfLiteGpuInferencePriority::TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY:
return InferencePriority::MIN_LATENCY;
}
return InferencePriority::MAX_PRECISION;
}
DataType ToDataType(TfLiteType data_type) {
switch (data_type) {
case kTfLiteFloat16:
return DataType::FLOAT16;
case kTfLiteFloat32:
return DataType::FLOAT32;
default:
return DataType::UNKNOWN;
}
}
DataLayout ToDataLayoutFromTFL(TfLiteGpuDataLayout data_layout) {
switch (data_layout) {
case TFLITE_GPU_DATA_LAYOUT_BHWC:
return DataLayout::BHWC;
case TFLITE_GPU_DATA_LAYOUT_DHWC4:
return DataLayout::DHWC4;
default:
return DataLayout::UNKNOWN;
}
}
class Delegate {
public:
explicit Delegate(const TfLiteGpuDelegateOptions_New* options) {
if (options) {
options_ = *options;
} else {
// Default options.
options_.compile_options.precision_loss_allowed = 0;
options_.compile_options.inference_priority = TfLiteGpuInferencePriority::
TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION;
options_.egl_display = eglGetCurrentDisplay();
options_.egl_context = eglGetCurrentContext();
options_.serialized_binary_cache_data = nullptr;
options_.serialized_binary_cache_size = 0;
}
}
Status Prepare(TfLiteContext* context,
const TfLiteDelegateParams* delegate_params) {
// Extract the TFLite delegate execution plan from the context and convert it
// into a GraphFloat32.
GraphFloat32 graph;
RETURN_IF_ERROR(BuildModel(context, delegate_params, &graph));
// Apply general transformations on the graph.
NullTransformationReporter reporter;
ModelTransformer transformer(&graph, &reporter);
if (!ApplyGeneralTransformations(&transformer)) {
return InternalError("Graph general transformations failed");
}
InferenceEnvironmentOptions env_options;
env_options.egl_context = options_.egl_context;
env_options.egl_display = options_.egl_display;
env_options.serialized_binary_cache = {
options_.serialized_binary_cache_data,
options_.serialized_binary_cache_size};
InferenceEnvironmentProperties properties;
Status status =
NewInferenceEnvironment(env_options, &environment_, &properties);
if (!properties.is_opencl_available) {
context->ReportError(context,
"TfLiteGpuDelegate: OpenCL is not available");
}
if (!properties.is_gl_sharing_supported) {
context->ReportError(context,
"TfLiteGpuDelegate: GL sharing is not supported");
}
if (!properties.is_cl_to_gl_fast_sync_supported) {
context->ReportError(
context, "TfLiteGpuDelegate: fast CL to GL sync is not supported");
}
if (!properties.is_gl_to_cl_fast_sync_supported) {
context->ReportError(
context, "TfLiteGpuDelegate: fast GL to CL sync is not supported");
}
RETURN_IF_ERROR(status);
InferenceOptions options;
options.priority = ToPriority(options_.compile_options.inference_priority);
options.allow_precision_loss =
options_.compile_options.precision_loss_allowed != 0;
std::unique_ptr<InferenceBuilder> builder;
RETURN_IF_ERROR(
environment_->NewInferenceBuilder(options, graph, &builder));
// At this point TFLite has not allocated tensors yet; therefore, collect
// indices and set all input and output tensors from TFLite later.
auto inputs = graph.inputs();
input_indices_.reserve(inputs.size());
for (auto input : inputs) {
auto tensor_index = input->tensor.ref;
int object_index = input_indices_.size();
input_indices_.push_back(tensor_index);
RETURN_IF_ERROR(
builder->SetInputObjectDef(object_index, GetObjectDef(tensor_index)));
}
auto outputs = graph.outputs();
output_indices_.reserve(outputs.size());
for (auto output : outputs) {
auto tensor_index = output->tensor.ref;
int object_index = output_indices_.size();
output_indices_.push_back(tensor_index);
RETURN_IF_ERROR(builder->SetOutputObjectDef(object_index,
GetObjectDef(tensor_index)));
}
return builder->Build(&runner_);
}
Status SetInputsAndOutputs(TfLiteContext* context) {
int i = 0;
for (auto index : input_indices_) {
RETURN_IF_ERROR(
runner_->SetInputObject(i++, GetTensorObject(index, context)));
}
i = 0;
for (auto index : output_indices_) {
RETURN_IF_ERROR(
runner_->SetOutputObject(i++, GetTensorObject(index, context)));
}
return OkStatus();
}
Status Invoke(TfLiteContext* context) {
RETURN_IF_ERROR(SetInputsAndOutputs(context));
return runner_->Run();
}
void BindGlBufferToTensor(GLuint buffer_id, int tensor_index,
DataType data_type, DataLayout data_layout) {
// At this point the delegate hasn't seen a model yet, so just record which
// object gets assigned.
if (tensor_index >= tensors_.size()) {
tensors_.resize(tensor_index + 1);
}
TensorObjectDef def;
def.object_def.data_type = data_type;
def.object_def.data_layout = data_layout;
def.object_def.object_type = ObjectType::OPENGL_SSBO;
def.object_def.user_provided = true;
def.dimensions = Dimensions(0, 0, 0, 0);
OpenGlBuffer buffer;
buffer.id = buffer_id;
TensorObject obj = buffer;
tensors_[tensor_index] = std::make_pair(obj, def);
}
ObjectDef GetObjectDef(int index) const {
if (index < tensors_.size() && IsValid(tensors_[index].second)) {
return tensors_[index].second.object_def;
}
ObjectDef default_object_def;
default_object_def.data_type = DataType::FLOAT32;
default_object_def.data_layout = DataLayout::BHWC;
default_object_def.object_type = ObjectType::CPU_MEMORY;
default_object_def.user_provided = true;
return default_object_def;
}
TensorObject GetTensorObject(int index, TfLiteContext* context) const {
if (index < tensors_.size() &&
IsValid(tensors_[index].second, tensors_[index].first)) {
return tensors_[index].first;
}
auto& tensor = context->tensors[index];
return MakeCpuMemory(absl::MakeSpan(tensor.data.raw, tensor.bytes));
}
TfLiteDelegate* tflite_delegate() { return &delegate_; }
bool SupportsGlObjects() const {
return options_.egl_context != EGL_NO_CONTEXT &&
options_.egl_display != EGL_NO_DISPLAY;
}
absl::Span<const uint8_t> GetSerializedBinaryCache() {
binary_cache_ = environment_->GetSerializedBinaryCache();
return binary_cache_;
}
private:
TfLiteDelegate delegate_ = {
reinterpret_cast<void*>(this), // .data_
DelegatePrepare, // .Prepare
nullptr, // .CopyFromBufferHandle
nullptr, // .CopyToBufferHandle
nullptr, // .FreeBufferHandle
kTfLiteDelegateFlagsNone, // .flags
};
TfLiteGpuDelegateOptions_New options_;
std::unique_ptr<InferenceEnvironment> environment_;
std::unique_ptr<InferenceRunner> runner_;
std::vector<int64_t> input_indices_;
std::vector<int64_t> output_indices_;
std::vector<uint8_t> binary_cache_;
std::vector<std::pair<TensorObject, TensorObjectDef>> tensors_;
};
inline Delegate* GetDelegate(TfLiteNode* node) {
return reinterpret_cast<Delegate*>(node->user_data);
}
inline Delegate* GetDelegate(TfLiteDelegate* delegate) {
return reinterpret_cast<Delegate*>(delegate->data_);
}
TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) {
const TfLiteRegistration kRegistration = {
// .init
[](TfLiteContext* context, const char* buffer, size_t) -> void* {
const auto* params =
reinterpret_cast<const TfLiteDelegateParams*>(buffer);
auto* gpu_delegate = GetDelegate(params->delegate);
// Everything below should happen in the prepare function call, but TFLite
// for whatever reason forbids that.
const auto status = gpu_delegate->Prepare(context, params);
if (!status.ok()) {
context->ReportError(context, "TfLiteGpuDelegate Init: %s",
status.error_message().c_str());
return nullptr;
}
return gpu_delegate;
},
// .free
[](TfLiteContext*, void* buffer) -> void {},
// .prepare
[](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
if (!node->user_data) {
context->ReportError(
context,
"TfLiteGpuDelegate Prepare: delegate is not initialized");
return kTfLiteError;
}
// TODO(akulik): tflite tensors are not allocated here either. It would
// be good to set inputs and outputs only once here instead of setting
// them every time in .invoke.
return kTfLiteOk;
},
// .invoke
[](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
const auto status = GetDelegate(node)->Invoke(context);
if (!status.ok()) {
context->ReportError(context, "TfLiteGpuDelegate Invoke: %s",
status.error_message().c_str());
return kTfLiteError;
}
return kTfLiteOk;
},
nullptr, // .profiling_string
0, // .builtin_code
"TfLiteGpuDelegate_New", // .custom_name
1, // .version
};
TfLiteIntArray* ops_to_replace = GetOpsToReplace(context);
const auto status = context->ReplaceNodeSubsetsWithDelegateKernels(
context, kRegistration, ops_to_replace, delegate);
TfLiteIntArrayFree(ops_to_replace);
return status;
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite
TfLiteDelegate* TfLiteGpuDelegateCreate_New(
const TfLiteGpuDelegateOptions_New* options) {
auto* gpu_delegate = new tflite::gpu::cl::Delegate(options);
return gpu_delegate ? gpu_delegate->tflite_delegate() : nullptr;
}
void TfLiteGpuDelegateDelete_New(TfLiteDelegate* delegate) {
delete tflite::gpu::cl::GetDelegate(delegate);
}
TFL_CAPI_EXPORT TfLiteStatus TfLiteGpuDelegateBindGlBufferToTensor(
TfLiteDelegate* delegate, GLuint buffer_id, int tensor_index,
TfLiteType data_type, TfLiteGpuDataLayout data_layout) {
auto* gpu_delegate = tflite::gpu::cl::GetDelegate(delegate);
if (!gpu_delegate) {
return kTfLiteError;
}
if (!gpu_delegate->SupportsGlObjects()) {
return kTfLiteError;
}
auto type = tflite::gpu::cl::ToDataType(data_type);
if (type == tflite::gpu::DataType::UNKNOWN) {
return kTfLiteError;
}
auto layout = tflite::gpu::cl::ToDataLayoutFromTFL(data_layout);
if (layout == tflite::gpu::DataLayout::UNKNOWN) {
return kTfLiteError;
}
gpu_delegate->BindGlBufferToTensor(buffer_id, tensor_index, type, layout);
return kTfLiteOk;
}
bool TfLiteGpuDelegateGetSerializedBinaryCache(TfLiteDelegate* delegate,
size_t* size,
const uint8_t** data) {
*size = 0;
auto* gpu_delegate = tflite::gpu::cl::GetDelegate(delegate);
if (!gpu_delegate) {
return false;
}
auto cache = gpu_delegate->GetSerializedBinaryCache();
if (cache.empty()) {
return false;
}
*size = cache.size();
*data = cache.data();
return true;
}

View File

@ -0,0 +1,120 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_GPU_API_DELEGATE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_GPU_API_DELEGATE_H_
#include <stdint.h>
#include <EGL/egl.h>
#include <GLES3/gl31.h>
#include "tensorflow/lite/c/c_api_internal.h"
#ifdef SWIG
#define TFL_CAPI_EXPORT
#else
#if defined(_WIN32)
#ifdef TF_COMPILE_LIBRARY
#define TFL_CAPI_EXPORT __declspec(dllexport)
#else
#define TFL_CAPI_EXPORT __declspec(dllimport)
#endif // TF_COMPILE_LIBRARY
#else
#define TFL_CAPI_EXPORT __attribute__((visibility("default")))
#endif // _WIN32
#endif // SWIG
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
enum TfLiteGpuInferencePriority {
TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION = 0,
TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY = 1,
};
// Shader compilation options.
struct TFL_CAPI_EXPORT TfLiteGpuCompileOptions_New {
// When set to zero, computations are carried out in 32-bit floating point.
// Otherwise, the GPU may quantize tensors, downcast values, and process in
// FP16 (recommended).
int32_t precision_loss_allowed;
// Priority is defined in TfLiteGpuInferencePriority.
int32_t inference_priority;
};
struct TFL_CAPI_EXPORT TfLiteGpuDelegateOptions_New {
TfLiteGpuCompileOptions_New compile_options;
// [Optional]
// Whenever the EGL display and EGL context are set, the corresponding OpenCL
// context will be created.
// These variables are required when using GL objects as inputs or outputs.
EGLDisplay egl_display;
EGLContext egl_context;
// [Optional]
// Contains data returned from a TfLiteGpuDelegateGetSerializedBinaryCache
// call. Invalid or incompatible data will be discarded. A compiled binary may
// become incompatible when the GPU driver is updated.
const uint8_t* serialized_binary_cache_data;
size_t serialized_binary_cache_size;
};
// Creates a new delegate instance that needs to be destroyed with
// TfLiteGpuDelegateDelete_New when the delegate is no longer used by TFLite.
// When `options` is set to `nullptr`, the following default values are used:
// .compile_options = {
// .precision_loss_allowed = false,
// }
// .egl_display = eglGetCurrentDisplay(),
// .egl_context = eglGetCurrentContext();
TFL_CAPI_EXPORT TfLiteDelegate* TfLiteGpuDelegateCreate_New(
const TfLiteGpuDelegateOptions_New* options);
// Destroys a delegate created with `TfLiteGpuDelegateCreate_New` call.
TFL_CAPI_EXPORT void TfLiteGpuDelegateDelete_New(TfLiteDelegate* delegate);
enum TfLiteGpuDataLayout {
TFLITE_GPU_DATA_LAYOUT_BHWC = 0,
TFLITE_GPU_DATA_LAYOUT_DHWC4 = 1,
};
// Binds a GL shader storage buffer object to an input or an output tensor in
// the initialized delegate. The bound buffer should have sufficient storage to
// accommodate all elements of the tensor.
//
// Supports data of kTfLiteFloat16 or kTfLiteFloat32 types in BHWC or DHWC4
// data layouts.
//
// *** Must be called *before* `Interpreter::ModifyGraphWithDelegate`. ***
TFL_CAPI_EXPORT TfLiteStatus TfLiteGpuDelegateBindGlBufferToTensor(
TfLiteDelegate* delegate, GLuint buffer_id, int tensor_index,
TfLiteType data_type, TfLiteGpuDataLayout data_layout);
// Returns an opaque binary blob that contains a collection of cached OpenCL
// binaries. The returned data can be reused later to speed up initialization
// time when a new delegate is created for the same model.
// The returned data is valid only when used on the same device; otherwise it
// will not be compatible and will be discarded.
TFL_CAPI_EXPORT bool TfLiteGpuDelegateGetSerializedBinaryCache(
TfLiteDelegate* delegate, size_t* size, const uint8_t** data);
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_GPU_API_DELEGATE_H_
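A hedged sketch of the documented call order for this C API (illustrative, not part of the original file). It assumes an already-built tflite::Interpreter and an existing GL SSBO sized for the input tensor; persisting the binary cache is left to the application.

#include <cstddef>
#include <cstdint>

#include "tensorflow/lite/delegates/gpu/cl/gpu_api_delegate.h"
#include "tensorflow/lite/interpreter.h"

// Illustrative helper: applies the GPU delegate with a GL-backed input tensor.
// The delegate must outlive the interpreter; destroy it with
// TfLiteGpuDelegateDelete_New only after the interpreter is gone.
bool ApplyGpuDelegate(tflite::Interpreter* interpreter, GLuint input_ssbo,
                      int input_tensor_index, TfLiteDelegate** out_delegate) {
  // Passing nullptr selects the defaults documented above (current EGL
  // display/context, no precision loss).
  TfLiteDelegate* delegate = TfLiteGpuDelegateCreate_New(nullptr);
  if (!delegate) return false;

  // Must be called *before* ModifyGraphWithDelegate, as noted above.
  if (TfLiteGpuDelegateBindGlBufferToTensor(
          delegate, input_ssbo, input_tensor_index, kTfLiteFloat32,
          TFLITE_GPU_DATA_LAYOUT_BHWC) != kTfLiteOk ||
      interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) {
    TfLiteGpuDelegateDelete_New(delegate);
    return false;
  }

  // Optionally persist compiled OpenCL binaries to speed up the next startup.
  size_t size = 0;
  const uint8_t* data = nullptr;
  if (TfLiteGpuDelegateGetSerializedBinaryCache(delegate, &size, &data)) {
    // ... write data[0..size) to the application's cache storage ...
  }
  *out_delegate = delegate;
  return true;
}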

View File

@ -0,0 +1,419 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/inference_context.h"
#include <algorithm>
#include <cmath>
#include <map>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/model_hints.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.h"
#include "tensorflow/lite/delegates/gpu/common/memory_management.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
#include "tensorflow/lite/delegates/gpu/common/transformations/add_bias.h"
#include "tensorflow/lite/delegates/gpu/common/transformations/merge_padding_with.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
bool IsReady(const std::unordered_set<ValueId>& ready_tensors,
const CLNode& node) {
for (const ValueId in_id : node.inputs) {
if (ready_tensors.find(in_id) == ready_tensors.end()) {
return false;
}
}
return true;
}
std::vector<std::pair<ValueId, TensorDescriptor>> GetCLNodeTensors(
const CLNode& node) {
std::vector<std::pair<ValueId, TensorDescriptor>> result;
for (int i = 0; i < node.operations.size(); ++i) {
const OperationDef op_def = node.operations[i]->GetDefinition();
const auto& first_range = node.ranges[0];
for (int k = first_range.x; k < first_range.y; ++k) {
result.push_back({node.inputs[k], op_def.src_tensors[k - first_range.x]});
}
for (int j = 1; j < node.ranges.size(); ++j) {
const auto& range = node.ranges[j];
for (int k = range.x; k < range.y; ++k) {
result.push_back({node.inputs[k], op_def.src_tensors[k - range.x + 1]});
}
}
for (int j = 0; j < node.outputs.size(); ++j) {
result.push_back({node.outputs[j], op_def.dst_tensors[j]});
}
}
return result;
}
void MergeCLNodes(CLNode* src, CLNode* dst) {
int offset = dst->inputs.size();
for (int j = 0; j < src->inputs.size(); ++j) {
if (src->inputs[j] != dst->outputs[0]) {
dst->inputs.push_back(src->inputs[j]);
}
}
auto first_range = src->ranges[0];
dst->ranges.push_back(
int2(first_range.x + offset, first_range.y - 1 + offset));
for (int i = 1; i < src->ranges.size(); ++i) {
auto range = src->ranges[i];
dst->ranges.push_back(int2(range.x + offset, range.y + offset));
}
dst->outputs[0] = src->outputs[0];
for (int i = 0; i < src->operations.size(); ++i) {
dst->operations.push_back(std::move(src->operations[i]));
}
dst->name += " linked : " + src->name;
}
void AddUsage(ValueId id, int task_index,
std::map<ValueId, int2>* usage_records) {
auto it = usage_records->find(id);
if (it == usage_records->end()) {
(*usage_records)[id].x = task_index;
(*usage_records)[id].y = task_index;
} else {
(*usage_records)[id].y = task_index;
}
}
} // namespace
CLNode::CLNode(CLNode&& node)
: operations(std::move(node.operations)),
inputs(std::move(node.inputs)),
outputs(std::move(node.outputs)),
ranges(std::move(node.ranges)),
name(std::move(node.name)) {}
CLNode& CLNode::operator=(CLNode&& node) {
if (this != &node) {
operations = std::move(node.operations);
inputs = std::move(node.inputs);
outputs = std::move(node.outputs);
ranges = std::move(node.ranges);
name = std::move(node.name);
}
return *this;
}
Status InferenceContext::InitFromGraph(const CreateInferenceInfo& create_info,
const GraphFloat32& graph,
Environment* env) {
precision_ = create_info.precision;
storage_type_ = create_info.storage_type;
if (env->device().vendor() == Vendor::MALI) {
need_flush_ = true;
need_manual_release_ = true;
}
CopyInAndOutIds(graph);
CreationContext creation_context;
creation_context.device = env->GetDevicePtr();
creation_context.context = &env->context();
creation_context.queue = env->queue();
creation_context.cache = env->program_cache();
RETURN_IF_ERROR(
ConvertOperations(creation_context, graph, create_info.hints));
Merge();
RETURN_IF_ERROR(
AllocateMemory(graph, env->device(), creation_context.context));
BindMemoryToOperations();
RETURN_IF_ERROR(Compile(creation_context));
TuningParameters tuning_parameters;
tuning_parameters.queue = env->profiling_queue();
tuning_parameters.info = env->device().GetInfoPtr();
if (create_info.hints.Check(ModelHints::kFastTuning)) {
tuning_parameters.tuning_type = TuningType::FAST;
}
RETURN_IF_ERROR(Tune(tuning_parameters));
return OkStatus();
}
Status InferenceContext::InitFromGraphWithTransforms(
const CreateInferenceInfo& create_info, GraphFloat32* graph,
Environment* env) {
RETURN_IF_ERROR(RunGraphTransforms(graph));
RETURN_IF_ERROR(InitFromGraph(create_info, *graph, env));
return OkStatus();
}
void InferenceContext::CopyInAndOutIds(const GraphFloat32& graph) {
const auto inputs = graph.inputs();
for (const auto& input : inputs) {
input_ids_.push_back(input->id);
}
const auto outputs = graph.outputs();
for (const auto& output : outputs) {
output_ids_.push_back(output->id);
}
}
Status InferenceContext::ConvertOperations(
const CreationContext& creation_context, const GraphFloat32& graph,
ModelHints hints) {
std::vector<Node*> graph_nodes = graph.nodes();
for (int i = 0; i < graph_nodes.size(); ++i) {
const Node& node = *graph_nodes[i];
auto inputs = graph.FindInputs(node.id);
auto outputs = graph.FindOutputs(node.id);
OperationDef op_def;
op_def.precision = precision_;
auto data_type = DeduceDataTypeFromPrecision(precision_);
for (int j = 0; j < inputs.size(); ++j) {
op_def.src_tensors.push_back({data_type, storage_type_});
}
for (int j = 0; j < outputs.size(); ++j) {
op_def.dst_tensors.push_back({data_type, storage_type_});
}
std::unique_ptr<GPUOperation> gpu_op;
RETURN_IF_ERROR(GPUOperationFromNode(creation_context, op_def, hints, graph,
node, &gpu_op));
CLNode cl_node;
cl_node.operations.push_back(std::move(gpu_op));
cl_node.ranges.push_back(int2(0, static_cast<int>(inputs.size())));
cl_node.inputs.resize(inputs.size());
for (int j = 0; j < inputs.size(); ++j) {
cl_node.inputs[j] = inputs[j]->id;
}
cl_node.outputs.resize(outputs.size());
for (int j = 0; j < outputs.size(); ++j) {
cl_node.outputs[j] = outputs[j]->id;
}
cl_node.name = node.operation.type + " " + std::to_string(node.id) + " " +
std::to_string(i);
nodes_.push_back(std::move(cl_node));
}
return OkStatus();
}
void InferenceContext::Merge() {
std::unordered_set<ValueId> ready_tensors;
for (const auto& input_id : input_ids_) {
ready_tensors.insert(input_id);
}
for (int i = 0; i < nodes_.size(); ++i) {
auto& node = nodes_[i];
for (const auto& out_id : node.outputs) {
ready_tensors.insert(out_id);
}
if (node.outputs.size() != 1) {
continue;
}
std::vector<int> next_nodes;
for (int j = i + 1; j < nodes_.size(); ++j) {
for (int k = 0; k < nodes_[j].inputs.size(); ++k) {
if (nodes_[j].inputs[k] == node.outputs[0]) {
next_nodes.push_back(j);
}
}
}
if (next_nodes.size() != 1) {
continue;
}
auto& linkable_node = nodes_[next_nodes[0]];
auto* elementwise =
dynamic_cast<ElementwiseOperation*>(linkable_node.operations[0].get());
if (!elementwise || linkable_node.outputs.size() != 1 ||
!IsReady(ready_tensors, linkable_node)) {
continue;
}
MergeCLNodes(&linkable_node, &node);
nodes_.erase(nodes_.begin() + next_nodes[0]);
i -= 1;
}
for (auto& node : nodes_) {
for (int j = 1; j < node.operations.size(); ++j) {
auto* elementwise =
dynamic_cast<ElementwiseOperation*>(node.operations[j].get());
node.operations[0]->AddOperation(elementwise);
}
}
}
Status InferenceContext::AllocateMemory(const GraphFloat32& graph,
const CLDevice& device,
CLContext* context) {
std::map<ValueId, int2> usages;
for (int op_index = 0; op_index < nodes_.size(); ++op_index) {
auto tensors = GetCLNodeTensors(nodes_[op_index]);
for (auto& tensor : tensors) {
AddUsage(tensor.first, op_index, &usages);
}
}
std::vector<TensorUsageRecord<BHWC>> usage_records;
std::map<ValueId, ValueId> remap_from_graph_ids;
for (auto& usage : usages) {
const auto& shape = graph.GetValue(usage.first)->tensor.shape;
remap_from_graph_ids[usage.first] = usage_records.size();
usage_records.push_back({shape, static_cast<TaskId>(usage.second.x),
static_cast<TaskId>(usage.second.y)});
}
ObjectsAssignment<BHWC> assignment;
RETURN_IF_ERROR(AssignObjectsToTensors(
usage_records, MemoryStrategy::EQUALITY, &assignment));
for (auto& node : nodes_) {
for (auto& id : node.inputs) {
ValueId new_id = assignment.object_ids[remap_from_graph_ids[id]];
remap_from_graph_ids_to_shared_[id] = new_id;
id = new_id;
}
for (auto& id : node.outputs) {
ValueId new_id = assignment.object_ids[remap_from_graph_ids[id]];
remap_from_graph_ids_to_shared_[id] = new_id;
id = new_id;
}
}
for (auto& node : nodes_) {
auto tensors = GetCLNodeTensors(node);
for (auto& tensor : tensors) {
const auto& it = tensors_.find(tensor.first);
if (it == tensors_.end()) {
const auto& shape = assignment.object_sizes[tensor.first];
Tensor* t = &tensors_[tensor.first];
RETURN_IF_ERROR(CreateTensor(*context, device, shape.w, shape.h,
shape.c, tensor.second.data_type,
tensor.second.storage_type, t));
}
}
}
return OkStatus();
}
void InferenceContext::BindMemoryToOperations() {
for (auto& node : nodes_) {
const auto& first_range = node.ranges[0];
for (int k = first_range.x; k < first_range.y; ++k) {
auto id = node.inputs[k];
const auto& it = tensors_.find(id);
node.operations[0]->SetSrc(&it->second, k - first_range.x);
}
for (int i = 1; i < node.ranges.size(); ++i) {
const auto& range = node.ranges[i];
for (int k = range.x; k < range.y; ++k) {
auto id = node.inputs[k];
const auto& it = tensors_.find(id);
node.operations[i]->SetSrc(&it->second, k - range.x + 1);
}
}
for (int i = 0; i < node.outputs.size(); ++i) {
auto id = node.outputs[i];
const auto& it = tensors_.find(id);
node.operations[0]->SetDst(&it->second, i);
}
}
}
Status InferenceContext::Compile(const CreationContext& creation_context) {
for (auto& node : nodes_) {
RETURN_IF_ERROR(node.operations[0]->Compile(creation_context));
}
return OkStatus();
}
Status InferenceContext::Tune(const TuningParameters& tuning_parameters) {
for (auto& node : nodes_) {
RETURN_IF_ERROR(node.operations[0]->Tune(tuning_parameters));
}
return OkStatus();
}
Status InferenceContext::AddToQueue(CLCommandQueue* queue) {
if (need_manual_release_) {
if (prev_enqueue_start_point_.is_valid()) {
prev_enqueue_start_point_.Wait();
}
RETURN_IF_ERROR(queue->EnqueueEvent(&prev_enqueue_start_point_));
}
for (auto& node : nodes_) {
RETURN_IF_ERROR(node.operations[0]->AddToQueue(queue));
}
if (need_flush_) {
clFlush(queue->queue());
}
return OkStatus();
}
Status InferenceContext::Profile(ProfilingCommandQueue* queue,
ProfilingInfo* result) {
queue->ResetMeasurements();
for (auto& node : nodes_) {
queue->SetEventsLabel(node.name);
RETURN_IF_ERROR(node.operations[0]->AddToQueue(queue));
}
RETURN_IF_ERROR(queue->WaitForCompletion());
*result = queue->GetProfilingInfo();
return OkStatus();
}
Tensor* InferenceContext::GetTensor(ValueId id) {
return &tensors_[remap_from_graph_ids_to_shared_[id]];
}
Status InferenceContext::SetInputTensor(ValueId id, const TensorFloat32& tensor,
CLCommandQueue* queue) {
return GetTensor(id)->WriteData(queue, tensor);
}
Status InferenceContext::GetOutputTensor(ValueId id, CLCommandQueue* queue,
TensorFloat32* result) {
const auto& gpu_tensor = *GetTensor(id);
const int4 dst_size = gpu_tensor.GetSizeWithDepth();
const auto dst_shape = BHWC(1, dst_size.y, dst_size.x, dst_size.z);
result->id = id;
result->shape = dst_shape;
result->data.resize(dst_shape.DimensionsProduct());
return gpu_tensor.ReadData(queue, result);
}
Status RunGraphTransforms(GraphFloat32* graph) {
auto merge_padding_transform = NewMergePaddingWithAdd();
auto add_bias_transform = NewAddBias();
ModelTransformer transformer(graph, /*reporter=*/nullptr);
if (!transformer.Apply("add_bias", add_bias_transform.get())) {
return InternalError("Invalid add_bias transform");
}
if (!transformer.Apply("merge_padding", merge_padding_transform.get())) {
return InternalError("Invalid merge_padding transform");
}
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,131 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_
#include <cstdint>
#include <map>
#include <memory>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include "tensorflow/lite/delegates/gpu/cl/environment.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/model_hints.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
namespace tflite {
namespace gpu {
namespace cl {
struct CLNode {
std::vector<std::unique_ptr<GPUOperation>> operations;
std::vector<ValueId> inputs;
std::vector<ValueId> outputs;
// Since a CLNode can contain several operations, ranges keep the range of
// input ids for every operation.
std::vector<int2> ranges;
// Mostly for debug purposes.
std::string name;
CLNode() = default;
CLNode(CLNode&& node);
CLNode& operator=(CLNode&& node);
CLNode(const CLNode&) = delete;
CLNode& operator=(const CLNode&) = delete;
};
class InferenceContext {
public:
struct CreateInferenceInfo {
CalculationsPrecision precision;
TensorStorageType storage_type;
ModelHints hints;
};
Status InitFromGraph(const CreateInferenceInfo& create_info,
const GraphFloat32& graph, Environment* env);
// Applies OpenCL-specific transformations to the graph before the
// initialization. These transformations are either impossible or useless in
// other backends.
Status InitFromGraphWithTransforms(const CreateInferenceInfo& create_info,
GraphFloat32* graph, Environment* env);
Status AddToQueue(CLCommandQueue* queue);
Status Profile(ProfilingCommandQueue* queue, ProfilingInfo* result);
Status SetInputTensor(ValueId id, const TensorFloat32& tensor,
CLCommandQueue* queue);
// This works only with input/output tensor ids; for all other ids there are
// no guarantees.
Tensor* GetTensor(ValueId id);
Status GetOutputTensor(ValueId id, CLCommandQueue* queue,
TensorFloat32* result);
private:
void CopyInAndOutIds(const GraphFloat32& graph);
Status ConvertOperations(const CreationContext& creation_context,
const GraphFloat32& graph, ModelHints hints);
void CreateLinks();
void Merge();
Status AllocateMemory(const GraphFloat32& graph, const CLDevice& device,
CLContext* context);
void BindMemoryToOperations();
Status Compile(const CreationContext& creation_context);
Status Tune(const TuningParameters& tuning_parameters);
// performance hacks
bool need_flush_ = false;
// In order to reduce memory leaks on Mali, the pipeline needs to be
// synchronized with the CPU to prevent the internal global OpenCL kernel pool
// from growing. One trick is to enqueue an event from a previous run. Most of
// the time it should already have executed on the GPU and should not stall
// the pipeline.
bool need_manual_release_ = false;
CLEvent prev_enqueue_start_point_;
CalculationsPrecision precision_;
TensorStorageType storage_type_;
// Nodes directly mapped from the graph, though some of them are inactive due
// to fusion (inactive = fused).
// Memory is allocated only once, in ConvertOperations, and is not modified
// anywhere.
std::vector<CLNode> nodes_;
std::map<ValueId, Tensor> tensors_;
std::map<ValueId, ValueId> remap_from_graph_ids_to_shared_;
std::vector<ValueId> input_ids_;
std::vector<ValueId> output_ids_;
};
// Runs OpenCL-specific transforms for the graph.
Status RunGraphTransforms(GraphFloat32* graph);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_INFERENCE_CONTEXT_H_
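A hedged sketch of driving the InferenceContext above end to end (illustrative, not part of the original file). It assumes a fully built GraphFloat32 together with valid input/output ValueIds and an Environment created elsewhere; F16 is chosen only when the device reports support for it.

#include "tensorflow/lite/delegates/gpu/cl/inference_context.h"

namespace tflite {
namespace gpu {
namespace cl {

// Illustrative helper: initializes the context, uploads one input, runs the
// graph once, and downloads one output.
Status RunOnce(GraphFloat32* graph, Environment* env, ValueId input_id,
               const TensorFloat32& input, ValueId output_id,
               TensorFloat32* output) {
  InferenceContext::CreateInferenceInfo create_info;
  create_info.precision = env->IsSupported(CalculationsPrecision::F16)
                              ? CalculationsPrecision::F16
                              : CalculationsPrecision::F32;
  create_info.storage_type = GetOptimalStorageType(env->device());

  InferenceContext context;
  RETURN_IF_ERROR(
      context.InitFromGraphWithTransforms(create_info, graph, env));
  RETURN_IF_ERROR(context.SetInputTensor(input_id, input, env->queue()));
  RETURN_IF_ERROR(context.AddToQueue(env->queue()));
  return context.GetOutputTensor(output_id, env->queue(), output);
}

}  // namespace cl
}  // namespace gpu
}  // namespace tflite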

File diff suppressed because it is too large

View File

@ -0,0 +1,48 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/abs.h"
#include <string>
#include "absl/strings/str_cat.h"
namespace tflite {
namespace gpu {
namespace cl {
Abs::Abs(Abs&& operation) : ElementwiseOperation(std::move(operation)) {}
Abs& Abs::operator=(Abs&& operation) {
if (this != &operation) {
ElementwiseOperation::operator=(std::move(operation));
}
return *this;
}
std::string Abs::GetCoreCode(const std::string& src, const std::string& z_coord,
const std::string& address) const {
return absl::StrCat(src, " = fabs(", src, ");\n");
}
Abs CreateAbs(const OperationDef& definition) {
Abs operation(definition);
operation.SetLinkIndex(0);
return operation;
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,48 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_ABS_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_ABS_H_
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
namespace tflite {
namespace gpu {
namespace cl {
class Abs : public ElementwiseOperation {
public:
explicit Abs(const OperationDef& definition)
: ElementwiseOperation(definition) {}
// Move only
Abs(Abs&& operation);
Abs& operator=(Abs&& operation);
Abs(const Abs&) = delete;
Abs& operator=(const Abs&) = delete;
std::string GetCoreCode(const std::string& src, const std::string& z_coord,
const std::string& address) const override;
};
Abs CreateAbs(const OperationDef& definition);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_ABS_H_

View File

@ -0,0 +1,60 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/abs.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, Abs) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {half(0.0f), half(-1.0f), half(-0.05f), half(0.045f)};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Abs operation = CreateAbs(op_def);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(0.0f), {half(0.0f), half(1.0f),
half(0.05f), half(0.045f)}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,193 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/add.h"
#include "absl/strings/str_cat.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
bool HasTexture2DStorageType(const OperationDef& def) {
for (auto& src_tensor : def.src_tensors) {
if (src_tensor.storage_type == TensorStorageType::TEXTURE_2D) {
return true;
}
}
return false;
}
} // namespace
std::string Add::GetElementWiseCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const std::vector<ElementwiseOperation*>& linked_operations) {
TensorCodeGenerator src_tensor("src_data", "dst_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
std::string c = GetCommonDefines(precision);
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ);
c += GetArgsDeclaration();
c += ::tflite::gpu::cl::GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 dst_size\n";
c += ") {\n";
c += " int X = get_global_id(0);\n";
c += " int Y = get_global_id(1);\n";
c += " int Z = get_global_id(2);\n";
c += " if (X >= dst_size.x || Y >= dst_size.y) { \n";
c += " return; \n";
c += " } \n";
c += " FLT4 src = (FLT4)(0.0);\n";
c += " " + dst_tensor.GetAddress("address", "X", "Y", "Z") + "\n";
if (src_depthes_[0] != dst_depth_) {
c += " if (Z < " + std::to_string(src_depthes_[0]) + ") {\n";
if (src_descriptor.storage_type == TensorStorageType::TEXTURE_2D) {
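// For TEXTURE_2D storage the slice index is folded into the texture y
// coordinate (y_tex = Y * depth + Z). Because this source has a different
// depth than dst, recover Y as (address.y - Z) * inv_divisor (inv_divisor is
// 1 / dst_depth, bound in BindArguments) and rebuild the address using this
// tensor's own depth.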
c += " float t_y = address.y - Z; \n";
c += " int ti_y = (t_y + 0.5) * " + inv_divisor_name_ + "; \n";
c += " int2 tmp_add = (int2)(address.x, ti_y * " +
std::to_string(src_depthes_[0]) + " + Z);\n";
c += " src += " + src_tensor.Read3D("tmp_add") + ";\n";
} else {
c += " src += " + src_tensor.Read3D("address") + ";\n";
}
c += " }\n";
} else {
c += " src += " + src_tensor.Read3D("address") + ";\n";
}
c += " " + GetCoreCode("src", "Z", "address");
c += PostProcess(linked_operations, "src", "Z", "address");
c += " " + dst_tensor.Write3D("src", "address") + "\n";
c += "} \n";
return c;
}
Add::Add(const OperationDef& definition, const std::vector<int>& channels,
int dst_channels)
: ElementwiseOperation(definition),
dst_depth_(IntegralDivideRoundUp(dst_channels, 4)) {
src_depthes_.resize(channels.size());
for (int i = 0; i < channels.size(); ++i) {
src_depthes_[i] = IntegralDivideRoundUp(channels[i], 4);
}
}
Add::Add(Add&& operation)
: ElementwiseOperation(std::move(operation)),
link_index_(operation.link_index_),
inv_divisor_name_(std::move(operation.inv_divisor_name_)),
src_depthes_(std::move(operation.src_depthes_)),
dst_depth_(operation.dst_depth_) {}
Add& Add::operator=(Add&& operation) {
if (this != &operation) {
link_index_ = operation.link_index_;
inv_divisor_name_ = std::move(operation.inv_divisor_name_);
src_depthes_ = std::move(operation.src_depthes_);
dst_depth_ = operation.dst_depth_;
ElementwiseOperation::operator=(std::move(operation));
}
return *this;
}
void Add::SetLinkIndex(int index) {
inv_divisor_name_ = absl::StrCat("inv_divisor_", index);
link_index_ = index;
}
std::string Add::GetCoreCode(const std::string& src, const std::string& z_coord,
const std::string& address) const {
std::string result;
for (int i = 1; i < src_depthes_.size(); ++i) {
const std::string tensor_name =
absl::StrCat("src_data_", link_index_, "_", i);
TensorCodeGenerator src_tensor(tensor_name, "", definition_.src_tensors[i]);
if (src_depthes_[i] != dst_depth_) {
absl::StrAppend(&result, " if (", z_coord, " < ", src_depthes_[i],
") {\n");
if (definition_.src_tensors[i].storage_type ==
TensorStorageType::TEXTURE_2D) {
absl::StrAppend(&result, " float t_y = ", address, ".y - ", z_coord,
";\n");
absl::StrAppend(&result, " int ti_y = (t_y + 0.5) * ",
inv_divisor_name_, ";\n");
absl::StrAppend(&result, " int2 tmp_add = (int2)(", address,
".x, ti_y * ", src_depthes_[i], " + ", z_coord, ");\n");
absl::StrAppend(&result, " ", src,
" += ", src_tensor.Read3D("tmp_add"), ";\n");
} else {
absl::StrAppend(&result, " ", src,
" += ", src_tensor.Read3D(address), ";\n");
}
absl::StrAppend(&result, " }\n");
} else {
absl::StrAppend(&result, " ", src,
" += ", src_tensor.Read3D(address) + ";\n");
}
}
return result;
}
std::string Add::GetArgsDeclaration() const {
std::string args;
for (int i = 1; i < src_depthes_.size(); ++i) {
const std::string tensor_name =
absl::StrCat("src_data_", link_index_, "_", i);
TensorCodeGenerator src_tensor(tensor_name, "", definition_.src_tensors[i]);
absl::StrAppend(&args, ",\n", src_tensor.GetDeclaration(AccessType::READ));
}
if (HasTexture2DStorageType(definition_)) {
absl::StrAppend(&args, ",\n float ", inv_divisor_name_);
}
return args;
}
Status Add::BindArguments(CLKernel* kernel) {
for (int i = 1; i < src_depthes_.size(); ++i) {
RETURN_IF_ERROR(kernel->SetMemoryAuto(src_[i]->GetMemoryPtr()));
}
if (HasTexture2DStorageType(definition_)) {
float inv_divisor = 1.0f / static_cast<float>(dst_depth_);
RETURN_IF_ERROR(kernel->SetBytesAuto(inv_divisor));
}
return OkStatus();
}
Status Add::Compile(const CreationContext& creation_context) {
const auto code =
GetElementWiseCode(definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Add CreateAdd(const OperationDef& definition, const std::vector<int>& channels,
int dst_channels) {
Add operation(definition, channels, dst_channels);
operation.SetLinkIndex(0);
return operation;
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,72 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_ADD_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_ADD_H_
#include <string>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
// Add is derived from ElementwiseOperation, but it is more complicated than a
// usual elementwise op, which is why it has its own Compile. Add supports
// inputs with different channel counts (this makes it possible to remove a
// Padding operation that pads with zeroes in the Z dimension).
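// Example (illustrative): CreateAdd(def, /*channels=*/{6, 2}, /*dst_channels=*/6)
// adds a 2-channel tensor into the first two channels of a 6-channel tensor,
// leaving the remaining channels of the first input unchanged.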
class Add : public ElementwiseOperation {
public:
Add(const OperationDef& definition, const std::vector<int>& channels,
int dst_channels);
Status Compile(const CreationContext& creation_context) override;
// Move only
Add(Add&& operation);
Add& operator=(Add&& operation);
Add(const Add&) = delete;
Add& operator=(const Add&) = delete;
void SetLinkIndex(int index) override;
std::string GetCoreCode(const std::string& src, const std::string& z_coord,
const std::string& address) const override;
std::string GetArgsDeclaration() const override;
Status BindArguments(CLKernel* kernel) override;
private:
std::string GetElementWiseCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const std::vector<ElementwiseOperation*>& linked_operations);
int link_index_;
std::string inv_divisor_name_;
std::vector<int> src_depthes_;
int dst_depth_;
};
Add CreateAdd(const OperationDef& definition, const std::vector<int>& channels,
int dst_channels);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_ADD_H_

View File

@ -0,0 +1,124 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/add.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, AddTwoEqualTensors) {
TensorFloat32 src0, src1;
src0.shape = BHWC(1, 2, 1, 2);
src0.data = {0.0f, -1.0f, -0.05f, 0.045f};
src1.shape = BHWC(1, 2, 1, 2);
src1.data = {0.0f, 1.0f, -0.05f, -0.045f};
std::vector<int> channels = {2, 2};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Add operation = CreateAdd(op_def, channels, channels[0]);
ASSERT_OK(ExecuteGPUOperation({src0, src1}, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {0.0f, 0.0f, -0.1f, 0.0f}));
}
}
}
TEST_F(OpenCLOperationTest, AddFirstTensorHasMoreChannelsThanSecond) {
TensorFloat32 src0, src1;
src0.shape = BHWC(1, 2, 1, 6);
src0.data = {0.0f, -1.0f, -0.05f, 0.045f, 1.0f, -2.0f,
-1.05f, 1.045f, 2.0f, -3.0f, -2.05f, 2.045f};
src1.shape = BHWC(1, 2, 1, 2);
src1.data = {0.0f, 1.0f, -0.05f, -0.045f};
std::vector<int> channels = {6, 2};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Add operation = CreateAdd(op_def, channels, channels[0]);
ASSERT_OK(ExecuteGPUOperation({src0, src1}, creation_context_, &operation,
BHWC(1, 2, 1, 6), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps),
{0.0f, 0.0f, -0.05f, 0.045f, 1.0f, -2.0f, -1.1f,
1.0f, 2.0f, -3.0f, -2.05f, 2.045f}));
}
}
}
TEST_F(OpenCLOperationTest, AddFirstTensorHasLessChannelsThanSecond) {
TensorFloat32 src0, src1;
src1.shape = BHWC(1, 2, 1, 6);
src1.data = {0.0f, -1.0f, -0.05f, 0.045f, 1.0f, -2.0f,
-1.05f, 1.045f, 2.0f, -3.0f, -2.05f, 2.045f};
src0.shape = BHWC(1, 2, 1, 2);
src0.data = {0.0f, 1.0f, -0.05f, -0.045f};
std::vector<int> channels = {2, 6};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Add operation = CreateAdd(op_def, channels, 6);
ASSERT_OK(ExecuteGPUOperation({src0, src1}, creation_context_, &operation,
BHWC(1, 2, 1, 6), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps),
{0.0f, 0.0f, -0.05f, 0.045f, 1.0f, -2.0f, -1.1f,
1.0f, 2.0f, -3.0f, -2.05f, 2.045f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,136 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/apply_mask.h"
#include <string>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::string GetApplyMaskKernelCode(
const OperationDef& definition,
const std::vector<ElementwiseOperation*>& linked_operations) {
TensorCodeGenerator src("src_data", "src_size", definition.src_tensors[0]);
TensorCodeGenerator mask("src_mask", "src_size_1", definition.src_tensors[1]);
TensorCodeGenerator dst("dst_data", "dst_size", definition.dst_tensors[0]);
std::string c = GetCommonDefines(definition.precision);
c += "__kernel void main_function(\n";
c += src.GetDeclaration(AccessType::READ) + ",\n";
c += mask.GetDeclaration(AccessType::READ) + ",\n";
c += dst.GetDeclaration(AccessType::WRITE);
c += GetArgsDeclaration(linked_operations);
c += " int apply_mask_type,\n";
c += " int4 src_size,\n";
c += " int4 src_size_1,\n";
c += " int4 dst_size \n";
c += ") {\n";
c += " int X = get_global_id(0);\n";
c += " int Y = get_global_id(1);\n";
c += " int Z = get_global_id(2);\n";
c += " if (X >= dst_size.x || Y >= dst_size.y) return;\n";
c += " FLT4 result = " + src.Read3D("X", "Y", "Z") + ";\n";
c += " if (apply_mask_type == 1) {\n";
c += " result *= " + mask.Read3D("X", "Y", "Z") + ";\n";
c += " } else if (apply_mask_type == 2) {\n";
c += " result *= " + mask.Read3D("0", "0", "Z") + ";\n";
c += " } else {\n";
c += " result *= " + mask.Read3D("X", "Y", "0") + ".x;\n";
c += " }\n";
c += " " + dst.GetAddress("dst_adr", "X", "Y", "Z");
c += PostProcess(linked_operations, "result", "Z", "dst_adr");
c += " " + dst.Write3D("result", "dst_adr");
c += "}\n";
return c;
}
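// Selects how the mask is broadcast by the kernel generated above:
//   0 - single-channel mask: mask(X, Y, 0).x is applied to every channel;
//   1 - mask has the same spatial size as src: applied elementwise at (X, Y, Z);
//   2 - 1x1xC mask: mask(0, 0, Z) is broadcast over all spatial positions.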
int GetMaskType(int4 src_size, int4 mask_size) {
if (mask_size.z == 1) {
return 0;
} else if (src_size.x == mask_size.x && src_size.y == mask_size.y) {
return 1;
} else {
return 2;
}
}
} // namespace
ApplyMask::ApplyMask(ApplyMask&& operation)
: GPUOperation(std::move(operation)),
kernel_(std::move(operation.kernel_)),
work_group_size_(operation.work_group_size_) {}
ApplyMask& ApplyMask::operator=(ApplyMask&& operation) {
if (this != &operation) {
kernel_ = std::move(operation.kernel_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}
Status ApplyMask::Compile(const CreationContext& creation_context) {
const auto code = GetApplyMaskKernelCode(definition_, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status ApplyMask::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[1]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(int32_t(
GetMaskType(src_[0]->GetSizeWithDepth(), src_[1]->GetSizeWithDepth()))));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[1]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
return OkStatus();
}
int3 ApplyMask::GetGridSize() const {
return int3(dst_[0]->Width(), dst_[0]->Height(), dst_[0]->Depth());
}
Status ApplyMask::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
Status ApplyMask::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
ApplyMask CreateApplyMask(const OperationDef& definition) {
return ApplyMask(definition);
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,58 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_APPLY_MASK_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_APPLY_MASK_H_
#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
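// Multiplies a tensor by a mask. The mask may have the same shape as the
// source, a single channel (broadcast across channels) or shape 1x1xC
// (broadcast across spatial positions); the mode is chosen automatically from
// the tensor sizes (see GetMaskType in the .cc file).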
class ApplyMask : public GPUOperation {
public:
explicit ApplyMask(const OperationDef& definition)
: GPUOperation(definition) {}
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
ApplyMask(ApplyMask&& operation);
ApplyMask& operator=(ApplyMask&& operation);
ApplyMask(const ApplyMask&) = delete;
ApplyMask& operator=(const ApplyMask&) = delete;
private:
Status BindArguments();
int3 GetGridSize() const;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
ApplyMask CreateApplyMask(const OperationDef& definition);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_APPLY_MASK_H_

View File

@ -0,0 +1,124 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/apply_mask.h"
#include <memory>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, ApplyMaskOneChannel) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {-4.0f, -3.0f, -1.0f, 0.0f, 1.0f, 3.0f, 4.0f, 6.0f};
TensorFloat32 mask_tensor;
mask_tensor.shape = BHWC(1, 2, 2, 1);
mask_tensor.data = {2.0f, 0.5f, 1.0f, 0.0f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ApplyMask operation = CreateApplyMask(op_def);
ASSERT_OK(ExecuteGPUOperation({src_tensor, mask_tensor},
creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {-8.0f, -6.0f, -0.5f, 0.0f, 1.0f,
3.0f, 0.0f, 0.0f}));
}
}
}
TEST_F(OpenCLOperationTest, ApplyMaskEqualSizes) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {-4.0f, -3.0f, -1.0f, 0.0f, 1.0f, 3.0f, 4.0f, 6.0f};
TensorFloat32 mask_tensor;
mask_tensor.shape = BHWC(1, 2, 2, 2);
mask_tensor.data = {2.0f, 0.5f, 1.0f, 0.0f, 2.0f, 0.5f, 1.0f, 0.0f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ApplyMask operation = CreateApplyMask(op_def);
ASSERT_OK(ExecuteGPUOperation({src_tensor, mask_tensor},
creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {-8.0f, -1.5f, -1.0f, 0.0f, 2.0f,
1.5f, 4.0f, 0.0f}));
}
}
}
TEST_F(OpenCLOperationTest, ApplyMaskVector) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {-4.0f, -3.0f, -1.0f, 0.0f, 1.0f, 3.0f, 4.0f, 6.0f};
TensorFloat32 mask_tensor;
mask_tensor.shape = BHWC(1, 1, 1, 2);
mask_tensor.data = {2.0f, 0.5f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ApplyMask operation = CreateApplyMask(op_def);
ASSERT_OK(ExecuteGPUOperation({src_tensor, mask_tensor},
creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {-8.0f, -1.5f, -2.0f, 0.0f, 2.0f,
1.5f, 8.0f, 3.0f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,83 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
Status ExecuteGPUOperation(const std::vector<TensorFloat32>& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation,
const std::vector<BHWC>& dst_sizes,
const std::vector<TensorFloat32*>& dst_cpu) {
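// Uploads src_cpu into GPU tensors, compiles and runs the operation once,
// waits for completion, then reads the results back into dst_cpu.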
const OperationDef& op_def = operation->GetDefinition();
std::vector<Tensor> src(src_cpu.size());
for (int i = 0; i < src_cpu.size(); ++i) {
auto src_shape = src_cpu[i].shape;
RETURN_IF_ERROR(CreateTensor(
*creation_context.context, *creation_context.device, src_shape.w,
src_shape.h, src_shape.c, op_def.src_tensors[0].data_type,
op_def.src_tensors[0].storage_type, &src[i]));
RETURN_IF_ERROR(src[i].WriteData(creation_context.queue, src_cpu[i]));
operation->SetSrc(&src[i], i);
}
std::vector<Tensor> dst(dst_cpu.size());
for (int i = 0; i < dst_cpu.size(); ++i) {
auto dst_shape = dst_sizes[i];
RETURN_IF_ERROR(CreateTensor(
*creation_context.context, *creation_context.device, dst_shape.w,
dst_shape.h, dst_shape.c, op_def.dst_tensors[0].data_type,
op_def.dst_tensors[0].storage_type, &dst[i]));
operation->SetDst(&dst[i], i);
}
RETURN_IF_ERROR(operation->Compile(creation_context));
RETURN_IF_ERROR(operation->AddToQueue(creation_context.queue));
RETURN_IF_ERROR(creation_context.queue->WaitForCompletion());
for (int i = 0; i < dst_cpu.size(); ++i) {
dst_cpu[i]->shape = dst_sizes[i];
dst_cpu[i]->data = std::vector<float>(dst_sizes[i].DimensionsProduct(), 0);
RETURN_IF_ERROR(dst[i].ReadData(creation_context.queue, dst_cpu[i]));
}
return OkStatus();
}
Status ExecuteGPUOperation(const std::vector<TensorFloat32>& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation, const BHWC& dst_size,
TensorFloat32* result) {
return ExecuteGPUOperation(
std::vector<TensorFloat32>{src_cpu}, creation_context, operation,
std::vector<BHWC>{dst_size}, std::vector<TensorFloat32*>{result});
}
Status ExecuteGPUOperation(const TensorFloat32& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation, const BHWC& dst_size,
TensorFloat32* result) {
return ExecuteGPUOperation(std::vector<TensorFloat32>{src_cpu},
creation_context, operation, dst_size, result);
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,73 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CL_TEST_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CL_TEST_H_
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/environment.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
namespace tflite {
namespace gpu {
namespace cl {
#ifndef ASSERT_OK
#define ASSERT_OK(x) ASSERT_TRUE(x.ok());
#endif
class OpenCLOperationTest : public ::testing::Test {
public:
void SetUp() override {
ASSERT_OK(LoadOpenCL());
ASSERT_OK(CreateDefaultEnvironment(&env_));
creation_context_.device = env_.GetDevicePtr();
creation_context_.context = &env_.context();
creation_context_.queue = env_.queue();
creation_context_.cache = env_.program_cache();
}
protected:
Environment env_;
CreationContext creation_context_;
};
Status ExecuteGPUOperation(const TensorFloat32& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation, const BHWC& dst_size,
TensorFloat32* result);
Status ExecuteGPUOperation(const std::vector<TensorFloat32>& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation, const BHWC& dst_size,
TensorFloat32* result);
Status ExecuteGPUOperation(const std::vector<TensorFloat32>& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation,
const std::vector<BHWC>& dst_sizes,
const std::vector<TensorFloat32*>& dst_cpu);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CL_TEST_H_

View File

@ -0,0 +1,170 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/concat_xy.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/concat_z.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, ConcatWidth) {
TensorFloat32 src0, src1;
src0.shape = BHWC(1, 2, 1, 2);
src0.data = {half(0.0f), half(-1.0f), half(-0.05f), half(0.045f)};
src1.shape = BHWC(1, 2, 2, 2);
src1.data = {half(1.0f), half(-1.2f), half(-0.45f), half(1.045f),
half(1.1f), half(-1.3f), half(-0.55f), half(2.045f)};
ConcatAttributes attr;
attr.axis = Axis::WIDTH;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConcatXY operation = CreateConcatXY(op_def, attr, 2);
ASSERT_OK(ExecuteGPUOperation({src0, src1}, creation_context_, &operation,
BHWC(1, 2, 3, 2), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(0.0f),
{half(0.0f), half(-1.0f), half(1.0f), half(-1.2f),
half(-0.45f), half(1.045f), half(-0.05f), half(0.045f),
half(1.1f), half(-1.3f), half(-0.55f), half(2.045f)}));
}
}
}
TEST_F(OpenCLOperationTest, ConcatHeight) {
TensorFloat32 src0, src1;
src0.shape = BHWC(1, 2, 1, 2);
src0.data = {half(0.0f), half(-1.0f), half(-0.05f), half(0.045f)};
src1.shape = BHWC(1, 1, 1, 2);
src1.data = {half(1.0f), half(-1.2f)};
ConcatAttributes attr;
attr.axis = Axis::HEIGHT;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConcatXY operation = CreateConcatXY(op_def, attr, 2);
ASSERT_OK(ExecuteGPUOperation({src0, src1}, creation_context_, &operation,
BHWC(1, 3, 1, 2), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(0.0f), {half(0.0f), half(-1.0f), half(-0.05f),
half(0.045f), half(1.0f), half(-1.2f)}));
}
}
}
TEST_F(OpenCLOperationTest, ConcatChannels) {
TensorFloat32 src0, src1, src2;
src0.shape = BHWC(1, 2, 1, 1);
src0.data = {half(0.0f), half(-1.0f)};
src1.shape = BHWC(1, 2, 1, 2);
src1.data = {half(1.0f), half(2.0f), half(3.0f), half(4.0f)};
src2.shape = BHWC(1, 2, 1, 3);
src2.data = {half(5.0f), half(6.0f), half(7.0f),
half(8.0f), half(9.0), half(10.0f)};
ConcatAttributes attr;
attr.axis = Axis::CHANNELS;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConcatZ operation = CreateConcatZ(op_def, {1, 2, 3});
ASSERT_OK(ExecuteGPUOperation({src0, src1, src2}, creation_context_,
&operation, BHWC(1, 2, 1, 6), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(0.0f),
{half(0.0f), half(1.0f), half(2.0f), half(5.0f),
half(6.0f), half(7.0f), half(-1.0f), half(3.0f),
half(4.0f), half(8.0f), half(9.0), half(10.0f)}));
}
}
}
TEST_F(OpenCLOperationTest, ConcatChannelsAlignedx4) {
TensorFloat32 src0, src1;
src0.shape = BHWC(1, 2, 1, 4);
src0.data = {half(-1.0f), half(-2.0f), half(-3.0f), half(-4.0f),
half(1.0f), half(2.0f), half(3.0f), half(4.0f)};
src1.shape = BHWC(1, 2, 1, 4);
src1.data = {half(5.0f), half(6.0f), half(7.0f), half(8.0f),
half(-5.0f), half(-6.0f), half(-7.0f), half(-8.0f)};
ConcatAttributes attr;
attr.axis = Axis::CHANNELS;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConcatZ operation = CreateConcatZ(op_def, {4, 4});
ASSERT_OK(ExecuteGPUOperation({src0, src1}, creation_context_, &operation,
BHWC(1, 2, 1, 8), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(0.0f),
{half(-1.0f), half(-2.0f), half(-3.0f), half(-4.0f),
half(5.0f), half(6.0f), half(7.0f), half(8.0f), half(1.0f),
half(2.0f), half(3.0f), half(4.0f), half(-5.0f),
half(-6.0f), half(-7.0f), half(-8.0f)}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,164 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/concat_xy.h"
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::string GetConcatKernelCode(
const OperationDef& definition, int tensors_count,
const std::vector<ElementwiseOperation*>& linked_operations) {
std::vector<std::shared_ptr<TensorCodeGenerator>> srcs(tensors_count);
for (int i = 0; i < tensors_count; ++i) {
const std::string tensor_name = "src_data_" + std::to_string(i);
const std::string uniform_name = "src_size_" + std::to_string(i);
srcs[i] = std::shared_ptr<TensorCodeGenerator>(new TensorCodeGenerator(
tensor_name, uniform_name, definition.src_tensors[i]));
}
TensorCodeGenerator dst("dst_data", "dst_size", definition.dst_tensors[0]);
std::string c = GetCommonDefines(definition.precision);
c += "__kernel void main_function(\n";
for (const auto& src : srcs) {
c += src->GetDeclaration(AccessType::READ) + ",\n";
}
c += dst.GetDeclaration(AccessType::WRITE);
c += GetArgsDeclaration(linked_operations);
for (int i = 0; i < tensors_count; ++i) {
const std::string uniform_name = "src_size_" + std::to_string(i);
c += " int4 " + uniform_name + ",\n";
}
for (int i = 0; i < tensors_count; ++i) {
const std::string uniform_name = "dst_offset_" + std::to_string(i);
c += " int2 " + uniform_name + ",\n";
}
c += " int4 dst_size \n";
c += ") {\n";
c += " int X = get_global_id(0);\n";
c += " int Y = get_global_id(1);\n";
c += " int Z = get_global_id(2);\n";
for (int i = 0; i < tensors_count; ++i) {
const std::string offset_name = "dst_offset_" + std::to_string(i);
const std::string size_name = "src_size_" + std::to_string(i);
c += " if (X < " + size_name + ".x && Y < " + size_name + ".y) { \n";
c += " FLT4 result = " + srcs[i]->Read3D("X", "Y", "Z") + ";\n";
c += " int dst_x = X + " + offset_name + ".x;\n";
c += " int dst_y = Y + " + offset_name + ".y;\n";
c += " " + dst.GetAddress("dst_adr", "dst_x", "dst_y", "Z");
c += PostProcess(linked_operations, "result", "Z", "dst_adr");
c += " " + dst.Write3D("result", "dst_adr");
c += " } \n";
}
c += "}\n";
return c;
}
} // namespace
ConcatXY::ConcatXY(ConcatXY&& operation)
: GPUOperation(std::move(operation)),
attr_(operation.attr_),
tensors_count_(operation.tensors_count_),
kernel_(std::move(operation.kernel_)),
work_group_size_(operation.work_group_size_) {}
ConcatXY& ConcatXY::operator=(ConcatXY&& operation) {
if (this != &operation) {
attr_ = operation.attr_;
tensors_count_ = operation.tensors_count_;
kernel_ = std::move(operation.kernel_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}
Status ConcatXY::Compile(const CreationContext& creation_context) {
const auto code =
GetConcatKernelCode(definition_, tensors_count_, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status ConcatXY::BindArguments() {
kernel_.ResetBindingCounter();
for (int i = 0; i < tensors_count_; ++i) {
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[i]->GetMemoryPtr()));
}
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
int max_src_width = 0;
int max_src_height = 0;
for (int i = 0; i < tensors_count_; ++i) {
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[i]->GetSizeWithDepth()));
max_src_width = std::max(max_src_width, src_[i]->Width());
max_src_height = std::max(max_src_height, src_[i]->Height());
}
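// Each source is written at a running offset along the concatenation axis;
// the offset along the other axis stays zero.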
int x_offset = 0;
int y_offset = 0;
for (int i = 0; i < tensors_count_; ++i) {
RETURN_IF_ERROR(kernel_.SetBytesAuto(int2(x_offset, y_offset)));
x_offset += attr_.axis == Axis::WIDTH ? src_[i]->Width() : 0;
y_offset += attr_.axis == Axis::HEIGHT ? src_[i]->Height() : 0;
}
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
return OkStatus();
}
int3 ConcatXY::GetGridSize() const {
int max_src_width = 0;
int max_src_height = 0;
for (int i = 0; i < tensors_count_; ++i) {
max_src_width = std::max(max_src_width, src_[i]->Width());
max_src_height = std::max(max_src_height, src_[i]->Height());
}
const int grid_x = max_src_width;
const int grid_y = max_src_height;
const int grid_z = dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
}
Status ConcatXY::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
Status ConcatXY::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
ConcatXY CreateConcatXY(const OperationDef& definition,
const ConcatAttributes& attr, int tensors_count) {
return ConcatXY(definition, attr, tensors_count);
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,62 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONCAT_XY_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONCAT_XY_H_
#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class ConcatXY : public GPUOperation {
public:
ConcatXY(const OperationDef& definition, const ConcatAttributes& attr,
int tensors_count)
: GPUOperation(definition), attr_(attr), tensors_count_(tensors_count) {}
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
ConcatXY(ConcatXY&& operation);
ConcatXY& operator=(ConcatXY&& operation);
ConcatXY(const ConcatXY&) = delete;
ConcatXY& operator=(const ConcatXY&) = delete;
private:
Status BindArguments();
int3 GetGridSize() const;
ConcatAttributes attr_;
int tensors_count_;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
ConcatXY CreateConcatXY(const OperationDef& definition,
const ConcatAttributes& attr, int tensors_count);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONCAT_XY_H_

View File

@ -0,0 +1,216 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/concat_z.h"
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
bool IsAllChannelsX4(const std::vector<int>& channels) {
for (int channel : channels) {
if (channel % 4 != 0) {
return false;
}
}
return true;
}
std::string GetConcatKernelCode(
const OperationDef& definition, const std::vector<int>& channels,
const std::vector<ElementwiseOperation*>& linked_operations) {
std::vector<std::shared_ptr<TensorCodeGenerator>> srcs(channels.size());
for (int i = 0; i < channels.size(); ++i) {
const std::string tensor_name = "src_data_" + std::to_string(i);
const std::string uniform_name = "src_size_" + std::to_string(i);
srcs[i] = std::shared_ptr<TensorCodeGenerator>(new TensorCodeGenerator(
tensor_name, uniform_name, definition.src_tensors[i]));
}
TensorCodeGenerator dst("dst_data", "dst_size", definition.dst_tensors[0]);
std::string code = GetCommonDefines(definition.precision);
const std::string postfix[] = {".x", ".y", ".z", ".w"};
code += "__kernel void main_function(\n";
for (const auto& src : srcs) {
code += src->GetDeclaration(AccessType::READ) + ",\n";
}
code += dst.GetDeclaration(AccessType::WRITE);
code += GetArgsDeclaration(linked_operations);
for (int i = 0; i < channels.size(); ++i) {
const std::string uniform_name = "src_size_" + std::to_string(i);
code += " int4 " + uniform_name + ",\n";
}
code += " int4 dst_size\n";
code += ") {\n";
code += " int X = get_global_id(0);\n";
code += " int Y = get_global_id(1);\n";
code += " if (X >= dst_size.x || Y >= dst_size.y) { \n";
code += " return; \n";
code += " } \n";
if (IsAllChannelsX4(channels)) {
// When all channels % 4 == 0 we can read/assign/write whole FLT4 elements.
// It also lets us emit a loop instead of fully unrolled code, which keeps
// the generated kernel short.
code += " int Z = 0;\n";
for (int i = 0; i < channels.size(); ++i) {
const std::string uniform_name = "src_size_" + std::to_string(i);
const int depth = IntegralDivideRoundUp(channels[i], 4);
if (depth % 2 == 0) {
// When depth % 2 == 0 we can read two slices per loop iteration, which
// helps hide read latency.
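// Rendered OpenCL (schematically):
//   for (int i = 0; i < src_size_k.w; i += 2) {
//     FLT4 result0 = <read slice i>;  FLT4 result1 = <read slice i + 1>;
//     <post-process and write at Z and Z + 1>;  Z += 2;
//   }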
code += " for (int i = 0; i < " + uniform_name + ".w; i += 2) {\n";
code += " FLT4 result0 = " + srcs[i]->Read3D("X", "Y", "i") + ";\n";
code +=
" FLT4 result1 = " + srcs[i]->Read3D("X", "Y", "i + 1") + ";\n";
code += " " + dst.GetAddress("dst_adr0", "X", "Y", "Z") + "\n";
code += " " + dst.GetAddress("dst_adr1", "X", "Y", "Z + 1") + "\n";
code += PostProcess(linked_operations, "result0", "Z", "dst_adr0");
code += PostProcess(linked_operations, "result1", "Z + 1", "dst_adr1");
code += " " + dst.Write3D("result0", "dst_adr0");
code += " " + dst.Write3D("result1", "dst_adr1");
code += " Z += 2;\n";
code += " }\n";
} else {
code += " for (int i = 0; i < " + uniform_name + ".w; ++i) {\n";
code += " FLT4 result = " + srcs[i]->Read3D("X", "Y", "i") + ";\n";
code += " " + dst.GetAddress("dst_adr", "X", "Y", "Z") + "\n";
code += PostProcess(linked_operations, "result", "Z", "dst_adr");
code += " " + dst.Write3D("result", "dst_adr");
code += " Z++;\n";
code += " }\n";
}
}
} else {
code += " FLT4 result = (FLT4)(0.0);\n";
int out_channel = 0;
int read_index = 0;
int z = 0;
for (int i = 0; i < channels.size(); ++i) {
const int depth = IntegralDivideRoundUp(channels[i], 4);
for (int d = 0; d < depth; ++d) {
const int channels_in_group = std::min(4, channels[i] - d * 4);
const std::string temp_name = "t" + std::to_string(read_index);
code += " FLT4 " + temp_name + " = ";
code += srcs[i]->Read3D("X", "Y", std::to_string(d)) + ";\n";
for (int c = 0; c < channels_in_group; ++c) {
code += " result" + postfix[out_channel] + " = ";
code += temp_name + postfix[c] + ";\n";
out_channel++;
if (out_channel == 4) {
out_channel = 0;
code += " {\n";
code += " " +
dst.GetAddress("dst_adr", "X", "Y", std::to_string(z)) +
"\n";
code += PostProcess(linked_operations, "result", std::to_string(z),
"dst_adr");
code += " " + dst.Write3D("result", "dst_adr");
code += " }\n";
z++;
}
}
read_index++;
}
}
if (out_channel != 0) {
code += " {\n";
code +=
" " + dst.GetAddress("dst_adr", "X", "Y", std::to_string(z)) + "\n";
code += PostProcess(linked_operations, "result", std::to_string(z),
"dst_adr");
code += " " + dst.Write3D("result", "dst_adr");
code += " }\n";
}
}
code += "}\n";
return code;
}
} // namespace
ConcatZ::ConcatZ(ConcatZ&& kernel)
: GPUOperation(std::move(kernel)),
channels_(std::move(kernel.channels_)),
kernel_(std::move(kernel.kernel_)),
work_group_size_(kernel.work_group_size_) {}
ConcatZ& ConcatZ::operator=(ConcatZ&& kernel) {
if (this != &kernel) {
channels_ = std::move(kernel.channels_);
kernel_ = std::move(kernel.kernel_);
std::swap(work_group_size_, kernel.work_group_size_);
GPUOperation::operator=(std::move(kernel));
}
return *this;
}
Status ConcatZ::Compile(const CreationContext& creation_context) {
const auto code =
GetConcatKernelCode(definition_, channels_, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status ConcatZ::BindArguments() {
kernel_.ResetBindingCounter();
for (int i = 0; i < channels_.size(); ++i) {
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[i]->GetMemoryPtr()));
}
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
for (int i = 0; i < channels_.size(); ++i) {
int4 size(src_[i]->Width(), src_[i]->Height(), channels_[i],
IntegralDivideRoundUp(channels_[i], 4));
RETURN_IF_ERROR(kernel_.SetBytesAuto(size));
}
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
return OkStatus();
}
int3 ConcatZ::GetGridSize() const {
const int grid_x = dst_[0]->Width();
const int grid_y = dst_[0]->Height();
const int grid_z = 1;
return int3(grid_x, grid_y, grid_z);
}
Status ConcatZ::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
Status ConcatZ::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
ConcatZ CreateConcatZ(const OperationDef& definition,
const std::vector<int>& channels) {
return ConcatZ(definition, channels);
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,63 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONCAT_Z_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONCAT_Z_H_
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class ConcatZ : public GPUOperation {
public:
ConcatZ(const OperationDef& definition, const std::vector<int>& channels)
: GPUOperation(definition), channels_(channels) {}
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
ConcatZ(ConcatZ&& kernel);
ConcatZ& operator=(ConcatZ&& kernel);
ConcatZ(const ConcatZ&) = delete;
ConcatZ& operator=(const ConcatZ&) = delete;
private:
Status BindArguments();
int3 GetGridSize() const;
std::vector<int> channels_;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
ConcatZ CreateConcatZ(const OperationDef& definition,
const std::vector<int>& channels);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONCAT_Z_H_

View File

@ -0,0 +1,281 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer.h"
#include <string>
#include <utility>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::string GenerateConvBuffer(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
int x_elements, int y_elements,
const std::vector<ElementwiseOperation*>& linked_operations) {
std::string c = GetCommonDefines(precision);
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
switch (precision) {
case CalculationsPrecision::F32:
case CalculationsPrecision::F16:
c += "#define CONV(R, S) \\\n";
c += "R += S.x * f0.s0123; \\\n";
c += "R += S.y * f0.s4567; \\\n";
c += "R += S.z * f0.s89ab; \\\n";
c += "R += S.w * f0.scdef; \n";
break;
case CalculationsPrecision::F32_F16:
c += "#define CONV(R, S) \\\n";
c += "R += convert_float4(S.x * f0.s0123 + S.y * f0.s4567 + S.z * "
"f0.s89ab + S.w * f0.scdef);\n";
break;
}
switch (precision) {
case CalculationsPrecision::F32:
c += "#define FLT16 float16\n";
break;
case CalculationsPrecision::F32_F16:
case CalculationsPrecision::F16:
c += "#define FLT16 half16\n";
break;
}
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
c += " __global FLT16* filters_buffer, \n";
c += " __global FLT4* biases \n";
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 src_size, \n";
c += " int4 dst_size, \n";
c += " int2 kernel_size, \n";
c += " int2 dillation, \n";
c += " int2 stride, \n";
c += " int2 padding \n";
c += ") {\n";
c += " int X = get_global_id(0) * " + std::to_string(x_elements) + ";\n";
c += " int Y = get_global_id(1) * " + std::to_string(y_elements) + ";\n";
c += " int Z = get_global_id(2);\n";
c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.w) return;\n";
c += " __global FLT16* temp = filters_buffer + Z * src_size.w * "
"kernel_size.x * kernel_size.y;\n";
c += " ACCUM_FLT4 bias_val = TO_ACCUM_TYPE(biases[Z]);\n";
for (int i = 0; i < x_elements * y_elements; ++i) {
c += " ACCUM_FLT4 r" + std::to_string(i) + " = bias_val;\n";
}
for (int x = 0; x < x_elements; ++x) {
std::string x_s = std::to_string(x);
c += " int xc" + x_s + " = (X + " + x_s + ") * stride.x + padding.x;\n";
}
for (int y = 0; y < y_elements; ++y) {
std::string y_s = std::to_string(y);
c += " int yc" + y_s + " = (Y + " + y_s + ") * stride.y + padding.y;\n";
}
c += " for (int y = 0; y < kernel_size.y; ++y) {\n";
for (int y = 0; y < y_elements; ++y) {
std::string y_s = std::to_string(y);
c += " int c" + y_s + "y = y * dillation.y + yc" + y_s + ";\n";
c += " bool y" + y_s + "_in = c" + y_s + "y >= 0 && c" + y_s +
"y < src_size.y;\n";
c += " c" + y_s + "y = clamp(c" + y_s + "y, 0, src_size.y - 1);\n";
}
c += " for (int x = 0; x < kernel_size.x; ++x) {\n";
for (int x = 0; x < x_elements; ++x) {
std::string x_s = std::to_string(x);
c += " int c" + x_s + "x = x * dillation.x + xc" + x_s + ";\n";
c += " bool x" + x_s + "_in = c" + x_s + "x >= 0 && c" + x_s +
"x < src_size.x;\n";
c += " c" + x_s + "x = clamp(c" + x_s + "x, 0, src_size.x - 1);\n";
}
for (int x = 0; x < x_elements; ++x) {
std::string x_s = std::to_string(x);
for (int y = 0; y < y_elements; ++y) {
std::string y_s = std::to_string(y);
std::string i_s = std::to_string(y * x_elements + x);
c += " int src_addr_" + i_s + " = c" + y_s + "y * src_size.x + c" + x_s +
"x;\n";
}
}
c += " for (int s = 0; s < src_size.w; ++s) {\n";
for (int x = 0; x < x_elements; ++x) {
std::string x_s = std::to_string(x);
for (int y = 0; y < y_elements; ++y) {
std::string y_s = std::to_string(y);
std::string i_s = std::to_string(y * x_elements + x);
c += " FLT4 s" + i_s + " = src_data[src_addr_" + i_s + "] * (FLT)(y" +
y_s + "_in && x" + x_s + "_in);\n";
}
}
c += " FLT16 f0 = temp[0];\n";
for (int i = 0; i < x_elements * y_elements; ++i) {
std::string i_s = std::to_string(i);
c += " CONV(r" + i_s + ", s" + i_s + ");\n";
}
for (int i = 0; i < x_elements * y_elements; ++i) {
std::string i_s = std::to_string(i);
c += " src_addr_" + i_s + " += src_size.z;\n";
}
c += " temp += 1;\n";
c += " }\n"; // src_size.w - SRC_DEPTH
c += " }\n"; // kernel_size.x
c += " }\n"; // kernel_size.y
for (int x = 0; x < x_elements; ++x) {
std::string x_s = std::to_string(x);
for (int y = 0; y < y_elements; ++y) {
std::string y_s = std::to_string(y);
std::string i_s = std::to_string(y * x_elements + x);
c += " if (X + " + x_s + " < dst_size.x && Y + " + y_s +
" < dst_size.y) {\n";
c += " FLT4 res = TO_FLT4(r" + i_s + ");\n";
c += " " +
dst_tensor.GetAddress("address", "X + " + x_s, "Y + " + y_s, "Z") +
"\n";
c += PostProcess(linked_operations, "res", "Z", "address");
c += " " + dst_tensor.Write3D("res", "address") + "\n";
c += " }\n";
}
}
c += "}\n";
return c;
}
} // namespace
ConvBuffer::ConvBuffer(const OperationDef& definition,
const Convolution2DAttributes& attr, int x_elements,
int y_elements)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
stride_(attr.strides.w, attr.strides.h),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h),
dilation_(attr.dilations.w, attr.dilations.h),
x_elements_(x_elements),
y_elements_(y_elements),
work_group_size_(4, 4, 4) {}
ConvBuffer::ConvBuffer(ConvBuffer&& operation)
: GPUOperation(std::move(operation)),
weights_(std::move(operation.weights_)),
biases_(std::move(operation.biases_)),
kernel_size_(operation.kernel_size_),
stride_(operation.stride_),
padding_(operation.padding_),
dilation_(operation.dilation_),
x_elements_(operation.x_elements_),
y_elements_(operation.y_elements_),
kernel_(std::move(operation.kernel_)),
work_group_size_(operation.work_group_size_) {}
ConvBuffer& ConvBuffer::operator=(ConvBuffer&& operation) {
if (this != &operation) {
weights_ = std::move(operation.weights_);
biases_ = std::move(operation.biases_);
std::swap(kernel_size_, operation.kernel_size_);
std::swap(stride_, operation.stride_);
std::swap(padding_, operation.padding_);
std::swap(dilation_, operation.dilation_);
std::swap(x_elements_, operation.x_elements_);
std::swap(y_elements_, operation.y_elements_);
kernel_ = std::move(operation.kernel_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}
Status ConvBuffer::Compile(const CreationContext& creation_context) {
std::string code = GenerateConvBuffer(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, x_elements_, y_elements_, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status ConvBuffer::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
int4 src_size = int4(src_[0]->Width(), src_[0]->Height(),
src_[0]->Width() * src_[0]->Height(), src_[0]->Depth());
int4 dst_size = int4(dst_[0]->Width(), dst_[0]->Height(),
dst_[0]->Width() * dst_[0]->Height(), dst_[0]->Depth());
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_size));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_size));
RETURN_IF_ERROR(kernel_.SetBytesAuto(kernel_size_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dilation_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(padding_));
return OkStatus();
}
int3 ConvBuffer::GetGridSize() const {
const int grid_x = IntegralDivideRoundUp(dst_[0]->Width(), x_elements_);
const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), y_elements_);
const int grid_z = dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
}
Status ConvBuffer::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroupConv(params, kernel_, GetGridSize(),
&work_group_size_);
}
Status ConvBuffer::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
Status CreateConvBuffer(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvBuffer* result) {
int x_elements = 2;
int y_elements = 1;
if (definition.precision != CalculationsPrecision::F16) {
x_elements = 1;
y_elements = 1;
}
*result = ConvBuffer(definition, attr, x_elements, y_elements);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
LinearStorageCreateInfo create_info;
create_info.storage_type = LinearStorageType::BUFFER;
create_info.data_type = definition.GetDataType();
create_info.aligned_size = attr.weights.shape.o;
RETURN_IF_ERROR(CreateLinearStorage(
create_info, attr.bias, creation_context.context, &result->biases_));
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite
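// Worked example (illustrative, not part of this commit) of how the dispatch
// grid above scales with the per-thread tile chosen in CreateConvBuffer. The
// 640x480x32 output shape is hypothetical; Depth() counts 4-channel slices.
const int x_elements = 2;                                   // F16 path above
const int y_elements = 1;
const int grid_x = IntegralDivideRoundUp(640, x_elements);  // 320
const int grid_y = IntegralDivideRoundUp(480, y_elements);  // 480
const int grid_z = IntegralDivideRoundUp(32, 4);            // 8 depth slices
// Each work item therefore computes a 2x1 patch of one depth slice.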

View File

@ -0,0 +1,116 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_BUFFER_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_BUFFER_H_
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class ConvBuffer : public GPUOperation {
public:
ConvBuffer() = default;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
ConvBuffer(ConvBuffer&& operation);
ConvBuffer& operator=(ConvBuffer&& operation);
ConvBuffer(const ConvBuffer&) = delete;
ConvBuffer& operator=(const ConvBuffer&) = delete;
private:
friend Status CreateConvBuffer(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvBuffer* result);
ConvBuffer(const OperationDef& definition,
const Convolution2DAttributes& attr, int x_elements,
int y_elements);
template <DataType T>
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
Status BindArguments();
int3 GetGridSize() const;
Buffer weights_;
LinearStorage biases_;
int2 kernel_size_;
int2 stride_;
int2 padding_;
int2 dilation_;
int x_elements_;
int y_elements_;
CLKernel kernel_;
int3 work_group_size_;
};
template <DataType T>
Status ConvBuffer::UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context) {
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int float4_size = definition_.precision == CalculationsPrecision::F32
? sizeof(float4)
: sizeof(half4);
const int elements_count =
weights.shape.h * weights.shape.w * src_depth * dst_depth * 4;
if (definition_.GetDataType() == DataType::FLOAT32) {
std::vector<float4> gpu_data(elements_count);
RearrangeWeightsToOHWI4I4O(weights, absl::MakeSpan(gpu_data));
return CreateReadOnlyBuffer(float4_size * elements_count, gpu_data.data(),
context, &weights_);
} else {
std::vector<half4> gpu_data(elements_count);
RearrangeWeightsToOHWI4I4O(weights, absl::MakeSpan(gpu_data));
return CreateReadOnlyBuffer(float4_size * elements_count, gpu_data.data(),
context, &weights_);
}
}
Status CreateConvBuffer(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvBuffer* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_BUFFER_H_
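// Back-of-the-envelope check (illustrative, not part of this commit) of the
// buffer size produced by ConvBuffer::UploadWeights for hypothetical
// OHWI(8, 3, 3, 16) weights at F32 precision:
const int dst_depth = IntegralDivideRoundUp(8, 4);             // 2
const int src_depth = IntegralDivideRoundUp(16, 4);            // 4
const int elements_count = 3 * 3 * src_depth * dst_depth * 4;  // 288 float4 values
const size_t bytes = elements_count * sizeof(float4);          // 288 * 16 = 4608 bytes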

View File

@ -0,0 +1,351 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h"
#include <array>
#include <string>
#include <utility>
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
// x_elements - number of elements processed per thread in the W dimension
// y_elements - number of elements processed per thread in the H dimension
// element_size must be 1, 2 or 4:
//   1 - FLT4
//   2 - FLT8
//   4 - FLT16
// This function generates code for the arithmetic part of the convolution.
std::string GetComputationPart(int x_elements, int y_elements, int element_size,
CalculationsPrecision precision) {
const std::string hexes[16] = {"0", "1", "2", "3", "4", "5", "6", "7",
"8", "9", "a", "b", "c", "d", "e", "f"};
std::string c;
for (int y = 0; y < y_elements; ++y) {
for (int x = 0; x < x_elements; ++x) {
std::string s_index = std::to_string(y * x_elements + x);
for (int e = 0; e < element_size; ++e) {
std::string r_index =
std::to_string((y * x_elements + x) * element_size + e);
switch (precision) {
case CalculationsPrecision::F32:
case CalculationsPrecision::F16:
c += " r" + r_index + " += f0.s0123 * s" + s_index + ".s" +
hexes[e * 4 + 0] + ";\n";
c += " r" + r_index + " += f0.s4567 * s" + s_index + ".s" +
hexes[e * 4 + 1] + ";\n";
c += " r" + r_index + " += f0.s89ab * s" + s_index + ".s" +
hexes[e * 4 + 2] + ";\n";
c += " r" + r_index + " += f0.scdef * s" + s_index + ".s" +
hexes[e * 4 + 3] + ";\n";
break;
case CalculationsPrecision::F32_F16:
c += " r" + r_index + " += convert_float4(f0.s0123 * s" +
s_index + ".s" + hexes[e * 4 + 0] + " + f0.s4567 * s" +
s_index + ".s" + hexes[e * 4 + 1] + " + f0.s89ab * s" +
s_index + ".s" + hexes[e * 4 + 2] + " + f0.scdef * s" +
s_index + ".s" + hexes[e * 4 + 3] + ");\n";
break;
}
}
}
}
return c;
}
std::string GetShiftFromElementSize(int element_size) {
if (element_size == 4) {
return " >> 2";
} else if (element_size == 2) {
return " >> 1";
} else {
return "";
}
}
std::string GenerateConvBuffer1x1(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
int x_elements, int y_elements, int element_size,
const std::vector<ElementwiseOperation*>& linked_operations) {
std::string c = GetCommonDefines(precision);
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
switch (precision) {
case CalculationsPrecision::F32:
c += "#define FLT8 float8\n";
c += "#define FLT16 float16\n";
break;
case CalculationsPrecision::F32_F16:
case CalculationsPrecision::F16:
c += "#define FLT8 half8\n";
c += "#define FLT16 half16\n";
break;
}
c += "__kernel void main_function(\n";
c += " __global FLT" + std::to_string(element_size * 4) + "* src_data,\n";
c += " __global FLT16* filters_buffer, \n";
c += " __global FLT4* biases \n";
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 src_size, \n";
c += " int4 dst_size \n";
c += ") {\n";
c += " int X = get_global_id(0) * " +
std::to_string(x_elements * element_size) + ";\n";
c += " int Y = get_global_id(1) * " + std::to_string(y_elements) + ";\n";
c += " int Z = get_global_id(2);\n";
c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.w) return;\n";
c += " __global FLT16* temp = filters_buffer + Z * src_size.w;\n";
c += " ACCUM_FLT4 bias_val = TO_ACCUM_TYPE(biases[Z]);\n";
for (int i = 0; i < x_elements * element_size * y_elements; ++i) {
c += " ACCUM_FLT4 r" + std::to_string(i) + " = bias_val;\n";
}
for (int x = 0; x < x_elements; ++x) {
std::string x_s = std::to_string(x);
c += " int xc" + x_s + " = min(X + " + std::to_string(x * element_size) +
", src_size.x - 1);\n";
}
for (int y = 0; y < y_elements; ++y) {
std::string y_s = std::to_string(y);
c += " int yc" + y_s + " = min(Y + " + y_s + ", src_size.y - 1);\n";
}
std::string shift = GetShiftFromElementSize(element_size);
for (int y = 0; y < y_elements; ++y) {
std::string y_s = std::to_string(y);
for (int x = 0; x < x_elements; ++x) {
std::string x_s = std::to_string(x);
std::string i_s = std::to_string(y * x_elements + x);
c += " int src_addr_" + i_s + " = ((yc" + y_s + ") * src_size.x + (xc" +
x_s + "))" + shift + ";\n";
}
}
c += " for (int s = 0; s < src_size.w; ++s) {\n";
for (int y = 0; y < y_elements; ++y) {
std::string y_s = std::to_string(y);
for (int x = 0; x < x_elements; ++x) {
std::string x_s = std::to_string(x);
std::string i_s = std::to_string(y * x_elements + x);
c += " FLT" + std::to_string(element_size * 4) + " s" + i_s +
" = src_data[src_addr_" + i_s + "];\n";
}
}
c += " FLT16 f0 = temp[0];\n";
c += GetComputationPart(x_elements, y_elements, element_size, precision);
for (int i = 0; i < x_elements * y_elements; ++i) {
std::string i_s = std::to_string(i);
c += " src_addr_" + i_s + " += src_size.z;\n";
}
c += " temp += 1;\n";
c += " }\n"; // src_size.w = SRC_DEPTH
for (int y = 0; y < y_elements; ++y) {
std::string y_s = std::to_string(y);
for (int x = 0; x < x_elements * element_size; ++x) {
std::string x_s = std::to_string(x);
std::string i_s = std::to_string(y * x_elements * element_size + x);
c += " if (X + " + x_s + " < dst_size.x && Y + " + y_s +
" < dst_size.y) {\n";
c += " FLT4 res = TO_FLT4(r" + i_s + ");\n";
c += " " +
dst_tensor.GetAddress("address", "X + " + x_s, "Y + " + y_s, "Z") +
"\n";
c += PostProcess(linked_operations, "res", "Z", "address");
c += " " + dst_tensor.Write3D("res", "address") + "\n";
c += " }\n";
}
}
c += "}\n";
return c;
}
int GetGridWidth(int width) {
if (width % 2 == 0) { // using kernel_flt8_
return width / 2;
} else { // using kernel_flt4_
return width;
}
}
} // namespace
ConvBuffer1x1::ConvBuffer1x1(const OperationDef& definition,
const Convolution2DAttributes& attr,
int flt4_x_count, int flt4_y_count,
int flt8_x_count, int flt8_y_count)
: GPUOperation(definition),
flt4_x_count_(flt4_x_count),
flt4_y_count_(flt4_y_count),
flt8_x_count_(flt8_x_count),
flt8_y_count_(flt8_y_count),
work_group_size_(2, 4, 1) {}
ConvBuffer1x1::ConvBuffer1x1(ConvBuffer1x1&& operation)
: GPUOperation(std::move(operation)),
weights_(std::move(operation.weights_)),
biases_(std::move(operation.biases_)),
kernel_flt4_(std::move(operation.kernel_flt4_)),
flt4_x_count_(operation.flt4_x_count_),
flt4_y_count_(operation.flt4_y_count_),
kernel_flt8_(std::move(operation.kernel_flt8_)),
flt8_x_count_(operation.flt8_x_count_),
flt8_y_count_(operation.flt8_y_count_),
work_group_size_(operation.work_group_size_) {}
ConvBuffer1x1& ConvBuffer1x1::operator=(ConvBuffer1x1&& operation) {
if (this != &operation) {
weights_ = std::move(operation.weights_);
biases_ = std::move(operation.biases_);
kernel_flt4_ = std::move(operation.kernel_flt4_);
std::swap(flt4_x_count_, operation.flt4_x_count_);
std::swap(flt4_y_count_, operation.flt4_y_count_);
kernel_flt8_ = std::move(operation.kernel_flt8_);
std::swap(flt8_x_count_, operation.flt8_x_count_);
std::swap(flt8_y_count_, operation.flt8_y_count_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}
Status ConvBuffer1x1::Compile(const CreationContext& creation_context) {
std::string code_flt4 = GenerateConvBuffer1x1(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, flt4_x_count_, flt4_y_count_, 1,
linked_operations_);
RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
code_flt4, "main_function", *creation_context.context,
*creation_context.device, &kernel_flt4_));
std::string code_flt8 = GenerateConvBuffer1x1(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, flt8_x_count_, flt8_y_count_, 2,
linked_operations_);
RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
code_flt8, "main_function", *creation_context.context,
*creation_context.device, &kernel_flt8_));
return OkStatus();
}
CLKernel* ConvBuffer1x1::GetKernel(int width) {
if (width % 2 == 0) {
return &kernel_flt8_;
} else {
return &kernel_flt4_;
}
}
Status ConvBuffer1x1::BindArguments() {
CLKernel* kernel = GetKernel(src_[0]->Width());
kernel->ResetBindingCounter();
RETURN_IF_ERROR(kernel->SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel->SetMemoryAuto(weights_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel->SetMemoryAuto(biases_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(kernel, linked_operations_));
RETURN_IF_ERROR(kernel->SetMemoryAuto(dst_[0]->GetMemoryPtr()));
int4 src_size = int4(src_[0]->Width(), src_[0]->Height(),
GetGridWidth(src_[0]->Width()) * src_[0]->Height(),
src_[0]->Depth());
int4 dst_size = int4(dst_[0]->Width(), dst_[0]->Height(),
dst_[0]->Width() * dst_[0]->Height(), dst_[0]->Depth());
RETURN_IF_ERROR(kernel->SetBytesAuto(src_size));
RETURN_IF_ERROR(kernel->SetBytesAuto(dst_size));
return OkStatus();
}
int3 ConvBuffer1x1::GetGridSize() const {
if (src_[0]->Width() % 2 == 0) { // using kernel_flt8_
const int grid_x =
IntegralDivideRoundUp(GetGridWidth(dst_[0]->Width()), flt8_x_count_);
const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), flt8_y_count_);
const int grid_z = dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
} else { // using kernel_flt4_
const int grid_x =
IntegralDivideRoundUp(GetGridWidth(dst_[0]->Width()), flt4_x_count_);
const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), flt4_y_count_);
const int grid_z = dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
}
}
Status ConvBuffer1x1::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroupConv(params, *GetKernel(src_[0]->Width()),
GetGridSize(), &work_group_size_);
}
Status ConvBuffer1x1::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(*GetKernel(src_[0]->Width()), GetGridSize(),
work_group_size_);
}
bool IsConvBuffer1x1Supported(const OperationDef& definition,
const Convolution2DAttributes& attr) {
auto src_storage_type = definition.src_tensors[0].storage_type;
return src_storage_type == TensorStorageType::BUFFER &&
attr.weights.shape.w == 1 && attr.weights.shape.h == 1 &&
attr.dilations.w == 1 && attr.dilations.h == 1 &&
attr.strides.w == 1 && attr.strides.h == 1 &&
attr.padding.prepended.w == 0 && attr.padding.prepended.h == 0 &&
attr.padding.appended.w == 0 && attr.padding.appended.h == 0;
}
Status CreateConvBuffer1x1(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvBuffer1x1* result) {
if (!IsConvBuffer1x1Supported(definition, attr)) {
return InvalidArgumentError("ConvBuffer1x1 is not supported");
}
int flt4_x_count = 1;
int flt4_y_count = 1;
int flt8_x_count = 1;
int flt8_y_count = 1;
if (creation_context.device->vendor() == Vendor::MALI) {
if (definition.precision == CalculationsPrecision::F16 &&
creation_context.device->GetInfo().compute_units_count <= 4) {
flt4_x_count = 2;
flt8_x_count = 2;
}
}
*result = ConvBuffer1x1(definition, attr, flt4_x_count, flt4_y_count,
flt8_x_count, flt8_y_count);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
LinearStorageCreateInfo create_info;
create_info.storage_type = LinearStorageType::BUFFER;
create_info.data_type = definition.GetDataType();
create_info.aligned_size = attr.weights.shape.o;
RETURN_IF_ERROR(CreateLinearStorage(
create_info, attr.bias, creation_context.context, &result->biases_));
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite
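// Worked example (illustrative, not part of this commit): for a hypothetical
// width of 10, GetKernel() above picks kernel_flt8_ (even width), so the X
// dimension shrinks twice: once because each column covers a FLT8 pair, and
// once because of flt8_x_count_.
const int width = 10;
const int flt8_x_count = 2;                                          // hypothetical tuning value
const int grid_width = width / 2;                                    // GetGridWidth() -> 5 FLT8 columns
const int grid_x = IntegralDivideRoundUp(grid_width, flt8_x_count);  // 3 work items along X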

View File

@ -0,0 +1,123 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_BUFFER_1X1_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_BUFFER_1X1_H_
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class ConvBuffer1x1 : public GPUOperation {
public:
ConvBuffer1x1() = default;
// Move only
ConvBuffer1x1(ConvBuffer1x1&& operation);
ConvBuffer1x1& operator=(ConvBuffer1x1&& operation);
ConvBuffer1x1(const ConvBuffer1x1&) = delete;
ConvBuffer1x1& operator=(const ConvBuffer1x1&) = delete;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
private:
friend Status CreateConvBuffer1x1(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvBuffer1x1* result);
ConvBuffer1x1(const OperationDef& definition,
const Convolution2DAttributes& attr, int flt4_x_count,
int flt4_y_count, int flt8_x_count, int flt8_y_count);
template <DataType T>
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
Status BindArguments();
int3 GetGridSize() const;
CLKernel* GetKernel(int width);
Buffer weights_;
LinearStorage biases_;
CLKernel kernel_flt4_;
int flt4_x_count_;
int flt4_y_count_;
CLKernel kernel_flt8_;
int flt8_x_count_;
int flt8_y_count_;
int3 work_group_size_;
};
template <DataType T>
Status ConvBuffer1x1::UploadWeights(
const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int float4_size = definition_.precision == CalculationsPrecision::F32
? sizeof(float4)
: sizeof(half4);
const int elements_count =
weights.shape.h * weights.shape.w * src_depth * dst_depth * 4;
if (definition_.GetDataType() == DataType::FLOAT32) {
std::vector<float4> gpu_data(elements_count);
RearrangeWeightsToOHWI4I4O(weights, absl::MakeSpan(gpu_data));
return CreateReadOnlyBuffer(float4_size * elements_count, gpu_data.data(),
context, &weights_);
} else {
std::vector<half4> gpu_data(elements_count);
RearrangeWeightsToOHWI4I4O(weights, absl::MakeSpan(gpu_data));
return CreateReadOnlyBuffer(float4_size * elements_count, gpu_data.data(),
context, &weights_);
}
}
bool IsConvBuffer1x1Supported(const OperationDef& definition,
const Convolution2DAttributes& attr);
Status CreateConvBuffer1x1(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvBuffer1x1* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_BUFFER_1X1_H_

View File

@ -0,0 +1,103 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, ConvBuffer1x1SimpleWeights) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 4);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Convolution2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(2, 1, 1, 4);
attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.0f, 0.0f};
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, TensorStorageType::BUFFER});
op_def.dst_tensors.push_back({data_type, TensorStorageType::BUFFER});
TensorFloat32 dst_tensor;
ConvBuffer1x1 operation;
ASSERT_OK(CreateConvBuffer1x1(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {6.0f, 6.0f, 22.0f, 22.0f}));
}
}
TEST_F(OpenCLOperationTest, ConvBuffer1x1) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 4);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Convolution2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(4, 1, 1, 4);
attr.weights.data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
attr.bias.shape = Linear(4);
attr.bias.data = {0.5f, -0.5f, 0.5f, -0.5f};
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, TensorStorageType::BUFFER});
op_def.dst_tensors.push_back({data_type, TensorStorageType::BUFFER});
TensorFloat32 dst_tensor;
ConvBuffer1x1 operation;
ASSERT_OK(CreateConvBuffer1x1(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 4), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {20.5f, 43.5f, 68.5f, 91.5f, 60.5f,
147.5f, 236.5f, 323.5f}));
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite
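// CPU reference (illustrative, not part of this commit) for the expected
// values in ConvBuffer1x1SimpleWeights: a 1x1 convolution is a per-pixel dot
// product over the 4 input channels, all weights are 1.0f and the bias is 0.
const float pixel0 = 0.0f + 1.0f + 2.0f + 3.0f;  // 6.0f for both output channels
const float pixel1 = 4.0f + 5.0f + 6.0f + 7.0f;  // 22.0f for both output channels
// which matches the expected {6.0f, 6.0f, 22.0f, 22.0f} above.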

View File

@ -0,0 +1,103 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, ConvBufferSimpleWeights) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Convolution2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(1, 1);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(1, 2, 2, 2);
attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
attr.bias.shape = Linear(1);
attr.bias.data = {0.0f};
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, TensorStorageType::BUFFER});
op_def.dst_tensors.push_back({data_type, TensorStorageType::BUFFER});
TensorFloat32 dst_tensor;
ConvBuffer operation;
ASSERT_OK(CreateConvBuffer(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {28.0f, 18.0f, 22.0f, 13.0f}));
}
}
TEST_F(OpenCLOperationTest, ConvBuffer) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Convolution2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(1, 1);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(2, 2, 2, 2);
attr.weights.data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.5f, -0.5f};
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, TensorStorageType::BUFFER});
op_def.dst_tensors.push_back({data_type, TensorStorageType::BUFFER});
TensorFloat32 dst_tensor;
ConvBuffer operation;
ASSERT_OK(CreateConvBuffer(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {168.5f, 391.5f, 80.5f, 223.5f, 60.5f,
235.5f, 20.5f, 123.5f}));
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite
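// CPU reference (illustrative, not part of this commit) for the expected
// values in ConvBufferSimpleWeights: with all-ones 2x2x2 weights, zero bias
// and appended padding HW(1, 1), each output is the sum of the input values
// that still fall inside its 2x2 window.
const float out00 = 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7;  // 28.0f, full window
const float out01 = 2 + 3 + 6 + 7;                  // 18.0f, right column padded
const float out10 = 4 + 5 + 6 + 7;                  // 22.0f, bottom row padded
const float out11 = 6 + 7;                          // 13.0f, only the last pixel left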

View File

@ -0,0 +1,294 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.h"
#include <string>
#include <utility>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::string GenerateConvolutionConstantCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const int2& kernel_size, const int2& dilation, int src_channels,
int dst_channels,
const std::vector<ElementwiseOperation*>& linked_operations) {
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
std::string c = GetCommonDefines(precision);
const int out_z = IntegralDivideRoundUp(dst_channels, 4);
const std::string kOutZ = std::to_string(out_z);
const int src_depth = IntegralDivideRoundUp(src_channels, 4);
switch (precision) {
case CalculationsPrecision::F32:
case CalculationsPrecision::F16:
c += "#define CONV4(R, SRC, F, i) \\\n";
c += " R += SRC.x * F[i + 0]; \\\n";
c += " R += SRC.y * F[i + 1]; \\\n";
c += " R += SRC.z * F[i + 2]; \\\n";
c += " R += SRC.w * F[i + 3]; \n";
c += "#define CONV3(R, SRC, F, i) \\\n";
c += " R += SRC.x * F[i + 0]; \\\n";
c += " R += SRC.y * F[i + 1]; \\\n";
c += " R += SRC.z * F[i + 2]; \n";
c += "#define CONV2(R, SRC, F, i) \\\n";
c += " R += SRC.x * F[i + 0]; \\\n";
c += " R += SRC.y * F[i + 1]; \n";
c += "#define CONV1(R, SRC, F, i) \\\n";
c += " R += SRC * F[i + 0]; \n";
break;
case CalculationsPrecision::F32_F16:
c += "#define CONV4(R, SRC, F, i) \\\n";
c += " R += convert_float4(SRC.x * F[i + 0] + SRC.y * F[i + 1]";
c += " + SRC.z * F[i + 2] + SRC.w * F[i + 3]);\n";
c += "#define CONV3(R, SRC, F, i) \\\n";
c += " R += convert_float4(SRC.x * F[i + 0] + SRC.y * F[i + 1]";
c += " + SRC.z * F[i + 2]);\n";
c += "#define CONV2(R, SRC, F, i) \\\n";
c += " R += convert_float4(SRC.x * F[i + 0] + SRC.y * F[i + 1]);\n";
c += "#define CONV1(R, SRC, F, i) \\\n";
c += " R += convert_float4(SRC * F[i + 0]);\n";
break;
}
const std::string postfixes[] = {".x", ".xy", ".xyz", ""};
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
c += " __constant FLT4* filters, \n";
c += " __constant FLT4* biases";
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int2 stride, \n";
c += " int2 padding, \n";
c += " int4 src_size, \n";
c += " int4 dst_size \n";
c += ") {\n";
c += " int X = get_global_id(0);\n";
c += " int Y = get_global_id(1);\n";
c += " if (X >= dst_size.x || Y >= dst_size.y) return;\n";
c += " int start_x = X * stride.x - padding.x;\n";
c += " int start_y = Y * stride.y - padding.y;\n";
c += " ACCUM_FLT4 r[" + kOutZ + "];\n";
c += " for (int i = 0; i < " + kOutZ + "; ++i) {\n";
c += " r[i] = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
c += " }\n";
int filters_counter = 0;
for (int s = 0; s < src_depth; ++s) {
const int ch_count = std::min(4, src_channels - s * 4);
const std::string s_conv = "CONV" + std::to_string(ch_count);
const std::string s_count = ch_count == 1 ? "" : std::to_string(ch_count);
const std::string s_type = absl::StrCat("FLT", s_count);
const std::string s_postfix = postfixes[ch_count - 1];
for (int ky = 0; ky < kernel_size.y; ++ky) {
std::string s_y = absl::StrCat("(start_y + ", ky * dilation.y, ")");
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " {\n";
c += " bool y_out = " + s_y + " < 0 || " + s_y + " >= src_size.y;\n";
}
for (int kx = 0; kx < kernel_size.x; ++kx) {
c += " {\n";
std::string s_x = absl::StrCat("(start_x + ", kx * dilation.x, ")");
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " bool x_out = " + s_x + "< 0 || " + s_x + ">= src_size.x;\n";
c += " " + s_type + " src = x_out || y_out ?";
c += "(" + s_type + ")(0.0) : ";
c += src_tensor.Read3D(s_x, s_y, std::to_string(s)) + s_postfix +
";\n";
} else {
c += " " + s_type +
" src = " + src_tensor.Read3D(s_x, s_y, std::to_string(s)) +
s_postfix + ";\n";
}
for (int d = 0; d < out_z; ++d) {
c += " " + s_conv + "(r[" + std::to_string(d) + "], src, filters,";
c += " " + std::to_string(filters_counter) + ");\n";
filters_counter += ch_count;
}
c += " }\n";
}
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " }\n";
}
}
}
for (int i = 0; i < out_z; ++i) {
std::string s_i = std::to_string(i);
c += " {\n";
c += " FLT4 res = TO_FLT4(r[" + s_i + "]) + biases[" + s_i + "];\n";
c += " " + dst_tensor.GetAddress("dst_adr", "X", "Y", s_i) + "\n";
c += PostProcess(linked_operations, "res", s_i, "dst_adr");
c += " " + dst_tensor.Write3D("res", "dst_adr");
c += " }\n";
}
c += "}\n";
return c;
}
// Adreno can provide up to ~3-4KB of constant memory, but in some cases using
// even 3KB can hurt performance badly.
int GetAdrenoOptimalMaxConstantSize(int gpu_version) {
if (gpu_version < 600) {
return 256 * 10; // 2.5KB
} else {
return 256 * 14; // 3.5KB
}
}
int GetOptimalMaxConstantSize(const DeviceInfo& info) {
if (info.vendor != Vendor::QUALCOMM) {
// In general we do not expect this kernel to be used on non-Adreno GPUs,
// since it is tuned for Adreno's dedicated constant memory.
return 256 * 16; // 4KB
} else {
return GetAdrenoOptimalMaxConstantSize(info.adreno_info.gpu_version);
}
}
} // namespace
ConvConstants::ConvConstants(ConvConstants&& kernel)
: GPUOperation(std::move(kernel)),
weights_(std::move(kernel.weights_)),
biases_(std::move(kernel.biases_)),
kernel_size_(kernel.kernel_size_),
stride_(kernel.stride_),
padding_(kernel.padding_),
dilation_(kernel.dilation_),
src_channels_(kernel.src_channels_),
dst_channels_(kernel.dst_channels_),
kernel_(std::move(kernel.kernel_)),
work_group_size_(kernel.work_group_size_) {}
ConvConstants& ConvConstants::operator=(ConvConstants&& kernel) {
if (this != &kernel) {
weights_ = std::move(kernel.weights_);
biases_ = std::move(kernel.biases_);
std::swap(kernel_size_, kernel.kernel_size_);
std::swap(stride_, kernel.stride_);
std::swap(padding_, kernel.padding_);
std::swap(dilation_, kernel.dilation_);
std::swap(src_channels_, kernel.src_channels_);
std::swap(dst_channels_, kernel.dst_channels_);
kernel_ = std::move(kernel.kernel_);
std::swap(work_group_size_, kernel.work_group_size_);
GPUOperation::operator=(std::move(kernel));
}
return *this;
}
Status ConvConstants::Compile(const CreationContext& creation_context) {
const auto code = GenerateConvolutionConstantCode(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, kernel_size_, dilation_, src_channels_,
dst_channels_, linked_operations_);
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsAdreno3xx()) {
options.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_);
}
Status ConvConstants::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(padding_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
return OkStatus();
}
int3 ConvConstants::GetGridSize() const {
const int grid_x = dst_[0]->Width();
const int grid_y = dst_[0]->Height();
return int3(grid_x, grid_y, 1);
}
Status ConvConstants::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
Status ConvConstants::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
bool IsConvConstantsSupported(const CLDevice& device,
const OperationDef& definition,
const Convolution2DAttributes& attr) {
if (!device.IsAdreno()) {
return false;
}
const auto& w_shape = attr.weights.shape;
const int dst_channels = AlignByN(w_shape.o, 4);
const int filters_count = w_shape.i * dst_channels * w_shape.h * w_shape.w;
const int float_size = definition.precision == CalculationsPrecision::F32
? sizeof(float)
: sizeof(half);
const int filters_buffer_size = filters_count * float_size;
const int kConstantMaxSize = GetOptimalMaxConstantSize(device.GetInfo());
const int flt4_registers = IntegralDivideRoundUp(w_shape.o, 4);
return filters_buffer_size <= kConstantMaxSize && flt4_registers <= 8;
}
Status CreateConvConstants(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvConstants* result) {
if (!IsConvConstantsSupported(*creation_context.device, definition, attr)) {
return InvalidArgumentError("ConvConstants is not supported");
}
*result = ConvConstants(definition, attr);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
LinearStorageCreateInfo create_info;
create_info.storage_type = LinearStorageType::BUFFER;
create_info.data_type = definition.GetDataType();
create_info.aligned_size = attr.weights.shape.o;
RETURN_IF_ERROR(CreateLinearStorage(
create_info, attr.bias, creation_context.context, &result->biases_));
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite
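// Worked example (illustrative, not part of this commit) of the admission
// check in IsConvConstantsSupported for hypothetical OHWI(8, 3, 3, 4) weights
// at F16 precision on an Adreno 5xx (constant budget 256 * 10 = 2560 bytes):
const int dst_channels = AlignByN(8, 4);                          // 8
const int filters_count = 4 * dst_channels * 3 * 3;               // 288 weights
const size_t filters_buffer_size = filters_count * sizeof(half);  // 576 bytes <= 2560
const int flt4_registers = IntegralDivideRoundUp(8, 4);           // 2 <= 8 -> supported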

View File

@ -0,0 +1,169 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_CONSTANTS_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_CONSTANTS_H_
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class ConvConstants : public GPUOperation {
public:
ConvConstants() = default;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
ConvConstants(ConvConstants&& kernel);
ConvConstants& operator=(ConvConstants&& kernel);
ConvConstants(const ConvConstants&) = delete;
ConvConstants& operator=(const ConvConstants&) = delete;
private:
friend Status CreateConvConstants(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvConstants* result);
explicit ConvConstants(const OperationDef& definition,
const Convolution2DAttributes& attr)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
stride_(attr.strides.w, attr.strides.h),
padding_(attr.padding.prepended.w, attr.padding.prepended.h),
dilation_(attr.dilations.w, attr.dilations.h),
src_channels_(attr.weights.shape.i),
dst_channels_(attr.weights.shape.o) {}
template <DataType T>
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
template <DataType S, typename T>
void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst);
Status BindArguments();
int3 GetGridSize() const;
Buffer weights_;
LinearStorage biases_;
int2 kernel_size_;
int2 stride_;
int2 padding_;
int2 dilation_;
int src_channels_;
int dst_channels_;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
template <DataType T>
Status ConvConstants::UploadWeights(
const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
const int float_size =
definition_.precision == CalculationsPrecision::F32 ? 4 : 2;
const int float_count = src_channels_ * dst_depth * 4 * kernel_x * kernel_y;
if (definition_.GetDataType() == DataType::FLOAT32) {
std::vector<float4> gpu_data(float_count / 4);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
return CreateReadOnlyBuffer(float_size * float_count, gpu_data.data(),
context, &weights_);
} else {
std::vector<half4> gpu_data(float_count / 4);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
return CreateReadOnlyBuffer(float_size * float_count, gpu_data.data(),
context, &weights_);
}
}
template <DataType S, typename T>
void ConvConstants::RearrangeWeightsData(
const ::tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
int counter = 0;
for (int s = 0; s < src_depth; ++s) {
for (int y = 0; y < kernel_y; ++y) {
for (int x = 0; x < kernel_x; ++x) {
for (int d = 0; d < dst_depth; ++d) {
const int channels_count = std::min(4, src_channels_ - s * 4);
T filters[4];
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < channels_count; ++j) {
const int s_ch = s * 4 + j;
const int d_ch = d * 4 + i;
if (s_ch < weights.shape.i && d_ch < weights.shape.o) {
const int f_index =
weights.shape.LinearIndex({d_ch, y, x, s_ch});
filters[i][j] = weights.data[f_index];
} else {
filters[i][j] = 0.0f;
}
}
}
T filters_new[4];
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
filters_new[i][j] = filters[j][i];
}
}
for (int i = 0; i < channels_count; ++i) {
dst[counter++] = filters_new[i];
}
}
}
}
}
}
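// Layout note (informal, not part of this commit): after this rearrangement,
// each (src slice s, ky, kx, dst slice d) position stores one FLT4 per active
// input channel of slice s, whose four components are the weights to the four
// output channels of slice d. For example, with 6 input channels, at most 4
// output channels (dst_depth = 1) and a 1x1 kernel, entries 0..3 hold input
// channels 0..3 and entries 4..5 hold input channels 4..5. This matches the
// order in which the generated CONV4/CONV3/CONV2/CONV1 macros walk `filters`
// via `filters_counter`.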
bool IsConvConstantsSupported(const CLDevice& device,
const OperationDef& definition,
const Convolution2DAttributes& attr);
Status CreateConvConstants(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvConstants* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_CONSTANTS_H_

View File

@ -0,0 +1,109 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, ConvConstantsSimpleWeights) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Convolution2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(1, 1);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(1, 2, 2, 2);
attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
attr.bias.shape = Linear(1);
attr.bias.data = {0.0f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConvConstants operation;
ASSERT_OK(
CreateConvConstants(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {28.0f, 18.0f, 22.0f, 13.0f}));
}
}
}
TEST_F(OpenCLOperationTest, ConvConstants) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Convolution2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(1, 1);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(2, 2, 2, 2);
attr.weights.data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.5f, -0.5f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConvConstants operation;
ASSERT_OK(
CreateConvConstants(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {168.5f, 391.5f, 80.5f, 223.5f,
60.5f, 235.5f, 20.5f, 123.5f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,312 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h"
#include <string>
#include <utility>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
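// The generated kernel computes a 2x2 spatial block for two consecutive
// destination slices per work item, i.e. eight accumulators r0..r7. Filter
// data is split across four textures (filters0..filters3), one per
// input-channel lane, addressed by (dst_slice, tap_index * src_depth + s),
// so CONV1/CONV2 accumulate a full FLT4 of output channels per chain of
// multiply-adds.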
std::string GenerateConvCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
bool is1x1, bool adreno4xx_optimization,
const std::vector<ElementwiseOperation*>& linked_operations) {
std::string c = GetCommonDefines(precision);
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
switch (precision) {
case CalculationsPrecision::F32:
case CalculationsPrecision::F16:
c += "#define CONV1(R, S) \\\n";
c += "R += S.x * f0; \\\n";
c += "R += S.y * f1; \\\n";
c += "R += S.z * f2; \\\n";
c += "R += S.w * f3; \n";
c += "#define CONV2(R, S) \\\n";
c += "R += S.x * f4; \\\n";
c += "R += S.y * f5; \\\n";
c += "R += S.z * f6; \\\n";
c += "R += S.w * f7; \n";
break;
case CalculationsPrecision::F32_F16:
c += "#define CONV1(R, S) \\\n";
c += "R += convert_float4(S.x * f0 + S.y * f1 + S.z * f2 + S.w * f3);\n";
c += "#define CONV2(R, S) \\\n";
c += "R += convert_float4(S.x * f4 + S.y * f5 + S.z * f6 + S.w * f7);\n";
break;
}
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
c += " __read_only image2d_t filters0, \n";
c += " __read_only image2d_t filters1, \n";
c += " __read_only image2d_t filters2, \n";
c += " __read_only image2d_t filters3, \n";
c += " __read_only image2d_t biases";
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 src_size, \n";
c += " int4 dst_size, \n";
if (!is1x1) {
c += " int2 kernel_size, \n";
c += " int2 dillation, \n";
}
c += " int2 stride, \n";
c += " int2 padding \n";
c += ") {\n";
c += " int X = get_global_id(0) * 2;\n";
c += " int Y = get_global_id(1) * 2;\n";
c += " int Z = get_global_id(2) * 2;\n";
c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.w) return;\n";
c += " int xc0 = X * stride.x + padding.x;\n";
c += " int xc1 = (X + 1) * stride.x + padding.x;\n";
c += " int yc0 = Y * stride.y + padding.y;\n";
c += " int yc1 = (Y + 1) * stride.y + padding.y;\n";
for (int i = 0; i < 8; ++i) {
c += " ACCUM_FLT4 r" + std::to_string(i) +
" = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
}
std::string f_y = is1x1 ? "s" : "filter_offset";
std::string s_x0 = is1x1 ? "xc0" : "c0.x";
std::string s_x1 = is1x1 ? "xc1" : "c1.x";
std::string s_y0 = is1x1 ? "yc0" : "c0.y";
std::string s_y1 = is1x1 ? "yc1" : "c1.y";
if (!is1x1) {
c += " int2 c0;\n";
c += " int2 c1;\n";
c += " int filter_offset = 0;\n";
c += " for (int y = 0; y < kernel_size.y; ++y) {\n";
c += " c0.y = y * dillation.y + yc0;\n";
c += " c1.y = y * dillation.y + yc1;\n";
c += " for (int x = 0; x < kernel_size.x; ++x) {\n";
c += " c0.x = x * dillation.x + xc0;\n";
c += " c1.x = x * dillation.x + xc1;\n";
}
c += " for (int s = 0; s < src_size.w; ++s) {\n";
std::string fc0 = "(int2)(Z, " + f_y + ")";
std::string fc1 = "(int2)(Z + 1, " + f_y + ")";
c += " FLT4 f0 = READ_IMAGE(filters0, smp_none, " + fc0 + ");\n";
c += " FLT4 f1 = READ_IMAGE(filters1, smp_none, " + fc0 + ");\n";
c += " FLT4 f2 = READ_IMAGE(filters2, smp_none, " + fc0 + ");\n";
c += " FLT4 f3 = READ_IMAGE(filters3, smp_none, " + fc0 + ");\n";
c += " FLT4 f4 = READ_IMAGE(filters0, smp_none, " + fc1 + ");\n";
c += " FLT4 f5 = READ_IMAGE(filters1, smp_none, " + fc1 + ");\n";
c += " FLT4 f6 = READ_IMAGE(filters2, smp_none, " + fc1 + ");\n";
c += " FLT4 f7 = READ_IMAGE(filters3, smp_none, " + fc1 + ");\n";
c += " FLT4 src0 =" + src_tensor.Read3D(s_x0, s_y0, "s") + ";\n";
c += " FLT4 src1 =" + src_tensor.Read3D(s_x1, s_y0, "s") + ";\n";
c += " FLT4 src2 =" + src_tensor.Read3D(s_x0, s_y1, "s") + ";\n";
c += " FLT4 src3 =" + src_tensor.Read3D(s_x1, s_y1, "s") + ";\n";
for (int i = 0; i < 4; ++i) {
c += " CONV1(r" + std::to_string(i) + ", src" + std::to_string(i) +
");\n";
}
for (int i = 0; i < 4; ++i) {
c += " CONV2(r" + std::to_string(i + 4) + ", src" + std::to_string(i) +
");\n";
}
if (!is1x1) {
c += " filter_offset++;\n";
}
c += " }\n"; // src_size.w
if (!is1x1) {
c += " }\n"; // kernel_size.x
c += " }\n"; // kernel_size.y
}
// when is1x1 && adreno4xx_optimization is true, xc0 == X and yc0 == Y
std::string dst_x = is1x1 && adreno4xx_optimization ? "xc0" : "X";
std::string dst_y = is1x1 && adreno4xx_optimization ? "yc0" : "Y";
c += " if (Z < dst_size.w) {\n";
c += " FLT4 bias_val = READ_IMAGE(biases, smp_none, (int2)(Z, 0));\n";
for (int i = 0; i < 4; ++i) {
c += " {\n";
c += " int xc = " + dst_x + " + " + std::to_string(i % 2) + ";\n";
c += " int yc = " + dst_y + " + " + std::to_string(i / 2) + ";\n";
c += " if (xc < dst_size.x && yc < dst_size.y) {\n";
c += " FLT4 res = TO_FLT4(r" + std::to_string(i) + ") + bias_val;\n";
c += " " + dst_tensor.GetAddress("address", "xc", "yc", "Z") + "\n";
c += PostProcess(linked_operations, "res", "Z", "address");
c += " " + dst_tensor.Write3D("res", "address") + "\n";
c += " }\n";
c += " }\n";
}
c += " }\n";
c += " Z++;\n";
c += " if (Z < dst_size.w) {\n";
c += " FLT4 bias_val = READ_IMAGE(biases, smp_none, (int2)(Z, 0));\n";
for (int i = 0; i < 4; ++i) {
c += " {\n";
c += " int xc = " + dst_x + " + " + std::to_string(i % 2) + ";\n";
c += " int yc = " + dst_y + " + " + std::to_string(i / 2) + ";\n";
c += " if (xc < dst_size.x && yc < dst_size.y) {\n";
c += " FLT4 res = TO_FLT4(r" + std::to_string(i + 4) + ") + bias_val;\n";
c += " " + dst_tensor.GetAddress("address", "xc", "yc", "Z") + "\n";
c += PostProcess(linked_operations, "res", "Z", "address");
c += " " + dst_tensor.Write3D("res", "address") + "\n";
c += " }\n";
c += " }\n";
}
c += " }\n";
c += "}\n";
return c;
}
bool UseFP16SIMD(const CLDevice& device, CalculationsPrecision precision,
bool kernel1x1) {
if (!device.IsAdreno()) {
return false;
}
switch (precision) {
case CalculationsPrecision::F32:
case CalculationsPrecision::F32_F16:
return false;
case CalculationsPrecision::F16:
return device.IsAdreno3xx() && kernel1x1;
}
return false;
}
} // namespace
ConvTexture::ConvTexture(const OperationDef& definition,
const Convolution2DAttributes& attr)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
stride_(attr.strides.w, attr.strides.h),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h),
dilation_(attr.dilations.w, attr.dilations.h),
work_group_size_(4, 4, 2) {}
ConvTexture::ConvTexture(ConvTexture&& operation)
: GPUOperation(std::move(operation)),
weights_0_(std::move(operation.weights_0_)),
weights_1_(std::move(operation.weights_1_)),
weights_2_(std::move(operation.weights_2_)),
weights_3_(std::move(operation.weights_3_)),
biases_(std::move(operation.biases_)),
kernel_size_(operation.kernel_size_),
stride_(operation.stride_),
padding_(operation.padding_),
dilation_(operation.dilation_),
kernel_(std::move(operation.kernel_)),
work_group_size_(operation.work_group_size_) {}
ConvTexture& ConvTexture::operator=(ConvTexture&& operation) {
if (this != &operation) {
weights_0_ = std::move(operation.weights_0_);
weights_1_ = std::move(operation.weights_1_);
weights_2_ = std::move(operation.weights_2_);
weights_3_ = std::move(operation.weights_3_);
biases_ = std::move(operation.biases_);
std::swap(kernel_size_, operation.kernel_size_);
std::swap(stride_, operation.stride_);
std::swap(padding_, operation.padding_);
std::swap(dilation_, operation.dilation_);
kernel_ = std::move(operation.kernel_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}
Status ConvTexture::Compile(const CreationContext& creation_context) {
auto storage_type = definition_.GetPrimaryStorageType();
bool is1x1 = kernel_size_.x == 1 && kernel_size_.y == 1;
bool adreno4xx_optimization =
stride_.x == 1 && stride_.y == 1 && padding_.x == 0 && padding_.y == 0 &&
creation_context.device->IsAdreno4xx() &&
storage_type == TensorStorageType::TEXTURE_ARRAY &&
definition_.precision == CalculationsPrecision::F16;
std::string code = GenerateConvCode(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, is1x1, adreno4xx_optimization, linked_operations_);
std::vector<CompilerOptions> options;
if (UseFP16SIMD(*creation_context.device, definition_.precision, is1x1)) {
options.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_);
}
Status ConvTexture::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_0_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_1_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_2_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_3_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
if (!(kernel_size_.x == 1 && kernel_size_.y == 1)) {
RETURN_IF_ERROR(kernel_.SetBytesAuto(kernel_size_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dilation_));
}
RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(padding_));
return OkStatus();
}
int3 ConvTexture::GetGridSize() const {
const int grid_x = IntegralDivideRoundUp(dst_[0]->Width(), 2);
const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), 2);
const int grid_z = IntegralDivideRoundUp(dst_[0]->Depth(), 2);
return int3(grid_x, grid_y, grid_z);
}
Status ConvTexture::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroupConv(params, kernel_, GetGridSize(),
&work_group_size_);
}
Status ConvTexture::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvTexture* result) {
*result = ConvTexture(definition, attr);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
LinearStorageCreateInfo create_info;
create_info.storage_type = LinearStorageType::TEXTURE_2D;
create_info.data_type = definition.GetDataType();
create_info.aligned_size = attr.weights.shape.o;
RETURN_IF_ERROR(CreateLinearStorage(
create_info, attr.bias, creation_context.context, &result->biases_));
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite
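Because each work item covers a 2x2 spatial block over two destination slices, GetGridSize above divides every output dimension by two with rounding up. A minimal standalone sketch of the same arithmetic, assuming IntegralDivideRoundUp is plain ceiling division and using hypothetical tensor sizes:

#include <cstdio>

// Ceiling division, matching the semantics assumed for IntegralDivideRoundUp.
int DivideRoundUp(int n, int d) { return (n + d - 1) / d; }

int main() {
  // Hypothetical output tensor: 65x33 spatial, 20 channels -> 5 depth slices.
  const int width = 65, height = 33, channels = 20;
  const int depth = DivideRoundUp(channels, 4);
  const int grid_x = DivideRoundUp(width, 2);   // 33
  const int grid_y = DivideRoundUp(height, 2);  // 17
  const int grid_z = DivideRoundUp(depth, 2);   // 3
  std::printf("grid = %d x %d x %d\n", grid_x, grid_y, grid_z);
  return 0;
}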

View File

@ -0,0 +1,193 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_TEXTURE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_TEXTURE_H_
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/texture2d.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
// This convolution processes a 2x2x2 (XxYxZ) block of FLT4 values per thread.
class ConvTexture : public GPUOperation {
public:
ConvTexture() = default;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
ConvTexture(ConvTexture&& operation);
ConvTexture& operator=(ConvTexture&& operation);
ConvTexture(const ConvTexture&) = delete;
ConvTexture& operator=(const ConvTexture&) = delete;
private:
friend Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvTexture* result);
ConvTexture(const OperationDef& definition,
const Convolution2DAttributes& attr);
template <DataType T>
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
template <DataType S, typename T>
void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst_0, absl::Span<T> dst_1,
absl::Span<T> dst_2, absl::Span<T> dst_3);
Status BindArguments();
int3 GetGridSize() const;
Texture2D weights_0_;
Texture2D weights_1_;
Texture2D weights_2_;
Texture2D weights_3_;
LinearStorage biases_;
int2 kernel_size_;
int2 stride_;
int2 padding_;
int2 dilation_;
CLKernel kernel_;
int3 work_group_size_;
};
template <DataType T>
Status ConvTexture::UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context) {
const int dst_depth = AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), 2);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
int texture_width = dst_depth;
int texture_height = src_depth * kernel_size_.x * kernel_size_.y;
DataType data_type = definition_.GetDataType();
const int elements_count = texture_width * texture_height;
if (data_type == DataType::FLOAT32) {
std::vector<float4> gpu_data_0(elements_count);
std::vector<float4> gpu_data_1(elements_count);
std::vector<float4> gpu_data_2(elements_count);
std::vector<float4> gpu_data_3(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data_0),
absl::MakeSpan(gpu_data_1), absl::MakeSpan(gpu_data_2),
absl::MakeSpan(gpu_data_3));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_0.data(),
context, &weights_0_));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_1.data(),
context, &weights_1_));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_2.data(),
context, &weights_2_));
return CreateTexture2DRGBA(data_type, texture_width, texture_height,
gpu_data_3.data(), context, &weights_3_);
} else {
std::vector<half4> gpu_data_0(elements_count);
std::vector<half4> gpu_data_1(elements_count);
std::vector<half4> gpu_data_2(elements_count);
std::vector<half4> gpu_data_3(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data_0),
absl::MakeSpan(gpu_data_1), absl::MakeSpan(gpu_data_2),
absl::MakeSpan(gpu_data_3));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_0.data(),
context, &weights_0_));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_1.data(),
context, &weights_1_));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_2.data(),
context, &weights_2_));
return CreateTexture2DRGBA(data_type, texture_width, texture_height,
gpu_data_3.data(), context, &weights_3_);
}
}
template <DataType S, typename T>
void ConvTexture::RearrangeWeightsData(
const ::tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst_0,
absl::Span<T> dst_1, absl::Span<T> dst_2, absl::Span<T> dst_3) {
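// Layout: dst_k holds, for input-channel lane k, the FLT4 of four output
// channels of every filter tap. Texture x = destination slice (d * 2 + sub_d),
// texture y = (ky * kernel_x + kx) * src_depth + s, which matches how the
// generated kernel reads filters0..filters3.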
const int dst_depth = AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), 2);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
int texture_width = dst_depth;
for (int d = 0; d < dst_depth / 2; ++d) {
for (int y = 0; y < kernel_size_.y; ++y) {
for (int x = 0; x < kernel_size_.x; ++x) {
for (int s = 0; s < src_depth; ++s) {
for (int sub_d = 0; sub_d < 2; ++sub_d) {
T filters[4];
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
const int s_ch = s * 4 + j;
const int d_ch = (d * 2 + sub_d) * 4 + i;
if (s_ch < weights.shape.i && d_ch < weights.shape.o) {
const int f_index =
weights.shape.LinearIndex({d_ch, y, x, s_ch});
filters[j][i] = weights.data[f_index];
} else {
filters[j][i] = 0.0f;
}
}
}
int x_coord = d * 2 + sub_d;
int y_coord = (y * kernel_size_.x + x) * src_depth + s;
int offset = y_coord * texture_width + x_coord;
dst_0[offset] = filters[0];
dst_1[offset] = filters[1];
dst_2[offset] = filters[2];
dst_3[offset] = filters[3];
}
}
}
}
}
}
Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvTexture* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_TEXTURE_H_

View File

@ -0,0 +1,107 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, ConvTextureSimpleWeights) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Convolution2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(1, 1);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(1, 2, 2, 2);
attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
attr.bias.shape = Linear(1);
attr.bias.data = {0.0f};
for (auto storage : env_.GetSupportedTextureStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConvTexture operation;
ASSERT_OK(CreateConvTexture(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {28.0f, 18.0f, 22.0f, 13.0f}));
}
}
}
TEST_F(OpenCLOperationTest, ConvTexture) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Convolution2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(1, 1);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(2, 2, 2, 2);
attr.weights.data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.5f, -0.5f};
for (auto storage : env_.GetSupportedTextureStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConvTexture operation;
ASSERT_OK(CreateConvTexture(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {168.5f, 391.5f, 80.5f, 223.5f,
60.5f, 235.5f, 20.5f, 123.5f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,471 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/converter.h"
#include <algorithm>
#include <array>
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_errors.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type_util.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
class OpenClConverterImpl : public TensorObjectConverter {
public:
virtual Status Init(const TensorObjectDef& input_def,
const TensorObjectDef& output_def,
Environment* environment) = 0;
protected:
Status DispatchKernel(cl_mem input, cl_mem output) {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(input));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(output));
int3 grid = int3(dims_.w, dims_.h, dims_.d());
int4 size = int4(dims_.w, dims_.h, dims_.c, dims_.d());
RETURN_IF_ERROR(kernel_.SetBytesAuto(size));
return queue_->DispatchImplicit(kernel_, grid, {16, 8, 1});
}
Dimensions dims_;
CLKernel kernel_;
CLCommandQueue* queue_ = nullptr;
};
bool IsSupportedDataType(DataType type) {
return type == DataType::FLOAT16 || type == DataType::FLOAT32;
}
// Implements conversion from OpenCL-specific tensor layout to BHWC.
class FromTensorConverter : public OpenClConverterImpl {
public:
static bool IsSupported(const ObjectDef& input, const ObjectDef& output) {
return IsSupportedDataType(input.data_type) &&
IsSupportedDataType(output.data_type) &&
// Output is always Buffer/BHWC
output.object_type == ObjectType::OPENCL_BUFFER &&
(output.data_layout == DataLayout::BHWC ||
output.data_layout == DataLayout::DHWC4) &&
// Texture2D/HDWC4 ->
((input.object_type == ObjectType::OPENCL_TEXTURE &&
input.data_layout == DataLayout::HDWC4) ||
// SingleTextureArray/BHWC ->
(input.object_type == ObjectType::OPENCL_TEXTURE &&
input.data_layout == DataLayout::BHWC) ||
// TextureArray/DHWC4 ->
(input.object_type == ObjectType::OPENCL_TEXTURE &&
input.data_layout == DataLayout::DHWC4) ||
// Buffer/DHWC4 ->
(input.object_type == ObjectType::OPENCL_BUFFER &&
input.data_layout == DataLayout::DHWC4));
}
std::pair<std::string, std::string> GetToDhwc4Kernel(
const TensorObjectDef& input_def,
const TensorObjectDef& output_def) const {
return std::make_pair(
"__global " + GetDataType4(output_def.object_def.data_type) + "* dst",
"dst[(d * size.y + y) * size.x + x] = " +
(output_def.object_def.data_type == input_def.object_def.data_type
? "input;"
: "convert_" + GetDataType4(output_def.object_def.data_type) +
"(input);"));
}
std::pair<std::string, std::string> GetToBhwcKernel(
const TensorObjectDef& input_def,
const TensorObjectDef& output_def) const {
return std::make_pair(
"__global " + GetDataType(output_def.object_def.data_type) + "* dst",
R"(
int c = d * 4;
int index = (y * size.x + x) * size.z + c;
dst[index] = input.x;
if (c + 1 < size.z) {
dst[index + 1] = input.y;
}
if (c + 2 < size.z) {
dst[index + 2] = input.z;
}
if (c + 3 < size.z) {
dst[index + 3] = input.w;
})");
}
Status Init(const TensorObjectDef& input_def,
const TensorObjectDef& output_def,
Environment* environment) final {
auto params_kernel = output_def.object_def.data_layout == DataLayout::BHWC
? GetToBhwcKernel(input_def, output_def)
: GetToDhwc4Kernel(input_def, output_def);
TensorStorageType src_tensor_type = ToTensorStorageType(
input_def.object_def.object_type, input_def.object_def.data_layout);
TensorDescriptor src_descr;
src_descr.storage_type = src_tensor_type;
src_descr.data_type = input_def.object_def.data_type;
TensorCodeGenerator src_tensor("src", "size", src_descr);
std::string shader_src =
R"(
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
const sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void from_tensor()" +
GetTensorDeclaration(src_tensor_type, AccessType::READ,
input_def.object_def.data_type) +
" src, " + params_kernel.first + R"(, int4 size) {
int x = get_global_id(0);
int y = get_global_id(1);
int d = get_global_id(2);
if (x >= size.x || y >= size.y || d >= size.w) return;
)" + GetDataType4(input_def.object_def.data_type) +
" input = " + src_tensor.Read3D("x", "y", "d") + ";\n" +
params_kernel.second + "\n}";
queue_ = environment->queue();
dims_ = input_def.dimensions;
return CreateKernel(shader_src, "from_tensor", environment, &kernel_);
}
Status Convert(const TensorObject& input_obj,
const TensorObject& output_obj) override {
auto output = absl::get_if<OpenClBuffer>(&output_obj);
if (!output || !output->memobj) {
return InvalidArgumentError("Missing output in to_bhwc converter");
}
auto input_texture = absl::get_if<OpenClTexture>(&input_obj);
if (input_texture && input_texture->memobj) {
return DispatchKernel(input_texture->memobj, output->memobj);
}
auto input_buffer = absl::get_if<OpenClBuffer>(&input_obj);
if (input_buffer && input_buffer->memobj) {
return DispatchKernel(input_buffer->memobj, output->memobj);
}
return InvalidArgumentError("Missing input in to_bhwc converter");
}
};
// Implements conversion from BHWC to OpenCL-specific tensor layout.
class ToTensorConverter : public OpenClConverterImpl {
public:
static bool IsSupported(const ObjectDef& input, const ObjectDef& output) {
return IsSupportedDataType(input.data_type) &&
IsSupportedDataType(output.data_type) &&
// Input is always Buffer/BHWC
input.object_type == ObjectType::OPENCL_BUFFER &&
(input.data_layout == DataLayout::BHWC ||
input.data_layout == DataLayout::DHWC4) &&
// -> Texture2D/HDWC4
((output.object_type == ObjectType::OPENCL_TEXTURE &&
output.data_layout == DataLayout::HDWC4) ||
// -> TextureArray/DHWC4
(output.object_type == ObjectType::OPENCL_TEXTURE &&
output.data_layout == DataLayout::DHWC4) ||
// -> SingleTextureArray/BHWC
(output.object_type == ObjectType::OPENCL_TEXTURE &&
output.data_layout == DataLayout::BHWC) ||
// -> Buffer/DHWC4
(output.object_type == ObjectType::OPENCL_BUFFER &&
output.data_layout == DataLayout::DHWC4));
}
std::pair<std::string, std::string> GetFromDhwc4Kernel(
const TensorObjectDef& input_def,
const TensorObjectDef& output_def) const {
return std::make_pair(
"__global " + GetDataType4(input_def.object_def.data_type) + "* src",
output_def.object_def.data_type == input_def.object_def.data_type
? "result = src[(d * size.y + y) * size.x + x];"
: "result = convert_" +
GetDataType4(output_def.object_def.data_type) +
"(src[(d * size.y + y) * size.x + x]);");
}
std::pair<std::string, std::string> GetFromBhwcKernel(
const TensorObjectDef& input_def,
const TensorObjectDef& output_def) const {
return std::make_pair(
"__global " + GetDataType(input_def.object_def.data_type) + "* src",
R"(int c = d * 4;
int index = (y * size.x + x) * size.z + c;
result.x = src[index];
result.y = c + 1 < size.z ? src[index + 1] : 1;
result.z = c + 2 < size.z ? src[index + 2] : 2;
result.w = c + 3 < size.z ? src[index + 3] : 3;
)");
}
Status Init(const TensorObjectDef& input_def,
const TensorObjectDef& output_def,
Environment* environment) final {
auto params_kernel = input_def.object_def.data_layout == DataLayout::BHWC
? GetFromBhwcKernel(input_def, output_def)
: GetFromDhwc4Kernel(input_def, output_def);
TensorStorageType dst_tensor_type = ToTensorStorageType(
output_def.object_def.object_type, output_def.object_def.data_layout);
TensorDescriptor dst_descr;
dst_descr.storage_type = dst_tensor_type;
dst_descr.data_type = output_def.object_def.data_type;
TensorCodeGenerator dst_tensor("dst", "size", dst_descr);
std::string shader_src =
R"(
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void to_tensor()" +
params_kernel.first + ", " +
GetTensorDeclaration(dst_tensor_type, AccessType::WRITE,
output_def.object_def.data_type) +
R"( dst, int4 size) {
int x = get_global_id(0);
int y = get_global_id(1);
int d = get_global_id(2);
if (x >= size.x || y >= size.y || d >= size.w) return;
)" + GetDataType4(output_def.object_def.data_type) +
" result;\n" + params_kernel.second + "\n " +
dst_tensor.Write3D("result", "x", "y", "d") + ";\n}";
queue_ = environment->queue();
dims_ = output_def.dimensions;
return CreateKernel(shader_src, "to_tensor", environment, &kernel_);
}
Status Convert(const TensorObject& input_obj,
const TensorObject& output_obj) override {
auto input = absl::get_if<OpenClBuffer>(&input_obj);
if (!input || !input->memobj) {
return InvalidArgumentError("Missing input in from_bhwc converter");
}
auto output_texture = absl::get_if<OpenClTexture>(&output_obj);
if (output_texture && output_texture->memobj) {
return DispatchKernel(input->memobj, output_texture->memobj);
}
auto output_buffer = absl::get_if<OpenClBuffer>(&output_obj);
if (output_buffer && output_buffer->memobj) {
return DispatchKernel(input->memobj, output_buffer->memobj);
}
return InvalidArgumentError("Missing input in from_bhwc converter");
}
};
std::array<size_t, 3> CalculateTextureRegion(const TensorObjectDef& def) {
const auto& dims = def.dimensions;
std::array<size_t, 3> region = {0, 0, 1};
switch (ToTensorStorageType(def.object_def.object_type,
def.object_def.data_layout)) {
case TensorStorageType::SINGLE_TEXTURE_2D:
region[0] = static_cast<size_t>(dims.w);
region[1] = static_cast<size_t>(dims.h);
break;
case TensorStorageType::TEXTURE_2D:
region[0] = static_cast<size_t>(dims.w);
region[1] = static_cast<size_t>(dims.h * dims.d());
break;
case TensorStorageType::TEXTURE_ARRAY:
region[0] = static_cast<size_t>(dims.w);
region[1] = static_cast<size_t>(dims.h);
region[2] = static_cast<size_t>(dims.d());
break;
default:
break;
}
return region;
}
// Copies data from one object of the same type and layout to another object.
class TrivialCopier : public OpenClConverterImpl {
public:
static bool IsSupported(const ObjectDef& input, const ObjectDef& output) {
return input.data_type == output.data_type &&
input.object_type == output.object_type &&
input.data_layout == output.data_layout;
}
Status Init(const TensorObjectDef& input_def,
const TensorObjectDef& output_def,
Environment* environment) final {
dims_ = input_def.dimensions;
data_type_ = input_def.object_def.data_type;
queue_ = environment->queue();
region_ = CalculateTextureRegion(output_def);
return OkStatus();
}
Status Convert(const TensorObject& input_obj,
const TensorObject& output_obj) override {
auto texture_input = absl::get_if<OpenClTexture>(&input_obj);
auto texture_output = absl::get_if<OpenClTexture>(&output_obj);
if (texture_input && texture_output) {
return Copy(*texture_input, *texture_output);
}
auto buffer_input = absl::get_if<OpenClBuffer>(&input_obj);
auto buffer_output = absl::get_if<OpenClBuffer>(&output_obj);
if (buffer_input && buffer_output) {
return Copy(*buffer_input, *buffer_output);
}
return UnimplementedError("Unsupported conversion");
}
Status Copy(const OpenClBuffer& input, const OpenClBuffer& output) {
if (input.memobj == output.memobj) {
return OkStatus();
}
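// The buffer holds DHWC4 data: w * h * d() pixels with four channels each.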
return GetOpenCLError(clEnqueueCopyBuffer(
queue_->queue(), input.memobj, output.memobj, 0, 0,
SizeOf(data_type_) * dims_.w * dims_.h * dims_.d() * 4, 0, nullptr,
nullptr));
}
Status Copy(const OpenClTexture& input, const OpenClTexture& output) {
if (input.memobj == output.memobj) {
return OkStatus();
}
size_t origin[3] = {0, 0, 0};
return GetOpenCLError(
clEnqueueCopyImage(queue_->queue(), input.memobj, output.memobj, origin,
origin, region_.data(), 0, nullptr, nullptr));
}
private:
DataType data_type_ = DataType::UNKNOWN;
std::array<size_t, 3> region_;
};
static bool IsOpenClTextureOrBuffer(ObjectType type) {
return type == ObjectType::OPENCL_BUFFER ||
type == ObjectType::OPENCL_TEXTURE;
}
// Copies data from/to CPU into a tensor.
class CpuCopier : public OpenClConverterImpl {
public:
static bool IsSupported(const ObjectDef& input, const ObjectDef& output) {
return input.data_type == output.data_type &&
input.data_layout == output.data_layout &&
((input.object_type == ObjectType::CPU_MEMORY &&
IsOpenClTextureOrBuffer(output.object_type)) ||
(output.object_type == ObjectType::CPU_MEMORY &&
IsOpenClTextureOrBuffer(input.object_type)));
}
Status Init(const TensorObjectDef& input_def,
const TensorObjectDef& output_def,
Environment* environment) final {
region_ = CalculateTextureRegion(
input_def.object_def.object_type == ObjectType::CPU_MEMORY ? output_def
: input_def);
queue_ = environment->queue();
return OkStatus();
}
Status Convert(const TensorObject& input_obj,
const TensorObject& output_obj) override {
auto cpu_input = absl::get_if<CpuMemory>(&input_obj);
auto cpu_output = absl::get_if<CpuMemory>(&output_obj);
if (cpu_input) {
auto texture_output = absl::get_if<OpenClTexture>(&output_obj);
if (texture_output) {
return queue_->EnqueueWriteImage(
texture_output->memobj, int3(region_[0], region_[1], region_[2]),
cpu_input->data);
}
auto buffer_output = absl::get_if<OpenClBuffer>(&output_obj);
if (buffer_output) {
return queue_->EnqueueWriteBuffer(
buffer_output->memobj, cpu_input->size_bytes, cpu_input->data);
}
} else if (cpu_output) {
auto texture_input = absl::get_if<OpenClTexture>(&input_obj);
if (texture_input) {
return queue_->EnqueueReadImage(
texture_input->memobj, int3(region_[0], region_[1], region_[2]),
cpu_output->data);
}
auto buffer_input = absl::get_if<OpenClBuffer>(&input_obj);
if (buffer_input) {
return queue_->EnqueueReadBuffer(
buffer_input->memobj, cpu_output->size_bytes, cpu_output->data);
}
}
return UnimplementedError("Unsupported conversion");
}
private:
std::array<size_t, 3> region_;
};
class OpenClTensorConverterBuilder : public TensorObjectConverterBuilder {
public:
explicit OpenClTensorConverterBuilder(Environment* environment)
: environment_(environment) {}
bool IsSupported(const TensorObjectDef& input,
const TensorObjectDef& output) final {
const auto& input_def = input.object_def;
const auto& output_def = output.object_def;
return input.dimensions == output.dimensions &&
(TrivialCopier::IsSupported(input_def, output_def) ||
CpuCopier::IsSupported(input_def, output_def) ||
FromTensorConverter::IsSupported(input_def, output_def) ||
ToTensorConverter::IsSupported(input_def, output_def));
}
Status MakeConverter(
const TensorObjectDef& input, const TensorObjectDef& output,
std::unique_ptr<TensorObjectConverter>* converter) final {
std::unique_ptr<OpenClConverterImpl> impl;
const auto& input_def = input.object_def;
const auto& output_def = output.object_def;
if (TrivialCopier::IsSupported(input_def, output_def)) {
impl = absl::make_unique<TrivialCopier>();
} else if (CpuCopier::IsSupported(input_def, output_def)) {
impl = absl::make_unique<CpuCopier>();
} else if (FromTensorConverter::IsSupported(input_def, output_def)) {
impl = absl::make_unique<FromTensorConverter>();
} else if (ToTensorConverter::IsSupported(input_def, output_def)) {
impl = absl::make_unique<ToTensorConverter>();
} else {
return UnimplementedError("Unsupported conversion");
}
RETURN_IF_ERROR(impl->Init(input, output, environment_));
*converter = std::move(impl);
return OkStatus();
}
Environment* environment_;
};
} // namespace
std::unique_ptr<TensorObjectConverterBuilder> NewConverterBuilder(
Environment* environment) {
return absl::make_unique<OpenClTensorConverterBuilder>(environment);
}
} // namespace cl
} // namespace gpu
} // namespace tflite
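The generated to_tensor/from_tensor kernels map between a flat BHWC buffer and DHWC4 slices of four channels: channel c lives in slice d = c / 4, and each slice pixel sits at element (d * height + y) * width + x of the packed storage. A minimal CPU sketch of that packing, assuming batch 1 and zero fill for the channel remainder (sizes and the function name are illustrative only):

#include <cstdio>
#include <vector>

// Packs a BHWC (batch == 1) float buffer into DHWC4 slices of 4 floats each.
// Index arithmetic mirrors the generated kernels:
//   bhwc index  = (y * width + x) * channels + c
//   dhwc4 index = ((d * height + y) * width + x) * 4 + (c % 4),  d = c / 4
std::vector<float> PackBhwcToDhwc4(const std::vector<float>& src, int height,
                                   int width, int channels) {
  const int depth = (channels + 3) / 4;
  std::vector<float> dst(depth * height * width * 4, 0.0f);
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      for (int c = 0; c < channels; ++c) {
        const int d = c / 4;
        dst[((d * height + y) * width + x) * 4 + (c % 4)] =
            src[(y * width + x) * channels + c];
      }
    }
  }
  return dst;
}

int main() {
  // 1x2x2x3 example tensor with values 0..11.
  std::vector<float> bhwc(12);
  for (int i = 0; i < 12; ++i) bhwc[i] = static_cast<float>(i);
  std::vector<float> dhwc4 =
      PackBhwcToDhwc4(bhwc, /*height=*/2, /*width=*/2, /*channels=*/3);
  for (float v : dhwc4) std::printf("%.0f ", v);
  std::printf("\n");  // prints 0 1 2 0 3 4 5 0 6 7 8 0 9 10 11 0
  return 0;
}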

View File

@ -0,0 +1,49 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVERTER_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVERTER_H_
#include <memory>
#include "tensorflow/lite/delegates/gpu/cl/environment.h"
#include "tensorflow/lite/delegates/gpu/spi.h"
namespace tflite {
namespace gpu {
namespace cl {
class TensorObjectConverterBuilder {
public:
virtual ~TensorObjectConverterBuilder() = default;
virtual bool IsSupported(const TensorObjectDef& input,
const TensorObjectDef& output) = 0;
virtual Status MakeConverter(
const TensorObjectDef& input, const TensorObjectDef& output,
std::unique_ptr<TensorObjectConverter>* converter) = 0;
};
// Supports conversions between BHWC and the internal OpenCL tensor
// representations, for both F16 and F32 data (including conversion between
// the two precisions).
std::unique_ptr<TensorObjectConverterBuilder> NewConverterBuilder(
Environment* environment);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVERTER_H_

View File

@ -0,0 +1,282 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.h"
#include <string>
#include <utility>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
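// The generated kernel produces one output pixel for one destination slice
// per work item. Instead of scattering from the input, it gathers: for the
// output position it derives which kernel taps line up with the stride grid
// (via f_offset and inner_size) and accumulates the matching source pixels.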
std::string GenerateConvolutionTransposedCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const LinearStorage& biases,
const std::vector<ElementwiseOperation*>& linked_operations) {
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
std::string c = GetCommonDefines(precision);
switch (precision) {
case CalculationsPrecision::F32:
case CalculationsPrecision::F16:
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += "#define CONV(R, S) \\\n";
c += "R += S.x * f0.s0123; \\\n";
c += "R += S.y * f0.s4567; \\\n";
c += "R += S.z * f0.s89ab; \\\n";
c += "R += S.w * f0.scdef; \n";
} else {
c += "#define CONV(R, S) \\\n";
c += "R += S.x * f[0]; \\\n";
c += "R += S.y * f[1]; \\\n";
c += "R += S.z * f[2]; \\\n";
c += "R += S.w * f[3]; \n";
}
break;
case CalculationsPrecision::F32_F16:
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += "#define CONV(R, S) \\\n";
c += "R += convert_float4(S.x * f0.s0123 + S.y * f0.s4567 + S.z * "
"f0.s89ab + S.w * f0.scdef);\n";
} else {
c += "#define CONV(R, S) \\\n";
c += "R += convert_float4(S.x * f[0] + S.y * f[1]";
c += "+ S.z * f[2] + S.w * f[3]);\n";
}
break;
}
switch (precision) {
case CalculationsPrecision::F32:
c += "#define FLT16 float16\n";
break;
case CalculationsPrecision::F32_F16:
case CalculationsPrecision::F16:
c += "#define FLT16 half16\n";
break;
}
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " __global FLT16* filters, \n";
c += " __global FLT4* biases";
} else {
c += " __read_only image2d_t filters, \n";
c += " __read_only image2d_t biases";
}
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int2 kernel_size, \n";
c += " int2 stride, \n";
c += " int2 padding, \n";
c += " int2 k_offset, \n";
c += " int2 inner_size, \n";
c += " int4 src_size, \n";
c += " int4 dst_size \n";
c += ") {\n";
c += " int X = get_global_id(0);\n";
c += " int Y = get_global_id(1);\n";
c += " int Z = get_global_id(2);\n";
c += " if (X >= dst_size.x || Y >= dst_size.y) return;\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " int f_base = Z * src_size.w * kernel_size.x * kernel_size.y;\n";
}
c += " int2 offset = (int2)(X, Y) + padding - k_offset;\n";
c += " offset.x = offset.x % stride.x;\n";
c += " offset.y = offset.y % stride.y;\n";
c += " offset += stride;\n";
c += " offset.x = offset.x % stride.x;\n";
c += " offset.y = offset.y % stride.y;\n";
c += " int2 f_offset;\n";
c += " f_offset.x = offset.x == 0 ? 0 : stride.x - offset.x;\n";
c += " f_offset.y = offset.y == 0 ? 0 : stride.y - offset.y;\n";
c += " ACCUM_FLT4 r0 = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
c += " for (int ky = 0; ky < inner_size.y; ++ky) {\n";
c += " int index_y = ky * stride.y + f_offset.y;\n";
c += " bool inside_y = index_y < kernel_size.y;\n";
c += " int s_y = (Y + index_y + padding.y - k_offset.y) / stride.y;\n";
c += " index_y = kernel_size.y - 1 - index_y;\n";
c += " bool out_y = s_y < 0 || s_y >= src_size.y;\n";
c += " for (int kx = 0; kx < inner_size.x; ++kx) {\n";
c += " int index_x = kx * stride.x + f_offset.x;\n";
c += " bool inside_kernel = index_x < kernel_size.x && inside_y;\n";
c += " int s_x = (X + index_x + padding.x - k_offset.x) / stride.x;\n";
c += " index_x = kernel_size.x - 1 - index_x;\n";
c += " bool out_x = s_x < 0 || s_x >= src_size.x;\n";
c += " int kernel_index = index_y * kernel_size.x + index_x;\n";
c += " if (inside_kernel && !(out_x || out_y)) {\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " int f_offset = f_base + kernel_index * src_size.w;\n";
} else {
c += " int x_c = kernel_index * src_size.w * 4;\n";
}
c += " for (int l = 0; l < src_size.w; ++l) {\n";
c += " FLT4 src =" + src_tensor.Read3D("s_x", "s_y", "l") + ";\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " FLT16 f0 = filters[f_offset]; f_offset++;\n";
} else {
c += " FLT4 f[4];\n";
c += " f[0] = READ_IMAGE(filters, smp_none, (int2)(x_c, Z)); "
"x_c++;\n";
c += " f[1] = READ_IMAGE(filters, smp_none, (int2)(x_c, Z)); "
"x_c++;\n";
c += " f[2] = READ_IMAGE(filters, smp_none, (int2)(x_c, Z)); "
"x_c++;\n";
c += " f[3] = READ_IMAGE(filters, smp_none, (int2)(x_c, Z)); "
"x_c++;\n";
}
c += " CONV(r0, src);\n";
c += " }\n";
c += " }\n";
c += " }\n";
c += " }\n";
c += " FLT4 bias_val = " + biases.ReadLinearFLT4("Z") + ";\n";
c += " FLT4 res0 = TO_FLT4(r0) + bias_val;\n";
c += " " + dst_tensor.GetAddress("address", "X", "Y", "Z") + "\n";
c += PostProcess(linked_operations, "res0", "Z", "address");
c += " " + dst_tensor.Write3D("res0", "address") + "\n";
c += "}\n";
return c;
}
} // namespace
ConvolutionTransposed::ConvolutionTransposed(
const OperationDef& definition, const ConvolutionTransposedAttributes& attr)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
stride_(attr.stride.w, attr.stride.h),
padding_(attr.padding.prepended.w, attr.padding.prepended.h),
src_channels_(attr.weights.shape.i),
dst_channels_(attr.weights.shape.o) {
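// inner_size_ = ceil(kernel_size / stride): the number of kernel taps that
// can hit a single output pixel along each axis. kernel_offset_ is
// kernel_size - 1, used by the generated code to apply the flipped kernel as
// a gather over the input.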
const int inner_size_x = (kernel_size_.x - 1) / stride_.x + 1;
const int inner_size_y = (kernel_size_.y - 1) / stride_.y + 1;
inner_size_ = int2(inner_size_x, inner_size_y);
kernel_offset_ = int2(kernel_size_.x - 1, kernel_size_.y - 1);
}
ConvolutionTransposed::ConvolutionTransposed(ConvolutionTransposed&& kernel)
: GPUOperation(std::move(kernel)),
biases_(std::move(kernel.biases_)),
weights_tex2d_(std::move(kernel.weights_tex2d_)),
weights_buf_(std::move(kernel.weights_buf_)),
weights_(kernel.weights_),
kernel_size_(kernel.kernel_size_),
stride_(kernel.stride_),
padding_(kernel.padding_),
kernel_offset_(kernel.kernel_offset_),
inner_size_(kernel.inner_size_),
src_channels_(kernel.src_channels_),
dst_channels_(kernel.dst_channels_),
kernel_(std::move(kernel.kernel_)),
work_group_size_(kernel.work_group_size_) {}
ConvolutionTransposed& ConvolutionTransposed::operator=(
ConvolutionTransposed&& kernel) {
if (this != &kernel) {
biases_ = std::move(kernel.biases_);
weights_tex2d_ = std::move(kernel.weights_tex2d_);
weights_buf_ = std::move(kernel.weights_buf_);
std::swap(weights_, kernel.weights_);
std::swap(kernel_size_, kernel.kernel_size_);
std::swap(stride_, kernel.stride_);
std::swap(padding_, kernel.padding_);
std::swap(kernel_offset_, kernel.kernel_offset_);
std::swap(inner_size_, kernel.inner_size_);
std::swap(src_channels_, kernel.src_channels_);
std::swap(dst_channels_, kernel.dst_channels_);
kernel_ = std::move(kernel.kernel_);
std::swap(work_group_size_, kernel.work_group_size_);
GPUOperation::operator=(std::move(kernel));
}
return *this;
}
Status ConvolutionTransposed::Compile(const CreationContext& creation_context) {
const auto code = GenerateConvolutionTransposedCode(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, biases_, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status ConvolutionTransposed::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(kernel_size_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(padding_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(kernel_offset_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(inner_size_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
return OkStatus();
}
int3 ConvolutionTransposed::GetGridSize() const {
const int grid_x = dst_[0]->Width();
const int grid_y = dst_[0]->Height();
const int grid_z = dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
}
Status ConvolutionTransposed::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroupConv(params, kernel_, GetGridSize(),
&work_group_size_);
}
Status ConvolutionTransposed::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
Status CreateConvolutionTransposed(const CreationContext& creation_context,
const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposed* result) {
*result = ConvolutionTransposed(definition, attr);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
LinearStorageCreateInfo create_info;
create_info.storage_type =
DeduceLinearStorageType(definition.GetPrimaryStorageType());
create_info.data_type = definition.GetDataType();
create_info.name = "biases";
create_info.aligned_size = attr.weights.shape.o;
RETURN_IF_ERROR(CreateLinearStorage(
create_info, attr.bias, creation_context.context, &result->biases_));
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,190 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_H_
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/texture2d.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class ConvolutionTransposed : public GPUOperation {
public:
ConvolutionTransposed() = default;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
ConvolutionTransposed(ConvolutionTransposed&& kernel);
ConvolutionTransposed& operator=(ConvolutionTransposed&& kernel);
ConvolutionTransposed(const ConvolutionTransposed&) = delete;
ConvolutionTransposed& operator=(const ConvolutionTransposed&) = delete;
private:
friend Status CreateConvolutionTransposed(
const CreationContext& creation_context, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposed* result);
explicit ConvolutionTransposed(const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
template <DataType T>
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
template <DataType S, typename T>
void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst);
Status BindArguments();
int3 GetGridSize() const;
LinearStorage biases_;
Texture2D weights_tex2d_;
Buffer weights_buf_;
cl_mem weights_;
int2 kernel_size_;
int2 stride_;
int2 padding_;
int2 kernel_offset_;
int2 inner_size_;
int src_channels_;
int dst_channels_;
CLKernel kernel_;
int3 work_group_size_ = int3(16, 8, 1);
};
template <DataType T>
Status ConvolutionTransposed::UploadWeights(
const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int dst_depth = IntegralDivideRoundUp(dst_channels_, 4);
const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;
const int elements_count = kernel_x * kernel_y * src_depth * dst_depth * 4;
bool is_buffer_storage =
definition_.GetPrimaryStorageType() == TensorStorageType::BUFFER;
const int float4_size =
definition_.precision == CalculationsPrecision::F32 ? 16 : 8;
if (definition_.GetDataType() == DataType::FLOAT32) {
std::vector<float4> gpu_data(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
if (is_buffer_storage) {
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buf_));
} else {
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), src_depth * kernel_x * kernel_y * 4,
dst_depth, gpu_data.data(), context, &weights_tex2d_));
}
} else {
std::vector<half4> gpu_data(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
if (is_buffer_storage) {
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buf_));
} else {
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), src_depth * kernel_x * kernel_y * 4,
dst_depth, gpu_data.data(), context, &weights_tex2d_));
}
}
if (is_buffer_storage) {
weights_ = weights_buf_.GetMemoryPtr();
} else {
weights_ = weights_tex2d_.GetMemoryPtr();
}
return OkStatus();
}
template <DataType S, typename T>
void ConvolutionTransposed::RearrangeWeightsData(
const ::tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
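// Weights are laid out tap by tap in (dst_slice, ky, kx, src_slice) order,
// four FLT4s per tap. Each 4x4 block is stored transposed so that entry i
// holds the four output channels multiplied by input lane i, which is what
// lets the generated CONV macro compute R += S.x * f[0] + ... directly.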
const int dst_depth = IntegralDivideRoundUp(dst_channels_, 4);
const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;
int counter = 0;
for (int d = 0; d < dst_depth; ++d) {
for (int y = 0; y < kernel_y; ++y) {
for (int x = 0; x < kernel_x; ++x) {
for (int s = 0; s < src_depth; ++s) {
T filters[4];
for (int j = 0; j < 4; ++j) {
for (int i = 0; i < 4; ++i) {
const int s_ch = s * 4 + j;
const int d_ch = d * 4 + i;
if (s_ch < src_channels_ && d_ch < dst_channels_) {
const int f_index =
weights.shape.LinearIndex({d_ch, y, x, s_ch});
filters[i][j] = weights.data[f_index];
} else {
filters[i][j] = 0.0f;
}
}
}
T filters_new[4];
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
filters_new[i][j] = filters[j][i];
}
}
dst[counter++] = filters_new[0];
dst[counter++] = filters_new[1];
dst[counter++] = filters_new[2];
dst[counter++] = filters_new[3];
}
}
}
}
}
Status CreateConvolutionTransposed(const CreationContext& creation_context,
const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposed* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_H_

View File

@ -0,0 +1,254 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.h"
#include <string>
#include <utility>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::string GenerateConvolutionTransposedCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const LinearStorage& biases, int src_depth, int dst_depth,
const std::vector<ElementwiseOperation*>& linked_operations) {
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
std::string c = GetCommonDefines(precision);
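// CONV accumulates a 4x4 matrix-vector product (four filter vectors per source
// value). For F32/F16 the accumulation happens directly in ACCUM_FLT4; for
// F32_F16 the products are computed in half precision and the partial sum is
// converted to float before being added to the accumulator.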
switch (precision) {
case CalculationsPrecision::F32:
case CalculationsPrecision::F16:
c += "#define CONV(R, SRC, F, i) \\\n";
c += " R += SRC.x * F[i + 0]; \\\n";
c += " R += SRC.y * F[i + 1]; \\\n";
c += " R += SRC.z * F[i + 2]; \\\n";
c += " R += SRC.w * F[i + 3]; \n";
break;
case CalculationsPrecision::F32_F16:
c += "#define CONV(R, SRC, F, i) \\\n";
c += " R += convert_float4(SRC.x * F[i + 0] + SRC.y * F[i + 1]";
c += "+ SRC.z * F[i + 2] + SRC.w * F[i + 3]);\n";
break;
}
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
c += " __constant FLT4* filters, \n";
c += biases.GetDeclaration();
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 src_size, \n";
c += " int4 dst_size \n";
c += ") {\n";
c += " int X = get_global_id(0);\n";
c += " int Y = get_global_id(1);\n";
c += " if (X >= src_size.x || Y >= src_size.y) return;\n";
for (int d = 0; d < dst_depth; ++d) {
const std::string layer = std::to_string(d);
c += " ACCUM_FLT4 r" + layer + "[2][2];\n";
c += " r" + layer + "[0][0] = (ACCUM_FLT4)(0.0f);\n";
c += " r" + layer + "[0][1] = (ACCUM_FLT4)(0.0f);\n";
c += " r" + layer + "[1][0] = (ACCUM_FLT4)(0.0f);\n";
c += " r" + layer + "[1][1] = (ACCUM_FLT4)(0.0f);\n";
}
int filters_index = 0;
for (int s = 0; s < src_depth; ++s) {
const std::string z = std::to_string(s);
c += " {\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " bool x_in = X + 1 < src_size.x;\n";
c += " bool y_in = Y + 1 < src_size.y;\n";
c += " FLT4 src0 = " + src_tensor.Read3D("X", "Y", z) + ";\n";
c += " FLT4 src1 = (FLT4)(0.0);\n";
c += " FLT4 src2 = (FLT4)(0.0);\n";
c += " FLT4 src3 = (FLT4)(0.0);\n";
c += " if (x_in) {\n";
c += " src1 = " + src_tensor.Read3D("X + 1", "Y", z) + ";\n";
c += " }\n";
c += " if (y_in) {\n";
c += " src2 = " + src_tensor.Read3D("X", "Y + 1", z) + ";\n";
c += " }\n";
c += " if (x_in && y_in) {\n";
c += " src3 = " + src_tensor.Read3D("X + 1", "Y + 1", z) + ";\n";
c += " }\n";
} else {
c += " FLT4 src0 = " + src_tensor.Read3D("X", "Y", z) + ";\n";
c += " FLT4 src1 = " + src_tensor.Read3D("X + 1", "Y", z) + ";\n";
c += " FLT4 src2 = " + src_tensor.Read3D("X", "Y + 1", z) + ";\n";
c += " FLT4 src3 = " + src_tensor.Read3D("X + 1", "Y + 1", z) + ";\n";
}
for (int d = 0; d < dst_depth; ++d) {
const std::string layer = std::to_string(d);
const std::string f_offset = std::to_string(filters_index);
filters_index++;
c += " {\n";
c += " __constant FLT4* L0 = filters + 36 * " + f_offset + ";\n";
c += " CONV(r" + layer + "[0][0], src0, L0, 0);\n";
c += " CONV(r" + layer + "[0][1], src0, L0, 4);\n";
c += " CONV(r" + layer + "[0][1], src1, L0, 8);\n";
c += " CONV(r" + layer + "[1][0], src0, L0, 12);\n";
c += " CONV(r" + layer + "[1][0], src2, L0, 16);\n";
c += " CONV(r" + layer + "[1][1], src0, L0, 20);\n";
c += " CONV(r" + layer + "[1][1], src1, L0, 24);\n";
c += " CONV(r" + layer + "[1][1], src2, L0, 28);\n";
c += " CONV(r" + layer + "[1][1], src3, L0, 32);\n";
c += " }\n";
}
c += " }\n";
}
c += " X *= 2;\n";
c += " Y *= 2;\n";
for (int d = 0; d < dst_depth; ++d) {
const std::string layer = std::to_string(d);
c += " {\n";
c += " FLT4 bias_val = " + biases.ReadLinearFLT4(layer) + ";\n";
for (int y = 0; y < 2; ++y) {
for (int x = 0; x < 2; ++x) {
c += " {\n";
c += " FLT4 result = TO_FLT4(r" + layer + "[" + std::to_string(y) +
"][" + std::to_string(x) + "]) + bias_val;\n";
c += " " +
dst_tensor.GetAddress("address", "X + " + std::to_string(x),
"Y + " + std::to_string(y), layer) +
"\n";
c += PostProcess(linked_operations, "result", layer, "address");
c += " " + dst_tensor.Write3D("result", "address") + "\n";
c += " }\n";
}
}
c += " }\n";
}
c += "}\n";
return c;
}
} // namespace
ConvolutionTransposed3x3Thin::ConvolutionTransposed3x3Thin(
const OperationDef& definition, const ConvolutionTransposedAttributes& attr)
: GPUOperation(definition),
src_channels_(attr.weights.shape.i),
dst_channels_(attr.weights.shape.o) {}
ConvolutionTransposed3x3Thin::ConvolutionTransposed3x3Thin(
ConvolutionTransposed3x3Thin&& operation)
: GPUOperation(std::move(operation)),
weights_(std::move(operation.weights_)),
biases_(std::move(operation.biases_)),
src_channels_(operation.src_channels_),
dst_channels_(operation.dst_channels_),
kernel_(std::move(operation.kernel_)),
work_group_size_(operation.work_group_size_) {}
ConvolutionTransposed3x3Thin& ConvolutionTransposed3x3Thin::operator=(
ConvolutionTransposed3x3Thin&& operation) {
if (this != &operation) {
weights_ = std::move(operation.weights_);
biases_ = std::move(operation.biases_);
std::swap(src_channels_, operation.src_channels_);
std::swap(dst_channels_, operation.dst_channels_);
kernel_ = std::move(operation.kernel_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}
Status ConvolutionTransposed3x3Thin::Compile(
const CreationContext& creation_context) {
const auto code = GenerateConvolutionTransposedCode(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, biases_, IntegralDivideRoundUp(src_channels_, 4),
IntegralDivideRoundUp(dst_channels_, 4), linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status ConvolutionTransposed3x3Thin::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
return OkStatus();
}
int3 ConvolutionTransposed3x3Thin::GetGridSize() const {
const int grid_x = src_[0]->Width();
const int grid_y = src_[0]->Height();
const int grid_z = 1;
return int3(grid_x, grid_y, grid_z);
}
Status ConvolutionTransposed3x3Thin::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
Status ConvolutionTransposed3x3Thin::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
bool IsConvolutionTransposed3x3ThinSupported(
const CLDevice& device, const ConvolutionTransposedAttributes& attr) {
return device.IsAdreno() && attr.weights.shape.o <= 8 &&
attr.weights.shape.w == 3 && attr.weights.shape.h == 3 &&
attr.stride.w == 2 && attr.stride.h == 2 &&
attr.padding.prepended.w == 1 && attr.padding.prepended.h == 1;
}
Status CreateConvolutionTransposed3x3Thin(
const CreationContext& creation_context, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposed3x3Thin* result) {
if (!IsConvolutionTransposed3x3ThinSupported(*creation_context.device,
attr)) {
return InvalidArgumentError(
"ConvolutionTransposed3x3Thin doesn't support this attributes");
}
*result = ConvolutionTransposed3x3Thin(definition, attr);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
LinearStorageCreateInfo create_info;
create_info.storage_type =
DeduceLinearStorageType(definition.GetPrimaryStorageType());
create_info.data_type = definition.GetDataType();
create_info.name = "biases";
create_info.aligned_size = attr.weights.shape.o;
RETURN_IF_ERROR(CreateLinearStorage(
create_info, attr.bias, creation_context.context, &result->biases_));
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,162 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_3X3_THIN_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_3X3_THIN_H_
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class ConvolutionTransposed3x3Thin : public GPUOperation {
public:
ConvolutionTransposed3x3Thin() = default;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
ConvolutionTransposed3x3Thin(ConvolutionTransposed3x3Thin&& operation);
ConvolutionTransposed3x3Thin& operator=(
ConvolutionTransposed3x3Thin&& operation);
ConvolutionTransposed3x3Thin(const ConvolutionTransposed3x3Thin&) = delete;
ConvolutionTransposed3x3Thin& operator=(const ConvolutionTransposed3x3Thin&) =
delete;
private:
friend Status CreateConvolutionTransposed3x3Thin(
const CreationContext& creation_context, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposed3x3Thin* result);
explicit ConvolutionTransposed3x3Thin(
const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
template <DataType T>
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
template <DataType S, typename T>
void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst);
Status BindArguments();
int3 GetGridSize() const;
Buffer weights_;
LinearStorage biases_;
int src_channels_;
int dst_channels_;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
template <DataType T>
Status ConvolutionTransposed3x3Thin::UploadWeights(
const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
const int dst_depth = IntegralDivideRoundUp(dst_channels_, 4);
const int kernel_x = 3;  // This operation supports only 3x3 kernels.
const int kernel_y = 3;
const int flt4_count = kernel_x * kernel_y * src_depth * dst_depth * 4;
const int flt4_size = definition_.precision == CalculationsPrecision::F32
? sizeof(float4)
: sizeof(half4);
if (definition_.GetDataType() == DataType::FLOAT32) {
std::vector<float4> gpu_data(flt4_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
return CreateReadOnlyBuffer(flt4_size * flt4_count, gpu_data.data(),
context, &weights_);
} else {
std::vector<half4> gpu_data(flt4_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
return CreateReadOnlyBuffer(flt4_size * flt4_count, gpu_data.data(),
context, &weights_);
}
}
template <DataType S, typename T>
void ConvolutionTransposed3x3Thin::RearrangeWeightsData(
const ::tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
const int dst_depth = IntegralDivideRoundUp(dst_channels_, 4);
const int kernel_x = 3;
const int kernel_y = 3;
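// remap reorders the nine 3x3 taps into the order in which the generated
// kernel consumes them (one CONV block per tap in
// GenerateConvolutionTransposedCode).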
const int remap[9] = {4, 5, 3, 7, 1, 8, 6, 2, 0};
int counter = 0;
for (int s = 0; s < src_depth; ++s) {
for (int d = 0; d < dst_depth; ++d) {
for (int y = 0; y < kernel_y; ++y) {
for (int x = 0; x < kernel_x; ++x) {
const int kernel_index = remap[y * kernel_x + x];
const int kernel_index_x = kernel_index % kernel_x;
const int kernel_index_y = kernel_index / kernel_x;
T filters[4];
for (int j = 0; j < 4; ++j) {
for (int i = 0; i < 4; ++i) {
const int s_ch = s * 4 + i;
const int d_ch = d * 4 + j;
if (s_ch < src_channels_ && d_ch < dst_channels_) {
const int f_index = weights.shape.LinearIndex(
{d_ch, kernel_index_y, kernel_index_x, s_ch});
filters[i][j] = weights.data[f_index];
} else {
filters[i][j] = 0.0f;
}
}
}
dst[counter++] = filters[0];
dst[counter++] = filters[1];
dst[counter++] = filters[2];
dst[counter++] = filters[3];
}
}
}
}
}
bool IsConvolutionTransposed3x3ThinSupported(
const CLDevice& device, const ConvolutionTransposedAttributes& attr);
Status CreateConvolutionTransposed3x3Thin(
const CreationContext& creation_context, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposed3x3Thin* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_3X3_THIN_H_

View File

@ -0,0 +1,110 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, ConvolutionTransposed3x3ThinSimpleWeights) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 1);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
ConvolutionTransposedAttributes attr;
attr.padding.prepended = HW(1, 1);
attr.padding.appended = HW(1, 1);
attr.stride = HW(2, 2);
attr.weights.shape = OHWI(1, 3, 3, 1);
attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
attr.bias.shape = Linear(1);
attr.bias.data = {0.0f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConvolutionTransposed3x3Thin operation;
ASSERT_OK(CreateConvolutionTransposed3x3Thin(creation_context_, op_def,
attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps),
{0.0f, 1.0f, 1.0f, 1.0f, 2.0f, 6.0f, 4.0f, 4.0f,
2.0f, 5.0f, 3.0f, 3.0f, 2.0f, 5.0f, 3.0f, 3.0f}));
}
}
}
TEST_F(OpenCLOperationTest, ConvolutionTransposed3x3Thin) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 1);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
ConvolutionTransposedAttributes attr;
attr.padding.prepended = HW(1, 1);
attr.padding.appended = HW(1, 1);
attr.stride = HW(2, 2);
attr.weights.shape = OHWI(1, 3, 3, 1);
attr.weights.data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
attr.bias.shape = Linear(1);
attr.bias.data = {0.5f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConvolutionTransposed3x3Thin operation;
ASSERT_OK(CreateConvolutionTransposed3x3Thin(creation_context_, op_def,
attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(eps),
{0.5f, 4.5f, 5.5f, 6.5f, 4.5f, 16.5f, 14.5f, 18.5f, 10.5f,
24.5f, 15.5f, 18.5f, 16.5f, 39.5f, 24.5f, 27.5f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,114 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, ConvolutionTransposedSimpleWeights) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
ConvolutionTransposedAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.stride = HW(2, 2);
attr.weights.shape = OHWI(2, 2, 2, 2);
attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.0f, 0.0f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConvolutionTransposed operation;
ASSERT_OK(CreateConvolutionTransposed(creation_context_, op_def, attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 2), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(eps),
{1.0f, 1.0f, 1.0f, 1.0f, 5.0f, 5.0f, 5.0f, 5.0f,
1.0f, 1.0f, 1.0f, 1.0f, 5.0f, 5.0f, 5.0f, 5.0f,
9.0f, 9.0f, 9.0f, 9.0f, 13.0f, 13.0f, 13.0f, 13.0f,
9.0f, 9.0f, 9.0f, 9.0f, 13.0f, 13.0f, 13.0f, 13.0f}));
}
}
}
TEST_F(OpenCLOperationTest, ConvolutionTransposed) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
ConvolutionTransposedAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.stride = HW(2, 2);
attr.weights.shape = OHWI(1, 2, 2, 2);
attr.weights.data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
attr.bias.shape = Linear(1);
attr.bias.data = {0.5f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConvolutionTransposed operation;
ASSERT_OK(CreateConvolutionTransposed(creation_context_, op_def, attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(eps),
{2.5f, 4.5f, 8.5f, 18.5f, 6.5f, 8.5f, 28.5f, 38.5f, 14.5f,
32.5f, 20.5f, 46.5f, 50.5f, 68.5f, 72.5f, 98.5f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,247 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_thin.h"
#include <string>
#include <utility>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::string GenerateConvolutionTransposedCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
int src_depth, int dst_channels, const int2& kernel_size,
const std::vector<ElementwiseOperation*>& linked_operations) {
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
std::string c = GetCommonDefines(precision);
const std::string channel_x = dst_channels == 1 ? "" : ".x";
const std::vector<std::string> channel = {channel_x, ".y", ".z", ".w"};
const std::string type_postfix =
dst_channels == 1 ? "" : std::to_string(dst_channels);
std::string accum_type;
switch (precision) {
case CalculationsPrecision::F32:
case CalculationsPrecision::F32_F16:
accum_type = "float" + type_postfix;
break;
case CalculationsPrecision::F16:
accum_type = "half" + type_postfix;
break;
}
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
c += " __constant FLT4* filters";
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 src_size, \n";
c += " int4 dst_size, \n";
c += " FLT4 bias_value \n";
c += ") {\n";
c += " int X = get_global_id(0);\n";
c += " int Y = get_global_id(1);\n";
c += " if (X >= src_size.x || Y >= src_size.y) return;\n";
c += " " + accum_type + " r[" + std::to_string(kernel_size.y) + "][" +
std::to_string(kernel_size.x) + "];\n";
c += " {\n";
c += " FLT4 src = " + src_tensor.Read3D("X", "Y", "0") + ";\n";
int index = 0;
for (int y = 0; y < kernel_size.y; ++y) {
for (int x = 0; x < kernel_size.x; ++x) {
std::string r_s =
" r[" + std::to_string(y) + "][" + std::to_string(x) + "]";
for (int d = 0; d < dst_channels; ++d) {
c += r_s + channel[d] + " = TO_ACCUM_FLT(dot(src, filters[" +
std::to_string(index) + "]));\n";
index++;
}
}
}
c += " }\n";
for (int i = 1; i < src_depth; ++i) {
if (precision != CalculationsPrecision::F32_F16) {
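// This guard is always true (X < src_size.x is ensured by the early return
// above); it is presumably kept as a compiler hint, e.g. to limit register
// pressure.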
c += " if (X < src_size.x + " + std::to_string(i + 1) + ") {\n";
} else {
c += " {\n";
}
c += " FLT4 src = " + src_tensor.Read3D("X", "Y", std::to_string(i)) +
";\n";
for (int y = 0; y < kernel_size.y; ++y) {
for (int x = 0; x < kernel_size.x; ++x) {
std::string r_s =
" r[" + std::to_string(y) + "][" + std::to_string(x) + "]";
for (int d = 0; d < dst_channels; ++d) {
c += r_s + channel[d] + " += TO_ACCUM_FLT(dot(src, filters[" +
std::to_string(index) + "]));\n";
index++;
}
}
}
c += " }\n";
}
c += " X *= " + std::to_string(kernel_size.x) + ";\n";
c += " Y *= " + std::to_string(kernel_size.x) + ";\n";
for (int y = 0; y < kernel_size.y; ++y) {
for (int x = 0; x < kernel_size.x; ++x) {
if (precision != CalculationsPrecision::F32_F16) {
c += " if (X + " + std::to_string(x) + " < dst_size.x && ";
c += "Y + " + std::to_string(y) + " < dst_size.y) {\n";
} else {
c += " {\n";
}
c += " FLT4 result = bias_value;\n";
for (int d = 0; d < dst_channels; ++d) {
c += " result" + channel[d] + " += r[" + std::to_string(y) + "][" +
std::to_string(x) + "]" + channel[d] + ";\n";
}
c += " " +
dst_tensor.GetAddress("address", "X + " + std::to_string(x),
"Y + " + std::to_string(y), "0") +
"\n";
c += PostProcess(linked_operations, "result", "0", "address");
c += " " + dst_tensor.Write3D("result", "address") + "\n";
c += " }\n";
}
}
c += "}\n";
return c;
}
} // namespace
ConvolutionTransposedThin::ConvolutionTransposedThin(
const OperationDef& definition, const ConvolutionTransposedAttributes& attr)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
src_channels_(attr.weights.shape.i),
dst_channels_(attr.weights.shape.o) {
float4 bias_value(0.0f);
for (int i = 0; i < attr.weights.shape.o; ++i) {
bias_value[i] = attr.bias.data[i];
}
bias_value_ = FLT4(definition_.precision, bias_value);
}
ConvolutionTransposedThin::ConvolutionTransposedThin(
ConvolutionTransposedThin&& operation)
: GPUOperation(std::move(operation)),
weights_buf_(std::move(operation.weights_buf_)),
bias_value_(std::move(operation.bias_value_)),
kernel_size_(operation.kernel_size_),
src_channels_(operation.src_channels_),
dst_channels_(operation.dst_channels_),
kernel_(std::move(operation.kernel_)),
work_group_size_(operation.work_group_size_) {}
ConvolutionTransposedThin& ConvolutionTransposedThin::operator=(
ConvolutionTransposedThin&& operation) {
if (this != &operation) {
weights_buf_ = std::move(operation.weights_buf_);
bias_value_ = std::move(operation.bias_value_);
std::swap(kernel_size_, operation.kernel_size_);
std::swap(src_channels_, operation.src_channels_);
std::swap(dst_channels_, operation.dst_channels_);
kernel_ = std::move(operation.kernel_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}
Status ConvolutionTransposedThin::Compile(
const CreationContext& creation_context) {
const auto code = GenerateConvolutionTransposedCode(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, IntegralDivideRoundUp(src_channels_, 4),
dst_channels_, kernel_size_, linked_operations_);
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsAdreno3xx()) {
options.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status ConvolutionTransposedThin::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_buf_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(bias_value_));
return OkStatus();
}
int3 ConvolutionTransposedThin::GetGridSize() const {
const int grid_x = src_[0]->Width();
const int grid_y = src_[0]->Height();
const int grid_z = 1;
return int3(grid_x, grid_y, grid_z);
}
Status ConvolutionTransposedThin::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
Status ConvolutionTransposedThin::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
bool IsConvolutionTransposedThinSupported(
const CLDevice& device, const ConvolutionTransposedAttributes& attr) {
return device.IsAdreno() && attr.weights.shape.o <= 4 &&
attr.weights.shape.w == attr.stride.w &&
attr.weights.shape.h == attr.stride.h &&
attr.padding.prepended.w == 0 && attr.padding.prepended.h == 0;
}
Status CreateConvolutionTransposedThin(
const CreationContext& creation_context, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposedThin* result) {
if (!IsConvolutionTransposedThinSupported(*creation_context.device, attr)) {
return InvalidArgumentError(
"ConvolutionTransposedThin doesn't support this attributes");
}
*result = ConvolutionTransposedThin(definition, attr);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,148 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_THIN_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_THIN_H_
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/flt_type.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/texture2d.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class ConvolutionTransposedThin : public GPUOperation {
public:
ConvolutionTransposedThin() = default;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
ConvolutionTransposedThin(ConvolutionTransposedThin&& operation);
ConvolutionTransposedThin& operator=(ConvolutionTransposedThin&& operation);
ConvolutionTransposedThin(const ConvolutionTransposedThin&) = delete;
ConvolutionTransposedThin& operator=(const ConvolutionTransposedThin&) =
delete;
private:
friend Status CreateConvolutionTransposedThin(
const CreationContext& creation_context, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposedThin* result);
ConvolutionTransposedThin(const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
template <DataType T>
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
template <DataType S, typename T>
void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst);
Status BindArguments();
int3 GetGridSize() const;
Buffer weights_buf_;
FLT4 bias_value_;
int2 kernel_size_;
int src_channels_;
int dst_channels_;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
template <DataType T>
Status ConvolutionTransposedThin::UploadWeights(
const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
const int elements_count =
kernel_size_.x * kernel_size_.y * src_depth * 4 * dst_channels_;
const int float4_size =
definition_.precision == CalculationsPrecision::F32 ? 16 : 8;
if (definition_.GetDataType() == DataType::FLOAT32) {
std::vector<float4> gpu_data(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
return CreateReadOnlyBuffer(float4_size * elements_count, gpu_data.data(),
context, &weights_buf_);
} else {
std::vector<half4> gpu_data(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
return CreateReadOnlyBuffer(float4_size * elements_count, gpu_data.data(),
context, &weights_buf_);
}
}
template <DataType S, typename T>
void ConvolutionTransposedThin::RearrangeWeightsData(
const ::tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;
int counter = 0;
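// For every (src slice, y, x) tap, one vector is written per output channel;
// its lanes hold the weights coming from the four source channels of the
// slice, zero-padded past src_channels_.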
for (int s = 0; s < src_depth; ++s) {
for (int y = 0; y < kernel_y; ++y) {
for (int x = 0; x < kernel_x; ++x) {
std::vector<T> filters(dst_channels_);
for (int j = 0; j < dst_channels_; ++j) {
for (int i = 0; i < 4; ++i) {
const int s_ch = s * 4 + i;
const int d_ch = j;
if (s_ch < src_channels_ && d_ch < dst_channels_) {
const int f_index = weights.shape.LinearIndex({d_ch, y, x, s_ch});
filters[j][i] = weights.data[f_index];
} else {
filters[j][i] = 0.0f;
}
}
}
for (int j = 0; j < dst_channels_; ++j) {
dst[counter++] = filters[j];
}
}
}
}
}
bool IsConvolutionTransposedThinSupported(
const CLDevice& device, const ConvolutionTransposedAttributes& attr);
Status CreateConvolutionTransposedThin(
const CreationContext& creation_context, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposedThin* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_THIN_H_

View File

@ -0,0 +1,114 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_thin.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, ConvolutionTransposedThinSimpleWeights) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
ConvolutionTransposedAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.stride = HW(2, 2);
attr.weights.shape = OHWI(2, 2, 2, 2);
attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.0f, 0.0f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConvolutionTransposedThin operation;
ASSERT_OK(CreateConvolutionTransposedThin(creation_context_, op_def, attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 2), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(eps),
{1.0f, 1.0f, 1.0f, 1.0f, 5.0f, 5.0f, 5.0f, 5.0f,
1.0f, 1.0f, 1.0f, 1.0f, 5.0f, 5.0f, 5.0f, 5.0f,
9.0f, 9.0f, 9.0f, 9.0f, 13.0f, 13.0f, 13.0f, 13.0f,
9.0f, 9.0f, 9.0f, 9.0f, 13.0f, 13.0f, 13.0f, 13.0f}));
}
}
}
TEST_F(OpenCLOperationTest, ConvolutionTransposedThin) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
ConvolutionTransposedAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.stride = HW(2, 2);
attr.weights.shape = OHWI(1, 2, 2, 2);
attr.weights.data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
attr.bias.shape = Linear(1);
attr.bias.data = {0.5f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
ConvolutionTransposedThin operation;
ASSERT_OK(CreateConvolutionTransposedThin(creation_context_, op_def, attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(eps),
{2.5f, 4.5f, 8.5f, 18.5f, 6.5f, 8.5f, 28.5f, 38.5f, 14.5f,
32.5f, 20.5f, 46.5f, 50.5f, 68.5f, 72.5f, 98.5f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,257 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/depth_wise_conv.h"
#include <string>
#include <utility>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
bool IsSpecializedCase(int channel_multiplier) {
return channel_multiplier == 1 || channel_multiplier == 2 ||
channel_multiplier == 4;
}
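// GetSrcValue emits the code that loads the source value feeding output
// slice Z: for multiplier 1 the slices match directly, for 2 and 4 the loaded
// channels are replicated into the lanes, and the generic path computes the
// lane mapping at run time from channel_multiplier.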
std::string GetSrcValue(const TensorCodeGenerator& src_tensor,
int channel_multiplier) {
std::string c;
if (channel_multiplier == 1) {
c +=
" FLT4 src_final =" + src_tensor.Read3D("x_c", "y_c", "Z") + ";\n";
} else if (channel_multiplier == 2) {
c += " int z_layer = Z / 2;\n";
c +=
" FLT4 src =" + src_tensor.Read3D("x_c", "y_c", "z_layer") + ";\n";
c += " FLT2 t0 = Z % 2 == 0 ? src.xy : src.zw;\n";
c += " FLT4 src_final = (FLT4)(t0.x, t0.x, t0.y, t0.y);\n";
} else if (channel_multiplier == 4) {
c += " int z_layer = Z / 4;\n";
c +=
" FLT4 src =" + src_tensor.Read3D("x_c", "y_c", "z_layer") + ";\n";
c += " FLT t0 = src.x;\n";
c += " int reminder = Z % 4;\n";
c += " if (reminder == 1) t0 = src.y;\n";
c += " if (reminder == 2) t0 = src.z;\n";
c += " if (reminder == 3) t0 = src.w;\n";
c += " FLT4 src_final = (FLT4)(t0, t0, t0, t0);\n";
} else {
c += " int z_layer = Z / channel_multiplier;\n";
c +=
" FLT4 src =" + src_tensor.Read3D("x_c", "y_c", "z_layer") + ";\n";
c += " int z_offset = (Z % channel_multiplier) * 4;\n";
c += " FLT4 src_final;\n";
c += " FLT temp_arr[4] = {src.x, src.y, src.z, src.w};\n";
c += " src_final.x = temp_arr[(z_offset + 0) / channel_multiplier];\n";
c += " src_final.y = temp_arr[(z_offset + 1) / channel_multiplier];\n";
c += " src_final.z = temp_arr[(z_offset + 2) / channel_multiplier];\n";
c += " src_final.w = temp_arr[(z_offset + 3) / channel_multiplier];\n";
}
return c;
}
std::string GenerateDepthWiseConvolutionCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const LinearStorage& biases, int channel_multiplier,
const std::vector<ElementwiseOperation*>& linked_operations) {
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
std::string c = GetCommonDefines(precision);
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " __global FLT4* filters, \n";
} else {
c += " __read_only image2d_t filters, \n";
}
c += biases.GetDeclaration();
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int2 kernel_size, \n";
c += " int2 stride, \n";
c += " int2 padding, \n";
c += " int2 dilation, \n";
if (!IsSpecializedCase(channel_multiplier)) {
c += " int channel_multiplier, \n";
}
c += " int4 src_size, \n";
c += " int4 dst_size \n";
c += ") {\n";
c += " int X = get_global_id(0);\n";
c += " int Y = get_global_id(1);\n";
c += " int Z = get_global_id(2);\n";
c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.w) return;\n";
c += " ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
c += " int x_offseted = X * stride.x - padding.x;\n";
c += " int y_offseted = Y * stride.y - padding.y;\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " int fx_c = Z * kernel_size.x * kernel_size.y;\n";
} else {
c += " int fx_c = 0;\n";
}
c += " for (int ky = 0; ky < kernel_size.y; ++ky) {\n";
c += " int y_c = y_offseted + ky * dilation.y;\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " bool outside_y = y_c < 0 || y_c >= src_size.y;\n";
}
c += " for (int kx = 0; kx < kernel_size.x; ++kx) {\n";
c += " int x_c = x_offseted + kx * dilation.x;\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " bool outside_x = x_c < 0 || x_c >= src_size.x;\n";
c += " if (!outside_x && !outside_y) {\n";
c += GetSrcValue(src_tensor, channel_multiplier);
c += " FLT4 f = filters[fx_c];\n";
c += " r += TO_ACCUM_TYPE(src_final * f);\n";
c += " };\n";
c += " fx_c++;\n";
} else {
c += GetSrcValue(src_tensor, channel_multiplier);
c += " FLT4 f = READ_IMAGE(filters, smp_none, (int2)(fx_c, Z)); "
"fx_c++;\n";
c += " r += TO_ACCUM_TYPE(src_final * f);\n";
}
c += " }\n";
c += " }\n";
c += " FLT4 bias_val = " + biases.ReadLinearFLT4("Z") + ";\n";
c += " FLT4 res0 = TO_FLT4(r) + bias_val;\n";
c += " " + dst_tensor.GetAddress("address", "X", "Y", "Z") + "\n";
c += PostProcess(linked_operations, "res0", "Z", "address");
c += " " + dst_tensor.Write3D("res0", "address") + "\n";
c += "}\n";
return c;
}
} // namespace
DepthWiseConvolution::DepthWiseConvolution(
const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
stride_(attr.strides.w, attr.strides.h),
padding_(attr.padding.prepended.w, attr.padding.prepended.h),
dilation_(attr.dilations.w, attr.dilations.h),
channel_multiplier_(attr.weights.shape.o),
work_group_size_(8, 8, 1) {}
DepthWiseConvolution::DepthWiseConvolution(DepthWiseConvolution&& operation)
: GPUOperation(std::move(operation)),
weights_tex2d_(std::move(operation.weights_tex2d_)),
weights_buf_(std::move(operation.weights_buf_)),
weights_(operation.weights_),
biases_(std::move(operation.biases_)),
kernel_size_(operation.kernel_size_),
stride_(operation.stride_),
padding_(operation.padding_),
dilation_(operation.dilation_),
channel_multiplier_(operation.channel_multiplier_),
kernel_(std::move(operation.kernel_)),
work_group_size_(operation.work_group_size_) {}
DepthWiseConvolution& DepthWiseConvolution::operator=(
DepthWiseConvolution&& operation) {
if (this != &operation) {
weights_tex2d_ = std::move(operation.weights_tex2d_);
weights_buf_ = std::move(operation.weights_buf_);
std::swap(weights_, operation.weights_);
biases_ = std::move(operation.biases_);
std::swap(kernel_size_, operation.kernel_size_);
std::swap(stride_, operation.stride_);
std::swap(padding_, operation.padding_);
std::swap(dilation_, operation.dilation_);
std::swap(channel_multiplier_, operation.channel_multiplier_);
kernel_ = std::move(operation.kernel_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}
Status DepthWiseConvolution::Compile(const CreationContext& creation_context) {
const auto code = GenerateDepthWiseConvolutionCode(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, biases_, channel_multiplier_, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status DepthWiseConvolution::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(kernel_size_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(padding_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dilation_));
if (!IsSpecializedCase(channel_multiplier_)) {
RETURN_IF_ERROR(kernel_.SetBytesAuto(int32_t(channel_multiplier_)));
}
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
return OkStatus();
}
int3 DepthWiseConvolution::GetGridSize() const {
const int grid_x = dst_[0]->Width();
const int grid_y = dst_[0]->Height();
const int grid_z = dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
}
Status DepthWiseConvolution::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
Status DepthWiseConvolution::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
Status CreateDepthWiseConvolution(const CreationContext& creation_context,
const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr,
DepthWiseConvolution* result) {
*result = DepthWiseConvolution(definition, attr);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
LinearStorageCreateInfo create_info;
create_info.storage_type =
DeduceLinearStorageType(definition.GetPrimaryStorageType());
create_info.data_type = definition.GetDataType();
create_info.name = "biases";
create_info.aligned_size = attr.weights.shape.o * attr.weights.shape.i;
RETURN_IF_ERROR(CreateLinearStorage(
create_info, attr.bias, creation_context.context, &result->biases_));
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,175 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_DEPTH_WISE_CONV_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_DEPTH_WISE_CONV_H_
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/texture2d.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class DepthWiseConvolution : public GPUOperation {
public:
DepthWiseConvolution() = default;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
DepthWiseConvolution(DepthWiseConvolution&& operation);
DepthWiseConvolution& operator=(DepthWiseConvolution&& operation);
DepthWiseConvolution(const DepthWiseConvolution&) = delete;
DepthWiseConvolution& operator=(const DepthWiseConvolution&) = delete;
private:
friend Status CreateDepthWiseConvolution(
const CreationContext& creation_context, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr,
DepthWiseConvolution* result);
explicit DepthWiseConvolution(const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr);
template <DataType T>
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
template <DataType S, typename T>
void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst);
Status BindArguments();
int3 GetGridSize() const;
Texture2D weights_tex2d_;
Buffer weights_buf_;
cl_mem weights_;
LinearStorage biases_;
int2 kernel_size_;
int2 stride_;
int2 padding_;
int2 dilation_;
int channel_multiplier_;
CLKernel kernel_;
int3 work_group_size_;
};
template <DataType T>
Status DepthWiseConvolution::UploadWeights(
const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int dst_channels = weights.shape.i * weights.shape.o;
const int dst_depth = IntegralDivideRoundUp(dst_channels, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
const int elements_count = kernel_x * kernel_y * dst_depth;
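// One float4/half4 per group of four depthwise output channels per (y, x)
// tap; depending on the primary storage type the data goes into a
// (kernel_x * kernel_y) x dst_depth texture or a flat read-only buffer.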
bool is_buffer_storage =
definition_.GetPrimaryStorageType() == TensorStorageType::BUFFER;
const int float4_size =
definition_.precision == CalculationsPrecision::F32 ? 16 : 8;
if (definition_.GetDataType() == DataType::FLOAT32) {
std::vector<float4> gpu_data(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
if (is_buffer_storage) {
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buf_));
} else {
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), kernel_x * kernel_y, dst_depth,
gpu_data.data(), context, &weights_tex2d_));
}
} else {
std::vector<half4> gpu_data(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
if (is_buffer_storage) {
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buf_));
} else {
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), kernel_x * kernel_y, dst_depth,
gpu_data.data(), context, &weights_tex2d_));
}
}
if (is_buffer_storage) {
weights_ = weights_buf_.GetMemoryPtr();
} else {
weights_ = weights_tex2d_.GetMemoryPtr();
}
return OkStatus();
}
template <DataType S, typename T>
void DepthWiseConvolution::RearrangeWeightsData(
const ::tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int dst_channels = weights.shape.i * weights.shape.o;
const int dst_depth = IntegralDivideRoundUp(dst_channels, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
int counter = 0;
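// Each written vector packs the weights of four consecutive depthwise output
// channels for one (y, x) tap; channels beyond dst_channels are zero-filled.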
for (int d = 0; d < dst_depth; ++d) {
for (int y = 0; y < kernel_y; ++y) {
for (int x = 0; x < kernel_x; ++x) {
T filter_val;
for (int i = 0; i < 4; ++i) {
const int d_ch = d * 4 + i;
if (d_ch < dst_channels) {
const int f_index = weights.shape.LinearIndex(
{d_ch % weights.shape.o, y, x, d_ch / weights.shape.o});
filter_val[i] = weights.data[f_index];
} else {
filter_val[i] = 0.0f;
}
}
dst[counter++] = filter_val;
}
}
}
}
Status CreateDepthWiseConvolution(const CreationContext& creation_context,
const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr,
DepthWiseConvolution* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_DEPTH_WISE_CONV_H_

View File

@ -0,0 +1,249 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/depth_wise_conv_3x3_texture.h"
#include <string>
#include <utility>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::string GenerateDepthWiseConvCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const std::vector<ElementwiseOperation*>& linked_operations) {
std::string c = GetCommonDefines(precision);
TensorCodeGenerator src_tensor("src_data", "dst_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
c += " __read_only image2d_t filters\n";
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 dst_size\n";
c += ") {\n";
c += " int X = get_global_id(0) * 2;\n";
c += " int Y = get_global_id(1) * 2;\n";
c += " int Z = get_global_id(2);\n";
c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.w) return;\n";
c += " ACCUM_FLT4 r0 = (ACCUM_FLT4)(0.0f);\n";
c += " ACCUM_FLT4 r1 = (ACCUM_FLT4)(0.0f);\n";
c += " ACCUM_FLT4 r2 = (ACCUM_FLT4)(0.0f);\n";
c += " ACCUM_FLT4 r3 = (ACCUM_FLT4)(0.0f);\n";
c += " FLT4 f0 = READ_IMAGE(filters, smp_none, (int2)(0, Z));\n";
c += " FLT4 f1 = READ_IMAGE(filters, smp_none, (int2)(1, Z));\n";
c += " FLT4 f2 = READ_IMAGE(filters, smp_none, (int2)(2, Z));\n";
c += " FLT4 f3 = READ_IMAGE(filters, smp_none, (int2)(3, Z));\n";
c += " FLT4 f4 = READ_IMAGE(filters, smp_none, (int2)(4, Z));\n";
c += " FLT4 f5 = READ_IMAGE(filters, smp_none, (int2)(5, Z));\n";
c += " FLT4 f6 = READ_IMAGE(filters, smp_none, (int2)(6, Z));\n";
c += " FLT4 f7 = READ_IMAGE(filters, smp_none, (int2)(7, Z));\n";
c += " FLT4 f8 = READ_IMAGE(filters, smp_none, (int2)(8, Z));\n";
c += " \n";
c += " FLT4 s0;\n";
c += " FLT4 s1;\n";
c += " FLT4 s2;\n";
c += " FLT4 s3;\n";
c += " \n";
c += " {\n";
c += " s0 = " + src_tensor.Read3D("X - 1", "Y - 1", "Z") + ";\n";
c += " s1 = " + src_tensor.Read3D("X", "Y - 1", "Z") + ";\n";
c += " s2 = " + src_tensor.Read3D("X + 1", "Y - 1", "Z") + ";\n";
c += " s3 = " + src_tensor.Read3D("X + 2", "Y - 1", "Z") + ";\n";
c += " r0 += TO_ACCUM_TYPE(f0 * s0);\n";
c += " r0 += TO_ACCUM_TYPE(f1 * s1);\n";
c += " r1 += TO_ACCUM_TYPE(f0 * s1);\n";
c += " r0 += TO_ACCUM_TYPE(f2 * s2);\n";
c += " r1 += TO_ACCUM_TYPE(f1 * s2);\n";
c += " r1 += TO_ACCUM_TYPE(f2 * s3);\n";
c += " }\n";
c += " {\n";
c += " s0 = " + src_tensor.Read3D("X - 1", "Y", "Z") + ";\n";
c += " s1 = " + src_tensor.Read3D("X", "Y", "Z") + ";\n";
c += " s2 = " + src_tensor.Read3D("X + 1", "Y", "Z") + ";\n";
c += " s3 = " + src_tensor.Read3D("X + 2", "Y", "Z") + ";\n";
c += " r0 += TO_ACCUM_TYPE(f3 * s0);\n";
c += " r2 += TO_ACCUM_TYPE(f0 * s0);\n";
c += " r0 += TO_ACCUM_TYPE(f4 * s1);\n";
c += " r1 += TO_ACCUM_TYPE(f3 * s1);\n";
c += " r2 += TO_ACCUM_TYPE(f1 * s1);\n";
c += " r3 += TO_ACCUM_TYPE(f0 * s1);\n";
c += " r0 += TO_ACCUM_TYPE(f5 * s2);\n";
c += " r1 += TO_ACCUM_TYPE(f4 * s2);\n";
c += " r2 += TO_ACCUM_TYPE(f2 * s2);\n";
c += " r3 += TO_ACCUM_TYPE(f1 * s2);\n";
c += " r1 += TO_ACCUM_TYPE(f5 * s3);\n";
c += " r3 += TO_ACCUM_TYPE(f2 * s3);\n";
c += " }\n";
c += " {\n";
c += " s0 = " + src_tensor.Read3D("X - 1", "Y + 1", "Z") + ";\n";
c += " s1 = " + src_tensor.Read3D("X", "Y + 1", "Z") + ";\n";
c += " s2 = " + src_tensor.Read3D("X + 1", "Y + 1", "Z") + ";\n";
c += " s3 = " + src_tensor.Read3D("X + 2", "Y + 1", "Z") + ";\n";
c += " r0 += TO_ACCUM_TYPE(f6 * s0);\n";
c += " r2 += TO_ACCUM_TYPE(f3 * s0);\n";
c += " r0 += TO_ACCUM_TYPE(f7 * s1);\n";
c += " r1 += TO_ACCUM_TYPE(f6 * s1);\n";
c += " r2 += TO_ACCUM_TYPE(f4 * s1);\n";
c += " r3 += TO_ACCUM_TYPE(f3 * s1);\n";
c += " r0 += TO_ACCUM_TYPE(f8 * s2);\n";
c += " r1 += TO_ACCUM_TYPE(f7 * s2);\n";
c += " r2 += TO_ACCUM_TYPE(f5 * s2);\n";
c += " r3 += TO_ACCUM_TYPE(f4 * s2);\n";
c += " r1 += TO_ACCUM_TYPE(f8 * s3);\n";
c += " r3 += TO_ACCUM_TYPE(f5 * s3);\n";
c += " }\n";
c += " {\n";
c += " s0 = " + src_tensor.Read3D("X - 1", "Y + 2", "Z") + ";\n";
c += " s1 = " + src_tensor.Read3D("X", "Y + 2", "Z") + ";\n";
c += " s2 = " + src_tensor.Read3D("X + 1", "Y + 2", "Z") + ";\n";
c += " s3 = " + src_tensor.Read3D("X + 2", "Y + 2", "Z") + ";\n";
c += " r2 += TO_ACCUM_TYPE(f6 * s0);\n";
c += " r2 += TO_ACCUM_TYPE(f7 * s1);\n";
c += " r3 += TO_ACCUM_TYPE(f6 * s1);\n";
c += " r2 += TO_ACCUM_TYPE(f8 * s2);\n";
c += " r3 += TO_ACCUM_TYPE(f7 * s2);\n";
c += " r3 += TO_ACCUM_TYPE(f8 * s3);\n";
c += " }\n";
c += " FLT4 bias = READ_IMAGE(filters, smp_none, (int2)(9, Z));\n";
c += " r0 += TO_ACCUM_TYPE(bias);\n";
c += " r1 += TO_ACCUM_TYPE(bias);\n";
c += " r2 += TO_ACCUM_TYPE(bias);\n";
c += " r3 += TO_ACCUM_TYPE(bias);\n";
c += " if(X + 0 < dst_size.x && Y + 0 < dst_size.y) {\n";
c += " FLT4 result = TO_FLT4(r0);\n";
c += " " + dst_tensor.GetAddress("address", "X + 0", "Y + 0", "Z") + "\n";
c += PostProcess(linked_operations, "result", "Z", "address");
c += " " + dst_tensor.Write3D("result", "address") + "\n";
c += " }\n";
c += " if(X + 1 < dst_size.x && Y + 0 < dst_size.y) {\n";
c += " FLT4 result = TO_FLT4(r1);\n";
c += " " + dst_tensor.GetAddress("address", "X + 1", "Y + 0", "Z") + "\n";
c += PostProcess(linked_operations, "result", "Z", "address");
c += " " + dst_tensor.Write3D("result", "address") + "\n";
c += " }\n";
c += " if(X + 0 < dst_size.x && Y + 1 < dst_size.y) {\n";
c += " FLT4 result = TO_FLT4(r2);\n";
c += " " + dst_tensor.GetAddress("address", "X + 0", "Y + 1", "Z") + "\n";
c += PostProcess(linked_operations, "result", "Z", "address");
c += " " + dst_tensor.Write3D("result", "address") + "\n";
c += " }\n";
c += " if(X + 1 < dst_size.x && Y + 1 < dst_size.y) {\n";
c += " FLT4 result = TO_FLT4(r3);\n";
c += " " + dst_tensor.GetAddress("address", "X + 1", "Y + 1", "Z") + "\n";
c += PostProcess(linked_operations, "result", "Z", "address");
c += " " + dst_tensor.Write3D("result", "address") + "\n";
c += " }\n";
c += " }\n";
return c;
}
} // namespace
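// Editor's illustrative sketch (not part of the original file): a scalar
// reference of the 2x2 output tile computed per work item by the generated
// kernel above. f[9] are the filter taps f0..f8, in[4][4] is the input patch
// whose top-left corner is (X - 1, Y - 1), and out[dy][dx] corresponds to the
// accumulators r0, r1, r2, r3 (bias not included). The function name is
// hypothetical.
inline void DepthWise3x3TileReferenceSketch(const float f[9],
                                            const float in[4][4],
                                            float out[2][2]) {
  for (int dy = 0; dy < 2; ++dy) {
    for (int dx = 0; dx < 2; ++dx) {
      float acc = 0.0f;
      for (int ky = 0; ky < 3; ++ky) {
        for (int kx = 0; kx < 3; ++kx) {
          acc += f[ky * 3 + kx] * in[dy + ky][dx + kx];
        }
      }
      out[dy][dx] = acc;
    }
  }
}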
DepthWiseConv3x3Texture::DepthWiseConv3x3Texture(const OperationDef& definition)
: GPUOperation(definition) {}
DepthWiseConv3x3Texture::DepthWiseConv3x3Texture(
DepthWiseConv3x3Texture&& kernel)
: GPUOperation(std::move(kernel)),
weights_(std::move(kernel.weights_)),
kernel_(std::move(kernel.kernel_)),
work_group_size_(kernel.work_group_size_) {}
DepthWiseConv3x3Texture& DepthWiseConv3x3Texture::operator=(
DepthWiseConv3x3Texture&& kernel) {
if (this != &kernel) {
weights_ = std::move(kernel.weights_);
kernel_ = std::move(kernel.kernel_);
std::swap(work_group_size_, kernel.work_group_size_);
GPUOperation::operator=(std::move(kernel));
}
return *this;
}
Status DepthWiseConv3x3Texture::Compile(
const CreationContext& creation_context) {
std::string code = GenerateDepthWiseConvCode(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status DepthWiseConv3x3Texture::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
return OkStatus();
}
int3 DepthWiseConv3x3Texture::GetGridSize() const {
const int grid_x = IntegralDivideRoundUp(dst_[0]->Width(), 2);
const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), 2);
const int grid_z = dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
}
Status DepthWiseConv3x3Texture::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
Status DepthWiseConv3x3Texture::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
bool IsDepthWiseConv3x3TextureSupported(
const DepthwiseConvolution2DAttributes& attr) {
return attr.weights.shape.o == 1 && attr.dilations.w == 1 &&
attr.dilations.h == 1 && attr.weights.shape.w == 3 &&
attr.weights.shape.h == 3 && attr.strides.w == 1 &&
attr.strides.h == 1 && attr.padding.prepended.w == 1 &&
attr.padding.prepended.h == 1 && attr.padding.appended.w == 1 &&
attr.padding.appended.h == 1;
}
Status CreateDepthWiseConv3x3Texture(
const CreationContext& creation_context, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr,
DepthWiseConv3x3Texture* result) {
if (!IsDepthWiseConv3x3TextureSupported(attr)) {
return InvalidArgumentError(
"DepthWiseConv3x3Texture doesn't support this attributes");
}
*result = DepthWiseConv3x3Texture(definition);
RETURN_IF_ERROR(result->UploadWeightsAndBiases(attr.weights, attr.bias,
creation_context.context));
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,145 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_DEPTH_WISE_CONV_3X3_TEXTURE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_DEPTH_WISE_CONV_3X3_TEXTURE_H_
#include <memory>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/texture2d.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class DepthWiseConv3x3Texture : public GPUOperation {
public:
DepthWiseConv3x3Texture() = default;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
DepthWiseConv3x3Texture(DepthWiseConv3x3Texture&& kernel);
DepthWiseConv3x3Texture& operator=(DepthWiseConv3x3Texture&& kernel);
DepthWiseConv3x3Texture(const DepthWiseConv3x3Texture&) = delete;
DepthWiseConv3x3Texture& operator=(const DepthWiseConv3x3Texture&) = delete;
private:
explicit DepthWiseConv3x3Texture(const OperationDef& definition);
template <DataType T>
Status UploadWeightsAndBiases(const ::tflite::gpu::Tensor<OHWI, T>& weights,
const ::tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context);
friend Status CreateDepthWiseConv3x3Texture(
const CreationContext& creation_context, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr,
DepthWiseConv3x3Texture* result);
template <DataType S, typename T>
void RearrangeWeightsAndBiasesData(
const ::tflite::gpu::Tensor<OHWI, S>& weights,
const ::tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst);
Status BindArguments();
int3 GetGridSize() const;
Texture2D weights_;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
template <DataType T>
Status DepthWiseConv3x3Texture::UploadWeightsAndBiases(
const ::tflite::gpu::Tensor<OHWI, T>& weights,
const ::tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
int texture_width = 10; // 3x3 kernel + 1 bias
int texture_height = src_depth;
const int elements_count = texture_width * texture_height;
if (definition_.GetDataType() == DataType::FLOAT32) {
std::vector<float4> gpu_data(elements_count);
RearrangeWeightsAndBiasesData(weights, biases, absl::MakeSpan(gpu_data));
return CreateTexture2DRGBA(definition_.GetDataType(), texture_width,
texture_height, gpu_data.data(), context,
&weights_);
} else {
std::vector<half4> gpu_data(elements_count);
RearrangeWeightsAndBiasesData(weights, biases, absl::MakeSpan(gpu_data));
return CreateTexture2DRGBA(definition_.GetDataType(), texture_width,
texture_height, gpu_data.data(), context,
&weights_);
}
}
template <DataType S, typename T>
void DepthWiseConv3x3Texture::RearrangeWeightsAndBiasesData(
const ::tflite::gpu::Tensor<OHWI, S>& weights,
const ::tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst) {
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
int counter = 0;
for (int s = 0; s < src_depth; ++s) {
for (int y = 0; y < 3; ++y) {
for (int x = 0; x < 3; ++x) {
T filter_val;
for (int i = 0; i < 4; ++i) {
const int s_ch = s * 4 + i;
if (s_ch < weights.shape.i) {
const int f_index = weights.shape.LinearIndex({0, y, x, s_ch});
filter_val[i] = weights.data[f_index];
} else {
filter_val[i] = 0.0f;
}
}
dst[counter++] = filter_val;
}
}
T bias_val;
for (int i = 0; i < 4; ++i) {
const int dst_ch = s * 4 + i;
bias_val[i] = dst_ch >= biases.shape.v ? 0.0f : biases.data[dst_ch];
}
dst[counter++] = bias_val;
}
}
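// Editor's illustrative sketch (not part of the original header): the texel
// coordinates implied by the packing above, assuming the 10-texel-wide layout
// created in UploadWeightsAndBiases (9 filter taps followed by 1 bias per
// source slice). Function names are hypothetical.
inline int2 FilterTapTexelSketch(int src_slice, int ky, int kx) {
  return int2(ky * 3 + kx, src_slice);  // taps occupy columns 0..8
}
inline int2 BiasTexelSketch(int src_slice) {
  return int2(9, src_slice);  // the bias is the 10th texel in the row
}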
bool IsDepthWiseConv3x3TextureSupported(
const DepthwiseConvolution2DAttributes& attr);
Status CreateDepthWiseConv3x3Texture(
const CreationContext& creation_context, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr,
DepthWiseConv3x3Texture* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_DEPTH_WISE_CONV_3X3_TEXTURE_H_

View File

@ -0,0 +1,111 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/depth_wise_conv_3x3_texture.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, DepthWiseConv3x3TextureSimpleWeights) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
DepthwiseConvolution2DAttributes attr;
attr.padding.prepended = HW(1, 1);
attr.padding.appended = HW(1, 1);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(1, 3, 3, 2);
attr.weights.data = {0.0f, 1.0f, 1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 1.0f, 0.0f, 1.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.0f, 0.0f};
for (auto storage : env_.GetSupportedTextureStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
DepthWiseConv3x3Texture operation;
ASSERT_OK(CreateDepthWiseConv3x3Texture(creation_context_, op_def, attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {6.0f, 16.0f, 8.0f, 16.0f, 10.0f,
16.0f, 12.0f, 16.0f}));
}
}
}
TEST_F(OpenCLOperationTest, DepthWiseConv3x3Texture) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
DepthwiseConvolution2DAttributes attr;
attr.padding.prepended = HW(1, 1);
attr.padding.appended = HW(1, 1);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(1, 3, 3, 2);
attr.weights.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 0.0f, 1.0f, 2.0f,
3.0f, 4.0f, 5.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.5f, -0.5f};
for (auto storage : env_.GetSupportedTextureStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
DepthWiseConv3x3Texture operation;
ASSERT_OK(CreateDepthWiseConv3x3Texture(creation_context_, op_def, attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {40.5f, 67.5f, 16.5f, 35.5f, 40.5f,
67.5f, 16.5f, 35.5f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,148 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/depth_wise_conv.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, DepthWiseConvSimpleWeights) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
DepthwiseConvolution2DAttributes attr;
attr.padding.prepended = HW(1, 0);
attr.padding.appended = HW(1, 0);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(1, 3, 1, 2);
attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.0f, 0.0f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
DepthWiseConvolution operation;
ASSERT_OK(CreateDepthWiseConvolution(creation_context_, op_def, attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {4.0f, 6.0f, 8.0f, 10.0f, 4.0f,
6.0f, 8.0f, 10.0f}));
}
}
}
TEST_F(OpenCLOperationTest, DepthWiseConvNoMultiplier) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
DepthwiseConvolution2DAttributes attr;
attr.padding.prepended = HW(1, 0);
attr.padding.appended = HW(1, 0);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(1, 3, 1, 2);
attr.weights.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.5f, -0.5f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
DepthWiseConvolution operation;
ASSERT_OK(CreateDepthWiseConvolution(creation_context_, op_def, attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {16.5f, 27.5f, 28.5f, 43.5f, 8.5f,
15.5f, 12.5f, 23.5f}));
}
}
}
TEST_F(OpenCLOperationTest, DepthWiseConvMultiplier2) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
DepthwiseConvolution2DAttributes attr;
attr.padding.prepended = HW(1, 0);
attr.padding.appended = HW(1, 0);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(2, 3, 1, 2);
attr.weights.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
attr.bias.shape = Linear(4);
attr.bias.data = {0.5f, -0.5f, 1.0f, -1.0f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
DepthWiseConvolution operation;
ASSERT_OK(CreateDepthWiseConvolution(creation_context_, op_def, attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 4), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(eps),
{16.5f, 39.5f, 29.0f, 63.0f, 28.5f, 75.5f, 45.0f, 103.0f,
8.5f, 31.5f, 17.0f, 51.0f, 12.5f, 59.5f, 25.0f, 83.0f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,64 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/flt_type.h"
#include "absl/strings/str_cat.h"
namespace tflite {
namespace gpu {
namespace cl {
FLT::FLT(CalculationsPrecision precision, float value)
: f32_(precision == CalculationsPrecision::F32), active_(true) {
if (f32_) {
f_value_ = value;
} else {
h_value_ = half(value);
}
}
const void* FLT::GetData() const {
return f32_ ? static_cast<const void*>(&f_value_)
: static_cast<const void*>(&h_value_);
}
std::string FLT::GetDeclaration() const {
const std::string type = f32_ ? "float" : "half";
return absl::StrCat(type, " ", name_);
}
FLT4::FLT4(CalculationsPrecision precision, const float4& value)
: f32_(precision == CalculationsPrecision::F32), active_(true) {
if (f32_) {
f_value_ = value;
} else {
h_value_ = half4(value);
}
}
const void* FLT4::GetData() const {
return f32_ ? static_cast<const void*>(&f_value_)
: static_cast<const void*>(&h_value_);
}
std::string FLT4::GetDeclaration() const {
const std::string type = f32_ ? "float4" : "half4";
return absl::StrCat(type, " ", name_);
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,72 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_FLT_TYPE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_FLT_TYPE_H_
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class FLT {
public:
FLT() = default;
FLT(CalculationsPrecision precision, float value);
const void* GetData() const;
size_t GetSize() const { return f32_ ? sizeof(float) : sizeof(half); }
bool Active() const { return active_; }
std::string GetDeclaration() const;
std::string GetName() const { return name_; }
void SetName(const std::string& name) { name_ = name; }
private:
float f_value_;
half h_value_;
bool f32_;
bool active_ = false;
std::string name_;
};
class FLT4 {
public:
FLT4() {}
FLT4(CalculationsPrecision precision, const float4& value);
const void* GetData() const;
size_t GetSize() const { return f32_ ? sizeof(float4) : sizeof(half4); }
bool Active() const { return active_; }
std::string GetDeclaration() const;
std::string GetName() const { return name_; }
void SetName(const std::string& name) { name_ = name; }
private:
float4 f_value_;
half4 h_value_;
bool f32_;
bool active_ = false;
std::string name_;
};
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_FLT_TYPE_H_

View File

@ -0,0 +1,189 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_texture.h"
#include <string>
#include <utility>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
// We split the vec-vec dot product (each thread computes a vec-vec dot in the
// basic vec-mat multiplication) into 4 parts to create more threads: the
// thread with a given tid.y processes every 4th element of the dot product.
// This gives good results for ~1024 x 1024 sizes; for other sizes more
// optimized shaders could be written. A plain C++ sketch of this split follows
// GetFullyConnectedKernelCode below.
std::string GetFullyConnectedKernelCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const std::vector<ElementwiseOperation*>& linked_operations,
const int3& work_group_size) {
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
std::string c = GetCommonDefines(precision);
switch (precision) {
case CalculationsPrecision::F32:
c += "#define READ_IMAGE read_imagef\n";
break;
case CalculationsPrecision::F32_F16:
case CalculationsPrecision::F16:
c += "#define READ_IMAGE read_imageh\n";
break;
}
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
c += " __read_only image2d_t filters,\n";
c += " __read_only image2d_t biases";
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 src_size, \n";
c += " int4 dst_size, \n";
c += " int src_depth_x4 \n";
c += ") {\n";
c += " int gid = get_global_id(0);\n";
c += " int2 tid = (int2)(get_local_id(0), get_local_id(1));\n";
c += " ACCUM_FLT4 s = (ACCUM_FLT4)(0.0f);\n";
c += " uint c = tid.y;\n"; // vector coord for every thread
c += " uint c2 = tid.y * 2;\n"; // it should be * 4, so as we have FLT4
// but we keep half8 in float4 so, we have * 2 y_coord for texture
c += " for (int i = 0; i < src_depth_x4; ++i, c += 4, c2 += 8) {\n";
c += " FLT4 v = " + src_tensor.Read3D("0", "0", "c") + ";\n";
if (precision != CalculationsPrecision::F32) {
c += " half8 m0 = as_half8(read_imagef(filters, smp_none, (int2)(gid, "
"c2+0)));\n";
c += " half8 m1 = as_half8(read_imagef(filters, smp_none, (int2)(gid, "
"c2+1)));\n";
c += " s.x += (v.x * m0.s0 + v.y * m0.s1 + v.z * m0.s2 + v.w * m0.s3);\n";
c += " s.y += (v.x * m0.s4 + v.y * m0.s5 + v.z * m0.s6 + v.w * m0.s7);\n";
c += " s.z += (v.x * m1.s0 + v.y * m1.s1 + v.z * m1.s2 + v.w * m1.s3);\n";
c += " s.w += (v.x * m1.s4 + v.y * m1.s5 + v.z * m1.s6 + v.w * m1.s7);\n";
} else {
c += " float4 m0 = read_imagef(filters, smp_none, (int2)(gid * 4 + 0, "
"c));\n";
c += " float4 m1 = read_imagef(filters, smp_none, (int2)(gid * 4 + 1, "
"c));\n";
c += " float4 m2 = read_imagef(filters, smp_none, (int2)(gid * 4 + 2, "
"c));\n";
c += " float4 m3 = read_imagef(filters, smp_none, (int2)(gid * 4 + 3, "
"c));\n";
c += " s.x += (v.x * m0.s0 + v.y * m0.s1 + v.z * m0.s2 + v.w * m0.s3);\n";
c += " s.y += (v.x * m1.s0 + v.y * m1.s1 + v.z * m1.s2 + v.w * m1.s3);\n";
c += " s.z += (v.x * m2.s0 + v.y * m2.s1 + v.z * m2.s2 + v.w * m2.s3);\n";
c += " s.w += (v.x * m3.s0 + v.y * m3.s1 + v.z * m3.s2 + v.w * m3.s3);\n";
}
c += " }\n";
c += " __local ACCUM_FLT4 temp[" + std::to_string(work_group_size.x) + "][" +
std::to_string(work_group_size.y) + "];\n";
c += " temp[tid.x][tid.y] = s;\n";
c += " barrier(CLK_LOCAL_MEM_FENCE);\n";
c += " if (tid.y == 0 && gid < dst_size.w) {\n";
c += " s += temp[tid.x][1];\n";
c += " s += temp[tid.x][2];\n";
c += " s += temp[tid.x][3];\n";
c += " FLT4 r0 = TO_FLT4(s) + READ_IMAGE(biases, smp_none, (int2)(gid, "
"0));\n";
c += " " + dst_tensor.GetAddress("dst_adr", "0", "0", "gid") + "\n";
c += PostProcess(linked_operations, "r0", "gid", "dst_adr");
c += " " + dst_tensor.Write3D("r0", "dst_adr") + "\n";
c += " }\n";
c += "}\n";
return c;
}
} // namespace
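// Editor's illustrative sketch (not part of the original file): the split of
// the vec-vec dot product described in the comment above, written as plain,
// single-threaded C++. Four "threads" (tid_y = 0..3) each accumulate every
// 4th element of the dot product; their partial sums are then reduced, which
// is what the local-memory/barrier section of the generated OpenCL code does.
// The function name is hypothetical.
inline float SplitDotProductSketch(const float* vec, const float* mat_row,
                                   int size) {
  float partial[4] = {0.0f, 0.0f, 0.0f, 0.0f};
  for (int tid_y = 0; tid_y < 4; ++tid_y) {
    for (int i = tid_y; i < size; i += 4) {
      partial[tid_y] += vec[i] * mat_row[i];
    }
  }
  return partial[0] + partial[1] + partial[2] + partial[3];
}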
FullyConnectedTexture::FullyConnectedTexture(const OperationDef& definition)
: GPUOperation(definition) {}
FullyConnectedTexture::FullyConnectedTexture(FullyConnectedTexture&& kernel)
: GPUOperation(std::move(kernel)),
weights_(std::move(kernel.weights_)),
biases_(std::move(kernel.biases_)),
kernel_(std::move(kernel.kernel_)),
work_group_size_(kernel.work_group_size_) {}
FullyConnectedTexture& FullyConnectedTexture::operator=(
FullyConnectedTexture&& kernel) {
if (this != &kernel) {
weights_ = std::move(kernel.weights_);
biases_ = std::move(kernel.biases_);
kernel_ = std::move(kernel.kernel_);
std::swap(work_group_size_, kernel.work_group_size_);
GPUOperation::operator=(std::move(kernel));
}
return *this;
}
Status FullyConnectedTexture::Compile(const CreationContext& creation_context) {
int wg_width = 32;
int wg_height = 4;
int work_items;
do {
work_group_size_ = {wg_width, wg_height, 1};
wg_width /= 2;
const auto code = GetFullyConnectedKernelCode(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, linked_operations_, work_group_size_);
RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_));
work_items = work_group_size_.x * work_group_size_.y * work_group_size_.z;
} while (work_items > kernel_.GetMaxWorkGroupSize());
return OkStatus();
}
Status FullyConnectedTexture::AddToQueue(CLCommandQueue* queue) {
const int src_depth_x4 = IntegralDivideRoundUp(src_[0]->Depth(), 4);
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_depth_x4));
return queue->DispatchImplicit(kernel_, {dst_[0]->Depth(), 1, 1},
work_group_size_);
}
Status CreateFullyConnectedTexture(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
FullyConnectedTexture* result) {
*result = FullyConnectedTexture(definition);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
LinearStorageCreateInfo create_info;
create_info.storage_type = LinearStorageType::TEXTURE_2D;
create_info.data_type = definition.GetDataType();
create_info.aligned_size = attr.weights.shape.o;
RETURN_IF_ERROR(CreateLinearStorage(
create_info, attr.bias, creation_context.context, &result->biases_));
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,179 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_FULLY_CONNECTED_TEXTURE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_FULLY_CONNECTED_TEXTURE_H_
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/texture2d.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class FullyConnectedTexture : public GPUOperation {
public:
FullyConnectedTexture() = default;
Status AddToQueue(CLCommandQueue* queue) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
FullyConnectedTexture(FullyConnectedTexture&& kernel);
FullyConnectedTexture& operator=(FullyConnectedTexture&& kernel);
FullyConnectedTexture(const FullyConnectedTexture&) = delete;
FullyConnectedTexture& operator=(const FullyConnectedTexture&) = delete;
private:
explicit FullyConnectedTexture(const OperationDef& definition);
friend Status CreateFullyConnectedTexture(
const CreationContext& creation_context, const OperationDef& definition,
const FullyConnectedAttributes& attr, FullyConnectedTexture* result);
template <DataType T>
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
template <DataType T>
void RearrangeWeightsFP16(const ::tflite::gpu::Tensor<OHWI, T>& weights,
absl::Span<half4> dst);
template <DataType T>
void RearrangeWeightsFP32(const ::tflite::gpu::Tensor<OHWI, T>& weights,
absl::Span<float4> dst);
Texture2D weights_;
LinearStorage biases_;
CLKernel kernel_;
int3 work_group_size_ = int3(0, 0, 0);
};
template <DataType T>
Status FullyConnectedTexture::UploadWeights(
const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int src_depth = AlignByN(IntegralDivideRoundUp(weights.shape.i, 4), 4);
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
if (definition_.GetDataType() == DataType::FLOAT32) {
std::vector<float4> gpu_data(dst_depth * src_depth * 4);
RearrangeWeightsFP32(weights, absl::MakeSpan(gpu_data));
return CreateTexture2DRGBA(DataType::FLOAT32, dst_depth * 4, src_depth,
gpu_data.data(), context, &weights_);
} else {
std::vector<half4> gpu_data(dst_depth * src_depth * 4);
RearrangeWeightsFP16(weights, absl::MakeSpan(gpu_data));
return CreateTexture2DRGBA(DataType::FLOAT32, dst_depth, src_depth * 2,
gpu_data.data(), context, &weights_);
}
}
template <DataType T>
void FullyConnectedTexture::RearrangeWeightsFP16(
const ::tflite::gpu::Tensor<OHWI, T>& weights, absl::Span<half4> dst) {
const int src_depth = AlignByN(IntegralDivideRoundUp(weights.shape.i, 4), 4);
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
int counter = 0;
for (int s = 0; s < src_depth; ++s) {
for (int d = 0; d < dst_depth; ++d) {
half4 filters[2];
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 4; ++j) {
const int dst_ch = d * 4 + i;
const int src_ch = s * 4 + j;
if (dst_ch < weights.shape.o && src_ch < weights.shape.i) {
const int f_index =
weights.shape.LinearIndex({dst_ch, 0, 0, src_ch});
filters[i][j] = weights.data[f_index];
} else {
filters[i][j] = 0.0;
}
}
}
dst[counter++] = filters[0];
dst[counter++] = filters[1];
}
for (int d = 0; d < dst_depth; ++d) {
half4 filters[2];
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 4; ++j) {
const int dst_ch = d * 4 + 2 + i;
const int src_ch = s * 4 + j;
if (dst_ch < weights.shape.o && src_ch < weights.shape.i) {
const int f_index =
weights.shape.LinearIndex({dst_ch, 0, 0, src_ch});
filters[i][j] = weights.data[f_index];
} else {
filters[i][j] = 0.0;
}
}
}
dst[counter++] = filters[0];
dst[counter++] = filters[1];
}
}
}
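// Editor's illustrative sketch (not part of the original header): in the F16
// layout produced by RearrangeWeightsFP16 above, every float4 texel packs a
// half8. The texel at x = dst_slice, y = 2 * src_slice + r holds the weights
// of output channels dst_slice * 4 + 2 * r and dst_slice * 4 + 2 * r + 1 over
// input channels src_slice * 4 .. src_slice * 4 + 3, matching the
// read_imagef(filters, (int2)(gid, c2 + r)) reads in the generated kernel.
// The function name is hypothetical.
inline int2 Fp16WeightTexelSketch(int dst_slice, int src_slice, int r) {
  return int2(dst_slice, 2 * src_slice + r);
}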
template <DataType T>
void FullyConnectedTexture::RearrangeWeightsFP32(
const ::tflite::gpu::Tensor<OHWI, T>& weights, absl::Span<float4> dst) {
const int src_depth = AlignByN(IntegralDivideRoundUp(weights.shape.i, 4), 4);
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
int counter = 0;
for (int s = 0; s < src_depth; ++s) {
for (int d = 0; d < dst_depth; ++d) {
float4 filters[4];
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
const int dst_ch = d * 4 + i;
const int src_ch = s * 4 + j;
if (dst_ch < weights.shape.o && src_ch < weights.shape.i) {
const int f_index =
weights.shape.LinearIndex({dst_ch, 0, 0, src_ch});
filters[i][j] = weights.data[f_index];
} else {
filters[i][j] = 0.0;
}
}
}
dst[counter++] = filters[0];
dst[counter++] = filters[1];
dst[counter++] = filters[2];
dst[counter++] = filters[3];
}
}
}
Status CreateFullyConnectedTexture(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
FullyConnectedTexture* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_FULLY_CONNECTED_TEXTURE_H_

View File

@ -0,0 +1,67 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_texture.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, FullyConnectedTexture) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 1, 1, 4);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
FullyConnectedAttributes attr;
attr.weights.shape = OHWI(2, 1, 1, 4);
attr.weights.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.5f, -0.5f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
FullyConnectedTexture operation;
ASSERT_OK(CreateFullyConnectedTexture(creation_context_, op_def, attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 1, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {14.5f, 37.5f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,192 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/common/access_type.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::string GetElementWiseCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const ElementwiseOperation& op,
const std::vector<ElementwiseOperation*>& linked_operations) {
TensorCodeGenerator src_tensor("src_data", "dst_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
std::string c = GetCommonDefines(precision);
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ);
c += op.GetArgsDeclaration();
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 dst_size\n";
c += ") {\n";
c += " int X = get_global_id(0);\n";
c += " int Y = get_global_id(1);\n";
c += " int Z = get_global_id(2);\n";
c += " if (X >= dst_size.x || Y >= dst_size.y) { \n";
c += " return; \n";
c += " } \n";
c += " " + src_tensor.GetAddress("address", "X", "Y", "Z") + "\n";
c += " FLT4 src = " + src_tensor.Read3D("address") + ";\n";
c += " " + op.GetCoreCode("src", "Z", "address");
c += PostProcess(linked_operations, "src", "Z", "address");
c += " " + dst_tensor.Write3D("src", "address") + "\n";
c += "} \n";
return c;
}
} // namespace
DataType OperationDef::GetDataType() const {
return DeduceDataTypeFromPrecision(precision);
}
DataType OperationDef::GetPrimaryDataType() const {
return src_tensors[0].data_type;
}
TensorStorageType OperationDef::GetPrimaryStorageType() const {
return src_tensors[0].storage_type;
}
GPUOperation::GPUOperation(const OperationDef& definition)
: definition_(definition) {}
void GPUOperation::SetSrc(Tensor* ptr, int index) {
if (index >= src_.size()) {
src_.resize(index + 1, nullptr);
}
src_[index] = ptr;
}
void GPUOperation::SetDst(Tensor* ptr, int index) {
if (index >= dst_.size()) {
dst_.resize(index + 1, nullptr);
}
dst_[index] = ptr;
}
GPUOperation::GPUOperation(GPUOperation&& operation)
: definition_(std::move(operation.definition_)),
src_(std::move(operation.src_)),
dst_(std::move(operation.dst_)),
linked_operations_(std::move(operation.linked_operations_)) {}
GPUOperation& GPUOperation::operator=(GPUOperation&& operation) {
if (this != &operation) {
definition_ = std::move(operation.definition_);
src_ = std::move(operation.src_);
dst_ = std::move(operation.dst_);
linked_operations_ = std::move(operation.linked_operations_);
}
return *this;
}
void GPUOperation::AddOperation(ElementwiseOperation* operation) {
linked_operations_.push_back(operation);
operation->SetLinkIndex(linked_operations_.size());
}
ElementwiseOperation::ElementwiseOperation(ElementwiseOperation&& operation)
: GPUOperation(std::move(operation)),
kernel_(std::move(operation.kernel_)),
work_group_size_(operation.work_group_size_) {}
ElementwiseOperation& ElementwiseOperation::operator=(
ElementwiseOperation&& operation) {
if (this != &operation) {
kernel_ = std::move(operation.kernel_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}
Status ElementwiseOperation::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(BindArguments(&kernel_));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
return OkStatus();
}
int3 ElementwiseOperation::GetGridSize() const {
const int grid_x = dst_[0]->Width();
const int grid_y = dst_[0]->Height();
const int grid_z = dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
}
Status ElementwiseOperation::Compile(const CreationContext& creation_context) {
const auto code =
GetElementWiseCode(definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, *this, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status ElementwiseOperation::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
Status ElementwiseOperation::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
std::string GetArgsDeclaration(
const std::vector<ElementwiseOperation*>& linked_ops) {
std::string code;
for (auto linked_op : linked_ops) {
code += linked_op->GetArgsDeclaration();
}
code += ",\n";
return code;
}
std::string PostProcess(const std::vector<ElementwiseOperation*>& linked_ops,
const std::string& var_name, const std::string& z_coord,
const std::string& global_address) {
std::string code;
for (auto linked_op : linked_ops) {
code += linked_op->GetCoreCode(var_name, z_coord, global_address);
}
return code;
}
Status BindArgs(CLKernel* kernel,
const std::vector<ElementwiseOperation*>& linked_ops) {
for (auto linked_op : linked_ops) {
RETURN_IF_ERROR(linked_op->BindArguments(kernel));
}
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,180 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_GPU_OPERATION_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_GPU_OPERATION_H_
#include <memory>
#include <string>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/tuning_parameters.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/program_cache.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
struct CreationContext {
const CLDevice* device;
CLContext* context;
CLCommandQueue* queue;
ProgramCache* cache;
};
struct OperationDef {
CalculationsPrecision precision;
std::vector<TensorDescriptor> src_tensors;
std::vector<TensorDescriptor> dst_tensors;
// returns FLOAT32 for F32 precision and FLOAT16 for F16 precision
DataType GetDataType() const;
// "Primary" means the first src tensor, because the first tensor usually
// defines the structure of the kernel and the types of all other resources
// (biases, etc.).
DataType GetPrimaryDataType() const;
TensorStorageType GetPrimaryStorageType() const;
};
class ElementwiseOperation;
// GPUOperation represents an implementation of a neural network operation on
// the GPU. A GPUOperation can contain ElementwiseOperation operations; in that
// case each ElementwiseOperation still holds its own data and must stay alive.
// When a GPUOperation contains ElementwiseOperations, the GPUOperation
// replaces a sequence of operations Op + el_op0 + el_op1 + ...
// Because of this, the usage scenario is:
//   1. Create an instance of GPUOperation.
//   2. Create all instances of ElementwiseOperations that will (probably) be
//      attached to it, and attach them with AddOperation.
//   3. Call GPUOperation::Compile(). Do not call ElementwiseOperation::Compile()
//      on an attached operation; it is unnecessary (and may be an error).
// A usage sketch follows the GPUOperation class definition below.
class GPUOperation {
public:
GPUOperation() = default;
explicit GPUOperation(const OperationDef& definition);
virtual ~GPUOperation() = default;
// Move only
GPUOperation(GPUOperation&& operation);
GPUOperation& operator=(GPUOperation&& operation);
GPUOperation(const GPUOperation&) = delete;
GPUOperation& operator=(const GPUOperation&) = delete;
void AddOperation(ElementwiseOperation* operation);
void SetSrc(Tensor* ptr, int index = 0);
void SetDst(Tensor* ptr, int index = 0);
virtual Status AddToQueue(CLCommandQueue* queue) { return OkStatus(); }
virtual Status Tune(const TuningParameters& params) { return OkStatus(); }
virtual Status Compile(const CreationContext& creation_context) {
return OkStatus();
}
const OperationDef& GetDefinition() const { return definition_; }
protected:
// Defines operation calculation precision and format of src/dst tensors.
OperationDef definition_;
std::vector<Tensor*> src_;
std::vector<Tensor*> dst_;
std::vector<ElementwiseOperation*> linked_operations_;
};
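// Editor's illustrative usage sketch (not part of the original header) of the
// scenario described in the comment above GPUOperation: link the elementwise
// operation before compiling, then bind tensors and enqueue. `op` is any
// concrete GPUOperation and `el_op` any concrete ElementwiseOperation; both
// must outlive the compiled operation. The function name is hypothetical.
inline Status RunFusedOperationSketch(GPUOperation* op,
                                      ElementwiseOperation* el_op,
                                      const CreationContext& creation_context,
                                      Tensor* src, Tensor* dst,
                                      CLCommandQueue* queue) {
  op->AddOperation(el_op);                         // attach before Compile()
  RETURN_IF_ERROR(op->Compile(creation_context));  // el_op is NOT compiled
  op->SetSrc(src);
  op->SetDst(dst);
  return op->AddToQueue(queue);
}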
// An ElementwiseOperation can be fused (linked) to another operation.
// The linked_ field indicates whether this is the case.
// link_index_ is used mostly to generate correct names for linked code
// variables. It is the position of the operation in the sequence of linked
// operations and must be unique within that sequence.
// link_index_ = 0 means the operation is not linked.
class ElementwiseOperation : public GPUOperation {
public:
ElementwiseOperation() {}
explicit ElementwiseOperation(const OperationDef& definition)
: GPUOperation(definition) {}
virtual ~ElementwiseOperation() {}
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
ElementwiseOperation(ElementwiseOperation&& operation);
ElementwiseOperation& operator=(ElementwiseOperation&& operation);
ElementwiseOperation(const ElementwiseOperation&) = delete;
ElementwiseOperation& operator=(const ElementwiseOperation&) = delete;
// We need this function to resolve naming conflicts.
// Unfortunately we don't know upfront (at creation time) whether the operation
// will be linked or not. The operation should be created and SetLinkIndex(0)
// called to initialize its link-specific info; index 0 means the operation is
// not linked. If we later decide to link it, we must update the link info and
// name the kernel arguments according to this index (it is the responsibility
// of each particular ElementwiseOperation implementation to generate the right
// names).
virtual void SetLinkIndex(int index) {}
virtual std::string GetCoreCode(const std::string& src,
const std::string& z_coord,
const std::string& address) const = 0;
virtual std::string GetArgsDeclaration() const { return ""; }
virtual Status BindArguments(CLKernel* kernel) { return OkStatus(); }
protected:
Status BindArguments();
int3 GetGridSize() const;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
// Generates the argument declarations string for the elementwise operations
// in linked_ops.
// Every ElementwiseOperation can generate its argument declarations.
std::string GetArgsDeclaration(
const std::vector<ElementwiseOperation*>& linked_ops);
// Generates shader code for every elementwise operation in linked_ops.
// linked_ops - vector of operation pointers
// var_name - name of the shader variable that we update/change
// z_coord - name of the shader variable holding the currently processed Z
// coordinate of the tensor's 3D grid (WHC/XYZ); this coordinate is in
// layer/slice space (groups of 4 channels), not in channels.
// global_address - name of the variable holding the tensor's coordinates in
// the 3D grid (WHC/XYZ); different tensor layouts encode this address
// differently.
std::string PostProcess(const std::vector<ElementwiseOperation*>& linked_ops,
const std::string& var_name, const std::string& z_coord,
const std::string& global_address);
// Binds the arguments of the elementwise operations in linked_ops to the
// given kernel.
// Every ElementwiseOperation can bind its own arguments.
Status BindArgs(CLKernel* kernel,
const std::vector<ElementwiseOperation*>& linked_ops);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_GPU_OPERATION_H_

View File

@ -0,0 +1,63 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_HARD_SWISH_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_HARD_SWISH_H_
#include <memory>
#include <string>
#include "absl/memory/memory.h"
#include "absl/strings/substitute.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
namespace tflite {
namespace gpu {
namespace cl {
class HardSwish : public ElementwiseOperation {
public:
static std::unique_ptr<HardSwish> Create(const OperationDef& op_def) {
auto h_swish = absl::make_unique<HardSwish>(op_def);
h_swish->SetLinkIndex(0);
return h_swish;
}
HardSwish() = delete;
explicit HardSwish(const OperationDef& op_def)
: ElementwiseOperation(op_def) {}
HardSwish(const HardSwish&) = delete;
HardSwish(HardSwish&& h_swish) : ElementwiseOperation(std::move(h_swish)) {}
HardSwish& operator=(const HardSwish&) = delete;
HardSwish& operator=(HardSwish&& h_swish) {
if (this != &h_swish) ElementwiseOperation::operator=(std::move(h_swish));
return *this;
}
std::string GetCoreCode(const std::string& src, const std::string& z_coord,
const std::string& address) const override {
return absl::Substitute(
"$0 *= clamp($0 / 6.0f + (FLT4)(0.5f), (FLT4)(0.0f), (FLT4)(1.0f));\n",
src);
}
};
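// Editor's illustrative scalar reference (not part of the original header) of
// the hard-swish computed by GetCoreCode above: x * clamp(x / 6 + 0.5, 0, 1).
// The function name is hypothetical.
inline float HardSwishScalarSketch(float x) {
  float gate = x / 6.0f + 0.5f;
  if (gate < 0.0f) gate = 0.0f;
  if (gate > 1.0f) gate = 1.0f;
  return x * gate;
}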
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_HARD_SWISH_H_

View File

@ -0,0 +1,60 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/hard_swish.h"
#include <memory>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, HardSwish) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 1, 1, 7);
src_tensor.data = {-4.5f, -3.0f, -1.5f, 0.0f, 1.5f, 3.0f, 4.5f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
auto h_swish = HardSwish::Create(op_def);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_,
h_swish.get(), src_tensor.shape,
&dst_tensor));
EXPECT_THAT(
dst_tensor.data,
testing::Pointwise(testing::FloatNear(eps),
{0.0f, 0.0f, -0.375f, 0.0f, 1.125f, 3.f, 4.5f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,164 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.h"
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
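// Generates a kernel that, for every output element, locates the pooling
// window cell it maps to (t_index) and copies the source value for a channel
// only if the stored index for that channel matches; all other positions are
// written as 0.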
std::string GetMaxUnpoolingKernelCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& src_ind_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const std::vector<ElementwiseOperation*>& linked_operations) {
TensorCodeGenerator src("src_data", "src_size", src_descriptor);
TensorCodeGenerator src_ind("src_data_indices", "src_size",
src_ind_descriptor);
TensorCodeGenerator dst("dst_data", "dst_size", dst_descriptor);
std::string code = GetCommonDefines(precision);
code += "__kernel void main_function(\n";
code += src.GetDeclaration(AccessType::READ) + ",\n";
code += src_ind.GetDeclaration(AccessType::READ);
code += GetArgsDeclaration(linked_operations);
code += dst.GetDeclaration(AccessType::WRITE) + ",\n";
code += " int4 src_size, \n";
code += " int4 dst_size, \n";
code += " int2 kernel_size, \n";
code += " int2 padding, \n";
code += " int2 stride \n";
code += ") {\n";
code += " int X = get_global_id(0);\n";
code += " int Y = get_global_id(1);\n";
code += " int Z = get_global_id(2);\n";
code += " if (X >= dst_size.x || Y >= dst_size.y) return; \n";
code += " int src_x = (X + padding.x) / stride.x;\n";
code += " int src_y = (Y + padding.y) / stride.y;\n";
code += " " + src.GetAddress("src_adr", "src_x", "src_y", "Z") + "\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
code += " bool outside = src_x < 0 || src_y < 0 ||";
code += " src_x >= src_size.x || src_y >= src_size.y;\n";
code += " FLT4 src = (FLT4)(0.0f);\n";
code += " int4 ind = (int4)(0);\n";
code += " if (!outside) {\n";
code += " src = " + src.Read3D("src_adr") + ";\n";
code += " ind = convert_int4(" + src_ind.Read3D("src_adr") + ");\n";
code += " }\n";
  } else {  // no boundary checks are needed for textures
code += " FLT4 src = " + src.Read3D("src_adr") + ";\n";
code += " int4 ind = convert_int4(" + src_ind.Read3D("src_adr") + ");\n";
}
code += " int t_x = X - (src_x * stride.x - padding.x);\n";
code += " int t_y = Y - (src_y * stride.y - padding.y);\n";
code += " int t_index = t_y * kernel_size.x + t_x;\n";
code += " FLT4 result;\n";
const std::string channels[] = {".x", ".y", ".z", ".w"};
for (int i = 0; i < 4; ++i) {
const auto& s = channels[i];
code += " result" + s + "= t_index == ind" + s + "? src" + s + ": 0.0f;\n";
}
code += " " + dst.GetAddress("address", "X", "Y", "Z") + "\n";
code += PostProcess(linked_operations, "result", "Z", "address");
code += " " + dst.Write3D("result", "address");
code += "}\n";
return code;
}
} // namespace
MaxUnpooling::MaxUnpooling(const OperationDef& definition,
const MaxUnpooling2DAttributes& attr)
: GPUOperation(definition),
stride_(attr.strides.w, attr.strides.h),
padding_(attr.padding.appended.w, attr.padding.appended.h),
kernel_size_(attr.kernel.w, attr.kernel.h) {}
MaxUnpooling::MaxUnpooling(MaxUnpooling&& kernel)
: GPUOperation(std::move(kernel)),
stride_(kernel.stride_),
padding_(kernel.padding_),
kernel_size_(kernel.kernel_size_),
kernel_(std::move(kernel.kernel_)),
work_group_size_(kernel.work_group_size_) {}
MaxUnpooling& MaxUnpooling::operator=(MaxUnpooling&& kernel) {
if (this != &kernel) {
std::swap(stride_, kernel.stride_);
std::swap(padding_, kernel.padding_);
std::swap(kernel_size_, kernel.kernel_size_);
kernel_ = std::move(kernel.kernel_);
std::swap(work_group_size_, kernel.work_group_size_);
GPUOperation::operator=(std::move(kernel));
}
return *this;
}
Status MaxUnpooling::Compile(const CreationContext& creation_context) {
  const auto code = GetMaxUnpoolingKernelCode(
definition_.src_tensors[0], definition_.src_tensors[1],
definition_.dst_tensors[0], definition_.precision, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status MaxUnpooling::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[1]->GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(kernel_size_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(padding_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_));
return OkStatus();
}
int3 MaxUnpooling::GetGridSize() const {
const int grid_x = dst_[0]->Width();
const int grid_y = dst_[0]->Height();
const int grid_z = dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
}
Status MaxUnpooling::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
Status MaxUnpooling::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
MaxUnpooling CreateMaxUnpooling(const OperationDef& definition,
const MaxUnpooling2DAttributes& attr) {
return MaxUnpooling(definition, attr);
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,62 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_MAX_UNPOOLING_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_MAX_UNPOOLING_H_
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class MaxUnpooling : public GPUOperation {
public:
MaxUnpooling(const OperationDef& definition,
const MaxUnpooling2DAttributes& attr);
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
MaxUnpooling(MaxUnpooling&& kernel);
MaxUnpooling& operator=(MaxUnpooling&& kernel);
MaxUnpooling(const MaxUnpooling&) = delete;
MaxUnpooling& operator=(const MaxUnpooling&) = delete;
private:
Status BindArguments();
int3 GetGridSize() const;
int2 stride_;
int2 padding_;
int2 kernel_size_;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
MaxUnpooling CreateMaxUnpooling(const OperationDef& definition,
const MaxUnpooling2DAttributes& attr);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_MAX_UNPOOLING_H_

View File

@ -0,0 +1,73 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, MaxUnpooling) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 1);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
TensorFloat32 src_ind_tensor;
src_ind_tensor.shape = BHWC(1, 2, 2, 1);
src_ind_tensor.data = {0.1f, 1.1f, 2.1f, 3.1f};
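  // Indices are stored as floats with a +0.1 bias; convert_int4() in the
  // kernel truncates them back to the integer indices {0, 1, 2, 3}.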
MaxUnpooling2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.strides = HW(2, 2);
attr.kernel = HW(2, 2);
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
MaxUnpooling operation = CreateMaxUnpooling(op_def, attr);
ASSERT_OK(ExecuteGPUOperation({src_tensor, src_ind_tensor},
creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps),
{0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f, 0.0f, 2.0f, 0.0f, 0.0f, 3.0f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,166 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/multiply_add.h"
#include "absl/types/variant.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
namespace tflite {
namespace gpu {
namespace cl {
MultiplyAdd::MultiplyAdd(MultiplyAdd&& operation)
: ElementwiseOperation(std::move(operation)),
mul_vec_(std::move(operation.mul_vec_)),
add_vec_(std::move(operation.add_vec_)),
use_mul_vec_(operation.use_mul_vec_),
use_add_vec_(operation.use_add_vec_),
scalar_mul_(std::move(operation.scalar_mul_)),
scalar_add_(std::move(operation.scalar_add_)) {}
MultiplyAdd& MultiplyAdd::operator=(MultiplyAdd&& operation) {
if (this != &operation) {
mul_vec_ = std::move(operation.mul_vec_);
add_vec_ = std::move(operation.add_vec_);
use_mul_vec_ = operation.use_mul_vec_;
use_add_vec_ = operation.use_add_vec_;
scalar_mul_ = std::move(operation.scalar_mul_);
scalar_add_ = std::move(operation.scalar_add_);
ElementwiseOperation::operator=(std::move(operation));
}
return *this;
}
void MultiplyAdd::SetLinkIndex(int index) {
scalar_mul_.SetName(absl::StrCat("mad_scalar_mul_", index));
scalar_add_.SetName(absl::StrCat("mad_scalar_add_", index));
mul_vec_.SetName(absl::StrCat("mad_mul_", index));
add_vec_.SetName(absl::StrCat("mad_add_", index));
}
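// Schematically, with all four parts active, the emitted line is:
//   src = src * mul_vec[Z] * scalar_mul + add_vec[Z] + scalar_add;
// Inactive parts are simply omitted.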
std::string MultiplyAdd::GetCoreCode(const std::string& src,
const std::string& z_coord,
const std::string& address) const {
std::string result = absl::StrCat(src, " = ", src);
if (use_mul_vec_) {
result = absl::StrCat(result, " * ", mul_vec_.ReadLinearFLT4(z_coord));
}
if (scalar_mul_.Active()) {
absl::StrAppend(&result, " * ", scalar_mul_.GetName());
}
if (use_add_vec_) {
result = absl::StrCat(result, " + ", add_vec_.ReadLinearFLT4(z_coord));
}
if (scalar_add_.Active()) {
absl::StrAppend(&result, " + ", scalar_add_.GetName());
}
return absl::StrCat(result, ";\n");
}
std::string MultiplyAdd::GetArgsDeclaration() const {
std::string args;
if (use_mul_vec_) {
args = absl::StrCat(args, ",\n ", mul_vec_.GetDeclaration());
}
if (use_add_vec_) {
args = absl::StrCat(args, ",\n ", add_vec_.GetDeclaration());
}
if (scalar_mul_.Active()) {
absl::StrAppend(&args, ",\n ", scalar_mul_.GetDeclaration());
}
if (scalar_add_.Active()) {
absl::StrAppend(&args, ",\n ", scalar_add_.GetDeclaration());
}
return args;
}
Status MultiplyAdd::BindArguments(CLKernel* kernel) {
if (use_mul_vec_) {
RETURN_IF_ERROR(kernel->SetMemoryAuto(mul_vec_.GetMemoryPtr()));
}
if (use_add_vec_) {
RETURN_IF_ERROR(kernel->SetMemoryAuto(add_vec_.GetMemoryPtr()));
}
if (scalar_mul_.Active()) {
RETURN_IF_ERROR(kernel->SetBytesAuto(scalar_mul_));
}
if (scalar_add_.Active()) {
RETURN_IF_ERROR(kernel->SetBytesAuto(scalar_add_));
}
return OkStatus();
}
Status MultiplyAdd::UploadMul(const MultiplyScalarAttributes& attr,
CLContext* context) {
auto mul = absl::get_if<::tflite::gpu::Tensor<Linear, DataType::FLOAT32>>(
&attr.param);
auto mul_scalar = absl::get_if<float>(&attr.param);
if (mul) {
RETURN_IF_ERROR(UploadMul(*mul, context));
} else {
scalar_mul_ = FLT(definition_.precision, *mul_scalar);
}
return OkStatus();
}
Status MultiplyAdd::UploadAdd(const AddAttributes& attr, CLContext* context) {
auto add = absl::get_if<::tflite::gpu::Tensor<Linear, DataType::FLOAT32>>(
&attr.param);
auto add_scalar = absl::get_if<float>(&attr.param);
if (add) {
RETURN_IF_ERROR(UploadAdd(*add, context));
} else {
scalar_add_ = FLT(definition_.precision, *add_scalar);
}
return OkStatus();
}
Status CreateMultiplyAdd(const CreationContext& creation_context,
const OperationDef& definition,
const MultiplyScalarAttributes& attr,
MultiplyAdd* result) {
*result = MultiplyAdd(definition);
RETURN_IF_ERROR(result->UploadMul(attr, creation_context.context));
result->SetLinkIndex(0);
return OkStatus();
}
Status CreateMultiplyAdd(const CreationContext& creation_context,
const OperationDef& definition,
const AddAttributes& attr, MultiplyAdd* result) {
*result = MultiplyAdd(definition);
RETURN_IF_ERROR(result->UploadAdd(attr, creation_context.context));
result->SetLinkIndex(0);
return OkStatus();
}
Status CreateMultiplyAdd(const CreationContext& creation_context,
const OperationDef& definition,
const MultiplyScalarAttributes& mul_attr,
const AddAttributes& add_attr, MultiplyAdd* result) {
*result = MultiplyAdd(definition);
RETURN_IF_ERROR(result->UploadMul(mul_attr, creation_context.context));
RETURN_IF_ERROR(result->UploadAdd(add_attr, creation_context.context));
result->SetLinkIndex(0);
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,132 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_MULTIPLY_ADD_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_MULTIPLY_ADD_H_
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/flt_type.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class MultiplyAdd : public ElementwiseOperation {
public:
// Move only
MultiplyAdd() = default;
MultiplyAdd(MultiplyAdd&& operation);
MultiplyAdd& operator=(MultiplyAdd&& operation);
MultiplyAdd(const MultiplyAdd&) = delete;
MultiplyAdd& operator=(const MultiplyAdd&) = delete;
Status UploadMul(const MultiplyScalarAttributes& attr, CLContext* context);
Status UploadAdd(const AddAttributes& attr, CLContext* context);
template <DataType T>
Status UploadMul(const ::tflite::gpu::Tensor<Linear, T>& mul,
CLContext* context);
template <DataType T>
Status UploadAdd(const ::tflite::gpu::Tensor<Linear, T>& add,
CLContext* context);
void SetLinkIndex(int index) override;
std::string GetCoreCode(const std::string& src, const std::string& z_coord,
const std::string& address) const override;
std::string GetArgsDeclaration() const override;
Status BindArguments(CLKernel* kernel) override;
friend Status CreateMultiplyAdd(const CreationContext& creation_context,
const OperationDef& definition,
const MultiplyScalarAttributes& attr,
MultiplyAdd* result);
friend Status CreateMultiplyAdd(const CreationContext& creation_context,
const OperationDef& definition,
const AddAttributes& attr,
MultiplyAdd* result);
friend Status CreateMultiplyAdd(const CreationContext& creation_context,
const OperationDef& definition,
const MultiplyScalarAttributes& mul_attr,
const AddAttributes& add_attr,
MultiplyAdd* result);
private:
explicit MultiplyAdd(const OperationDef& definition)
: ElementwiseOperation(definition),
use_mul_vec_(false),
use_add_vec_(false) {}
LinearStorage mul_vec_;
LinearStorage add_vec_;
bool use_mul_vec_;
bool use_add_vec_;
FLT scalar_mul_;
FLT scalar_add_;
};
Status CreateMultiplyAdd(const CreationContext& creation_context,
const OperationDef& definition,
const MultiplyScalarAttributes& attr,
MultiplyAdd* result);
Status CreateMultiplyAdd(const CreationContext& creation_context,
const OperationDef& definition,
const AddAttributes& attr, MultiplyAdd* result);
Status CreateMultiplyAdd(const CreationContext& creation_context,
const OperationDef& definition,
const MultiplyScalarAttributes& mul_attr,
const AddAttributes& add_attr, MultiplyAdd* result);
template <DataType T>
Status MultiplyAdd::UploadMul(const ::tflite::gpu::Tensor<Linear, T>& mul,
CLContext* context) {
LinearStorageCreateInfo create_info;
create_info.storage_type =
DeduceLinearStorageType(definition_.GetPrimaryStorageType());
create_info.data_type = definition_.GetDataType();
RETURN_IF_ERROR(CreateLinearStorage(create_info, mul, context, &mul_vec_));
use_mul_vec_ = true;
return OkStatus();
}
template <DataType T>
Status MultiplyAdd::UploadAdd(const ::tflite::gpu::Tensor<Linear, T>& add,
CLContext* context) {
LinearStorageCreateInfo create_info;
create_info.storage_type =
DeduceLinearStorageType(definition_.GetPrimaryStorageType());
create_info.data_type = definition_.GetDataType();
RETURN_IF_ERROR(CreateLinearStorage(create_info, add, context, &add_vec_));
use_add_vec_ = true;
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_MULTIPLY_ADD_H_

View File

@ -0,0 +1,187 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/multiply_add.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, MultiplyAddVectorMul) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
MultiplyScalarAttributes attr;
::tflite::gpu::Tensor<Linear, DataType::FLOAT32> parameters;
parameters.shape = Linear(2);
parameters.data = {0.5f, 2.0f};
attr.param = parameters;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
MultiplyAdd operation;
ASSERT_OK(CreateMultiplyAdd(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {0.0f, 2.0f, 1.0f, 6.0f}));
}
}
}
TEST_F(OpenCLOperationTest, MultiplyAddVectorAdd) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
AddAttributes attr;
::tflite::gpu::Tensor<Linear, DataType::FLOAT32> parameters;
parameters.shape = Linear(2);
parameters.data = {0.5f, 2.0f};
attr.param = parameters;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
MultiplyAdd operation;
ASSERT_OK(CreateMultiplyAdd(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {0.5f, 3.0f, 2.5f, 5.0f}));
}
}
}
TEST_F(OpenCLOperationTest, MultiplyAddScalarMul) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
MultiplyScalarAttributes attr;
attr.param = 0.5f;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
MultiplyAdd operation;
ASSERT_OK(CreateMultiplyAdd(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {0.0f, 0.5f, 1.0f, 1.5f}));
}
}
}
TEST_F(OpenCLOperationTest, MultiplyAddScalarAdd) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
AddAttributes attr;
attr.param = -0.5f;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
MultiplyAdd operation;
ASSERT_OK(CreateMultiplyAdd(creation_context_, op_def, attr, &operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {-0.5f, 0.5f, 1.5f, 2.5f}));
}
}
}
TEST_F(OpenCLOperationTest, MultiplyAddVectorMad) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
MultiplyScalarAttributes mul_attr;
::tflite::gpu::Tensor<Linear, DataType::FLOAT32> parameters;
parameters.shape = Linear(2);
parameters.data = {0.5f, 2.0f};
mul_attr.param = parameters;
AddAttributes add_attr;
parameters.data = {-0.5f, 0.5f};
add_attr.param = parameters;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
MultiplyAdd operation;
ASSERT_OK(CreateMultiplyAdd(creation_context_, op_def, mul_attr, add_attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {-0.5f, 2.5f, 0.5f, 6.5f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,152 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/padding.h"
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
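// Generates a kernel where each work item produces one FLT4 group of four
// output channels: positions that map back inside the source (after removing
// the prepended padding) gather the corresponding source channel, and
// everything else keeps the zero-initialized value.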
std::string GetPaddingCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const std::vector<ElementwiseOperation*>& linked_operations) {
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
std::string code = GetCommonDefines(precision);
const std::string channels[] = {".x", ".y", ".z", ".w"};
code += "__kernel void main_function(\n";
code += src_tensor.GetDeclaration(AccessType::READ);
code += GetArgsDeclaration(linked_operations);
code += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
code += " int4 src_size, \n";
code += " int4 dst_size, \n";
code += " int4 prepended \n";
code += ") {\n";
code += " int X = get_global_id(0);\n";
code += " int Y = get_global_id(1);\n";
code += " int Z = get_global_id(2);\n";
code += " if (X >= dst_size.x || Y >= dst_size.y) return; \n";
code += " FLT4 result = (FLT4)(0.0);\n";
code += " int s_x = X - prepended.x;\n";
code += " int s_y = Y - prepended.y;\n";
code += " bool inside_x = s_x >= 0 && s_x < src_size.x;\n";
code += " bool inside_y = s_y >= 0 && s_y < src_size.y;\n";
code += " if (inside_x && inside_y) {\n";
code += " int start_channel = Z * 4;\n";
for (int i = 0; i < 4; ++i) {
const auto& s = channels[i];
code += " {\n";
code += " int channel = start_channel + " + std::to_string(i) + ";\n";
code += " int s_z = channel - prepended.z;\n";
code += " if (s_z >= 0 && s_z < src_size.z) {\n";
code +=
" FLT4 t = " + src_tensor.Read3D("s_x", "s_y", "s_z / 4") + ";\n";
code += " FLT t_ar[4] = {t.x, t.y, t.z, t.w};\n";
code += " result" + s + " = t_ar[s_z % 4];\n";
code += " }\n";
code += " }\n";
}
code += " }\n";
code += " " + dst_tensor.GetAddress("address", "X", "Y", "Z") + "\n";
code += PostProcess(linked_operations, "result", "Z", "address");
code += " " + dst_tensor.Write3D("result", "address");
code += "}\n";
return code;
}
} // namespace
Padding::Padding(const OperationDef& definition, const PadAttributes& attr)
: GPUOperation(definition) {
SetPrepended(int3(attr.prepended.w, attr.prepended.h, attr.prepended.c));
}
Padding::Padding(Padding&& kernel)
: GPUOperation(std::move(kernel)),
prepended_(kernel.prepended_),
kernel_(std::move(kernel.kernel_)),
work_group_size_(kernel.work_group_size_) {}
Padding& Padding::operator=(Padding&& kernel) {
if (this != &kernel) {
std::swap(prepended_, kernel.prepended_);
kernel_ = std::move(kernel.kernel_);
std::swap(work_group_size_, kernel.work_group_size_);
GPUOperation::operator=(std::move(kernel));
}
return *this;
}
void Padding::SetPrepended(const int3& prepended) {
prepended_.x = prepended.x;
prepended_.y = prepended.y;
prepended_.z = prepended.z;
prepended_.w = 0;
}
Status Padding::Compile(const CreationContext& creation_context) {
const auto code =
GetPaddingCode(definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status Padding::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(prepended_));
return OkStatus();
}
int3 Padding::GetGridSize() const {
const int grid_x = dst_[0]->Width();
const int grid_y = dst_[0]->Height();
const int grid_z = dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
}
Status Padding::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
Status Padding::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
Padding CreatePadding(const OperationDef& definition,
const PadAttributes& attr) {
return Padding(definition, attr);
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,59 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_PADDING_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_PADDING_H_
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class Padding : public GPUOperation {
public:
Padding(const OperationDef& definition, const PadAttributes& attr);
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
Padding(Padding&& kernel);
Padding& operator=(Padding&& kernel);
Padding(const Padding&) = delete;
Padding& operator=(const Padding&) = delete;
private:
Status BindArguments();
int3 GetGridSize() const;
void SetPrepended(const int3& prepended);
int4 prepended_;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
Padding CreatePadding(const OperationDef& definition,
const PadAttributes& attr);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_PADDING_H_

View File

@ -0,0 +1,236 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/padding.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, PaddingAppendWidth) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
PadAttributes attr;
attr.prepended = HWC(0, 0, 0);
attr.appended = HWC(0, 1, 0);
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Padding operation = CreatePadding(op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps),
{0.0f, 1.0f, 0.0f, 0.0f, 2.0f, 3.0f, 0.0f, 0.0f}));
}
}
}
TEST_F(OpenCLOperationTest, PaddingPrependWidth) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
PadAttributes attr;
attr.prepended = HWC(0, 1, 0);
attr.appended = HWC(0, 0, 0);
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Padding operation = CreatePadding(op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps),
{0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 2.0f, 3.0f}));
}
}
}
TEST_F(OpenCLOperationTest, PaddingAppendHeight) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
PadAttributes attr;
attr.prepended = HWC(0, 0, 0);
attr.appended = HWC(1, 0, 0);
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Padding operation = CreatePadding(op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 3, 1, 2), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(eps), {0.0f, 1.0f, 2.0f, 3.0f, 0.0f, 0.0f}));
}
}
}
TEST_F(OpenCLOperationTest, PaddingPrependHeight) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
PadAttributes attr;
attr.prepended = HWC(1, 0, 0);
attr.appended = HWC(0, 0, 0);
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Padding operation = CreatePadding(op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 3, 1, 2), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(eps), {0.0f, 0.0f, 0.0f, 1.0f, 2.0f, 3.0f}));
}
}
}
TEST_F(OpenCLOperationTest, PaddingAppendChannels) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
PadAttributes attr;
attr.prepended = HWC(0, 0, 0);
attr.appended = HWC(0, 0, 1);
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Padding operation = CreatePadding(op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 3), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(eps), {0.0f, 1.0f, 0.0f, 2.0f, 3.0f, 0.0f}));
}
}
}
TEST_F(OpenCLOperationTest, PaddingPrependChannels) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
PadAttributes attr;
attr.prepended = HWC(0, 0, 1);
attr.appended = HWC(0, 0, 0);
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Padding operation = CreatePadding(op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 1, 3), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(eps), {0.0f, 0.0f, 1.0f, 0.0f, 2.0f, 3.0f}));
}
}
}
TEST_F(OpenCLOperationTest, PaddingComplex) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 1, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
PadAttributes attr;
attr.prepended = HWC(0, 1, 1);
attr.appended = HWC(1, 1, 0);
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Padding operation = CreatePadding(op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 3, 3, 3), &dst_tensor));
EXPECT_THAT(
dst_tensor.data,
Pointwise(FloatNear(eps),
{0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f, 0.0f, 2.0f, 3.0f, 0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,255 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/pooling.h"
#include <string>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
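// Average pooling accumulates only in-bounds taps and divides by the number
// of taps that actually fell inside the source, so padded positions do not
// dilute the average (see the AveragePoolingNonEmptyPadding test).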
std::string GetAveragePoolingKernelCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const std::vector<ElementwiseOperation*>& linked_operations) {
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
std::string code = GetCommonDefines(precision);
code += "__kernel void main_function(\n";
code += src_tensor.GetDeclaration(AccessType::READ);
code += GetArgsDeclaration(linked_operations);
code += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
code += " int4 src_size, \n";
code += " int4 dst_size, \n";
code += " int2 kernel_size, \n";
code += " int2 padding, \n";
code += " int2 stride \n";
code += ") {\n";
code += " int X = get_global_id(0);\n";
code += " int Y = get_global_id(1);\n";
code += " int Z = get_global_id(2);\n";
code += " if (X >= dst_size.x || Y >= dst_size.y) return; \n";
code += " float4 r = (float4)(0.0f);\n";
code += " float window_size = 0.0;\n";
code += " for (int ky = 0; ky < kernel_size.y; ++ky) {\n";
code += " int y_c = Y * stride.y - padding.y + ky;\n";
code += " bool outside_y = y_c < 0 || y_c >= src_size.y;\n";
code += " for (int kx = 0; kx < kernel_size.x; ++kx) {\n";
code += " int x_c = X * stride.x - padding.x + kx;\n";
code += " bool outside = outside_y || x_c < 0 || x_c >= src_size.x;\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
code += " r += !outside ? " +
src_tensor.ReadAsFloat3D("x_c", "y_c", "Z") +
" : (float4)(0.0f);\n";
} else {
code += " r += " + src_tensor.ReadAsFloat3D("x_c", "y_c", "Z") + ";\n";
}
code += " window_size += !outside ? 1.0 : 0.0;\n";
code += " }\n";
code += " }\n";
  // If window_size == 0, the window covered nothing. This is a sign of an
  // incorrectly constructed operation; NaNs are expected as output.
code += " FLT4 result = TO_FLT4(r / window_size);\n";
code += " " + dst_tensor.GetAddress("address", "X", "Y", "Z") + "\n";
code += PostProcess(linked_operations, "result", "Z", "address");
code += " " + dst_tensor.Write3D("result", "address");
code += "}\n";
return code;
}
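// Max pooling tracks the per-channel maximum over the window; when
// output_indices is set it also records, per channel, the linear index of
// the winning tap inside the window into a second output tensor.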
std::string GetMaxPoolingKernelCode(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const std::vector<ElementwiseOperation*>& linked_operations,
bool output_indices) {
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);
TensorCodeGenerator indices_tensor("dst_indices", "dst_size", dst_descriptor);
std::string code = GetCommonDefines(precision);
code += "__kernel void main_function(\n";
code += src_tensor.GetDeclaration(AccessType::READ);
code += GetArgsDeclaration(linked_operations);
code += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
if (output_indices) {
code += indices_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
}
code += " int4 src_size, \n";
code += " int4 dst_size, \n";
code += " int2 kernel_size, \n";
code += " int2 padding, \n";
code += " int2 stride \n";
code += ") {\n";
code += " int X = get_global_id(0);\n";
code += " int Y = get_global_id(1);\n";
code += " int Z = get_global_id(2);\n";
code += " if (X >= dst_size.x || Y >= dst_size.y) return; \n";
code += " FLT4 maximum = (FLT4)(-10000.0f);\n";
if (output_indices) {
code += " int4 indexes = (int4)(0);\n";
code += " int index_counter = 0;\n";
}
code += " for (int ky = 0; ky < kernel_size.y; ++ky) {\n";
code += " int y_c = Y * stride.y - padding.y + ky;\n";
code += " bool outside_y = y_c < 0 || y_c >= src_size.y;\n";
code += " for (int kx = 0; kx < kernel_size.x; ++kx) {\n";
code += " int x_c = X * stride.x - padding.x + kx;\n";
code += " bool outside_x = x_c < 0 || x_c >= src_size.x;\n";
code += " if (!outside_x && !outside_y) {\n";
code += " FLT4 src = " + src_tensor.Read3D("x_c", "y_c", "Z") + ";\n";
if (output_indices) {
code += " if (src.x > maximum.x) {\n";
code += " indexes.x = index_counter;\n";
code += " maximum.x = src.x;\n";
code += " }\n";
code += " if (src.y > maximum.y) {\n";
code += " indexes.y = index_counter;\n";
code += " maximum.y = src.y;\n";
code += " }\n";
code += " if (src.z > maximum.z) {\n";
code += " indexes.z = index_counter;\n";
code += " maximum.z = src.z;\n";
code += " }\n";
code += " if (src.w > maximum.w) {\n";
code += " indexes.w = index_counter;\n";
code += " maximum.w = src.w;\n";
code += " }\n";
code += " index_counter++;\n";
}
code += " maximum = max(src, maximum);\n";
code += " };\n";
code += " }\n";
code += " }\n";
code += " " + dst_tensor.GetAddress("address", "X", "Y", "Z") + "\n";
code += PostProcess(linked_operations, "maximum", "Z", "address");
code += " " + dst_tensor.Write3D("maximum", "address");
if (output_indices) {
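    // The indices are written as floats with a +0.1 bias; a consumer such as
    // MaxUnpooling (above) recovers the integer window index via
    // convert_int4() truncation.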
code += " FLT4 result_value = TO_FLT4(indexes) + (FLT4)(0.1);\n";
code += " " + indices_tensor.Write3D("result_value", "address");
}
code += "}\n";
return code;
}
} // namespace
Pooling::Pooling(const OperationDef& definition,
const Pooling2DAttributes& attr)
: GPUOperation(definition),
stride_(attr.strides.w, attr.strides.h),
padding_(attr.padding.prepended.w, attr.padding.prepended.h),
kernel_size_(attr.kernel.w, attr.kernel.h),
type_(attr.type),
output_indices_(attr.output_indices) {}
Pooling::Pooling(Pooling&& kernel)
: GPUOperation(std::move(kernel)),
stride_(kernel.stride_),
padding_(kernel.padding_),
kernel_size_(kernel.kernel_size_),
type_(kernel.type_),
output_indices_(kernel.output_indices_),
kernel_(std::move(kernel.kernel_)),
work_group_size_(kernel.work_group_size_) {}
Pooling& Pooling::operator=(Pooling&& kernel) {
if (this != &kernel) {
std::swap(stride_, kernel.stride_);
std::swap(padding_, kernel.padding_);
std::swap(kernel_size_, kernel.kernel_size_);
std::swap(type_, kernel.type_);
std::swap(output_indices_, kernel.output_indices_);
kernel_ = std::move(kernel.kernel_);
std::swap(work_group_size_, kernel.work_group_size_);
GPUOperation::operator=(std::move(kernel));
}
return *this;
}
Status Pooling::Compile(const CreationContext& creation_context) {
std::string code;
switch (type_) {
case PoolingType::AVERAGE:
code = GetAveragePoolingKernelCode(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, linked_operations_);
break;
case PoolingType::MAX:
code = GetMaxPoolingKernelCode(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, linked_operations_, output_indices_);
break;
default:
return InvalidArgumentError(
"You should create another kernel with this params");
break;
}
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
Status Pooling::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtr()));
if (output_indices_) {
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[1]->GetMemoryPtr()));
}
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetSizeWithDepth()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(kernel_size_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(padding_));
RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_));
return OkStatus();
}
int3 Pooling::GetGridSize() const {
const int grid_x = dst_[0]->Width();
const int grid_y = dst_[0]->Height();
const int grid_z = dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
}
Status Pooling::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
Status Pooling::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
Pooling CreatePooling(const OperationDef& definition,
const Pooling2DAttributes& attr) {
return Pooling(definition, attr);
}
} // namespace cl
} // namespace gpu
} // namespace tflite

View File

@ -0,0 +1,66 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_POOLING_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_POOLING_H_
#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class Pooling : public GPUOperation {
public:
Pooling(const OperationDef& definition, const Pooling2DAttributes& attr);
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
Pooling(Pooling&& kernel);
Pooling& operator=(Pooling&& kernel);
Pooling(const Pooling&) = delete;
Pooling& operator=(const Pooling&) = delete;
private:
Status BindArguments();
int3 GetGridSize() const;
int2 stride_;
int2 padding_;
int2 kernel_size_;
PoolingType type_;
bool output_indices_;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
Pooling CreatePooling(const OperationDef& definition,
const Pooling2DAttributes& attr);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_POOLING_H_

View File

@ -0,0 +1,162 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/pooling.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, AveragePooling) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Pooling2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.strides = HW(2, 2);
attr.kernel = HW(2, 2);
attr.type = PoolingType::AVERAGE;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Pooling operation = CreatePooling(op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 1, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {3.0f, 4.0f}));
}
}
}
TEST_F(OpenCLOperationTest, AveragePoolingNonEmptyPadding) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 1);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
Pooling2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(1, 1);
attr.strides = HW(1, 1);
attr.kernel = HW(2, 2);
attr.type = PoolingType::AVERAGE;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Pooling operation = CreatePooling(op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {1.5f, 2.0f, 2.5f, 3.0f}));
}
}
}
TEST_F(OpenCLOperationTest, MaxPooling) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {8.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Pooling2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.strides = HW(2, 2);
attr.kernel = HW(2, 2);
attr.type = PoolingType::MAX;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
Pooling operation = CreatePooling(op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 1, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {8.0f, 7.0f}));
}
}
}
TEST_F(OpenCLOperationTest, MaxPoolingIndices) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {8.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Pooling2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.strides = HW(2, 2);
attr.kernel = HW(2, 2);
attr.type = PoolingType::MAX;
attr.output_indices = true;
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
op_def.dst_tensors.push_back({data_type, storage});
TensorFloat32 dst_tensor;
TensorFloat32 dst_tensor_ind;
Pooling operation = CreatePooling(op_def, attr);
ASSERT_OK(ExecuteGPUOperation({src_tensor}, creation_context_, &operation,
{BHWC(1, 1, 1, 2), BHWC(1, 1, 1, 2)},
{&dst_tensor, &dst_tensor_ind}));
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {8.0f, 7.0f}));
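      // Strip the +0.1 bias the kernel adds to the stored indices before
      // comparing against the expected integer positions.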
for (auto& v : dst_tensor_ind.data) {
v = static_cast<int>(v);
}
EXPECT_THAT(dst_tensor_ind.data, Pointwise(FloatNear(eps), {0.0f, 3.0f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

Some files were not shown because too many files have changed in this diff.