TFLite GPU: Replace tflite::gpu::Status with absl::Status.
PiperOrigin-RevId: 302551400
Change-Id: Ib36038b364fda986c12543576471c07eef87db14

parent b5c7256118, commit f18fa5b6b0
@@ -12,6 +12,12 @@ exports_files([
    "metal_delegate.h",
])

# Primary purpose of this config is to replace ::util::Status with our custom
# light implementation ::tflite::gpu::StatusLite to reduce binary size. Besides
# that, certain features that were hard to communicate without full open source
# were hidden away too such as compiled models, serialization, and metadata.
# While the latter will be fully available with the open source release, the
# former will have to stay until absl::Status is released.
config_setting(
    name = "tflite_gpu_binary_release",
    values = {"copt": "-DTFLITE_GPU_BINARY_RELEASE"},
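Because the gate above is a copt, it is also visible to the preprocessor. A hedged sketch of how code can key off the define; the function is illustrative and not part of this commit:

#include "absl/status/status.h"

// Illustrative only: shows the TFLITE_GPU_BINARY_RELEASE define in use.
absl::Status MaybeLoadSerializedModel() {
#ifdef TFLITE_GPU_BINARY_RELEASE
  return absl::OkStatus();  // binary releases expose the serialization path
#else
  return absl::UnimplementedError(
      "Serialization is available only in binary releases.");
#endif
}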
@@ -220,8 +220,7 @@ class InferenceBuilder {

  // Sets new shape for the input if underlying implementation and graph
  // structure allows dynamic tensors.
  virtual absl::Status SetInputShape(int index,
                                     const Dimensions& dimensions) = 0;
  virtual Status SetInputShape(int index, const Dimensions& dimensions) = 0;

  // Updates object definitions for the given index. Implementation may allow
  // to use different layouts and/or data type conversions between objects
@@ -230,21 +229,21 @@ class InferenceBuilder {
  // A user, however, has an input in DataType::FLOAT16, DataLayout::PHWC4.
  // An implementation may allow this transformation to happen automatically
  // under the hood.
  virtual absl::Status SetInputObjectDef(int index, ObjectDef def) = 0;
  virtual absl::Status SetOutputObjectDef(int index, ObjectDef def) = 0;
  virtual absl::Status SetAllInputObjectDefsTo(ObjectDef def) {
  virtual Status SetInputObjectDef(int index, ObjectDef def) = 0;
  virtual Status SetOutputObjectDef(int index, ObjectDef def) = 0;
  virtual Status SetAllInputObjectDefsTo(ObjectDef def) {
    auto input_defs = inputs();
    for (int i = 0; i < input_defs.size(); ++i) {
      RETURN_IF_ERROR(SetInputObjectDef(i, def));
    }
    return absl::OkStatus();
    return OkStatus();
  }
  virtual absl::Status SetAllOutputObjectDefsTo(ObjectDef def) {
  virtual Status SetAllOutputObjectDefsTo(ObjectDef def) {
    auto output_defs = outputs();
    for (int i = 0; i < output_defs.size(); ++i) {
      RETURN_IF_ERROR(SetOutputObjectDef(i, def));
    }
    return absl::OkStatus();
    return OkStatus();
  }

  // Creates new instance of the inference runner. InferenceBuilder stays valid
@@ -252,7 +251,7 @@ class InferenceBuilder {
  //
  // This method may take significant time to prepare new inference runner. For
  // example, it may require to compile OpenGL shaders.
  virtual absl::Status Build(std::unique_ptr<InferenceRunner>* runner) = 0;
  virtual Status Build(std::unique_ptr<InferenceRunner>* runner) = 0;
};

// Runs prepared inference. Every object marked as external needs to be set
@@ -269,12 +268,12 @@ class InferenceRunner {
  // Setters allow to set or change external object for the given index. Note,
  // object need to match object definition set before in InferenceBuilder.

  virtual absl::Status GetInputObject(int index, TensorObject* object) = 0;
  virtual absl::Status GetOutputObject(int index, TensorObject* object) = 0;
  virtual absl::Status SetInputObject(int index, TensorObject object) = 0;
  virtual absl::Status SetOutputObject(int index, TensorObject object) = 0;
  virtual Status GetInputObject(int index, TensorObject* object) = 0;
  virtual Status GetOutputObject(int index, TensorObject* object) = 0;
  virtual Status SetInputObject(int index, TensorObject object) = 0;
  virtual Status SetOutputObject(int index, TensorObject object) = 0;

  virtual absl::Status Run() = 0;
  virtual Status Run() = 0;
};

// Encapsulated compilation/runtime tradeoffs.
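Taken together, the builder and runner interfaces imply a straightforward call sequence. A hedged usage sketch assembled only from the signatures above; `builder`, `input_object`, and the ObjectDef contents are assumed to exist elsewhere:

// Hypothetical glue code built from the InferenceBuilder/InferenceRunner API.
std::unique_ptr<InferenceRunner> runner;
ObjectDef cpu_def;  // e.g. a user-provided CPU_MEMORY definition
RETURN_IF_ERROR(builder->SetAllInputObjectDefsTo(cpu_def));
RETURN_IF_ERROR(builder->SetAllOutputObjectDefsTo(cpu_def));
RETURN_IF_ERROR(builder->Build(&runner));
RETURN_IF_ERROR(runner->SetInputObject(0, input_object));
RETURN_IF_ERROR(runner->Run());
TensorObject output;
RETURN_IF_ERROR(runner->GetOutputObject(0, &output));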
@@ -54,22 +54,22 @@ class NoopTensorTie : public TensorTie {
    return def.external_def == def.internal_def;
  }

  absl::Status SetExternalObject(TensorObject obj) final {
  Status SetExternalObject(TensorObject obj) final {
    if (!def().external_def.object_def.user_provided) {
      return absl::InvalidArgumentError("Tensor object is readonly.");
      return InvalidArgumentError("Tensor object is readonly.");
    }
    if (!IsValid(def().external_def, obj)) {
      return absl::InvalidArgumentError("Given object is not valid");
      return InvalidArgumentError("Given object is not valid");
    }
    obj_ = obj;
    return absl::OkStatus();
    return OkStatus();
  }

  TensorObject GetExternalObject() final { return obj_; }

  absl::Status CopyToExternalObject() final { return absl::OkStatus(); }
  Status CopyToExternalObject() final { return OkStatus(); }

  absl::Status CopyFromExternalObject() final { return absl::OkStatus(); }
  Status CopyFromExternalObject() final { return OkStatus(); }

 private:
  TensorObject obj_;
@@ -93,45 +93,45 @@ class DefaultTensorTie : public TensorTie {
           converter_builder.IsSupported(def.external_def, def.internal_def);
  }

  static absl::Status New(const TensorTieDef& def, TensorObject internal_object,
                          TensorObjectConverterBuilder* converter_builder,
                          Environment* env, std::unique_ptr<TensorTie>* tie) {
  static Status New(const TensorTieDef& def, TensorObject internal_object,
                    TensorObjectConverterBuilder* converter_builder,
                    Environment* env, std::unique_ptr<TensorTie>* tie) {
    auto tie_impl = absl::make_unique<DefaultTensorTie>(def, internal_object);
    RETURN_IF_ERROR(tie_impl->Init(converter_builder, env));
    *tie = std::move(tie_impl);
    return absl::OkStatus();
    return OkStatus();
  }

  absl::Status CopyToExternalObject() final {
  Status CopyToExternalObject() final {
    if (!converter_to_) {
      return absl::UnavailableError("Conversion is not available");
      return UnavailableError("Conversion is not available");
    }
    return converter_to_->Convert(internal_obj_, GetExternalObject());
  }

  absl::Status CopyFromExternalObject() final {
  Status CopyFromExternalObject() final {
    if (!converter_from_) {
      return absl::UnavailableError("Conversion is not available");
      return UnavailableError("Conversion is not available");
    }
    return converter_from_->Convert(GetExternalObject(), internal_obj_);
  }

  absl::Status SetExternalObject(TensorObject obj) final {
  Status SetExternalObject(TensorObject obj) final {
    if (!def().external_def.object_def.user_provided) {
      return absl::InvalidArgumentError("External object is read-only");
      return InvalidArgumentError("External object is read-only");
    }
    if (!IsValid(def().external_def, obj)) {
      return absl::InvalidArgumentError("Given object is not valid");
      return InvalidArgumentError("Given object is not valid");
    }
    external_obj_ = obj;
    return absl::OkStatus();
    return OkStatus();
  }

  TensorObject GetExternalObject() final { return external_obj_; }

 private:
  absl::Status Init(TensorObjectConverterBuilder* converter_builder,
                    Environment* env) {
  Status Init(TensorObjectConverterBuilder* converter_builder,
              Environment* env) {
    RETURN_IF_ERROR(converter_builder->MakeConverter(
        def().internal_def, def().external_def, &converter_to_));
    RETURN_IF_ERROR(converter_builder->MakeConverter(
@@ -139,10 +139,10 @@ class DefaultTensorTie : public TensorTie {
    return MaybeAllocateExternalObject(env);
  }

  absl::Status MaybeAllocateExternalObject(Environment* env) {
  Status MaybeAllocateExternalObject(Environment* env) {
    const TensorObjectDef& d = def().external_def;
    if (d.object_def.user_provided) {
      return absl::OkStatus();
      return OkStatus();
    }
    switch (d.object_def.object_type) {
      case ObjectType::CPU_MEMORY: {
@@ -170,9 +170,9 @@ class DefaultTensorTie : public TensorTie {
        break;
      }
      default:
        return absl::InternalError("Unexpected object type");
        return InternalError("Unexpected object type");
    }
    return absl::OkStatus();
    return OkStatus();
  }

  const TensorObject internal_obj_;
@@ -198,26 +198,26 @@ class TwoStepTensorTie : public TensorTie {
           DefaultTensorTie::IsSupported(defs.second, converter_builder);
  }

  static absl::Status New(const TensorTieDef& def, TensorObject internal_object,
                          TensorObjectConverterBuilder* converter_builder,
                          Environment* env, std::unique_ptr<TensorTie>* tie) {
  static Status New(const TensorTieDef& def, TensorObject internal_object,
                    TensorObjectConverterBuilder* converter_builder,
                    Environment* env, std::unique_ptr<TensorTie>* tie) {
    auto tie_impl = absl::make_unique<TwoStepTensorTie>(def);
    RETURN_IF_ERROR(tie_impl->Init(internal_object, converter_builder, env));
    *tie = std::move(tie_impl);
    return absl::OkStatus();
    return OkStatus();
  }

  absl::Status CopyToExternalObject() final {
  Status CopyToExternalObject() final {
    RETURN_IF_ERROR(inner_tie_->CopyToExternalObject());
    return outer_tie_->CopyToExternalObject();
  }

  absl::Status CopyFromExternalObject() final {
  Status CopyFromExternalObject() final {
    RETURN_IF_ERROR(outer_tie_->CopyFromExternalObject());
    return inner_tie_->CopyFromExternalObject();
  }

  absl::Status SetExternalObject(TensorObject obj) final {
  Status SetExternalObject(TensorObject obj) final {
    return outer_tie_->SetExternalObject(obj);
  }
@@ -241,9 +241,9 @@ class TwoStepTensorTie : public TensorTie {
    return std::make_pair(outer_def, inner_def);
  }

  absl::Status Init(TensorObject internal_object,
                    TensorObjectConverterBuilder* converter_builder,
                    Environment* env) {
  Status Init(TensorObject internal_object,
              TensorObjectConverterBuilder* converter_builder,
              Environment* env) {
    auto defs = MakeOuterInnerDefs(def());
    RETURN_IF_ERROR(DefaultTensorTie::New(defs.second, internal_object,
                                          converter_builder, env, &inner_tie_));
@@ -274,27 +274,27 @@ class GlBufferHolder : public TensorTie {
    return DefaultTensorTie::IsSupported(MakeClDef(def), converter_builder);
  }

  static absl::Status New(const TensorTieDef& def, TensorObject internal_object,
                          TensorObjectConverterBuilder* converter_builder,
                          GlInteropFabric* gl_interop_fabric, Environment* env,
                          std::unique_ptr<TensorTie>* tie) {
  static Status New(const TensorTieDef& def, TensorObject internal_object,
                    TensorObjectConverterBuilder* converter_builder,
                    GlInteropFabric* gl_interop_fabric, Environment* env,
                    std::unique_ptr<TensorTie>* tie) {
    auto tie_impl =
        absl::make_unique<GlBufferHolder>(def, gl_interop_fabric, env);
    RETURN_IF_ERROR(DefaultTensorTie::New(MakeClDef(def), internal_object,
                                          converter_builder, env,
                                          &tie_impl->tie_));
    *tie = std::move(tie_impl);
    return absl::OkStatus();
    return OkStatus();
  }

  absl::Status SetExternalObject(TensorObject obj) final {
  Status SetExternalObject(TensorObject obj) final {
    auto ssbo = absl::get_if<OpenGlBuffer>(&obj);
    if (!ssbo) {
      return absl::InvalidArgumentError("Missing OpenGL SSBO");
      return InvalidArgumentError("Missing OpenGL SSBO");
    }
    auto old_ssbo = absl::get_if<OpenGlBuffer>(&external_obj_);
    if (old_ssbo && ssbo->id == old_ssbo->id) {
      return absl::OkStatus();
      return OkStatus();
    }
    if (cl_object_.memory()) {
      gl_interop_fabric_->UnregisterMemory(cl_object_.memory());
@@ -304,18 +304,16 @@ class GlBufferHolder : public TensorTie {
    external_obj_ = obj;
    RETURN_IF_ERROR(tie_->SetExternalObject(OpenClBuffer{cl_object_.memory()}));
    gl_interop_fabric_->RegisterMemory(cl_object_.memory());
    return absl::OkStatus();
    return OkStatus();
  }

  TensorObject GetExternalObject() final { return external_obj_; }

  absl::Status CopyFromExternalObject() final {
  Status CopyFromExternalObject() final {
    return tie_->CopyFromExternalObject();
  }

  absl::Status CopyToExternalObject() final {
    return tie_->CopyToExternalObject();
  }
  Status CopyToExternalObject() final { return tie_->CopyToExternalObject(); }

 private:
  static TensorTieDef MakeClDef(const TensorTieDef& def) {
@@ -360,20 +358,20 @@ class TensorTieFactory {
            TwoStepTensorTie::IsSupported(def, *converter_builder_));
  }

  absl::Status NewTensorTie(const TensorTieDef& def,
                            std::unique_ptr<TensorTie>* tie) {
  Status NewTensorTie(const TensorTieDef& def,
                      std::unique_ptr<TensorTie>* tie) {
    TensorObject internal_object = TensorToObj(*context_.GetTensor(def.id));
    auto converter = converter_builder_.get();
    if (NoopTensorTie::IsSupported(def)) {
      *tie = absl::make_unique<NoopTensorTie>(def, internal_object);
      return absl::OkStatus();
      return OkStatus();
    }
    if (DefaultTensorTie::IsSupported(def, *converter)) {
      return DefaultTensorTie::New(def, internal_object, converter, &env_, tie);
    }
    if (GlBufferHolder::IsSupported(def, *converter)) {
      if (!gl_interop_fabric_) {
        return absl::InvalidArgumentError(
        return InvalidArgumentError(
            "GL object is used but InferenceEnvironmentOptions does not have "
            "EGL display and context set.");
      }
@@ -383,7 +381,7 @@ class TensorTieFactory {
    if (TwoStepTensorTie::IsSupported(def, *converter)) {
      return TwoStepTensorTie::New(def, internal_object, converter, &env_, tie);
    }
    return absl::UnimplementedError("Unsupported tensor tie definition.");
    return UnimplementedError("Unsupported tensor tie definition.");
  }

 private:
@@ -402,9 +400,9 @@ class InferenceRunnerImpl : public InferenceRunner {
        context_(std::move(context)),
        gl_interop_fabric_(std::move(gl_interop_fabric)) {}

  absl::Status Initialize(const std::vector<TensorTieDef>& inputs,
                          const std::vector<TensorTieDef>& outputs,
                          TensorTieFactory* factory) {
  Status Initialize(const std::vector<TensorTieDef>& inputs,
                    const std::vector<TensorTieDef>& outputs,
                    TensorTieFactory* factory) {
    RETURN_IF_ERROR(LinkTensors(inputs, factory, &inputs_));
    return LinkTensors(outputs, factory, &outputs_);
  }
@@ -417,37 +415,37 @@ class InferenceRunnerImpl : public InferenceRunner {
    return GetExternalDefinitions(outputs_);
  }

  absl::Status GetInputObject(int index, TensorObject* object) override {
  Status GetInputObject(int index, TensorObject* object) override {
    if (index < 0 || index >= inputs_.size()) {
      return absl::OutOfRangeError("Index is out of range");
      return OutOfRangeError("Index is out of range");
    }
    *object = inputs_[index]->GetExternalObject();
    return absl::OkStatus();
    return OkStatus();
  }

  absl::Status GetOutputObject(int index, TensorObject* object) override {
  Status GetOutputObject(int index, TensorObject* object) override {
    if (index < 0 || index >= outputs_.size()) {
      return absl::OutOfRangeError("Index is out of range");
      return OutOfRangeError("Index is out of range");
    }
    *object = outputs_[index]->GetExternalObject();
    return absl::OkStatus();
    return OkStatus();
  }

  absl::Status SetInputObject(int index, TensorObject object) override {
  Status SetInputObject(int index, TensorObject object) override {
    if (index < 0 || index >= inputs_.size()) {
      return absl::OutOfRangeError("Index is out of range");
      return OutOfRangeError("Index is out of range");
    }
    return inputs_[index]->SetExternalObject(object);
  }

  absl::Status SetOutputObject(int index, TensorObject object) override {
  Status SetOutputObject(int index, TensorObject object) override {
    if (index < 0 || index >= outputs_.size()) {
      return absl::OutOfRangeError("Index is out of range");
      return OutOfRangeError("Index is out of range");
    }
    return outputs_[index]->SetExternalObject(object);
  }

  absl::Status Run() override {
  Status Run() override {
    if (gl_interop_fabric_) {
      RETURN_IF_ERROR(gl_interop_fabric_->Start());
    }
@@ -462,20 +460,20 @@ class InferenceRunnerImpl : public InferenceRunner {
    if (gl_interop_fabric_) {
      RETURN_IF_ERROR(gl_interop_fabric_->Finish());
    }
    return absl::OkStatus();
    return OkStatus();
  }

 private:
  static absl::Status LinkTensors(
      const std::vector<TensorTieDef>& defs, TensorTieFactory* factory,
      std::vector<std::unique_ptr<TensorTie>>* objects) {
  static Status LinkTensors(const std::vector<TensorTieDef>& defs,
                            TensorTieFactory* factory,
                            std::vector<std::unique_ptr<TensorTie>>* objects) {
    objects->reserve(defs.size());
    for (auto& def : defs) {
      std::unique_ptr<TensorTie> object;
      RETURN_IF_ERROR(factory->NewTensorTie(def, &object));
      objects->push_back(std::move(object));
    }
    return absl::OkStatus();
    return OkStatus();
  }

  static std::vector<TensorObjectDef> GetExternalDefinitions(
@@ -513,9 +511,9 @@ class InferenceBuilderImpl : public InferenceBuilder {
  explicit InferenceBuilderImpl(Environment* environment)
      : environment_(environment) {}

  absl::Status Initialize(const InferenceOptions& options,
                          const InferenceEnvironmentOptions& env_options,
                          const GraphFloat32& graph) {
  Status Initialize(const InferenceOptions& options,
                    const InferenceEnvironmentOptions& env_options,
                    const GraphFloat32& graph) {
    context_ = absl::make_unique<InferenceContext>();
    InferenceContext::CreateInferenceInfo create_info;
    create_info.precision = GetPrecision(options);
@@ -535,7 +533,7 @@ class InferenceBuilderImpl : public InferenceBuilder {

    inputs_ = LinkTensors(graph, graph.inputs());
    outputs_ = LinkTensors(graph, graph.outputs());
    return absl::OkStatus();
    return OkStatus();
  }

  std::vector<TensorObjectDef> inputs() const override {
@@ -546,42 +544,40 @@ class InferenceBuilderImpl : public InferenceBuilder {
    return GetExternalDefinitions(outputs_);
  }

  absl::Status SetInputShape(int index, const Dimensions& dimensions) override {
  Status SetInputShape(int index, const Dimensions& dimensions) override {
    if (index < 0 || index >= inputs_.size()) {
      return absl::OutOfRangeError("Index is out of range");
      return OutOfRangeError("Index is out of range");
    }
    return absl::UnimplementedError("Changing input shapes is not supported");
    return UnimplementedError("Changing input shapes is not supported");
  }

  absl::Status SetInputObjectDef(int index, ObjectDef new_def) override {
  Status SetInputObjectDef(int index, ObjectDef new_def) override {
    if (index < 0 || index >= inputs_.size()) {
      return absl::OutOfRangeError("Index is out of range");
      return OutOfRangeError("Index is out of range");
    }
    auto def = inputs_[index];
    def.external_def.object_def = new_def;
    if (!tie_factory_->IsSupported(def)) {
      return absl::InvalidArgumentError(
          "New object definition is not supported.");
      return InvalidArgumentError("New object definition is not supported.");
    }
    inputs_[index] = def;
    return absl::OkStatus();
    return OkStatus();
  }

  absl::Status SetOutputObjectDef(int index, ObjectDef new_def) override {
  Status SetOutputObjectDef(int index, ObjectDef new_def) override {
    if (index < 0 || index >= outputs_.size()) {
      return absl::OutOfRangeError("Index is out of range");
      return OutOfRangeError("Index is out of range");
    }
    auto def = outputs_[index];
    def.external_def.object_def = new_def;
    if (!tie_factory_->IsSupported(def)) {
      return absl::InvalidArgumentError(
          "New object definition is not supported.");
      return InvalidArgumentError("New object definition is not supported.");
    }
    outputs_[index] = def;
    return absl::OkStatus();
    return OkStatus();
  }

  absl::Status Build(std::unique_ptr<InferenceRunner>* runner) override {
  Status Build(std::unique_ptr<InferenceRunner>* runner) override {
    if (gl_interop_fabric_ && !HasGlObjects()) {
      // destroy interop layer when there are no GL objects to avoid
      // extra synchronization cost.
@@ -592,7 +588,7 @@ class InferenceBuilderImpl : public InferenceBuilder {
    RETURN_IF_ERROR(
        runner_impl->Initialize(inputs_, outputs_, tie_factory_.get()));
    *runner = std::move(runner_impl);
    return absl::OkStatus();
    return OkStatus();
  }

 private:
@@ -700,7 +696,7 @@ class InferenceEnvironmentImpl : public InferenceEnvironment {
  explicit InferenceEnvironmentImpl(const InferenceEnvironmentOptions& options)
      : options_(options) {}

  absl::Status Init() {
  Status Init() {
    RETURN_IF_ERROR(LoadOpenCL());
    properties_.is_opencl_available = true;

@@ -720,13 +716,13 @@ class InferenceEnvironmentImpl : public InferenceEnvironment {
    properties_.is_cl_to_gl_fast_sync_supported =
        IsEglSyncFromClEventSupported();
    if (options_.IsGlAware() && !properties_.is_gl_sharing_supported) {
      return absl::UnavailableError("GL sharing is not supported");
      return UnavailableError("GL sharing is not supported");
    }

    CLContext context;
    if (options_.context) {
      if (options_.IsGlAware()) {
        return absl::InvalidArgumentError(
        return InvalidArgumentError(
            "OpenCL context and EGL parameters are set in the same time.");
      }
      context = CLContext(options_.context, /* has_ownership = */ false);
@@ -758,11 +754,11 @@ class InferenceEnvironmentImpl : public InferenceEnvironment {
    return environment_.Init();
  }

  absl::Status NewInferenceBuilder(
      const InferenceOptions& options, GraphFloat32 model,
      std::unique_ptr<InferenceBuilder>* builder) final {
  Status NewInferenceBuilder(const InferenceOptions& options,
                             GraphFloat32 model,
                             std::unique_ptr<InferenceBuilder>* builder) final {
    if (!IsValid(options)) {
      return absl::InvalidArgumentError("InferenceOptions are invalid.");
      return InvalidArgumentError("InferenceOptions are invalid.");
    }
    InferenceOptions resolved_options = options;
    ResolveAutoPriority(&resolved_options);
@@ -780,7 +776,7 @@ class InferenceEnvironmentImpl : public InferenceEnvironment {
    RETURN_IF_ERROR(
        builder_impl->Initialize(resolved_options, options_, model));
    *builder = std::move(builder_impl);
    return absl::OkStatus();
    return OkStatus();
  }

  std::vector<uint8_t> GetSerializedBinaryCache() const final {
@@ -804,18 +800,18 @@ class InferenceEnvironmentImpl : public InferenceEnvironment {

}  // namespace

absl::Status NewInferenceEnvironment(
Status NewInferenceEnvironment(
    const InferenceEnvironmentOptions& options,
    std::unique_ptr<InferenceEnvironment>* environment,
    InferenceEnvironmentProperties* properties) {
  auto env_impl = absl::make_unique<InferenceEnvironmentImpl>(options);
  absl::Status status = env_impl->Init();
  Status status = env_impl->Init();
  if (properties) {
    *properties = env_impl->properties();
  }
  RETURN_IF_ERROR(status);
  *environment = std::move(env_impl);
  return absl::OkStatus();
  return OkStatus();
}

}  // namespace cl

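A hedged usage sketch of environment creation, built only from the signatures above; the variable names are illustrative:

InferenceEnvironmentOptions env_options;  // defaults; EGL fields left unset
std::unique_ptr<InferenceEnvironment> environment;
InferenceEnvironmentProperties properties;
RETURN_IF_ERROR(
    NewInferenceEnvironment(env_options, &environment, &properties));
// Fields such as properties.is_opencl_available can now be inspected.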
@@ -70,7 +70,7 @@ class InferenceEnvironment {
 public:
  virtual ~InferenceEnvironment() {}

  virtual absl::Status NewInferenceBuilder(
  virtual Status NewInferenceBuilder(
      const InferenceOptions& options, GraphFloat32 model,
      std::unique_ptr<InferenceBuilder>* builder) = 0;

@@ -112,7 +112,7 @@ struct InferenceEnvironmentOptions {

// Creates new OpenCL environment that needs to stay around until all inference
// runners are destroyed.
absl::Status NewInferenceEnvironment(
Status NewInferenceEnvironment(
    const InferenceEnvironmentOptions& options,
    std::unique_ptr<InferenceEnvironment>* environment,
    InferenceEnvironmentProperties* properties /* optional */);

@@ -21,10 +21,8 @@ namespace tflite {
namespace gpu {
namespace cl {
namespace {

absl::Status CreateBuffer(size_t size_in_bytes, bool gpu_read_only,
                          const void* data, CLContext* context,
                          Buffer* result) {
Status CreateBuffer(size_t size_in_bytes, bool gpu_read_only, const void* data,
                    CLContext* context, Buffer* result) {
  cl_mem_flags flags = gpu_read_only ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE;
  if (data != nullptr) {
    flags |= CL_MEM_COPY_HOST_PTR;
@@ -33,14 +31,14 @@ absl::Status CreateBuffer(size_t size_in_bytes, bool gpu_read_only,
  cl_mem buffer = clCreateBuffer(context->context(), flags, size_in_bytes,
                                 const_cast<void*>(data), &error_code);
  if (!buffer) {
    return absl::UnknownError(
    return UnknownError(
        absl::StrCat("Failed to allocate device memory with clCreateBuffer",
                     CLErrorCodeToString(error_code)));
  }

  *result = Buffer(buffer, size_in_bytes);

  return absl::OkStatus();
  return OkStatus();
}
}  // namespace

@@ -71,18 +69,18 @@ void Buffer::Release() {
  }
}

absl::Status CreateReadOnlyBuffer(size_t size_in_bytes, CLContext* context,
                                  Buffer* result) {
Status CreateReadOnlyBuffer(size_t size_in_bytes, CLContext* context,
                            Buffer* result) {
  return CreateBuffer(size_in_bytes, true, nullptr, context, result);
}

absl::Status CreateReadOnlyBuffer(size_t size_in_bytes, const void* data,
                                  CLContext* context, Buffer* result) {
Status CreateReadOnlyBuffer(size_t size_in_bytes, const void* data,
                            CLContext* context, Buffer* result) {
  return CreateBuffer(size_in_bytes, true, data, context, result);
}

absl::Status CreateReadWriteBuffer(size_t size_in_bytes, CLContext* context,
                                   Buffer* result) {
Status CreateReadWriteBuffer(size_t size_in_bytes, CLContext* context,
                             Buffer* result) {
  return CreateBuffer(size_in_bytes, false, nullptr, context, result);
}

@@ -51,11 +51,11 @@ class Buffer {
  // Writes data to a buffer. Data should point to a region that
  // has exact size in bytes as size_in_bytes(constructor parameter).
  template <typename T>
  absl::Status WriteData(CLCommandQueue* queue, const absl::Span<T> data);
  Status WriteData(CLCommandQueue* queue, const absl::Span<T> data);

  // Reads data from Buffer into CPU memory.
  template <typename T>
  absl::Status ReadData(CLCommandQueue* queue, std::vector<T>* result) const;
  Status ReadData(CLCommandQueue* queue, std::vector<T>* result) const;

 private:
  void Release();
@@ -64,31 +64,29 @@ class Buffer {
  size_t size_;
};

absl::Status CreateReadOnlyBuffer(size_t size_in_bytes, CLContext* context,
                                  Buffer* result);
Status CreateReadOnlyBuffer(size_t size_in_bytes, CLContext* context,
                            Buffer* result);

absl::Status CreateReadOnlyBuffer(size_t size_in_bytes, const void* data,
                                  CLContext* context, Buffer* result);
Status CreateReadOnlyBuffer(size_t size_in_bytes, const void* data,
                            CLContext* context, Buffer* result);

absl::Status CreateReadWriteBuffer(size_t size_in_bytes, CLContext* context,
                                   Buffer* result);
Status CreateReadWriteBuffer(size_t size_in_bytes, CLContext* context,
                             Buffer* result);

template <typename T>
absl::Status Buffer::WriteData(CLCommandQueue* queue,
                               const absl::Span<T> data) {
Status Buffer::WriteData(CLCommandQueue* queue, const absl::Span<T> data) {
  if (size_ != sizeof(T) * data.size()) {
    return absl::InvalidArgumentError(
    return InvalidArgumentError(
        "absl::Span<T> data size is different from buffer allocated size.");
  }
  RETURN_IF_ERROR(queue->EnqueueWriteBuffer(buffer_, size_, data.data()));
  return absl::OkStatus();
  return OkStatus();
}

template <typename T>
absl::Status Buffer::ReadData(CLCommandQueue* queue,
                              std::vector<T>* result) const {
Status Buffer::ReadData(CLCommandQueue* queue, std::vector<T>* result) const {
  if (size_ % sizeof(T) != 0) {
    return absl::UnknownError("Wrong element size(typename T is not correct?");
    return UnknownError("Wrong element size(typename T is not correct?");
  }

  const int elements_count = size_ / sizeof(T);

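A hedged sketch of the Buffer read/write round trip using the declarations above; an initialized CLContext `context` and CLCommandQueue `queue` are assumed:

std::vector<float> host_data = {1.0f, 2.0f, 3.0f, 4.0f};
Buffer buffer;
RETURN_IF_ERROR(CreateReadWriteBuffer(host_data.size() * sizeof(float),
                                      &context, &buffer));
RETURN_IF_ERROR(buffer.WriteData(&queue, absl::MakeConstSpan(host_data)));
std::vector<float> readback;
RETURN_IF_ERROR(buffer.ReadData(&queue, &readback));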
@@ -56,9 +56,8 @@ void CLCommandQueue::Release() {
  }
}

absl::Status CLCommandQueue::DispatchImplicit(const CLKernel& kernel, int3 grid,
                                              int3 work_group_size,
                                              CLEvent* event) {
Status CLCommandQueue::DispatchImplicit(const CLKernel& kernel, int3 grid,
                                        int3 work_group_size, CLEvent* event) {
  std::vector<size_t> local(3);
  std::vector<size_t> global(3);
  for (int i = 0; i < 3; ++i) {
@@ -73,31 +72,30 @@ absl::Status CLCommandQueue::DispatchImplicit(const CLKernel& kernel, int3 grid,
    *event = CLEvent(resulting_event);
  }
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(
        absl::StrCat("Failed to clEnqueueNDRangeKernel - ",
                     CLErrorCodeToString(error_code)));
    return UnknownError(absl::StrCat("Failed to clEnqueueNDRangeKernel - ",
                                     CLErrorCodeToString(error_code)));
  }
  return absl::OkStatus();
  return OkStatus();
}

absl::Status CLCommandQueue::DispatchImplicit(const CLKernel& kernel, int3 grid,
                                              int3 work_group_size) {
Status CLCommandQueue::DispatchImplicit(const CLKernel& kernel, int3 grid,
                                        int3 work_group_size) {
  return DispatchImplicit(kernel, grid, work_group_size, nullptr);
}

absl::Status CLCommandQueue::EnqueueEvent(CLEvent* event) {
Status CLCommandQueue::EnqueueEvent(CLEvent* event) {
  cl_event resulting_event;
  const int error_code = clEnqueueMarker(queue_, &resulting_event);
  *event = CLEvent(resulting_event);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(absl::StrCat("Failed to clEnqueueMarker - ",
                                           CLErrorCodeToString(error_code)));
    return UnknownError(absl::StrCat("Failed to clEnqueueMarker - ",
                                     CLErrorCodeToString(error_code)));
  }
  return absl::OkStatus();
  return OkStatus();
}

absl::Status CLCommandQueue::EnqueueWriteImage(cl_mem memory, int3 region,
                                               const void* data) {
Status CLCommandQueue::EnqueueWriteImage(cl_mem memory, int3 region,
                                         const void* data) {
  const size_t origin[] = {0, 0, 0};
  const size_t r[] = {static_cast<size_t>(region.x),
                      static_cast<size_t>(region.y),
@@ -105,16 +103,16 @@ absl::Status CLCommandQueue::EnqueueWriteImage(cl_mem memory, int3 region,
  auto error_code = clEnqueueWriteImage(queue_, memory, CL_TRUE, origin, r, 0,
                                        0, data, 0, nullptr, nullptr);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(
    return UnknownError(
        absl::StrCat("Failed to upload data to GPU (clEnqueueWriteImage) - ",
                     CLErrorCodeToString(error_code)));
  }

  return absl::OkStatus();
  return OkStatus();
}

absl::Status CLCommandQueue::EnqueueReadImage(cl_mem memory, int3 region,
                                              void* data) {
Status CLCommandQueue::EnqueueReadImage(cl_mem memory, int3 region,
                                        void* data) {
  const size_t origin[] = {0, 0, 0};
  const size_t r[] = {static_cast<size_t>(region.x),
                      static_cast<size_t>(region.y),
@@ -122,47 +120,45 @@ absl::Status CLCommandQueue::EnqueueReadImage(cl_mem memory, int3 region,
  auto error_code = clEnqueueReadImage(queue_, memory, CL_TRUE, origin, r, 0, 0,
                                       data, 0, nullptr, nullptr);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(
    return UnknownError(
        absl::StrCat("Failed to read data from GPU (clEnqueueReadImage) - ",
                     CLErrorCodeToString(error_code)));
  }

  return absl::OkStatus();
  return OkStatus();
}

absl::Status CLCommandQueue::EnqueueWriteBuffer(cl_mem memory,
                                                size_t size_in_bytes,
                                                const void* data) {
Status CLCommandQueue::EnqueueWriteBuffer(cl_mem memory, size_t size_in_bytes,
                                          const void* data) {
  auto error_code = clEnqueueWriteBuffer(
      queue_, memory, CL_TRUE, 0, size_in_bytes, data, 0, nullptr, nullptr);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(
    return UnknownError(
        absl::StrCat("Failed to upload data to GPU (clEnqueueWriteBuffer) - ",
                     CLErrorCodeToString(error_code)));
  }
  return absl::OkStatus();
  return OkStatus();
}

absl::Status CLCommandQueue::EnqueueReadBuffer(cl_mem memory,
                                               size_t size_in_bytes,
                                               void* data) {
Status CLCommandQueue::EnqueueReadBuffer(cl_mem memory, size_t size_in_bytes,
                                         void* data) {
  auto error_code = clEnqueueReadBuffer(
      queue_, memory, CL_TRUE, 0, size_in_bytes, data, 0, nullptr, nullptr);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(
    return UnknownError(
        absl::StrCat("Failed to read data from GPU (clEnqueueReadBuffer) - ",
                     CLErrorCodeToString(error_code)));
  }
  return absl::OkStatus();
  return OkStatus();
}

absl::Status CLCommandQueue::WaitForCompletion() {
Status CLCommandQueue::WaitForCompletion() {
  auto error_code = clFinish(queue_);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(
    return UnknownError(
        absl::StrCat("Failed to clFinish - ", CLErrorCodeToString(error_code)));
  }
  return absl::OkStatus();
  return OkStatus();
}

ProfilingCommandQueue::ProfilingCommandQueue(cl_command_queue queue)
@@ -191,14 +187,14 @@ void ProfilingCommandQueue::SetEventsLabel(const std::string& name) {

void ProfilingCommandQueue::ResetMeasurements() { events_.clear(); }

absl::Status ProfilingCommandQueue::DispatchImplicit(const CLKernel& kernel,
                                                     int3 grid,
                                                     int3 work_group_size) {
Status ProfilingCommandQueue::DispatchImplicit(const CLKernel& kernel,
                                               int3 grid,
                                               int3 work_group_size) {
  events_.push_back(CLEvent());
  RETURN_IF_ERROR(CLCommandQueue::DispatchImplicit(
      kernel, grid, work_group_size, &events_[events_.size() - 1]));
  events_.back().SetName(current_label_);
  return absl::OkStatus();
  return OkStatus();
}

ProfilingInfo ProfilingCommandQueue::GetProfilingInfo() const {
@@ -212,7 +208,7 @@ ProfilingInfo ProfilingCommandQueue::GetProfilingInfo() const {
  return result;
}

absl::Status ProfilingCommandQueue::GetBestWorkGroupIndex(
Status ProfilingCommandQueue::GetBestWorkGroupIndex(
    const CLKernel& kernel, const DeviceInfo& device_info, const int3& grid,
    const std::vector<int3>& work_group_sizes, int* index) {
  // Some Adreno 3xx can have wrong numbers for some events
@@ -272,22 +268,20 @@ absl::Status ProfilingCommandQueue::GetBestWorkGroupIndex(

  *index = minimum_index;

  return absl::OkStatus();
  return OkStatus();
}

absl::Status CreateCLCommandQueue(const CLDevice& device,
                                  const CLContext& context,
                                  CLCommandQueue* result) {
Status CreateCLCommandQueue(const CLDevice& device, const CLContext& context,
                            CLCommandQueue* result) {
  int error_code;
  cl_command_queue queue =
      clCreateCommandQueue(context.context(), device.id(), 0, &error_code);
  if (!queue) {
    return absl::UnknownError(
        absl::StrCat("Failed to create a command queue - ",
                     CLErrorCodeToString(error_code)));
    return UnknownError(absl::StrCat("Failed to create a command queue - ",
                                     CLErrorCodeToString(error_code)));
  }
  *result = CLCommandQueue(queue, true);
  return absl::OkStatus();
  return OkStatus();
}

double ProfilingCommandQueue::GetQueueExecutionTimeMs() const {
@@ -306,20 +300,19 @@ double ProfilingCommandQueue::GetSumOfEventsTimeMs() const {
  return sum;
}

absl::Status CreateProfilingCommandQueue(const CLDevice& device,
                                         const CLContext& context,
                                         ProfilingCommandQueue* result) {
Status CreateProfilingCommandQueue(const CLDevice& device,
                                   const CLContext& context,
                                   ProfilingCommandQueue* result) {
  int error_code;
  cl_command_queue queue = clCreateCommandQueue(
      context.context(), device.id(), CL_QUEUE_PROFILING_ENABLE, &error_code);
  if (!queue) {
    return absl::UnknownError(
        absl::StrCat("Failed to create a command queue - ",
                     CLErrorCodeToString(error_code)));
    return UnknownError(absl::StrCat("Failed to create a command queue - ",
                                     CLErrorCodeToString(error_code)));
  }

  *result = ProfilingCommandQueue(queue);
  return absl::OkStatus();
  return OkStatus();
}

absl::Duration ProfilingInfo::GetTotalTime() const {

@@ -74,23 +74,22 @@ class CLCommandQueue {

  cl_command_queue queue() const { return queue_; }

  virtual absl::Status DispatchImplicit(const CLKernel& kernel, int3 grid,
                                        int3 work_group_size);
  virtual Status DispatchImplicit(const CLKernel& kernel, int3 grid,
                                  int3 work_group_size);

  absl::Status EnqueueEvent(CLEvent* event);
  Status EnqueueEvent(CLEvent* event);

  absl::Status DispatchImplicit(const CLKernel& kernel, int3 grid,
                                int3 work_group_size, CLEvent* event);
  Status DispatchImplicit(const CLKernel& kernel, int3 grid,
                          int3 work_group_size, CLEvent* event);

  absl::Status EnqueueWriteImage(cl_mem memory, int3 region, const void* data);
  absl::Status EnqueueReadImage(cl_mem memory, int3 region, void* data);
  Status EnqueueWriteImage(cl_mem memory, int3 region, const void* data);
  Status EnqueueReadImage(cl_mem memory, int3 region, void* data);

  absl::Status EnqueueWriteBuffer(cl_mem memory, size_t size_in_bytes,
                                  const void* data);
  absl::Status EnqueueReadBuffer(cl_mem memory, size_t size_in_bytes,
                                 void* data);
  Status EnqueueWriteBuffer(cl_mem memory, size_t size_in_bytes,
                            const void* data);
  Status EnqueueReadBuffer(cl_mem memory, size_t size_in_bytes, void* data);

  absl::Status WaitForCompletion();
  Status WaitForCompletion();

 protected:
  void Release();
@@ -110,15 +109,14 @@ class ProfilingCommandQueue : public CLCommandQueue {
  ProfilingCommandQueue(const ProfilingCommandQueue&) = delete;
  ProfilingCommandQueue& operator=(const ProfilingCommandQueue&) = delete;

  absl::Status DispatchImplicit(const CLKernel& kernel, int3 grid,
                                int3 work_group_size) override;
  Status DispatchImplicit(const CLKernel& kernel, int3 grid,
                          int3 work_group_size) override;

  // will write index for fastest work_group among work_group_sizes
  absl::Status GetBestWorkGroupIndex(const CLKernel& kernel,
                                     const DeviceInfo& device_info,
                                     const int3& grid,
                                     const std::vector<int3>& work_group_sizes,
                                     int* index);
  Status GetBestWorkGroupIndex(const CLKernel& kernel,
                               const DeviceInfo& device_info, const int3& grid,
                               const std::vector<int3>& work_group_sizes,
                               int* index);

  // call ResetMeasurements() to start new seriese of measurements
  void ResetMeasurements();
@@ -141,13 +139,12 @@ class ProfilingCommandQueue : public CLCommandQueue {
  std::string current_label_;
};

absl::Status CreateCLCommandQueue(const CLDevice& device,
                                  const CLContext& context,
                                  CLCommandQueue* result);
Status CreateCLCommandQueue(const CLDevice& device, const CLContext& context,
                            CLCommandQueue* result);

absl::Status CreateProfilingCommandQueue(const CLDevice& device,
                                         const CLContext& context,
                                         ProfilingCommandQueue* result);
Status CreateProfilingCommandQueue(const CLDevice& device,
                                   const CLContext& context,
                                   ProfilingCommandQueue* result);

}  // namespace cl
}  // namespace gpu

@@ -43,21 +43,19 @@ std::vector<cl_image_format> GetSupportedImage2DFormats(cl_context context,
  return result;
}

absl::Status CreateCLContext(const CLDevice& device,
                             cl_context_properties* properties,
                             CLContext* result) {
Status CreateCLContext(const CLDevice& device,
                       cl_context_properties* properties, CLContext* result) {
  int error_code;
  cl_device_id device_id = device.id();
  cl_context context =
      clCreateContext(properties, 1, &device_id, nullptr, nullptr, &error_code);
  if (!context) {
    return absl::UnknownError(
        absl::StrCat("Failed to create a compute context - ",
                     CLErrorCodeToString(error_code)));
    return UnknownError(absl::StrCat("Failed to create a compute context - ",
                                     CLErrorCodeToString(error_code)));
  }

  *result = CLContext(context, true);
  return absl::OkStatus();
  return OkStatus();
}

}  // namespace
@@ -101,16 +99,15 @@ bool CLContext::IsFloatTexture2DSupported(int num_channels, DataType data_type,
  return false;
}

absl::Status CreateCLContext(const CLDevice& device, CLContext* result) {
Status CreateCLContext(const CLDevice& device, CLContext* result) {
  return CreateCLContext(device, nullptr, result);
}

absl::Status CreateCLGLContext(const CLDevice& device,
                               cl_context_properties egl_context,
                               cl_context_properties egl_display,
                               CLContext* result) {
Status CreateCLGLContext(const CLDevice& device,
                         cl_context_properties egl_context,
                         cl_context_properties egl_display, CLContext* result) {
  if (!device.SupportsExtension("cl_khr_gl_sharing")) {
    return absl::UnavailableError("Device doesn't support CL-GL sharing.");
    return UnavailableError("Device doesn't support CL-GL sharing.");
  }
  cl_context_properties platform =
      reinterpret_cast<cl_context_properties>(device.platform());

@@ -51,11 +51,10 @@ class CLContext {
  bool has_ownership_ = false;
};

absl::Status CreateCLContext(const CLDevice& device, CLContext* result);
absl::Status CreateCLGLContext(const CLDevice& device,
                               cl_context_properties egl_context,
                               cl_context_properties egl_display,
                               CLContext* result);
Status CreateCLContext(const CLDevice& device, CLContext* result);
Status CreateCLGLContext(const CLDevice& device,
                         cl_context_properties egl_context,
                         cl_context_properties egl_display, CLContext* result);

}  // namespace cl
}  // namespace gpu

@@ -516,11 +516,11 @@ void CLDevice::DisableOneLayerTextureArray() {
  info_.adreno_info.support_one_layer_texture_array = false;
}

absl::Status CreateDefaultGPUDevice(CLDevice* result) {
Status CreateDefaultGPUDevice(CLDevice* result) {
  cl_uint num_platforms;
  clGetPlatformIDs(0, nullptr, &num_platforms);
  if (num_platforms == 0) {
    return absl::UnknownError("No supported OpenCL platform.");
    return UnknownError("No supported OpenCL platform.");
  }
  std::vector<cl_platform_id> platforms(num_platforms);
  clGetPlatformIDs(num_platforms, platforms.data(), nullptr);
@@ -529,7 +529,7 @@ absl::Status CreateDefaultGPUDevice(CLDevice* result) {
  cl_uint num_devices;
  clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 0, nullptr, &num_devices);
  if (num_devices == 0) {
    return absl::UnknownError("No GPU on current platform.");
    return UnknownError("No GPU on current platform.");
  }

  std::vector<cl_device_id> devices(num_devices);
@@ -537,7 +537,7 @@ absl::Status CreateDefaultGPUDevice(CLDevice* result) {
                 nullptr);

  *result = CLDevice(devices[0], platform_id);
  return absl::OkStatus();
  return OkStatus();
}

}  // namespace cl

@@ -191,7 +191,7 @@ class CLDevice {
  DeviceInfo info_;
};

absl::Status CreateDefaultGPUDevice(CLDevice* result);
Status CreateDefaultGPUDevice(CLDevice* result);

template <typename T>
T GetDeviceInfo(cl_device_id id, cl_device_info info) {
@@ -204,12 +204,12 @@ T GetDeviceInfo(cl_device_id id, cl_device_info info) {
}

template <typename T>
absl::Status GetDeviceInfo(cl_device_id id, cl_device_info info, T* result) {
Status GetDeviceInfo(cl_device_id id, cl_device_info info, T* result) {
  cl_int error = clGetDeviceInfo(id, info, sizeof(T), result, nullptr);
  if (error != CL_SUCCESS) {
    return absl::InvalidArgumentError(CLErrorCodeToString(error));
    return InvalidArgumentError(CLErrorCodeToString(error));
  }
  return absl::OkStatus();
  return OkStatus();
}

}  // namespace cl

@@ -27,12 +27,11 @@ namespace cl {

// @return if error_code is success, then return OK status. Otherwise translates
// error code into a message.
inline absl::Status GetOpenCLError(cl_int error_code) {
inline Status GetOpenCLError(cl_int error_code) {
  if (error_code == CL_SUCCESS) {
    return absl::OkStatus();
    return OkStatus();
  }
  return absl::InternalError("OpenCL error: " +
                             CLErrorCodeToString(error_code));
  return InternalError("OpenCL error: " + CLErrorCodeToString(error_code));
}

}  // namespace cl

@@ -25,34 +25,34 @@ namespace gpu {
namespace cl {
namespace {

absl::Status GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id,
                                       int* result) {
Status GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id,
                                 int* result) {
  size_t max_work_group_size;
  cl_int error_code =
      clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE,
                               sizeof(size_t), &max_work_group_size, nullptr);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(
    return UnknownError(
        absl::StrCat("Failed to get info CL_KERNEL_WORK_GROUP_SIZE ",
                     CLErrorCodeToString(error_code)));
  }
  *result = static_cast<int>(max_work_group_size);
  return absl::OkStatus();
  return OkStatus();
}

absl::Status GetKernelPrivateMemorySize(cl_kernel kernel,
                                        cl_device_id device_id, int* result) {
Status GetKernelPrivateMemorySize(cl_kernel kernel, cl_device_id device_id,
                                  int* result) {
  cl_ulong private_mem_size;
  cl_int error_code =
      clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_PRIVATE_MEM_SIZE,
                               sizeof(cl_ulong), &private_mem_size, nullptr);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(
    return UnknownError(
        absl::StrCat("Failed to get info CL_KERNEL_PRIVATE_MEM_SIZE ",
                     CLErrorCodeToString(error_code)));
  }
  *result = static_cast<int>(private_mem_size);
  return absl::OkStatus();
  return OkStatus();
}

}  // namespace
@@ -82,17 +82,17 @@ CLKernel& CLKernel::operator=(CLKernel&& kernel) {

CLKernel::~CLKernel() { Release(); }

absl::Status CLKernel::ReInit() const {
Status CLKernel::ReInit() const {
  clReleaseKernel(kernel_);
  cl_kernel* kern_ptr = const_cast<cl_kernel*>(&kernel_);
  int error_code;
  *kern_ptr = clCreateKernel(program_, function_name_.c_str(), &error_code);
  if (!kernel_ || error_code != CL_SUCCESS) {
    *kern_ptr = nullptr;
    return absl::UnknownError(absl::StrCat("Failed to create ", function_name_,
                                           CLErrorCodeToString(error_code)));
    return UnknownError(absl::StrCat("Failed to create ", function_name_,
                                     CLErrorCodeToString(error_code)));
  }
  return absl::OkStatus();
  return OkStatus();
}

void CLKernel::Release() {
@@ -103,16 +103,16 @@ void CLKernel::Release() {
  }
}

absl::Status CLKernel::CreateFromProgram(const CLProgram& program,
                                         const std::string& function_name) {
Status CLKernel::CreateFromProgram(const CLProgram& program,
                                   const std::string& function_name) {
  int error_code;
  function_name_ = function_name;
  kernel_ =
      clCreateKernel(program.program(), function_name.c_str(), &error_code);
  if (!kernel_ || error_code != CL_SUCCESS) {
    kernel_ = nullptr;
    return absl::UnknownError(absl::StrCat("Failed to create ", function_name,
                                           CLErrorCodeToString(error_code)));
    return UnknownError(absl::StrCat("Failed to create ", function_name,
                                     CLErrorCodeToString(error_code)));
  }

  program_ = program.program();
@@ -122,64 +122,64 @@ absl::Status CLKernel::CreateFromProgram(const CLProgram& program,
                                              &private_memory_size_));
  RETURN_IF_ERROR(GetKernelMaxWorkGroupSize(kernel_, program.GetDeviceId(),
                                            &max_work_group_size_));
  return absl::OkStatus();
  return OkStatus();
}

absl::Status CLKernel::SetMemory(int index, cl_mem memory) {
Status CLKernel::SetMemory(int index, cl_mem memory) {
  return SetBytes(index, &memory, sizeof(cl_mem));
}

absl::Status CLKernel::SetMemoryAuto(cl_mem memory) {
Status CLKernel::SetMemoryAuto(cl_mem memory) {
  return SetBytesAuto(&memory, sizeof(cl_mem));
}

absl::Status CLKernel::SetBytes(int index, const void* ptr, int length) const {
Status CLKernel::SetBytes(int index, const void* ptr, int length) const {
  const int error_code = clSetKernelArg(kernel_, index, length, ptr);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(absl::StrCat("Failed to set kernel arguments - ",
                                           CLErrorCodeToString(error_code)));
    return UnknownError(absl::StrCat("Failed to set kernel arguments - ",
                                     CLErrorCodeToString(error_code)));
  }
  return absl::OkStatus();
  return OkStatus();
}

absl::Status CLKernel::SetBytesAuto(const void* ptr, int length) {
Status CLKernel::SetBytesAuto(const void* ptr, int length) {
  const int error_code = clSetKernelArg(kernel_, binding_counter_, length, ptr);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(absl::StrCat(
        "Failed to set kernel arguments - ", CLErrorCodeToString(error_code),
        "(at index - ", binding_counter_, ")"));
    return UnknownError(absl::StrCat("Failed to set kernel arguments - ",
                                     CLErrorCodeToString(error_code),
                                     "(at index - ", binding_counter_, ")"));
  }
  binding_counter_++;
  return absl::OkStatus();
  return OkStatus();
}

template <>
absl::Status CLKernel::SetBytes<FLT>(int index, const FLT& value) const {
Status CLKernel::SetBytes<FLT>(int index, const FLT& value) const {
  return SetBytes(index, value.GetData(), value.GetSize());
}

template <>
absl::Status CLKernel::SetBytes<FLT2>(int index, const FLT2& value) const {
Status CLKernel::SetBytes<FLT2>(int index, const FLT2& value) const {
  return SetBytes(index, value.GetData(), value.GetSize());
}

template <>
absl::Status CLKernel::SetBytes<FLT4>(int index, const FLT4& value) const {
Status CLKernel::SetBytes<FLT4>(int index, const FLT4& value) const {
  return SetBytes(index, value.GetData(), value.GetSize());
}

template <>
absl::Status CLKernel::SetBytesAuto<FLT>(const FLT& value) {
Status CLKernel::SetBytesAuto<FLT>(const FLT& value) {
  return SetBytesAuto(value.GetData(), value.GetSize());
}

template <>
absl::Status CLKernel::SetBytesAuto<FLT2>(const FLT2& value) {
Status CLKernel::SetBytesAuto<FLT2>(const FLT2& value) {
  return SetBytesAuto(value.GetData(), value.GetSize());
}

template <>
absl::Status CLKernel::SetBytesAuto<FLT4>(const FLT4& value) {
Status CLKernel::SetBytesAuto<FLT4>(const FLT4& value) {
  return SetBytesAuto(value.GetData(), value.GetSize());
}

@@ -48,17 +48,17 @@ class CLKernel {

  cl_kernel kernel() const { return kernel_; }

  absl::Status CreateFromProgram(const CLProgram& program,
                                 const std::string& function_name);
  Status CreateFromProgram(const CLProgram& program,
                           const std::string& function_name);

  absl::Status SetMemory(int index, cl_mem memory);
  absl::Status SetMemoryAuto(cl_mem memory);
  Status SetMemory(int index, cl_mem memory);
  Status SetMemoryAuto(cl_mem memory);
  template <typename T>
  absl::Status SetBytes(int index, const T& value) const {
  Status SetBytes(int index, const T& value) const {
    return SetBytes(index, static_cast<const void*>(&value), sizeof(T));
  }
  template <typename T>
  absl::Status SetBytesAuto(const T& value) {
  Status SetBytesAuto(const T& value) {
    return SetBytesAuto(static_cast<const void*>(&value), sizeof(T));
  }

@@ -69,12 +69,12 @@ class CLKernel {

  // Do not use this function
  // workaround for Mali memory leak
  absl::Status ReInit() const;
  Status ReInit() const;

 private:
  void Release();
  absl::Status SetBytes(int index, const void* ptr, int length) const;
  absl::Status SetBytesAuto(const void* ptr, int length);
  Status SetBytes(int index, const void* ptr, int length) const;
  Status SetBytesAuto(const void* ptr, int length);

  int private_memory_size_;
  int max_work_group_size_;
@ -87,22 +87,22 @@ class CLKernel {
|
||||
};
|
||||
|
||||
template <>
|
||||
absl::Status CLKernel::SetBytes<FLT>(int index, const FLT& value) const;
|
||||
Status CLKernel::SetBytes<FLT>(int index, const FLT& value) const;
|
||||
|
||||
template <>
|
||||
absl::Status CLKernel::SetBytes<FLT2>(int index, const FLT2& value) const;
|
||||
Status CLKernel::SetBytes<FLT2>(int index, const FLT2& value) const;
|
||||
|
||||
template <>
|
||||
absl::Status CLKernel::SetBytes<FLT4>(int index, const FLT4& value) const;
|
||||
Status CLKernel::SetBytes<FLT4>(int index, const FLT4& value) const;
|
||||
|
||||
template <>
|
||||
absl::Status CLKernel::SetBytesAuto<FLT>(const FLT& value);
|
||||
Status CLKernel::SetBytesAuto<FLT>(const FLT& value);
|
||||
|
||||
template <>
|
||||
absl::Status CLKernel::SetBytesAuto<FLT2>(const FLT2& value);
|
||||
Status CLKernel::SetBytesAuto<FLT2>(const FLT2& value);
|
||||
|
||||
template <>
|
||||
absl::Status CLKernel::SetBytesAuto<FLT4>(const FLT4& value);
|
||||
Status CLKernel::SetBytesAuto<FLT4>(const FLT4& value);
|
||||
|
||||
} // namespace cl
|
||||
} // namespace gpu
|
||||
|
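The SetBytes/SetBytesAuto pair above funnels every typed kernel argument through a single raw (void*, length) overload, with the FLT/FLT2/FLT4 specializations routing through GetData()/GetSize() because their byte size depends on runtime precision. A minimal self-contained sketch of the same pattern; the class name and error text are illustrative, not the exact TFLite implementation, and the numeric error code stands in for CLErrorCodeToString:

// Hedged sketch of the CLKernel argument-setting pattern, not TFLite's code.
#include <CL/cl.h>
#include "absl/status/status.h"
#include "absl/strings/str_cat.h"

class KernelArgSetter {
 public:
  explicit KernelArgSetter(cl_kernel kernel) : kernel_(kernel) {}

  // Raw overload: every typed argument funnels through here.
  absl::Status SetBytes(int index, const void* ptr, int length) const {
    const cl_int error_code = clSetKernelArg(kernel_, index, length, ptr);
    if (error_code != CL_SUCCESS) {
      return absl::UnknownError(
          absl::StrCat("Failed to set kernel arguments - ", error_code,
                       " (at index - ", index, ")"));
    }
    return absl::OkStatus();
  }

  // Typed convenience wrapper mirroring the template in the header above.
  template <typename T>
  absl::Status SetBytes(int index, const T& value) const {
    return SetBytes(index, static_cast<const void*>(&value), sizeof(T));
  }

 private:
  cl_kernel kernel_;
};
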
@ -49,29 +49,28 @@ std::string GetProgramBuildInfo(cl_program program, cl_device_id id,
return result;
}

absl::Status GetBinarySize(cl_program program, size_t* binary_size) {
Status GetBinarySize(cl_program program, size_t* binary_size) {
cl_int error_code = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
sizeof(size_t), binary_size, nullptr);
if (error_code != CL_SUCCESS) {
return absl::UnknownError(
absl::StrCat("Failed to get program binary size - ",
CLErrorCodeToString(error_code)));
return UnknownError(absl::StrCat("Failed to get program binary size - ",
CLErrorCodeToString(error_code)));
}
return absl::OkStatus();
return OkStatus();
}

absl::Status BuildProgram(cl_program program, const CLDevice& device,
const std::string& compiler_options) {
Status BuildProgram(cl_program program, const CLDevice& device,
const std::string& compiler_options) {
const int error_code = clBuildProgram(
program, 0, nullptr, compiler_options.c_str(), nullptr, nullptr);
if (error_code != CL_SUCCESS) {
return absl::UnknownError(absl::StrCat(
return UnknownError(absl::StrCat(
"Failed to build program executable - ",
CLErrorCodeToString(error_code),
GetProgramBuildInfo(program, device.id(), CL_PROGRAM_BUILD_LOG)));
}

return absl::OkStatus();
return OkStatus();
}

std::string CompilerOptionToString(const CLDevice& device,
@ -134,7 +133,7 @@ void CLProgram::Release() {
}
}

absl::Status CLProgram::GetBinary(std::vector<uint8_t>* result) const {
Status CLProgram::GetBinary(std::vector<uint8_t>* result) const {
size_t binary_size;
RETURN_IF_ERROR(GetBinarySize(program_, &binary_size));
result->resize(result->size() + binary_size);
@ -142,36 +141,35 @@ absl::Status CLProgram::GetBinary(std::vector<uint8_t>* result) const {
cl_int error_code = clGetProgramInfo(program_, CL_PROGRAM_BINARIES,
binary_size, &binary_ptr, nullptr);
if (error_code != CL_SUCCESS) {
return absl::UnknownError(absl::StrCat("Failed to get program binary - ",
CLErrorCodeToString(error_code)));
return UnknownError(absl::StrCat("Failed to get program binary - ",
CLErrorCodeToString(error_code)));
}
return absl::OkStatus();
return OkStatus();
}

absl::Status CreateCLProgram(const std::string& code,
const std::string& compiler_options,
const CLContext& context, const CLDevice& device,
CLProgram* result) {
Status CreateCLProgram(const std::string& code,
const std::string& compiler_options,
const CLContext& context, const CLDevice& device,
CLProgram* result) {
int error_code;
const char* source = code.c_str();

cl_program program = clCreateProgramWithSource(context.context(), 1, &source,
nullptr, &error_code);
if (!program || error_code != CL_SUCCESS) {
return absl::UnknownError(
absl::StrCat("Failed to create compute program - ",
CLErrorCodeToString(error_code)));
return UnknownError(absl::StrCat("Failed to create compute program - ",
CLErrorCodeToString(error_code)));
}

*result = CLProgram(program, device.id());
RETURN_IF_ERROR(BuildProgram(program, device, compiler_options));
return absl::OkStatus();
return OkStatus();
}

absl::Status CreateCLProgramFromBinary(const CLContext& context,
const CLDevice& device,
absl::Span<const uint8_t> binary,
CLProgram* result) {
Status CreateCLProgramFromBinary(const CLContext& context,
const CLDevice& device,
absl::Span<const uint8_t> binary,
CLProgram* result) {
cl_int binary_status;
cl_int error_code;
cl_device_id devices_list[] = {device.id()};
@ -181,13 +179,13 @@ absl::Status CreateCLProgramFromBinary(const CLContext& context,
context.context(), 1, devices_list, &binary_size, &binary_pointer,
&binary_status, &error_code);
if (binary_status != CL_SUCCESS) {
return absl::UnknownError(absl::StrCat(
return UnknownError(absl::StrCat(
"Something wrong with binary after clCreateProgramWithBinary - ",
binary_status));
}
if (error_code != CL_SUCCESS) {
return absl::UnknownError(absl::StrCat("Failed to create program - ",
CLErrorCodeToString(error_code)));
return UnknownError(absl::StrCat("Failed to create program - ",
CLErrorCodeToString(error_code)));
}
*result = CLProgram(program, device.id());
return BuildProgram(program, device, "");

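Every .cc hunk above leans on RETURN_IF_ERROR to propagate absl::Status out of nested OpenCL calls. TFLite GPU ships its own macro in its status header; what follows is only a conventional sketch of the shape such a macro takes, with stand-in helper functions assumed for illustration:

// Hedged sketch of a RETURN_IF_ERROR-style macro; not TFLite's exact definition.
#include "absl/status/status.h"

#define RETURN_IF_ERROR(expr)            \
  do {                                   \
    const absl::Status _status = (expr); \
    if (!_status.ok()) return _status;   \
  } while (0)

// Stand-in helpers, assumed purely for illustration.
absl::Status LoadAndBuild() { return absl::OkStatus(); }
absl::Status Warmup() { return absl::OkStatus(); }

absl::Status Initialize() {
  RETURN_IF_ERROR(LoadAndBuild());  // early-returns on the first failure
  RETURN_IF_ERROR(Warmup());
  return absl::OkStatus();
}
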
@ -68,7 +68,7 @@ class CLProgram {
// was created using clCreateProgramWithBinary.
cl_device_id GetDeviceId() const { return device_id_; }

absl::Status GetBinary(std::vector<uint8_t>* result) const;
Status GetBinary(std::vector<uint8_t>* result) const;

private:
void Release();
@ -79,15 +79,15 @@ class CLProgram {
cl_device_id device_id_ = nullptr;
};

absl::Status CreateCLProgram(const std::string& code,
const std::string& compiler_options,
const CLContext& context, const CLDevice& device,
CLProgram* result);
Status CreateCLProgram(const std::string& code,
const std::string& compiler_options,
const CLContext& context, const CLDevice& device,
CLProgram* result);

absl::Status CreateCLProgramFromBinary(const CLContext& context,
const CLDevice& device,
absl::Span<const uint8_t> binary,
CLProgram* result);
Status CreateCLProgramFromBinary(const CLContext& context,
const CLDevice& device,
absl::Span<const uint8_t> binary,
CLProgram* result);

} // namespace cl
} // namespace gpu

@ -21,15 +21,15 @@ namespace tflite {
namespace gpu {
namespace cl {

absl::Status EglSync::NewFence(EGLDisplay display, EglSync* sync) {
Status EglSync::NewFence(EGLDisplay display, EglSync* sync) {
EGLSyncKHR egl_sync;
RETURN_IF_ERROR(TFLITE_GPU_CALL_EGL(eglCreateSyncKHR, &egl_sync, display,
EGL_SYNC_FENCE_KHR, nullptr));
if (egl_sync == EGL_NO_SYNC_KHR) {
return absl::InternalError("Returned empty KHR EGL sync");
return InternalError("Returned empty KHR EGL sync");
}
*sync = EglSync(display, egl_sync);
return absl::OkStatus();
return OkStatus();
}

EglSync& EglSync::operator=(EglSync&& sync) {
@ -48,23 +48,22 @@ void EglSync::Invalidate() {
}
}

absl::Status EglSync::ServerWait() {
Status EglSync::ServerWait() {
EGLint result;
RETURN_IF_ERROR(
TFLITE_GPU_CALL_EGL(eglWaitSyncKHR, &result, display_, sync_, 0));
return result == EGL_TRUE ? absl::OkStatus()
: absl::InternalError("eglWaitSync failed");
return result == EGL_TRUE ? OkStatus() : InternalError("eglWaitSync failed");
}

absl::Status EglSync::ClientWait() {
Status EglSync::ClientWait() {
EGLint result;
// TODO(akulik): make it active wait for better performance
RETURN_IF_ERROR(TFLITE_GPU_CALL_EGL(eglClientWaitSyncKHR, &result, display_,
sync_, EGL_SYNC_FLUSH_COMMANDS_BIT_KHR,
EGL_FOREVER_KHR));
return result == EGL_CONDITION_SATISFIED_KHR
? absl::OkStatus()
: absl::InternalError("eglClientWaitSync failed");
? OkStatus()
: InternalError("eglClientWaitSync failed");
}

} // namespace cl

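For orientation, the three EglSync entry points above split cleanly: NewFence inserts a fence sync behind the commands currently queued in GL, ServerWait makes the GPU wait, and ClientWait blocks the calling thread. A hedged usage sketch, assuming the RETURN_IF_ERROR macro and the EglSync API exactly as declared in the header hunks below:

// Hedged sketch: fence the currently queued GL work and wait for it on the
// CPU. Error handling beyond status propagation is omitted.
absl::Status WaitForPendingGlWork(EGLDisplay display) {
  EglSync sync;
  RETURN_IF_ERROR(EglSync::NewFence(display, &sync));  // insert the fence
  return sync.ClientWait();  // block this thread until the fence signals
}
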
@ -32,7 +32,7 @@ class EglSync {
// flushed.
//
// Depends on EGL_KHR_fence_sync extension.
static absl::Status NewFence(EGLDisplay display, EglSync* sync);
static Status NewFence(EGLDisplay display, EglSync* sync);

// Creates invalid object.
EglSync() : EglSync(EGL_NO_DISPLAY, EGL_NO_SYNC_KHR) {}
@ -50,10 +50,10 @@ class EglSync {

// Causes GPU to block and wait until this sync has been signaled.
// This call does not block and returns immediately.
absl::Status ServerWait();
Status ServerWait();

// Causes CPU to block and wait until this sync has been signaled.
absl::Status ClientWait();
Status ClientWait();

// Returns the EGLDisplay on which this instance was created.
EGLDisplay display() const { return display_; }

@ -26,7 +26,6 @@ namespace tflite {
namespace gpu {
namespace cl {
namespace {

std::string GetKernelOneLayerTextureArray() {
return R"(

@ -44,12 +43,12 @@ __kernel void main_function(__write_only image2d_array_t dst) {
// texture, we will get zeroes instead of actual values.
// The same kernel will work, if we use texture array with more than one layer.
// With help of this code we can detect this bug.
absl::Status CheckKernelSupportOfOneLayerTextureArray(Environment* env,
bool* result) {
Status CheckKernelSupportOfOneLayerTextureArray(Environment* env,
bool* result) {
// No bug on Adreno 6xx
if (env->device().GetInfo().adreno_info.gpu_version >= 600) {
*result = true;
return absl::OkStatus();
return OkStatus();
}
CLKernel kernel;
RETURN_IF_ERROR(env->program_cache()->GetOrCreateCLKernel(
@ -76,12 +75,12 @@ absl::Status CheckKernelSupportOfOneLayerTextureArray(Environment* env,
break;
}
}
return absl::OkStatus();
return OkStatus();
}

absl::Status CreateEnvironment(Environment* result, bool shared,
cl_context_properties egl_context,
cl_context_properties egl_display) {
Status CreateEnvironment(Environment* result, bool shared,
cl_context_properties egl_context,
cl_context_properties egl_display) {
CLDevice gpu;
RETURN_IF_ERROR(CreateDefaultGPUDevice(&gpu));

@ -108,9 +107,8 @@ absl::Status CreateEnvironment(Environment* result, bool shared,
}
}

return absl::OkStatus();
return OkStatus();
}

} // namespace

Environment::Environment(CLDevice&& device, CLContext&& context,
@ -139,7 +137,7 @@ Environment& Environment::operator=(Environment&& environment) {
return *this;
}

absl::Status Environment::Init() {
Status Environment::Init() {
if (device().IsAdreno() && device().SupportsTextureArray()) {
bool supports_one_layer;
RETURN_IF_ERROR(
@ -148,7 +146,7 @@ absl::Status Environment::Init() {
GetDevicePtr()->DisableOneLayerTextureArray();
}
}
return absl::OkStatus();
return OkStatus();
}

void Environment::SetHighPerformance() const {
@ -268,7 +266,7 @@ TensorStorageType GetStorageTypeWithMinimalMemoryConsumption(
return TensorStorageType::BUFFER;
}

absl::Status CreateEnvironment(Environment* result) {
Status CreateEnvironment(Environment* result) {
CLDevice gpu;
RETURN_IF_ERROR(CreateDefaultGPUDevice(&gpu));

@ -57,7 +57,7 @@ class Environment {
std::vector<TensorStorageType> GetSupportedStorages() const;
bool IsSupported(TensorStorageType storage_type) const;

absl::Status Init();
Status Init();

void SetHighPerformance() const;
void SetDefaultPerformance() const;
@ -75,7 +75,7 @@ TensorStorageType GetFastestStorageType(const CLDevice& gpu);
TensorStorageType GetStorageTypeWithMinimalMemoryConsumption(
const CLDevice& gpu);

absl::Status CreateEnvironment(Environment* result);
Status CreateEnvironment(Environment* result);

} // namespace cl
} // namespace gpu

@ -41,11 +41,10 @@ PFNEGLCREATESYNCPROC g_eglCreateSync = nullptr;

} // namespace

absl::Status CreateEglSyncFromClEvent(cl_event event, EGLDisplay display,
EglSync* sync) {
Status CreateEglSyncFromClEvent(cl_event event, EGLDisplay display,
EglSync* sync) {
if (!IsEglSyncFromClEventSupported()) {
return absl::UnimplementedError(
"CreateEglSyncFromClEvent is not supported");
return UnimplementedError("CreateEglSyncFromClEvent is not supported");
}
EGLSync egl_sync;
const EGLAttrib attributes[] = {EGL_CL_EVENT_HANDLE,
@ -53,10 +52,10 @@ absl::Status CreateEglSyncFromClEvent(cl_event event, EGLDisplay display,
RETURN_IF_ERROR(TFLITE_GPU_CALL_EGL(g_eglCreateSync, &egl_sync, display,
EGL_SYNC_CL_EVENT, attributes));
if (egl_sync == EGL_NO_SYNC) {
return absl::InternalError("Returned empty EGL sync");
return InternalError("Returned empty EGL sync");
}
*sync = EglSync(display, egl_sync);
return absl::OkStatus();
return OkStatus();
}

bool IsEglSyncFromClEventSupported() {
@ -74,54 +73,52 @@ bool IsEglSyncFromClEventSupported() {
return supported;
}

absl::Status CreateClEventFromEglSync(cl_context context,
const EglSync& egl_sync, CLEvent* event) {
Status CreateClEventFromEglSync(cl_context context, const EglSync& egl_sync,
CLEvent* event) {
cl_int error_code;
cl_event new_event = clCreateEventFromEGLSyncKHR(
context, egl_sync.sync(), egl_sync.display(), &error_code);
if (error_code != CL_SUCCESS) {
return absl::InternalError(
return InternalError(
absl::StrCat("Unable to create CL sync from EGL sync. ",
CLErrorCodeToString(error_code)));
}
*event = CLEvent(new_event);
return absl::OkStatus();
return OkStatus();
}

bool IsClEventFromEglSyncSupported(const CLDevice& device) {
return device.SupportsExtension("cl_khr_egl_event");
}

absl::Status CreateClMemoryFromGlBuffer(GLuint gl_ssbo_id,
AccessType access_type,
CLContext* context, CLMemory* memory) {
Status CreateClMemoryFromGlBuffer(GLuint gl_ssbo_id, AccessType access_type,
CLContext* context, CLMemory* memory) {
cl_int error_code;
auto mem = clCreateFromGLBuffer(context->context(), ToClMemFlags(access_type),
gl_ssbo_id, &error_code);
if (error_code != CL_SUCCESS) {
return absl::InternalError(
return InternalError(
absl::StrCat("Unable to acquire CL buffer from GL buffer. ",
CLErrorCodeToString(error_code)));
}
*memory = CLMemory(mem, true);
return absl::OkStatus();
return OkStatus();
}

absl::Status CreateClMemoryFromGlTexture(GLenum texture_target,
GLuint texture_id,
AccessType access_type,
CLContext* context, CLMemory* memory) {
Status CreateClMemoryFromGlTexture(GLenum texture_target, GLuint texture_id,
AccessType access_type, CLContext* context,
CLMemory* memory) {
cl_int error_code;
auto mem =
clCreateFromGLTexture(context->context(), ToClMemFlags(access_type),
texture_target, 0, texture_id, &error_code);
if (error_code != CL_SUCCESS) {
return absl::InternalError(
return InternalError(
absl::StrCat("Unable to create CL buffer from GL texture. ",
CLErrorCodeToString(error_code)));
}
*memory = CLMemory(mem, true);
return absl::OkStatus();
return OkStatus();
}

bool IsGlSharingSupported(const CLDevice& device) {
@ -131,18 +128,19 @@ bool IsGlSharingSupported(const CLDevice& device) {

AcquiredGlObjects::~AcquiredGlObjects() { Release({}, nullptr).IgnoreError(); }

absl::Status AcquiredGlObjects::Acquire(
const std::vector<cl_mem>& memory, cl_command_queue queue,
const std::vector<cl_event>& wait_events, CLEvent* acquire_event,
AcquiredGlObjects* objects) {
Status AcquiredGlObjects::Acquire(const std::vector<cl_mem>& memory,
cl_command_queue queue,
const std::vector<cl_event>& wait_events,
CLEvent* acquire_event,
AcquiredGlObjects* objects) {
if (!memory.empty()) {
cl_event new_event;
cl_int error_code = clEnqueueAcquireGLObjects(
queue, memory.size(), memory.data(), wait_events.size(),
wait_events.data(), acquire_event ? &new_event : nullptr);
if (error_code != CL_SUCCESS) {
return absl::InternalError(absl::StrCat("Unable to acquire GL object. ",
CLErrorCodeToString(error_code)));
return InternalError(absl::StrCat("Unable to acquire GL object. ",
CLErrorCodeToString(error_code)));
}
if (acquire_event) {
*acquire_event = CLEvent(new_event);
@ -150,19 +148,19 @@ absl::Status AcquiredGlObjects::Acquire(
clFlush(queue);
}
*objects = AcquiredGlObjects(memory, queue);
return absl::OkStatus();
return OkStatus();
}

absl::Status AcquiredGlObjects::Release(
const std::vector<cl_event>& wait_events, CLEvent* release_event) {
Status AcquiredGlObjects::Release(const std::vector<cl_event>& wait_events,
CLEvent* release_event) {
if (queue_ && !memory_.empty()) {
cl_event new_event;
cl_int error_code = clEnqueueReleaseGLObjects(
queue_, memory_.size(), memory_.data(), wait_events.size(),
wait_events.data(), release_event ? &new_event : nullptr);
if (error_code != CL_SUCCESS) {
return absl::InternalError(absl::StrCat("Unable to release GL object. ",
CLErrorCodeToString(error_code)));
return InternalError(absl::StrCat("Unable to release GL object. ",
CLErrorCodeToString(error_code)));
}
if (release_event) {
*release_event = CLEvent(new_event);
@ -170,7 +168,7 @@ absl::Status AcquiredGlObjects::Release(
clFlush(queue_);
queue_ = nullptr;
}
return absl::OkStatus();
return OkStatus();
}

GlInteropFabric::GlInteropFabric(EGLDisplay egl_display,
@ -194,9 +192,9 @@ void GlInteropFabric::UnregisterMemory(cl_mem memory) {
}
}

absl::Status GlInteropFabric::Start() {
Status GlInteropFabric::Start() {
if (!is_enabled()) {
return absl::OkStatus();
return OkStatus();
}

// In GL-CL interoperability, we need to make sure GL finished processing of
@ -237,9 +235,9 @@ absl::Status GlInteropFabric::Start() {
nullptr, &gl_objects_);
}

absl::Status GlInteropFabric::Finish() {
Status GlInteropFabric::Finish() {
if (!is_enabled()) {
return absl::OkStatus();
return OkStatus();
}
RETURN_IF_ERROR(gl_objects_.Release({}, &outbound_event_));

@ -260,7 +258,7 @@ absl::Status GlInteropFabric::Finish() {
// This slow sync is the only working solution right now. We have to debug why
// above version is not working fast and reliable.
outbound_event_.Wait();
return absl::OkStatus();
return OkStatus();
}

} // namespace cl

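The Acquire/Release pair above wraps clEnqueueAcquireGLObjects and clEnqueueReleaseGLObjects, which must bracket any OpenCL use of GL-shared memory. A hedged sketch of the intended call pattern; queue and shared_mem are assumed inputs, and the CL kernels in between are elided:

// Hedged sketch: bracketing CL work on GL-shared buffers with the
// Acquire/Release API above.
absl::Status RunOnSharedBuffers(cl_command_queue queue,
                                const std::vector<cl_mem>& shared_mem) {
  AcquiredGlObjects objects;
  // Take ownership of the GL objects for OpenCL; no extra wait events.
  RETURN_IF_ERROR(AcquiredGlObjects::Acquire(shared_mem, queue,
                                             /*wait_events=*/{},
                                             /*acquire_event=*/nullptr,
                                             &objects));
  // ... enqueue CL kernels that read/write shared_mem here ...
  // Hand the objects back to GL before GL touches them again.
  return objects.Release(/*wait_events=*/{}, /*release_event=*/nullptr);
}
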
@ -39,8 +39,8 @@ namespace cl {
// returned sync and could be safely destroyed.
//
// Depends on EGL 1.5.
absl::Status CreateEglSyncFromClEvent(cl_event event, EGLDisplay display,
EglSync* sync);
Status CreateEglSyncFromClEvent(cl_event event, EGLDisplay display,
EglSync* sync);

// Returns true if 'CreateEglSyncFromClEvent' is supported.
bool IsEglSyncFromClEventSupported();
@ -48,22 +48,20 @@ bool IsEglSyncFromClEventSupported();
// Creates CL event from EGL sync.
// Created event could only be consumed by AcquiredGlObject::Acquire call as
// a 'wait_event'.
absl::Status CreateClEventFromEglSync(cl_context context,
const EglSync& egl_sync, CLEvent* event);
Status CreateClEventFromEglSync(cl_context context, const EglSync& egl_sync,
CLEvent* event);

// Returns true if 'CreateClEventFromEglSync' is supported.
bool IsClEventFromEglSyncSupported(const CLDevice& device);

// Creates new CL memory object from OpenGL buffer.
absl::Status CreateClMemoryFromGlBuffer(GLuint gl_ssbo_id,
AccessType access_type,
CLContext* context, CLMemory* memory);
Status CreateClMemoryFromGlBuffer(GLuint gl_ssbo_id, AccessType access_type,
CLContext* context, CLMemory* memory);

// Creates new CL memory object from OpenGL texture.
absl::Status CreateClMemoryFromGlTexture(GLenum texture_target,
GLuint texture_id,
AccessType access_type,
CLContext* context, CLMemory* memory);
Status CreateClMemoryFromGlTexture(GLenum texture_target, GLuint texture_id,
AccessType access_type, CLContext* context,
CLMemory* memory);

// Returns true if GL objects could be shared with OpenCL context.
bool IsGlSharingSupported(const CLDevice& device);
@ -83,16 +81,16 @@ class AcquiredGlObjects {
// CreateClMemoryFromGlBuffer or CreateClMemoryFromGlTexture calls.
// If 'acquire_event' is not nullptr, it will be signaled once acquisition is
// complete.
static absl::Status Acquire(const std::vector<cl_mem>& memory,
cl_command_queue queue,
const std::vector<cl_event>& wait_events,
CLEvent* acquire_event /* optional */,
AcquiredGlObjects* objects);
static Status Acquire(const std::vector<cl_mem>& memory,
cl_command_queue queue,
const std::vector<cl_event>& wait_events,
CLEvent* acquire_event /* optional */,
AcquiredGlObjects* objects);

// Releases OpenCL memory back to OpenGL context. If 'release_event' is not
// nullptr, it will be signalled once release is complete.
absl::Status Release(const std::vector<cl_event>& wait_events,
CLEvent* release_event /* optional */);
Status Release(const std::vector<cl_event>& wait_events,
CLEvent* release_event /* optional */);

private:
AcquiredGlObjects(const std::vector<cl_mem>& memory, cl_command_queue queue)
@ -110,10 +108,10 @@ class GlInteropFabric {

// Ensures proper GL->CL synchronization is in place before
// GL objects that are mapped to CL objects are used.
absl::Status Start();
Status Start();

// Puts appropriate CL->GL synchronization after all work is complete.
absl::Status Finish();
Status Finish();

// Registers memory to be used from GL context. Such CL memory object must
// be created with CreateClMemoryFromGlBuffer or CreateClMemoryFromGlTexture

@ -87,8 +87,8 @@ class Delegate {
}
}

absl::Status Prepare(TfLiteContext* context,
const TfLiteDelegateParams* delegate_params) {
Status Prepare(TfLiteContext* context,
const TfLiteDelegateParams* delegate_params) {
// Extract TFLite delegate execution plan from the context and convert it
// into FlowGraph32.
GraphFloat32 graph;
@ -98,7 +98,7 @@ class Delegate {
NullTransformationReporter reporter;
ModelTransformer transformer(&graph, &reporter);
if (!ApplyGeneralTransformations(&transformer)) {
return absl::InternalError("Graph general transformations failed");
return InternalError("Graph general transformations failed");
}

InferenceEnvironmentOptions env_options;
@ -108,7 +108,7 @@ class Delegate {
options_.serialized_binary_cache_data,
options_.serialized_binary_cache_size};
InferenceEnvironmentProperties properties;
absl::Status status =
Status status =
NewInferenceEnvironment(env_options, &environment_, &properties);
if (!properties.is_opencl_available) {
context->ReportError(context,
@ -200,7 +200,7 @@ class Delegate {
return builder->Build(&runner_);
}

absl::Status SetInputsAndOutputs(TfLiteContext* context) {
Status SetInputsAndOutputs(TfLiteContext* context) {
int i = 0;
for (auto index : input_indices_) {
RETURN_IF_ERROR(
@ -211,10 +211,10 @@ class Delegate {
RETURN_IF_ERROR(
runner_->SetOutputObject(i++, GetTensorObject(index, context)));
}
return absl::OkStatus();
return OkStatus();
}

absl::Status Invoke(TfLiteContext* context) {
Status Invoke(TfLiteContext* context) {
RETURN_IF_ERROR(SetInputsAndOutputs(context));
return runner_->Run();
}
@ -310,7 +310,7 @@ TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) {
const auto status = gpu_delegate->Prepare(context, params);
if (!status.ok()) {
context->ReportError(context, "TfLiteGpuDelegate Init: %s",
std::string(status.message()).c_str());
status.error_message().c_str());
return nullptr;
}
return gpu_delegate;
@ -335,7 +335,7 @@ TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) {
const auto status = GetDelegate(node)->Invoke(context);
if (!status.ok()) {
context->ReportError(context, "TfLiteGpuDelegate Invoke: %s",
std::string(status.message()).c_str());
status.error_message().c_str());
return kTfLiteError;
}
return kTfLiteOk;

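One detail worth calling out in the ReportError hunks above: absl::Status::message() returns an absl::string_view, which is not guaranteed to be NUL-terminated, whereas the old tflite::gpu::Status::error_message() returned a std::string. That is why the new code materializes an explicit std::string before calling c_str() on the way into the printf-style TfLiteContext::ReportError:

// Pattern from the diff above: copy the string_view before c_str().
if (!status.ok()) {
  context->ReportError(context, "TfLiteGpuDelegate Invoke: %s",
                       std::string(status.message()).c_str());
  return kTfLiteError;
}
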
@ -169,9 +169,9 @@ CLNode& CLNode::operator=(CLNode&& node) {
return *this;
}

absl::Status InferenceContext::InitFromGraph(
const CreateInferenceInfo& create_info, const GraphFloat32& graph,
Environment* env) {
Status InferenceContext::InitFromGraph(const CreateInferenceInfo& create_info,
const GraphFloat32& graph,
Environment* env) {
CreationContext creation_context;
creation_context.device = env->GetDevicePtr();
creation_context.context = &env->context();
@ -206,15 +206,15 @@ absl::Status InferenceContext::InitFromGraph(
tuning_parameters.tuning_type = TuningType::FAST;
}
RETURN_IF_ERROR(Tune(tuning_parameters));
return absl::OkStatus();
return OkStatus();
}

absl::Status InferenceContext::InitFromGraphWithTransforms(
Status InferenceContext::InitFromGraphWithTransforms(
const CreateInferenceInfo& create_info, GraphFloat32* graph,
Environment* env) {
RETURN_IF_ERROR(RunGraphTransforms(graph));
RETURN_IF_ERROR(InitFromGraph(create_info, *graph, env));
return absl::OkStatus();
return OkStatus();
}

void InferenceContext::CopyInAndOutIds(const GraphFloat32& graph) {
@ -258,7 +258,7 @@ void InferenceContext::ReserveGraphTensors(
tensor_reserver_.SetNext(max_id + 1);
}

absl::Status InferenceContext::ConvertOperations(
Status InferenceContext::ConvertOperations(
const CreationContext& creation_context, const GraphFloat32& graph,
ModelHints hints) {
std::vector<Node*> graph_nodes = graph.nodes();
@ -343,7 +343,7 @@ absl::Status InferenceContext::ConvertOperations(
}
}

return absl::OkStatus();
return OkStatus();
}

void InferenceContext::Merge() {
@ -424,15 +424,15 @@ void InferenceContext::GetUsages(
}
}

absl::Status InferenceContext::AllocateMemory(const CLDevice& device,
CLContext* context) {
Status InferenceContext::AllocateMemory(const CLDevice& device,
CLContext* context) {
RETURN_IF_ERROR(AllocateMemoryForBuffers(device, context));
RETURN_IF_ERROR(AllocateMemoryForStrongShapes(device, context));
return absl::OkStatus();
return OkStatus();
}

absl::Status InferenceContext::AllocateMemoryForBuffers(const CLDevice& device,
CLContext* context) {
Status InferenceContext::AllocateMemoryForBuffers(const CLDevice& device,
CLContext* context) {
std::map<ValueId, int2> buffer_usages;
GetUsages(
[](const TensorDescriptor& t) { return IsBufferBased(t.storage_type); },
@ -480,11 +480,11 @@ absl::Status InferenceContext::AllocateMemoryForBuffers(const CLDevice& device,
created_tensors[tensor_index] = true;
}
}
return absl::OkStatus();
return OkStatus();
}

absl::Status InferenceContext::AllocateMemoryForStrongShapes(
const CLDevice& device, CLContext* context) {
Status InferenceContext::AllocateMemoryForStrongShapes(const CLDevice& device,
CLContext* context) {
std::map<ValueId, int2> usages;
GetUsages(
[](const TensorDescriptor& t) { return !IsBufferBased(t.storage_type); },
@ -517,7 +517,7 @@ absl::Status InferenceContext::AllocateMemoryForStrongShapes(
}
}
}
return absl::OkStatus();
return OkStatus();
}

void InferenceContext::BindMemoryToOperations() {
@ -539,22 +539,21 @@ void InferenceContext::BindMemoryToOperations() {
}
}

absl::Status InferenceContext::Compile(
const CreationContext& creation_context) {
Status InferenceContext::Compile(const CreationContext& creation_context) {
for (auto& node : nodes_) {
RETURN_IF_ERROR(node.operations[0]->Compile(creation_context));
}
return absl::OkStatus();
return OkStatus();
}

absl::Status InferenceContext::Tune(const TuningParameters& tuning_parameters) {
Status InferenceContext::Tune(const TuningParameters& tuning_parameters) {
for (auto& node : nodes_) {
RETURN_IF_ERROR(node.operations[0]->Tune(tuning_parameters));
}
return absl::OkStatus();
return OkStatus();
}

absl::Status InferenceContext::AddToQueue(CLCommandQueue* queue) {
Status InferenceContext::AddToQueue(CLCommandQueue* queue) {
if (need_manual_release_) {
if (prev_enqueue_start_point_.is_valid()) {
prev_enqueue_start_point_.Wait();
@ -572,11 +571,11 @@ absl::Status InferenceContext::AddToQueue(CLCommandQueue* queue) {
if (need_flush_) {
clFlush(queue->queue());
}
return absl::OkStatus();
return OkStatus();
}

absl::Status InferenceContext::Profile(ProfilingCommandQueue* queue,
ProfilingInfo* result) {
Status InferenceContext::Profile(ProfilingCommandQueue* queue,
ProfilingInfo* result) {
queue->ResetMeasurements();
for (auto& node : nodes_) {
queue->SetEventsLabel(node.name);
@ -584,7 +583,7 @@ absl::Status InferenceContext::Profile(ProfilingCommandQueue* queue,
}
RETURN_IF_ERROR(queue->WaitForCompletion());
*result = queue->GetProfilingInfo();
return absl::OkStatus();
return OkStatus();
}

uint64_t InferenceContext::GetSizeOfMemoryAllocatedForIntermediateTensors()
@ -609,15 +608,13 @@ Tensor* InferenceContext::GetTensor(ValueId id) {
}
}

absl::Status InferenceContext::SetInputTensor(ValueId id,
const TensorFloat32& tensor,
CLCommandQueue* queue) {
Status InferenceContext::SetInputTensor(ValueId id, const TensorFloat32& tensor,
CLCommandQueue* queue) {
return GetTensor(id)->WriteData(queue, tensor);
}

absl::Status InferenceContext::GetOutputTensor(ValueId id,
CLCommandQueue* queue,
TensorFloat32* result) {
Status InferenceContext::GetOutputTensor(ValueId id, CLCommandQueue* queue,
TensorFloat32* result) {
const auto& gpu_tensor = *GetTensor(id);
const auto dst_shape = BHWC(gpu_tensor.Batch(), gpu_tensor.Height(),
gpu_tensor.Width(), gpu_tensor.Channels());
@ -627,17 +624,17 @@ absl::Status InferenceContext::GetOutputTensor(ValueId id,
return gpu_tensor.ReadData(queue, result);
}

absl::Status RunGraphTransforms(GraphFloat32* graph) {
Status RunGraphTransforms(GraphFloat32* graph) {
auto merge_padding_transform = NewMergePaddingWithAdd();
auto add_bias_transform = NewAddBias();
ModelTransformer transformer(graph, /*reporter=*/nullptr);
if (!transformer.Apply("add_bias", add_bias_transform.get())) {
return absl::InternalError("Invalid add_bias transform");
return InternalError("Invalid add_bias transform");
}
if (!transformer.Apply("merge_padding", merge_padding_transform.get())) {
return absl::InternalError("Invalid merge_padding transform");
return InternalError("Invalid merge_padding transform");
}
return absl::OkStatus();
return OkStatus();
}

} // namespace cl

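InferenceContext::Profile above labels each node's CL events, replays the whole graph on a profiling queue, and only reads timings back after WaitForCompletion. A hedged driver sketch; constructing the profiling queue is environment-specific and elided:

// Hedged sketch of driving the profiling path; ProfilingInfo is assumed to
// carry the per-node GPU timings collected via the labeled CL events.
absl::Status ProfileOnce(InferenceContext* context,
                         ProfilingCommandQueue* profiling_queue) {
  ProfilingInfo info;
  RETURN_IF_ERROR(context->Profile(profiling_queue, &info));
  // ... inspect or log per-node timings from info here ...
  return absl::OkStatus();
}
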
@ -65,55 +65,53 @@ class InferenceContext {
TensorStorageType storage_type;
ModelHints hints;
};
absl::Status InitFromGraph(const CreateInferenceInfo& create_info,
const GraphFloat32& graph, Environment* env);
Status InitFromGraph(const CreateInferenceInfo& create_info,
const GraphFloat32& graph, Environment* env);

// Applies OpenCL-specific transformations to the graph before the
// initialization. These transformations are either impossible or useless in
// other backends.
absl::Status InitFromGraphWithTransforms(
const CreateInferenceInfo& create_info, GraphFloat32* graph,
Environment* env);
Status InitFromGraphWithTransforms(const CreateInferenceInfo& create_info,
GraphFloat32* graph, Environment* env);

absl::Status AddToQueue(CLCommandQueue* queue);
absl::Status Profile(ProfilingCommandQueue* queue, ProfilingInfo* result);
Status AddToQueue(CLCommandQueue* queue);
Status Profile(ProfilingCommandQueue* queue, ProfilingInfo* result);
// for profiling and memory statistics
uint64_t GetSizeOfMemoryAllocatedForIntermediateTensors() const;

absl::Status SetInputTensor(ValueId id, const TensorFloat32& tensor,
CLCommandQueue* queue);
Status SetInputTensor(ValueId id, const TensorFloat32& tensor,
CLCommandQueue* queue);

// It will work only with input/output tensor ids. For all other ids we don't
// have any guarantees.
Tensor* GetTensor(ValueId id);

absl::Status GetOutputTensor(ValueId id, CLCommandQueue* queue,
TensorFloat32* result);
Status GetOutputTensor(ValueId id, CLCommandQueue* queue,
TensorFloat32* result);

private:
void CopyInAndOutIds(const GraphFloat32& graph);
absl::Status ConvertOperations(const CreationContext& creation_context,
const GraphFloat32& graph, ModelHints hints);
Status ConvertOperations(const CreationContext& creation_context,
const GraphFloat32& graph, ModelHints hints);
void CreateLinks();
void ReserveGraphTensors(const CreateInferenceInfo& create_info,
const CreationContext& creation_context,
const GraphFloat32& graph);
void Merge();
absl::Status AllocateMemory(const CLDevice& device, CLContext* context);
Status AllocateMemory(const CLDevice& device, CLContext* context);

absl::Status AllocateMemoryForBuffers(const CLDevice& device,
CLContext* context);
Status AllocateMemoryForBuffers(const CLDevice& device, CLContext* context);

absl::Status AllocateMemoryForStrongShapes(const CLDevice& device,
CLContext* context);
Status AllocateMemoryForStrongShapes(const CLDevice& device,
CLContext* context);

// utility function
void GetUsages(const std::function<bool(const TensorDescriptor&)>& functor,
std::map<ValueId, int2>* usages);

void BindMemoryToOperations();
absl::Status Compile(const CreationContext& creation_context);
absl::Status Tune(const TuningParameters& tuning_parameters);
Status Compile(const CreationContext& creation_context);
Status Tune(const TuningParameters& tuning_parameters);

// performance hacks
bool need_flush_ = false;
@ -177,7 +175,7 @@ class InferenceContext {
};

// Runs OpenCL specific transforms for the graph.
absl::Status RunGraphTransforms(GraphFloat32* graph);
Status RunGraphTransforms(GraphFloat32* graph);

} // namespace cl
} // namespace gpu

@ -143,17 +143,17 @@ std::string Add::GetArgsDeclaration() const {
return args;
}

absl::Status Add::BindArguments(CLKernel* kernel) {
Status Add::BindArguments(CLKernel* kernel) {
for (int i = 1; i < src_depthes_.size(); ++i) {
RETURN_IF_ERROR(kernel->SetMemoryAuto(src_[i]->GetMemoryPtr()));
}
for (int i = 1; i < src_depthes_.size(); ++i) {
RETURN_IF_ERROR(kernel->SetBytesAuto(src_[i]->GetWBatchedHSB()));
}
return absl::OkStatus();
return OkStatus();
}

absl::Status Add::Compile(const CreationContext& creation_context) {
Status Add::Compile(const CreationContext& creation_context) {
const auto code = GetElementWiseCode(definition_, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,

@ -36,7 +36,7 @@ class Add : public ElementwiseOperation {
Add(const OperationDef& definition, const std::vector<int>& channels,
int dst_channels);

absl::Status Compile(const CreationContext& creation_context) override;
Status Compile(const CreationContext& creation_context) override;

// Move only
Add(Add&& operation);
@ -47,7 +47,7 @@ class Add : public ElementwiseOperation {
void SetLinkIndex(int index) override;
std::string GetCoreCode(const LinkingContext& context) const override;
std::string GetArgsDeclaration() const override;
absl::Status BindArguments(CLKernel* kernel) override;
Status BindArguments(CLKernel* kernel) override;

private:
std::string GetElementWiseCode(

@ -21,17 +21,17 @@ namespace tflite {
namespace gpu {
namespace cl {

absl::Status ExecuteGPUOperation(const std::vector<TensorFloat32>& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation,
const std::vector<BHWC>& dst_sizes,
const std::vector<TensorFloat32*>& dst_cpu) {
Status ExecuteGPUOperation(const std::vector<TensorFloat32>& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation,
const std::vector<BHWC>& dst_sizes,
const std::vector<TensorFloat32*>& dst_cpu) {
const OperationDef& op_def = operation->GetDefinition();
std::vector<Tensor> src(src_cpu.size());
for (int i = 0; i < src_cpu.size(); ++i) {
auto src_shape = src_cpu[i].shape;
if (src_shape.b != 1 && !op_def.IsBatchSupported()) {
return absl::InvalidArgumentError(
return InvalidArgumentError(
"Layout doesn't have Batch dimension, but shape.b != 1");
}
RETURN_IF_ERROR(CreateTensor(*creation_context.context,
@ -45,7 +45,7 @@ absl::Status ExecuteGPUOperation(const std::vector<TensorFloat32>& src_cpu,
for (int i = 0; i < dst_cpu.size(); ++i) {
auto dst_shape = dst_sizes[i];
if (dst_shape.b != 1 && !op_def.IsBatchSupported()) {
return absl::InvalidArgumentError(
return InvalidArgumentError(
"Layout doesn't have Batch dimension, but shape.b != 1");
}
RETURN_IF_ERROR(CreateTensor(*creation_context.context,
@ -64,22 +64,22 @@ absl::Status ExecuteGPUOperation(const std::vector<TensorFloat32>& src_cpu,
dst_cpu[i]->data = std::vector<float>(dst_sizes[i].DimensionsProduct(), 0);
RETURN_IF_ERROR(dst[i].ReadData(creation_context.queue, dst_cpu[i]));
}
return absl::OkStatus();
return OkStatus();
}

absl::Status ExecuteGPUOperation(const std::vector<TensorFloat32>& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation, const BHWC& dst_size,
TensorFloat32* result) {
Status ExecuteGPUOperation(const std::vector<TensorFloat32>& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation, const BHWC& dst_size,
TensorFloat32* result) {
return ExecuteGPUOperation(
std::vector<TensorFloat32>{src_cpu}, creation_context, operation,
std::vector<BHWC>{dst_size}, std::vector<TensorFloat32*>{result});
}

absl::Status ExecuteGPUOperation(const TensorFloat32& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation, const BHWC& dst_size,
TensorFloat32* result) {
Status ExecuteGPUOperation(const TensorFloat32& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation, const BHWC& dst_size,
TensorFloat32* result) {
return ExecuteGPUOperation(std::vector<TensorFloat32>{src_cpu},
creation_context, operation, dst_size, result);
}

@ -51,21 +51,21 @@ class OpenCLOperationTest : public ::testing::Test {
CreationContext creation_context_;
};

absl::Status ExecuteGPUOperation(const TensorFloat32& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation, const BHWC& dst_size,
TensorFloat32* result);
Status ExecuteGPUOperation(const TensorFloat32& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation, const BHWC& dst_size,
TensorFloat32* result);

absl::Status ExecuteGPUOperation(const std::vector<TensorFloat32>& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation, const BHWC& dst_size,
TensorFloat32* result);
Status ExecuteGPUOperation(const std::vector<TensorFloat32>& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation, const BHWC& dst_size,
TensorFloat32* result);

absl::Status ExecuteGPUOperation(const std::vector<TensorFloat32>& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation,
const std::vector<BHWC>& dst_sizes,
const std::vector<TensorFloat32*>& dst_cpu);
Status ExecuteGPUOperation(const std::vector<TensorFloat32>& src_cpu,
const CreationContext& creation_context,
GPUOperation* operation,
const std::vector<BHWC>& dst_sizes,
const std::vector<TensorFloat32*>& dst_cpu);
} // namespace cl
} // namespace gpu
} // namespace tflite

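The ExecuteGPUOperation overloads above are the backbone of the kernel unit tests: they upload CPU tensors, run one GPUOperation, and read the result back. A hedged sketch of a test built on them; the OperationDef storage/precision setup is elided, and the shapes and fill values are illustrative only, not an actual TFLite test:

// Hedged test sketch; creation_context_ comes from the OpenCLOperationTest
// fixture declared above.
TEST_F(OpenCLOperationTest, AddTwoTensors) {
  TensorFloat32 src;
  src.shape = BHWC(1, 2, 2, 4);
  src.data.resize(src.shape.DimensionsProduct(), 0.5f);

  OperationDef op_def;  // src/dst descriptors and precision elided
  Add operation(op_def, /*channels=*/{4, 4}, /*dst_channels=*/4);

  TensorFloat32 dst;
  ASSERT_TRUE(ExecuteGPUOperation({src, src}, creation_context_, &operation,
                                  BHWC(1, 2, 2, 4), &dst)
                  .ok());
}
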
@ -96,7 +96,7 @@ ConcatXY& ConcatXY::operator=(ConcatXY&& operation) {
return *this;
}

absl::Status ConcatXY::Compile(const CreationContext& creation_context) {
Status ConcatXY::Compile(const CreationContext& creation_context) {
const auto code =
GetConcatKernelCode(definition_, tensors_count_, linked_operations_);
return creation_context.cache->GetOrCreateCLKernel(
@ -104,7 +104,7 @@ absl::Status ConcatXY::Compile(const CreationContext& creation_context) {
*creation_context.device, &kernel_);
}

absl::Status ConcatXY::BindArguments() {
Status ConcatXY::BindArguments() {
kernel_.ResetBindingCounter();
for (int i = 0; i < tensors_count_; ++i) {
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[i]->GetMemoryPtr()));
@ -122,7 +122,7 @@ absl::Status ConcatXY::BindArguments() {
y_offset += attr_.axis == Axis::HEIGHT ? height : 0;
}
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHSB()));
return absl::OkStatus();
return OkStatus();
}

int3 ConcatXY::GetGridSize() const {
@ -140,12 +140,12 @@ int3 ConcatXY::GetGridSize() const {
return int3(grid_x, grid_y, grid_z);
}

absl::Status ConcatXY::Tune(const TuningParameters& params) {
Status ConcatXY::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}

absl::Status ConcatXY::AddToQueue(CLCommandQueue* queue) {
Status ConcatXY::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}

@ -31,10 +31,10 @@ class ConcatXY : public GPUOperation {
ConcatXY(const OperationDef& definition, const ConcatAttributes& attr,
int tensors_count)
: GPUOperation(definition), attr_(attr), tensors_count_(tensors_count) {}
absl::Status AddToQueue(CLCommandQueue* queue) override;
absl::Status Tune(const TuningParameters& params) override;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;

absl::Status Compile(const CreationContext& creation_context) override;
Status Compile(const CreationContext& creation_context) override;

// Move only
ConcatXY(ConcatXY&& operation);
@ -43,7 +43,7 @@ class ConcatXY : public GPUOperation {
ConcatXY& operator=(const ConcatXY&) = delete;

private:
absl::Status BindArguments();
Status BindArguments();
int3 GetGridSize() const;

ConcatAttributes attr_;

@ -25,8 +25,8 @@ limitations under the License.
namespace tflite {
namespace gpu {
namespace cl {
namespace {

namespace {
bool IsAllChannelsX4(const std::vector<int>& channels) {
for (int channel : channels) {
if (channel % 4 != 0) {
@ -146,7 +146,6 @@ std::string GetConcatKernelCode(
c += "}\n";
return c;
}

} // namespace

ConcatZ::ConcatZ(ConcatZ&& kernel)
@ -165,7 +164,7 @@ ConcatZ& ConcatZ::operator=(ConcatZ&& kernel) {
return *this;
}

absl::Status ConcatZ::Compile(const CreationContext& creation_context) {
Status ConcatZ::Compile(const CreationContext& creation_context) {
const auto code =
GetConcatKernelCode(definition_, channels_, linked_operations_);
std::vector<CompilerOptions> options;
@ -187,7 +186,7 @@ absl::Status ConcatZ::Compile(const CreationContext& creation_context) {
*creation_context.device, &kernel_);
}

absl::Status ConcatZ::BindArguments() {
Status ConcatZ::BindArguments() {
kernel_.ResetBindingCounter();
for (int i = 0; i < channels_.size(); ++i) {
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[i]->GetMemoryPtr()));
@ -198,7 +197,7 @@ absl::Status ConcatZ::BindArguments() {
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[i]->Slices()));
}
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHSB()));
return absl::OkStatus();
return OkStatus();
}

int3 ConcatZ::GetGridSize() const {
@ -208,12 +207,12 @@ int3 ConcatZ::GetGridSize() const {
return int3(grid_x, grid_y, grid_z);
}

absl::Status ConcatZ::Tune(const TuningParameters& params) {
Status ConcatZ::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}

absl::Status ConcatZ::AddToQueue(CLCommandQueue* queue) {
Status ConcatZ::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}

@ -32,10 +32,10 @@ class ConcatZ : public GPUOperation {
public:
ConcatZ(const OperationDef& definition, const std::vector<int>& channels)
: GPUOperation(definition), channels_(channels) {}
absl::Status AddToQueue(CLCommandQueue* queue) override;
absl::Status Tune(const TuningParameters& params) override;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;

absl::Status Compile(const CreationContext& creation_context) override;
Status Compile(const CreationContext& creation_context) override;

// Move only
ConcatZ(ConcatZ&& kernel);
@ -44,7 +44,7 @@ class ConcatZ : public GPUOperation {
ConcatZ& operator=(const ConcatZ&) = delete;

private:
absl::Status BindArguments();
Status BindArguments();
int3 GetGridSize() const;

std::vector<int> channels_;

@ -76,7 +76,7 @@ Conv3D& Conv3D::operator=(Conv3D&& operation) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
absl::Status Conv3D::Compile(const CreationContext& creation_context) {
|
||||
Status Conv3D::Compile(const CreationContext& creation_context) {
|
||||
const bool stride_correction =
|
||||
definition_.IsBatchSupported() && stride_.x != 1;
|
||||
const std::string code =
|
||||
@ -92,7 +92,7 @@ absl::Status Conv3D::Compile(const CreationContext& creation_context) {
|
||||
*creation_context.device, &kernel_);
|
||||
}
|
||||
|
||||
absl::Status Conv3D::BindArguments() {
|
||||
Status Conv3D::BindArguments() {
|
||||
kernel_.ResetBindingCounter();
|
||||
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
|
||||
if (conv_params_.AreWeightsBuffer()) {
|
||||
@ -131,7 +131,7 @@ absl::Status Conv3D::BindArguments() {
|
||||
IntegralDivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w)));
|
||||
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHDS()));
|
||||
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHDS()));
|
||||
return absl::OkStatus();
|
||||
return OkStatus();
|
||||
}
|
||||
|
||||
int3 Conv3D::GetGridSize() const {
|
||||
@ -154,12 +154,12 @@ int3 Conv3D::GetGridSize() const {
|
||||
conv_params_.work_group_size.z);
|
||||
}
|
||||
|
||||
absl::Status Conv3D::Tune(const TuningParameters& params) {
|
||||
Status Conv3D::Tune(const TuningParameters& params) {
|
||||
if (conv_params_.weights_upload_type ==
|
||||
WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP ||
|
||||
conv_params_.weights_upload_type ==
|
||||
WeightsUploadType::LOCAL_MEM_BY_THREADS) {
|
||||
return absl::OkStatus();
|
||||
return OkStatus();
|
||||
}
|
||||
if (conv_params_.work_group_launch_order[0] == 0 &&
|
||||
conv_params_.work_group_launch_order[1] == 1 &&
|
||||
@ -168,10 +168,10 @@ absl::Status Conv3D::Tune(const TuningParameters& params) {
|
||||
return GetBestWorkGroupConv(params, kernel_, GetGridSize(),
|
||||
&conv_params_.work_group_size);
|
||||
}
|
||||
return absl::OkStatus();
|
||||
return OkStatus();
|
||||
}
|
||||
|
||||
absl::Status Conv3D::AddToQueue(CLCommandQueue* queue) {
|
||||
Status Conv3D::AddToQueue(CLCommandQueue* queue) {
|
||||
RETURN_IF_ERROR(BindArguments());
|
||||
return queue->DispatchImplicit(kernel_, GetGridSize(),
|
||||
conv_params_.work_group_size);
|
||||
@ -903,9 +903,9 @@ Conv3D::ConvParams Conv3D::GuessBestParams(
|
||||
x_kernel_is_1, y_kernel_is_1, z_kernel_is_1);
|
||||
}
|
||||
|
||||
absl::Status CreateConv3D(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
const Convolution3DAttributes& attr, Conv3D* result) {
Status CreateConv3D(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution3DAttributes& attr, Conv3D* result) {
*result = Conv3D(definition, attr, *creation_context.device);
return result->UploadData(attr.weights, attr.bias, creation_context.context);
}

@ -39,9 +39,9 @@ namespace cl {
class Conv3D : public GPUOperation {
public:
Conv3D() = default;
absl::Status AddToQueue(CLCommandQueue* queue) override;
absl::Status Tune(const TuningParameters& params) override;
absl::Status Compile(const CreationContext& creation_context) override;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;

// Move only
Conv3D(Conv3D&& operation);
@ -75,21 +75,21 @@ class Conv3D : public GPUOperation {
const CLDevice& device);

template <DataType T>
absl::Status UploadData(const ::tflite::gpu::Tensor<OHWDI, T>& weights,
const ::tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context);
Status UploadData(const ::tflite::gpu::Tensor<OHWDI, T>& weights,
const ::tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context);
template <DataType T>
absl::Status UploadWeights(const ::tflite::gpu::Tensor<OHWDI, T>& weights,
CLContext* context);
Status UploadWeights(const ::tflite::gpu::Tensor<OHWDI, T>& weights,
CLContext* context);

template <DataType S, typename T>
void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWDI, S>& weights,
absl::Span<T> dst);

friend absl::Status CreateConv3D(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution3DAttributes& attr,
Conv3D* result);
friend Status CreateConv3D(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution3DAttributes& attr,
Conv3D* result);

friend std::string GenerateConv3D(
const OperationDef& op_def, const LinearStorage& biases,
@ -105,7 +105,7 @@ class Conv3D : public GPUOperation {
int dst_slices, bool x_kernel_is_1,
bool y_kernel_is_1, bool z_kernel_is_1) const;

absl::Status BindArguments();
Status BindArguments();
int3 GetGridSize() const;

Texture2D weights_0_;
@ -125,9 +125,9 @@ class Conv3D : public GPUOperation {
};

template <DataType T>
absl::Status Conv3D::UploadData(const ::tflite::gpu::Tensor<OHWDI, T>& weights,
const ::tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context) {
Status Conv3D::UploadData(const ::tflite::gpu::Tensor<OHWDI, T>& weights,
const ::tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context) {
RETURN_IF_ERROR(UploadWeights(weights, context));
LinearStorageCreateInfo create_info;
create_info.storage_type = conv_params_.AreWeightsBuffer()
@ -139,12 +139,12 @@ absl::Status Conv3D::UploadData(const ::tflite::gpu::Tensor<OHWDI, T>& weights,
create_info.name = "biases";
create_info.aligned_size = weights.shape.o;
RETURN_IF_ERROR(CreateLinearStorage(create_info, biases, context, &biases_));
return absl::OkStatus();
return OkStatus();
}

template <DataType T>
absl::Status Conv3D::UploadWeights(
const ::tflite::gpu::Tensor<OHWDI, T>& weights, CLContext* context) {
Status Conv3D::UploadWeights(const ::tflite::gpu::Tensor<OHWDI, T>& weights,
CLContext* context) {
const int block_size = conv_params_.block_size.w;
const int dst_slices =
AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size);
@ -211,7 +211,7 @@ absl::Status Conv3D::UploadWeights(
}
}

return absl::OkStatus();
return OkStatus();
}

template <DataType S, typename T>
@ -271,9 +271,9 @@ void Conv3D::RearrangeWeightsData(
}
}

absl::Status CreateConv3D(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution3DAttributes& attr, Conv3D* result);
Status CreateConv3D(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution3DAttributes& attr, Conv3D* result);

} // namespace cl
} // namespace gpu

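Every hunk in this commit applies the same mechanical rewrite: Status becomes absl::Status, OkStatus() becomes absl::OkStatus(), and error constructors such as InvalidArgumentError move behind the absl:: prefix, while the control flow stays untouched. A minimal before/after sketch of the pattern (UploadExample and DoWork are hypothetical names used only for illustration):

  // Before: the custom light-weight tflite::gpu status type.
  Status UploadExample(CLContext* context) {
    RETURN_IF_ERROR(DoWork(context));  // DoWork is a hypothetical helper.
    return OkStatus();
  }

  // After: the Abseil status type, identical control flow.
  absl::Status UploadExample(CLContext* context) {
    RETURN_IF_ERROR(DoWork(context));
    return absl::OkStatus();
  }
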
@ -291,16 +291,16 @@ ConvBuffer1x1& ConvBuffer1x1::operator=(ConvBuffer1x1&& operation) {
return *this;
}

absl::Status ConvBuffer1x1::Compile(const CreationContext& creation_context) {
Status ConvBuffer1x1::Compile(const CreationContext& creation_context) {
std::string code =
GenerateConvBuffer1x1(definition_, conv_params_, linked_operations_);
RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_));
return absl::OkStatus();
return OkStatus();
}

absl::Status ConvBuffer1x1::BindArguments() {
Status ConvBuffer1x1::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr()));
@ -313,7 +313,7 @@ absl::Status ConvBuffer1x1::BindArguments() {
src_width_elements * src_[0]->Height());
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_size));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHSB()));
return absl::OkStatus();
return OkStatus();
}

int3 ConvBuffer1x1::GetGridSize() const {
@ -328,13 +328,13 @@ int3 ConvBuffer1x1::GetGridSize() const {
return int3(grid_x, grid_y, grid_z);
}

absl::Status ConvBuffer1x1::Tune(const TuningParameters& params) {
Status ConvBuffer1x1::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroupConv(params, kernel_, GetGridSize(),
&conv_params_.work_group_size);
}

absl::Status ConvBuffer1x1::AddToQueue(CLCommandQueue* queue) {
Status ConvBuffer1x1::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(),
conv_params_.work_group_size);
@ -351,12 +351,12 @@ bool IsConvBuffer1x1Supported(const OperationDef& definition,
attr.padding.appended.w == 0 && attr.padding.appended.h == 0;
}

absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvBuffer1x1* result, const BHWC* shape) {
Status CreateConvBuffer1x1(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvBuffer1x1* result, const BHWC* shape) {
if (!IsConvBuffer1x1Supported(definition, attr)) {
return absl::InvalidArgumentError("ConvBuffer1x1 is not supported");
return InvalidArgumentError("ConvBuffer1x1 is not supported");
}
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
@ -372,10 +372,10 @@ absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
return result->UploadData(attr.weights, attr.bias, creation_context.context);
}

absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvBuffer1x1* result, const BHWC* shape) {
Status CreateConvBuffer1x1(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvBuffer1x1* result, const BHWC* shape) {
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
ConvBuffer1x1::ConvParams conv_params;
@ -392,10 +392,11 @@ absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
return result->UploadData(attr.weights, attr.bias, creation_context.context);
}

absl::Status CreateConvBuffer1x1Wino4x4To6x6(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvBuffer1x1* result,
const BHWC* shape) {
Status CreateConvBuffer1x1Wino4x4To6x6(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvBuffer1x1* result,
const BHWC* shape) {
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
ConvBuffer1x1::ConvParams conv_params;

@ -45,10 +45,10 @@ class ConvBuffer1x1 : public GPUOperation {
ConvBuffer1x1(const ConvBuffer1x1&) = delete;
ConvBuffer1x1& operator=(const ConvBuffer1x1&) = delete;

absl::Status AddToQueue(CLCommandQueue* queue) override;
absl::Status Tune(const TuningParameters& params) override;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;

absl::Status Compile(const CreationContext& creation_context) override;
Status Compile(const CreationContext& creation_context) override;

struct ConvParams {
int3 block_size = int3(1, 1, 1);
@ -64,33 +64,33 @@ class ConvBuffer1x1 : public GPUOperation {

private:
ConvBuffer1x1(const OperationDef& definition, const ConvParams& conv_params);
friend absl::Status CreateConvBuffer1x1(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvBuffer1x1* result,
const BHWC* shape);
friend absl::Status CreateConvBuffer1x1(
const CreationContext& creation_context, const OperationDef& definition,
const FullyConnectedAttributes& attr, ConvBuffer1x1* result,
const BHWC* shape);
friend absl::Status CreateConvBuffer1x1Wino4x4To6x6(
friend Status CreateConvBuffer1x1(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvBuffer1x1* result, const BHWC* shape);
friend Status CreateConvBuffer1x1(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvBuffer1x1* result, const BHWC* shape);
friend Status CreateConvBuffer1x1Wino4x4To6x6(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvBuffer1x1* result,
const BHWC* shape);

template <DataType T>
absl::Status UploadData(const ::tflite::gpu::Tensor<OHWI, T>& weights,
const ::tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context);
Status UploadData(const ::tflite::gpu::Tensor<OHWI, T>& weights,
const ::tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context);
template <DataType T>
absl::Status UploadDataForWinograd4x4To6x6(
Status UploadDataForWinograd4x4To6x6(
const ::tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
CLContext* context);

template <DataType T>
absl::Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);

absl::Status BindArguments();
Status BindArguments();
int3 GetGridSize() const;

Buffer weights_;
@ -101,20 +101,20 @@ class ConvBuffer1x1 : public GPUOperation {
};

template <DataType T>
absl::Status ConvBuffer1x1::UploadData(
const ::tflite::gpu::Tensor<OHWI, T>& weights,
const ::tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
Status ConvBuffer1x1::UploadData(const ::tflite::gpu::Tensor<OHWI, T>& weights,
const ::tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context) {
RETURN_IF_ERROR(UploadWeights(weights, context));
LinearStorageCreateInfo create_info;
create_info.storage_type = LinearStorageType::BUFFER;
create_info.data_type = definition_.GetDataType();
create_info.aligned_size = weights.shape.o;
RETURN_IF_ERROR(CreateLinearStorage(create_info, biases, context, &biases_));
return absl::OkStatus();
return OkStatus();
}

template <DataType T>
absl::Status ConvBuffer1x1::UploadDataForWinograd4x4To6x6(
Status ConvBuffer1x1::UploadDataForWinograd4x4To6x6(
const ::tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
CLContext* context) {
::tflite::gpu::Tensor<OHWI, T> wino_weights;
@ -132,7 +132,7 @@ absl::Status ConvBuffer1x1::UploadDataForWinograd4x4To6x6(
}

template <DataType T>
absl::Status ConvBuffer1x1::UploadWeights(
Status ConvBuffer1x1::UploadWeights(
const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
@ -162,22 +162,21 @@ absl::Status ConvBuffer1x1::UploadWeights(
bool IsConvBuffer1x1Supported(const OperationDef& definition,
const Convolution2DAttributes& attr);

absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvBuffer1x1* result,
const BHWC* shape = nullptr);
Status CreateConvBuffer1x1(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvBuffer1x1* result, const BHWC* shape = nullptr);

absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvBuffer1x1* result,
const BHWC* shape = nullptr);
Status CreateConvBuffer1x1(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvBuffer1x1* result, const BHWC* shape = nullptr);

absl::Status CreateConvBuffer1x1Wino4x4To6x6(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvBuffer1x1* result,
const BHWC* shape = nullptr);
Status CreateConvBuffer1x1Wino4x4To6x6(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvBuffer1x1* result,
const BHWC* shape = nullptr);

} // namespace cl
} // namespace gpu

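CreateConvBuffer1x1 rejects unsupported configurations with an invalid-argument status instead of producing a broken kernel, so a caller can probe it and fall back to another convolution implementation. A usage sketch under that assumption (the fallback choice is illustrative, not code from this change; creation_context, op_def, and attr are assumed to be in scope):

  ConvBuffer1x1 conv;
  absl::Status status =
      CreateConvBuffer1x1(creation_context, op_def, attr, &conv);
  if (!status.ok()) {
    // Preconditions failed (e.g. kernel size is not 1x1 or padding is
    // non-zero); fall back to the generic texture-based convolution.
    ConvTexture fallback;
    RETURN_IF_ERROR(CreateConvTexture(creation_context, op_def, attr, &fallback));
  }
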
@ -219,7 +219,7 @@ ConvConstants& ConvConstants::operator=(ConvConstants&& kernel) {
return *this;
}

absl::Status ConvConstants::Compile(const CreationContext& creation_context) {
Status ConvConstants::Compile(const CreationContext& creation_context) {
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
const auto code = GenerateConvolutionConstantCode(
@ -240,7 +240,7 @@ absl::Status ConvConstants::Compile(const CreationContext& creation_context) {
*creation_context.device, &kernel_);
}

absl::Status ConvConstants::BindArguments() {
Status ConvConstants::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr()));
@ -254,7 +254,7 @@ absl::Status ConvConstants::BindArguments() {
kernel_.SetBytesAuto(int2(dilation_.x * src_[0]->Batch(), dilation_.y)));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHSB()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHSB()));
return absl::OkStatus();
return OkStatus();
}

int3 ConvConstants::GetGridSize() const {
@ -263,12 +263,12 @@ int3 ConvConstants::GetGridSize() const {
return int3(grid_x, grid_y, 1);
}

absl::Status ConvConstants::Tune(const TuningParameters& params) {
Status ConvConstants::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}

absl::Status ConvConstants::AddToQueue(CLCommandQueue* queue) {
Status ConvConstants::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
@ -294,12 +294,12 @@ bool IsConvConstantsSupported(const CLDevice& device,
return filters_buffer_size <= kConstantMaxSize && flt4_registers <= 8;
}

absl::Status CreateConvConstants(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvConstants* result) {
Status CreateConvConstants(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvConstants* result) {
if (!IsConvConstantsSupported(*creation_context.device, definition, attr)) {
return absl::InvalidArgumentError("ConvConstants is not supported");
return InvalidArgumentError("ConvConstants is not supported");
}
*result = ConvConstants(definition, attr);
RETURN_IF_ERROR(
@ -310,7 +310,8 @@ absl::Status CreateConvConstants(const CreationContext& creation_context,
create_info.aligned_size = attr.weights.shape.o;
RETURN_IF_ERROR(CreateLinearStorage(
create_info, attr.bias, creation_context.context, &result->biases_));
return absl::OkStatus();

return OkStatus();
}

} // namespace cl

@ -35,10 +35,10 @@ namespace cl {
class ConvConstants : public GPUOperation {
public:
ConvConstants() = default;
absl::Status AddToQueue(CLCommandQueue* queue) override;
absl::Status Tune(const TuningParameters& params) override;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;

absl::Status Compile(const CreationContext& creation_context) override;
Status Compile(const CreationContext& creation_context) override;

// Move only
ConvConstants(ConvConstants&& kernel);
@ -47,9 +47,10 @@ class ConvConstants : public GPUOperation {
ConvConstants& operator=(const ConvConstants&) = delete;

private:
friend absl::Status CreateConvConstants(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvConstants* result);
friend Status CreateConvConstants(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvConstants* result);
explicit ConvConstants(const OperationDef& definition,
const Convolution2DAttributes& attr)
: GPUOperation(definition),
@ -61,14 +62,14 @@ class ConvConstants : public GPUOperation {
dst_channels_(attr.weights.shape.o) {}

template <DataType T>
absl::Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);

template <DataType S, typename T>
void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst);

absl::Status BindArguments();
Status BindArguments();
int3 GetGridSize() const;

Buffer weights_;
@ -86,7 +87,7 @@ class ConvConstants : public GPUOperation {
};

template <DataType T>
absl::Status ConvConstants::UploadWeights(
Status ConvConstants::UploadWeights(
const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int kernel_x = weights.shape.w;
@ -156,10 +157,10 @@ bool IsConvConstantsSupported(const CLDevice& device,
const OperationDef& definition,
const Convolution2DAttributes& attr);

absl::Status CreateConvConstants(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvConstants* result);
Status CreateConvConstants(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvConstants* result);

} // namespace cl
} // namespace gpu

@ -173,7 +173,7 @@ ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
return *this;
}

absl::Status ConvPowerVR::Compile(const CreationContext& creation_context) {
Status ConvPowerVR::Compile(const CreationContext& creation_context) {
const bool stride_correction =
definition_.IsBatchSupported() && stride_padding_.x != 1;
const std::string code =
@ -189,7 +189,7 @@ absl::Status ConvPowerVR::Compile(const CreationContext& creation_context) {
*creation_context.device, &kernel_);
}

absl::Status ConvPowerVR::BindArguments() {
Status ConvPowerVR::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr()));
@ -211,7 +211,7 @@ absl::Status ConvPowerVR::BindArguments() {
}
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHSB()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHSB()));
return absl::OkStatus();
return OkStatus();
}

int3 ConvPowerVR::GetGridSize() const {
@ -245,13 +245,13 @@ int3 ConvPowerVR::GetGridSize() const {
}
}

absl::Status ConvPowerVR::Tune(const TuningParameters& params) {
Status ConvPowerVR::Tune(const TuningParameters& params) {
if (conv_params_.weights_upload_type ==
WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP ||
conv_params_.weights_upload_type ==
WeightsUploadType::LOCAL_MEM_BY_THREADS ||
conv_params_.fixed_work_group_size) {
return absl::OkStatus();
return OkStatus();
}
if (conv_params_.work_group_launch_order[0] == 0 &&
conv_params_.work_group_launch_order[1] == 1 &&
@ -260,10 +260,10 @@ absl::Status ConvPowerVR::Tune(const TuningParameters& params) {
return GetBestWorkGroupConv(params, kernel_, GetGridSize(),
&conv_params_.work_group_size);
}
return absl::OkStatus();
return OkStatus();
}

absl::Status ConvPowerVR::AddToQueue(CLCommandQueue* queue) {
Status ConvPowerVR::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(),
conv_params_.work_group_size);
@ -848,26 +848,27 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParamsWinograd(
return params;
}

absl::Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvPowerVR* result, const BHWC* dst_shape) {
Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvPowerVR* result, const BHWC* dst_shape) {
*result = ConvPowerVR(definition, attr, *creation_context.device, dst_shape);
return result->UploadData(attr.weights, attr.bias, creation_context.context);
}

absl::Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvPowerVR* result, const BHWC* dst_shape) {
Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvPowerVR* result, const BHWC* dst_shape) {
*result = ConvPowerVR(definition, attr, *creation_context.device, dst_shape);
return result->UploadData(attr.weights, attr.bias, creation_context.context);
}

absl::Status CreateConvPowerVRWino4x4To6x6(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvPowerVR* result,
const BHWC* dst_shape) {
Status CreateConvPowerVRWino4x4To6x6(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvPowerVR* result,
const BHWC* dst_shape) {
*result = ConvPowerVR(definition);
result->conv_params_ = result->GuessBestParamsWinograd(
*creation_context.device, definition, attr, dst_shape);

@ -39,9 +39,9 @@ namespace cl {
class ConvPowerVR : public GPUOperation {
public:
ConvPowerVR() = default;
absl::Status AddToQueue(CLCommandQueue* queue) override;
absl::Status Tune(const TuningParameters& params) override;
absl::Status Compile(const CreationContext& creation_context) override;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;
Status Compile(const CreationContext& creation_context) override;

// Move only
ConvPowerVR(ConvPowerVR&& operation);
@ -87,31 +87,29 @@ class ConvPowerVR : public GPUOperation {
explicit ConvPowerVR(const OperationDef& definition);

template <DataType T>
absl::Status UploadData(const ::tflite::gpu::Tensor<OHWI, T>& weights,
const ::tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context);
Status UploadData(const ::tflite::gpu::Tensor<OHWI, T>& weights,
const ::tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context);
template <DataType T>
absl::Status UploadDataForWinograd4x4To6x6(
Status UploadDataForWinograd4x4To6x6(
const ::tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
CLContext* context);

template <DataType T>
absl::Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);

friend absl::Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvPowerVR* result,
const BHWC* dst_shape);
friend Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvPowerVR* result, const BHWC* dst_shape);

friend absl::Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvPowerVR* result,
const BHWC* dst_shape);
friend Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvPowerVR* result, const BHWC* dst_shape);

friend absl::Status CreateConvPowerVRWino4x4To6x6(
friend Status CreateConvPowerVRWino4x4To6x6(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvPowerVR* result,
const BHWC* dst_shape);
@ -140,7 +138,7 @@ class ConvPowerVR : public GPUOperation {
bool different_weights_for_height,
const BHWC* dst_shape = nullptr) const;

absl::Status BindArguments();
Status BindArguments();
int3 GetGridSize() const;

Buffer weights_;
@ -154,20 +152,20 @@ class ConvPowerVR : public GPUOperation {
};

template <DataType T>
absl::Status ConvPowerVR::UploadData(
const ::tflite::gpu::Tensor<OHWI, T>& weights,
const ::tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
Status ConvPowerVR::UploadData(const ::tflite::gpu::Tensor<OHWI, T>& weights,
const ::tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context) {
RETURN_IF_ERROR(UploadWeights(weights, context));
LinearStorageCreateInfo create_info;
create_info.storage_type = LinearStorageType::BUFFER;
create_info.data_type = conv_params_.weights_data_type;
create_info.aligned_size = weights.shape.o;
RETURN_IF_ERROR(CreateLinearStorage(create_info, biases, context, &biases_));
return absl::OkStatus();
return OkStatus();
}

template <DataType T>
absl::Status ConvPowerVR::UploadDataForWinograd4x4To6x6(
Status ConvPowerVR::UploadDataForWinograd4x4To6x6(
const ::tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
CLContext* context) {
::tflite::gpu::Tensor<OHWI, T> wino_weights;
@ -181,12 +179,12 @@ absl::Status ConvPowerVR::UploadDataForWinograd4x4To6x6(
bias.shape = Linear(weights.shape.o);
bias.data.resize(weights.shape.o, 0.0f);
RETURN_IF_ERROR(CreateLinearStorage(create_info, bias, context, &biases_));
return absl::OkStatus();
return OkStatus();
}

template <DataType T>
absl::Status ConvPowerVR::UploadWeights(
const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
Status ConvPowerVR::UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context) {
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);

@ -212,22 +210,21 @@ absl::Status ConvPowerVR::UploadWeights(
}
}

absl::Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvPowerVR* result,
const BHWC* dst_shape = nullptr);
Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvPowerVR* result, const BHWC* dst_shape = nullptr);

absl::Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvPowerVR* result,
const BHWC* dst_shape = nullptr);
Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvPowerVR* result, const BHWC* dst_shape = nullptr);

absl::Status CreateConvPowerVRWino4x4To6x6(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvPowerVR* result,
const BHWC* dst_shape = nullptr);
Status CreateConvPowerVRWino4x4To6x6(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvPowerVR* result,
const BHWC* dst_shape = nullptr);

} // namespace cl
} // namespace gpu

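Both sides of every pair above lean on RETURN_IF_ERROR for early exit. The macro itself lives in the shared status header rather than in this diff; a common shape for such a macro, assuming the post-migration absl::Status type (the exact definition in tflite/gpu may differ), is:

  #define RETURN_IF_ERROR(expr)              \
    do {                                     \
      const absl::Status _status = (expr);   \
      if (!_status.ok()) return _status;     \
    } while (0)
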
@ -30,7 +30,6 @@ namespace tflite {
namespace gpu {
namespace cl {
namespace {

std::string GenerateConvCode(
const OperationDef& op_def, const int3& block_size, bool is1x1,
bool adreno4xx_optimization, bool stride_correction,
@ -385,7 +384,7 @@ ConvTexture& ConvTexture::operator=(ConvTexture&& operation) {
return *this;
}

absl::Status ConvTexture::Compile(const CreationContext& creation_context) {
Status ConvTexture::Compile(const CreationContext& creation_context) {
auto storage_type = definition_.GetPrimaryStorageType();
bool is1x1 = kernel_size_.x == 1 && kernel_size_.y == 1;
bool adreno4xx_optimization =
@ -408,7 +407,7 @@ absl::Status ConvTexture::Compile(const CreationContext& creation_context) {
*creation_context.device, &kernel_);
}

absl::Status ConvTexture::BindArguments() {
Status ConvTexture::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_0_.GetMemoryPtr()));
@ -428,7 +427,7 @@ absl::Status ConvTexture::BindArguments() {
RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_));
RETURN_IF_ERROR(
kernel_.SetBytesAuto(int2(padding_.x * src_[0]->Batch(), padding_.y)));
return absl::OkStatus();
return OkStatus();
}

int3 ConvTexture::GetGridSize() const {
@ -439,36 +438,37 @@ int3 ConvTexture::GetGridSize() const {
return int3(grid_x, grid_y, grid_z);
}

absl::Status ConvTexture::Tune(const TuningParameters& params) {
Status ConvTexture::Tune(const TuningParameters& params) {
RETURN_IF_ERROR(BindArguments());
return GetBestWorkGroupConv(params, kernel_, GetGridSize(),
&work_group_size_);
}

absl::Status ConvTexture::AddToQueue(CLCommandQueue* queue) {
Status ConvTexture::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}

absl::Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvTexture* result) {
Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvTexture* result) {
*result = ConvTexture(definition, attr);
return result->UploadData(attr.weights, attr.bias, creation_context.context);
}

absl::Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvTexture* result) {
Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvTexture* result) {
*result = ConvTexture(definition);
return result->UploadData(attr.weights, attr.bias, creation_context.context);
}

absl::Status CreateConvTextureWino4x4To6x6(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvTexture* result) {
Status CreateConvTextureWino4x4To6x6(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvTexture* result) {
*result = ConvTexture(definition);
result->different_weights_for_height_ = true;
result->block_size_ = {4, 1, 2};

@ -41,10 +41,10 @@ namespace cl {
class ConvTexture : public GPUOperation {
public:
ConvTexture() = default;
absl::Status AddToQueue(CLCommandQueue* queue) override;
absl::Status Tune(const TuningParameters& params) override;
Status AddToQueue(CLCommandQueue* queue) override;
Status Tune(const TuningParameters& params) override;

absl::Status Compile(const CreationContext& creation_context) override;
Status Compile(const CreationContext& creation_context) override;

// Move only
ConvTexture(ConvTexture&& operation);
@ -53,16 +53,16 @@ class ConvTexture : public GPUOperation {
ConvTexture& operator=(const ConvTexture&) = delete;

private:
friend absl::Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvTexture* result);
friend absl::Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvTexture* result);
friend Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvTexture* result);
friend Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvTexture* result);

friend absl::Status CreateConvTextureWino4x4To6x6(
friend Status CreateConvTextureWino4x4To6x6(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvTexture* result);

@ -70,25 +70,25 @@ class ConvTexture : public GPUOperation {
const Convolution2DAttributes& attr);
explicit ConvTexture(const OperationDef& definition);
template <DataType T>
absl::Status UploadData(const ::tflite::gpu::Tensor<OHWI, T>& weights,
const ::tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context);
Status UploadData(const ::tflite::gpu::Tensor<OHWI, T>& weights,
const ::tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context);

template <DataType T>
absl::Status UploadDataForWinograd4x4To6x6(
Status UploadDataForWinograd4x4To6x6(
const ::tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
CLContext* context);

template <DataType T>
absl::Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);

template <DataType S, typename T>
void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst_0, absl::Span<T> dst_1,
absl::Span<T> dst_2, absl::Span<T> dst_3);

absl::Status BindArguments();
Status BindArguments();
int3 GetGridSize() const;

Texture2D weights_0_;
@ -114,20 +114,20 @@ class ConvTexture : public GPUOperation {
};

template <DataType T>
absl::Status ConvTexture::UploadData(
const ::tflite::gpu::Tensor<OHWI, T>& weights,
const ::tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
Status ConvTexture::UploadData(const ::tflite::gpu::Tensor<OHWI, T>& weights,
const ::tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context) {
RETURN_IF_ERROR(UploadWeights(weights, context));
LinearStorageCreateInfo create_info;
create_info.storage_type = LinearStorageType::TEXTURE_2D;
create_info.data_type = definition_.GetDataType();
create_info.aligned_size = weights.shape.o;
RETURN_IF_ERROR(CreateLinearStorage(create_info, biases, context, &biases_));
return absl::OkStatus();
return OkStatus();
}

template <DataType T>
absl::Status ConvTexture::UploadDataForWinograd4x4To6x6(
Status ConvTexture::UploadDataForWinograd4x4To6x6(
const ::tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
CLContext* context) {
::tflite::gpu::Tensor<OHWI, T> wino_weights;
@ -145,8 +145,8 @@ absl::Status ConvTexture::UploadDataForWinograd4x4To6x6(
}

template <DataType T>
absl::Status ConvTexture::UploadWeights(
const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
Status ConvTexture::UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context) {
int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
dst_depth = AlignByN(dst_depth, block_size_.z);
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
@ -246,19 +246,20 @@ void ConvTexture::RearrangeWeightsData(
}
}

absl::Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvTexture* result);
Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvTexture* result);

absl::Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvTexture* result);
Status CreateConvTexture(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
ConvTexture* result);

absl::Status CreateConvTextureWino4x4To6x6(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvTexture* result);
Status CreateConvTextureWino4x4To6x6(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvTexture* result);

} // namespace cl
} // namespace gpu

@ -35,12 +35,12 @@ namespace {

class OpenClConverterImpl : public TensorObjectConverter {
public:
virtual absl::Status Init(const TensorObjectDef& input_def,
const TensorObjectDef& output_def,
Environment* environment) = 0;
virtual Status Init(const TensorObjectDef& input_def,
const TensorObjectDef& output_def,
Environment* environment) = 0;

protected:
absl::Status DispatchKernel(cl_mem input, cl_mem output) {
Status DispatchKernel(cl_mem input, cl_mem output) {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(input));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(output));
@ -119,9 +119,9 @@ class FromTensorConverter : public OpenClConverterImpl {
})");
}

absl::Status Init(const TensorObjectDef& input_def,
const TensorObjectDef& output_def,
Environment* environment) final {
Status Init(const TensorObjectDef& input_def,
const TensorObjectDef& output_def,
Environment* environment) final {
auto params_kernel = output_def.object_def.data_layout == DataLayout::BHWC
? GetToBhwcKernel(input_def, output_def)
: GetToDhwc4Kernel(input_def, output_def);
@ -157,12 +157,11 @@ __kernel void from_tensor()" +
environment->device(), &kernel_);
}

absl::Status Convert(const TensorObject& input_obj,
const TensorObject& output_obj) override {
Status Convert(const TensorObject& input_obj,
const TensorObject& output_obj) override {
auto output = absl::get_if<OpenClBuffer>(&output_obj);
if (!output || !output->memobj) {
return absl::InvalidArgumentError(
"Missing output in from_tensor converter");
return InvalidArgumentError("Missing output in from_tensor converter");
}
auto input_texture = absl::get_if<OpenClTexture>(&input_obj);
if (input_texture && input_texture->memobj) {
@ -172,7 +171,7 @@ __kernel void from_tensor()" +
if (input_buffer && input_buffer->memobj) {
return DispatchKernel(input_buffer->memobj, output->memobj);
}
return absl::InvalidArgumentError("Missing input in from_tensor converter");
return InvalidArgumentError("Missing input in from_tensor converter");
}
};

@ -226,9 +225,9 @@ class ToTensorConverter : public OpenClConverterImpl {
)");
}

absl::Status Init(const TensorObjectDef& input_def,
const TensorObjectDef& output_def,
Environment* environment) final {
Status Init(const TensorObjectDef& input_def,
const TensorObjectDef& output_def,
Environment* environment) final {
auto params_kernel = input_def.object_def.data_layout == DataLayout::BHWC
? GetFromBhwcKernel(input_def, output_def)
: GetFromDhwc4Kernel(input_def, output_def);
@ -262,11 +261,11 @@ __kernel void to_tensor()" +
&kernel_);
}

absl::Status Convert(const TensorObject& input_obj,
const TensorObject& output_obj) override {
Status Convert(const TensorObject& input_obj,
const TensorObject& output_obj) override {
auto input = absl::get_if<OpenClBuffer>(&input_obj);
if (!input || !input->memobj) {
return absl::InvalidArgumentError("Missing input in to_tensor converter");
return InvalidArgumentError("Missing input in to_tensor converter");
}
auto output_texture = absl::get_if<OpenClTexture>(&output_obj);
if (output_texture && output_texture->memobj) {
@ -276,7 +275,7 @@ __kernel void to_tensor()" +
if (output_buffer && output_buffer->memobj) {
return DispatchKernel(input->memobj, output_buffer->memobj);
}
return absl::InvalidArgumentError("Missing output in to_tensor converter");
return InvalidArgumentError("Missing output in to_tensor converter");
}
};

@ -319,18 +318,18 @@ class TrivialCopier : public OpenClConverterImpl {
input.data_layout == output.data_layout;
}

absl::Status Init(const TensorObjectDef& input_def,
const TensorObjectDef& output_def,
Environment* environment) final {
Status Init(const TensorObjectDef& input_def,
const TensorObjectDef& output_def,
Environment* environment) final {
dims_ = input_def.dimensions;
data_type_ = input_def.object_def.data_type;
queue_ = environment->queue();
region_ = CalculateTextureRegion(output_def);
return absl::OkStatus();
return OkStatus();
}

absl::Status Convert(const TensorObject& input_obj,
const TensorObject& output_obj) override {
Status Convert(const TensorObject& input_obj,
const TensorObject& output_obj) override {
auto texture_input = absl::get_if<OpenClTexture>(&input_obj);
auto texture_output = absl::get_if<OpenClTexture>(&output_obj);
if (texture_input && texture_output) {
@ -341,12 +340,12 @@ class TrivialCopier : public OpenClConverterImpl {
if (buffer_input && buffer_output) {
return Copy(*buffer_input, *buffer_output);
}
return absl::InternalError("Unexpected object");
return InternalError("Unexpected object");
}

absl::Status Copy(const OpenClBuffer& input, const OpenClBuffer& output) {
Status Copy(const OpenClBuffer& input, const OpenClBuffer& output) {
if (input.memobj == output.memobj) {
return absl::OkStatus();
return OkStatus();
}
return GetOpenCLError(clEnqueueCopyBuffer(
queue_->queue(), input.memobj, output.memobj, 0, 0,
@ -354,9 +353,9 @@ class TrivialCopier : public OpenClConverterImpl {
nullptr));
}

absl::Status Copy(const OpenClTexture& input, const OpenClTexture& output) {
Status Copy(const OpenClTexture& input, const OpenClTexture& output) {
if (input.memobj == output.memobj) {
return absl::OkStatus();
return OkStatus();
}
size_t origin[3] = {0, 0, 0};
return GetOpenCLError(
@ -381,18 +380,18 @@ class CpuCopier : public OpenClConverterImpl {
IsOpenClTextureOrBuffer(input.object_type)));
}

absl::Status Init(const TensorObjectDef& input_def,
const TensorObjectDef& output_def,
Environment* environment) final {
Status Init(const TensorObjectDef& input_def,
const TensorObjectDef& output_def,
Environment* environment) final {
region_ = CalculateTextureRegion(
input_def.object_def.object_type == ObjectType::CPU_MEMORY ? output_def
: input_def);
queue_ = environment->queue();
return absl::OkStatus();
return OkStatus();
}

absl::Status Convert(const TensorObject& input_obj,
const TensorObject& output_obj) override {
Status Convert(const TensorObject& input_obj,
const TensorObject& output_obj) override {
auto cpu_input = absl::get_if<CpuMemory>(&input_obj);
auto cpu_output = absl::get_if<CpuMemory>(&output_obj);
if (cpu_input) {
@ -420,7 +419,7 @@ class CpuCopier : public OpenClConverterImpl {
buffer_input->memobj, cpu_output->size_bytes, cpu_output->data);
}
}
return absl::InternalError("Unexpected object");
return InternalError("Unexpected object");
}

private:
@ -443,7 +442,7 @@ class OpenClTensorConverterBuilder : public TensorObjectConverterBuilder {
ToTensorConverter::IsSupported(input_def, output_def));
}

absl::Status MakeConverter(
Status MakeConverter(
const TensorObjectDef& input, const TensorObjectDef& output,
std::unique_ptr<TensorObjectConverter>* converter) final {
std::unique_ptr<OpenClConverterImpl> impl;
@ -458,11 +457,11 @@ class OpenClTensorConverterBuilder : public TensorObjectConverterBuilder {
} else if (ToTensorConverter::IsSupported(input_def, output_def)) {
impl = absl::make_unique<ToTensorConverter>();
} else {
return absl::UnimplementedError("Unsupported conversion");
return UnimplementedError("Unsupported conversion");
}
RETURN_IF_ERROR(impl->Init(input, output, environment_));
*converter = std::move(impl);
return absl::OkStatus();
return OkStatus();
}

Environment* environment_;

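MakeConverter above centralizes dispatch: trivial and CPU copiers are preferred when object definitions match, the tensor converters come next, and anything else surfaces absl::UnimplementedError. A caller-side sketch (names outside the converter API shown above are illustrative):

  std::unique_ptr<TensorObjectConverter> converter;
  absl::Status status =
      builder->MakeConverter(input_def, output_def, &converter);
  if (!status.ok()) {
    return status;  // e.g. absl::IsUnimplemented(status) for unsupported pairs.
  }
  RETURN_IF_ERROR(converter->Convert(input_object, output_object));
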
@ -368,8 +368,7 @@ ConvolutionTransposed& ConvolutionTransposed::operator=(
|
||||
return *this;
|
||||
}
|
||||
|
||||
absl::Status ConvolutionTransposed::Compile(
|
||||
const CreationContext& creation_context) {
|
||||
Status ConvolutionTransposed::Compile(const CreationContext& creation_context) {
|
||||
const auto code = GenerateConvolutionTransposedCode(
|
||||
definition_, biases_, *creation_context.device, weights_are_buffer_,
|
||||
block_size_, linked_operations_);
|
||||
@ -381,7 +380,7 @@ absl::Status ConvolutionTransposed::Compile(
|
||||
*creation_context.device, &kernel_);
|
||||
}
|
||||
|
||||
absl::Status ConvolutionTransposed::BindArguments() {
|
||||
Status ConvolutionTransposed::BindArguments() {
|
||||
kernel_.ResetBindingCounter();
|
||||
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
|
||||
if (weights_are_buffer_) {
|
||||
@ -400,7 +399,7 @@ absl::Status ConvolutionTransposed::BindArguments() {
|
||||
RETURN_IF_ERROR(kernel_.SetBytesAuto(padding_));
|
||||
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB()));
|
||||
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB()));
|
||||
return absl::OkStatus();
|
||||
return OkStatus();
|
||||
}
|
||||
|
||||
int3 ConvolutionTransposed::GetGridSize() const {
|
||||
@ -413,21 +412,21 @@ int3 ConvolutionTransposed::GetGridSize() const {
|
||||
return int3(grid_x, grid_y, grid_z);
|
||||
}
|
||||
|
||||
absl::Status ConvolutionTransposed::Tune(const TuningParameters& params) {
|
||||
Status ConvolutionTransposed::Tune(const TuningParameters& params) {
|
||||
RETURN_IF_ERROR(BindArguments());
|
||||
return GetBestWorkGroupConv(params, kernel_, GetGridSize(),
|
||||
&work_group_size_);
|
||||
}
|
||||
|
||||
absl::Status ConvolutionTransposed::AddToQueue(CLCommandQueue* queue) {
|
||||
Status ConvolutionTransposed::AddToQueue(CLCommandQueue* queue) {
|
||||
RETURN_IF_ERROR(BindArguments());
|
||||
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
|
||||
}
|
||||
|
||||
absl::Status CreateConvolutionTransposed(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const ConvolutionTransposedAttributes& attr,
|
||||
ConvolutionTransposed* result) {
|
||||
Status CreateConvolutionTransposed(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const ConvolutionTransposedAttributes& attr,
|
||||
ConvolutionTransposed* result) {
|
||||
*result = ConvolutionTransposed(definition, attr, *creation_context.device);
|
||||
RETURN_IF_ERROR(
|
||||
result->UploadWeights(attr.weights, creation_context.context));
|
||||
@ -439,7 +438,8 @@ absl::Status CreateConvolutionTransposed(
|
||||
create_info.aligned_size = attr.weights.shape.o;
|
||||
RETURN_IF_ERROR(CreateLinearStorage(
|
||||
create_info, attr.bias, creation_context.context, &result->biases_));
|
||||
return absl::OkStatus();
|
||||
|
||||
return OkStatus();
|
||||
}
|
||||
|
||||
} // namespace cl
|
||||
|
@ -38,10 +38,10 @@ namespace cl {
|
||||
class ConvolutionTransposed : public GPUOperation {
|
||||
public:
|
||||
ConvolutionTransposed() = default;
|
||||
absl::Status AddToQueue(CLCommandQueue* queue) override;
|
||||
absl::Status Tune(const TuningParameters& params) override;
|
||||
Status AddToQueue(CLCommandQueue* queue) override;
|
||||
Status Tune(const TuningParameters& params) override;
|
||||
|
||||
absl::Status Compile(const CreationContext& creation_context) override;
|
||||
Status Compile(const CreationContext& creation_context) override;
|
||||
|
||||
// Move only
|
||||
ConvolutionTransposed(ConvolutionTransposed&& operation);
|
||||
@ -50,7 +50,7 @@ class ConvolutionTransposed : public GPUOperation {
|
||||
ConvolutionTransposed& operator=(const ConvolutionTransposed&) = delete;
|
||||
|
||||
private:
|
||||
friend absl::Status CreateConvolutionTransposed(
|
||||
friend Status CreateConvolutionTransposed(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const ConvolutionTransposedAttributes& attr,
|
||||
ConvolutionTransposed* result);
|
||||
@ -58,14 +58,14 @@ class ConvolutionTransposed : public GPUOperation {
|
||||
const ConvolutionTransposedAttributes& attr,
|
||||
const CLDevice& device);
|
||||
template <DataType T>
|
||||
absl::Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
|
||||
CLContext* context);
|
||||
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
|
||||
CLContext* context);
|
||||
|
||||
template <DataType S, typename T>
|
||||
void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWI, S>& weights,
|
||||
absl::Span<T> dst);
|
||||
|
||||
absl::Status BindArguments();
|
||||
Status BindArguments();
|
||||
int3 GetGridSize() const;
|
||||
|
||||
LinearStorage biases_;
|
||||
@ -88,7 +88,7 @@ class ConvolutionTransposed : public GPUOperation {
|
||||
};
|
||||
|
||||
template <DataType T>
|
||||
absl::Status ConvolutionTransposed::UploadWeights(
|
||||
Status ConvolutionTransposed::UploadWeights(
|
||||
const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
|
||||
const int dst_depth =
|
||||
AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size_.z);
|
||||
@ -153,7 +153,7 @@ absl::Status ConvolutionTransposed::UploadWeights(
|
||||
}
|
||||
}
|
||||
|
||||
return absl::OkStatus();
|
||||
return OkStatus();
|
||||
}
|
||||
|
||||
template <DataType S, typename T>
|
||||
@ -208,9 +208,10 @@ void ConvolutionTransposed::RearrangeWeightsData(
|
||||
}
|
||||
}
|
||||
|
||||
absl::Status CreateConvolutionTransposed(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const ConvolutionTransposedAttributes& attr, ConvolutionTransposed* result);
|
||||
Status CreateConvolutionTransposed(const CreationContext& creation_context,
|
||||
const OperationDef& definition,
|
||||
const ConvolutionTransposedAttributes& attr,
|
||||
ConvolutionTransposed* result);
|
||||
|
||||
} // namespace cl
|
||||
} // namespace gpu
|
||||
|
@ -396,7 +396,7 @@ ConvolutionTransposed3D& ConvolutionTransposed3D::operator=(
|
||||
return *this;
|
||||
}
|
||||
|
||||
absl::Status ConvolutionTransposed3D::Compile(
|
||||
Status ConvolutionTransposed3D::Compile(
|
||||
const CreationContext& creation_context) {
|
||||
const auto code = GenerateConvolutionTransposed3DCode(
|
||||
definition_, biases_, *creation_context.device, weights_are_buffer_,
|
||||
@ -417,7 +417,7 @@ absl::Status ConvolutionTransposed3D::Compile(
|
||||
*creation_context.device, &kernel_);
|
||||
}
|
||||
|
||||
absl::Status ConvolutionTransposed3D::BindArguments() {
|
||||
Status ConvolutionTransposed3D::BindArguments() {
|
||||
kernel_.ResetBindingCounter();
|
||||
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
|
||||
if (weights_are_buffer_) {
|
||||
@ -444,7 +444,7 @@ absl::Status ConvolutionTransposed3D::BindArguments() {
|
||||
IntegralDivideRoundUp(dst_[0]->Slices(), block_size_.w)));
|
||||
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHDS()));
|
||||
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHDS()));
|
||||
return absl::OkStatus();
|
||||
return OkStatus();
|
||||
}
|
||||
|
||||
int3 ConvolutionTransposed3D::GetGridSize() const {
|
||||
@ -459,18 +459,18 @@ int3 ConvolutionTransposed3D::GetGridSize() const {
|
||||
return int3(grid_x, grid_y, grid_z);
|
||||
}
|
||||
|
||||
absl::Status ConvolutionTransposed3D::Tune(const TuningParameters& params) {
|
||||
Status ConvolutionTransposed3D::Tune(const TuningParameters& params) {
|
||||
RETURN_IF_ERROR(BindArguments());
|
||||
return GetBestWorkGroupConv(params, kernel_, GetGridSize(),
|
||||
&work_group_size_);
|
||||
}
|
||||
|
||||
absl::Status ConvolutionTransposed3D::AddToQueue(CLCommandQueue* queue) {
|
||||
Status ConvolutionTransposed3D::AddToQueue(CLCommandQueue* queue) {
|
||||
RETURN_IF_ERROR(BindArguments());
|
||||
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
|
||||
}
|
||||
|
||||
absl::Status CreateConvolutionTransposed3D(
|
||||
Status CreateConvolutionTransposed3D(
|
||||
const CreationContext& creation_context, const OperationDef& definition,
|
||||
const ConvolutionTransposed3DAttributes& attr,
|
||||
ConvolutionTransposed3D* result) {
|
||||
@ -485,7 +485,8 @@ absl::Status CreateConvolutionTransposed3D(
|
||||
create_info.aligned_size = attr.weights.shape.o;
|
||||
RETURN_IF_ERROR(CreateLinearStorage(
|
||||
create_info, attr.bias, creation_context.context, &result->biases_));
|
||||
return absl::OkStatus();
|
||||
|
||||
return OkStatus();
|
||||
}
|
||||
|
||||
} // namespace cl
|
||||
|
|
@ -38,10 +38,10 @@ namespace cl {
class ConvolutionTransposed3D : public GPUOperation {
 public:
  ConvolutionTransposed3D() = default;
  absl::Status AddToQueue(CLCommandQueue* queue) override;
  absl::Status Tune(const TuningParameters& params) override;
  Status AddToQueue(CLCommandQueue* queue) override;
  Status Tune(const TuningParameters& params) override;

  absl::Status Compile(const CreationContext& creation_context) override;
  Status Compile(const CreationContext& creation_context) override;

  // Move only
  ConvolutionTransposed3D(ConvolutionTransposed3D&& operation);
@ -50,7 +50,7 @@ class ConvolutionTransposed3D : public GPUOperation {
  ConvolutionTransposed3D& operator=(const ConvolutionTransposed3D&) = delete;

 private:
  friend absl::Status CreateConvolutionTransposed3D(
  friend Status CreateConvolutionTransposed3D(
      const CreationContext& creation_context, const OperationDef& definition,
      const ConvolutionTransposed3DAttributes& attr,
      ConvolutionTransposed3D* result);
@ -58,14 +58,14 @@ class ConvolutionTransposed3D : public GPUOperation {
      const ConvolutionTransposed3DAttributes& attr,
      const CLDevice& device);
  template <DataType T>
  absl::Status UploadWeights(const ::tflite::gpu::Tensor<OHWDI, T>& weights,
      CLContext* context);
  Status UploadWeights(const ::tflite::gpu::Tensor<OHWDI, T>& weights,
      CLContext* context);

  template <DataType S, typename T>
  void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWDI, S>& weights,
      absl::Span<T> dst);

  absl::Status BindArguments();
  Status BindArguments();
  int3 GetGridSize() const;

  LinearStorage biases_;
@ -88,7 +88,7 @@ class ConvolutionTransposed3D : public GPUOperation {
};

template <DataType T>
absl::Status ConvolutionTransposed3D::UploadWeights(
Status ConvolutionTransposed3D::UploadWeights(
    const ::tflite::gpu::Tensor<OHWDI, T>& weights, CLContext* context) {
  const int dst_depth =
      AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size_.z);
@ -155,7 +155,7 @@ absl::Status ConvolutionTransposed3D::UploadWeights(
    }
  }

  return absl::OkStatus();
  return OkStatus();
}

template <DataType S, typename T>
@ -214,7 +214,7 @@ void ConvolutionTransposed3D::RearrangeWeightsData(
  }
}

absl::Status CreateConvolutionTransposed3D(
Status CreateConvolutionTransposed3D(
    const CreationContext& creation_context, const OperationDef& definition,
    const ConvolutionTransposed3DAttributes& attr,
    ConvolutionTransposed3D* result);
|
@ -304,11 +304,12 @@ ConvolutionTransposed3x3& ConvolutionTransposed3x3::operator=(
  return *this;
}

absl::Status ConvolutionTransposed3x3::Compile(
Status ConvolutionTransposed3x3::Compile(
    const CreationContext& creation_context) {
  const auto code = GenerateConvolutionTransposedCode(
      definition_, biases_, linked_operations_, weights_upload_type_, padding_,
      work_group_launch_order_);

  std::vector<CompilerOptions> options;
  if (definition_.precision == CalculationsPrecision::F16 &&
      creation_context.device->IsPowerVR()) {
@ -317,10 +318,11 @@ absl::Status ConvolutionTransposed3x3::Compile(
  RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
      code, "main_function", options, *creation_context.context,
      *creation_context.device, &kernel_));
  return absl::OkStatus();

  return OkStatus();
}

absl::Status ConvolutionTransposed3x3::BindArguments() {
Status ConvolutionTransposed3x3::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr()));
@ -335,7 +337,10 @@ absl::Status ConvolutionTransposed3x3::BindArguments() {
      padding_.x >= 1 ? (padding_.x - 1) / 2 : (padding_.x - 2) / 2;
  const int padding_y =
      padding_.y >= 1 ? (padding_.y - 1) / 2 : (padding_.y - 2) / 2;
  return kernel_.SetBytesAuto(int2(padding_x * src_[0]->Batch(), padding_y));
  RETURN_IF_ERROR(
      kernel_.SetBytesAuto(int2(padding_x * src_[0]->Batch(), padding_y)));

  return OkStatus();
}

int3 ConvolutionTransposed3x3::GetGridSize() const {
@ -353,7 +358,7 @@ int3 ConvolutionTransposed3x3::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

absl::Status ConvolutionTransposed3x3::AddToQueue(CLCommandQueue* queue) {
Status ConvolutionTransposed3x3::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
@ -365,13 +370,13 @@ bool IsConvolutionTransposed3x3Supported(
      attr.stride.w == 2 && attr.stride.h == 2;
}

absl::Status CreateConvolutionTransposed3x3(
Status CreateConvolutionTransposed3x3(
    const CreationContext& creation_context, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr,
    ConvolutionTransposed3x3* result) {
  if (!IsConvolutionTransposed3x3Supported(*creation_context.device, definition,
      attr)) {
    return absl::InvalidArgumentError(
    return InvalidArgumentError(
        "ConvolutionTransposed3x3 doesn't support this attributes");
  }
  const int2 padding = int2(attr.padding.prepended.w, attr.padding.prepended.h);
@ -386,7 +391,7 @@ absl::Status CreateConvolutionTransposed3x3(
  create_info.aligned_size = attr.weights.shape.o;
  RETURN_IF_ERROR(CreateLinearStorage(
      create_info, attr.bias, creation_context.context, &result->biases_));
  return absl::OkStatus();
  return OkStatus();
}

} // namespace cl
|
@ -37,8 +37,8 @@ namespace cl {
class ConvolutionTransposed3x3 : public GPUOperation {
 public:
  ConvolutionTransposed3x3() = default;
  absl::Status AddToQueue(CLCommandQueue* queue) override;
  absl::Status Compile(const CreationContext& creation_context) override;
  Status AddToQueue(CLCommandQueue* queue) override;
  Status Compile(const CreationContext& creation_context) override;

  // Move only
  ConvolutionTransposed3x3(ConvolutionTransposed3x3&& operation);
@ -56,19 +56,19 @@ class ConvolutionTransposed3x3 : public GPUOperation {
 private:
  ConvolutionTransposed3x3(const OperationDef& definition,
      const CLDevice& device, int2 padding);
  friend absl::Status CreateConvolutionTransposed3x3(
  friend Status CreateConvolutionTransposed3x3(
      const CreationContext& creation_context, const OperationDef& definition,
      const ConvolutionTransposedAttributes& attr,
      ConvolutionTransposed3x3* result);
  template <DataType T>
  absl::Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
      CLContext* context);
  Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
      CLContext* context);

  template <DataType S, typename T>
  void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWI, S>& weights,
      absl::Span<T> dst);

  absl::Status BindArguments();
  Status BindArguments();
  int3 GetGridSize() const;

  int2 padding_;
@ -82,7 +82,7 @@ class ConvolutionTransposed3x3 : public GPUOperation {
};

template <DataType T>
absl::Status ConvolutionTransposed3x3::UploadWeights(
Status ConvolutionTransposed3x3::UploadWeights(
    const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
@ -165,7 +165,7 @@ bool IsConvolutionTransposed3x3Supported(
    const CLDevice& device, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr);

absl::Status CreateConvolutionTransposed3x3(
Status CreateConvolutionTransposed3x3(
    const CreationContext& creation_context, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr,
    ConvolutionTransposed3x3* result);
|
@ -221,18 +221,19 @@ ConvolutionTransposed3x3Thin& ConvolutionTransposed3x3Thin::operator=(
  return *this;
}

absl::Status ConvolutionTransposed3x3Thin::Compile(
Status ConvolutionTransposed3x3Thin::Compile(
    const CreationContext& creation_context) {
  const auto code = GenerateConvolutionTransposedCode(
      definition_, biases_, IntegralDivideRoundUp(src_channels_, 4),
      IntegralDivideRoundUp(dst_channels_, 4), *creation_context.device,
      linked_operations_);

  return creation_context.cache->GetOrCreateCLKernel(
      code, "main_function", *creation_context.context,
      *creation_context.device, &kernel_);
}

absl::Status ConvolutionTransposed3x3Thin::BindArguments() {
Status ConvolutionTransposed3x3Thin::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr()));
@ -241,7 +242,7 @@ absl::Status ConvolutionTransposed3x3Thin::BindArguments() {
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB()));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB()));
  return absl::OkStatus();
  return OkStatus();
}

int3 ConvolutionTransposed3x3Thin::GetGridSize() const {
@ -251,13 +252,12 @@ int3 ConvolutionTransposed3x3Thin::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

absl::Status ConvolutionTransposed3x3Thin::Tune(
    const TuningParameters& params) {
Status ConvolutionTransposed3x3Thin::Tune(const TuningParameters& params) {
  RETURN_IF_ERROR(BindArguments());
  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}

absl::Status ConvolutionTransposed3x3Thin::AddToQueue(CLCommandQueue* queue) {
Status ConvolutionTransposed3x3Thin::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
@ -271,13 +271,13 @@ bool IsConvolutionTransposed3x3ThinSupported(
      attr.padding.appended.h == 1;
}

absl::Status CreateConvolutionTransposed3x3Thin(
Status CreateConvolutionTransposed3x3Thin(
    const CreationContext& creation_context, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr,
    ConvolutionTransposed3x3Thin* result) {
  if (!IsConvolutionTransposed3x3ThinSupported(*creation_context.device,
      attr)) {
    return absl::InvalidArgumentError(
    return InvalidArgumentError(
        "ConvolutionTransposed3x3Thin doesn't support this attributes");
  }
  *result = ConvolutionTransposed3x3Thin(definition, attr);
@ -291,7 +291,8 @@ absl::Status CreateConvolutionTransposed3x3Thin(
  create_info.aligned_size = attr.weights.shape.o;
  RETURN_IF_ERROR(CreateLinearStorage(
      create_info, attr.bias, creation_context.context, &result->biases_));
  return absl::OkStatus();

  return OkStatus();
}

} // namespace cl
|
@ -37,10 +37,10 @@ namespace cl {
class ConvolutionTransposed3x3Thin : public GPUOperation {
 public:
  ConvolutionTransposed3x3Thin() = default;
  absl::Status AddToQueue(CLCommandQueue* queue) override;
  absl::Status Tune(const TuningParameters& params) override;
  Status AddToQueue(CLCommandQueue* queue) override;
  Status Tune(const TuningParameters& params) override;

  absl::Status Compile(const CreationContext& creation_context) override;
  Status Compile(const CreationContext& creation_context) override;

  // Move only
  ConvolutionTransposed3x3Thin(ConvolutionTransposed3x3Thin&& operation);
@ -51,7 +51,7 @@ class ConvolutionTransposed3x3Thin : public GPUOperation {
      delete;

 private:
  friend absl::Status CreateConvolutionTransposed3x3Thin(
  friend Status CreateConvolutionTransposed3x3Thin(
      const CreationContext& creation_context, const OperationDef& definition,
      const ConvolutionTransposedAttributes& attr,
      ConvolutionTransposed3x3Thin* result);
@ -59,14 +59,14 @@ class ConvolutionTransposed3x3Thin : public GPUOperation {
      const OperationDef& definition,
      const ConvolutionTransposedAttributes& attr);
  template <DataType T>
  absl::Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
      CLContext* context);
  Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
      CLContext* context);

  template <DataType S, typename T>
  void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWI, S>& weights,
      absl::Span<T> dst);

  absl::Status BindArguments();
  Status BindArguments();
  int3 GetGridSize() const;

  Buffer weights_;
@ -80,7 +80,7 @@ class ConvolutionTransposed3x3Thin : public GPUOperation {
};

template <DataType T>
absl::Status ConvolutionTransposed3x3Thin::UploadWeights(
Status ConvolutionTransposed3x3Thin::UploadWeights(
    const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
  const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
  const int dst_depth = IntegralDivideRoundUp(dst_channels_, 4);
@ -150,7 +150,7 @@ void ConvolutionTransposed3x3Thin::RearrangeWeightsData(
bool IsConvolutionTransposed3x3ThinSupported(
    const CLDevice& device, const ConvolutionTransposedAttributes& attr);

absl::Status CreateConvolutionTransposed3x3Thin(
Status CreateConvolutionTransposed3x3Thin(
    const CreationContext& creation_context, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr,
    ConvolutionTransposed3x3Thin* result);
|
@ -301,7 +301,7 @@ ConvolutionTransposed4x4& ConvolutionTransposed4x4::operator=(
  return *this;
}

absl::Status ConvolutionTransposed4x4::Compile(
Status ConvolutionTransposed4x4::Compile(
    const CreationContext& creation_context) {
  const auto code = GenerateConvolutionTransposedCode(
      definition_, biases_, linked_operations_, weights_upload_type_);
@ -314,10 +314,11 @@ absl::Status ConvolutionTransposed4x4::Compile(
  RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
      code, "main_function", options, *creation_context.context,
      *creation_context.device, &kernel_));
  return absl::OkStatus();

  return OkStatus();
}

absl::Status ConvolutionTransposed4x4::BindArguments() {
Status ConvolutionTransposed4x4::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr()));
@ -328,7 +329,8 @@ absl::Status ConvolutionTransposed4x4::BindArguments() {
  RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHSB()));
  const int32_t filters_offset = 4 * 16 * src_[0]->Slices();
  RETURN_IF_ERROR(kernel_.SetBytesAuto(filters_offset));
  return absl::OkStatus();

  return OkStatus();
}

int3 ConvolutionTransposed4x4::GetGridSize() const {
@ -339,7 +341,7 @@ int3 ConvolutionTransposed4x4::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

absl::Status ConvolutionTransposed4x4::AddToQueue(CLCommandQueue* queue) {
Status ConvolutionTransposed4x4::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
@ -352,13 +354,13 @@ bool IsConvolutionTransposed4x4Supported(
      attr.padding.prepended.w == 1 && attr.padding.prepended.h == 1;
}

absl::Status CreateConvolutionTransposed4x4(
Status CreateConvolutionTransposed4x4(
    const CreationContext& creation_context, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr,
    ConvolutionTransposed4x4* result) {
  if (!IsConvolutionTransposed4x4Supported(*creation_context.device, definition,
      attr)) {
    return absl::InvalidArgumentError(
    return InvalidArgumentError(
        "ConvolutionTransposed4x4 doesn't support this attributes");
  }
  *result = ConvolutionTransposed4x4(definition, *creation_context.device);
@ -371,7 +373,7 @@ absl::Status CreateConvolutionTransposed4x4(
  create_info.aligned_size = attr.weights.shape.o;
  RETURN_IF_ERROR(CreateLinearStorage(
      create_info, attr.bias, creation_context.context, &result->biases_));
  return absl::OkStatus();
  return OkStatus();
}

} // namespace cl
|
@ -37,8 +37,8 @@ namespace cl {
class ConvolutionTransposed4x4 : public GPUOperation {
 public:
  ConvolutionTransposed4x4() = default;
  absl::Status AddToQueue(CLCommandQueue* queue) override;
  absl::Status Compile(const CreationContext& creation_context) override;
  Status AddToQueue(CLCommandQueue* queue) override;
  Status Compile(const CreationContext& creation_context) override;

  // Move only
  ConvolutionTransposed4x4(ConvolutionTransposed4x4&& operation);
@ -56,19 +56,19 @@ class ConvolutionTransposed4x4 : public GPUOperation {
 private:
  ConvolutionTransposed4x4(const OperationDef& definition,
      const CLDevice& device);
  friend absl::Status CreateConvolutionTransposed4x4(
  friend Status CreateConvolutionTransposed4x4(
      const CreationContext& creation_context, const OperationDef& definition,
      const ConvolutionTransposedAttributes& attr,
      ConvolutionTransposed4x4* result);
  template <DataType T>
  absl::Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
      CLContext* context);
  Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
      CLContext* context);

  template <DataType S, typename T>
  void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWI, S>& weights,
      absl::Span<T> dst);

  absl::Status BindArguments();
  Status BindArguments();
  int3 GetGridSize() const;

  Buffer weights_;
@ -80,7 +80,7 @@ class ConvolutionTransposed4x4 : public GPUOperation {
};

template <DataType T>
absl::Status ConvolutionTransposed4x4::UploadWeights(
Status ConvolutionTransposed4x4::UploadWeights(
    const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
@ -150,7 +150,7 @@ bool IsConvolutionTransposed4x4Supported(
    const CLDevice& device, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr);

absl::Status CreateConvolutionTransposed4x4(
Status CreateConvolutionTransposed4x4(
    const CreationContext& creation_context, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr,
    ConvolutionTransposed4x4* result);
|
@ -184,7 +184,7 @@ ConvolutionTransposedThin& ConvolutionTransposedThin::operator=(
  return *this;
}

absl::Status ConvolutionTransposedThin::Compile(
Status ConvolutionTransposedThin::Compile(
    const CreationContext& creation_context) {
  const auto code = GenerateConvolutionTransposedCode(
      definition_, IntegralDivideRoundUp(src_channels_, 4), dst_channels_,
@ -201,7 +201,7 @@ absl::Status ConvolutionTransposedThin::Compile(
      *creation_context.device, &kernel_);
}

absl::Status ConvolutionTransposedThin::BindArguments() {
Status ConvolutionTransposedThin::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_buf_.GetMemoryPtr()));
@ -210,7 +210,7 @@ absl::Status ConvolutionTransposedThin::BindArguments() {
  RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB()));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB()));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(bias_value_));
  return absl::OkStatus();
  return OkStatus();
}

int3 ConvolutionTransposedThin::GetGridSize() const {
@ -220,12 +220,12 @@ int3 ConvolutionTransposedThin::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

absl::Status ConvolutionTransposedThin::Tune(const TuningParameters& params) {
Status ConvolutionTransposedThin::Tune(const TuningParameters& params) {
  RETURN_IF_ERROR(BindArguments());
  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}

absl::Status ConvolutionTransposedThin::AddToQueue(CLCommandQueue* queue) {
Status ConvolutionTransposedThin::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
@ -238,18 +238,18 @@ bool IsConvolutionTransposedThinSupported(
      attr.padding.appended.w == 0 && attr.padding.appended.h == 0;
}

absl::Status CreateConvolutionTransposedThin(
Status CreateConvolutionTransposedThin(
    const CreationContext& creation_context, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr,
    ConvolutionTransposedThin* result) {
  if (!IsConvolutionTransposedThinSupported(*creation_context.device, attr)) {
    return absl::InvalidArgumentError(
    return InvalidArgumentError(
        "ConvolutionTransposedThin doesn't support this attributes");
  }
  *result = ConvolutionTransposedThin(definition, attr);
  RETURN_IF_ERROR(
      result->UploadWeights(attr.weights, creation_context.context));
  return absl::OkStatus();
  return OkStatus();
}

} // namespace cl
|
@ -38,10 +38,10 @@ namespace cl {
class ConvolutionTransposedThin : public GPUOperation {
 public:
  ConvolutionTransposedThin() = default;
  absl::Status AddToQueue(CLCommandQueue* queue) override;
  absl::Status Tune(const TuningParameters& params) override;
  Status AddToQueue(CLCommandQueue* queue) override;
  Status Tune(const TuningParameters& params) override;

  absl::Status Compile(const CreationContext& creation_context) override;
  Status Compile(const CreationContext& creation_context) override;

  // Move only
  ConvolutionTransposedThin(ConvolutionTransposedThin&& operation);
@ -51,21 +51,21 @@ class ConvolutionTransposedThin : public GPUOperation {
      delete;

 private:
  friend absl::Status CreateConvolutionTransposedThin(
  friend Status CreateConvolutionTransposedThin(
      const CreationContext& creation_context, const OperationDef& definition,
      const ConvolutionTransposedAttributes& attr,
      ConvolutionTransposedThin* result);
  ConvolutionTransposedThin(const OperationDef& definition,
      const ConvolutionTransposedAttributes& attr);
  template <DataType T>
  absl::Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
      CLContext* context);
  Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
      CLContext* context);

  template <DataType S, typename T>
  void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWI, S>& weights,
      absl::Span<T> dst);

  absl::Status BindArguments();
  Status BindArguments();
  int3 GetGridSize() const;

  Buffer weights_buf_;
@ -80,7 +80,7 @@ class ConvolutionTransposedThin : public GPUOperation {
};

template <DataType T>
absl::Status ConvolutionTransposedThin::UploadWeights(
Status ConvolutionTransposedThin::UploadWeights(
    const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
  const int src_depth = IntegralDivideRoundUp(src_channels_, 4);
  const int elements_count =
@ -136,7 +136,7 @@ void ConvolutionTransposedThin::RearrangeWeightsData(
bool IsConvolutionTransposedThinSupported(
    const CLDevice& device, const ConvolutionTransposedAttributes& attr);

absl::Status CreateConvolutionTransposedThin(
Status CreateConvolutionTransposedThin(
    const CreationContext& creation_context, const OperationDef& definition,
    const ConvolutionTransposedAttributes& attr,
    ConvolutionTransposedThin* result);
|
@ -226,8 +226,7 @@ DepthWiseConvolution& DepthWiseConvolution::operator=(
  return *this;
}

absl::Status DepthWiseConvolution::Compile(
    const CreationContext& creation_context) {
Status DepthWiseConvolution::Compile(const CreationContext& creation_context) {
  const bool stride_correction =
      definition_.IsBatchSupported() && stride_.x != 1;
  const auto code = GenerateDepthWiseConvolutionCode(
@ -238,7 +237,7 @@ absl::Status DepthWiseConvolution::Compile(
      *creation_context.device, &kernel_);
}

absl::Status DepthWiseConvolution::BindArguments() {
Status DepthWiseConvolution::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_));
@ -256,7 +255,7 @@ absl::Status DepthWiseConvolution::BindArguments() {
  }
  RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHSB()));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHSB()));
  return absl::OkStatus();
  return OkStatus();
}

int3 DepthWiseConvolution::GetGridSize() const {
@ -266,20 +265,20 @@ int3 DepthWiseConvolution::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

absl::Status DepthWiseConvolution::Tune(const TuningParameters& params) {
Status DepthWiseConvolution::Tune(const TuningParameters& params) {
  RETURN_IF_ERROR(BindArguments());
  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}

absl::Status DepthWiseConvolution::AddToQueue(CLCommandQueue* queue) {
Status DepthWiseConvolution::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}

absl::Status CreateDepthWiseConvolution(
    const CreationContext& creation_context, const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& attr,
    DepthWiseConvolution* result) {
Status CreateDepthWiseConvolution(const CreationContext& creation_context,
    const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& attr,
    DepthWiseConvolution* result) {
  bool weights_are_buffer = creation_context.device->IsMali();
  *result = DepthWiseConvolution(definition, attr, weights_are_buffer);
  RETURN_IF_ERROR(
@ -292,7 +291,7 @@ absl::Status CreateDepthWiseConvolution(
  create_info.aligned_size = attr.weights.shape.o * attr.weights.shape.i;
  RETURN_IF_ERROR(CreateLinearStorage(
      create_info, attr.bias, creation_context.context, &result->biases_));
  return absl::OkStatus();
  return OkStatus();
}

} // namespace cl
|
@ -38,10 +38,10 @@ namespace cl {
class DepthWiseConvolution : public GPUOperation {
 public:
  DepthWiseConvolution() = default;
  absl::Status AddToQueue(CLCommandQueue* queue) override;
  absl::Status Tune(const TuningParameters& params) override;
  Status AddToQueue(CLCommandQueue* queue) override;
  Status Tune(const TuningParameters& params) override;

  absl::Status Compile(const CreationContext& creation_context) override;
  Status Compile(const CreationContext& creation_context) override;

  // Move only
  DepthWiseConvolution(DepthWiseConvolution&& operation);
@ -50,7 +50,7 @@ class DepthWiseConvolution : public GPUOperation {
  DepthWiseConvolution& operator=(const DepthWiseConvolution&) = delete;

 private:
  friend absl::Status CreateDepthWiseConvolution(
  friend Status CreateDepthWiseConvolution(
      const CreationContext& creation_context, const OperationDef& definition,
      const DepthwiseConvolution2DAttributes& attr,
      DepthWiseConvolution* result);
@ -58,14 +58,14 @@ class DepthWiseConvolution : public GPUOperation {
      const DepthwiseConvolution2DAttributes& attr,
      bool weights_are_buffer);
  template <DataType T>
  absl::Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
      CLContext* context);
  Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
      CLContext* context);

  template <DataType S, typename T>
  void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWI, S>& weights,
      absl::Span<T> dst);

  absl::Status BindArguments();
  Status BindArguments();
  int3 GetGridSize() const;

  bool weights_are_buffer_;
@ -86,7 +86,7 @@ class DepthWiseConvolution : public GPUOperation {
};

template <DataType T>
absl::Status DepthWiseConvolution::UploadWeights(
Status DepthWiseConvolution::UploadWeights(
    const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
  const int dst_channels = weights.shape.i * weights.shape.o;
  const int dst_depth = IntegralDivideRoundUp(dst_channels, 4);
@ -130,7 +130,7 @@ absl::Status DepthWiseConvolution::UploadWeights(
    weights_ = weights_tex2d_.GetMemoryPtr();
  }

  return absl::OkStatus();
  return OkStatus();
}

template <DataType S, typename T>
@ -162,9 +162,10 @@ void DepthWiseConvolution::RearrangeWeightsData(
  }
}

absl::Status CreateDepthWiseConvolution(
    const CreationContext& creation_context, const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& attr, DepthWiseConvolution* result);
Status CreateDepthWiseConvolution(const CreationContext& creation_context,
    const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& attr,
    DepthWiseConvolution* result);

} // namespace cl
} // namespace gpu
|
@ -256,7 +256,7 @@ DepthWiseConvolution3D& DepthWiseConvolution3D::operator=(
  return *this;
}

absl::Status DepthWiseConvolution3D::Compile(
Status DepthWiseConvolution3D::Compile(
    const CreationContext& creation_context) {
  const bool stride_correction =
      definition_.IsBatchSupported() && stride_.x != 1;
@ -268,7 +268,7 @@ absl::Status DepthWiseConvolution3D::Compile(
      *creation_context.device, &kernel_);
}

absl::Status DepthWiseConvolution3D::BindArguments() {
Status DepthWiseConvolution3D::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  if (weights_are_buffer_) {
@ -295,7 +295,7 @@ absl::Status DepthWiseConvolution3D::BindArguments() {
  }
  RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHDS()));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHDS()));
  return absl::OkStatus();
  return OkStatus();
}

int3 DepthWiseConvolution3D::GetGridSize() const {
@ -305,17 +305,17 @@ int3 DepthWiseConvolution3D::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

absl::Status DepthWiseConvolution3D::Tune(const TuningParameters& params) {
Status DepthWiseConvolution3D::Tune(const TuningParameters& params) {
  RETURN_IF_ERROR(BindArguments());
  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}

absl::Status DepthWiseConvolution3D::AddToQueue(CLCommandQueue* queue) {
Status DepthWiseConvolution3D::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}

absl::Status CreateDepthWiseConvolution3D(
Status CreateDepthWiseConvolution3D(
    const CreationContext& creation_context, const OperationDef& definition,
    const DepthwiseConvolution3DAttributes& attr,
    DepthWiseConvolution3D* result) {
@ -330,7 +330,7 @@ absl::Status CreateDepthWiseConvolution3D(
  create_info.aligned_size = attr.weights.shape.o * attr.weights.shape.i;
  RETURN_IF_ERROR(CreateLinearStorage(
      create_info, attr.bias, creation_context.context, &result->biases_));
  return absl::OkStatus();
  return OkStatus();
}

} // namespace cl
|
@ -38,10 +38,10 @@ namespace cl {
class DepthWiseConvolution3D : public GPUOperation {
 public:
  DepthWiseConvolution3D() = default;
  absl::Status AddToQueue(CLCommandQueue* queue) override;
  absl::Status Tune(const TuningParameters& params) override;
  Status AddToQueue(CLCommandQueue* queue) override;
  Status Tune(const TuningParameters& params) override;

  absl::Status Compile(const CreationContext& creation_context) override;
  Status Compile(const CreationContext& creation_context) override;

  // Move only
  DepthWiseConvolution3D(DepthWiseConvolution3D&& operation);
@ -50,7 +50,7 @@ class DepthWiseConvolution3D : public GPUOperation {
  DepthWiseConvolution3D& operator=(const DepthWiseConvolution3D&) = delete;

 private:
  friend absl::Status CreateDepthWiseConvolution3D(
  friend Status CreateDepthWiseConvolution3D(
      const CreationContext& creation_context, const OperationDef& definition,
      const DepthwiseConvolution3DAttributes& attr,
      DepthWiseConvolution3D* result);
@ -58,14 +58,14 @@ class DepthWiseConvolution3D : public GPUOperation {
      const DepthwiseConvolution3DAttributes& attr,
      const CLDevice& device);
  template <DataType T>
  absl::Status UploadWeights(const ::tflite::gpu::Tensor<OHWDI, T>& weights,
      CLContext* context);
  Status UploadWeights(const ::tflite::gpu::Tensor<OHWDI, T>& weights,
      CLContext* context);

  template <DataType S, typename T>
  void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWDI, S>& weights,
      absl::Span<T> dst);

  absl::Status BindArguments();
  Status BindArguments();
  int3 GetGridSize() const;

  Texture2D weights_tex2d_;
@ -85,7 +85,7 @@ class DepthWiseConvolution3D : public GPUOperation {
};

template <DataType T>
absl::Status DepthWiseConvolution3D::UploadWeights(
Status DepthWiseConvolution3D::UploadWeights(
    const ::tflite::gpu::Tensor<OHWDI, T>& weights, CLContext* context) {
  const int dst_channels = weights.shape.i * weights.shape.o;
  const int dst_slices = IntegralDivideRoundUp(dst_channels, 4);
@ -123,7 +123,7 @@ absl::Status DepthWiseConvolution3D::UploadWeights(
          gpu_data.data(), context, &weights_tex2d_));
    }
  }
  return absl::OkStatus();
  return OkStatus();
}

template <DataType S, typename T>
@ -158,7 +158,7 @@ void DepthWiseConvolution3D::RearrangeWeightsData(
  }
}

absl::Status CreateDepthWiseConvolution3D(
Status CreateDepthWiseConvolution3D(
    const CreationContext& creation_context, const OperationDef& definition,
    const DepthwiseConvolution3DAttributes& attr,
    DepthWiseConvolution3D* result);
|
@ -297,8 +297,7 @@ DepthWiseConv3x3& DepthWiseConv3x3::operator=(DepthWiseConv3x3&& operation) {
  return *this;
}

absl::Status DepthWiseConv3x3::Compile(
    const CreationContext& creation_context) {
Status DepthWiseConv3x3::Compile(const CreationContext& creation_context) {
  std::string code = GenerateDepthWiseConvCode(
      definition_, linked_operations_, *creation_context.device,
      weights_are_buffer_, local_mem_uploads_);
@ -312,14 +311,15 @@ absl::Status DepthWiseConv3x3::Compile(
      *creation_context.device, &kernel_);
}

absl::Status DepthWiseConv3x3::BindArguments() {
Status DepthWiseConv3x3::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_));
  RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB()));
  return absl::OkStatus();

  return OkStatus();
}

int3 DepthWiseConv3x3::GetGridSize() const {
@ -329,15 +329,15 @@ int3 DepthWiseConv3x3::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

absl::Status DepthWiseConv3x3::Tune(const TuningParameters& params) {
Status DepthWiseConv3x3::Tune(const TuningParameters& params) {
  if (local_mem_uploads_) {
    return absl::OkStatus();
    return OkStatus();
  }
  RETURN_IF_ERROR(BindArguments());
  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}

absl::Status DepthWiseConv3x3::AddToQueue(CLCommandQueue* queue) {
Status DepthWiseConv3x3::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
@ -351,11 +351,12 @@ bool IsDepthWiseConv3x3Supported(const DepthwiseConvolution2DAttributes& attr) {
      attr.padding.appended.h == 1;
}

absl::Status CreateDepthWiseConv3x3(
    const CreationContext& creation_context, const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& attr, DepthWiseConv3x3* result) {
Status CreateDepthWiseConv3x3(const CreationContext& creation_context,
    const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& attr,
    DepthWiseConv3x3* result) {
  if (!IsDepthWiseConv3x3Supported(attr)) {
    return absl::InvalidArgumentError(
    return InvalidArgumentError(
        "DepthWiseConv3x3 doesn't support this attributes");
  }
  bool weights_are_buffer =
@ -363,8 +364,9 @@ absl::Status CreateDepthWiseConv3x3(
  bool local_mem_uploads =
      weights_are_buffer && creation_context.device->IsPowerVR();
  *result = DepthWiseConv3x3(definition, weights_are_buffer, local_mem_uploads);
  return result->UploadWeightsAndBiases(attr.weights, attr.bias,
      creation_context.context);
  RETURN_IF_ERROR(result->UploadWeightsAndBiases(attr.weights, attr.bias,
      creation_context.context));
  return OkStatus();
}

} // namespace cl
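Note the small shape change in CreateDepthWiseConv3x3 above: a tail `return result->UploadWeightsAndBiases(...);` becomes RETURN_IF_ERROR(...) followed by an explicit OkStatus(). For a status-returning call the two forms are behaviorally equivalent; a sketch with a hypothetical helper (not the real upload code), written without the macro so the expansion is visible:

#include "absl/status/status.h"

absl::Status Upload() { return absl::OkStatus(); }  // hypothetical stand-in

absl::Status CreateDirect() {
  return Upload();  // forwards whatever status Upload produced
}

absl::Status CreateExplicit() {
  // What RETURN_IF_ERROR(Upload()) expands to, roughly:
  const absl::Status s = Upload();
  if (!s.ok()) return s;      // early-return the failure
  return absl::OkStatus();    // reached only when Upload succeeded
}

The explicit form costs two extra lines but leaves every factory in these files ending with the same `return OkStatus();`, which is presumably why the change normalizes it.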
|
@ -38,10 +38,10 @@ namespace cl {
class DepthWiseConv3x3 : public GPUOperation {
 public:
  DepthWiseConv3x3() = default;
  absl::Status AddToQueue(CLCommandQueue* queue) override;
  absl::Status Tune(const TuningParameters& params) override;
  Status AddToQueue(CLCommandQueue* queue) override;
  Status Tune(const TuningParameters& params) override;

  absl::Status Compile(const CreationContext& creation_context) override;
  Status Compile(const CreationContext& creation_context) override;

  // Move only
  DepthWiseConv3x3(DepthWiseConv3x3&& operation);
@ -53,11 +53,11 @@ class DepthWiseConv3x3 : public GPUOperation {
  explicit DepthWiseConv3x3(const OperationDef& definition,
      bool weights_are_buffer, bool local_mem_uploads);
  template <DataType T>
  absl::Status UploadWeightsAndBiases(
      const ::tflite::gpu::Tensor<OHWI, T>& weights,
      const ::tflite::gpu::Tensor<Linear, T>& biases, CLContext* context);
  Status UploadWeightsAndBiases(const ::tflite::gpu::Tensor<OHWI, T>& weights,
      const ::tflite::gpu::Tensor<Linear, T>& biases,
      CLContext* context);

  friend absl::Status CreateDepthWiseConv3x3(
  friend Status CreateDepthWiseConv3x3(
      const CreationContext& creation_context, const OperationDef& definition,
      const DepthwiseConvolution2DAttributes& attr, DepthWiseConv3x3* result);

@ -66,7 +66,7 @@ class DepthWiseConv3x3 : public GPUOperation {
      const ::tflite::gpu::Tensor<OHWI, S>& weights,
      const ::tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst);

  absl::Status BindArguments();
  Status BindArguments();
  int3 GetGridSize() const;

  bool weights_are_buffer_;
@ -80,7 +80,7 @@ class DepthWiseConv3x3 : public GPUOperation {
};

template <DataType T>
absl::Status DepthWiseConv3x3::UploadWeightsAndBiases(
Status DepthWiseConv3x3::UploadWeightsAndBiases(
    const ::tflite::gpu::Tensor<OHWI, T>& weights,
    const ::tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
@ -122,7 +122,7 @@ absl::Status DepthWiseConv3x3::UploadWeightsAndBiases(
    weights_ = weights_tex2d_.GetMemoryPtr();
  }

  return absl::OkStatus();
  return OkStatus();
}

template <DataType S, typename T>
@ -160,9 +160,10 @@ void DepthWiseConv3x3::RearrangeWeightsAndBiasesData(

bool IsDepthWiseConv3x3Supported(const DepthwiseConvolution2DAttributes& attr);

absl::Status CreateDepthWiseConv3x3(
    const CreationContext& creation_context, const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& attr, DepthWiseConv3x3* result);
Status CreateDepthWiseConv3x3(const CreationContext& creation_context,
    const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& attr,
    DepthWiseConv3x3* result);

} // namespace cl
} // namespace gpu
|
@ -203,14 +203,14 @@ std::string ElementwiseTwoInput::GetArgsDeclaration() const {
  return args;
}

absl::Status ElementwiseTwoInput::BindArguments(CLKernel* kernel) {
Status ElementwiseTwoInput::BindArguments(CLKernel* kernel) {
  if (use_scalar_para_) {
    RETURN_IF_ERROR(kernel->SetBytesAuto(scalar_para_));
  } else {
    RETURN_IF_ERROR(kernel->SetMemoryAuto(src_[1]->GetMemoryPtr()));
    RETURN_IF_ERROR(kernel->SetBytesAuto(src_[1]->GetWBatchedHSB()));
  }
  return absl::OkStatus();
  return OkStatus();
}

ElementwiseTwoInput CreateElementwiseTwoInput(
|
@ -75,7 +75,7 @@ class ElementwiseTwoInput : public ElementwiseOperation {
  void SetLinkIndex(int index) override;
  std::string GetCoreCode(const LinkingContext& context) const override;
  std::string GetArgsDeclaration() const override;
  absl::Status BindArguments(CLKernel* kernel) override;
  Status BindArguments(CLKernel* kernel) override;
  inline void SetScalarPara(FLT scalar) {
    scalar_para_ = scalar;
    use_scalar_para_ = true;
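ElementwiseTwoInput::BindArguments in the .cc hunk above binds either a single scalar (when use_scalar_para_ has been set through SetScalarPara in this header) or a full second tensor. A reduced model of that dispatch, with stand-in types instead of the real CL bindings (everything here is illustrative):

#include "absl/status/status.h"

// Mimics the SetBytesAuto / SetMemoryAuto split on the real CLKernel.
struct FakeKernel {
  absl::Status SetBytes(float v) { value = v; return absl::OkStatus(); }
  absl::Status SetMemory(const float* p) { tensor = p; return absl::OkStatus(); }
  float value = 0.0f;
  const float* tensor = nullptr;
};

struct TwoInput {
  bool use_scalar = false;
  float scalar = 0.0f;
  const float* second_input = nullptr;

  absl::Status Bind(FakeKernel* k) {
    if (use_scalar) {
      return k->SetBytes(scalar);       // broadcast one value to the kernel
    }
    return k->SetMemory(second_input);  // otherwise bind the second tensor
  }
};

Keeping the decision inside BindArguments means the generated kernel code and the argument binding stay in sync through one flag, rather than through two separate operation classes.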
|
@ -113,7 +113,7 @@ FullyConnected& FullyConnected::operator=(FullyConnected&& kernel) {
  return *this;
}

absl::Status FullyConnected::Compile(const CreationContext& creation_context) {
Status FullyConnected::Compile(const CreationContext& creation_context) {
  int wg_width = 32;
  int wg_height = 4;
  int work_items;
@ -134,10 +134,10 @@ absl::Status FullyConnected::Compile(const CreationContext& creation_context) {
    }
    work_items = work_group_size_.x * work_group_size_.y * work_group_size_.z;
  } while (work_items > kernel_.GetMaxWorkGroupSize());
  return absl::OkStatus();
  return OkStatus();
}

absl::Status FullyConnected::AddToQueue(CLCommandQueue* queue) {
Status FullyConnected::AddToQueue(CLCommandQueue* queue) {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr()));
@ -146,14 +146,15 @@ absl::Status FullyConnected::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
  RETURN_IF_ERROR(
      kernel_.SetBytesAuto(int2(src_[0]->Slices(), dst_[0]->Slices())));

  return queue->DispatchImplicit(kernel_, {dst_[0]->Slices(), 1, 1},
      work_group_size_);
}

absl::Status CreateFullyConnected(const CreationContext& creation_context,
    const OperationDef& definition,
    const FullyConnectedAttributes& attr,
    FullyConnected* result) {
Status CreateFullyConnected(const CreationContext& creation_context,
    const OperationDef& definition,
    const FullyConnectedAttributes& attr,
    FullyConnected* result) {
  *result = FullyConnected(definition);
  RETURN_IF_ERROR(
      result->UploadWeights(attr.weights, creation_context.context));
@ -164,7 +165,7 @@ absl::Status CreateFullyConnected(const CreationContext& creation_context,
  create_info.aligned_size = attr.weights.shape.o;
  RETURN_IF_ERROR(CreateLinearStorage(
      create_info, attr.bias, creation_context.context, &result->biases_));
  return absl::OkStatus();
  return OkStatus();
}

} // namespace cl
|
@ -37,9 +37,9 @@ namespace cl {
class FullyConnected : public GPUOperation {
 public:
  FullyConnected() = default;
  absl::Status AddToQueue(CLCommandQueue* queue) override;
  Status AddToQueue(CLCommandQueue* queue) override;

  absl::Status Compile(const CreationContext& creation_context) override;
  Status Compile(const CreationContext& creation_context) override;

  // Move only
  FullyConnected(FullyConnected&& kernel);
@ -49,13 +49,14 @@ class FullyConnected : public GPUOperation {

 private:
  explicit FullyConnected(const OperationDef& definition);
  friend absl::Status CreateFullyConnected(
      const CreationContext& creation_context, const OperationDef& definition,
      const FullyConnectedAttributes& attr, FullyConnected* result);
  friend Status CreateFullyConnected(const CreationContext& creation_context,
      const OperationDef& definition,
      const FullyConnectedAttributes& attr,
      FullyConnected* result);

  template <DataType T>
  absl::Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
      CLContext* context);
  Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
      CLContext* context);

  template <DataType T, typename S>
  void RearrangeWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
@ -68,7 +69,7 @@ class FullyConnected : public GPUOperation {
};

template <DataType T>
absl::Status FullyConnected::UploadWeights(
Status FullyConnected::UploadWeights(
    const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
  const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
  const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
@ -122,10 +123,10 @@ void FullyConnected::RearrangeWeights(
  }
}

absl::Status CreateFullyConnected(const CreationContext& creation_context,
    const OperationDef& definition,
    const FullyConnectedAttributes& attr,
    FullyConnected* result);
Status CreateFullyConnected(const CreationContext& creation_context,
    const OperationDef& definition,
    const FullyConnectedAttributes& attr,
    FullyConnected* result);

} // namespace cl
} // namespace gpu
|
@ -154,7 +154,7 @@ ElementwiseOperation& ElementwiseOperation::operator=(
  return *this;
}

absl::Status ElementwiseOperation::BindArguments() {
Status ElementwiseOperation::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(BindArguments(&kernel_));
@ -162,7 +162,7 @@ absl::Status ElementwiseOperation::BindArguments() {
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHSB()));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHSB()));
  return absl::OkStatus();
  return OkStatus();
}

int3 ElementwiseOperation::GetGridSize() const {
@ -172,20 +172,19 @@ int3 ElementwiseOperation::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

absl::Status ElementwiseOperation::Compile(
    const CreationContext& creation_context) {
Status ElementwiseOperation::Compile(const CreationContext& creation_context) {
  const auto code = GetElementWiseCode(definition_, *this, linked_operations_);
  return creation_context.cache->GetOrCreateCLKernel(
      code, "main_function", *creation_context.context,
      *creation_context.device, &kernel_);
}

absl::Status ElementwiseOperation::AddToQueue(CLCommandQueue* queue) {
Status ElementwiseOperation::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}

absl::Status ElementwiseOperation::Tune(const TuningParameters& params) {
Status ElementwiseOperation::Tune(const TuningParameters& params) {
  RETURN_IF_ERROR(BindArguments());
  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
@ -210,12 +209,12 @@ std::string PostProcess(const std::vector<ElementwiseOperation*>& linked_ops,
  return code;
}

absl::Status BindArgs(CLKernel* kernel,
    const std::vector<ElementwiseOperation*>& linked_ops) {
Status BindArgs(CLKernel* kernel,
    const std::vector<ElementwiseOperation*>& linked_ops) {
  for (auto linked_op : linked_ops) {
    RETURN_IF_ERROR(linked_op->BindArguments(kernel));
  }
  return absl::OkStatus();
  return OkStatus();
}

} // namespace cl
|
@ -96,15 +96,11 @@ class GPUOperation {
  void SetSrc(Tensor* ptr, int index = 0);
  void SetDst(Tensor* ptr, int index = 0);

  virtual absl::Status AddToQueue(CLCommandQueue* queue) {
    return absl::OkStatus();
  }
  virtual absl::Status Tune(const TuningParameters& params) {
    return absl::OkStatus();
  }
  virtual Status AddToQueue(CLCommandQueue* queue) { return OkStatus(); }
  virtual Status Tune(const TuningParameters& params) { return OkStatus(); }

  virtual absl::Status Compile(const CreationContext& creation_context) {
    return absl::OkStatus();
  virtual Status Compile(const CreationContext& creation_context) {
    return OkStatus();
  }

  const OperationDef& GetDefinition() const { return definition_; }
@ -131,10 +127,10 @@ class ElementwiseOperation : public GPUOperation {
      : GPUOperation(definition) {}

  virtual ~ElementwiseOperation() {}
  absl::Status AddToQueue(CLCommandQueue* queue) override;
  absl::Status Tune(const TuningParameters& params) override;
  Status AddToQueue(CLCommandQueue* queue) override;
  Status Tune(const TuningParameters& params) override;

  absl::Status Compile(const CreationContext& creation_context) override;
  Status Compile(const CreationContext& creation_context) override;

  // Move only
  ElementwiseOperation(ElementwiseOperation&& operation);
@ -154,12 +150,10 @@ class ElementwiseOperation : public GPUOperation {

  virtual std::string GetCoreCode(const LinkingContext& context) const = 0;
  virtual std::string GetArgsDeclaration() const { return ""; }
  virtual absl::Status BindArguments(CLKernel* kernel) {
    return absl::OkStatus();
  }
  virtual Status BindArguments(CLKernel* kernel) { return OkStatus(); }

 protected:
  absl::Status BindArguments();
  Status BindArguments();
  int3 GetGridSize() const;
  CLKernel kernel_;
  int3 work_group_size_ = int3(8, 4, 1);
@ -177,8 +171,8 @@ std::string PostProcess(const std::vector<ElementwiseOperation*>& linked_ops,
// Binds arguments to given kernel for elementwise operations in
// linked_ops.
// Every ElementwiseOperation can bind its arguments.
absl::Status BindArgs(CLKernel* kernel,
    const std::vector<ElementwiseOperation*>& linked_ops);
Status BindArgs(CLKernel* kernel,
    const std::vector<ElementwiseOperation*>& linked_ops);

} // namespace cl
} // namespace gpu
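The gpu_operation.h hunk above is the heart of the interface: AddToQueue, Tune, Compile, and the elementwise BindArguments hook all default to returning OkStatus(), so an operation overrides only the stages it actually needs, and BindArgs in gpu_operation.cc can loop over linked elementwise ops and propagate the first failure. A compressed sketch of that design outside the real class hierarchy (hypothetical names, assuming only absl::Status):

#include <string>
#include "absl/status/status.h"

// Base class with no-op Status defaults: subclasses opt into specific
// stages, mirroring the GPUOperation / ElementwiseOperation split above.
class Op {
 public:
  virtual ~Op() = default;
  virtual absl::Status Compile() { return absl::OkStatus(); }  // optional stage
  virtual absl::Status Tune() { return absl::OkStatus(); }     // optional stage
};

class CompiledOp : public Op {
 public:
  absl::Status Compile() override {
    if (source_.empty()) return absl::InvalidArgumentError("no kernel source");
    compiled_ = true;  // stand-in for real OpenCL kernel compilation
    return absl::OkStatus();
  }

 private:
  std::string source_ = "kernel void main_function() {}";
  bool compiled_ = false;
};

A driver can then call op->Compile() and op->Tune() unconditionally on every operation; the defaults make the calls free for ops that have nothing to do at that stage.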
|
@ -121,14 +121,14 @@ LSTM& LSTM::operator=(LSTM&& kernel) {
  return *this;
}

absl::Status LSTM::Compile(const CreationContext& creation_context) {
Status LSTM::Compile(const CreationContext& creation_context) {
  const auto code = GetLSTMCode(definition_, *creation_context.device);
  return creation_context.cache->GetOrCreateCLKernel(
      code, "main_function", *creation_context.context,
      *creation_context.device, &kernel_);
}

absl::Status LSTM::BindArguments() {
Status LSTM::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[1]->GetMemoryPtr()));
@ -137,7 +137,8 @@ absl::Status LSTM::BindArguments() {
  RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB()));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB()));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->Batch()));
  return absl::OkStatus();

  return OkStatus();
}

int3 LSTM::GetGridSize() const {
@ -147,12 +148,12 @@ int3 LSTM::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

absl::Status LSTM::Tune(const TuningParameters& params) {
Status LSTM::Tune(const TuningParameters& params) {
  RETURN_IF_ERROR(BindArguments());
  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}

absl::Status LSTM::AddToQueue(CLCommandQueue* queue) {
Status LSTM::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
@ -28,9 +28,9 @@ namespace cl {
|
||||
class LSTM : public GPUOperation {
|
||||
public:
|
||||
explicit LSTM(const OperationDef& definition);
|
||||
absl::Status AddToQueue(CLCommandQueue* queue) override;
|
||||
absl::Status Tune(const TuningParameters& params) override;
|
||||
absl::Status Compile(const CreationContext& creation_context) override;
|
||||
Status AddToQueue(CLCommandQueue* queue) override;
|
||||
Status Tune(const TuningParameters& params) override;
|
||||
Status Compile(const CreationContext& creation_context) override;
|
||||
|
||||
// Move only
|
||||
LSTM(LSTM&& kernel);
|
||||
@ -39,7 +39,7 @@ class LSTM : public GPUOperation {
|
||||
LSTM& operator=(const LSTM&) = delete;
|
||||
|
||||
private:
|
||||
absl::Status BindArguments();
|
||||
Status BindArguments();
|
||||
int3 GetGridSize() const;
|
||||
|
||||
CLKernel kernel_;
|
||||
|
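LSTM shows the life cycle every kernel in this diff shares: Compile builds the CL program once, Tune settles on a work-group size, and AddToQueue rebinds arguments and dispatches. A hedged call-site sketch of that sequence, assuming the TFLite GPU CL headers are available; RunLstm itself is hypothetical:

```cpp
// Hypothetical wiring for the Compile/Tune/AddToQueue life cycle. The types
// (CreationContext, TuningParameters, CLCommandQueue, LSTM) come from the
// diff; RETURN_IF_ERROR is the library's early-return macro.
absl::Status RunLstm(const CreationContext& creation_context,
                     const TuningParameters& tuning_params,
                     CLCommandQueue* queue, LSTM* op) {
  RETURN_IF_ERROR(op->Compile(creation_context));  // build the CL kernel once
  RETURN_IF_ERROR(op->Tune(tuning_params));        // pick work_group_size_
  return op->AddToQueue(queue);                    // bind args and dispatch
}
```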
@ -218,7 +218,7 @@ MaxUnpooling& MaxUnpooling::operator=(MaxUnpooling&& kernel) {
  return *this;
}

-absl::Status MaxUnpooling::Compile(const CreationContext& creation_context) {
+Status MaxUnpooling::Compile(const CreationContext& creation_context) {
  const auto code = GetMaxUnpoolingKernelCode(
      definition_, *creation_context.device, linked_operations_);
  return creation_context.cache->GetOrCreateCLKernel(
@ -226,7 +226,7 @@ absl::Status MaxUnpooling::Compile(const CreationContext& creation_context) {
      *creation_context.device, &kernel_);
}

-absl::Status MaxUnpooling::BindArguments() {
+Status MaxUnpooling::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[1]->GetMemoryPtr()));
@ -237,7 +237,8 @@ absl::Status MaxUnpooling::BindArguments() {
  RETURN_IF_ERROR(kernel_.SetBytesAuto(kernel_size_));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(padding_));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_));
-  return absl::OkStatus();
+
+  return OkStatus();
}

int3 MaxUnpooling::GetGridSize() const {
@ -247,12 +248,12 @@ int3 MaxUnpooling::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

-absl::Status MaxUnpooling::Tune(const TuningParameters& params) {
+Status MaxUnpooling::Tune(const TuningParameters& params) {
  RETURN_IF_ERROR(BindArguments());
  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}

-absl::Status MaxUnpooling::AddToQueue(CLCommandQueue* queue) {
+Status MaxUnpooling::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
@ -290,7 +291,7 @@ MaxUnpooling3D& MaxUnpooling3D::operator=(MaxUnpooling3D&& kernel) {
  return *this;
}

-absl::Status MaxUnpooling3D::Compile(const CreationContext& creation_context) {
+Status MaxUnpooling3D::Compile(const CreationContext& creation_context) {
  const auto code = GetMaxUnpooling3DKernelCode(
      definition_, *creation_context.device, linked_operations_);
  return creation_context.cache->GetOrCreateCLKernel(
@ -298,7 +299,7 @@ absl::Status MaxUnpooling3D::Compile(const CreationContext& creation_context) {
      *creation_context.device, &kernel_);
}

-absl::Status MaxUnpooling3D::BindArguments() {
+Status MaxUnpooling3D::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[1]->GetMemoryPtr()));
@ -315,7 +316,8 @@ absl::Status MaxUnpooling3D::BindArguments() {
      kernel_.SetBytesAuto(int4(padding_.x, padding_.y, padding_.z, 1)));
  RETURN_IF_ERROR(
      kernel_.SetBytesAuto(int4(stride_.x, stride_.y, stride_.z, 1)));
-  return absl::OkStatus();
+
+  return OkStatus();
}

int3 MaxUnpooling3D::GetGridSize() const {
@ -325,12 +327,12 @@ int3 MaxUnpooling3D::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

-absl::Status MaxUnpooling3D::Tune(const TuningParameters& params) {
+Status MaxUnpooling3D::Tune(const TuningParameters& params) {
  RETURN_IF_ERROR(BindArguments());
  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}

-absl::Status MaxUnpooling3D::AddToQueue(CLCommandQueue* queue) {
+Status MaxUnpooling3D::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}

@ -29,10 +29,10 @@ class MaxUnpooling : public GPUOperation {
 public:
  MaxUnpooling(const OperationDef& definition,
               const MaxUnpooling2DAttributes& attr);
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
+  Status AddToQueue(CLCommandQueue* queue) override;
+  Status Tune(const TuningParameters& params) override;

-  absl::Status Compile(const CreationContext& creation_context) override;
+  Status Compile(const CreationContext& creation_context) override;

  // Move only
  MaxUnpooling(MaxUnpooling&& kernel);
@ -41,7 +41,7 @@ class MaxUnpooling : public GPUOperation {
  MaxUnpooling& operator=(const MaxUnpooling&) = delete;

 private:
-  absl::Status BindArguments();
+  Status BindArguments();
  int3 GetGridSize() const;

  int2 stride_;
@ -59,10 +59,10 @@ class MaxUnpooling3D : public GPUOperation {
 public:
  MaxUnpooling3D(const OperationDef& definition,
                 const MaxUnpooling3DAttributes& attr);
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
+  Status AddToQueue(CLCommandQueue* queue) override;
+  Status Tune(const TuningParameters& params) override;

-  absl::Status Compile(const CreationContext& creation_context) override;
+  Status Compile(const CreationContext& creation_context) override;

  // Move only
  MaxUnpooling3D(MaxUnpooling3D&& kernel);
@ -71,7 +71,7 @@ class MaxUnpooling3D : public GPUOperation {
  MaxUnpooling3D& operator=(const MaxUnpooling3D&) = delete;

 private:
-  absl::Status BindArguments();
+  Status BindArguments();
  int3 GetGridSize() const;

  int3 stride_;
@ -103,7 +103,7 @@ Mean& Mean::operator=(Mean&& operation) {
  return *this;
}

-absl::Status Mean::Compile(const CreationContext& creation_context) {
+Status Mean::Compile(const CreationContext& creation_context) {
  if (creation_context.device->IsAdreno3xx()) {
    work_group_size_ = int3(16, 8, 1);
  }
@ -114,7 +114,7 @@ absl::Status Mean::Compile(const CreationContext& creation_context) {
      *creation_context.device, &kernel_);
}

-absl::Status Mean::BindArguments() {
+Status Mean::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
@ -124,7 +124,7 @@ absl::Status Mean::BindArguments() {
  const double size_0 = work_group_size_.x * work_group_size_.y;
  const double size_1 = total_size / size_0;
  RETURN_IF_ERROR(kernel_.SetBytesAuto(float2(1.0 / size_1, 1.0 / size_0)));
-  return absl::OkStatus();
+  return OkStatus();
}

int3 Mean::GetGridSize() const {
@ -134,7 +134,7 @@ int3 Mean::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

-absl::Status Mean::AddToQueue(CLCommandQueue* queue) {
+Status Mean::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}

@ -30,9 +30,9 @@ class Mean : public GPUOperation {
 public:
  Mean() = default;
  explicit Mean(const OperationDef& definition) : GPUOperation(definition) {}
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
+  Status AddToQueue(CLCommandQueue* queue) override;

-  absl::Status Compile(const CreationContext& creation_context) override;
+  Status Compile(const CreationContext& creation_context) override;

  // Move only
  Mean(Mean&& operation);
@ -41,7 +41,7 @@ class Mean : public GPUOperation {
  Mean& operator=(const Mean&) = delete;

 private:
-  absl::Status BindArguments();
+  Status BindArguments();
  int3 GetGridSize() const;
  CLKernel kernel_;
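Mean::BindArguments uploads two inverse scale factors rather than one: the kernel averages within a work group first (scaled by 1/size_0) and across the partial sums second (scaled by 1/size_1), and the factors compose back to 1/total_size. A standalone check of that arithmetic, with illustrative sizes:

```cpp
#include <cassert>

int main() {
  const double total_size = 1024.0;           // e.g. W * H of the input slice
  const double size_0 = 16.0 * 8.0;           // work_group_size_.x * .y
  const double size_1 = total_size / size_0;  // elements per work item
  // Two-stage scaling is equivalent to dividing by the element count once.
  assert((1.0 / size_0) * (1.0 / size_1) == 1.0 / total_size);
  return 0;
}
```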
@ -89,7 +89,7 @@ std::string MultiplyAdd::GetArgsDeclaration() const {
  return args;
}

-absl::Status MultiplyAdd::BindArguments(CLKernel* kernel) {
+Status MultiplyAdd::BindArguments(CLKernel* kernel) {
  if (use_mul_vec_) {
    RETURN_IF_ERROR(kernel->SetMemoryAuto(mul_vec_.GetMemoryPtr()));
  }
@ -102,12 +102,12 @@ absl::Status MultiplyAdd::BindArguments(CLKernel* kernel) {
  if (scalar_add_.Active()) {
    RETURN_IF_ERROR(kernel->SetBytesAuto(scalar_add_));
  }
-  return absl::OkStatus();
+  return OkStatus();
}

-absl::Status MultiplyAdd::UploadMul(const MultiplyAttributes& attr,
-                                    CalculationsPrecision scalar_precision,
-                                    CLContext* context) {
+Status MultiplyAdd::UploadMul(const MultiplyAttributes& attr,
+                              CalculationsPrecision scalar_precision,
+                              CLContext* context) {
  auto mul = absl::get_if<::tflite::gpu::Tensor<Linear, DataType::FLOAT32>>(
      &attr.param);
  auto mul_scalar = absl::get_if<float>(&attr.param);
@ -116,12 +116,12 @@ absl::Status MultiplyAdd::UploadMul(const MultiplyAttributes& attr,
  } else {
    scalar_mul_ = FLT(scalar_precision, *mul_scalar);
  }
-  return absl::OkStatus();
+  return OkStatus();
}

-absl::Status MultiplyAdd::UploadAdd(const AddAttributes& attr,
-                                    CalculationsPrecision scalar_precision,
-                                    CLContext* context) {
+Status MultiplyAdd::UploadAdd(const AddAttributes& attr,
+                              CalculationsPrecision scalar_precision,
+                              CLContext* context) {
  auto add = absl::get_if<::tflite::gpu::Tensor<Linear, DataType::FLOAT32>>(
      &attr.param);
  auto add_scalar = absl::get_if<float>(&attr.param);
@ -130,13 +130,12 @@ absl::Status MultiplyAdd::UploadAdd(const AddAttributes& attr,
  } else {
    scalar_add_ = FLT(scalar_precision, *add_scalar);
  }
-  return absl::OkStatus();
+  return OkStatus();
}

-absl::Status CreateMultiplyAdd(const CreationContext& creation_context,
-                               const OperationDef& definition,
-                               const MultiplyAttributes& attr,
-                               MultiplyAdd* result) {
+Status CreateMultiplyAdd(const CreationContext& creation_context,
+                         const OperationDef& definition,
+                         const MultiplyAttributes& attr, MultiplyAdd* result) {
  const auto scalar_precision = creation_context.device->IsPowerVR()
                                    ? CalculationsPrecision::F32
                                    : definition.precision;
@ -144,12 +143,12 @@ absl::Status CreateMultiplyAdd(const CreationContext& creation_context,
  RETURN_IF_ERROR(
      result->UploadMul(attr, scalar_precision, creation_context.context));
  result->SetLinkIndex(0);
-  return absl::OkStatus();
+  return OkStatus();
}

-absl::Status CreateMultiplyAdd(const CreationContext& creation_context,
-                               const OperationDef& definition,
-                               const AddAttributes& attr, MultiplyAdd* result) {
+Status CreateMultiplyAdd(const CreationContext& creation_context,
+                         const OperationDef& definition,
+                         const AddAttributes& attr, MultiplyAdd* result) {
  const auto scalar_precision = creation_context.device->IsPowerVR()
                                    ? CalculationsPrecision::F32
                                    : definition.precision;
@ -157,14 +156,13 @@ absl::Status CreateMultiplyAdd(const CreationContext& creation_context,
  RETURN_IF_ERROR(
      result->UploadAdd(attr, scalar_precision, creation_context.context));
  result->SetLinkIndex(0);
-  return absl::OkStatus();
+  return OkStatus();
}

-absl::Status CreateMultiplyAdd(const CreationContext& creation_context,
-                               const OperationDef& definition,
-                               const MultiplyAttributes& mul_attr,
-                               const AddAttributes& add_attr,
-                               MultiplyAdd* result) {
+Status CreateMultiplyAdd(const CreationContext& creation_context,
+                         const OperationDef& definition,
+                         const MultiplyAttributes& mul_attr,
+                         const AddAttributes& add_attr, MultiplyAdd* result) {
  const auto scalar_precision = creation_context.device->IsPowerVR()
                                    ? CalculationsPrecision::F32
                                    : definition.precision;
@ -174,7 +172,7 @@ absl::Status CreateMultiplyAdd(const CreationContext& creation_context,
  RETURN_IF_ERROR(
      result->UploadAdd(add_attr, scalar_precision, creation_context.context));
  result->SetLinkIndex(0);
-  return absl::OkStatus();
+  return OkStatus();
}

}  // namespace cl
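UploadMul and UploadAdd dispatch on attr.param, which holds either a per-channel tensor or a single scalar; absl::get_if returns a pointer only for the alternative that is actually stored. A simplified sketch of the same dispatch, with std::variant and stand-in types in place of the TFLite ones:

```cpp
#include <iostream>
#include <variant>

struct LinearTensor {};  // stand-in for tflite::gpu::Tensor<Linear, FLOAT32>
using Param = std::variant<LinearTensor, float>;

void Upload(const Param& param) {
  if (std::get_if<LinearTensor>(&param) != nullptr) {
    std::cout << "upload per-channel tensor to GPU linear storage\n";
  } else if (const float* scalar = std::get_if<float>(&param)) {
    std::cout << "bind scalar " << *scalar << " directly\n";
  }
}

int main() {
  Upload(Param{0.5f});            // scalar path -> FLT(scalar_precision, ...)
  Upload(Param{LinearTensor{}});  // tensor path -> CreateLinearStorage(...)
}
```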
@ -40,42 +40,40 @@ class MultiplyAdd : public ElementwiseOperation {
  MultiplyAdd(const MultiplyAdd&) = delete;
  MultiplyAdd& operator=(const MultiplyAdd&) = delete;

-  absl::Status UploadMul(const MultiplyAttributes& attr,
-                         CalculationsPrecision scalar_precision,
-                         CLContext* context);
-  absl::Status UploadAdd(const AddAttributes& attr,
-                         CalculationsPrecision scalar_precision,
-                         CLContext* context);
+  Status UploadMul(const MultiplyAttributes& attr,
+                   CalculationsPrecision scalar_precision, CLContext* context);
+  Status UploadAdd(const AddAttributes& attr,
+                   CalculationsPrecision scalar_precision, CLContext* context);

  template <DataType T>
-  absl::Status UploadMul(const ::tflite::gpu::Tensor<Linear, T>& mul,
-                         CLContext* context);
+  Status UploadMul(const ::tflite::gpu::Tensor<Linear, T>& mul,
+                   CLContext* context);

  template <DataType T>
-  absl::Status UploadAdd(const ::tflite::gpu::Tensor<Linear, T>& add,
-                         CLContext* context);
+  Status UploadAdd(const ::tflite::gpu::Tensor<Linear, T>& add,
+                   CLContext* context);

  void SetLinkIndex(int index) override;
  std::string GetCoreCode(const LinkingContext& context) const override;

  std::string GetArgsDeclaration() const override;
-  absl::Status BindArguments(CLKernel* kernel) override;
+  Status BindArguments(CLKernel* kernel) override;

-  friend absl::Status CreateMultiplyAdd(const CreationContext& creation_context,
-                                        const OperationDef& definition,
-                                        const MultiplyAttributes& attr,
-                                        MultiplyAdd* result);
+  friend Status CreateMultiplyAdd(const CreationContext& creation_context,
+                                  const OperationDef& definition,
+                                  const MultiplyAttributes& attr,
+                                  MultiplyAdd* result);

-  friend absl::Status CreateMultiplyAdd(const CreationContext& creation_context,
-                                        const OperationDef& definition,
-                                        const AddAttributes& attr,
-                                        MultiplyAdd* result);
+  friend Status CreateMultiplyAdd(const CreationContext& creation_context,
+                                  const OperationDef& definition,
+                                  const AddAttributes& attr,
+                                  MultiplyAdd* result);

-  friend absl::Status CreateMultiplyAdd(const CreationContext& creation_context,
-                                        const OperationDef& definition,
-                                        const MultiplyAttributes& mul_attr,
-                                        const AddAttributes& add_attr,
-                                        MultiplyAdd* result);
+  friend Status CreateMultiplyAdd(const CreationContext& creation_context,
+                                  const OperationDef& definition,
+                                  const MultiplyAttributes& mul_attr,
+                                  const AddAttributes& add_attr,
+                                  MultiplyAdd* result);

 private:
  explicit MultiplyAdd(const OperationDef& definition)
@ -91,43 +89,41 @@ class MultiplyAdd : public ElementwiseOperation {
  FLT scalar_add_;
};

-absl::Status CreateMultiplyAdd(const CreationContext& creation_context,
-                               const OperationDef& definition,
-                               const MultiplyAttributes& attr,
-                               MultiplyAdd* result);
+Status CreateMultiplyAdd(const CreationContext& creation_context,
+                         const OperationDef& definition,
+                         const MultiplyAttributes& attr, MultiplyAdd* result);

-absl::Status CreateMultiplyAdd(const CreationContext& creation_context,
-                               const OperationDef& definition,
-                               const AddAttributes& attr, MultiplyAdd* result);
+Status CreateMultiplyAdd(const CreationContext& creation_context,
+                         const OperationDef& definition,
+                         const AddAttributes& attr, MultiplyAdd* result);

-absl::Status CreateMultiplyAdd(const CreationContext& creation_context,
-                               const OperationDef& definition,
-                               const MultiplyAttributes& mul_attr,
-                               const AddAttributes& add_attr,
-                               MultiplyAdd* result);
+Status CreateMultiplyAdd(const CreationContext& creation_context,
+                         const OperationDef& definition,
+                         const MultiplyAttributes& mul_attr,
+                         const AddAttributes& add_attr, MultiplyAdd* result);

template <DataType T>
-absl::Status MultiplyAdd::UploadMul(const ::tflite::gpu::Tensor<Linear, T>& mul,
-                                    CLContext* context) {
+Status MultiplyAdd::UploadMul(const ::tflite::gpu::Tensor<Linear, T>& mul,
+                              CLContext* context) {
  LinearStorageCreateInfo create_info;
  create_info.storage_type =
      DeduceLinearStorageType(definition_.GetPrimaryStorageType());
  create_info.data_type = definition_.GetDataType();
  RETURN_IF_ERROR(CreateLinearStorage(create_info, mul, context, &mul_vec_));
  use_mul_vec_ = true;
-  return absl::OkStatus();
+  return OkStatus();
}

template <DataType T>
-absl::Status MultiplyAdd::UploadAdd(const ::tflite::gpu::Tensor<Linear, T>& add,
-                                    CLContext* context) {
+Status MultiplyAdd::UploadAdd(const ::tflite::gpu::Tensor<Linear, T>& add,
+                              CLContext* context) {
  LinearStorageCreateInfo create_info;
  create_info.storage_type =
      DeduceLinearStorageType(definition_.GetPrimaryStorageType());
  create_info.data_type = definition_.GetDataType();
  RETURN_IF_ERROR(CreateLinearStorage(create_info, add, context, &add_vec_));
  use_add_vec_ = true;
-  return absl::OkStatus();
+  return OkStatus();
}

}  // namespace cl
@ -169,7 +169,7 @@ Padding& Padding::operator=(Padding&& kernel) {
  return *this;
}

-absl::Status Padding::Compile(const CreationContext& creation_context) {
+Status Padding::Compile(const CreationContext& creation_context) {
  const auto code =
      GetPaddingCode(definition_, linked_operations_, attributes_);
  return creation_context.cache->GetOrCreateCLKernel(
@ -177,7 +177,7 @@ absl::Status Padding::Compile(const CreationContext& creation_context) {
      *creation_context.device, &kernel_);
}

-absl::Status Padding::BindArguments() {
+Status Padding::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
@ -187,7 +187,7 @@ absl::Status Padding::BindArguments() {
  RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB()));
  const auto& prep = attributes_.prepended;
  RETURN_IF_ERROR(kernel_.SetBytesAuto(int4(prep.w, prep.h, prep.c, prep.b)));
-  return absl::OkStatus();
+  return OkStatus();
}

int3 Padding::GetGridSize() const {
@ -197,12 +197,12 @@ int3 Padding::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

-absl::Status Padding::Tune(const TuningParameters& params) {
+Status Padding::Tune(const TuningParameters& params) {
  RETURN_IF_ERROR(BindArguments());
  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}

-absl::Status Padding::AddToQueue(CLCommandQueue* queue) {
+Status Padding::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}

@ -28,10 +28,10 @@ namespace cl {
class Padding : public GPUOperation {
 public:
  Padding(const OperationDef& definition, const PadAttributes& attr);
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
+  Status AddToQueue(CLCommandQueue* queue) override;
+  Status Tune(const TuningParameters& params) override;

-  absl::Status Compile(const CreationContext& creation_context) override;
+  Status Compile(const CreationContext& creation_context) override;

  // Move only
  Padding(Padding&& kernel);
@ -40,7 +40,7 @@ class Padding : public GPUOperation {
  Padding& operator=(const Padding&) = delete;

 private:
-  absl::Status BindArguments();
+  Status BindArguments();
  int3 GetGridSize() const;

  PadAttributes attributes_;
@ -408,7 +408,7 @@ Pooling& Pooling::operator=(Pooling&& kernel) {
  return *this;
}

-absl::Status Pooling::Compile(const CreationContext& creation_context) {
+Status Pooling::Compile(const CreationContext& creation_context) {
  std::string code;
  const bool stride_correction =
      definition_.IsBatchSupported() && stride_.x != 1;
@ -423,7 +423,7 @@ absl::Status Pooling::Compile(const CreationContext& creation_context) {
                                   linked_operations_, output_indices_);
      break;
    default:
-      return absl::InvalidArgumentError(
+      return InvalidArgumentError(
          "You should create another kernel with this params");
      break;
  }
@ -432,7 +432,7 @@ absl::Status Pooling::Compile(const CreationContext& creation_context) {
      *creation_context.device, &kernel_);
}

-absl::Status Pooling::BindArguments() {
+Status Pooling::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
@ -447,7 +447,7 @@ absl::Status Pooling::BindArguments() {
      kernel_.SetBytesAuto(int2(padding_.x * src_[0]->Batch(), padding_.y)));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_));

-  return absl::OkStatus();
+  return OkStatus();
}

int3 Pooling::GetGridSize() const {
@ -457,12 +457,12 @@ int3 Pooling::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

-absl::Status Pooling::Tune(const TuningParameters& params) {
+Status Pooling::Tune(const TuningParameters& params) {
  RETURN_IF_ERROR(BindArguments());
  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}

-absl::Status Pooling::AddToQueue(CLCommandQueue* queue) {
+Status Pooling::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
@ -506,7 +506,7 @@ Pooling3D& Pooling3D::operator=(Pooling3D&& kernel) {
  return *this;
}

-absl::Status Pooling3D::Compile(const CreationContext& creation_context) {
+Status Pooling3D::Compile(const CreationContext& creation_context) {
  std::string code;
  const bool stride_correction =
      definition_.IsBatchSupported() && stride_.x != 1;
@ -521,7 +521,7 @@ absl::Status Pooling3D::Compile(const CreationContext& creation_context) {
                                     linked_operations_, output_indices_);
      break;
    default:
-      return absl::InvalidArgumentError(
+      return InvalidArgumentError(
          "You should create another kernel with this params");
      break;
  }
@ -530,7 +530,7 @@ absl::Status Pooling3D::Compile(const CreationContext& creation_context) {
      *creation_context.device, &kernel_);
}

-absl::Status Pooling3D::BindArguments() {
+Status Pooling3D::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
@ -550,7 +550,7 @@ absl::Status Pooling3D::BindArguments() {
  RETURN_IF_ERROR(
      kernel_.SetBytesAuto(int4(stride_.x, stride_.y, stride_.z, 1)));

-  return absl::OkStatus();
+  return OkStatus();
}

int3 Pooling3D::GetGridSize() const {
@ -560,12 +560,12 @@ int3 Pooling3D::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

-absl::Status Pooling3D::Tune(const TuningParameters& params) {
+Status Pooling3D::Tune(const TuningParameters& params) {
  RETURN_IF_ERROR(BindArguments());
  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}

-absl::Status Pooling3D::AddToQueue(CLCommandQueue* queue) {
+Status Pooling3D::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}

@ -30,10 +30,10 @@ namespace cl {
class Pooling : public GPUOperation {
 public:
  Pooling(const OperationDef& definition, const Pooling2DAttributes& attr);
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
+  Status AddToQueue(CLCommandQueue* queue) override;
+  Status Tune(const TuningParameters& params) override;

-  absl::Status Compile(const CreationContext& creation_context) override;
+  Status Compile(const CreationContext& creation_context) override;

  // Move only
  Pooling(Pooling&& kernel);
@ -42,7 +42,7 @@ class Pooling : public GPUOperation {
  Pooling& operator=(const Pooling&) = delete;

 private:
-  absl::Status BindArguments();
+  Status BindArguments();
  int3 GetGridSize() const;

  int2 stride_;
@ -62,10 +62,10 @@ Pooling CreatePooling(const OperationDef& definition,
class Pooling3D : public GPUOperation {
 public:
  Pooling3D(const OperationDef& definition, const Pooling3DAttributes& attr);
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
+  Status AddToQueue(CLCommandQueue* queue) override;
+  Status Tune(const TuningParameters& params) override;

-  absl::Status Compile(const CreationContext& creation_context) override;
+  Status Compile(const CreationContext& creation_context) override;

  // Move only
  Pooling3D(Pooling3D&& kernel);
@ -74,7 +74,7 @@ class Pooling3D : public GPUOperation {
  Pooling3D& operator=(const Pooling3D&) = delete;

 private:
-  absl::Status BindArguments();
+  Status BindArguments();
  int3 GetGridSize() const;

  int3 stride_;
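Pooling::Compile is one of the few places in this diff where a status carries a real error rather than OK: an unhandled pooling type falls through to InvalidArgumentError instead of crashing at kernel-generation time. A reduced sketch of that pattern, with a stand-in enum and inlined kernel strings:

```cpp
#include <string>

#include "absl/status/status.h"

enum class PoolingType { MAX, AVERAGE };  // stand-in for the TFLite enum

absl::Status SelectPoolingCode(PoolingType type, std::string* code) {
  switch (type) {
    case PoolingType::MAX:
      *code = "/* max pooling kernel source */";
      return absl::OkStatus();
    case PoolingType::AVERAGE:
      *code = "/* average pooling kernel source */";
      return absl::OkStatus();
    default:
      // Mirrors the diff's wording; surfaces as kInvalidArgument at the caller.
      return absl::InvalidArgumentError(
          "You should create another kernel with this params");
  }
}
```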
@ -73,21 +73,21 @@ std::string PReLU::GetArgsDeclaration() const {
  return args;
}

-absl::Status PReLU::BindArguments(CLKernel* kernel) {
+Status PReLU::BindArguments(CLKernel* kernel) {
  RETURN_IF_ERROR(kernel->SetMemoryAuto(alpha_.GetMemoryPtr()));
  if (clip_.Active()) {
    RETURN_IF_ERROR(kernel->SetBytesAuto(clip_));
  }
-  return absl::OkStatus();
+  return OkStatus();
}

-absl::Status CreatePReLU(const CreationContext& creation_context,
-                         const OperationDef& definition,
-                         const PReLUAttributes& attr, PReLU* result) {
+Status CreatePReLU(const CreationContext& creation_context,
+                   const OperationDef& definition, const PReLUAttributes& attr,
+                   PReLU* result) {
  auto alpha = absl::get_if<::tflite::gpu::Tensor<Linear, DataType::FLOAT32>>(
      &attr.alpha);
  if (!alpha) {
-    return absl::InvalidArgumentError("Alpha is missing");
+    return InvalidArgumentError("Alpha is missing");
  }
  const auto scalar_precision = creation_context.device->IsPowerVR()
                                    ? CalculationsPrecision::F32
@ -95,7 +95,7 @@ absl::Status CreatePReLU(const CreationContext& creation_context,
  *result = PReLU(definition, attr, scalar_precision);
  RETURN_IF_ERROR(result->UploadParameters(*alpha, creation_context.context));
  result->SetLinkIndex(0);
-  return absl::OkStatus();
+  return OkStatus();
}

}  // namespace cl

@ -44,30 +44,30 @@ class PReLU : public ElementwiseOperation {
  void SetLinkIndex(int index) override;
  std::string GetCoreCode(const LinkingContext& context) const override;
  std::string GetArgsDeclaration() const override;
-  absl::Status BindArguments(CLKernel* kernel) override;
+  Status BindArguments(CLKernel* kernel) override;

-  friend absl::Status CreatePReLU(const CreationContext& creation_context,
-                                  const OperationDef& definition,
-                                  const PReLUAttributes& attr, PReLU* result);
+  friend Status CreatePReLU(const CreationContext& creation_context,
+                            const OperationDef& definition,
+                            const PReLUAttributes& attr, PReLU* result);

 private:
  PReLU(const OperationDef& definition, const PReLUAttributes& attr,
        CalculationsPrecision scalar_precision);

  template <DataType T>
-  absl::Status UploadParameters(
-      const ::tflite::gpu::Tensor<Linear, T>& parameters, CLContext* context);
+  Status UploadParameters(const ::tflite::gpu::Tensor<Linear, T>& parameters,
+                          CLContext* context);

  FLT clip_;
  LinearStorage alpha_;
};

-absl::Status CreatePReLU(const CreationContext& creation_context,
-                         const OperationDef& definition,
-                         const PReLUAttributes& attr, PReLU* result);
+Status CreatePReLU(const CreationContext& creation_context,
+                   const OperationDef& definition, const PReLUAttributes& attr,
+                   PReLU* result);

template <DataType T>
-absl::Status PReLU::UploadParameters(
+Status PReLU::UploadParameters(
    const ::tflite::gpu::Tensor<Linear, T>& parameters, CLContext* context) {
  LinearStorageCreateInfo create_info;
  create_info.storage_type =
@ -75,7 +75,7 @@ absl::Status PReLU::UploadParameters(
  create_info.data_type = definition_.GetPrimaryDataType();
  RETURN_IF_ERROR(
      CreateLinearStorage(create_info, parameters, context, &alpha_));
-  return absl::OkStatus();
+  return OkStatus();
}

}  // namespace cl
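CreatePReLU is representative of the factory functions here: validate the attribute variant, choose scalar precision per vendor, construct the operation, upload parameters, set the link index. A hedged sketch of a call site; BuildPReLU is hypothetical, and only the CreatePReLU call and the status check come from the API in this diff:

```cpp
// Assumes the TFLite GPU CL headers; creation_context, definition and attr
// are prepared elsewhere.
absl::Status BuildPReLU(const CreationContext& creation_context,
                        const OperationDef& definition,
                        const PReLUAttributes& attr, PReLU* op) {
  absl::Status status = CreatePReLU(creation_context, definition, attr, op);
  if (!status.ok()) {
    // e.g. attr.alpha held a scalar instead of a per-channel tensor:
    // "Alpha is missing" arrives here as an invalid-argument status.
    return status;
  }
  return absl::OkStatus();
}
```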
@ -92,17 +92,17 @@ std::string QuantizeAndDequantize::GetArgsDeclaration() const {
                         scale_.GetDeclaration());
}

-absl::Status QuantizeAndDequantize::BindArguments(CLKernel* kernel) {
+Status QuantizeAndDequantize::BindArguments(CLKernel* kernel) {
  RETURN_IF_ERROR(kernel->SetBytesAuto(min_));
  RETURN_IF_ERROR(kernel->SetBytesAuto(max_));
  RETURN_IF_ERROR(kernel->SetBytesAuto(scale_));
-  return absl::OkStatus();
+  return OkStatus();
}

-absl::Status CreateQuantizeAndDequantize(
-    const CreationContext& creation_context, const OperationDef& definition,
-    const QuantizeAndDequantizeAttributes& attr,
-    QuantizeAndDequantize* result) {
+Status CreateQuantizeAndDequantize(const CreationContext& creation_context,
+                                   const OperationDef& definition,
+                                   const QuantizeAndDequantizeAttributes& attr,
+                                   QuantizeAndDequantize* result) {
  const auto scalar_precision = creation_context.device->IsPowerVR()
                                    ? CalculationsPrecision::F32
                                    : definition.precision;
@ -120,7 +120,7 @@ absl::Status CreateQuantizeAndDequantize(
    *result = QuantizeAndDequantize(definition, attr, scalar_precision);
  }
  result->SetLinkIndex(0);
-  return absl::OkStatus();
+  return OkStatus();
}

}  // namespace cl

@ -57,9 +57,9 @@ class QuantizeAndDequantize : public ElementwiseOperation {
  void SetLinkIndex(int index) override;
  std::string GetCoreCode(const LinkingContext& context) const override;
  std::string GetArgsDeclaration() const override;
-  absl::Status BindArguments(CLKernel* kernel) override;
+  Status BindArguments(CLKernel* kernel) override;

-  friend absl::Status CreateQuantizeAndDequantize(
+  friend Status CreateQuantizeAndDequantize(
      const CreationContext& creation_context, const OperationDef& definition,
      const QuantizeAndDequantizeAttributes& attr,
      QuantizeAndDequantize* result);
@ -70,26 +70,27 @@ class QuantizeAndDequantize : public ElementwiseOperation {
        CalculationsPrecision scalar_precision);

  template <DataType T>
-  absl::Status UploadParameters(
-      const ::tflite::gpu::Tensor<Linear, T>& parameters, CLContext* context);
+  Status UploadParameters(const ::tflite::gpu::Tensor<Linear, T>& parameters,
+                          CLContext* context);

  FLT min_;
  FLT max_;
  FLT scale_;
};

-absl::Status CreateQuantizeAndDequantize(
-    const CreationContext& creation_context, const OperationDef& definition,
-    const QuantizeAndDequantizeAttributes& attr, QuantizeAndDequantize* result);
+Status CreateQuantizeAndDequantize(const CreationContext& creation_context,
+                                   const OperationDef& definition,
+                                   const QuantizeAndDequantizeAttributes& attr,
+                                   QuantizeAndDequantize* result);

template <DataType T>
-absl::Status QuantizeAndDequantize::UploadParameters(
+Status QuantizeAndDequantize::UploadParameters(
    const ::tflite::gpu::Tensor<Linear, T>& parameters, CLContext* context) {
  LinearStorageCreateInfo create_info;
  create_info.storage_type =
      DeduceLinearStorageType(definition_.GetPrimaryStorageType());
  create_info.data_type = definition_.GetPrimaryDataType();
-  return absl::OkStatus();
+  return OkStatus();
}

}  // namespace cl
@ -80,14 +80,14 @@ std::string ReLU::GetArgsDeclaration() const {
  return args;
}

-absl::Status ReLU::BindArguments(CLKernel* kernel) {
+Status ReLU::BindArguments(CLKernel* kernel) {
  if (alpha_.Active()) {
    RETURN_IF_ERROR(kernel->SetBytesAuto(alpha_));
  }
  if (clip_.Active()) {
    RETURN_IF_ERROR(kernel->SetBytesAuto(clip_));
  }
-  return absl::OkStatus();
+  return OkStatus();
}

ReLU CreateReLU(const CreationContext& creation_context,

@ -37,7 +37,7 @@ class ReLU : public ElementwiseOperation {
  void SetLinkIndex(int index) override;
  std::string GetCoreCode(const LinkingContext& context) const override;
  std::string GetArgsDeclaration() const override;
-  absl::Status BindArguments(CLKernel* kernel) override;
+  Status BindArguments(CLKernel* kernel) override;

  friend ReLU CreateReLU(const CreationContext& creation_context,
                         const OperationDef& definition,
@ -156,7 +156,7 @@ Reshape& Reshape::operator=(Reshape&& operation) {
  return *this;
}

-absl::Status Reshape::Compile(const CreationContext& creation_context) {
+Status Reshape::Compile(const CreationContext& creation_context) {
  const auto code = definition_.IsBatchSupported()
                        ? GetReshapeBatchedCode(definition_, linked_operations_)
                        : GetReshapeCode(definition_, linked_operations_);
@ -165,7 +165,7 @@ absl::Status Reshape::Compile(const CreationContext& creation_context) {
      *creation_context.device, &kernel_);
}

-absl::Status Reshape::BindArguments() {
+Status Reshape::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
@ -174,7 +174,8 @@ absl::Status Reshape::BindArguments() {
  RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB()));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->Channels()));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->Channels()));
-  return absl::OkStatus();
+
+  return OkStatus();
}

int3 Reshape::GetGridSize() const {
@ -184,12 +185,12 @@ int3 Reshape::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

-absl::Status Reshape::Tune(const TuningParameters& params) {
+Status Reshape::Tune(const TuningParameters& params) {
  RETURN_IF_ERROR(BindArguments());
  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}

-absl::Status Reshape::AddToQueue(CLCommandQueue* queue) {
+Status Reshape::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}

@ -29,10 +29,10 @@ class Reshape : public GPUOperation {
 public:
  explicit Reshape(const OperationDef& definition)
      : GPUOperation(definition), work_group_size_(8, 4, 1) {}
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
+  Status AddToQueue(CLCommandQueue* queue) override;
+  Status Tune(const TuningParameters& params) override;

-  absl::Status Compile(const CreationContext& creation_context) override;
+  Status Compile(const CreationContext& creation_context) override;

  // Move only
  Reshape(Reshape&& operation);
@ -41,7 +41,7 @@ class Reshape : public GPUOperation {
  Reshape& operator=(const Reshape&) = delete;

 private:
-  absl::Status BindArguments();
+  Status BindArguments();
  int3 GetGridSize() const;

  CLKernel kernel_;

@ -120,7 +120,7 @@ Reshapex4& Reshapex4::operator=(Reshapex4&& operation) {
  return *this;
}

-absl::Status Reshapex4::Compile(const CreationContext& creation_context) {
+Status Reshapex4::Compile(const CreationContext& creation_context) {
  const auto code = definition_.IsBatchSupported()
                        ? GetReshapeBatchedCode(definition_, linked_operations_)
                        : GetReshapeCode(definition_, linked_operations_);
@ -129,14 +129,15 @@ absl::Status Reshapex4::Compile(const CreationContext& creation_context) {
      *creation_context.device, &kernel_);
}

-absl::Status Reshapex4::BindArguments() {
+Status Reshapex4::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB()));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB()));
-  return absl::OkStatus();
+
+  return OkStatus();
}

int3 Reshapex4::GetGridSize() const {
@ -146,12 +147,12 @@ int3 Reshapex4::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

-absl::Status Reshapex4::Tune(const TuningParameters& params) {
+Status Reshapex4::Tune(const TuningParameters& params) {
  RETURN_IF_ERROR(BindArguments());
  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}

-absl::Status Reshapex4::AddToQueue(CLCommandQueue* queue) {
+Status Reshapex4::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}

@ -30,10 +30,10 @@ class Reshapex4 : public GPUOperation {
 public:
  explicit Reshapex4(const OperationDef& definition)
      : GPUOperation(definition), work_group_size_(8, 4, 1) {}
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
+  Status AddToQueue(CLCommandQueue* queue) override;
+  Status Tune(const TuningParameters& params) override;

-  absl::Status Compile(const CreationContext& creation_context) override;
+  Status Compile(const CreationContext& creation_context) override;

  // Move only
  Reshapex4(Reshapex4&& operation);
@ -42,7 +42,7 @@ class Reshapex4 : public GPUOperation {
  Reshapex4& operator=(const Reshapex4&) = delete;

 private:
-  absl::Status BindArguments();
+  Status BindArguments();
  int3 GetGridSize() const;

  CLKernel kernel_;
@ -209,7 +209,7 @@ Resize& Resize::operator=(Resize&& operation) {
  return *this;
}

-absl::Status Resize::Compile(const CreationContext& creation_context) {
+Status Resize::Compile(const CreationContext& creation_context) {
  const auto code = GetResizeCode(definition_, attr_.type,
                                  attr_.half_pixel_centers, linked_operations_);
  return creation_context.cache->GetOrCreateCLKernel(
@ -217,7 +217,7 @@ absl::Status Resize::Compile(const CreationContext& creation_context) {
      *creation_context.device, &kernel_);
}

-absl::Status Resize::BindArguments() {
+Status Resize::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
@ -230,7 +230,7 @@ absl::Status Resize::BindArguments() {
      float2(CalculateResizeScale(src_[0]->Width(), dst_[0]->Width(), attr_),
             CalculateResizeScale(src_[0]->Height(), dst_[0]->Height(), attr_));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(scale_factor));
-  return absl::OkStatus();
+  return OkStatus();
}

int3 Resize::GetGridSize() const {
@ -240,12 +240,12 @@ int3 Resize::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

-absl::Status Resize::AddToQueue(CLCommandQueue* queue) {
+Status Resize::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}

-absl::Status Resize::Tune(const TuningParameters& params) {
+Status Resize::Tune(const TuningParameters& params) {
  RETURN_IF_ERROR(BindArguments());
  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}
@ -271,7 +271,7 @@ Resize3D& Resize3D::operator=(Resize3D&& operation) {
  return *this;
}

-absl::Status Resize3D::Compile(const CreationContext& creation_context) {
+Status Resize3D::Compile(const CreationContext& creation_context) {
  const auto code =
      GetResize3DCode(definition_, attr_.type, linked_operations_);
  return creation_context.cache->GetOrCreateCLKernel(
@ -279,7 +279,7 @@ absl::Status Resize3D::Compile(const CreationContext& creation_context) {
      *creation_context.device, &kernel_);
}

-absl::Status Resize3D::BindArguments() {
+Status Resize3D::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
@ -296,7 +296,7 @@ absl::Status Resize3D::BindArguments() {
      CalculateResizeScale(src_[0]->Height(), dst_[0]->Height(), attr_),
      CalculateResizeScale(src_[0]->Depth(), dst_[0]->Depth(), attr_), 1.0f);
  RETURN_IF_ERROR(kernel_.SetBytesAuto(scale_factor));
-  return absl::OkStatus();
+  return OkStatus();
}

int3 Resize3D::GetGridSize() const {
@ -306,12 +306,12 @@ int3 Resize3D::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

-absl::Status Resize3D::AddToQueue(CLCommandQueue* queue) {
+Status Resize3D::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}

-absl::Status Resize3D::Tune(const TuningParameters& params) {
+Status Resize3D::Tune(const TuningParameters& params) {
  RETURN_IF_ERROR(BindArguments());
  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}

@ -27,10 +27,10 @@ namespace cl {

class Resize : public GPUOperation {
 public:
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
+  Status AddToQueue(CLCommandQueue* queue) override;
+  Status Tune(const TuningParameters& params) override;

-  absl::Status Compile(const CreationContext& creation_context) override;
+  Status Compile(const CreationContext& creation_context) override;

  // Move only
  Resize(Resize&& operation);
@ -45,7 +45,7 @@ class Resize : public GPUOperation {
  Resize(const OperationDef& definition, const Resize2DAttributes& attr)
      : GPUOperation(definition), attr_(attr) {}

-  absl::Status BindArguments();
+  Status BindArguments();
  int3 GetGridSize() const;

  Resize2DAttributes attr_;
@ -58,10 +58,10 @@ Resize CreateResize(const OperationDef& definition,

class Resize3D : public GPUOperation {
 public:
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
+  Status AddToQueue(CLCommandQueue* queue) override;
+  Status Tune(const TuningParameters& params) override;

-  absl::Status Compile(const CreationContext& creation_context) override;
+  Status Compile(const CreationContext& creation_context) override;

  // Move only
  Resize3D(Resize3D&& operation);
@ -76,7 +76,7 @@ class Resize3D : public GPUOperation {
  Resize3D(const OperationDef& definition, const Resize3DAttributes& attr)
      : GPUOperation(definition), attr_(attr) {}

-  absl::Status BindArguments();
+  Status BindArguments();
  int3 GetGridSize() const;

  Resize3DAttributes attr_;
@ -79,14 +79,14 @@ Softmax& Softmax::operator=(Softmax&& kernel) {
  return *this;
}

-absl::Status Softmax::Compile(const CreationContext& creation_context) {
+Status Softmax::Compile(const CreationContext& creation_context) {
  const auto code = GetSoftmaxKernelCode(definition_, linked_operations_);
  return creation_context.cache->GetOrCreateCLKernel(
      code, "main_function", *creation_context.context,
      *creation_context.device, &kernel_);
}

-absl::Status Softmax::BindArguments() {
+Status Softmax::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
@ -94,7 +94,7 @@ absl::Status Softmax::BindArguments() {
  RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHSB()));
  RETURN_IF_ERROR(
      kernel_.SetBytesAuto(GetMaskForLastPlane(src_[0]->Channels())));
-  return absl::OkStatus();
+  return OkStatus();
}

int3 Softmax::GetGridSize() const {
@ -104,12 +104,12 @@ int3 Softmax::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

-absl::Status Softmax::Tune(const TuningParameters& params) {
+Status Softmax::Tune(const TuningParameters& params) {
  RETURN_IF_ERROR(BindArguments());
  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}

-absl::Status Softmax::AddToQueue(CLCommandQueue* queue) {
+Status Softmax::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}

@ -30,10 +30,10 @@ class Softmax : public GPUOperation {
 public:
  Softmax() = default;
  explicit Softmax(const OperationDef& definition) : GPUOperation(definition) {}
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
+  Status AddToQueue(CLCommandQueue* queue) override;
+  Status Tune(const TuningParameters& params) override;

-  absl::Status Compile(const CreationContext& creation_context) override;
+  Status Compile(const CreationContext& creation_context) override;

  // Move only
  Softmax(Softmax&& kernel);
@ -44,7 +44,7 @@ class Softmax : public GPUOperation {
  friend Softmax CreateSoftmax();

 private:
-  absl::Status BindArguments();
+  Status BindArguments();
  int3 GetGridSize() const;
  CLKernel kernel_;
  int3 work_group_size_ = int3(8, 4, 1);
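Softmax::BindArguments passes GetMaskForLastPlane(channels) because channels are packed four to a slice: when the channel count is not a multiple of four, the last slice is only partially filled, and the padded lanes must be masked out of the softmax sum. An illustrative reimplementation of what such a helper plausibly computes; this is an assumption, not the library's definition:

```cpp
#include <cstdio>

struct float4 { float x, y, z, w; };  // stand-in for the CL vector type

float4 MaskForLastPlane(int channels) {
  const int remainder = channels % 4;
  if (remainder == 0) return {1.f, 1.f, 1.f, 1.f};
  float4 mask = {0.f, 0.f, 0.f, 0.f};
  if (remainder >= 1) mask.x = 1.f;
  if (remainder >= 2) mask.y = 1.f;
  if (remainder >= 3) mask.z = 1.f;
  return mask;
}

int main() {
  const float4 m = MaskForLastPlane(6);  // 6 channels -> last slice holds 2
  std::printf("%g %g %g %g\n", m.x, m.y, m.z, m.w);  // prints: 1 1 0 0
}
```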
@ -115,14 +115,14 @@ Softmax1x1& Softmax1x1::operator=(Softmax1x1&& kernel) {
  return *this;
}

-absl::Status Softmax1x1::Compile(const CreationContext& creation_context) {
+Status Softmax1x1::Compile(const CreationContext& creation_context) {
  const auto code = GetSoftmaxKernelCode(definition_, linked_operations_);
  return creation_context.cache->GetOrCreateCLKernel(
      code, "main_function", *creation_context.context,
      *creation_context.device, &kernel_);
}

-absl::Status Softmax1x1::AddToQueue(CLCommandQueue* queue) {
+Status Softmax1x1::AddToQueue(CLCommandQueue* queue) {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));

@ -30,9 +30,9 @@ class Softmax1x1 : public GPUOperation {
  Softmax1x1() = default;
  explicit Softmax1x1(const OperationDef& definition)
      : GPUOperation(definition) {}
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
+  Status AddToQueue(CLCommandQueue* queue) override;

-  absl::Status Compile(const CreationContext& creation_context) override;
+  Status Compile(const CreationContext& creation_context) override;

  // Move only
  Softmax1x1(Softmax1x1&& kernel);
@ -96,14 +96,14 @@ SpaceToDepth& SpaceToDepth::operator=(SpaceToDepth&& operation) {
  return *this;
}

-absl::Status SpaceToDepth::Compile(const CreationContext& creation_context) {
+Status SpaceToDepth::Compile(const CreationContext& creation_context) {
  const auto code = GetSpaceToDepthCode(definition_, linked_operations_);
  return creation_context.cache->GetOrCreateCLKernel(
      code, "main_function", *creation_context.context,
      *creation_context.device, &kernel_);
}

-absl::Status SpaceToDepth::BindArguments() {
+Status SpaceToDepth::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
@ -121,12 +121,12 @@ int3 SpaceToDepth::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

-absl::Status SpaceToDepth::Tune(const TuningParameters& params) {
+Status SpaceToDepth::Tune(const TuningParameters& params) {
  RETURN_IF_ERROR(BindArguments());
  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}

-absl::Status SpaceToDepth::AddToQueue(CLCommandQueue* queue) {
+Status SpaceToDepth::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}

@ -30,9 +30,9 @@ class SpaceToDepth : public GPUOperation {
 public:
  SpaceToDepth(const OperationDef& op_def, const SpaceToDepthAttributes& attr)
      : GPUOperation(op_def), attr_(attr), work_group_size_(8, 4, 1) {}
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
-  absl::Status Compile(const CreationContext& creation_context) override;
+  Status AddToQueue(CLCommandQueue* queue) override;
+  Status Tune(const TuningParameters& params) override;
+  Status Compile(const CreationContext& creation_context) override;

  SpaceToDepth(SpaceToDepth&& operation);
  SpaceToDepth& operator=(SpaceToDepth&& operation);
@ -40,7 +40,7 @@ class SpaceToDepth : public GPUOperation {
  SpaceToDepth& operator=(const SpaceToDepth&) = delete;

 private:
-  absl::Status BindArguments();
+  Status BindArguments();
  int3 GetGridSize() const;

  SpaceToDepthAttributes attr_;
@ -166,7 +166,7 @@ StridedSlice& StridedSlice::operator=(StridedSlice&& operation) {
  return *this;
}

-absl::Status StridedSlice::Compile(const CreationContext& creation_context) {
+Status StridedSlice::Compile(const CreationContext& creation_context) {
  const auto code = GetStridedSliceCode(definition_, Is4Aligned(attributes_),
                                        linked_operations_);
  return creation_context.cache->GetOrCreateCLKernel(
@ -174,7 +174,7 @@ absl::Status StridedSlice::Compile(const CreationContext& creation_context) {
      *creation_context.device, &kernel_);
}

-absl::Status StridedSlice::BindArguments() {
+Status StridedSlice::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
@ -187,7 +187,7 @@ absl::Status StridedSlice::BindArguments() {
           attributes_.strides.c, attributes_.strides.b)));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB()));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB()));
-  return absl::OkStatus();
+  return OkStatus();
}

int3 StridedSlice::GetGridSize() const {
@ -197,12 +197,12 @@ int3 StridedSlice::GetGridSize() const {
  return int3(grid_x, grid_y, grid_z);
}

-absl::Status StridedSlice::Tune(const TuningParameters& params) {
+Status StridedSlice::Tune(const TuningParameters& params) {
  RETURN_IF_ERROR(BindArguments());
  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}

-absl::Status StridedSlice::AddToQueue(CLCommandQueue* queue) {
+Status StridedSlice::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
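A migration of this breadth usually keeps both spellings compiling while call sites are converted file by file. One way to do that is a shim that aliases the unqualified names onto the absl types; this is a hypothetical sketch, not the actual TFLite header:

```cpp
#include "absl/status/status.h"
#include "absl/strings/string_view.h"

namespace tflite {
namespace gpu {

// Unqualified spellings used throughout the diff, forwarded to absl.
using Status = absl::Status;
inline Status OkStatus() { return absl::OkStatus(); }
inline Status InvalidArgumentError(absl::string_view message) {
  return absl::InvalidArgumentError(message);
}

}  // namespace gpu
}  // namespace tflite
```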
Some files were not shown because too many files have changed in this diff.