FullyConnected converted to new style.
Buffer implements GPUObject. PiperOrigin-RevId: 317746356 Change-Id: I715d37a924298e54e9fe68e5c7b719ab592237a1
This commit is contained in:
parent
cf15e5cf15
commit
763a82c83f
@ -81,6 +81,7 @@ cc_library(
|
|||||||
deps = [
|
deps = [
|
||||||
":cl_command_queue",
|
":cl_command_queue",
|
||||||
":cl_context",
|
":cl_context",
|
||||||
|
":gpu_object",
|
||||||
":opencl_wrapper",
|
":opencl_wrapper",
|
||||||
":util",
|
":util",
|
||||||
"//tensorflow/lite/delegates/gpu/common:status",
|
"//tensorflow/lite/delegates/gpu/common:status",
|
||||||
|
@ -746,6 +746,9 @@ absl::Status Arguments::ResolveSelectorsPass(
|
|||||||
size_t close_bracket_pos;
|
size_t close_bracket_pos;
|
||||||
RETURN_IF_ERROR(ParseArgsInsideBrackets(*code, next_position,
|
RETURN_IF_ERROR(ParseArgsInsideBrackets(*code, next_position,
|
||||||
&close_bracket_pos, &args));
|
&close_bracket_pos, &args));
|
||||||
|
for (auto& arg : args) {
|
||||||
|
RETURN_IF_ERROR(ResolveSelectorsPass({}, &arg));
|
||||||
|
}
|
||||||
std::string patch;
|
std::string patch;
|
||||||
RETURN_IF_ERROR(ResolveSelector(linkables, object_name, selector_name,
|
RETURN_IF_ERROR(ResolveSelector(linkables, object_name, selector_name,
|
||||||
args, template_args, &patch));
|
args, template_args, &patch));
|
||||||
|
@ -44,6 +44,38 @@ absl::Status CreateBuffer(size_t size_in_bytes, bool gpu_read_only,
|
|||||||
}
|
}
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
|
GPUResources BufferDescriptor::GetGPUResources(AccessType access_type) const {
|
||||||
|
GPUResources resources;
|
||||||
|
GPUBufferDescriptor desc;
|
||||||
|
desc.data_type = element_type;
|
||||||
|
desc.access_type = access_type;
|
||||||
|
desc.element_size = element_size;
|
||||||
|
resources.buffers.push_back({"buffer", desc});
|
||||||
|
return resources;
|
||||||
|
}
|
||||||
|
|
||||||
|
absl::Status BufferDescriptor::PerformSelector(
|
||||||
|
const std::string& selector, const std::vector<std::string>& args,
|
||||||
|
const std::vector<std::string>& template_args, std::string* result) const {
|
||||||
|
if (selector == "Read") {
|
||||||
|
return PerformReadSelector(args, result);
|
||||||
|
} else {
|
||||||
|
return absl::NotFoundError(absl::StrCat(
|
||||||
|
"BufferDescriptor don't have selector with name - ", selector));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
absl::Status BufferDescriptor::PerformReadSelector(
|
||||||
|
const std::vector<std::string>& args, std::string* result) const {
|
||||||
|
if (args.size() != 1) {
|
||||||
|
return absl::NotFoundError(
|
||||||
|
absl::StrCat("BufferDescriptor Read require one argument, but ",
|
||||||
|
args.size(), " was passed"));
|
||||||
|
}
|
||||||
|
*result = absl::StrCat("buffer[", args[0], "]");
|
||||||
|
return absl::OkStatus();
|
||||||
|
}
|
||||||
|
|
||||||
Buffer::Buffer(cl_mem buffer, size_t size_in_bytes)
|
Buffer::Buffer(cl_mem buffer, size_t size_in_bytes)
|
||||||
: buffer_(buffer), size_(size_in_bytes) {}
|
: buffer_(buffer), size_(size_in_bytes) {}
|
||||||
|
|
||||||
@ -71,6 +103,12 @@ void Buffer::Release() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
GPUResourcesWithValue Buffer::GetGPUResources(AccessType access_type) const {
|
||||||
|
GPUResourcesWithValue resources;
|
||||||
|
resources.buffers.push_back({"buffer", buffer_});
|
||||||
|
return resources;
|
||||||
|
}
|
||||||
|
|
||||||
absl::Status CreateReadOnlyBuffer(size_t size_in_bytes, CLContext* context,
|
absl::Status CreateReadOnlyBuffer(size_t size_in_bytes, CLContext* context,
|
||||||
Buffer* result) {
|
Buffer* result) {
|
||||||
return CreateBuffer(size_in_bytes, true, nullptr, context, result);
|
return CreateBuffer(size_in_bytes, true, nullptr, context, result);
|
||||||
|
@ -20,6 +20,7 @@ limitations under the License.
|
|||||||
#include "absl/types/span.h"
|
#include "absl/types/span.h"
|
||||||
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
|
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
|
||||||
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
|
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
|
||||||
|
#include "tensorflow/lite/delegates/gpu/cl/gpu_object.h"
|
||||||
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
|
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
|
||||||
#include "tensorflow/lite/delegates/gpu/cl/util.h"
|
#include "tensorflow/lite/delegates/gpu/cl/util.h"
|
||||||
#include "tensorflow/lite/delegates/gpu/common/status.h"
|
#include "tensorflow/lite/delegates/gpu/common/status.h"
|
||||||
@ -28,9 +29,23 @@ namespace tflite {
|
|||||||
namespace gpu {
|
namespace gpu {
|
||||||
namespace cl {
|
namespace cl {
|
||||||
|
|
||||||
|
struct BufferDescriptor : public GPUObjectDescriptor {
|
||||||
|
DataType element_type; // FLOAT32 or FLOAT16
|
||||||
|
int element_size;
|
||||||
|
|
||||||
|
absl::Status PerformSelector(const std::string& selector,
|
||||||
|
const std::vector<std::string>& args,
|
||||||
|
const std::vector<std::string>& template_args,
|
||||||
|
std::string* result) const override;
|
||||||
|
|
||||||
|
GPUResources GetGPUResources(AccessType access_type) const override;
|
||||||
|
absl::Status PerformReadSelector(const std::vector<std::string>& args,
|
||||||
|
std::string* result) const;
|
||||||
|
};
|
||||||
|
|
||||||
// Buffer represent linear GPU data storage with arbitrary data format.
|
// Buffer represent linear GPU data storage with arbitrary data format.
|
||||||
// Buffer is moveable but not copyable.
|
// Buffer is moveable but not copyable.
|
||||||
class Buffer {
|
class Buffer : public GPUObject {
|
||||||
public:
|
public:
|
||||||
Buffer() {} // just for using Buffer as a class members
|
Buffer() {} // just for using Buffer as a class members
|
||||||
Buffer(cl_mem buffer, size_t size_in_bytes);
|
Buffer(cl_mem buffer, size_t size_in_bytes);
|
||||||
@ -57,6 +72,8 @@ class Buffer {
|
|||||||
template <typename T>
|
template <typename T>
|
||||||
absl::Status ReadData(CLCommandQueue* queue, std::vector<T>* result) const;
|
absl::Status ReadData(CLCommandQueue* queue, std::vector<T>* result) const;
|
||||||
|
|
||||||
|
GPUResourcesWithValue GetGPUResources(AccessType access_type) const override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void Release();
|
void Release();
|
||||||
|
|
||||||
|
@ -31,14 +31,15 @@ namespace {
|
|||||||
// Good results for ~1024 x 1024 sizes, for other can be written more
|
// Good results for ~1024 x 1024 sizes, for other can be written more
|
||||||
// optimized shaders
|
// optimized shaders
|
||||||
|
|
||||||
std::string GetFullyConnectedKernelCode(
|
std::string GetFullyConnectedKernelCode(const OperationDef& op_def,
|
||||||
const OperationDef& op_def, const LinearStorage& biases,
|
const int3& work_group_size,
|
||||||
const std::vector<ElementwiseOperation*>& linked_operations,
|
Arguments* args) {
|
||||||
const int3& work_group_size) {
|
args->AddObjectRef(
|
||||||
TensorCodeGenerator src_tensor("src_data", WHSPoint{"1", "1", "depthes.x"},
|
"src_tensor", AccessType::READ,
|
||||||
op_def.src_tensors[0]);
|
absl::make_unique<TensorDescriptor>(op_def.src_tensors[0]));
|
||||||
TensorCodeGenerator dst_tensor("dst_data", WHSPoint{"1", "1", "depthes.y"},
|
args->AddObjectRef(
|
||||||
op_def.dst_tensors[0]);
|
"dst_tensor", AccessType::WRITE,
|
||||||
|
absl::make_unique<TensorDescriptor>(op_def.dst_tensors[0]));
|
||||||
|
|
||||||
std::string c = GetCommonDefines(op_def.precision);
|
std::string c = GetCommonDefines(op_def.precision);
|
||||||
switch (op_def.precision) {
|
switch (op_def.precision) {
|
||||||
@ -54,21 +55,16 @@ std::string GetFullyConnectedKernelCode(
|
|||||||
const std::string wg_x = std::to_string(work_group_size.x);
|
const std::string wg_x = std::to_string(work_group_size.x);
|
||||||
const std::string wg_y = std::to_string(work_group_size.y);
|
const std::string wg_y = std::to_string(work_group_size.y);
|
||||||
c += "__kernel void main_function(\n";
|
c += "__kernel void main_function(\n";
|
||||||
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
|
c += "$0) {\n";
|
||||||
c += " __global FLT16* filters, \n";
|
|
||||||
c += biases.GetDeclaration();
|
|
||||||
c += GetArgsDeclaration(linked_operations);
|
|
||||||
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
|
|
||||||
c += " int2 depthes \n";
|
|
||||||
c += ") {\n";
|
|
||||||
c += " int gid = get_global_id(0);\n";
|
c += " int gid = get_global_id(0);\n";
|
||||||
c += " bool inside = gid < depthes.y;\n";
|
c += " bool inside = gid < args.dst_tensor.Slices();\n";
|
||||||
c += " gid = min(gid, depthes.y - 1);\n";
|
c += " gid = min(gid, args.dst_tensor.Slices() - 1);\n";
|
||||||
c += " int2 tid = (int2)(get_local_id(0), get_local_id(1));\n";
|
c += " int2 tid = (int2)(get_local_id(0), get_local_id(1));\n";
|
||||||
c += " ACCUM_FLT4 s = (ACCUM_FLT4)(0.0f);\n";
|
c += " ACCUM_FLT4 s = (ACCUM_FLT4)(0.0f);\n";
|
||||||
c += " for (uint c = tid.y; c < depthes.x; c += " + wg_y + ") {\n";
|
c += " for (uint c = tid.y; c < args.src_tensor.Slices(); c += " + wg_y +
|
||||||
c += " FLT4 v = " + src_tensor.ReadWHS("0", "0", "c") + ";\n";
|
") {\n";
|
||||||
c += " FLT16 w = filters[c * depthes.y + gid];\n";
|
c += " FLT4 v = args.src_tensor.Read(0, 0, c);\n";
|
||||||
|
c += " FLT16 w = args.weights.Read(c * args.dst_tensor.Slices() + gid);\n";
|
||||||
c += " s.x += dot(v, w.s0123);\n";
|
c += " s.x += dot(v, w.s0123);\n";
|
||||||
c += " s.y += dot(v, w.s4567);\n";
|
c += " s.y += dot(v, w.s4567);\n";
|
||||||
c += " s.z += dot(v, w.s89ab);\n";
|
c += " s.z += dot(v, w.s89ab);\n";
|
||||||
@ -81,10 +77,8 @@ std::string GetFullyConnectedKernelCode(
|
|||||||
for (int i = 1; i < work_group_size.y; ++i) {
|
for (int i = 1; i < work_group_size.y; ++i) {
|
||||||
c += " s += temp[tid.x][" + std::to_string(i) + "];\n";
|
c += " s += temp[tid.x][" + std::to_string(i) + "];\n";
|
||||||
}
|
}
|
||||||
c += " FLT4 r0 = TO_FLT4(s) + " + biases.ReadLinearFLT4("gid") + ";\n";
|
c += " FLT4 r0 = TO_FLT4(s) + args.biases.Read(gid);\n";
|
||||||
const LinkingContext context{"r0", "0", "0", "gid"};
|
c += " args.dst_tensor.Write(r0, 0, 0, gid);\n";
|
||||||
c += PostProcess(linked_operations, context);
|
|
||||||
c += " " + dst_tensor.WriteWHS("r0", "0", "0", "gid") + "\n";
|
|
||||||
c += " }\n";
|
c += " }\n";
|
||||||
c += "}\n";
|
c += "}\n";
|
||||||
|
|
||||||
@ -97,15 +91,11 @@ FullyConnected::FullyConnected(const OperationDef& definition)
|
|||||||
|
|
||||||
FullyConnected::FullyConnected(FullyConnected&& kernel)
|
FullyConnected::FullyConnected(FullyConnected&& kernel)
|
||||||
: GPUOperation(std::move(kernel)),
|
: GPUOperation(std::move(kernel)),
|
||||||
weights_(std::move(kernel.weights_)),
|
|
||||||
biases_(std::move(kernel.biases_)),
|
|
||||||
kernel_(std::move(kernel.kernel_)),
|
kernel_(std::move(kernel.kernel_)),
|
||||||
work_group_size_(kernel.work_group_size_) {}
|
work_group_size_(kernel.work_group_size_) {}
|
||||||
|
|
||||||
FullyConnected& FullyConnected::operator=(FullyConnected&& kernel) {
|
FullyConnected& FullyConnected::operator=(FullyConnected&& kernel) {
|
||||||
if (this != &kernel) {
|
if (this != &kernel) {
|
||||||
weights_ = std::move(kernel.weights_);
|
|
||||||
biases_ = std::move(kernel.biases_);
|
|
||||||
kernel_ = std::move(kernel.kernel_);
|
kernel_ = std::move(kernel.kernel_);
|
||||||
std::swap(work_group_size_, kernel.work_group_size_);
|
std::swap(work_group_size_, kernel.work_group_size_);
|
||||||
GPUOperation::operator=(std::move(kernel));
|
GPUOperation::operator=(std::move(kernel));
|
||||||
@ -120,8 +110,14 @@ absl::Status FullyConnected::Compile(const CreationContext& creation_context) {
|
|||||||
do {
|
do {
|
||||||
work_group_size_ = {wg_width, wg_height, 1};
|
work_group_size_ = {wg_width, wg_height, 1};
|
||||||
wg_width /= 2;
|
wg_width /= 2;
|
||||||
const auto code = GetFullyConnectedKernelCode(
|
std::string code =
|
||||||
definition_, biases_, linked_operations_, work_group_size_);
|
GetFullyConnectedKernelCode(definition_, work_group_size_, &args_);
|
||||||
|
std::string element_wise_code;
|
||||||
|
RETURN_IF_ERROR(
|
||||||
|
MergeOperations(linked_operations_, &args_, &element_wise_code));
|
||||||
|
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
|
||||||
|
{{"dst_tensor", element_wise_code}},
|
||||||
|
&code));
|
||||||
auto status = creation_context.cache->GetOrCreateCLKernel(
|
auto status = creation_context.cache->GetOrCreateCLKernel(
|
||||||
code, "main_function", *creation_context.context,
|
code, "main_function", *creation_context.context,
|
||||||
*creation_context.device, &kernel_);
|
*creation_context.device, &kernel_);
|
||||||
@ -138,14 +134,10 @@ absl::Status FullyConnected::Compile(const CreationContext& creation_context) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
absl::Status FullyConnected::AddToQueue(CLCommandQueue* queue) {
|
absl::Status FullyConnected::AddToQueue(CLCommandQueue* queue) {
|
||||||
kernel_.ResetBindingCounter();
|
RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
|
||||||
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
|
RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0]));
|
||||||
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr()));
|
RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
|
||||||
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
|
RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
|
||||||
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
|
|
||||||
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
|
|
||||||
RETURN_IF_ERROR(
|
|
||||||
kernel_.SetBytesAuto(int2(src_[0]->Slices(), dst_[0]->Slices())));
|
|
||||||
return queue->DispatchImplicit(kernel_, {dst_[0]->Slices(), 1, 1},
|
return queue->DispatchImplicit(kernel_, {dst_[0]->Slices(), 1, 1},
|
||||||
work_group_size_);
|
work_group_size_);
|
||||||
}
|
}
|
||||||
@ -157,13 +149,18 @@ absl::Status CreateFullyConnected(const CreationContext& creation_context,
|
|||||||
*result = FullyConnected(definition);
|
*result = FullyConnected(definition);
|
||||||
RETURN_IF_ERROR(
|
RETURN_IF_ERROR(
|
||||||
result->UploadWeights(attr.weights, creation_context.context));
|
result->UploadWeights(attr.weights, creation_context.context));
|
||||||
LinearStorageCreateInfo create_info;
|
|
||||||
create_info.storage_type = LinearStorageType::TEXTURE_2D;
|
TensorLinearDescriptor desc;
|
||||||
create_info.data_type = definition.GetDataType();
|
desc.storage_type = LinearStorageType::TEXTURE_2D;
|
||||||
create_info.name = "biases";
|
desc.element_type = definition.GetDataType();
|
||||||
create_info.aligned_size = attr.weights.shape.o;
|
|
||||||
RETURN_IF_ERROR(CreateLinearStorage(
|
LinearStorage lt;
|
||||||
create_info, attr.bias, creation_context.context, &result->biases_));
|
RETURN_IF_ERROR(
|
||||||
|
CreateLinearStorage(desc, attr.bias, creation_context.context, <));
|
||||||
|
result->args_.AddObject("biases", AccessType::READ,
|
||||||
|
absl::make_unique<LinearStorage>(std::move(lt)),
|
||||||
|
absl::make_unique<TensorLinearDescriptor>(desc));
|
||||||
|
|
||||||
return absl::OkStatus();
|
return absl::OkStatus();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -61,8 +61,6 @@ class FullyConnected : public GPUOperation {
|
|||||||
void RearrangeWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
|
void RearrangeWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
|
||||||
absl::Span<S> dst);
|
absl::Span<S> dst);
|
||||||
|
|
||||||
Buffer weights_;
|
|
||||||
LinearStorage biases_;
|
|
||||||
CLKernel kernel_;
|
CLKernel kernel_;
|
||||||
int3 work_group_size_ = int3(0, 0, 0);
|
int3 work_group_size_ = int3(0, 0, 0);
|
||||||
};
|
};
|
||||||
@ -78,17 +76,30 @@ absl::Status FullyConnected::UploadWeights(
|
|||||||
|
|
||||||
const int float4_size = f32_weights ? 16 : 8;
|
const int float4_size = f32_weights ? 16 : 8;
|
||||||
|
|
||||||
if (definition_.GetDataType() == DataType::FLOAT32) {
|
BufferDescriptor desc;
|
||||||
|
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||||
|
desc.element_size = 16;
|
||||||
|
|
||||||
|
Buffer weights_buffer;
|
||||||
|
if (f32_weights) {
|
||||||
std::vector<float4> gpu_data(dst_depth * src_depth * 4);
|
std::vector<float4> gpu_data(dst_depth * src_depth * 4);
|
||||||
RearrangeWeights(weights, absl::MakeSpan(gpu_data));
|
RearrangeWeights(weights, absl::MakeSpan(gpu_data));
|
||||||
return CreateReadOnlyBuffer(float4_size * elements_count, gpu_data.data(),
|
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
|
||||||
context, &weights_);
|
gpu_data.data(), context,
|
||||||
|
&weights_buffer));
|
||||||
} else {
|
} else {
|
||||||
std::vector<half4> gpu_data(dst_depth * src_depth * 4);
|
std::vector<half4> gpu_data(dst_depth * src_depth * 4);
|
||||||
RearrangeWeights(weights, absl::MakeSpan(gpu_data));
|
RearrangeWeights(weights, absl::MakeSpan(gpu_data));
|
||||||
return CreateReadOnlyBuffer(float4_size * elements_count, gpu_data.data(),
|
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
|
||||||
context, &weights_);
|
gpu_data.data(), context,
|
||||||
|
&weights_buffer));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
args_.AddObject("weights", AccessType::READ,
|
||||||
|
absl::make_unique<Buffer>(std::move(weights_buffer)),
|
||||||
|
absl::make_unique<BufferDescriptor>(desc));
|
||||||
|
|
||||||
|
return absl::OkStatus();
|
||||||
}
|
}
|
||||||
|
|
||||||
template <DataType T, typename S>
|
template <DataType T, typename S>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user