Added CPU representation for Buffer and Texture2D.
Removed many OpenCL API calls from operations. PiperOrigin-RevId: 327071360 Change-Id: I85b9ade32ff49325ddaed43cb64c2c97c2054ec5
This commit is contained in:
parent
387414fa32
commit
2d98952a90
tensorflow/lite/delegates/gpu/cl
BUILDarguments.ccarguments.hbuffer.ccbuffer.hgpu_object.h
kernels
conv_3d.hconv_buffer_1x1.hconv_constants.hconv_powervr.hconv_texture.hconvolution_transposed.ccconvolution_transposed.hconvolution_transposed_3d.hconvolution_transposed_3x3.hconvolution_transposed_3x3_thin.hconvolution_transposed_4x4.hconvolution_transposed_thin.hdepthwise_conv.hdepthwise_conv_3x3.hfully_connected.hgpu_operation.cc
texture2d.cctexture2d.hspecial
@ -353,6 +353,7 @@ cc_library(
|
||||
srcs = ["gpu_object.cc"],
|
||||
hdrs = ["gpu_object.h"],
|
||||
deps = [
|
||||
":cl_context",
|
||||
":opencl_wrapper",
|
||||
"//tensorflow/lite/delegates/gpu/common:access_type",
|
||||
"//tensorflow/lite/delegates/gpu/common:data_type",
|
||||
|
@ -263,6 +263,12 @@ void Arguments::AddObject(const std::string& name, AccessType access_type,
|
||||
objects_[name] = {std::move(object), std::move(descriptor_ptr)};
|
||||
}
|
||||
|
||||
void Arguments::AddObject(const std::string& name,
|
||||
GPUObjectDescriptorPtr&& descriptor_ptr) {
|
||||
descriptor_ptr->SetAccess(AccessType::READ);
|
||||
objects_[name] = {nullptr, std::move(descriptor_ptr)};
|
||||
}
|
||||
|
||||
void Arguments::AddGPUResources(const std::string& name,
|
||||
const GPUResources& resources) {
|
||||
for (const auto& r : resources.ints) {
|
||||
@ -840,6 +846,15 @@ absl::Status Arguments::ResolveSelectorsPass(
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
// Asks every registered descriptor to create its GPU object in `context`,
// then calls the descriptor's Release() hook. Stops at, and returns, the
// first creation error.
absl::Status Arguments::AllocateObjects(CLContext* context) {
  for (auto& name_and_object : objects_) {
    auto& object = name_and_object.second;
    RETURN_IF_ERROR(
        object.descriptor->CreateGPUObject(context, &object.obj_ptr));
    // Only reached when creation succeeded for this entry.
    object.descriptor->Release();
  }
  return absl::OkStatus();
}
|
||||
|
||||
absl::Status Arguments::AddObjectArgs() {
|
||||
for (auto& t : objects_) {
|
||||
AddGPUResources(t.first, t.second.descriptor->GetGPUResources());
|
||||
|
@ -54,6 +54,8 @@ class Arguments {
|
||||
void AddObject(const std::string& name, AccessType access_type,
|
||||
GPUObjectPtr&& object,
|
||||
GPUObjectDescriptorPtr&& descriptor_ptr);
|
||||
// Registers a descriptor under `name` without an already-created GPU object.
// The implementation marks the descriptor with AccessType::READ.
void AddObject(const std::string& name,
|
||||
GPUObjectDescriptorPtr&& descriptor_ptr);
|
||||
|
||||
absl::Status SetInt(const std::string& name, int value);
|
||||
absl::Status SetFloat(const std::string& name, float value);
|
||||
@ -73,6 +75,7 @@ class Arguments {
|
||||
void RenameArgs(const std::string& postfix, std::string* code) const;
|
||||
absl::Status Merge(Arguments&& args, const std::string& postfix);
|
||||
|
||||
// Calls CreateGPUObject() and then Release() on every registered descriptor;
// returns the first error reported by CreateGPUObject().
absl::Status AllocateObjects(CLContext* context);
|
||||
absl::Status TransformToCLCode(
|
||||
const DeviceInfo& device_info,
|
||||
const std::map<std::string, std::string>& linkables, std::string* code);
|
||||
|
@ -47,6 +47,30 @@ absl::Status CreateBuffer(size_t size_in_bytes, bool gpu_read_only,
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Move constructor: the base-class part is moved first, scalar fields are
// copied, and the two vectors (`attributes`, `data`) are moved out of `other`.
BufferDescriptor::BufferDescriptor(BufferDescriptor&& other)
    // The base move only touches the GPUObjectDescriptor subobject, so the
    // derived fields of `other` read below are still intact.
    : GPUObjectDescriptor(std::move(other)),
      element_type(other.element_type),
      element_size(other.element_size),
      memory_type(other.memory_type),
      attributes(std::move(other.attributes)),
      size(other.size),
      data(std::move(other.data)) {}
|
||||
|
||||
// Move assignment. Scalar fields are exchanged with `other`, the vectors are
// moved from it, and finally the base-class part is move-assigned.
// Self-assignment is a no-op.
BufferDescriptor& BufferDescriptor::operator=(BufferDescriptor&& other) {
  if (this == &other) {
    return *this;
  }
  std::swap(element_type, other.element_type);
  std::swap(element_size, other.element_size);
  std::swap(memory_type, other.memory_type);
  attributes = std::move(other.attributes);
  std::swap(size, other.size);
  data = std::move(other.data);
  GPUObjectDescriptor::operator=(std::move(other));
  return *this;
}
|
||||
|
||||
// Drops the CPU-side copy of the buffer contents and gives its memory back.
// clear() alone would leave the vector's capacity allocated, so the contents
// are swapped into an empty temporary that is destroyed immediately.
void BufferDescriptor::Release() { std::vector<uint8_t>().swap(data); }
|
||||
|
||||
GPUResources BufferDescriptor::GetGPUResources() const {
|
||||
GPUResources resources;
|
||||
GPUBufferDescriptor desc;
|
||||
@ -115,6 +139,14 @@ absl::Status BufferDescriptor::PerformGetPtrSelector(
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
// Builds a Buffer from this descriptor in `context` and hands ownership to
// `*result`. On failure the error is returned and `*result` is left untouched.
absl::Status BufferDescriptor::CreateGPUObject(CLContext* context,
                                               GPUObjectPtr* result) const {
  auto created_buffer = absl::make_unique<Buffer>();
  RETURN_IF_ERROR(created_buffer->CreateFromBufferDescriptor(*this, context));
  *result = std::move(created_buffer);
  return absl::OkStatus();
}
|
||||
|
||||
// Wraps an existing OpenCL memory object: stores the `buffer` handle and the
// size, in bytes, that the caller reports for it.
Buffer::Buffer(cl_mem buffer, size_t size_in_bytes)
|
||||
: buffer_(buffer), size_(size_in_bytes) {}
|
||||
|
||||
@ -151,6 +183,32 @@ absl::Status Buffer::GetGPUResources(const GPUObjectDescriptor* obj_ptr,
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
// Creates the OpenCL buffer described by `desc` in `context`.
//  - desc.memory_type == CONSTANT yields a read-only buffer, anything else a
//    read-write one.
//  - A non-empty desc.data is passed as the host pointer together with
//    CL_MEM_COPY_HOST_PTR; otherwise no host pointer is supplied.
// Returns UnknownError carrying the OpenCL error string when clCreateBuffer
// yields a null handle.
absl::Status Buffer::CreateFromBufferDescriptor(const BufferDescriptor& desc,
                                                CLContext* context) {
  cl_mem_flags flags = CL_MEM_READ_WRITE;
  if (desc.memory_type == MemoryType::CONSTANT) {
    flags = CL_MEM_READ_ONLY;
  }

  // clCreateBuffer takes a non-const host pointer, hence the const_cast.
  void* host_ptr = nullptr;
  if (!desc.data.empty()) {
    flags |= CL_MEM_COPY_HOST_PTR;
    host_ptr = const_cast<unsigned char*>(desc.data.data());
  }

  size_ = desc.size;
  cl_int error_code;
  buffer_ = clCreateBuffer(context->context(), flags, desc.size, host_ptr,
                           &error_code);
  if (buffer_) {
    return absl::OkStatus();
  }
  return absl::UnknownError(
      absl::StrCat("Failed to allocate device memory (clCreateBuffer): ",
                   CLErrorCodeToString(error_code)));
}
|
||||
|
||||
absl::Status CreateReadOnlyBuffer(size_t size_in_bytes, CLContext* context,
|
||||
Buffer* result) {
|
||||
return CreateBuffer(size_in_bytes, true, nullptr, context, result);
|
||||
|
@ -35,6 +35,16 @@ struct BufferDescriptor : public GPUObjectDescriptor {
|
||||
MemoryType memory_type = MemoryType::GLOBAL;
|
||||
std::vector<std::string> attributes;
|
||||
|
||||
// Optional CPU-side contents. `size` is the size handed to clCreateBuffer when
// the GPU buffer is created from this descriptor; a non-empty `data` is passed
// as the host pointer for that call.
|
||||
int size = 0;
|
||||
std::vector<uint8_t> data;
|
||||
|
||||
BufferDescriptor() = default;
|
||||
BufferDescriptor(const BufferDescriptor&) = default;
|
||||
BufferDescriptor& operator=(const BufferDescriptor&) = default;
|
||||
BufferDescriptor(BufferDescriptor&& desc);
|
||||
BufferDescriptor& operator=(BufferDescriptor&& desc);
|
||||
|
||||
absl::Status PerformSelector(const std::string& selector,
|
||||
const std::vector<std::string>& args,
|
||||
const std::vector<std::string>& template_args,
|
||||
@ -46,6 +56,10 @@ struct BufferDescriptor : public GPUObjectDescriptor {
|
||||
absl::Status PerformGetPtrSelector(
|
||||
const std::vector<std::string>& args,
|
||||
const std::vector<std::string>& template_args, std::string* result) const;
|
||||
|
||||
// Creates a Buffer from this descriptor in `context` and stores it in
// `*result`.
absl::Status CreateGPUObject(CLContext* context,
|
||||
GPUObjectPtr* result) const override;
|
||||
// Discards the CPU-side `data` held by this descriptor.
void Release() override;
|
||||
};
|
||||
|
||||
// Buffer represent linear GPU data storage with arbitrary data format.
|
||||
@ -80,6 +94,9 @@ class Buffer : public GPUObject {
|
||||
absl::Status GetGPUResources(const GPUObjectDescriptor* obj_ptr,
|
||||
GPUResourcesWithValue* resources) const override;
|
||||
|
||||
// Creates the underlying OpenCL buffer from `desc` (memory type, size and
// optional initial data) in `context`.
absl::Status CreateFromBufferDescriptor(const BufferDescriptor& desc,
|
||||
CLContext* context);
|
||||
|
||||
private:
|
||||
void Release();
|
||||
|
||||
|
@ -21,6 +21,7 @@ limitations under the License.
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
|
||||
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
|
||||
#include "tensorflow/lite/delegates/gpu/common/access_type.h"
|
||||
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
|
||||
@ -119,9 +120,21 @@ struct GPUResourcesWithValue {
|
||||
std::vector<std::pair<std::string, cl_mem>> custom_memories;
|
||||
};
|
||||
|
||||
class GPUObject;
|
||||
|
||||
class GPUObjectDescriptor {
|
||||
public:
|
||||
GPUObjectDescriptor() = default;
|
||||
GPUObjectDescriptor(const GPUObjectDescriptor&) = default;
|
||||
GPUObjectDescriptor& operator=(const GPUObjectDescriptor&) = default;
|
||||
// Move constructor. Takes over the state variables and also carries the access
// type across, so a moved descriptor matches what the defaulted copy
// constructor would have produced.
GPUObjectDescriptor(GPUObjectDescriptor&& obj_desc)
    : state_vars_(std::move(obj_desc.state_vars_)) {
  // Assigned in the body rather than the initializer list so this does not
  // depend on the declaration order of the members.
  access_type_ = obj_desc.access_type_;
}
|
||||
// Move assignment. Transfers the state variables and the access type from
// `obj_desc`; self-assignment is a no-op.
GPUObjectDescriptor& operator=(GPUObjectDescriptor&& obj_desc) {
  if (this != &obj_desc) {
    state_vars_ = std::move(obj_desc.state_vars_);
    // Keep the access type in sync as well, as the defaulted copy assignment
    // does.
    access_type_ = obj_desc.access_type_;
  }
  return *this;
}
|
||||
virtual ~GPUObjectDescriptor() = default;
|
||||
|
||||
void SetStateVar(const std::string& key, const std::string& value) const {
|
||||
@ -141,6 +154,12 @@ class GPUObjectDescriptor {
|
||||
}
|
||||
// Default implementation: returns an empty GPUResources.
virtual GPUResources GetGPUResources() const { return GPUResources(); }
|
||||
|
||||
// Default implementation: creates nothing, leaves `*result` untouched and
// reports success. BufferDescriptor overrides this to build a Buffer.
virtual absl::Status CreateGPUObject(
|
||||
CLContext* context, std::unique_ptr<GPUObject>* result) const {
|
||||
return absl::OkStatus();
|
||||
}
|
||||
// Called by Arguments::AllocateObjects after CreateGPUObject() succeeds.
// The default does nothing; BufferDescriptor overrides it.
virtual void Release() {}
|
||||
|
||||
// Stores the access type for the object described by this descriptor.
void SetAccess(AccessType access_type) { access_type_ = access_type; }
|
||||
// Returns the access type last stored with SetAccess().
AccessType GetAccess() const { return access_type_; }
|
||||
|
||||
|
@ -155,83 +155,57 @@ absl::Status Conv3D::UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights,
|
||||
|
||||
const int float4_size = f32_weights ? 16 : 8;
|
||||
|
||||
Texture2D weights_0;
|
||||
Texture2D weights_1;
|
||||
Texture2D weights_2;
|
||||
Texture2D weights_3;
|
||||
Buffer weights_buf;
|
||||
std::vector<uint8_t> data(float4_size * elements_count);
|
||||
|
||||
if (f32_weights) {
|
||||
std::vector<float4> gpu_data(elements_count);
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
|
||||
if (conv_params_.AreWeightsBuffer()) {
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
|
||||
gpu_data.data(), context,
|
||||
&weights_buf));
|
||||
} else {
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), texture_width, texture_height,
|
||||
gpu_data.data(), context, &weights_0));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), texture_width, texture_height,
|
||||
gpu_data.data() + texture_width * texture_height, context,
|
||||
&weights_1));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), texture_width, texture_height,
|
||||
gpu_data.data() + texture_width * texture_height * 2, context,
|
||||
&weights_2));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), texture_width, texture_height,
|
||||
gpu_data.data() + texture_width * texture_height * 3, context,
|
||||
&weights_3));
|
||||
}
|
||||
float4* ptr = reinterpret_cast<float4*>(data.data());
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
|
||||
} else {
|
||||
std::vector<half4> gpu_data(elements_count);
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
|
||||
if (conv_params_.AreWeightsBuffer()) {
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
|
||||
gpu_data.data(), context,
|
||||
&weights_buf));
|
||||
} else {
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), texture_width, texture_height,
|
||||
gpu_data.data(), context, &weights_0));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), texture_width, texture_height,
|
||||
gpu_data.data() + texture_width * texture_height, context,
|
||||
&weights_1));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), texture_width, texture_height,
|
||||
gpu_data.data() + texture_width * texture_height * 2, context,
|
||||
&weights_2));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), texture_width, texture_height,
|
||||
gpu_data.data() + texture_width * texture_height * 3, context,
|
||||
&weights_3));
|
||||
}
|
||||
half4* ptr = reinterpret_cast<half4*>(data.data());
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
|
||||
}
|
||||
|
||||
if (conv_params_.AreWeightsBuffer()) {
|
||||
BufferDescriptor desc;
|
||||
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc.element_size = 4;
|
||||
args_.AddObject("weights", AccessType::READ,
|
||||
absl::make_unique<Buffer>(std::move(weights_buf)),
|
||||
absl::make_unique<BufferDescriptor>(desc));
|
||||
desc.size = float4_size * elements_count;
|
||||
desc.data = std::move(data);
|
||||
args_.AddObject("weights",
|
||||
absl::make_unique<BufferDescriptor>(std::move(desc)));
|
||||
} else {
|
||||
Texture2DDescriptor desc;
|
||||
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
args_.AddObject("weights0", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_0)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
args_.AddObject("weights1", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_1)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
args_.AddObject("weights2", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_2)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
args_.AddObject("weights3", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_3)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
int sub_size = float4_size * elements_count / 4;
|
||||
Texture2DDescriptor desc0;
|
||||
desc0.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc0.size = int2(texture_width, texture_height);
|
||||
desc0.data.resize(sub_size);
|
||||
memcpy(desc0.data.data(), data.data(), sub_size);
|
||||
args_.AddObject("weights0",
|
||||
absl::make_unique<Texture2DDescriptor>(std::move(desc0)));
|
||||
|
||||
Texture2DDescriptor desc1;
|
||||
desc1.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc1.size = int2(texture_width, texture_height);
|
||||
desc1.data.resize(sub_size);
|
||||
memcpy(desc1.data.data(), data.data() + sub_size, sub_size);
|
||||
args_.AddObject("weights1",
|
||||
absl::make_unique<Texture2DDescriptor>(std::move(desc1)));
|
||||
|
||||
Texture2DDescriptor desc2;
|
||||
desc2.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc2.size = int2(texture_width, texture_height);
|
||||
desc2.data.resize(sub_size);
|
||||
memcpy(desc2.data.data(), data.data() + sub_size * 2, sub_size);
|
||||
args_.AddObject("weights2",
|
||||
absl::make_unique<Texture2DDescriptor>(std::move(desc2)));
|
||||
|
||||
Texture2DDescriptor desc3;
|
||||
desc3.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc3.size = int2(texture_width, texture_height);
|
||||
desc3.data.resize(sub_size);
|
||||
memcpy(desc3.data.data(), data.data() + sub_size * 3, sub_size);
|
||||
args_.AddObject("weights3",
|
||||
absl::make_unique<Texture2DDescriptor>(std::move(desc3)));
|
||||
}
|
||||
|
||||
return absl::OkStatus();
|
||||
|
@ -150,31 +150,25 @@ absl::Status ConvBuffer1x1::UploadWeights(
|
||||
const int elements_count =
|
||||
weights.shape.h * weights.shape.w * src_depth * dst_depth_aligned * 4;
|
||||
|
||||
Buffer weights_buffer;
|
||||
if (f32_weights) {
|
||||
std::vector<float4> gpu_data(elements_count);
|
||||
RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
|
||||
absl::MakeSpan(gpu_data));
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
|
||||
gpu_data.data(), context,
|
||||
&weights_buffer));
|
||||
} else {
|
||||
std::vector<half4> gpu_data(elements_count);
|
||||
RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
|
||||
absl::MakeSpan(gpu_data));
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
|
||||
gpu_data.data(), context,
|
||||
&weights_buffer));
|
||||
}
|
||||
|
||||
BufferDescriptor desc;
|
||||
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc.element_size = 16;
|
||||
desc.memory_type = MemoryType::GLOBAL;
|
||||
desc.size = float4_size * elements_count;
|
||||
desc.data.resize(desc.size);
|
||||
|
||||
args_.AddObject("weights", AccessType::READ,
|
||||
absl::make_unique<Buffer>(std::move(weights_buffer)),
|
||||
absl::make_unique<BufferDescriptor>(desc));
|
||||
if (f32_weights) {
|
||||
float4* ptr = reinterpret_cast<float4*>(desc.data.data());
|
||||
RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
|
||||
absl::MakeSpan(ptr, elements_count));
|
||||
} else {
|
||||
half4* ptr = reinterpret_cast<half4*>(desc.data.data());
|
||||
RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
|
||||
absl::MakeSpan(ptr, elements_count));
|
||||
}
|
||||
|
||||
args_.AddObject("weights",
|
||||
absl::make_unique<BufferDescriptor>(std::move(desc)));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
|
@ -82,31 +82,26 @@ absl::Status ConvConstants::UploadWeights(
|
||||
const int kernel_y = weights.shape.h;
|
||||
|
||||
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
|
||||
const int float_size = f32_weights ? 4 : 2;
|
||||
const int float_count = src_channels_ * dst_depth * 4 * kernel_x * kernel_y;
|
||||
|
||||
BufferDescriptor desc;
|
||||
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc.element_size = 4;
|
||||
desc.memory_type = MemoryType::CONSTANT;
|
||||
desc.size = float_size * float_count;
|
||||
desc.data.resize(desc.size);
|
||||
|
||||
const int float_size = f32_weights ? 4 : 2;
|
||||
const int float_count = src_channels_ * dst_depth * 4 * kernel_x * kernel_y;
|
||||
|
||||
Buffer weights_buffer;
|
||||
if (f32_weights) {
|
||||
std::vector<float4> gpu_data(float_count / 4);
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(
|
||||
float_size * float_count, gpu_data.data(), context, &weights_buffer));
|
||||
float4* ptr = reinterpret_cast<float4*>(desc.data.data());
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(ptr, float_count / 4));
|
||||
} else {
|
||||
std::vector<half4> gpu_data(float_count / 4);
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(
|
||||
float_size * float_count, gpu_data.data(), context, &weights_buffer));
|
||||
half4* ptr = reinterpret_cast<half4*>(desc.data.data());
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(ptr, float_count / 4));
|
||||
}
|
||||
|
||||
args_.AddObject("weigths", AccessType::READ,
|
||||
absl::make_unique<Buffer>(std::move(weights_buffer)),
|
||||
absl::make_unique<BufferDescriptor>(desc));
|
||||
args_.AddObject("weigths",
|
||||
absl::make_unique<BufferDescriptor>(std::move(desc)));
|
||||
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
@ -245,30 +245,25 @@ absl::Status ConvPowerVR::UploadBias(const tflite::gpu::Tensor<Linear, T>& bias,
|
||||
ConvPowerVR::WeightsUploadType::CONSTANT_MEM
|
||||
? MemoryType::CONSTANT
|
||||
: MemoryType::GLOBAL;
|
||||
|
||||
Buffer bias_buffer;
|
||||
const int float_size = conv_params_.weights_data_type == DataType::FLOAT32
|
||||
? sizeof(float)
|
||||
: sizeof(half);
|
||||
int aligned_channels = AlignByN(bias.shape.v, 4 * conv_params_.block_size.z);
|
||||
desc.size = float_size * aligned_channels;
|
||||
desc.data.resize(desc.size);
|
||||
if (conv_params_.weights_data_type == DataType::FLOAT32) {
|
||||
std::vector<float> gpu_data(aligned_channels);
|
||||
for (int i = 0; i < gpu_data.size(); ++i) {
|
||||
float* gpu_data = reinterpret_cast<float*>(desc.data.data());
|
||||
for (int i = 0; i < aligned_channels; ++i) {
|
||||
gpu_data[i] = i < bias.shape.v ? bias.data[i] : 0.0f;
|
||||
}
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(sizeof(float) * gpu_data.size(),
|
||||
gpu_data.data(), context,
|
||||
&bias_buffer));
|
||||
} else {
|
||||
std::vector<half> gpu_data(aligned_channels);
|
||||
for (int i = 0; i < gpu_data.size(); ++i) {
|
||||
half* gpu_data = reinterpret_cast<half*>(desc.data.data());
|
||||
for (int i = 0; i < aligned_channels; ++i) {
|
||||
gpu_data[i] = i < bias.shape.v ? bias.data[i] : 0.0f;
|
||||
}
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(sizeof(half) * gpu_data.size(),
|
||||
gpu_data.data(), context,
|
||||
&bias_buffer));
|
||||
}
|
||||
|
||||
args_.AddObject("biases", AccessType::READ,
|
||||
absl::make_unique<Buffer>(std::move(bias_buffer)),
|
||||
absl::make_unique<BufferDescriptor>(desc));
|
||||
args_.AddObject("biases",
|
||||
absl::make_unique<BufferDescriptor>(std::move(desc)));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
@ -285,23 +280,6 @@ absl::Status ConvPowerVR::UploadWeights(
|
||||
const int elements_count =
|
||||
weights.shape.h * weights.shape.w * src_depth * dst_depth_aligned * 4;
|
||||
|
||||
Buffer weights_buffer;
|
||||
if (f32_weights) {
|
||||
std::vector<float4> gpu_data(elements_count);
|
||||
RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
|
||||
absl::MakeSpan(gpu_data));
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
|
||||
gpu_data.data(), context,
|
||||
&weights_buffer));
|
||||
} else {
|
||||
std::vector<half4> gpu_data(elements_count);
|
||||
RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
|
||||
absl::MakeSpan(gpu_data));
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
|
||||
gpu_data.data(), context,
|
||||
&weights_buffer));
|
||||
}
|
||||
|
||||
BufferDescriptor desc;
|
||||
desc.element_type = conv_params_.weights_data_type;
|
||||
desc.element_size = 4;
|
||||
@ -309,10 +287,20 @@ absl::Status ConvPowerVR::UploadWeights(
|
||||
ConvPowerVR::WeightsUploadType::CONSTANT_MEM
|
||||
? MemoryType::CONSTANT
|
||||
: MemoryType::GLOBAL;
|
||||
desc.size = float4_size * elements_count;
|
||||
desc.data.resize(desc.size);
|
||||
|
||||
args_.AddObject("weights", AccessType::READ,
|
||||
absl::make_unique<Buffer>(std::move(weights_buffer)),
|
||||
absl::make_unique<BufferDescriptor>(desc));
|
||||
if (f32_weights) {
|
||||
float4* ptr = reinterpret_cast<float4*>(desc.data.data());
|
||||
RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
|
||||
absl::MakeSpan(ptr, elements_count));
|
||||
} else {
|
||||
half4* ptr = reinterpret_cast<half4*>(desc.data.data());
|
||||
RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
|
||||
absl::MakeSpan(ptr, elements_count));
|
||||
}
|
||||
args_.AddObject("weights",
|
||||
absl::make_unique<BufferDescriptor>(std::move(desc)));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
|
@ -169,69 +169,56 @@ absl::Status ConvTexture::UploadWeights(
|
||||
DataType data_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
|
||||
const int elements_count = texture_width * texture_height;
|
||||
const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4);
|
||||
|
||||
Texture2DDescriptor desc;
|
||||
desc.element_type = data_type;
|
||||
Texture2DDescriptor desc0;
|
||||
desc0.element_type = data_type;
|
||||
desc0.size = int2(texture_width, texture_height);
|
||||
desc0.data.resize(elements_count * float4_size);
|
||||
|
||||
Texture2D weights_0;
|
||||
Texture2D weights_1;
|
||||
Texture2D weights_2;
|
||||
Texture2D weights_3;
|
||||
Texture2DDescriptor desc1;
|
||||
desc1.element_type = data_type;
|
||||
desc1.size = int2(texture_width, texture_height);
|
||||
desc1.data.resize(elements_count * float4_size);
|
||||
|
||||
Texture2DDescriptor desc2;
|
||||
desc2.element_type = data_type;
|
||||
desc2.size = int2(texture_width, texture_height);
|
||||
desc2.data.resize(elements_count * float4_size);
|
||||
|
||||
Texture2DDescriptor desc3;
|
||||
desc3.element_type = data_type;
|
||||
desc3.size = int2(texture_width, texture_height);
|
||||
desc3.data.resize(elements_count * float4_size);
|
||||
|
||||
if (f32_weights) {
|
||||
std::vector<float4> gpu_data_0(elements_count);
|
||||
std::vector<float4> gpu_data_1(elements_count);
|
||||
std::vector<float4> gpu_data_2(elements_count);
|
||||
std::vector<float4> gpu_data_3(elements_count);
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data_0),
|
||||
absl::MakeSpan(gpu_data_1), absl::MakeSpan(gpu_data_2),
|
||||
absl::MakeSpan(gpu_data_3));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
|
||||
texture_height, gpu_data_0.data(),
|
||||
context, &weights_0));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
|
||||
texture_height, gpu_data_1.data(),
|
||||
context, &weights_1));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
|
||||
texture_height, gpu_data_2.data(),
|
||||
context, &weights_2));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
|
||||
texture_height, gpu_data_3.data(),
|
||||
context, &weights_3));
|
||||
float4* ptr0 = reinterpret_cast<float4*>(desc0.data.data());
|
||||
float4* ptr1 = reinterpret_cast<float4*>(desc1.data.data());
|
||||
float4* ptr2 = reinterpret_cast<float4*>(desc2.data.data());
|
||||
float4* ptr3 = reinterpret_cast<float4*>(desc3.data.data());
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(ptr0, elements_count),
|
||||
absl::MakeSpan(ptr1, elements_count),
|
||||
absl::MakeSpan(ptr2, elements_count),
|
||||
absl::MakeSpan(ptr3, elements_count));
|
||||
} else {
|
||||
std::vector<half4> gpu_data_0(elements_count);
|
||||
std::vector<half4> gpu_data_1(elements_count);
|
||||
std::vector<half4> gpu_data_2(elements_count);
|
||||
std::vector<half4> gpu_data_3(elements_count);
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data_0),
|
||||
absl::MakeSpan(gpu_data_1), absl::MakeSpan(gpu_data_2),
|
||||
absl::MakeSpan(gpu_data_3));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
|
||||
texture_height, gpu_data_0.data(),
|
||||
context, &weights_0));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
|
||||
texture_height, gpu_data_1.data(),
|
||||
context, &weights_1));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
|
||||
texture_height, gpu_data_2.data(),
|
||||
context, &weights_2));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
|
||||
texture_height, gpu_data_3.data(),
|
||||
context, &weights_3));
|
||||
half4* ptr0 = reinterpret_cast<half4*>(desc0.data.data());
|
||||
half4* ptr1 = reinterpret_cast<half4*>(desc1.data.data());
|
||||
half4* ptr2 = reinterpret_cast<half4*>(desc2.data.data());
|
||||
half4* ptr3 = reinterpret_cast<half4*>(desc3.data.data());
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(ptr0, elements_count),
|
||||
absl::MakeSpan(ptr1, elements_count),
|
||||
absl::MakeSpan(ptr2, elements_count),
|
||||
absl::MakeSpan(ptr3, elements_count));
|
||||
}
|
||||
|
||||
args_.AddObject("weights0", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_0)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
args_.AddObject("weights1", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_1)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
args_.AddObject("weights2", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_2)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
args_.AddObject("weights3", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_3)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
args_.AddObject("weights0",
|
||||
absl::make_unique<Texture2DDescriptor>(std::move(desc0)));
|
||||
args_.AddObject("weights1",
|
||||
absl::make_unique<Texture2DDescriptor>(std::move(desc1)));
|
||||
args_.AddObject("weights2",
|
||||
absl::make_unique<Texture2DDescriptor>(std::move(desc2)));
|
||||
args_.AddObject("weights3",
|
||||
absl::make_unique<Texture2DDescriptor>(std::move(desc3)));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
|
@ -168,7 +168,7 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
|
||||
"args.dst_tensor.Height() || dst_z >= "
|
||||
"args.dst_tensor.Slices()) return;\n";
|
||||
if (weights_are_buffer) {
|
||||
c += " int f_base = dst_z * args.src_tensor.Slice() * args.kernel_size_x "
|
||||
c += " int f_base = dst_z * args.src_tensor.Slices() * args.kernel_size_x "
|
||||
"* args.kernel_size_y;\n";
|
||||
}
|
||||
for (int i = 0; i < block_size.x * block_size.y * block_size.z; ++i) {
|
||||
|
@ -89,91 +89,62 @@ absl::Status ConvolutionTransposed::UploadWeights(
|
||||
const int src_depth = DivideRoundUp(weights.shape.i, 4);
|
||||
const int kernel_x = kernel_size_.x;
|
||||
const int kernel_y = kernel_size_.y;
|
||||
int texture_width = dst_depth;
|
||||
int texture_height = src_depth * kernel_x * kernel_y;
|
||||
|
||||
const int elements_count = kernel_x * kernel_y * src_depth * dst_depth * 4;
|
||||
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
|
||||
|
||||
const int float4_size = f32_weights ? 16 : 8;
|
||||
std::vector<uint8_t> data(float4_size * elements_count);
|
||||
|
||||
Texture2D weights_0;
|
||||
Texture2D weights_1;
|
||||
Texture2D weights_2;
|
||||
Texture2D weights_3;
|
||||
Buffer weights_buf;
|
||||
if (f32_weights) {
|
||||
std::vector<float4> gpu_data(elements_count);
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
|
||||
if (weights_are_buffer_) {
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
|
||||
gpu_data.data(), context,
|
||||
&weights_buf));
|
||||
} else {
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), dst_depth, src_depth * kernel_x * kernel_y,
|
||||
gpu_data.data(), context, &weights_0));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), dst_depth, src_depth * kernel_x * kernel_y,
|
||||
gpu_data.data() + texture_width * texture_height, context,
|
||||
&weights_1));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), dst_depth, src_depth * kernel_x * kernel_y,
|
||||
gpu_data.data() + texture_width * texture_height * 2, context,
|
||||
&weights_2));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), dst_depth, src_depth * kernel_x * kernel_y,
|
||||
gpu_data.data() + texture_width * texture_height * 3, context,
|
||||
&weights_3));
|
||||
}
|
||||
float4* ptr = reinterpret_cast<float4*>(data.data());
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
|
||||
} else {
|
||||
std::vector<half4> gpu_data(elements_count);
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
|
||||
if (weights_are_buffer_) {
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
|
||||
gpu_data.data(), context,
|
||||
&weights_buf));
|
||||
} else {
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), dst_depth, src_depth * kernel_x * kernel_y,
|
||||
gpu_data.data(), context, &weights_0));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), dst_depth, src_depth * kernel_x * kernel_y,
|
||||
gpu_data.data() + texture_width * texture_height, context,
|
||||
&weights_1));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), dst_depth, src_depth * kernel_x * kernel_y,
|
||||
gpu_data.data() + texture_width * texture_height * 2, context,
|
||||
&weights_2));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), dst_depth, src_depth * kernel_x * kernel_y,
|
||||
gpu_data.data() + texture_width * texture_height * 3, context,
|
||||
&weights_3));
|
||||
}
|
||||
half4* ptr = reinterpret_cast<half4*>(data.data());
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
|
||||
}
|
||||
|
||||
if (weights_are_buffer_) {
|
||||
BufferDescriptor desc;
|
||||
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc.element_size = 16;
|
||||
args_.AddObject("weights", AccessType::READ,
|
||||
absl::make_unique<Buffer>(std::move(weights_buf)),
|
||||
absl::make_unique<BufferDescriptor>(desc));
|
||||
desc.size = float4_size * elements_count;
|
||||
desc.data = std::move(data);
|
||||
args_.AddObject("weights",
|
||||
absl::make_unique<BufferDescriptor>(std::move(desc)));
|
||||
} else {
|
||||
Texture2DDescriptor desc;
|
||||
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
args_.AddObject("weights0", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_0)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
args_.AddObject("weights1", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_1)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
args_.AddObject("weights2", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_2)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
args_.AddObject("weights3", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_3)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
int sub_size = float4_size * elements_count / 4;
|
||||
Texture2DDescriptor desc0;
|
||||
desc0.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc0.size = int2(dst_depth, src_depth * kernel_x * kernel_y);
|
||||
desc0.data.resize(sub_size);
|
||||
memcpy(desc0.data.data(), data.data(), sub_size);
|
||||
args_.AddObject("weights0",
|
||||
absl::make_unique<Texture2DDescriptor>(std::move(desc0)));
|
||||
|
||||
Texture2DDescriptor desc1;
|
||||
desc1.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc1.size = int2(dst_depth, src_depth * kernel_x * kernel_y);
|
||||
desc1.data.resize(sub_size);
|
||||
memcpy(desc1.data.data(), data.data() + sub_size, sub_size);
|
||||
args_.AddObject("weights1",
|
||||
absl::make_unique<Texture2DDescriptor>(std::move(desc1)));
|
||||
|
||||
Texture2DDescriptor desc2;
|
||||
desc2.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc2.size = int2(dst_depth, src_depth * kernel_x * kernel_y);
|
||||
desc2.data.resize(sub_size);
|
||||
memcpy(desc2.data.data(), data.data() + sub_size * 2, sub_size);
|
||||
args_.AddObject("weights2",
|
||||
absl::make_unique<Texture2DDescriptor>(std::move(desc2)));
|
||||
|
||||
Texture2DDescriptor desc3;
|
||||
desc3.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc3.size = int2(dst_depth, src_depth * kernel_x * kernel_y);
|
||||
desc3.data.resize(sub_size);
|
||||
memcpy(desc3.data.data(), data.data() + sub_size * 3, sub_size);
|
||||
args_.AddObject("weights3",
|
||||
absl::make_unique<Texture2DDescriptor>(std::move(desc3)));
|
||||
}
|
||||
|
||||
return absl::OkStatus();
|
||||
|
@ -97,84 +97,57 @@ absl::Status ConvolutionTransposed3D::UploadWeights(
|
||||
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
|
||||
|
||||
const int float4_size = f32_weights ? 16 : 8;
|
||||
std::vector<uint8_t> data(float4_size * elements_count);
|
||||
|
||||
Texture2D weights_0;
|
||||
Texture2D weights_1;
|
||||
Texture2D weights_2;
|
||||
Texture2D weights_3;
|
||||
Buffer weights_buf;
|
||||
if (f32_weights) {
|
||||
std::vector<float4> gpu_data(elements_count);
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
|
||||
if (weights_are_buffer_) {
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
|
||||
gpu_data.data(), context,
|
||||
&weights_buf));
|
||||
} else {
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), texture_width, texture_height,
|
||||
gpu_data.data(), context, &weights_0));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), texture_width, texture_height,
|
||||
gpu_data.data() + texture_width * texture_height, context,
|
||||
&weights_1));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), texture_width, texture_height,
|
||||
gpu_data.data() + texture_width * texture_height * 2, context,
|
||||
&weights_2));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), texture_width, texture_height,
|
||||
gpu_data.data() + texture_width * texture_height * 3, context,
|
||||
&weights_3));
|
||||
}
|
||||
float4* ptr = reinterpret_cast<float4*>(data.data());
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
|
||||
} else {
|
||||
std::vector<half4> gpu_data(elements_count);
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
|
||||
if (weights_are_buffer_) {
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
|
||||
gpu_data.data(), context,
|
||||
&weights_buf));
|
||||
} else {
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), texture_width, texture_height,
|
||||
gpu_data.data(), context, &weights_0));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), texture_width, texture_height,
|
||||
gpu_data.data() + texture_width * texture_height, context,
|
||||
&weights_1));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), texture_width, texture_height,
|
||||
gpu_data.data() + texture_width * texture_height * 2, context,
|
||||
&weights_2));
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), texture_width, texture_height,
|
||||
gpu_data.data() + texture_width * texture_height * 3, context,
|
||||
&weights_3));
|
||||
}
|
||||
half4* ptr = reinterpret_cast<half4*>(data.data());
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
|
||||
}
|
||||
|
||||
if (weights_are_buffer_) {
|
||||
BufferDescriptor desc;
|
||||
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc.element_size = 16;
|
||||
args_.AddObject("weights", AccessType::READ,
|
||||
absl::make_unique<Buffer>(std::move(weights_buf)),
|
||||
absl::make_unique<BufferDescriptor>(desc));
|
||||
desc.size = float4_size * elements_count;
|
||||
desc.data = std::move(data);
|
||||
args_.AddObject("weights",
|
||||
absl::make_unique<BufferDescriptor>(std::move(desc)));
|
||||
} else {
|
||||
Texture2DDescriptor desc;
|
||||
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
args_.AddObject("weights0", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_0)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
args_.AddObject("weights1", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_1)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
args_.AddObject("weights2", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_2)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
args_.AddObject("weights3", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_3)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
int sub_size = float4_size * elements_count / 4;
|
||||
Texture2DDescriptor desc0;
|
||||
desc0.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc0.size = int2(texture_width, texture_height);
|
||||
desc0.data.resize(sub_size);
|
||||
memcpy(desc0.data.data(), data.data(), sub_size);
|
||||
args_.AddObject("weights0",
|
||||
absl::make_unique<Texture2DDescriptor>(std::move(desc0)));
|
||||
|
||||
Texture2DDescriptor desc1;
|
||||
desc1.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc1.size = int2(texture_width, texture_height);
|
||||
desc1.data.resize(sub_size);
|
||||
memcpy(desc1.data.data(), data.data() + sub_size, sub_size);
|
||||
args_.AddObject("weights1",
|
||||
absl::make_unique<Texture2DDescriptor>(std::move(desc1)));
|
||||
|
||||
Texture2DDescriptor desc2;
|
||||
desc2.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc2.size = int2(texture_width, texture_height);
|
||||
desc2.data.resize(sub_size);
|
||||
memcpy(desc2.data.data(), data.data() + sub_size * 2, sub_size);
|
||||
args_.AddObject("weights2",
|
||||
absl::make_unique<Texture2DDescriptor>(std::move(desc2)));
|
||||
|
||||
Texture2DDescriptor desc3;
|
||||
desc3.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc3.size = int2(texture_width, texture_height);
|
||||
desc3.data.resize(sub_size);
|
||||
memcpy(desc3.data.data(), data.data() + sub_size * 3, sub_size);
|
||||
args_.AddObject("weights3",
|
||||
absl::make_unique<Texture2DDescriptor>(std::move(desc3)));
|
||||
}
|
||||
|
||||
return absl::OkStatus();
|
||||
|
@ -96,19 +96,6 @@ absl::Status ConvolutionTransposed3x3::UploadWeights(
|
||||
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
|
||||
const int flt4_size = f32_weights ? sizeof(float4) : sizeof(half4);
|
||||
|
||||
Buffer weights_buffer;
|
||||
if (f32_weights) {
|
||||
std::vector<float4> gpu_data(flt4_count);
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(
|
||||
flt4_size * flt4_count, gpu_data.data(), context, &weights_buffer));
|
||||
} else {
|
||||
std::vector<half4> gpu_data(flt4_count);
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(
|
||||
flt4_size * flt4_count, gpu_data.data(), context, &weights_buffer));
|
||||
}
|
||||
|
||||
BufferDescriptor desc;
|
||||
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc.element_size = 4;
|
||||
@ -117,10 +104,19 @@ absl::Status ConvolutionTransposed3x3::UploadWeights(
|
||||
ConvolutionTransposed3x3::WeightsUploadType::CONSTANT_MEM
|
||||
? MemoryType::CONSTANT
|
||||
: MemoryType::GLOBAL;
|
||||
desc.size = flt4_size * flt4_count;
|
||||
desc.data.resize(desc.size);
|
||||
|
||||
args_.AddObject("weights", AccessType::READ,
|
||||
absl::make_unique<Buffer>(std::move(weights_buffer)),
|
||||
absl::make_unique<BufferDescriptor>(desc));
|
||||
if (f32_weights) {
|
||||
float4* ptr = reinterpret_cast<float4*>(desc.data.data());
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(ptr, flt4_count));
|
||||
} else {
|
||||
half4* ptr = reinterpret_cast<half4*>(desc.data.data());
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(ptr, flt4_count));
|
||||
}
|
||||
|
||||
args_.AddObject("weights",
|
||||
absl::make_unique<BufferDescriptor>(std::move(desc)));
|
||||
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
@ -79,46 +79,41 @@ absl::Status ConvolutionTransposed3x3Thin::UploadData(
|
||||
const int flt4_count = kernel_x * kernel_y * src_depth * dst_depth * 4;
|
||||
|
||||
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
|
||||
const int flt4_size = f32_weights ? sizeof(float4) : sizeof(half4);
|
||||
|
||||
BufferDescriptor desc;
|
||||
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc.element_size = 4;
|
||||
desc.memory_type = MemoryType::CONSTANT;
|
||||
desc.size = flt4_size * (flt4_count + dst_depth);
|
||||
desc.data.resize(desc.size);
|
||||
|
||||
Buffer weights_buffer;
|
||||
if (f32_weights) {
|
||||
std::vector<float4> gpu_data(flt4_count);
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
|
||||
float4* gpu_data = reinterpret_cast<float4*>(desc.data.data());
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data, flt4_count));
|
||||
for (int i = 0; i < dst_depth; ++i) {
|
||||
float4 bias_value(0.0f);
|
||||
for (int c = 0; c < 4; ++c) {
|
||||
int ch = i * 4 + c;
|
||||
bias_value[c] = ch < weights.shape.o ? biases.data[ch] : 0.0f;
|
||||
}
|
||||
gpu_data.push_back(bias_value);
|
||||
gpu_data[flt4_count + i] = bias_value;
|
||||
}
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(sizeof(float4) * gpu_data.size(),
|
||||
gpu_data.data(), context,
|
||||
&weights_buffer));
|
||||
} else {
|
||||
std::vector<half4> gpu_data(flt4_count);
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
|
||||
half4* gpu_data = reinterpret_cast<half4*>(desc.data.data());
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data, flt4_count));
|
||||
for (int i = 0; i < dst_depth; ++i) {
|
||||
half4 bias_value(0.0f);
|
||||
for (int c = 0; c < 4; ++c) {
|
||||
int ch = i * 4 + c;
|
||||
bias_value[c] = ch < weights.shape.o ? biases.data[ch] : 0.0f;
|
||||
}
|
||||
gpu_data.push_back(bias_value);
|
||||
gpu_data[flt4_count + i] = bias_value;
|
||||
}
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(sizeof(half4) * gpu_data.size(),
|
||||
gpu_data.data(), context,
|
||||
&weights_buffer));
|
||||
}
|
||||
|
||||
args_.AddObject("weights", AccessType::READ,
|
||||
absl::make_unique<Buffer>(std::move(weights_buffer)),
|
||||
absl::make_unique<BufferDescriptor>(desc));
|
||||
args_.AddObject("weights",
|
||||
absl::make_unique<BufferDescriptor>(std::move(desc)));
|
||||
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
@ -92,19 +92,6 @@ absl::Status ConvolutionTransposed4x4::UploadWeights(
|
||||
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
|
||||
const int flt4_size = f32_weights ? sizeof(float4) : sizeof(half4);
|
||||
|
||||
Buffer weights_buffer;
|
||||
if (f32_weights) {
|
||||
std::vector<float4> gpu_data(flt4_count);
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(
|
||||
flt4_size * flt4_count, gpu_data.data(), context, &weights_buffer));
|
||||
} else {
|
||||
std::vector<half4> gpu_data(flt4_count);
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(
|
||||
flt4_size * flt4_count, gpu_data.data(), context, &weights_buffer));
|
||||
}
|
||||
|
||||
BufferDescriptor desc;
|
||||
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc.element_size = 4;
|
||||
@ -113,10 +100,19 @@ absl::Status ConvolutionTransposed4x4::UploadWeights(
|
||||
ConvolutionTransposed4x4::WeightsUploadType::CONSTANT_MEM
|
||||
? MemoryType::CONSTANT
|
||||
: MemoryType::GLOBAL;
|
||||
desc.size = flt4_size * flt4_count;
|
||||
desc.data.resize(desc.size);
|
||||
|
||||
args_.AddObject("weights", AccessType::READ,
|
||||
absl::make_unique<Buffer>(std::move(weights_buffer)),
|
||||
absl::make_unique<BufferDescriptor>(desc));
|
||||
if (f32_weights) {
|
||||
float4* ptr = reinterpret_cast<float4*>(desc.data.data());
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(ptr, flt4_count));
|
||||
} else {
|
||||
half4* ptr = reinterpret_cast<half4*>(desc.data.data());
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(ptr, flt4_count));
|
||||
}
|
||||
|
||||
args_.AddObject("weights",
|
||||
absl::make_unique<BufferDescriptor>(std::move(desc)));
|
||||
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
@ -76,40 +76,35 @@ absl::Status ConvolutionTransposedThin::UploadData(
|
||||
weights.shape.w * weights.shape.h * src_depth * weights.shape.o;
|
||||
|
||||
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
|
||||
const int flt4_size = f32_weights ? sizeof(float4) : sizeof(half4);
|
||||
|
||||
BufferDescriptor desc;
|
||||
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc.element_size = 4;
|
||||
desc.memory_type = MemoryType::CONSTANT;
|
||||
desc.size = flt4_size * (flt4_count + 1);
|
||||
desc.data.resize(desc.size);
|
||||
|
||||
Buffer weights_buffer;
|
||||
if (f32_weights) {
|
||||
std::vector<float4> gpu_data(flt4_count);
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
|
||||
float4* gpu_data = reinterpret_cast<float4*>(desc.data.data());
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data, flt4_count));
|
||||
float4 bias_value(0.0f);
|
||||
for (int i = 0; i < weights.shape.o; ++i) {
|
||||
bias_value[i] = biases.data[i];
|
||||
}
|
||||
gpu_data.push_back(bias_value);
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(sizeof(float4) * gpu_data.size(),
|
||||
gpu_data.data(), context,
|
||||
&weights_buffer));
|
||||
gpu_data[flt4_count] = bias_value;
|
||||
} else {
|
||||
std::vector<half4> gpu_data(flt4_count);
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
|
||||
half4* gpu_data = reinterpret_cast<half4*>(desc.data.data());
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data, flt4_count));
|
||||
half4 bias_value(0.0f);
|
||||
for (int i = 0; i < weights.shape.o; ++i) {
|
||||
bias_value[i] = biases.data[i];
|
||||
}
|
||||
gpu_data.push_back(bias_value);
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(sizeof(half4) * gpu_data.size(),
|
||||
gpu_data.data(), context,
|
||||
&weights_buffer));
|
||||
gpu_data[flt4_count] = bias_value;
|
||||
}
|
||||
|
||||
args_.AddObject("weights", AccessType::READ,
|
||||
absl::make_unique<Buffer>(std::move(weights_buffer)),
|
||||
absl::make_unique<BufferDescriptor>(desc));
|
||||
args_.AddObject("weights",
|
||||
absl::make_unique<BufferDescriptor>(std::move(desc)));
|
||||
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
@ -106,47 +106,29 @@ absl::Status DepthwiseConvolution::UploadWeights(
|
||||
const bool fp32_weights = definition_.precision == CalculationsPrecision::F32;
|
||||
const int float4_size = fp32_weights ? 16 : 8;
|
||||
|
||||
Texture2D weights_tex2d;
|
||||
Buffer weights_buf;
|
||||
std::vector<uint8_t> data(float4_size * elements_count);
|
||||
|
||||
if (fp32_weights) {
|
||||
std::vector<float4> gpu_data(elements_count);
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
|
||||
if (weights_are_buffer_) {
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
|
||||
gpu_data.data(), context,
|
||||
&weights_buf));
|
||||
} else {
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), kernel_x * kernel_y, dst_slices,
|
||||
gpu_data.data(), context, &weights_tex2d));
|
||||
}
|
||||
float4* ptr = reinterpret_cast<float4*>(data.data());
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
|
||||
} else {
|
||||
std::vector<half4> gpu_data(elements_count);
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
|
||||
if (weights_are_buffer_) {
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
|
||||
gpu_data.data(), context,
|
||||
&weights_buf));
|
||||
} else {
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), kernel_x * kernel_y, dst_slices,
|
||||
gpu_data.data(), context, &weights_tex2d));
|
||||
}
|
||||
half4* ptr = reinterpret_cast<half4*>(data.data());
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
|
||||
}
|
||||
|
||||
if (weights_are_buffer_) {
|
||||
BufferDescriptor desc;
|
||||
desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc.element_size = 4;
|
||||
args_.AddObject("weights", AccessType::READ,
|
||||
absl::make_unique<Buffer>(std::move(weights_buf)),
|
||||
absl::make_unique<BufferDescriptor>(desc));
|
||||
desc.size = float4_size * elements_count;
|
||||
desc.data = std::move(data);
|
||||
args_.AddObject("weights", absl::make_unique<BufferDescriptor>(desc));
|
||||
} else {
|
||||
Texture2DDescriptor desc;
|
||||
desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
args_.AddObject("weights", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_tex2d)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
desc.size = int2(kernel_x * kernel_y, dst_slices);
|
||||
desc.data = std::move(data);
|
||||
args_.AddObject("weights", absl::make_unique<Texture2DDescriptor>(desc));
|
||||
}
|
||||
|
||||
return absl::OkStatus();
|
||||
@ -195,47 +177,31 @@ absl::Status DepthwiseConvolution::UploadWeights(
|
||||
const bool fp32_weights = definition_.precision == CalculationsPrecision::F32;
|
||||
const int float4_size = fp32_weights ? 16 : 8;
|
||||
|
||||
Texture2D weights_tex2d;
|
||||
Buffer weights_buf;
|
||||
std::vector<uint8_t> data(float4_size * elements_count);
|
||||
|
||||
if (fp32_weights) {
|
||||
std::vector<float4> gpu_data(elements_count);
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
|
||||
if (weights_are_buffer_) {
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
|
||||
gpu_data.data(), context,
|
||||
&weights_buf));
|
||||
} else {
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), kernel_x * kernel_y * kernel_z, dst_slices,
|
||||
gpu_data.data(), context, &weights_tex2d));
|
||||
}
|
||||
float4* ptr = reinterpret_cast<float4*>(data.data());
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
|
||||
} else {
|
||||
std::vector<half4> gpu_data(elements_count);
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
|
||||
if (weights_are_buffer_) {
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
|
||||
gpu_data.data(), context,
|
||||
&weights_buf));
|
||||
} else {
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), kernel_x * kernel_y * kernel_z, dst_slices,
|
||||
gpu_data.data(), context, &weights_tex2d));
|
||||
}
|
||||
half4* ptr = reinterpret_cast<half4*>(data.data());
|
||||
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
|
||||
}
|
||||
|
||||
if (weights_are_buffer_) {
|
||||
BufferDescriptor desc;
|
||||
desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc.element_size = 4;
|
||||
args_.AddObject("weights", AccessType::READ,
|
||||
absl::make_unique<Buffer>(std::move(weights_buf)),
|
||||
absl::make_unique<BufferDescriptor>(desc));
|
||||
desc.size = float4_size * elements_count;
|
||||
desc.data = std::move(data);
|
||||
args_.AddObject("weights",
|
||||
absl::make_unique<BufferDescriptor>(std::move(desc)));
|
||||
} else {
|
||||
Texture2DDescriptor desc;
|
||||
desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
args_.AddObject("weights", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_tex2d)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
desc.size = int2(kernel_x * kernel_y * kernel_z, dst_slices);
|
||||
desc.data = std::move(data);
|
||||
args_.AddObject("weights",
|
||||
absl::make_unique<Texture2DDescriptor>(std::move(desc)));
|
||||
}
|
||||
|
||||
return absl::OkStatus();
|
||||
|
@ -88,47 +88,32 @@ absl::Status DepthwiseConv3x3::UploadWeightsAndBiases(
|
||||
const bool fp32_weights = definition_.precision == CalculationsPrecision::F32;
|
||||
const int float4_size = fp32_weights ? 16 : 8;
|
||||
|
||||
Texture2D weights_tex2d;
|
||||
Buffer weights_buf;
|
||||
std::vector<uint8_t> data(float4_size * elements_count);
|
||||
if (fp32_weights) {
|
||||
std::vector<float4> gpu_data(elements_count);
|
||||
RearrangeWeightsAndBiasesData(weights, biases, absl::MakeSpan(gpu_data));
|
||||
if (weights_are_buffer) {
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
|
||||
gpu_data.data(), context,
|
||||
&weights_buf));
|
||||
} else {
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), texture_width, texture_height,
|
||||
gpu_data.data(), context, &weights_tex2d));
|
||||
}
|
||||
float4* ptr = reinterpret_cast<float4*>(data.data());
|
||||
RearrangeWeightsAndBiasesData(weights, biases,
|
||||
absl::MakeSpan(ptr, elements_count));
|
||||
} else {
|
||||
std::vector<half4> gpu_data(elements_count);
|
||||
RearrangeWeightsAndBiasesData(weights, biases, absl::MakeSpan(gpu_data));
|
||||
if (weights_are_buffer) {
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
|
||||
gpu_data.data(), context,
|
||||
&weights_buf));
|
||||
} else {
|
||||
RETURN_IF_ERROR(CreateTexture2DRGBA(
|
||||
definition_.GetDataType(), texture_width, texture_height,
|
||||
gpu_data.data(), context, &weights_tex2d));
|
||||
}
|
||||
half4* ptr = reinterpret_cast<half4*>(data.data());
|
||||
RearrangeWeightsAndBiasesData(weights, biases,
|
||||
absl::MakeSpan(ptr, elements_count));
|
||||
}
|
||||
|
||||
if (weights_are_buffer) {
|
||||
BufferDescriptor desc;
|
||||
desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc.element_size = 4;
|
||||
args_.AddObject("weights", AccessType::READ,
|
||||
absl::make_unique<Buffer>(std::move(weights_buf)),
|
||||
absl::make_unique<BufferDescriptor>(desc));
|
||||
desc.size = float4_size * elements_count;
|
||||
desc.data = std::move(data);
|
||||
args_.AddObject("weights",
|
||||
absl::make_unique<BufferDescriptor>(std::move(desc)));
|
||||
} else {
|
||||
Texture2DDescriptor desc;
|
||||
desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
args_.AddObject("weights", AccessType::READ,
|
||||
absl::make_unique<Texture2D>(std::move(weights_tex2d)),
|
||||
absl::make_unique<Texture2DDescriptor>(desc));
|
||||
desc.size = int2(texture_width, texture_height);
|
||||
desc.data = std::move(data);
|
||||
args_.AddObject("weights",
|
||||
absl::make_unique<Texture2DDescriptor>(std::move(desc)));
|
||||
}
|
||||
|
||||
return absl::OkStatus();
|
||||
|
@ -131,26 +131,19 @@ absl::Status FullyConnected::UploadWeights(
|
||||
BufferDescriptor desc;
|
||||
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc.element_size = 16;
|
||||
desc.size = float4_size * elements_count;
|
||||
desc.data.resize(desc.size);
|
||||
|
||||
Buffer weights_buffer;
|
||||
if (f32_weights) {
|
||||
std::vector<float4> gpu_data(dst_depth * src_depth * 4);
|
||||
RearrangeFCWeightsToIOO4I4(weights, absl::MakeSpan(gpu_data));
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
|
||||
gpu_data.data(), context,
|
||||
&weights_buffer));
|
||||
float4* ptr = reinterpret_cast<float4*>(desc.data.data());
|
||||
RearrangeFCWeightsToIOO4I4(weights, absl::MakeSpan(ptr, elements_count));
|
||||
} else {
|
||||
std::vector<half4> gpu_data(dst_depth * src_depth * 4);
|
||||
RearrangeFCWeightsToIOO4I4(weights, absl::MakeSpan(gpu_data));
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
|
||||
gpu_data.data(), context,
|
||||
&weights_buffer));
|
||||
half4* ptr = reinterpret_cast<half4*>(desc.data.data());
|
||||
RearrangeFCWeightsToIOO4I4(weights, absl::MakeSpan(ptr, elements_count));
|
||||
}
|
||||
|
||||
args_.AddObject("weights", AccessType::READ,
|
||||
absl::make_unique<Buffer>(std::move(weights_buffer)),
|
||||
absl::make_unique<BufferDescriptor>(desc));
|
||||
|
||||
args_.AddObject("weights",
|
||||
absl::make_unique<BufferDescriptor>(std::move(desc)));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
|
@ -209,6 +209,7 @@ absl::Status GPUOperation::Compile(const CreationContext& creation_context) {
|
||||
std::string code =
|
||||
GetElementWiseCode(definition_, check_src_channels_size_);
|
||||
elementwise_code_ = "{\n" + code_ + "\n}\n" + elementwise_code_;
|
||||
RETURN_IF_ERROR(args_.AllocateObjects(creation_context.context));
|
||||
RETURN_IF_ERROR(args_.TransformToCLCode(
|
||||
creation_context.device->info_,
|
||||
{{dst_tensors_names_[0], elementwise_code_}}, &code));
|
||||
@ -217,6 +218,7 @@ absl::Status GPUOperation::Compile(const CreationContext& creation_context) {
|
||||
code, "main_function", *creation_context.context,
|
||||
*creation_context.device, &kernel_));
|
||||
} else {
|
||||
RETURN_IF_ERROR(args_.AllocateObjects(creation_context.context));
|
||||
RETURN_IF_ERROR(args_.TransformToCLCode(
|
||||
creation_context.device->info_,
|
||||
{{dst_tensors_names_[0], elementwise_code_}}, &code_));
|
||||
|
@ -93,30 +93,25 @@ absl::Status UploadWeights(const DepthwiseConvolution2DAttributes& dw_attr,
|
||||
}
|
||||
}
|
||||
|
||||
Buffer constants_buf;
|
||||
const bool fp32_weights = precision == CalculationsPrecision::F32;
|
||||
const int float_size = fp32_weights ? 4 : 2;
|
||||
if (fp32_weights) {
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(float_size * gpu_data.size(),
|
||||
gpu_data.data(), context,
|
||||
&constants_buf));
|
||||
} else {
|
||||
std::vector<half> gpu_data_half(gpu_data.size());
|
||||
for (int i = 0; i < gpu_data.size(); ++i) {
|
||||
gpu_data_half[i] = gpu_data[i];
|
||||
}
|
||||
RETURN_IF_ERROR(CreateReadOnlyBuffer(float_size * gpu_data_half.size(),
|
||||
gpu_data_half.data(), context,
|
||||
&constants_buf));
|
||||
}
|
||||
|
||||
BufferDescriptor desc;
|
||||
desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
|
||||
desc.element_size = 4;
|
||||
desc.memory_type = MemoryType::CONSTANT;
|
||||
op->args_.AddObject("constants", AccessType::READ,
|
||||
absl::make_unique<Buffer>(std::move(constants_buf)),
|
||||
absl::make_unique<BufferDescriptor>(desc));
|
||||
desc.size = float_size * gpu_data.size();
|
||||
desc.data.resize(desc.size);
|
||||
|
||||
if (fp32_weights) {
|
||||
memcpy(desc.data.data(), gpu_data.data(), desc.size);
|
||||
} else {
|
||||
half* gpu_data_half = reinterpret_cast<half*>(desc.data.data());
|
||||
for (int i = 0; i < gpu_data.size(); ++i) {
|
||||
gpu_data_half[i] = gpu_data[i];
|
||||
}
|
||||
}
|
||||
op->args_.AddObject("constants",
|
||||
absl::make_unique<BufferDescriptor>(std::move(desc)));
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
|
@ -59,6 +59,25 @@ absl::Status CreateTexture2D(int width, int height, cl_channel_type type,
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Move constructor. Hands the GPUObjectDescriptor base part of `desc` to the
// base-class move constructor, copies the small `element_type` and `size`
// fields, and takes over the CPU-side `data` buffer without copying it.
Texture2DDescriptor::Texture2DDescriptor(Texture2DDescriptor&& desc)
|
||||
: GPUObjectDescriptor(std::move(desc)),
|
||||
// The fields below are read after std::move(desc) was passed to the base
// constructor; that call only receives the base sub-object, so these derived
// members are presumably still intact at this point.
element_type(desc.element_type),
|
||||
size(desc.size),
|
||||
data(std::move(desc.data)) {}
|
||||
|
||||
// Move assignment. Assigning an object to itself is a no-op. Otherwise
// `element_type` and `size` are exchanged with `desc`, the CPU-side `data`
// buffer is moved over, and the GPUObjectDescriptor base part is move-assigned
// last.
Texture2DDescriptor& Texture2DDescriptor::operator=(
    Texture2DDescriptor&& desc) {
  if (this == &desc) {
    return *this;
  }
  std::swap(element_type, desc.element_type);
  std::swap(size, desc.size);
  data = std::move(desc.data);
  GPUObjectDescriptor::operator=(std::move(desc));
  return *this;
}
|
||||
|
||||
// Drops the CPU-side copy of the texture contents held in `data`.
//
// The buffer is swapped with an empty temporary instead of calling clear():
// clear() leaves the vector's capacity untouched, so the host allocation would
// stay alive for as long as the descriptor does. After this call `data` is
// empty, exactly as before, and its storage has been returned.
void Texture2DDescriptor::Release() { std::vector<uint8_t>().swap(data); }
|
||||
|
||||
GPUResources Texture2DDescriptor::GetGPUResources() const {
|
||||
GPUResources resources;
|
||||
GPUImage2DDescriptor desc;
|
||||
@ -93,6 +112,14 @@ absl::Status Texture2DDescriptor::PerformReadSelector(
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
// Builds a Texture2D from this descriptor in `context` and stores it in
// `*result`. If texture creation fails the error status is returned and
// `*result` is left untouched.
absl::Status Texture2DDescriptor::CreateGPUObject(CLContext* context,
                                                  GPUObjectPtr* result) const {
  // Create the object on the heap first and only publish it on success.
  auto texture = absl::make_unique<Texture2D>();
  RETURN_IF_ERROR(texture->CreateFromTexture2DDescriptor(*this, context));
  *result = std::move(texture);
  return absl::OkStatus();
}
|
||||
|
||||
// Constructs a Texture2D around an already created OpenCL image handle,
// recording its width, height and channel data type. No OpenCL call is made
// here. NOTE(review): presumably the object takes ownership of `texture` and
// frees it in Release() — confirm against the destructor.
Texture2D::Texture2D(cl_mem texture, int width, int height,
|
||||
cl_channel_type type)
|
||||
: texture_(texture), width_(width), height_(height), channel_type_(type) {}
|
||||
@ -139,6 +166,49 @@ absl::Status Texture2D::GetGPUResources(
|
||||
return absl::OkStatus();
|
||||
}
|
||||
|
||||
// Creates the OpenCL 2D RGBA image described by `tex_desc` inside `context`
// and stores the resulting handle, dimensions and channel data type in this
// object.
//
// tex_desc.size      - image width (x) and height (y) in texels.
// tex_desc.element_type - FLOAT32 selects CL_FLOAT channels, anything else
//                      selects CL_HALF_FLOAT.
// tex_desc.data      - optional initial contents. When non-empty it is copied
//                      into the image at creation (CL_MEM_COPY_HOST_PTR);
//                      when empty the image is created without a host pointer.
//
// Returns InvalidArgumentError when `data` is non-empty but holds fewer bytes
// than the image needs, and UnknownError when image creation itself fails.
absl::Status Texture2D::CreateFromTexture2DDescriptor(
    const Texture2DDescriptor& tex_desc, CLContext* context) {
  const bool is_f32 = tex_desc.element_type == DataType::FLOAT32;
  const bool has_data = !tex_desc.data.empty();

  // With CL_MEM_COPY_HOST_PTR and a zero row pitch the runtime reads
  // width * height texels from the host pointer, so a short buffer would be
  // read out of bounds. Non-positive sizes are left for OpenCL to reject.
  if (has_data && tex_desc.size.x > 0 && tex_desc.size.y > 0) {
    const size_t texel_bytes = 4 * (is_f32 ? 4 : 2);  // RGBA, float or half
    const size_t required_bytes = static_cast<size_t>(tex_desc.size.x) *
                                  static_cast<size_t>(tex_desc.size.y) *
                                  texel_bytes;
    if (tex_desc.data.size() < required_bytes) {
      return absl::InvalidArgumentError(absl::StrCat(
          "Texture2DDescriptor data holds ", tex_desc.data.size(),
          " bytes but the described image needs ", required_bytes));
    }
  }

  // Zero-initialize so that fields not assigned below (image_array_size) are
  // never handed to the driver with indeterminate values.
  cl_image_desc desc = {};
  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
  desc.image_width = tex_desc.size.x;
  desc.image_height = tex_desc.size.y;
  desc.image_depth = 0;
  desc.image_row_pitch = 0;
  desc.image_slice_pitch = 0;
  desc.num_mip_levels = 0;
  desc.num_samples = 0;
  desc.buffer = nullptr;

  cl_image_format format;
  format.image_channel_order = CL_RGBA;
  format.image_channel_data_type = is_f32 ? CL_FLOAT : CL_HALF_FLOAT;

  cl_mem_flags flags = CL_MEM_READ_WRITE;
  void* host_ptr = nullptr;
  if (has_data) {
    flags |= CL_MEM_COPY_HOST_PTR;
    // The OpenCL entry point takes a non-const pointer even though the data
    // is only read when copying.
    host_ptr = const_cast<unsigned char*>(tex_desc.data.data());
  }

  cl_int error_code;
  width_ = tex_desc.size.x;
  height_ = tex_desc.size.y;
  channel_type_ = format.image_channel_data_type;
  texture_ = CreateImage2DLegacy(context->context(), flags, &format, &desc,
                                 host_ptr, &error_code);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(
        absl::StrCat("Failed to create 2D texture (clCreateImage): ",
                     CLErrorCodeToString(error_code)));
  }
  return absl::OkStatus();
}
|
||||
|
||||
// Creates new 4-channel 2D texture with f32 elements
|
||||
absl::Status CreateTexture2DRGBA32F(int width, int height, CLContext* context,
|
||||
Texture2D* result) {
|
||||
|
@ -34,6 +34,16 @@ namespace cl {
|
||||
struct Texture2DDescriptor : public GPUObjectDescriptor {
|
||||
DataType element_type; // FLOAT32 or FLOAT16
|
||||
|
||||
// optional
|
||||
int2 size = int2(0, 0);
|
||||
std::vector<uint8_t> data;
|
||||
|
||||
Texture2DDescriptor() = default;
|
||||
Texture2DDescriptor(const Texture2DDescriptor&) = default;
|
||||
Texture2DDescriptor& operator=(const Texture2DDescriptor&) = default;
|
||||
Texture2DDescriptor(Texture2DDescriptor&& desc);
|
||||
Texture2DDescriptor& operator=(Texture2DDescriptor&& desc);
|
||||
|
||||
absl::Status PerformSelector(const std::string& selector,
|
||||
const std::vector<std::string>& args,
|
||||
const std::vector<std::string>& template_args,
|
||||
@ -42,6 +52,10 @@ struct Texture2DDescriptor : public GPUObjectDescriptor {
|
||||
GPUResources GetGPUResources() const override;
|
||||
absl::Status PerformReadSelector(const std::vector<std::string>& args,
|
||||
std::string* result) const;
|
||||
|
||||
absl::Status CreateGPUObject(CLContext* context,
|
||||
GPUObjectPtr* result) const override;
|
||||
void Release() override;
|
||||
};
|
||||
|
||||
// Texture2D represent formatted GPU data storage.
|
||||
@ -73,6 +87,9 @@ class Texture2D : public GPUObject {
|
||||
absl::Status GetGPUResources(const GPUObjectDescriptor* obj_ptr,
|
||||
GPUResourcesWithValue* resources) const override;
|
||||
|
||||
absl::Status CreateFromTexture2DDescriptor(const Texture2DDescriptor& desc,
|
||||
CLContext* context);
|
||||
|
||||
private:
|
||||
void Release();
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user