Added CPU representation for Buffer and Texture2D.

Removed many OpenCL API calls from operations.

PiperOrigin-RevId: 327071360
Change-Id: I85b9ade32ff49325ddaed43cb64c2c97c2054ec5
This commit is contained in:
Raman Sarokin 2020-08-17 12:14:37 -07:00 committed by TensorFlower Gardener
parent 387414fa32
commit 2d98952a90
25 changed files with 526 additions and 521 deletions

View File

@ -353,6 +353,7 @@ cc_library(
srcs = ["gpu_object.cc"],
hdrs = ["gpu_object.h"],
deps = [
":cl_context",
":opencl_wrapper",
"//tensorflow/lite/delegates/gpu/common:access_type",
"//tensorflow/lite/delegates/gpu/common:data_type",

View File

@ -263,6 +263,12 @@ void Arguments::AddObject(const std::string& name, AccessType access_type,
objects_[name] = {std::move(object), std::move(descriptor_ptr)};
}
// Registers a descriptor-only object under `name`: the GPU object slot is
// stored as nullptr and only the descriptor is kept. The descriptor's access
// is set to AccessType::READ before it is moved into objects_. Whatever was
// previously stored under `name` is replaced by the assignment.
void Arguments::AddObject(const std::string& name,
GPUObjectDescriptorPtr&& descriptor_ptr) {
// descriptor_ptr is dereferenced unconditionally, so it must be non-null.
descriptor_ptr->SetAccess(AccessType::READ);
objects_[name] = {nullptr, std::move(descriptor_ptr)};
}
void Arguments::AddGPUResources(const std::string& name,
const GPUResources& resources) {
for (const auto& r : resources.ints) {
@ -840,6 +846,15 @@ absl::Status Arguments::ResolveSelectorsPass(
return absl::OkStatus();
}
// Walks every registered object and asks its descriptor to create the GPU
// object in `context`, writing the result into that entry's obj_ptr. After a
// successful creation the descriptor's Release() is invoked. A failing
// CreateGPUObject status is handed to RETURN_IF_ERROR; otherwise OkStatus is
// returned once all entries have been processed.
absl::Status Arguments::AllocateObjects(CLContext* context) {
  for (auto& named_object : objects_) {
    auto& entry = named_object.second;
    RETURN_IF_ERROR(
        entry.descriptor->CreateGPUObject(context, &entry.obj_ptr));
    // Only reached when creation above did not report an error.
    entry.descriptor->Release();
  }
  return absl::OkStatus();
}
absl::Status Arguments::AddObjectArgs() {
for (auto& t : objects_) {
AddGPUResources(t.first, t.second.descriptor->GetGPUResources());

View File

@ -54,6 +54,8 @@ class Arguments {
void AddObject(const std::string& name, AccessType access_type,
GPUObjectPtr&& object,
GPUObjectDescriptorPtr&& descriptor_ptr);
void AddObject(const std::string& name,
GPUObjectDescriptorPtr&& descriptor_ptr);
absl::Status SetInt(const std::string& name, int value);
absl::Status SetFloat(const std::string& name, float value);
@ -73,6 +75,7 @@ class Arguments {
void RenameArgs(const std::string& postfix, std::string* code) const;
absl::Status Merge(Arguments&& args, const std::string& postfix);
absl::Status AllocateObjects(CLContext* context);
absl::Status TransformToCLCode(
const DeviceInfo& device_info,
const std::map<std::string, std::string>& linkables, std::string* code);

View File

@ -47,6 +47,30 @@ absl::Status CreateBuffer(size_t size_in_bytes, bool gpu_read_only,
}
} // namespace
// Move constructor. Move-constructs the GPUObjectDescriptor base from `desc`,
// copies the scalar fields (element_type, element_size, memory_type, size)
// and moves the `attributes` and `data` vectors out of `desc`.
BufferDescriptor::BufferDescriptor(BufferDescriptor&& desc)
// `desc` is still read below after being passed to the base constructor;
// only the BufferDescriptor-level fields are accessed there.
: GPUObjectDescriptor(std::move(desc)),
element_type(desc.element_type),
element_size(desc.element_size),
memory_type(desc.memory_type),
attributes(std::move(desc.attributes)),
size(desc.size),
data(std::move(desc.data)) {}
// Move assignment. Self-assignment is a no-op. Scalar fields are exchanged
// with `desc` via std::swap, the `attributes` and `data` vectors are
// move-assigned, and finally the GPUObjectDescriptor base is move-assigned.
BufferDescriptor& BufferDescriptor::operator=(BufferDescriptor&& desc) {
  if (this == &desc) {
    return *this;
  }

  // Plain-value fields: swapped, so `desc` ends up holding the old values.
  std::swap(element_type, desc.element_type);
  std::swap(element_size, desc.element_size);
  std::swap(memory_type, desc.memory_type);
  std::swap(size, desc.size);

  // Containers: contents are taken over from `desc`.
  attributes = std::move(desc.attributes);
  data = std::move(desc.data);

  // Base-class part last, as the final use of `desc`.
  GPUObjectDescriptor::operator=(std::move(desc));
  return *this;
}
// Drops the host-side copy of the buffer contents held in `data`.
// std::vector::clear() leaves the capacity untouched, so the allocation would
// stay alive; swapping with an empty temporary hands the storage to the
// temporary, which frees it when it is destroyed at the end of the statement.
void BufferDescriptor::Release() { std::vector<uint8_t>().swap(data); }
GPUResources BufferDescriptor::GetGPUResources() const {
GPUResources resources;
GPUBufferDescriptor desc;
@ -115,6 +139,14 @@ absl::Status BufferDescriptor::PerformGetPtrSelector(
return absl::OkStatus();
}
// Builds a Buffer from this descriptor in `context` and stores it in
// `*result`. If buffer creation reports an error, that status goes through
// RETURN_IF_ERROR and `*result` is not assigned.
absl::Status BufferDescriptor::CreateGPUObject(CLContext* context,
                                               GPUObjectPtr* result) const {
  auto created = absl::make_unique<Buffer>();
  RETURN_IF_ERROR(created->CreateFromBufferDescriptor(*this, context));
  *result = std::move(created);
  return absl::OkStatus();
}
Buffer::Buffer(cl_mem buffer, size_t size_in_bytes)
: buffer_(buffer), size_(size_in_bytes) {}
@ -151,6 +183,32 @@ absl::Status Buffer::GetGPUResources(const GPUObjectDescriptor* obj_ptr,
return absl::OkStatus();
}
// Creates the OpenCL buffer described by `desc` in `context` and stores the
// resulting handle and size in this Buffer.
//
// - desc.memory_type == MemoryType::CONSTANT selects CL_MEM_READ_ONLY, any
//   other memory type selects CL_MEM_READ_WRITE.
// - When desc.data is non-empty, its bytes are used as the initial contents
//   (CL_MEM_COPY_HOST_PTR); otherwise the buffer is created without a host
//   pointer.
//
// Returns InvalidArgumentError when desc.data is non-empty but holds fewer
// than desc.size bytes, UnknownError when clCreateBuffer yields no buffer,
// and OkStatus otherwise.
absl::Status Buffer::CreateFromBufferDescriptor(const BufferDescriptor& desc,
                                                CLContext* context) {
  const bool has_host_data = !desc.data.empty();
  // With CL_MEM_COPY_HOST_PTR the host allocation must provide at least
  // `size` bytes; reject a shorter vector instead of letting the runtime read
  // past its end. A negative desc.size converts to a huge value and is
  // rejected here as well.
  if (has_host_data && desc.data.size() < static_cast<size_t>(desc.size)) {
    return absl::InvalidArgumentError(absl::StrCat(
        "BufferDescriptor data holds ", desc.data.size(),
        " bytes, which is less than the requested buffer size ", desc.size));
  }

  cl_mem_flags flags = desc.memory_type == MemoryType::CONSTANT
                           ? CL_MEM_READ_ONLY
                           : CL_MEM_READ_WRITE;
  void* host_ptr = nullptr;
  if (has_host_data) {
    flags |= CL_MEM_COPY_HOST_PTR;
    // clCreateBuffer takes a non-const pointer, hence the const_cast.
    host_ptr = const_cast<unsigned char*>(desc.data.data());
  }

  cl_int error_code = CL_SUCCESS;
  size_ = desc.size;
  buffer_ = clCreateBuffer(context->context(), flags, desc.size, host_ptr,
                           &error_code);
  if (!buffer_) {
    return absl::UnknownError(
        absl::StrCat("Failed to allocate device memory (clCreateBuffer): ",
                     CLErrorCodeToString(error_code)));
  }
  return absl::OkStatus();
}
absl::Status CreateReadOnlyBuffer(size_t size_in_bytes, CLContext* context,
Buffer* result) {
return CreateBuffer(size_in_bytes, true, nullptr, context, result);

View File

@ -35,6 +35,16 @@ struct BufferDescriptor : public GPUObjectDescriptor {
MemoryType memory_type = MemoryType::GLOBAL;
std::vector<std::string> attributes;
// optional
int size = 0;
std::vector<uint8_t> data;
BufferDescriptor() = default;
BufferDescriptor(const BufferDescriptor&) = default;
BufferDescriptor& operator=(const BufferDescriptor&) = default;
BufferDescriptor(BufferDescriptor&& desc);
BufferDescriptor& operator=(BufferDescriptor&& desc);
absl::Status PerformSelector(const std::string& selector,
const std::vector<std::string>& args,
const std::vector<std::string>& template_args,
@ -46,6 +56,10 @@ struct BufferDescriptor : public GPUObjectDescriptor {
absl::Status PerformGetPtrSelector(
const std::vector<std::string>& args,
const std::vector<std::string>& template_args, std::string* result) const;
absl::Status CreateGPUObject(CLContext* context,
GPUObjectPtr* result) const override;
void Release() override;
};
// Buffer represents linear GPU data storage with arbitrary data format.
@ -80,6 +94,9 @@ class Buffer : public GPUObject {
absl::Status GetGPUResources(const GPUObjectDescriptor* obj_ptr,
GPUResourcesWithValue* resources) const override;
absl::Status CreateFromBufferDescriptor(const BufferDescriptor& desc,
CLContext* context);
private:
void Release();

View File

@ -21,6 +21,7 @@ limitations under the License.
#include <string>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/common/access_type.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
@ -119,9 +120,21 @@ struct GPUResourcesWithValue {
std::vector<std::pair<std::string, cl_mem>> custom_memories;
};
class GPUObject;
class GPUObjectDescriptor {
public:
GPUObjectDescriptor() = default;
GPUObjectDescriptor(const GPUObjectDescriptor&) = default;
GPUObjectDescriptor& operator=(const GPUObjectDescriptor&) = default;
// Move constructor: takes over the state variables of `obj_desc` and carries
// its access type along. Previously only state_vars_ was transferred, so a
// moved-to descriptor lost the access type set through SetAccess().
GPUObjectDescriptor(GPUObjectDescriptor&& obj_desc)
    : state_vars_(std::move(obj_desc.state_vars_)) {
  // Assigned in the body so this does not depend on the declaration order of
  // the members.
  access_type_ = obj_desc.access_type_;
}
// Move assignment: self-assignment is a no-op; otherwise the state variables
// are moved out of `obj_desc` and its access type is copied. Previously the
// access type was left at the target's old value.
GPUObjectDescriptor& operator=(GPUObjectDescriptor&& obj_desc) {
  if (this != &obj_desc) {
    state_vars_ = std::move(obj_desc.state_vars_);
    access_type_ = obj_desc.access_type_;
  }
  return *this;
}
virtual ~GPUObjectDescriptor() = default;
void SetStateVar(const std::string& key, const std::string& value) const {
@ -141,6 +154,12 @@ class GPUObjectDescriptor {
}
virtual GPUResources GetGPUResources() const { return GPUResources(); }
// Hook for descriptors that can build their GPU object from `context`.
// This default implementation creates nothing: it returns OkStatus and does
// not write to `*result`.
virtual absl::Status CreateGPUObject(
CLContext* context, std::unique_ptr<GPUObject>* result) const {
return absl::OkStatus();
}
// Hook for descriptors to drop data they hold; the default does nothing.
virtual void Release() {}
void SetAccess(AccessType access_type) { access_type_ = access_type; }
AccessType GetAccess() const { return access_type_; }

View File

@ -155,83 +155,57 @@ absl::Status Conv3D::UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights,
const int float4_size = f32_weights ? 16 : 8;
Texture2D weights_0;
Texture2D weights_1;
Texture2D weights_2;
Texture2D weights_3;
Buffer weights_buf;
std::vector<uint8_t> data(float4_size * elements_count);
if (f32_weights) {
std::vector<float4> gpu_data(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
if (conv_params_.AreWeightsBuffer()) {
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buf));
float4* ptr = reinterpret_cast<float4*>(data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
} else {
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), texture_width, texture_height,
gpu_data.data(), context, &weights_0));
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), texture_width, texture_height,
gpu_data.data() + texture_width * texture_height, context,
&weights_1));
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), texture_width, texture_height,
gpu_data.data() + texture_width * texture_height * 2, context,
&weights_2));
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), texture_width, texture_height,
gpu_data.data() + texture_width * texture_height * 3, context,
&weights_3));
}
} else {
std::vector<half4> gpu_data(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
if (conv_params_.AreWeightsBuffer()) {
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buf));
} else {
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), texture_width, texture_height,
gpu_data.data(), context, &weights_0));
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), texture_width, texture_height,
gpu_data.data() + texture_width * texture_height, context,
&weights_1));
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), texture_width, texture_height,
gpu_data.data() + texture_width * texture_height * 2, context,
&weights_2));
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), texture_width, texture_height,
gpu_data.data() + texture_width * texture_height * 3, context,
&weights_3));
}
half4* ptr = reinterpret_cast<half4*>(data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
}
if (conv_params_.AreWeightsBuffer()) {
BufferDescriptor desc;
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.element_size = 4;
args_.AddObject("weights", AccessType::READ,
absl::make_unique<Buffer>(std::move(weights_buf)),
absl::make_unique<BufferDescriptor>(desc));
desc.size = float4_size * elements_count;
desc.data = std::move(data);
args_.AddObject("weights",
absl::make_unique<BufferDescriptor>(std::move(desc)));
} else {
Texture2DDescriptor desc;
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
args_.AddObject("weights0", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_0)),
absl::make_unique<Texture2DDescriptor>(desc));
args_.AddObject("weights1", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_1)),
absl::make_unique<Texture2DDescriptor>(desc));
args_.AddObject("weights2", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_2)),
absl::make_unique<Texture2DDescriptor>(desc));
args_.AddObject("weights3", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_3)),
absl::make_unique<Texture2DDescriptor>(desc));
int sub_size = float4_size * elements_count / 4;
Texture2DDescriptor desc0;
desc0.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc0.size = int2(texture_width, texture_height);
desc0.data.resize(sub_size);
memcpy(desc0.data.data(), data.data(), sub_size);
args_.AddObject("weights0",
absl::make_unique<Texture2DDescriptor>(std::move(desc0)));
Texture2DDescriptor desc1;
desc1.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc1.size = int2(texture_width, texture_height);
desc1.data.resize(sub_size);
memcpy(desc1.data.data(), data.data() + sub_size, sub_size);
args_.AddObject("weights1",
absl::make_unique<Texture2DDescriptor>(std::move(desc1)));
Texture2DDescriptor desc2;
desc2.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc2.size = int2(texture_width, texture_height);
desc2.data.resize(sub_size);
memcpy(desc2.data.data(), data.data() + sub_size * 2, sub_size);
args_.AddObject("weights2",
absl::make_unique<Texture2DDescriptor>(std::move(desc2)));
Texture2DDescriptor desc3;
desc3.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc3.size = int2(texture_width, texture_height);
desc3.data.resize(sub_size);
memcpy(desc3.data.data(), data.data() + sub_size * 3, sub_size);
args_.AddObject("weights3",
absl::make_unique<Texture2DDescriptor>(std::move(desc3)));
}
return absl::OkStatus();

View File

@ -150,31 +150,25 @@ absl::Status ConvBuffer1x1::UploadWeights(
const int elements_count =
weights.shape.h * weights.shape.w * src_depth * dst_depth_aligned * 4;
Buffer weights_buffer;
if (f32_weights) {
std::vector<float4> gpu_data(elements_count);
RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
absl::MakeSpan(gpu_data));
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buffer));
} else {
std::vector<half4> gpu_data(elements_count);
RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
absl::MakeSpan(gpu_data));
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buffer));
}
BufferDescriptor desc;
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.element_size = 16;
desc.memory_type = MemoryType::GLOBAL;
desc.size = float4_size * elements_count;
desc.data.resize(desc.size);
args_.AddObject("weights", AccessType::READ,
absl::make_unique<Buffer>(std::move(weights_buffer)),
absl::make_unique<BufferDescriptor>(desc));
if (f32_weights) {
float4* ptr = reinterpret_cast<float4*>(desc.data.data());
RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
absl::MakeSpan(ptr, elements_count));
} else {
half4* ptr = reinterpret_cast<half4*>(desc.data.data());
RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
absl::MakeSpan(ptr, elements_count));
}
args_.AddObject("weights",
absl::make_unique<BufferDescriptor>(std::move(desc)));
return absl::OkStatus();
}

View File

@ -82,31 +82,26 @@ absl::Status ConvConstants::UploadWeights(
const int kernel_y = weights.shape.h;
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
const int float_size = f32_weights ? 4 : 2;
const int float_count = src_channels_ * dst_depth * 4 * kernel_x * kernel_y;
BufferDescriptor desc;
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.element_size = 4;
desc.memory_type = MemoryType::CONSTANT;
desc.size = float_size * float_count;
desc.data.resize(desc.size);
const int float_size = f32_weights ? 4 : 2;
const int float_count = src_channels_ * dst_depth * 4 * kernel_x * kernel_y;
Buffer weights_buffer;
if (f32_weights) {
std::vector<float4> gpu_data(float_count / 4);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
RETURN_IF_ERROR(CreateReadOnlyBuffer(
float_size * float_count, gpu_data.data(), context, &weights_buffer));
float4* ptr = reinterpret_cast<float4*>(desc.data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, float_count / 4));
} else {
std::vector<half4> gpu_data(float_count / 4);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
RETURN_IF_ERROR(CreateReadOnlyBuffer(
float_size * float_count, gpu_data.data(), context, &weights_buffer));
half4* ptr = reinterpret_cast<half4*>(desc.data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, float_count / 4));
}
args_.AddObject("weigths", AccessType::READ,
absl::make_unique<Buffer>(std::move(weights_buffer)),
absl::make_unique<BufferDescriptor>(desc));
args_.AddObject("weigths",
absl::make_unique<BufferDescriptor>(std::move(desc)));
return absl::OkStatus();
}

View File

@ -245,30 +245,25 @@ absl::Status ConvPowerVR::UploadBias(const tflite::gpu::Tensor<Linear, T>& bias,
ConvPowerVR::WeightsUploadType::CONSTANT_MEM
? MemoryType::CONSTANT
: MemoryType::GLOBAL;
Buffer bias_buffer;
const int float_size = conv_params_.weights_data_type == DataType::FLOAT32
? sizeof(float)
: sizeof(half);
int aligned_channels = AlignByN(bias.shape.v, 4 * conv_params_.block_size.z);
desc.size = float_size * aligned_channels;
desc.data.resize(desc.size);
if (conv_params_.weights_data_type == DataType::FLOAT32) {
std::vector<float> gpu_data(aligned_channels);
for (int i = 0; i < gpu_data.size(); ++i) {
float* gpu_data = reinterpret_cast<float*>(desc.data.data());
for (int i = 0; i < aligned_channels; ++i) {
gpu_data[i] = i < bias.shape.v ? bias.data[i] : 0.0f;
}
RETURN_IF_ERROR(CreateReadOnlyBuffer(sizeof(float) * gpu_data.size(),
gpu_data.data(), context,
&bias_buffer));
} else {
std::vector<half> gpu_data(aligned_channels);
for (int i = 0; i < gpu_data.size(); ++i) {
half* gpu_data = reinterpret_cast<half*>(desc.data.data());
for (int i = 0; i < aligned_channels; ++i) {
gpu_data[i] = i < bias.shape.v ? bias.data[i] : 0.0f;
}
RETURN_IF_ERROR(CreateReadOnlyBuffer(sizeof(half) * gpu_data.size(),
gpu_data.data(), context,
&bias_buffer));
}
args_.AddObject("biases", AccessType::READ,
absl::make_unique<Buffer>(std::move(bias_buffer)),
absl::make_unique<BufferDescriptor>(desc));
args_.AddObject("biases",
absl::make_unique<BufferDescriptor>(std::move(desc)));
return absl::OkStatus();
}
@ -285,23 +280,6 @@ absl::Status ConvPowerVR::UploadWeights(
const int elements_count =
weights.shape.h * weights.shape.w * src_depth * dst_depth_aligned * 4;
Buffer weights_buffer;
if (f32_weights) {
std::vector<float4> gpu_data(elements_count);
RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
absl::MakeSpan(gpu_data));
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buffer));
} else {
std::vector<half4> gpu_data(elements_count);
RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
absl::MakeSpan(gpu_data));
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buffer));
}
BufferDescriptor desc;
desc.element_type = conv_params_.weights_data_type;
desc.element_size = 4;
@ -309,10 +287,20 @@ absl::Status ConvPowerVR::UploadWeights(
ConvPowerVR::WeightsUploadType::CONSTANT_MEM
? MemoryType::CONSTANT
: MemoryType::GLOBAL;
desc.size = float4_size * elements_count;
desc.data.resize(desc.size);
args_.AddObject("weights", AccessType::READ,
absl::make_unique<Buffer>(std::move(weights_buffer)),
absl::make_unique<BufferDescriptor>(desc));
if (f32_weights) {
float4* ptr = reinterpret_cast<float4*>(desc.data.data());
RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
absl::MakeSpan(ptr, elements_count));
} else {
half4* ptr = reinterpret_cast<half4*>(desc.data.data());
RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
absl::MakeSpan(ptr, elements_count));
}
args_.AddObject("weights",
absl::make_unique<BufferDescriptor>(std::move(desc)));
return absl::OkStatus();
}

View File

@ -169,69 +169,56 @@ absl::Status ConvTexture::UploadWeights(
DataType data_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
const int elements_count = texture_width * texture_height;
const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4);
Texture2DDescriptor desc;
desc.element_type = data_type;
Texture2DDescriptor desc0;
desc0.element_type = data_type;
desc0.size = int2(texture_width, texture_height);
desc0.data.resize(elements_count * float4_size);
Texture2D weights_0;
Texture2D weights_1;
Texture2D weights_2;
Texture2D weights_3;
Texture2DDescriptor desc1;
desc1.element_type = data_type;
desc1.size = int2(texture_width, texture_height);
desc1.data.resize(elements_count * float4_size);
Texture2DDescriptor desc2;
desc2.element_type = data_type;
desc2.size = int2(texture_width, texture_height);
desc2.data.resize(elements_count * float4_size);
Texture2DDescriptor desc3;
desc3.element_type = data_type;
desc3.size = int2(texture_width, texture_height);
desc3.data.resize(elements_count * float4_size);
if (f32_weights) {
std::vector<float4> gpu_data_0(elements_count);
std::vector<float4> gpu_data_1(elements_count);
std::vector<float4> gpu_data_2(elements_count);
std::vector<float4> gpu_data_3(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data_0),
absl::MakeSpan(gpu_data_1), absl::MakeSpan(gpu_data_2),
absl::MakeSpan(gpu_data_3));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_0.data(),
context, &weights_0));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_1.data(),
context, &weights_1));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_2.data(),
context, &weights_2));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_3.data(),
context, &weights_3));
float4* ptr0 = reinterpret_cast<float4*>(desc0.data.data());
float4* ptr1 = reinterpret_cast<float4*>(desc1.data.data());
float4* ptr2 = reinterpret_cast<float4*>(desc2.data.data());
float4* ptr3 = reinterpret_cast<float4*>(desc3.data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr0, elements_count),
absl::MakeSpan(ptr1, elements_count),
absl::MakeSpan(ptr2, elements_count),
absl::MakeSpan(ptr3, elements_count));
} else {
std::vector<half4> gpu_data_0(elements_count);
std::vector<half4> gpu_data_1(elements_count);
std::vector<half4> gpu_data_2(elements_count);
std::vector<half4> gpu_data_3(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data_0),
absl::MakeSpan(gpu_data_1), absl::MakeSpan(gpu_data_2),
absl::MakeSpan(gpu_data_3));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_0.data(),
context, &weights_0));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_1.data(),
context, &weights_1));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_2.data(),
context, &weights_2));
RETURN_IF_ERROR(CreateTexture2DRGBA(data_type, texture_width,
texture_height, gpu_data_3.data(),
context, &weights_3));
half4* ptr0 = reinterpret_cast<half4*>(desc0.data.data());
half4* ptr1 = reinterpret_cast<half4*>(desc1.data.data());
half4* ptr2 = reinterpret_cast<half4*>(desc2.data.data());
half4* ptr3 = reinterpret_cast<half4*>(desc3.data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr0, elements_count),
absl::MakeSpan(ptr1, elements_count),
absl::MakeSpan(ptr2, elements_count),
absl::MakeSpan(ptr3, elements_count));
}
args_.AddObject("weights0", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_0)),
absl::make_unique<Texture2DDescriptor>(desc));
args_.AddObject("weights1", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_1)),
absl::make_unique<Texture2DDescriptor>(desc));
args_.AddObject("weights2", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_2)),
absl::make_unique<Texture2DDescriptor>(desc));
args_.AddObject("weights3", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_3)),
absl::make_unique<Texture2DDescriptor>(desc));
args_.AddObject("weights0",
absl::make_unique<Texture2DDescriptor>(std::move(desc0)));
args_.AddObject("weights1",
absl::make_unique<Texture2DDescriptor>(std::move(desc1)));
args_.AddObject("weights2",
absl::make_unique<Texture2DDescriptor>(std::move(desc2)));
args_.AddObject("weights3",
absl::make_unique<Texture2DDescriptor>(std::move(desc3)));
return absl::OkStatus();
}

View File

@ -168,7 +168,7 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
"args.dst_tensor.Height() || dst_z >= "
"args.dst_tensor.Slices()) return;\n";
if (weights_are_buffer) {
c += " int f_base = dst_z * args.src_tensor.Slice() * args.kernel_size_x "
c += " int f_base = dst_z * args.src_tensor.Slices() * args.kernel_size_x "
"* args.kernel_size_y;\n";
}
for (int i = 0; i < block_size.x * block_size.y * block_size.z; ++i) {

View File

@ -89,91 +89,62 @@ absl::Status ConvolutionTransposed::UploadWeights(
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;
int texture_width = dst_depth;
int texture_height = src_depth * kernel_x * kernel_y;
const int elements_count = kernel_x * kernel_y * src_depth * dst_depth * 4;
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
const int float4_size = f32_weights ? 16 : 8;
std::vector<uint8_t> data(float4_size * elements_count);
Texture2D weights_0;
Texture2D weights_1;
Texture2D weights_2;
Texture2D weights_3;
Buffer weights_buf;
if (f32_weights) {
std::vector<float4> gpu_data(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
if (weights_are_buffer_) {
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buf));
float4* ptr = reinterpret_cast<float4*>(data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
} else {
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), dst_depth, src_depth * kernel_x * kernel_y,
gpu_data.data(), context, &weights_0));
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), dst_depth, src_depth * kernel_x * kernel_y,
gpu_data.data() + texture_width * texture_height, context,
&weights_1));
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), dst_depth, src_depth * kernel_x * kernel_y,
gpu_data.data() + texture_width * texture_height * 2, context,
&weights_2));
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), dst_depth, src_depth * kernel_x * kernel_y,
gpu_data.data() + texture_width * texture_height * 3, context,
&weights_3));
}
} else {
std::vector<half4> gpu_data(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
if (weights_are_buffer_) {
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buf));
} else {
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), dst_depth, src_depth * kernel_x * kernel_y,
gpu_data.data(), context, &weights_0));
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), dst_depth, src_depth * kernel_x * kernel_y,
gpu_data.data() + texture_width * texture_height, context,
&weights_1));
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), dst_depth, src_depth * kernel_x * kernel_y,
gpu_data.data() + texture_width * texture_height * 2, context,
&weights_2));
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), dst_depth, src_depth * kernel_x * kernel_y,
gpu_data.data() + texture_width * texture_height * 3, context,
&weights_3));
}
half4* ptr = reinterpret_cast<half4*>(data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
}
if (weights_are_buffer_) {
BufferDescriptor desc;
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.element_size = 16;
args_.AddObject("weights", AccessType::READ,
absl::make_unique<Buffer>(std::move(weights_buf)),
absl::make_unique<BufferDescriptor>(desc));
desc.size = float4_size * elements_count;
desc.data = std::move(data);
args_.AddObject("weights",
absl::make_unique<BufferDescriptor>(std::move(desc)));
} else {
Texture2DDescriptor desc;
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
args_.AddObject("weights0", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_0)),
absl::make_unique<Texture2DDescriptor>(desc));
args_.AddObject("weights1", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_1)),
absl::make_unique<Texture2DDescriptor>(desc));
args_.AddObject("weights2", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_2)),
absl::make_unique<Texture2DDescriptor>(desc));
args_.AddObject("weights3", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_3)),
absl::make_unique<Texture2DDescriptor>(desc));
int sub_size = float4_size * elements_count / 4;
Texture2DDescriptor desc0;
desc0.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc0.size = int2(dst_depth, src_depth * kernel_x * kernel_y);
desc0.data.resize(sub_size);
memcpy(desc0.data.data(), data.data(), sub_size);
args_.AddObject("weights0",
absl::make_unique<Texture2DDescriptor>(std::move(desc0)));
Texture2DDescriptor desc1;
desc1.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc1.size = int2(dst_depth, src_depth * kernel_x * kernel_y);
desc1.data.resize(sub_size);
memcpy(desc1.data.data(), data.data() + sub_size, sub_size);
args_.AddObject("weights1",
absl::make_unique<Texture2DDescriptor>(std::move(desc1)));
Texture2DDescriptor desc2;
desc2.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc2.size = int2(dst_depth, src_depth * kernel_x * kernel_y);
desc2.data.resize(sub_size);
memcpy(desc2.data.data(), data.data() + sub_size * 2, sub_size);
args_.AddObject("weights2",
absl::make_unique<Texture2DDescriptor>(std::move(desc2)));
Texture2DDescriptor desc3;
desc3.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc3.size = int2(dst_depth, src_depth * kernel_x * kernel_y);
desc3.data.resize(sub_size);
memcpy(desc3.data.data(), data.data() + sub_size * 3, sub_size);
args_.AddObject("weights3",
absl::make_unique<Texture2DDescriptor>(std::move(desc3)));
}
return absl::OkStatus();

View File

@ -97,84 +97,57 @@ absl::Status ConvolutionTransposed3D::UploadWeights(
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
const int float4_size = f32_weights ? 16 : 8;
std::vector<uint8_t> data(float4_size * elements_count);
Texture2D weights_0;
Texture2D weights_1;
Texture2D weights_2;
Texture2D weights_3;
Buffer weights_buf;
if (f32_weights) {
std::vector<float4> gpu_data(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
if (weights_are_buffer_) {
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buf));
float4* ptr = reinterpret_cast<float4*>(data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
} else {
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), texture_width, texture_height,
gpu_data.data(), context, &weights_0));
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), texture_width, texture_height,
gpu_data.data() + texture_width * texture_height, context,
&weights_1));
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), texture_width, texture_height,
gpu_data.data() + texture_width * texture_height * 2, context,
&weights_2));
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), texture_width, texture_height,
gpu_data.data() + texture_width * texture_height * 3, context,
&weights_3));
}
} else {
std::vector<half4> gpu_data(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
if (weights_are_buffer_) {
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buf));
} else {
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), texture_width, texture_height,
gpu_data.data(), context, &weights_0));
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), texture_width, texture_height,
gpu_data.data() + texture_width * texture_height, context,
&weights_1));
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), texture_width, texture_height,
gpu_data.data() + texture_width * texture_height * 2, context,
&weights_2));
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), texture_width, texture_height,
gpu_data.data() + texture_width * texture_height * 3, context,
&weights_3));
}
half4* ptr = reinterpret_cast<half4*>(data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
}
if (weights_are_buffer_) {
BufferDescriptor desc;
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.element_size = 16;
args_.AddObject("weights", AccessType::READ,
absl::make_unique<Buffer>(std::move(weights_buf)),
absl::make_unique<BufferDescriptor>(desc));
desc.size = float4_size * elements_count;
desc.data = std::move(data);
args_.AddObject("weights",
absl::make_unique<BufferDescriptor>(std::move(desc)));
} else {
Texture2DDescriptor desc;
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
args_.AddObject("weights0", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_0)),
absl::make_unique<Texture2DDescriptor>(desc));
args_.AddObject("weights1", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_1)),
absl::make_unique<Texture2DDescriptor>(desc));
args_.AddObject("weights2", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_2)),
absl::make_unique<Texture2DDescriptor>(desc));
args_.AddObject("weights3", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_3)),
absl::make_unique<Texture2DDescriptor>(desc));
int sub_size = float4_size * elements_count / 4;
Texture2DDescriptor desc0;
desc0.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc0.size = int2(texture_width, texture_height);
desc0.data.resize(sub_size);
memcpy(desc0.data.data(), data.data(), sub_size);
args_.AddObject("weights0",
absl::make_unique<Texture2DDescriptor>(std::move(desc0)));
Texture2DDescriptor desc1;
desc1.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc1.size = int2(texture_width, texture_height);
desc1.data.resize(sub_size);
memcpy(desc1.data.data(), data.data() + sub_size, sub_size);
args_.AddObject("weights1",
absl::make_unique<Texture2DDescriptor>(std::move(desc1)));
Texture2DDescriptor desc2;
desc2.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc2.size = int2(texture_width, texture_height);
desc2.data.resize(sub_size);
memcpy(desc2.data.data(), data.data() + sub_size * 2, sub_size);
args_.AddObject("weights2",
absl::make_unique<Texture2DDescriptor>(std::move(desc2)));
Texture2DDescriptor desc3;
desc3.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc3.size = int2(texture_width, texture_height);
desc3.data.resize(sub_size);
memcpy(desc3.data.data(), data.data() + sub_size * 3, sub_size);
args_.AddObject("weights3",
absl::make_unique<Texture2DDescriptor>(std::move(desc3)));
}
return absl::OkStatus();

View File

@ -96,19 +96,6 @@ absl::Status ConvolutionTransposed3x3::UploadWeights(
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
const int flt4_size = f32_weights ? sizeof(float4) : sizeof(half4);
Buffer weights_buffer;
if (f32_weights) {
std::vector<float4> gpu_data(flt4_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
RETURN_IF_ERROR(CreateReadOnlyBuffer(
flt4_size * flt4_count, gpu_data.data(), context, &weights_buffer));
} else {
std::vector<half4> gpu_data(flt4_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
RETURN_IF_ERROR(CreateReadOnlyBuffer(
flt4_size * flt4_count, gpu_data.data(), context, &weights_buffer));
}
BufferDescriptor desc;
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.element_size = 4;
@ -117,10 +104,19 @@ absl::Status ConvolutionTransposed3x3::UploadWeights(
ConvolutionTransposed3x3::WeightsUploadType::CONSTANT_MEM
? MemoryType::CONSTANT
: MemoryType::GLOBAL;
desc.size = flt4_size * flt4_count;
desc.data.resize(desc.size);
args_.AddObject("weights", AccessType::READ,
absl::make_unique<Buffer>(std::move(weights_buffer)),
absl::make_unique<BufferDescriptor>(desc));
if (f32_weights) {
float4* ptr = reinterpret_cast<float4*>(desc.data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, flt4_count));
} else {
half4* ptr = reinterpret_cast<half4*>(desc.data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, flt4_count));
}
args_.AddObject("weights",
absl::make_unique<BufferDescriptor>(std::move(desc)));
return absl::OkStatus();
}

View File

@ -79,46 +79,41 @@ absl::Status ConvolutionTransposed3x3Thin::UploadData(
const int flt4_count = kernel_x * kernel_y * src_depth * dst_depth * 4;
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
const int flt4_size = f32_weights ? sizeof(float4) : sizeof(half4);
BufferDescriptor desc;
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.element_size = 4;
desc.memory_type = MemoryType::CONSTANT;
desc.size = flt4_size * (flt4_count + dst_depth);
desc.data.resize(desc.size);
Buffer weights_buffer;
if (f32_weights) {
std::vector<float4> gpu_data(flt4_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
float4* gpu_data = reinterpret_cast<float4*>(desc.data.data());
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data, flt4_count));
for (int i = 0; i < dst_depth; ++i) {
float4 bias_value(0.0f);
for (int c = 0; c < 4; ++c) {
int ch = i * 4 + c;
bias_value[c] = ch < weights.shape.o ? biases.data[ch] : 0.0f;
}
gpu_data.push_back(bias_value);
gpu_data[flt4_count + i] = bias_value;
}
RETURN_IF_ERROR(CreateReadOnlyBuffer(sizeof(float4) * gpu_data.size(),
gpu_data.data(), context,
&weights_buffer));
} else {
std::vector<half4> gpu_data(flt4_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
half4* gpu_data = reinterpret_cast<half4*>(desc.data.data());
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data, flt4_count));
for (int i = 0; i < dst_depth; ++i) {
half4 bias_value(0.0f);
for (int c = 0; c < 4; ++c) {
int ch = i * 4 + c;
bias_value[c] = ch < weights.shape.o ? biases.data[ch] : 0.0f;
}
gpu_data.push_back(bias_value);
gpu_data[flt4_count + i] = bias_value;
}
RETURN_IF_ERROR(CreateReadOnlyBuffer(sizeof(half4) * gpu_data.size(),
gpu_data.data(), context,
&weights_buffer));
}
args_.AddObject("weights", AccessType::READ,
absl::make_unique<Buffer>(std::move(weights_buffer)),
absl::make_unique<BufferDescriptor>(desc));
args_.AddObject("weights",
absl::make_unique<BufferDescriptor>(std::move(desc)));
return absl::OkStatus();
}

View File

@ -92,19 +92,6 @@ absl::Status ConvolutionTransposed4x4::UploadWeights(
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
const int flt4_size = f32_weights ? sizeof(float4) : sizeof(half4);
Buffer weights_buffer;
if (f32_weights) {
std::vector<float4> gpu_data(flt4_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
RETURN_IF_ERROR(CreateReadOnlyBuffer(
flt4_size * flt4_count, gpu_data.data(), context, &weights_buffer));
} else {
std::vector<half4> gpu_data(flt4_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
RETURN_IF_ERROR(CreateReadOnlyBuffer(
flt4_size * flt4_count, gpu_data.data(), context, &weights_buffer));
}
BufferDescriptor desc;
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.element_size = 4;
@ -113,10 +100,19 @@ absl::Status ConvolutionTransposed4x4::UploadWeights(
ConvolutionTransposed4x4::WeightsUploadType::CONSTANT_MEM
? MemoryType::CONSTANT
: MemoryType::GLOBAL;
desc.size = flt4_size * flt4_count;
desc.data.resize(desc.size);
args_.AddObject("weights", AccessType::READ,
absl::make_unique<Buffer>(std::move(weights_buffer)),
absl::make_unique<BufferDescriptor>(desc));
if (f32_weights) {
float4* ptr = reinterpret_cast<float4*>(desc.data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, flt4_count));
} else {
half4* ptr = reinterpret_cast<half4*>(desc.data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, flt4_count));
}
args_.AddObject("weights",
absl::make_unique<BufferDescriptor>(std::move(desc)));
return absl::OkStatus();
}

View File

@ -76,40 +76,35 @@ absl::Status ConvolutionTransposedThin::UploadData(
weights.shape.w * weights.shape.h * src_depth * weights.shape.o;
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
const int flt4_size = f32_weights ? sizeof(float4) : sizeof(half4);
BufferDescriptor desc;
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.element_size = 4;
desc.memory_type = MemoryType::CONSTANT;
desc.size = flt4_size * (flt4_count + 1);
desc.data.resize(desc.size);
Buffer weights_buffer;
if (f32_weights) {
std::vector<float4> gpu_data(flt4_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
float4* gpu_data = reinterpret_cast<float4*>(desc.data.data());
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data, flt4_count));
float4 bias_value(0.0f);
for (int i = 0; i < weights.shape.o; ++i) {
bias_value[i] = biases.data[i];
}
gpu_data.push_back(bias_value);
RETURN_IF_ERROR(CreateReadOnlyBuffer(sizeof(float4) * gpu_data.size(),
gpu_data.data(), context,
&weights_buffer));
gpu_data[flt4_count] = bias_value;
} else {
std::vector<half4> gpu_data(flt4_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
half4* gpu_data = reinterpret_cast<half4*>(desc.data.data());
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data, flt4_count));
half4 bias_value(0.0f);
for (int i = 0; i < weights.shape.o; ++i) {
bias_value[i] = biases.data[i];
}
gpu_data.push_back(bias_value);
RETURN_IF_ERROR(CreateReadOnlyBuffer(sizeof(half4) * gpu_data.size(),
gpu_data.data(), context,
&weights_buffer));
gpu_data[flt4_count] = bias_value;
}
args_.AddObject("weights", AccessType::READ,
absl::make_unique<Buffer>(std::move(weights_buffer)),
absl::make_unique<BufferDescriptor>(desc));
args_.AddObject("weights",
absl::make_unique<BufferDescriptor>(std::move(desc)));
return absl::OkStatus();
}

View File

@ -106,47 +106,29 @@ absl::Status DepthwiseConvolution::UploadWeights(
const bool fp32_weights = definition_.precision == CalculationsPrecision::F32;
const int float4_size = fp32_weights ? 16 : 8;
Texture2D weights_tex2d;
Buffer weights_buf;
std::vector<uint8_t> data(float4_size * elements_count);
if (fp32_weights) {
std::vector<float4> gpu_data(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
if (weights_are_buffer_) {
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buf));
float4* ptr = reinterpret_cast<float4*>(data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
} else {
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), kernel_x * kernel_y, dst_slices,
gpu_data.data(), context, &weights_tex2d));
}
} else {
std::vector<half4> gpu_data(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
if (weights_are_buffer_) {
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buf));
} else {
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), kernel_x * kernel_y, dst_slices,
gpu_data.data(), context, &weights_tex2d));
}
half4* ptr = reinterpret_cast<half4*>(data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
}
if (weights_are_buffer_) {
BufferDescriptor desc;
desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.element_size = 4;
args_.AddObject("weights", AccessType::READ,
absl::make_unique<Buffer>(std::move(weights_buf)),
absl::make_unique<BufferDescriptor>(desc));
desc.size = float4_size * elements_count;
desc.data = std::move(data);
args_.AddObject("weights", absl::make_unique<BufferDescriptor>(desc));
} else {
Texture2DDescriptor desc;
desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
args_.AddObject("weights", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_tex2d)),
absl::make_unique<Texture2DDescriptor>(desc));
desc.size = int2(kernel_x * kernel_y, dst_slices);
desc.data = std::move(data);
args_.AddObject("weights", absl::make_unique<Texture2DDescriptor>(desc));
}
return absl::OkStatus();
@ -195,47 +177,31 @@ absl::Status DepthwiseConvolution::UploadWeights(
const bool fp32_weights = definition_.precision == CalculationsPrecision::F32;
const int float4_size = fp32_weights ? 16 : 8;
Texture2D weights_tex2d;
Buffer weights_buf;
std::vector<uint8_t> data(float4_size * elements_count);
if (fp32_weights) {
std::vector<float4> gpu_data(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
if (weights_are_buffer_) {
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buf));
float4* ptr = reinterpret_cast<float4*>(data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
} else {
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), kernel_x * kernel_y * kernel_z, dst_slices,
gpu_data.data(), context, &weights_tex2d));
}
} else {
std::vector<half4> gpu_data(elements_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
if (weights_are_buffer_) {
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buf));
} else {
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), kernel_x * kernel_y * kernel_z, dst_slices,
gpu_data.data(), context, &weights_tex2d));
}
half4* ptr = reinterpret_cast<half4*>(data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
}
if (weights_are_buffer_) {
BufferDescriptor desc;
desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.element_size = 4;
args_.AddObject("weights", AccessType::READ,
absl::make_unique<Buffer>(std::move(weights_buf)),
absl::make_unique<BufferDescriptor>(desc));
desc.size = float4_size * elements_count;
desc.data = std::move(data);
args_.AddObject("weights",
absl::make_unique<BufferDescriptor>(std::move(desc)));
} else {
Texture2DDescriptor desc;
desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
args_.AddObject("weights", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_tex2d)),
absl::make_unique<Texture2DDescriptor>(desc));
desc.size = int2(kernel_x * kernel_y * kernel_z, dst_slices);
desc.data = std::move(data);
args_.AddObject("weights",
absl::make_unique<Texture2DDescriptor>(std::move(desc)));
}
return absl::OkStatus();

View File

@ -88,47 +88,32 @@ absl::Status DepthwiseConv3x3::UploadWeightsAndBiases(
const bool fp32_weights = definition_.precision == CalculationsPrecision::F32;
const int float4_size = fp32_weights ? 16 : 8;
Texture2D weights_tex2d;
Buffer weights_buf;
std::vector<uint8_t> data(float4_size * elements_count);
if (fp32_weights) {
std::vector<float4> gpu_data(elements_count);
RearrangeWeightsAndBiasesData(weights, biases, absl::MakeSpan(gpu_data));
if (weights_are_buffer) {
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buf));
float4* ptr = reinterpret_cast<float4*>(data.data());
RearrangeWeightsAndBiasesData(weights, biases,
absl::MakeSpan(ptr, elements_count));
} else {
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), texture_width, texture_height,
gpu_data.data(), context, &weights_tex2d));
}
} else {
std::vector<half4> gpu_data(elements_count);
RearrangeWeightsAndBiasesData(weights, biases, absl::MakeSpan(gpu_data));
if (weights_are_buffer) {
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buf));
} else {
RETURN_IF_ERROR(CreateTexture2DRGBA(
definition_.GetDataType(), texture_width, texture_height,
gpu_data.data(), context, &weights_tex2d));
}
half4* ptr = reinterpret_cast<half4*>(data.data());
RearrangeWeightsAndBiasesData(weights, biases,
absl::MakeSpan(ptr, elements_count));
}
if (weights_are_buffer) {
BufferDescriptor desc;
desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.element_size = 4;
args_.AddObject("weights", AccessType::READ,
absl::make_unique<Buffer>(std::move(weights_buf)),
absl::make_unique<BufferDescriptor>(desc));
desc.size = float4_size * elements_count;
desc.data = std::move(data);
args_.AddObject("weights",
absl::make_unique<BufferDescriptor>(std::move(desc)));
} else {
Texture2DDescriptor desc;
desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
args_.AddObject("weights", AccessType::READ,
absl::make_unique<Texture2D>(std::move(weights_tex2d)),
absl::make_unique<Texture2DDescriptor>(desc));
desc.size = int2(texture_width, texture_height);
desc.data = std::move(data);
args_.AddObject("weights",
absl::make_unique<Texture2DDescriptor>(std::move(desc)));
}
return absl::OkStatus();

View File

@ -131,26 +131,19 @@ absl::Status FullyConnected::UploadWeights(
BufferDescriptor desc;
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.element_size = 16;
desc.size = float4_size * elements_count;
desc.data.resize(desc.size);
Buffer weights_buffer;
if (f32_weights) {
std::vector<float4> gpu_data(dst_depth * src_depth * 4);
RearrangeFCWeightsToIOO4I4(weights, absl::MakeSpan(gpu_data));
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buffer));
float4* ptr = reinterpret_cast<float4*>(desc.data.data());
RearrangeFCWeightsToIOO4I4(weights, absl::MakeSpan(ptr, elements_count));
} else {
std::vector<half4> gpu_data(dst_depth * src_depth * 4);
RearrangeFCWeightsToIOO4I4(weights, absl::MakeSpan(gpu_data));
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buffer));
half4* ptr = reinterpret_cast<half4*>(desc.data.data());
RearrangeFCWeightsToIOO4I4(weights, absl::MakeSpan(ptr, elements_count));
}
args_.AddObject("weights", AccessType::READ,
absl::make_unique<Buffer>(std::move(weights_buffer)),
absl::make_unique<BufferDescriptor>(desc));
args_.AddObject("weights",
absl::make_unique<BufferDescriptor>(std::move(desc)));
return absl::OkStatus();
}

View File

@ -209,6 +209,7 @@ absl::Status GPUOperation::Compile(const CreationContext& creation_context) {
std::string code =
GetElementWiseCode(definition_, check_src_channels_size_);
elementwise_code_ = "{\n" + code_ + "\n}\n" + elementwise_code_;
RETURN_IF_ERROR(args_.AllocateObjects(creation_context.context));
RETURN_IF_ERROR(args_.TransformToCLCode(
creation_context.device->info_,
{{dst_tensors_names_[0], elementwise_code_}}, &code));
@ -217,6 +218,7 @@ absl::Status GPUOperation::Compile(const CreationContext& creation_context) {
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_));
} else {
RETURN_IF_ERROR(args_.AllocateObjects(creation_context.context));
RETURN_IF_ERROR(args_.TransformToCLCode(
creation_context.device->info_,
{{dst_tensors_names_[0], elementwise_code_}}, &code_));

View File

@ -93,30 +93,25 @@ absl::Status UploadWeights(const DepthwiseConvolution2DAttributes& dw_attr,
}
}
Buffer constants_buf;
const bool fp32_weights = precision == CalculationsPrecision::F32;
const int float_size = fp32_weights ? 4 : 2;
if (fp32_weights) {
RETURN_IF_ERROR(CreateReadOnlyBuffer(float_size * gpu_data.size(),
gpu_data.data(), context,
&constants_buf));
} else {
std::vector<half> gpu_data_half(gpu_data.size());
for (int i = 0; i < gpu_data.size(); ++i) {
gpu_data_half[i] = gpu_data[i];
}
RETURN_IF_ERROR(CreateReadOnlyBuffer(float_size * gpu_data_half.size(),
gpu_data_half.data(), context,
&constants_buf));
}
BufferDescriptor desc;
desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.element_size = 4;
desc.memory_type = MemoryType::CONSTANT;
op->args_.AddObject("constants", AccessType::READ,
absl::make_unique<Buffer>(std::move(constants_buf)),
absl::make_unique<BufferDescriptor>(desc));
desc.size = float_size * gpu_data.size();
desc.data.resize(desc.size);
if (fp32_weights) {
memcpy(desc.data.data(), gpu_data.data(), desc.size);
} else {
half* gpu_data_half = reinterpret_cast<half*>(desc.data.data());
for (int i = 0; i < gpu_data.size(); ++i) {
gpu_data_half[i] = gpu_data[i];
}
}
op->args_.AddObject("constants",
absl::make_unique<BufferDescriptor>(std::move(desc)));
return absl::OkStatus();
}

View File

@ -59,6 +59,25 @@ absl::Status CreateTexture2D(int width, int height, cl_channel_type type,
}
} // namespace
// Move constructor.
// The GPUObjectDescriptor base is move-constructed from `desc`; that base
// constructor only receives the base sub-object, so the derived fields read
// below are still intact. `element_type` and `size` are copied, and the
// CPU-side `data` vector is moved out of `desc`.
Texture2DDescriptor::Texture2DDescriptor(Texture2DDescriptor&& desc)
: GPUObjectDescriptor(std::move(desc)),
element_type(desc.element_type),
size(desc.size),
data(std::move(desc.data)) {}
// Move assignment.
// Self-assignment is a no-op. Otherwise `element_type` and `size` are
// exchanged with `desc` (so `desc` ends up holding this object's previous
// values), the CPU-side `data` vector is moved in, and finally the
// GPUObjectDescriptor base is move-assigned.
Texture2DDescriptor& Texture2DDescriptor::operator=(
    Texture2DDescriptor&& desc) {
  if (this == &desc) {
    return *this;
  }

  std::swap(element_type, desc.element_type);
  std::swap(size, desc.size);
  data = std::move(desc.data);
  GPUObjectDescriptor::operator=(std::move(desc));
  return *this;
}
// Drops the CPU-side copy of the texture contents.
// Arguments::AllocateObjects calls this right after the GPU object has been
// created from the descriptor, at which point the host bytes are no longer
// needed.
void Texture2DDescriptor::Release() {
  // clear() would only reset the size and keep the allocation alive; swapping
  // with an empty temporary hands the storage back to the allocator.
  std::vector<uint8_t>().swap(data);
}
GPUResources Texture2DDescriptor::GetGPUResources() const {
GPUResources resources;
GPUImage2DDescriptor desc;
@ -93,6 +112,14 @@ absl::Status Texture2DDescriptor::PerformReadSelector(
return absl::OkStatus();
}
// Builds a Texture2D from this descriptor on `context` and hands it back
// through `result`.
// `*result` is only written when the texture was created successfully; on
// failure the error status from CreateFromTexture2DDescriptor is returned.
absl::Status Texture2DDescriptor::CreateGPUObject(CLContext* context,
                                                  GPUObjectPtr* result) const {
  auto texture = absl::make_unique<Texture2D>();
  RETURN_IF_ERROR(texture->CreateFromTexture2DDescriptor(*this, context));
  *result = std::move(texture);
  return absl::OkStatus();
}
// Constructs a Texture2D around an already-created OpenCL image handle.
// The handle, its width/height and its channel data type are stored as given;
// no OpenCL call is made here.
// NOTE(review): presumably this object takes ownership of `texture` and
// releases it later - confirm against the destructor / Release().
Texture2D::Texture2D(cl_mem texture, int width, int height,
cl_channel_type type)
: texture_(texture), width_(width), height_(height), channel_type_(type) {}
@ -139,6 +166,49 @@ absl::Status Texture2D::GetGPUResources(
return absl::OkStatus();
}
// Creates the OpenCL 2D RGBA image described by `tex_desc` on `context` and
// stores the resulting handle, dimensions and channel type in this object.
//
// The image is tex_desc.size.x by tex_desc.size.y texels. The channel data
// type is CL_FLOAT when tex_desc.element_type is DataType::FLOAT32 and
// CL_HALF_FLOAT otherwise. When tex_desc.data is non-empty it is used as the
// initial contents (CL_MEM_COPY_HOST_PTR); otherwise the image is created
// without host data.
//
// Returns:
//   - InvalidArgumentError if host data is supplied but holds fewer bytes than
//     the image needs (the driver would otherwise read past the buffer).
//   - UnknownError if the OpenCL image creation call fails.
// On any error the members of this object are left untouched.
absl::Status Texture2D::CreateFromTexture2DDescriptor(
    const Texture2DDescriptor& tex_desc, CLContext* context) {
  const bool is_f32 = tex_desc.element_type == DataType::FLOAT32;

  cl_image_format format;
  format.image_channel_order = CL_RGBA;
  format.image_channel_data_type = is_f32 ? CL_FLOAT : CL_HALF_FLOAT;

  // Value-initialise so that fields which are not set explicitly below (such
  // as image_array_size) are zero instead of indeterminate.
  cl_image_desc desc = {};
  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
  desc.image_width = tex_desc.size.x;
  desc.image_height = tex_desc.size.y;
  desc.image_depth = 0;
  desc.image_row_pitch = 0;
  desc.image_slice_pitch = 0;
  desc.num_mip_levels = 0;
  desc.num_samples = 0;
  desc.buffer = nullptr;

  cl_mem_flags flags = CL_MEM_READ_WRITE;
  unsigned char* host_ptr = nullptr;
  if (!tex_desc.data.empty()) {
    // Four channels per texel: 4 * sizeof(float) or 4 * sizeof(half).
    const size_t texel_bytes = is_f32 ? 16 : 8;
    const size_t required_bytes = static_cast<size_t>(tex_desc.size.x) *
                                  static_cast<size_t>(tex_desc.size.y) *
                                  texel_bytes;
    if (tex_desc.data.size() < required_bytes) {
      return absl::InvalidArgumentError(absl::StrCat(
          "Texture2D host data is too small: ", tex_desc.data.size(),
          " bytes provided, ", required_bytes, " bytes required"));
    }
    flags |= CL_MEM_COPY_HOST_PTR;
    // The OpenCL entry point takes a non-const pointer; with
    // CL_MEM_COPY_HOST_PTR the data is only read.
    host_ptr = const_cast<unsigned char*>(tex_desc.data.data());
  }

  cl_int error_code;
  cl_mem texture = CreateImage2DLegacy(context->context(), flags, &format,
                                       &desc, host_ptr, &error_code);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(
        absl::StrCat("Failed to create 2D texture (clCreateImage): ",
                     CLErrorCodeToString(error_code)));
  }

  // Commit the new state only once creation has succeeded.
  texture_ = texture;
  width_ = tex_desc.size.x;
  height_ = tex_desc.size.y;
  channel_type_ = format.image_channel_data_type;
  return absl::OkStatus();
}
// Creates new 4-channel 2D texture with f32 elements
absl::Status CreateTexture2DRGBA32F(int width, int height, CLContext* context,
Texture2D* result) {

View File

@ -34,6 +34,16 @@ namespace cl {
struct Texture2DDescriptor : public GPUObjectDescriptor {
DataType element_type; // FLOAT32 or FLOAT16
// optional
int2 size = int2(0, 0);
std::vector<uint8_t> data;
Texture2DDescriptor() = default;
Texture2DDescriptor(const Texture2DDescriptor&) = default;
Texture2DDescriptor& operator=(const Texture2DDescriptor&) = default;
Texture2DDescriptor(Texture2DDescriptor&& desc);
Texture2DDescriptor& operator=(Texture2DDescriptor&& desc);
absl::Status PerformSelector(const std::string& selector,
const std::vector<std::string>& args,
const std::vector<std::string>& template_args,
@ -42,6 +52,10 @@ struct Texture2DDescriptor : public GPUObjectDescriptor {
GPUResources GetGPUResources() const override;
absl::Status PerformReadSelector(const std::vector<std::string>& args,
std::string* result) const;
absl::Status CreateGPUObject(CLContext* context,
GPUObjectPtr* result) const override;
void Release() override;
};
// Texture2D represent formatted GPU data storage.
@ -73,6 +87,9 @@ class Texture2D : public GPUObject {
absl::Status GetGPUResources(const GPUObjectDescriptor* obj_ptr,
GPUResourcesWithValue* resources) const override;
absl::Status CreateFromTexture2DDescriptor(const Texture2DDescriptor& desc,
CLContext* context);
private:
void Release();