TFLite iOS GPU: New conversion functions are used across all operations.
PiperOrigin-RevId: 273598884
This commit is contained in:
parent
3d0978f59c
commit
0ae85d7a0e
@ -29,16 +29,6 @@ namespace tflite {
|
|||||||
namespace gpu {
|
namespace gpu {
|
||||||
namespace metal {
|
namespace metal {
|
||||||
|
|
||||||
/// Helper function to convert buffer's content into stream of bytes
|
|
||||||
std::vector<uint8_t> VectorFloatToHalf(const std::vector<float>& input_vector) {
|
|
||||||
std::vector<HalfBits> result;
|
|
||||||
result.reserve(input_vector.size());
|
|
||||||
for (const float v : input_vector) {
|
|
||||||
result.push_back(fp16_ieee_from_fp32_value(v));
|
|
||||||
}
|
|
||||||
return GetByteBuffer(result);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Converts float to destination type (if needed) and stores as bytes array.
|
/// Converts float to destination type (if needed) and stores as bytes array.
|
||||||
std::vector<uint8_t> GetByteBufferConverted(
|
std::vector<uint8_t> GetByteBufferConverted(
|
||||||
const std::vector<float>& input_vector,
|
const std::vector<float>& input_vector,
|
||||||
|
@ -135,9 +135,6 @@ std::vector<uint8_t> GetByteBufferConvertedResized(
|
|||||||
const std::vector<float>& input_vector,
|
const std::vector<float>& input_vector,
|
||||||
RuntimeOptions::Precision destination_type, size_t elements_count);
|
RuntimeOptions::Precision destination_type, size_t elements_count);
|
||||||
|
|
||||||
/// Helper function to convert FP32 to FP16 and into stream of bytes.
|
|
||||||
std::vector<uint8_t> VectorFloatToHalf(const std::vector<float>& input_vector);
|
|
||||||
|
|
||||||
} // namespace metal
|
} // namespace metal
|
||||||
} // namespace gpu
|
} // namespace gpu
|
||||||
} // namespace tflite
|
} // namespace tflite
|
||||||
|
@ -95,11 +95,9 @@ std::vector<ComputeTaskDescriptorPtr> Add(int id,
|
|||||||
device FLT4* const broadcast) { return value + broadcast[gid.z]; })";
|
device FLT4* const broadcast) { return value + broadcast[gid.z]; })";
|
||||||
desc->input_buffers = {{input_ids[0]}};
|
desc->input_buffers = {{input_ids[0]}};
|
||||||
desc->output_buffer = {output_id};
|
desc->output_buffer = {output_id};
|
||||||
auto values = options.storage_precision == RuntimeOptions::Precision::FP32
|
|
||||||
? GetByteBuffer(broadcast->data)
|
|
||||||
: VectorFloatToHalf(broadcast->data);
|
|
||||||
desc->immutable_buffers = {
|
desc->immutable_buffers = {
|
||||||
{"device FLT4* const", values},
|
{"device FLT4* const",
|
||||||
|
GetByteBufferConverted(broadcast->data, options.storage_precision)},
|
||||||
};
|
};
|
||||||
return {desc};
|
return {desc};
|
||||||
}
|
}
|
||||||
|
@ -931,11 +931,9 @@ std::vector<ComputeTaskDescriptorPtr> Convolution(
|
|||||||
}};
|
}};
|
||||||
|
|
||||||
auto weights_reordered = ReorderWeightsForConvShared(params);
|
auto weights_reordered = ReorderWeightsForConvShared(params);
|
||||||
auto weights = options.storage_precision == RuntimeOptions::Precision::FP32
|
|
||||||
? GetByteBuffer(weights_reordered)
|
|
||||||
: VectorFloatToHalf(weights_reordered);
|
|
||||||
desc->immutable_buffers = {
|
desc->immutable_buffers = {
|
||||||
{"device FLT4* const weights", weights},
|
{"device FLT4* const weights",
|
||||||
|
GetByteBufferConverted(weights_reordered, options.storage_precision)},
|
||||||
{"device FLT4* const biases",
|
{"device FLT4* const biases",
|
||||||
GetByteBufferConvertedResized(params.bias.data,
|
GetByteBufferConvertedResized(params.bias.data,
|
||||||
options.storage_precision,
|
options.storage_precision,
|
||||||
@ -990,12 +988,9 @@ std::vector<ComputeTaskDescriptorPtr> Convolution1x1(
|
|||||||
}};
|
}};
|
||||||
|
|
||||||
auto weights_reordered = ReorderWeightsForConv(params, z_out);
|
auto weights_reordered = ReorderWeightsForConv(params, z_out);
|
||||||
auto weights =
|
|
||||||
options.storage_precision == metal::RuntimeOptions::Precision::FP32
|
|
||||||
? GetByteBuffer(weights_reordered)
|
|
||||||
: VectorFloatToHalf(weights_reordered);
|
|
||||||
desc->immutable_buffers = {
|
desc->immutable_buffers = {
|
||||||
{"device FLT4* const filters", weights},
|
{"device FLT4* const filters",
|
||||||
|
GetByteBufferConverted(weights_reordered, options.storage_precision)},
|
||||||
{"device FLT4* const biases",
|
{"device FLT4* const biases",
|
||||||
GetByteBufferConvertedResized(params.bias.data,
|
GetByteBufferConvertedResized(params.bias.data,
|
||||||
options.storage_precision,
|
options.storage_precision,
|
||||||
@ -1051,12 +1046,9 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionGeneric(
|
|||||||
}};
|
}};
|
||||||
|
|
||||||
auto weights_reordered = ReorderWeightsForConv(params, z_out);
|
auto weights_reordered = ReorderWeightsForConv(params, z_out);
|
||||||
auto weights =
|
|
||||||
options.storage_precision == metal::RuntimeOptions::Precision::FP32
|
|
||||||
? GetByteBuffer(weights_reordered)
|
|
||||||
: VectorFloatToHalf(weights_reordered);
|
|
||||||
desc->immutable_buffers = {
|
desc->immutable_buffers = {
|
||||||
{"device FLT4* const filters", weights},
|
{"device FLT4* const filters",
|
||||||
|
GetByteBufferConverted(weights_reordered, options.storage_precision)},
|
||||||
{"device FLT4* const biases",
|
{"device FLT4* const biases",
|
||||||
GetByteBufferConvertedResized(params.bias.data,
|
GetByteBufferConvertedResized(params.bias.data,
|
||||||
options.storage_precision,
|
options.storage_precision,
|
||||||
@ -1108,12 +1100,9 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionPrecise(
|
|||||||
}};
|
}};
|
||||||
|
|
||||||
auto weights_reordered = ReorderWeightsForConv(params, z_out);
|
auto weights_reordered = ReorderWeightsForConv(params, z_out);
|
||||||
auto weights =
|
|
||||||
options.storage_precision == metal::RuntimeOptions::Precision::FP32
|
|
||||||
? GetByteBuffer(weights_reordered)
|
|
||||||
: VectorFloatToHalf(weights_reordered);
|
|
||||||
desc->immutable_buffers = {
|
desc->immutable_buffers = {
|
||||||
{"device FLT4* const filters", weights},
|
{"device FLT4* const filters",
|
||||||
|
GetByteBufferConverted(weights_reordered, options.storage_precision)},
|
||||||
{"device FLT4* const biases",
|
{"device FLT4* const biases",
|
||||||
GetByteBufferConvertedResized(params.bias.data,
|
GetByteBufferConvertedResized(params.bias.data,
|
||||||
options.storage_precision,
|
options.storage_precision,
|
||||||
@ -1169,12 +1158,9 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionPrecise1x1PowerVR(
|
|||||||
}};
|
}};
|
||||||
|
|
||||||
auto weights_reordered = ReorderWeightsForConv(params, z_out);
|
auto weights_reordered = ReorderWeightsForConv(params, z_out);
|
||||||
auto weights =
|
|
||||||
options.storage_precision == metal::RuntimeOptions::Precision::FP32
|
|
||||||
? GetByteBuffer(weights_reordered)
|
|
||||||
: VectorFloatToHalf(weights_reordered);
|
|
||||||
desc->immutable_buffers = {
|
desc->immutable_buffers = {
|
||||||
{"device FLT4* const filters", weights},
|
{"device FLT4* const filters",
|
||||||
|
GetByteBufferConverted(weights_reordered, options.storage_precision)},
|
||||||
{"device FLT4* const biases",
|
{"device FLT4* const biases",
|
||||||
GetByteBufferConvertedResized(params.bias.data,
|
GetByteBufferConvertedResized(params.bias.data,
|
||||||
options.storage_precision,
|
options.storage_precision,
|
||||||
|
@ -566,19 +566,14 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConvolution(
|
|||||||
return out_shape;
|
return out_shape;
|
||||||
}};
|
}};
|
||||||
|
|
||||||
std::vector<float> filters_reordered = ConvertToPIOHW4(attr.weights);
|
const int output_channels_count = attr.weights.shape.i * attr.weights.shape.o;
|
||||||
auto filters = options.storage_precision == RuntimeOptions::Precision::FP32
|
|
||||||
? GetByteBuffer(filters_reordered)
|
|
||||||
: VectorFloatToHalf(filters_reordered);
|
|
||||||
auto biases = options.storage_precision == RuntimeOptions::Precision::FP32
|
|
||||||
? GetByteBuffer(attr.bias.data)
|
|
||||||
: VectorFloatToHalf(attr.bias.data);
|
|
||||||
desc->immutable_buffers = {
|
desc->immutable_buffers = {
|
||||||
{"device FLT4* const filters", filters},
|
{"device FLT4* const filters",
|
||||||
|
GetByteBufferConverted(ConvertToPIOHW4(attr.weights),
|
||||||
|
options.storage_precision)},
|
||||||
{"device FLT4* const biases",
|
{"device FLT4* const biases",
|
||||||
GetByteBufferConvertedResized(
|
GetByteBufferConvertedResized(attr.bias.data, options.storage_precision,
|
||||||
attr.bias.data, options.storage_precision,
|
output_channels_count)},
|
||||||
attr.weights.shape.i * attr.weights.shape.o)},
|
|
||||||
};
|
};
|
||||||
|
|
||||||
desc->uniform_buffers = {
|
desc->uniform_buffers = {
|
||||||
@ -647,12 +642,9 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConv3x3Stride1x1(
|
|||||||
|
|
||||||
// For this operation we keep weights and biases in one buffer
|
// For this operation we keep weights and biases in one buffer
|
||||||
auto weights_reordered = ReorderWeightsDepthWiseConv3x3Stride1x1(attr);
|
auto weights_reordered = ReorderWeightsDepthWiseConv3x3Stride1x1(attr);
|
||||||
auto weights =
|
|
||||||
options.storage_precision == metal::RuntimeOptions::Precision::FP32
|
|
||||||
? GetByteBuffer(weights_reordered)
|
|
||||||
: VectorFloatToHalf(weights_reordered);
|
|
||||||
desc->immutable_buffers = {
|
desc->immutable_buffers = {
|
||||||
{"device FLT4* const filters", weights},
|
{"device FLT4* const filters",
|
||||||
|
GetByteBufferConverted(weights_reordered, options.storage_precision)},
|
||||||
};
|
};
|
||||||
|
|
||||||
desc->uniform_buffers = {
|
desc->uniform_buffers = {
|
||||||
@ -714,12 +706,9 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConv3x3Stride2(
|
|||||||
|
|
||||||
// For this operation we keep weights and biases in one buffer
|
// For this operation we keep weights and biases in one buffer
|
||||||
auto weights_reordered = ReorderWeightsDepthWiseConv3x3Stride2(attr);
|
auto weights_reordered = ReorderWeightsDepthWiseConv3x3Stride2(attr);
|
||||||
auto weights =
|
|
||||||
options.storage_precision == metal::RuntimeOptions::Precision::FP32
|
|
||||||
? GetByteBuffer(weights_reordered)
|
|
||||||
: VectorFloatToHalf(weights_reordered);
|
|
||||||
desc->immutable_buffers = {
|
desc->immutable_buffers = {
|
||||||
{"device FLT4* const filters", weights},
|
{"device FLT4* const filters",
|
||||||
|
GetByteBufferConverted(weights_reordered, options.storage_precision)},
|
||||||
};
|
};
|
||||||
|
|
||||||
desc->uniform_buffers = {
|
desc->uniform_buffers = {
|
||||||
|
@ -159,11 +159,9 @@ std::vector<ComputeTaskDescriptorPtr> FullyConnected(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto filters = options.storage_precision == RuntimeOptions::Precision::FP32
|
|
||||||
? GetByteBuffer(filters_reordered)
|
|
||||||
: VectorFloatToHalf(filters_reordered);
|
|
||||||
desc->immutable_buffers = {
|
desc->immutable_buffers = {
|
||||||
{"device FLT4* const matrix", filters},
|
{"device FLT4* const matrix",
|
||||||
|
GetByteBufferConverted(filters_reordered, options.storage_precision)},
|
||||||
{"device FLT4* const biases",
|
{"device FLT4* const biases",
|
||||||
GetByteBufferConvertedResized(attr.bias.data, options.storage_precision,
|
GetByteBufferConvertedResized(attr.bias.data, options.storage_precision,
|
||||||
attr.weights.shape.o)},
|
attr.weights.shape.o)},
|
||||||
|
@ -69,11 +69,9 @@ std::vector<ComputeTaskDescriptorPtr> Multiply(
|
|||||||
}},
|
}},
|
||||||
};
|
};
|
||||||
} else {
|
} else {
|
||||||
auto coeffs = options.storage_precision == RuntimeOptions::Precision::FP32
|
|
||||||
? GetByteBuffer(mul_buffer->data)
|
|
||||||
: VectorFloatToHalf(mul_buffer->data);
|
|
||||||
desc->immutable_buffers = {
|
desc->immutable_buffers = {
|
||||||
{"device FLT4* const", coeffs},
|
{"device FLT4* const",
|
||||||
|
GetByteBufferConverted(mul_buffer->data, options.storage_precision)},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
return {desc};
|
return {desc};
|
||||||
|
@ -61,11 +61,9 @@ std::vector<ComputeTaskDescriptorPtr> PReLU(int id, ValueId input_id,
|
|||||||
}
|
}
|
||||||
desc->input_buffers = {{input_id}};
|
desc->input_buffers = {{input_id}};
|
||||||
desc->output_buffer = {output_id};
|
desc->output_buffer = {output_id};
|
||||||
auto alphas = options.storage_precision == RuntimeOptions::Precision::FP32
|
|
||||||
? GetByteBuffer(alpha_buffer->data)
|
|
||||||
: VectorFloatToHalf(alpha_buffer->data);
|
|
||||||
desc->immutable_buffers = {
|
desc->immutable_buffers = {
|
||||||
{"device FLT4* const", alphas},
|
{"device FLT4* const",
|
||||||
|
GetByteBufferConverted(alpha_buffer->data, options.storage_precision)},
|
||||||
};
|
};
|
||||||
if (attr.clip != 0) {
|
if (attr.clip != 0) {
|
||||||
desc->uniform_buffers = {
|
desc->uniform_buffers = {
|
||||||
@ -106,11 +104,9 @@ std::vector<ComputeTaskDescriptorPtr> PReLUFull(int id, ValueId input_id,
|
|||||||
}
|
}
|
||||||
desc->input_buffers = {{input_id}};
|
desc->input_buffers = {{input_id}};
|
||||||
desc->output_buffer = {output_id};
|
desc->output_buffer = {output_id};
|
||||||
auto alphas = options.storage_precision == RuntimeOptions::Precision::FP32
|
|
||||||
? GetByteBuffer(ConvertToPHWC4(*alpha))
|
|
||||||
: VectorFloatToHalf(ConvertToPHWC4(*alpha));
|
|
||||||
desc->immutable_buffers = {
|
desc->immutable_buffers = {
|
||||||
{"device FLT4* const", alphas},
|
{"device FLT4* const", GetByteBufferConverted(ConvertToPHWC4(*alpha),
|
||||||
|
options.storage_precision)},
|
||||||
};
|
};
|
||||||
if (attr.clip != 0) {
|
if (attr.clip != 0) {
|
||||||
desc->uniform_buffers = {
|
desc->uniform_buffers = {
|
||||||
|
@ -950,9 +950,8 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto filters = options.storage_precision == RuntimeOptions::Precision::FP32
|
auto filters =
|
||||||
? GetByteBuffer(filters_reordered)
|
GetByteBufferConverted(filters_reordered, options.storage_precision);
|
||||||
: VectorFloatToHalf(filters_reordered);
|
|
||||||
desc->immutable_buffers = {
|
desc->immutable_buffers = {
|
||||||
{"device FilterStripe* const filters", filters},
|
{"device FilterStripe* const filters", filters},
|
||||||
{"device FLT4* const biases",
|
{"device FLT4* const biases",
|
||||||
@ -1044,9 +1043,8 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed3x3(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto filters = options.storage_precision == RuntimeOptions::Precision::FP32
|
auto filters =
|
||||||
? GetByteBuffer(filters_reordered)
|
GetByteBufferConverted(filters_reordered, options.storage_precision);
|
||||||
: VectorFloatToHalf(filters_reordered);
|
|
||||||
auto biases = GetByteBufferConvertedResized(
|
auto biases = GetByteBufferConvertedResized(
|
||||||
params.bias.data, options.storage_precision, params.weights.shape.o);
|
params.bias.data, options.storage_precision, params.weights.shape.o);
|
||||||
border_desc->immutable_buffers = {
|
border_desc->immutable_buffers = {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user