TFLite iOS GPU: New conversion functions are used across all operations.
PiperOrigin-RevId: 273598884
parent 3d0978f59c
commit 0ae85d7a0e
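The change below is mechanical: each operation previously branched on options.storage_precision at the call site, choosing between GetByteBuffer (raw FP32 bytes) and the local VectorFloatToHalf helper (FP32 converted to FP16 bytes), and now hands that decision to the shared GetByteBufferConverted / GetByteBufferConvertedResized helpers. Condensed from the Add hunk further down (surrounding descriptor setup omitted), the call-site pattern is roughly:

// Before: every call site chose the byte representation itself.
auto values = options.storage_precision == RuntimeOptions::Precision::FP32
                  ? GetByteBuffer(broadcast->data)       // FP32 bytes as-is
                  : VectorFloatToHalf(broadcast->data);  // FP32 -> FP16 bytes
desc->immutable_buffers = {
    {"device FLT4* const", values},
};

// After: the precision check lives inside the shared conversion helper.
desc->immutable_buffers = {
    {"device FLT4* const",
     GetByteBufferConverted(broadcast->data, options.storage_precision)},
};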
@@ -29,16 +29,6 @@ namespace tflite {
 namespace gpu {
 namespace metal {
 
-/// Helper function to convert buffer's content into stream of bytes
-std::vector<uint8_t> VectorFloatToHalf(const std::vector<float>& input_vector) {
-  std::vector<HalfBits> result;
-  result.reserve(input_vector.size());
-  for (const float v : input_vector) {
-    result.push_back(fp16_ieee_from_fp32_value(v));
-  }
-  return GetByteBuffer(result);
-}
-
 /// Converts float to destination type (if needed) and stores as bytes array.
 std::vector<uint8_t> GetByteBufferConverted(
     const std::vector<float>& input_vector,
@@ -135,9 +135,6 @@ std::vector<uint8_t> GetByteBufferConvertedResized(
     const std::vector<float>& input_vector,
     RuntimeOptions::Precision destination_type, size_t elements_count);
 
-/// Helper function to convert FP32 to FP16 and into stream of bytes.
-std::vector<uint8_t> VectorFloatToHalf(const std::vector<float>& input_vector);
-
 }  // namespace metal
 }  // namespace gpu
 }  // namespace tflite
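The header hunk above drops the VectorFloatToHalf declaration while keeping GetByteBufferConverted and GetByteBufferConvertedResized, whose definitions are outside this diff. The following is only a plausible, self-contained sketch of their contract, inferred from the removed helper and from the declarations shown above; the stand-in RuntimeOptions type, the HalfBits = uint16_t alias, and the zero-padding behaviour of the Resized variant are assumptions, not the actual TFLite implementation.

#include <fp16.h>  // fp16_ieee_from_fp32_value, as used by the removed helper

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

using HalfBits = uint16_t;  // assumption: 16-bit storage of an IEEE half

struct RuntimeOptions {  // hypothetical stand-in for the Metal delegate type
  enum class Precision { FP32, FP16 };
  Precision storage_precision = Precision::FP32;
};

// Copies the elements of a trivially copyable vector into raw bytes.
template <typename T>
std::vector<uint8_t> GetByteBuffer(const std::vector<T>& data) {
  std::vector<uint8_t> bytes(data.size() * sizeof(T));
  if (!data.empty()) {
    std::memcpy(bytes.data(), data.data(), bytes.size());
  }
  return bytes;
}

// Sketch: FP32 passes through untouched, FP16 is converted element-wise.
std::vector<uint8_t> GetByteBufferConverted(
    const std::vector<float>& input_vector,
    RuntimeOptions::Precision destination_type) {
  if (destination_type == RuntimeOptions::Precision::FP32) {
    return GetByteBuffer(input_vector);
  }
  std::vector<HalfBits> result;
  result.reserve(input_vector.size());
  for (const float v : input_vector) {
    result.push_back(fp16_ieee_from_fp32_value(v));
  }
  return GetByteBuffer(result);
}

// Sketch: same conversion, then the byte buffer is grown (zero-padded) or
// truncated to hold exactly elements_count values of the destination type.
std::vector<uint8_t> GetByteBufferConvertedResized(
    const std::vector<float>& input_vector,
    RuntimeOptions::Precision destination_type, size_t elements_count) {
  std::vector<uint8_t> result =
      GetByteBufferConverted(input_vector, destination_type);
  const size_t element_size =
      destination_type == RuntimeOptions::Precision::FP32 ? sizeof(float)
                                                          : sizeof(HalfBits);
  result.resize(element_size * elements_count, 0);
  return result;
}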
@@ -95,11 +95,9 @@ std::vector<ComputeTaskDescriptorPtr> Add(int id,
       device FLT4* const broadcast) { return value + broadcast[gid.z]; })";
   desc->input_buffers = {{input_ids[0]}};
   desc->output_buffer = {output_id};
-  auto values = options.storage_precision == RuntimeOptions::Precision::FP32
-                    ? GetByteBuffer(broadcast->data)
-                    : VectorFloatToHalf(broadcast->data);
   desc->immutable_buffers = {
-      {"device FLT4* const", values},
+      {"device FLT4* const",
+       GetByteBufferConverted(broadcast->data, options.storage_precision)},
   };
   return {desc};
 }
@@ -931,11 +931,9 @@ std::vector<ComputeTaskDescriptorPtr> Convolution(
       }};
 
   auto weights_reordered = ReorderWeightsForConvShared(params);
-  auto weights = options.storage_precision == RuntimeOptions::Precision::FP32
-                     ? GetByteBuffer(weights_reordered)
-                     : VectorFloatToHalf(weights_reordered);
   desc->immutable_buffers = {
-      {"device FLT4* const weights", weights},
+      {"device FLT4* const weights",
+       GetByteBufferConverted(weights_reordered, options.storage_precision)},
       {"device FLT4* const biases",
        GetByteBufferConvertedResized(params.bias.data,
                                      options.storage_precision,
@@ -990,12 +988,9 @@ std::vector<ComputeTaskDescriptorPtr> Convolution1x1(
       }};
 
   auto weights_reordered = ReorderWeightsForConv(params, z_out);
-  auto weights =
-      options.storage_precision == metal::RuntimeOptions::Precision::FP32
-          ? GetByteBuffer(weights_reordered)
-          : VectorFloatToHalf(weights_reordered);
   desc->immutable_buffers = {
-      {"device FLT4* const filters", weights},
+      {"device FLT4* const filters",
+       GetByteBufferConverted(weights_reordered, options.storage_precision)},
       {"device FLT4* const biases",
        GetByteBufferConvertedResized(params.bias.data,
                                      options.storage_precision,
@@ -1051,12 +1046,9 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionGeneric(
       }};
 
   auto weights_reordered = ReorderWeightsForConv(params, z_out);
-  auto weights =
-      options.storage_precision == metal::RuntimeOptions::Precision::FP32
-          ? GetByteBuffer(weights_reordered)
-          : VectorFloatToHalf(weights_reordered);
   desc->immutable_buffers = {
-      {"device FLT4* const filters", weights},
+      {"device FLT4* const filters",
+       GetByteBufferConverted(weights_reordered, options.storage_precision)},
       {"device FLT4* const biases",
        GetByteBufferConvertedResized(params.bias.data,
                                      options.storage_precision,
@@ -1108,12 +1100,9 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionPrecise(
       }};
 
   auto weights_reordered = ReorderWeightsForConv(params, z_out);
-  auto weights =
-      options.storage_precision == metal::RuntimeOptions::Precision::FP32
-          ? GetByteBuffer(weights_reordered)
-          : VectorFloatToHalf(weights_reordered);
   desc->immutable_buffers = {
-      {"device FLT4* const filters", weights},
+      {"device FLT4* const filters",
+       GetByteBufferConverted(weights_reordered, options.storage_precision)},
       {"device FLT4* const biases",
        GetByteBufferConvertedResized(params.bias.data,
                                      options.storage_precision,
@@ -1169,12 +1158,9 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionPrecise1x1PowerVR(
       }};
 
   auto weights_reordered = ReorderWeightsForConv(params, z_out);
-  auto weights =
-      options.storage_precision == metal::RuntimeOptions::Precision::FP32
-          ? GetByteBuffer(weights_reordered)
-          : VectorFloatToHalf(weights_reordered);
   desc->immutable_buffers = {
-      {"device FLT4* const filters", weights},
+      {"device FLT4* const filters",
+       GetByteBufferConverted(weights_reordered, options.storage_precision)},
       {"device FLT4* const biases",
        GetByteBufferConvertedResized(params.bias.data,
                                      options.storage_precision,
@@ -566,19 +566,14 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConvolution(
         return out_shape;
       }};
 
-  std::vector<float> filters_reordered = ConvertToPIOHW4(attr.weights);
-  auto filters = options.storage_precision == RuntimeOptions::Precision::FP32
-                     ? GetByteBuffer(filters_reordered)
-                     : VectorFloatToHalf(filters_reordered);
-  auto biases = options.storage_precision == RuntimeOptions::Precision::FP32
-                    ? GetByteBuffer(attr.bias.data)
-                    : VectorFloatToHalf(attr.bias.data);
+  const int output_channels_count = attr.weights.shape.i * attr.weights.shape.o;
   desc->immutable_buffers = {
-      {"device FLT4* const filters", filters},
+      {"device FLT4* const filters",
+       GetByteBufferConverted(ConvertToPIOHW4(attr.weights),
+                              options.storage_precision)},
       {"device FLT4* const biases",
-       GetByteBufferConvertedResized(
-           attr.bias.data, options.storage_precision,
-           attr.weights.shape.i * attr.weights.shape.o)},
+       GetByteBufferConvertedResized(attr.bias.data, options.storage_precision,
+                                     output_channels_count)},
   };
 
   desc->uniform_buffers = {
@@ -647,12 +642,9 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConv3x3Stride1x1(
 
   // For this operation we keep weights and biases in one buffer
   auto weights_reordered = ReorderWeightsDepthWiseConv3x3Stride1x1(attr);
-  auto weights =
-      options.storage_precision == metal::RuntimeOptions::Precision::FP32
-          ? GetByteBuffer(weights_reordered)
-          : VectorFloatToHalf(weights_reordered);
   desc->immutable_buffers = {
-      {"device FLT4* const filters", weights},
+      {"device FLT4* const filters",
+       GetByteBufferConverted(weights_reordered, options.storage_precision)},
   };
 
   desc->uniform_buffers = {
@@ -714,12 +706,9 @@ std::vector<ComputeTaskDescriptorPtr> DepthWiseConv3x3Stride2(
 
   // For this operation we keep weights and biases in one buffer
   auto weights_reordered = ReorderWeightsDepthWiseConv3x3Stride2(attr);
-  auto weights =
-      options.storage_precision == metal::RuntimeOptions::Precision::FP32
-          ? GetByteBuffer(weights_reordered)
-          : VectorFloatToHalf(weights_reordered);
   desc->immutable_buffers = {
-      {"device FLT4* const filters", weights},
+      {"device FLT4* const filters",
+       GetByteBufferConverted(weights_reordered, options.storage_precision)},
   };
 
   desc->uniform_buffers = {
@@ -159,11 +159,9 @@ std::vector<ComputeTaskDescriptorPtr> FullyConnected(
     }
   }
 
-  auto filters = options.storage_precision == RuntimeOptions::Precision::FP32
-                     ? GetByteBuffer(filters_reordered)
-                     : VectorFloatToHalf(filters_reordered);
   desc->immutable_buffers = {
-      {"device FLT4* const matrix", filters},
+      {"device FLT4* const matrix",
+       GetByteBufferConverted(filters_reordered, options.storage_precision)},
       {"device FLT4* const biases",
        GetByteBufferConvertedResized(attr.bias.data, options.storage_precision,
                                      attr.weights.shape.o)},
@@ -69,11 +69,9 @@ std::vector<ComputeTaskDescriptorPtr> Multiply(
         }},
     };
   } else {
-    auto coeffs = options.storage_precision == RuntimeOptions::Precision::FP32
-                      ? GetByteBuffer(mul_buffer->data)
-                      : VectorFloatToHalf(mul_buffer->data);
     desc->immutable_buffers = {
-        {"device FLT4* const", coeffs},
+        {"device FLT4* const",
+         GetByteBufferConverted(mul_buffer->data, options.storage_precision)},
     };
   }
   return {desc};
@@ -61,11 +61,9 @@ std::vector<ComputeTaskDescriptorPtr> PReLU(int id, ValueId input_id,
   }
   desc->input_buffers = {{input_id}};
   desc->output_buffer = {output_id};
-  auto alphas = options.storage_precision == RuntimeOptions::Precision::FP32
-                    ? GetByteBuffer(alpha_buffer->data)
-                    : VectorFloatToHalf(alpha_buffer->data);
   desc->immutable_buffers = {
-      {"device FLT4* const", alphas},
+      {"device FLT4* const",
+       GetByteBufferConverted(alpha_buffer->data, options.storage_precision)},
   };
   if (attr.clip != 0) {
     desc->uniform_buffers = {
@@ -106,11 +104,9 @@ std::vector<ComputeTaskDescriptorPtr> PReLUFull(int id, ValueId input_id,
   }
   desc->input_buffers = {{input_id}};
   desc->output_buffer = {output_id};
-  auto alphas = options.storage_precision == RuntimeOptions::Precision::FP32
-                    ? GetByteBuffer(ConvertToPHWC4(*alpha))
-                    : VectorFloatToHalf(ConvertToPHWC4(*alpha));
   desc->immutable_buffers = {
-      {"device FLT4* const", alphas},
+      {"device FLT4* const", GetByteBufferConverted(ConvertToPHWC4(*alpha),
+                                                    options.storage_precision)},
   };
   if (attr.clip != 0) {
     desc->uniform_buffers = {
@@ -950,9 +950,8 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed(
     }
   }
 
-  auto filters = options.storage_precision == RuntimeOptions::Precision::FP32
-                     ? GetByteBuffer(filters_reordered)
-                     : VectorFloatToHalf(filters_reordered);
+  auto filters =
+      GetByteBufferConverted(filters_reordered, options.storage_precision);
   desc->immutable_buffers = {
       {"device FilterStripe* const filters", filters},
       {"device FLT4* const biases",
@@ -1044,9 +1043,8 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionTransposed3x3(
     }
   }
 
-  auto filters = options.storage_precision == RuntimeOptions::Precision::FP32
-                     ? GetByteBuffer(filters_reordered)
-                     : VectorFloatToHalf(filters_reordered);
+  auto filters =
+      GetByteBufferConverted(filters_reordered, options.storage_precision);
   auto biases = GetByteBufferConvertedResized(
       params.bias.data, options.storage_precision, params.weights.shape.o);
   border_desc->immutable_buffers = {
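For reference, the FP32 -> FP16 packing that the removed VectorFloatToHalf performed (and that the shared helpers presumably still perform when the storage precision is FP16) comes from the FP16 library's fp16_ieee_from_fp32_value. A standalone toy snippet, not TFLite code, showing the resulting half-precision bit patterns and the 2-bytes-per-value layout of an FP16 immutable buffer (versus 4 bytes per value for FP32):

#include <fp16.h>  // fp16_ieee_from_fp32_value

#include <cstdint>
#include <cstdio>

int main() {
  const float samples[] = {1.0f, -2.5f, 0.0f};
  for (float v : samples) {
    // Each float becomes a 16-bit IEEE half, later reinterpreted as 2 bytes
    // of the buffer contents handed to the Metal descriptor.
    const uint16_t half_bits = fp16_ieee_from_fp32_value(v);
    std::printf("%+.2f -> 0x%04X\n", v, half_bits);
  }
  return 0;
}
// Expected output:
// +1.00 -> 0x3C00
// -2.50 -> 0xC100
// +0.00 -> 0x0000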