Experimental feature: use special fused kernels for inference improvements. Adds a new hint (ModelHints::kAllowSpecialKernels) to control the behavior.
PiperOrigin-RevId: 322465118
Change-Id: I62c2a3ddc75907f2d9e455b7454e1de8c54a9881
parent 777bdc36a2
commit 2a150a026a
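In practice the feature is opt-in: a client sets the new hint on the create-info before building the inference context. A minimal sketch, mirroring the testing-sample change at the end of this commit (a fragment, not a complete program; an initialized cl::Environment `env` and the surrounding graph setup are assumed):

    // Sketch only; assumes an initialized cl::Environment `env`.
    InferenceContext::CreateInferenceInfo create_info;
    create_info.precision = CalculationsPrecision::F16;
    create_info.storage_type = GetFastestStorageType(env.device());
    // New in this commit: allow experimental fused "special" kernels.
    create_info.hints.Add(ModelHints::kAllowSpecialKernels);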
tensorflow/lite/delegates/gpu/cl/BUILD
@@ -366,6 +366,7 @@ cc_library(
         ":tensor_type",
         "//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation",
         "//tensorflow/lite/delegates/gpu/cl/selectors:operation_selector",
+        "//tensorflow/lite/delegates/gpu/cl/selectors:special_selector",
         "//tensorflow/lite/delegates/gpu/common:data_type",
         "//tensorflow/lite/delegates/gpu/common:memory_management",
         "//tensorflow/lite/delegates/gpu/common:model",
tensorflow/lite/delegates/gpu/cl/inference_context.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/cl/model_hints.h"
 #include "tensorflow/lite/delegates/gpu/cl/precision.h"
 #include "tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.h"
+#include "tensorflow/lite/delegates/gpu/cl/selectors/special_selector.h"
 #include "tensorflow/lite/delegates/gpu/cl/storage_type_util.h"
 #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
@@ -261,6 +262,12 @@ void InferenceContext::ReserveGraphTensors(
 absl::Status InferenceContext::ConvertOperations(
     const CreationContext& creation_context, const GraphFloat32& graph,
     ModelHints hints) {
+  std::map<ValueId, TensorDescriptor> tensor_descriptors;
+  const auto values = graph.values();
+  for (auto value : values) {
+    tensor_descriptors[value->id] = tensor_reserver_.Get(value->id).descriptor;
+  }
+  std::set<NodeId> consumed_nodes;
   std::vector<Node*> graph_nodes = graph.nodes();
   std::map<ValueId, int>
       tensor_usages;  // keeps latest index of operation that updated tensor
@@ -270,45 +277,54 @@ absl::Status InferenceContext::ConvertOperations(
   }
   for (int i = 0; i < graph_nodes.size(); ++i) {
     const Node& node = *graph_nodes[i];
-    auto inputs = graph.FindInputs(node.id);
-    auto outputs = graph.FindOutputs(node.id);
-
-    // Reordering of input ids and updating of temporary tensors_usage struct.
-    // This stage is necessary because we are building OperationDef that relies
-    // on the order of input ids. But we also should have the input id in first
-    // position that potentially can be a "linking" tensor and as a result
-    // eliminated (unused). We apply it only for the ADD operation, because of
-    // ADD associativity and because ADD can be linked.
-    // In the current approach the "linking" tensor can only be the latest
-    // written tensor (in linear order of execution) among input tensors.
-    if (IsGenericAdd(node, inputs, outputs)) {
-      int latest_written_tensor_index = 0;
-      int last_usage = tensor_usages[inputs[0]->id];
-      for (int j = 1; j < inputs.size(); ++j) {
-        if (tensor_usages[inputs[j]->id] > last_usage) {
-          last_usage = tensor_usages[inputs[j]->id];
-          latest_written_tensor_index = j;
-        }
-      }
-      std::swap(inputs[0], inputs[latest_written_tensor_index]);
-    }
-    for (const auto& out_id : outputs) {
-      tensor_usages[out_id->id] = i;
-    }
-
-    OperationDef op_def;
-    op_def.precision = precision_;
-    for (int j = 0; j < inputs.size(); ++j) {
-      op_def.src_tensors.push_back(
-          tensor_reserver_.Get(inputs[j]->id).descriptor);
-    }
-    for (int j = 0; j < outputs.size(); ++j) {
-      op_def.dst_tensors.push_back(
-          tensor_reserver_.Get(outputs[j]->id).descriptor);
+    if (consumed_nodes.find(node.id) != consumed_nodes.end()) {
+      continue;
     }
     GPUOperationsSubgraph gpu_subgraph;
-    RETURN_IF_ERROR(GPUOperationFromNode(creation_context, op_def, hints,
-                                         inputs, outputs, node, &gpu_subgraph));
+    if (hints.Check(ModelHints::kAllowSpecialKernels) &&
+        GPUSubgraphFromGraph(creation_context, precision_, graph, node.id,
+                             tensor_descriptors, &consumed_nodes, &gpu_subgraph)
+            .ok()) {
+      // Mapping of a subgraph (a set of nodes) to GPU operations. Should
+      // happen before the straightforward mapping.
+    } else {
+      // Straightforward mapping of one graph node to GPU operations.
+      auto inputs = graph.FindInputs(node.id);
+      auto outputs = graph.FindOutputs(node.id);
+      // Reordering of input ids and updating of temporary tensors_usage
+      // struct. This stage is necessary because we are building OperationDef
+      // that relies on the order of input ids. But we also should have the
+      // input id in first position that potentially can be a "linking" tensor
+      // and as a result eliminated (unused). We apply it only for the ADD
+      // operation, because of ADD associativity and because ADD can be
+      // linked. In the current approach the "linking" tensor can only be the
+      // latest written tensor (in linear order of execution) among input
+      // tensors.
+      if (IsGenericAdd(node, inputs, outputs)) {
+        int latest_written_tensor_index = 0;
+        int last_usage = tensor_usages[inputs[0]->id];
+        for (int j = 1; j < inputs.size(); ++j) {
+          if (tensor_usages[inputs[j]->id] > last_usage) {
+            last_usage = tensor_usages[inputs[j]->id];
+            latest_written_tensor_index = j;
+          }
+        }
+        std::swap(inputs[0], inputs[latest_written_tensor_index]);
+      }
+      consumed_nodes.insert(node.id);
+      OperationDef op_def;
+      op_def.precision = precision_;
+      for (int j = 0; j < inputs.size(); ++j) {
+        op_def.src_tensors.push_back(
+            tensor_reserver_.Get(inputs[j]->id).descriptor);
+      }
+      for (int j = 0; j < outputs.size(); ++j) {
+        op_def.dst_tensors.push_back(
+            tensor_reserver_.Get(outputs[j]->id).descriptor);
+      }
+      RETURN_IF_ERROR(GPUOperationFromNode(creation_context, op_def, hints,
+                                           inputs, outputs, node,
+                                           &gpu_subgraph));
+    }
     std::unordered_map<int, ValueId> mapping_to_global_ids;
     for (int j = 0; j < gpu_subgraph.new_tensors.size(); ++j) {
       const auto& t = gpu_subgraph.new_tensors[j];
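The ADD input-reordering kept in the else-branch above deserves a worked example: among the inputs of a generic ADD, the tensor most recently written (highest tensor_usages index) is swapped to position 0 so it can serve as the elidable "linking" tensor. A standalone sketch of just that selection, using plain ints with hypothetical ids and usage indices in place of graph values:

    #include <map>
    #include <utility>
    #include <vector>

    int main() {
      // Hypothetical data: tensor id -> index of the last op that wrote it.
      std::map<int, int> tensor_usages = {{7, 0}, {9, 3}, {4, 1}};
      std::vector<int> input_ids = {7, 9, 4};  // inputs of a generic ADD

      // Same scan as in ConvertOperations: find the latest-written input...
      int latest_written_tensor_index = 0;
      int last_usage = tensor_usages[input_ids[0]];
      for (size_t j = 1; j < input_ids.size(); ++j) {
        if (tensor_usages[input_ids[j]] > last_usage) {
          last_usage = tensor_usages[input_ids[j]];
          latest_written_tensor_index = static_cast<int>(j);
        }
      }
      // ...and swap it into first position: input_ids becomes {9, 7, 4},
      // because tensor 9 was written last (by op 3).
      std::swap(input_ids[0], input_ids[latest_written_tensor_index]);
    }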
@@ -324,7 +340,7 @@ absl::Status InferenceContext::ConvertOperations(
     for (int j = 0; j < gpu_op.input_ids.size(); ++j) {
       int id = gpu_op.input_ids[j];
       if (id >= 0) {
-        cl_node.inputs[j] = inputs[id]->id;
+        cl_node.inputs[j] = id;
       } else {
         cl_node.inputs[j] = mapping_to_global_ids[-(id + 1)];
       }
@@ -333,7 +349,8 @@ absl::Status InferenceContext::ConvertOperations(
     for (int j = 0; j < gpu_op.output_ids.size(); ++j) {
       int id = gpu_op.output_ids[j];
       if (id >= 0) {
-        cl_node.outputs[j] = outputs[id]->id;
+        cl_node.outputs[j] = id;
+        tensor_usages[id] = i;
       } else {
         cl_node.outputs[j] = mapping_to_global_ids[-(id + 1)];
       }
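A note on the id convention in these two hunks: after this change, non-negative entries in gpu_op.input_ids/output_ids are already global ValueIds (hence `cl_node.inputs[j] = id;`), while negative entries refer to tensors created inside the subgraph and are resolved through mapping_to_global_ids via -(id + 1). A small self-contained sketch of that resolution, with hypothetical ids:

    #include <iostream>
    #include <unordered_map>
    #include <vector>

    int main() {
      // Hypothetical: new-tensor index -> global id assigned for it.
      std::unordered_map<int, int> mapping_to_global_ids = {{0, 100}, {1, 101}};
      // One global tensor id (42) and two subgraph-internal tensors (-1, -2).
      std::vector<int> ids = {42, -1, -2};

      for (int id : ids) {
        int global = (id >= 0) ? id : mapping_to_global_ids[-(id + 1)];
        std::cout << global << " ";  // prints: 42 100 101
      }
      std::cout << std::endl;
    }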
tensorflow/lite/delegates/gpu/cl/model_hints.h
@@ -25,13 +25,18 @@ namespace cl {
 struct ModelHints {
   using ModelHint = uint64_t;
 
-  // By default we want the fastest inference
+  // By default we want the fastest inference.
   static constexpr ModelHint kFastestInference = 0x00000000;
-  // Can improve compilation time, but inference can be slower
+  // Can improve compilation time, but inference can be slower.
   static constexpr ModelHint kReduceKernelsCount = 0x00000001;
-  // Can improve tuning time, but inference can be slower
+  // Can improve tuning time, but inference can be slower.
   static constexpr ModelHint kFastTuning = 0x00000002;
 
+  // Experimental.
+  // Can improve performance and memory consumption, but can slow down
+  // initialization a lot and create more kernels.
+  static constexpr ModelHint kAllowSpecialKernels = 0x00000004;
+
   void Add(ModelHint hint) {
     if (hint == kFastestInference) {
       hints = kFastestInference;
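Hints form a uint64_t bitmask: Add() folds a flag into the mask (the diff shows the zero-valued kFastestInference resetting it; the OR for non-zero flags is assumed here, as the hunk is cut off), and Check() — used by ConvertOperations above — tests a bit. A minimal runnable re-sketch of those semantics:

    #include <cstdint>
    #include <iostream>

    // Re-sketch of ModelHints for illustration; OR semantics in Add() and the
    // body of Check() are assumptions consistent with the hunks above.
    struct ModelHints {
      using ModelHint = uint64_t;
      static constexpr ModelHint kFastestInference = 0x00000000;
      static constexpr ModelHint kReduceKernelsCount = 0x00000001;
      static constexpr ModelHint kFastTuning = 0x00000002;
      static constexpr ModelHint kAllowSpecialKernels = 0x00000004;

      void Add(ModelHint hint) {
        if (hint == kFastestInference) {
          hints = kFastestInference;  // the zero flag resets to the default
        } else {
          hints |= hint;  // assumed: other flags accumulate
        }
      }
      bool Check(ModelHint hint) const { return (hints & hint) != 0; }

      ModelHint hints = kFastestInference;
    };

    int main() {
      ModelHints h;
      h.Add(ModelHints::kAllowSpecialKernels);
      std::cout << h.Check(ModelHints::kAllowSpecialKernels) << "\n";  // 1
      std::cout << h.Check(ModelHints::kFastTuning) << "\n";           // 0
    }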
tensorflow/lite/delegates/gpu/cl/selectors/BUILD
@@ -152,6 +152,26 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "special_selector",
+    srcs = ["special_selector.cc"],
+    hdrs = ["special_selector.h"],
+    deps = [
+        ":subgraph",
+        "//tensorflow/lite/delegates/gpu/cl:cl_device",
+        "//tensorflow/lite/delegates/gpu/cl:tensor_type",
+        "//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation",
+        "//tensorflow/lite/delegates/gpu/cl/kernels/special:depthwise_conv_plus_1x1_conv",
+        "//tensorflow/lite/delegates/gpu/common:data_type",
+        "//tensorflow/lite/delegates/gpu/common:model",
+        "//tensorflow/lite/delegates/gpu/common:operations",
+        "//tensorflow/lite/delegates/gpu/common:shape",
+        "//tensorflow/lite/delegates/gpu/common:status",
+        "//tensorflow/lite/delegates/gpu/common:tensor",
+        "@com_google_absl//absl/types:any",
+    ],
+)
+
 cc_library(
     name = "subgraph",
     srcs = ["subgraph.cc"],
tensorflow/lite/delegates/gpu/cl/selectors/special_selector.cc (new file, 111 lines)
@@ -0,0 +1,111 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/delegates/gpu/cl/selectors/special_selector.h"
+
+#include "absl/types/any.h"
+#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
+#include "tensorflow/lite/delegates/gpu/cl/kernels/special/depthwise_conv_plus_1x1_conv.h"
+#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
+#include "tensorflow/lite/delegates/gpu/common/data_type.h"
+#include "tensorflow/lite/delegates/gpu/common/operations.h"
+#include "tensorflow/lite/delegates/gpu/common/shape.h"
+#include "tensorflow/lite/delegates/gpu/common/status.h"
+#include "tensorflow/lite/delegates/gpu/common/tensor.h"
+
+namespace tflite {
+namespace gpu {
+namespace cl {
+namespace {
+absl::Status TryDepthwiseConvPlus1x1Conv(
+    const CreationContext& creation_context, CalculationsPrecision precision,
+    const GraphFloat32& graph, NodeId first_node_id,
+    const std::map<ValueId, TensorDescriptor>& tensor_descriptors,
+    std::set<NodeId>* consumed_nodes, GPUOperationsSubgraph* gpu_subgraph) {
+  auto* dw_node = graph.GetNode(first_node_id);
+  if (OperationTypeFromString(dw_node->operation.type) !=
+      OperationType::DEPTHWISE_CONVOLUTION) {
+    return absl::NotFoundError("DepthwiseConvPlus1x1Conv not suitable.");
+  }
+  auto dw_outputs = graph.FindOutputs(dw_node->id);
+  auto consumers = graph.FindConsumers(dw_outputs[0]->id);
+  if (consumers.size() != 1) {
+    return absl::NotFoundError("DepthwiseConvPlus1x1Conv not suitable.");
+  }
+  auto* conv_node = consumers[0];
+  if (consumed_nodes->find(conv_node->id) != consumed_nodes->end()) {
+    return absl::NotFoundError("DepthwiseConvPlus1x1Conv not suitable.");
+  }
+  if (OperationTypeFromString(conv_node->operation.type) !=
+      OperationType::CONVOLUTION_2D) {
+    return absl::NotFoundError("DepthwiseConvPlus1x1Conv not suitable.");
+  }
+  if (graph.FindInputs(conv_node->id).size() != 1) {
+    return absl::NotFoundError("DepthwiseConvPlus1x1Conv not suitable.");
+  }
+  auto dw_attr = absl::any_cast<DepthwiseConvolution2DAttributes>(
+      dw_node->operation.attributes);
+  auto conv_attr =
+      absl::any_cast<Convolution2DAttributes>(conv_node->operation.attributes);
+  auto dw_inputs = graph.FindInputs(dw_node->id);
+  auto conv_outputs = graph.FindOutputs(conv_node->id);
+  OperationDef op_def;
+  op_def.precision = precision;
+  auto it = tensor_descriptors.find(dw_inputs[0]->id);
+  if (it != tensor_descriptors.end()) {
+    op_def.src_tensors.push_back(it->second);
+  }
+  it = tensor_descriptors.find(conv_outputs[0]->id);
+  if (it != tensor_descriptors.end()) {
+    op_def.dst_tensors.push_back(it->second);
+  }
+  if (!IsDepthwiseConvPlus1x1ConvSupported(*creation_context.device, op_def,
+                                           dw_attr, conv_attr)) {
+    return absl::NotFoundError("DepthwiseConvPlus1x1Conv not suitable.");
+  }
+  std::unique_ptr<GPUOperation>* gpu_op =
+      InitSingleOpSubgraph(dw_inputs, conv_outputs, gpu_subgraph);
+  DepthwiseConvPlus1x1Conv operation;
+  RETURN_IF_ERROR(CreateDepthwiseConvPlus1x1Conv(
+      creation_context, op_def, dw_attr, conv_attr, &operation));
+  *gpu_op = absl::make_unique<DepthwiseConvPlus1x1Conv>(std::move(operation));
+  consumed_nodes->insert(dw_node->id);
+  consumed_nodes->insert(conv_node->id);
+  return absl::OkStatus();
+}
+}  // namespace
+
+absl::Status GPUSubgraphFromGraph(
+    const CreationContext& creation_context, CalculationsPrecision precision,
+    const GraphFloat32& graph, NodeId first_node_id,
+    const std::map<ValueId, TensorDescriptor>& tensor_descriptors,
+    std::set<NodeId>* consumed_nodes, GPUOperationsSubgraph* gpu_subgraph) {
+  if (!creation_context.device->IsNvidia()) {
+    return absl::NotFoundError(
+        "Experimental feature, enabled for NVidia only, but device is not "
+        "nvidia gpu.");
+  }
+  if (TryDepthwiseConvPlus1x1Conv(creation_context, precision, graph,
+                                  first_node_id, tensor_descriptors,
+                                  consumed_nodes, gpu_subgraph)
+          .ok()) {
+    return absl::OkStatus();
+  }
+  return absl::NotFoundError("No special combination.");
+}
+
+}  // namespace cl
+}  // namespace gpu
+}  // namespace tflite
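The selector follows a try-and-fall-back protocol: a matcher returns absl::NotFoundError when its pattern does not apply, and the caller (GPUSubgraphFromGraph here, ConvertOperations above) treats any non-ok status purely as "use the generic path". A minimal self-contained sketch of that convention with a hypothetical matcher (assumes Abseil's status library is linked):

    #include <iostream>
    #include "absl/status/status.h"

    // Hypothetical matcher following the convention above: NotFoundError
    // means "pattern not applicable here", not a hard failure.
    absl::Status TryFusePattern(bool pattern_matches) {
      if (!pattern_matches) {
        return absl::NotFoundError("Pattern not suitable.");
      }
      // ... build the fused GPU operation here ...
      return absl::OkStatus();
    }

    int main() {
      // The caller only commits when .ok(); otherwise it falls back.
      if (TryFusePattern(/*pattern_matches=*/false).ok()) {
        std::cout << "used fused kernel\n";
      } else {
        std::cout << "fell back to one-node mapping\n";
      }
    }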
tensorflow/lite/delegates/gpu/cl/selectors/special_selector.h (new file, 43 lines)
@@ -0,0 +1,43 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_SELECTORS_SPECIAL_SELECTOR_H_
+#define TENSORFLOW_LITE_DELEGATES_GPU_CL_SELECTORS_SPECIAL_SELECTOR_H_
+
+#include <map>
+#include <set>
+#include <vector>
+
+#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
+#include "tensorflow/lite/delegates/gpu/cl/selectors/subgraph.h"
+#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
+#include "tensorflow/lite/delegates/gpu/common/model.h"
+#include "tensorflow/lite/delegates/gpu/common/status.h"
+
+namespace tflite {
+namespace gpu {
+namespace cl {
+
+absl::Status GPUSubgraphFromGraph(
+    const CreationContext& creation_context, CalculationsPrecision precision,
+    const GraphFloat32& graph, NodeId first_node_id,
+    const std::map<ValueId, TensorDescriptor>& tensor_descriptors,
+    std::set<NodeId>* consumed_nodes, GPUOperationsSubgraph* gpu_subgraph);
+
+}  // namespace cl
+}  // namespace gpu
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_DELEGATES_GPU_CL_SELECTORS_SPECIAL_SELECTOR_H_
tensorflow/lite/delegates/gpu/cl/selectors/subgraph.cc
@@ -32,10 +32,10 @@ std::unique_ptr<GPUOperation>* InitSingleOpSubgraph(
   gpu_subgraph->new_tensors.clear();
   gpu_subgraph->operations.push_back({});
   for (int i = 0; i < inputs.size(); ++i) {
-    gpu_subgraph->operations[0].input_ids.push_back(i);
+    gpu_subgraph->operations[0].input_ids.push_back(inputs[i]->id);
   }
   for (int i = 0; i < outputs.size(); ++i) {
-    gpu_subgraph->operations[0].output_ids.push_back(i);
+    gpu_subgraph->operations[0].output_ids.push_back(outputs[i]->id);
   }
 
   return &gpu_subgraph->operations[0].operation;
tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc
@@ -44,6 +44,7 @@ absl::Status RunModelSample(const std::string& model_name) {
           ? CalculationsPrecision::F16
           : CalculationsPrecision::F32;
   create_info.storage_type = GetFastestStorageType(env.device());
+  create_info.hints.Add(ModelHints::kAllowSpecialKernels);
   std::cout << "Precision: " << ToString(create_info.precision) << std::endl;
   std::cout << "Storage type: " << ToString(create_info.storage_type)
             << std::endl;