Refactor op level cost estimator -- cost functions report raw data (e.g., num ops, num bytes, not time), and then PredictCosts() translates it to time (in Costs).

(1) Add a new structure, NodeCosts -- this is supposed to be used within op_level_cost_estimator; not for the users of OpLevelCostEstimator.

(2) PredictCosts calls PredictNodeCosts, and then convert NodeCosts to Costs;
users of OpLevelCostEstimator wouldn't see any difference.

(3) The signature of the Predict method for each op type is Status Predict***(const OpContext&, NodeCosts*); within OpLevelCostEstimator, we use Status for handling erroneous cases.

(4) Fixed PredictSoftmax(): previously, it incorrectly checked that the input is rank-2, but the input can be of any rank >= 1.

(5) Predicted times for fused ops changed slightly (by at most 2ns in the unit tests). This is because we now sum the bytes (as int64) first and then convert to time, whereas previously we computed a time for each op and then summed those; converting bytes to time op-by-op may introduce rounding errors (int to float), so the current approach is more accurate (however small the delta is).

(6) CropAndResize op cost ignored 2nd, 3rd, and 4th input tensors' mem cost; it's now incorporated.

PiperOrigin-RevId: 346121141
Change-Id: I6caf1123f99dac6897f048644222f2fb46417885
This commit is contained in:
Doe Hyun Yoon 2020-12-07 10:17:24 -08:00 committed by TensorFlower Gardener
parent b3b3368715
commit c6d1d34cf2
4 changed files with 624 additions and 375 deletions

View File

@ -334,6 +334,7 @@ cc_library(
"@com_google_absl//absl/strings",
"//third_party/eigen3",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core/grappler/clusters:utils",
] + tf_protos_grappler(),

File diff suppressed because it is too large Load Diff

View File

@ -16,9 +16,12 @@ limitations under the License.
#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_
#define TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_
#include <numeric>
#include "tensorflow/core/grappler/costs/cost_estimator.h"
#include "tensorflow/core/grappler/costs/op_context.h"
#include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/util/padding.h"
namespace tensorflow {
@ -29,6 +32,62 @@ bool GetTensorShapeProtoFromTensorProto(const TensorProto& tensor_proto,
TensorShapeProto MaybeGetMinimumShape(const TensorShapeProto& original_shape,
int rank, bool* found_unknown_shapes);
// Node costs; an intermediate structure used within op level cost estimator.
// Node costs; an intermediate structure used within the op level cost
// estimator. Predict*** methods fill this in with raw counts (ops, bytes);
// PredictCosts() later converts it into a Costs (time-based) result.
struct NodeCosts {
  // If this FLAG is true, override the calculated compute time with a minimum
  // value instead of deriving it from num_compute_ops and compute ops/sec.
  // For example, PredictIdentity, PredictVariable, and PredictMetadata set
  // this FLAG.
  bool minimum_cost_op = false;

  // Number of compute ops (e.g., MACs) the node performs.
  int64 num_compute_ops = 0;

  // Memory bytes accessed; note that these may differ from the sizes of the
  // corresponding tensors (e.g., an op may read a tensor multiple times or
  // only partially).
  std::vector<int64> num_input_bytes_accessed;   // ordered by input tensors.
  std::vector<int64> num_output_bytes_accessed;  // ordered by output ports.
  // Bytes read/written to scratch or other internal buffers, not attributable
  // to a particular input or output tensor.
  int64 internal_read_bytes = 0;
  int64 internal_write_bytes = 0;

  // Convenience accessors; const because they only aggregate the fields above.
  int64 num_total_input_bytes() const {
    return std::accumulate(num_input_bytes_accessed.begin(),
                           num_input_bytes_accessed.end(), 0LL);
  }
  int64 num_total_read_bytes() const {
    return num_total_input_bytes() + internal_read_bytes;
  }
  int64 num_total_output_bytes() const {
    return std::accumulate(num_output_bytes_accessed.begin(),
                           num_output_bytes_accessed.end(), 0LL);
  }
  int64 num_total_write_bytes() const {
    return num_total_output_bytes() + internal_write_bytes;
  }
  int64 num_bytes_accessed() const {
    return num_total_read_bytes() + num_total_write_bytes();
  }

  // Memory usage.
  int64 max_memory = 0;
  int64 persistent_memory = 0;
  int64 temporary_memory = 0;

  // Stats.
  int64 num_nodes = 1;
  int64 num_nodes_with_unknown_shapes = 0;
  int64 num_nodes_with_unknown_op_type = 0;
  int64 num_nodes_with_pure_memory_op = 0;
  bool inaccurate = false;

  // TODO(dyoon): this is added for compatibility; some old code is hard to
  // migrate; hence, using these as a backup. Once we clean up, we'll delete
  // these fields. New code should not use these.
  bool has_costs = false;
  Costs costs;
};
class OpLevelCostEstimator {
public:
OpLevelCostEstimator();
@ -40,9 +99,7 @@ class OpLevelCostEstimator {
virtual DeviceInfo GetDeviceInfo(const DeviceProperties& device) const;
protected:
// Predict cost of an op for which no accurate estimator is defined.
Costs PredictCostOfAnUnknownOp(const OpContext& op_context) const;
// TODO(dyoon): Consider to remove PredictOpCountBasedCosts() with OpInfo.
// Naive cost estimate based on the given operations count and total
// input/output tensor sizes of the given op_info combined.
Costs PredictOpCountBasedCost(double operations, const OpInfo& op_info) const;
@ -54,6 +111,16 @@ class OpLevelCostEstimator {
double output_io_bytes,
const OpInfo& op_info) const;
// Top-level method cost function (PredictCosts calls this method to get
// NodeCosts, and then converts it to Costs). PredictNodeCosts() calls other
// Predict methods depending on op types.
Status PredictNodeCosts(const OpContext& op_context,
NodeCosts* node_costs) const;
// Predict cost of an op for which no accurate estimator is defined.
Status PredictCostOfAnUnknownOp(const OpContext& op_context,
NodeCosts* node_costs) const;
// This family of routines predicts the costs to
// perform the specified TensorFlow Op on the
// device represented by a subclass. The default
@ -64,37 +131,64 @@ class OpLevelCostEstimator {
// Implementation of costs other than
// execution_time is optional, depending on the
// device.
Costs PredictNaryOp(const OpContext& op_context) const;
Costs PredictConv2D(const OpContext& op_context) const;
Costs PredictCwiseOp(const OpContext& op_context) const;
Costs PredictConv2DBackpropInput(const OpContext& op_context) const;
Costs PredictConv2DBackpropFilter(const OpContext& op_context) const;
Costs PredictFusedConv2DBiasActivation(const OpContext& op_context) const;
Costs PredictMatMul(const OpContext& op_context) const;
Costs PredictSparseTensorDenseMatMul(const OpContext& op_context) const;
Costs PredictNoOp(const OpContext& op_context) const;
Costs PredictIdentity(const OpContext& op_context) const;
Costs PredictVariable(const OpContext& op_context) const;
Costs PredictBatchMatMul(const OpContext& op_context) const;
Costs PredictMetadata(const OpContext& op_context) const;
Costs PredictGatherOrSlice(const OpContext& op_context) const;
Costs PredictScatter(const OpContext& op_context) const;
Costs PredictMaxPool(const OpContext& op_context) const;
Costs PredictMaxPoolGrad(const OpContext& op_context) const;
Costs PredictAvgPool(const OpContext& op_context) const;
Costs PredictAvgPoolGrad(const OpContext& op_context) const;
Costs PredictFusedBatchNorm(const OpContext& op_context) const;
Costs PredictFusedBatchNormGrad(const OpContext& op_context) const;
Costs PredictEinsum(const OpContext& op_context) const;
Costs PredictAssignVariableOps(const OpContext& op_context) const;
Costs PredictPureMemoryOp(const OpContext& op_context) const;
Costs PredictSoftmax(const OpContext& op_context) const;
Costs PredictResizeBilinear(const OpContext& op_context) const;
Costs PredictCropAndResize(const OpContext& op_context) const;
Status PredictNaryOp(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictConv2D(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictCwiseOp(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictConv2DBackpropInput(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictConv2DBackpropFilter(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictFusedConv2DBiasActivation(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictMatMul(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictSparseTensorDenseMatMul(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictNoOp(const OpContext& op_context, NodeCosts* node_costs) const;
Status PredictIdentity(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictVariable(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictBatchMatMul(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictMetadata(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictGatherOrSlice(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictScatter(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictMaxPool(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictMaxPoolGrad(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictAvgPool(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictAvgPoolGrad(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictFusedBatchNorm(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictFusedBatchNormGrad(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictEinsum(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictAssignVariableOps(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictPureMemoryOp(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictSoftmax(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictResizeBilinear(const OpContext& op_context,
NodeCosts* node_costs) const;
Status PredictCropAndResize(const OpContext& op_context,
NodeCosts* node_costs) const;
// Generic cost prediction method for fused operations.
Costs PredictFusedOp(const OpContext& op_context,
const std::vector<OpContext>& fused_op_contexts) const;
Status PredictFusedOp(const OpContext& op_context,
const std::vector<OpContext>& fused_op_contexts,
NodeCosts* node_costs) const;
// Utility function for safe division. Returns 0
// if rhs is 0 or negative.
@ -176,11 +270,19 @@ class OpLevelCostEstimator {
static int64 CalculateInputSize(const OpInfo& op_info,
bool* found_unknown_shapes);
// Same, but a vector format: one for each input.
static std::vector<int64> CalculateInputTensorSize(
const OpInfo& op_info, bool* found_unknown_shapes);
// Calculate the total size in bytes of the all
// the outputs of specified TensorFlow op.
static int64 CalculateOutputSize(const OpInfo& op_info,
bool* found_unknown_shapes);
// Same, but a vector format: one for each output.
static std::vector<int64> CalculateOutputTensorSize(
const OpInfo& op_info, bool* found_unknown_shapes);
// For convolution and its grad ops.
static ConvolutionDimensions ConvolutionDimensionsFromInputs(
const TensorShapeProto& original_image_shape,
@ -203,9 +305,16 @@ class OpLevelCostEstimator {
static OpInfo::TensorProperties DescribeTensor(
DataType type, const std::vector<int64>& dims);
// Helper method for building common case NodeCosts.
static Status PredictDefaultNodeCosts(const int64 num_compute_ops,
const OpContext& op_context,
bool* found_unknown_shapes,
NodeCosts* node_costs);
protected:
std::map<string, int> elementwise_ops_;
typedef std::function<Costs(const OpContext& op_context)> CostImpl;
typedef std::function<Status(const OpContext& op_context, NodeCosts*)>
CostImpl;
std::map<string, CostImpl> device_cost_impl_;
// If true, assume compute and memory overlap; hence, the op cost is max of
// compute_time and memory_time, instead of sum of those two.

View File

@ -894,8 +894,8 @@ TEST_F(OpLevelCostEstimatorTest,
16, 19, 19, 48, 48, 5, 5, 19, 19, 256, /* has_side_input = */ false,
"NCHW", "HWIO"));
EXPECT_EQ(Costs::Duration(825345), cost.memory_time);
EXPECT_EQ(Costs::Duration(355321038), cost.compute_time);
EXPECT_EQ(Costs::Duration(356146383), cost.execution_time);
EXPECT_EQ(Costs::Duration(355321037), cost.compute_time);
EXPECT_EQ(Costs::Duration(356146382), cost.execution_time);
EXPECT_EQ(cost.num_ops_total, 1);
EXPECT_FALSE(cost.inaccurate);
EXPECT_EQ(cost.num_ops_with_unknown_shapes, 0);
@ -908,8 +908,8 @@ TEST_F(OpLevelCostEstimatorTest, FusedConv2DBiasActivationNCHW_HWIO) {
16, 19, 19, 48, 48, 5, 5, 19, 19, 256, /* has_side_input = */ true,
"NCHW", "HWIO"));
EXPECT_EQ(Costs::Duration(1416808), cost.memory_time);
EXPECT_EQ(Costs::Duration(355616770), cost.compute_time);
EXPECT_EQ(Costs::Duration(357033578), cost.execution_time);
EXPECT_EQ(Costs::Duration(355616768), cost.compute_time);
EXPECT_EQ(Costs::Duration(357033576), cost.execution_time);
EXPECT_EQ(cost.num_ops_total, 1);
EXPECT_FALSE(cost.inaccurate);
EXPECT_EQ(cost.num_ops_with_unknown_shapes, 0);
@ -922,8 +922,8 @@ TEST_F(OpLevelCostEstimatorTest, FusedConv2DBiasActivationNCHW_OIHW) {
16, 19, 19, 48, 48, 5, 5, 19, 19, 256, /* has_side_input = */ true,
"NCHW", "OIHW"));
EXPECT_EQ(Costs::Duration(1416808), cost.memory_time);
EXPECT_EQ(Costs::Duration(355616770), cost.compute_time);
EXPECT_EQ(Costs::Duration(357033578), cost.execution_time);
EXPECT_EQ(Costs::Duration(355616768), cost.compute_time);
EXPECT_EQ(Costs::Duration(357033576), cost.execution_time);
EXPECT_EQ(cost.num_ops_total, 1);
EXPECT_FALSE(cost.inaccurate);
EXPECT_EQ(cost.num_ops_with_unknown_shapes, 0);
@ -936,8 +936,8 @@ TEST_F(OpLevelCostEstimatorTest, FusedConv2DBiasActivationNHWC_HWIO) {
16, 19, 19, 48, 48, 5, 5, 19, 19, 256, /* has_side_input = */ true,
"NHWC", "HWIO"));
EXPECT_EQ(Costs::Duration(1416808), cost.memory_time);
EXPECT_EQ(Costs::Duration(355616770), cost.compute_time);
EXPECT_EQ(Costs::Duration(357033578), cost.execution_time);
EXPECT_EQ(Costs::Duration(355616768), cost.compute_time);
EXPECT_EQ(Costs::Duration(357033576), cost.execution_time);
EXPECT_EQ(cost.num_ops_total, 1);
EXPECT_FALSE(cost.inaccurate);
EXPECT_EQ(cost.num_ops_with_unknown_shapes, 0);
@ -950,8 +950,8 @@ TEST_F(OpLevelCostEstimatorTest, FusedConv2DBiasActivationNHWC_OIHW) {
16, 19, 19, 48, 48, 5, 5, 19, 19, 256, /* has_side_input = */ true,
"NHWC", "OIHW"));
EXPECT_EQ(Costs::Duration(1416808), cost.memory_time);
EXPECT_EQ(Costs::Duration(355616770), cost.compute_time);
EXPECT_EQ(Costs::Duration(357033578), cost.execution_time);
EXPECT_EQ(Costs::Duration(355616768), cost.compute_time);
EXPECT_EQ(Costs::Duration(357033576), cost.execution_time);
EXPECT_EQ(cost.num_ops_total, 1);
EXPECT_FALSE(cost.inaccurate);
EXPECT_EQ(cost.num_ops_with_unknown_shapes, 0);
@ -964,8 +964,8 @@ TEST_F(OpLevelCostEstimatorTest, FusedConv2DBiasActivationNCHW_VECT_C_OIHW) {
16, 19, 19, 48, 48, 5, 5, 19, 19, 256, /* has_side_input = */ true,
"NCHW_VECT_C", "OIHW"));
EXPECT_EQ(Costs::Duration(1416808), cost.memory_time);
EXPECT_EQ(Costs::Duration(355616770), cost.compute_time);
EXPECT_EQ(Costs::Duration(357033578), cost.execution_time);
EXPECT_EQ(Costs::Duration(355616768), cost.compute_time);
EXPECT_EQ(Costs::Duration(357033576), cost.execution_time);
EXPECT_EQ(cost.num_ops_total, 1);
EXPECT_FALSE(cost.inaccurate);
EXPECT_EQ(cost.num_ops_with_unknown_shapes, 0);
@ -978,8 +978,8 @@ TEST_F(OpLevelCostEstimatorTest, FusedConv2DBiasActivationNCHW_OIHW_VECT_I) {
16, 19, 19, 48, 48, 5, 5, 19, 19, 256, /* has_side_input = */ true,
"NCHW", "OIHW_VECT_I"));
EXPECT_EQ(Costs::Duration(1416808), cost.memory_time);
EXPECT_EQ(Costs::Duration(355616770), cost.compute_time);
EXPECT_EQ(Costs::Duration(357033578), cost.execution_time);
EXPECT_EQ(Costs::Duration(355616768), cost.compute_time);
EXPECT_EQ(Costs::Duration(357033576), cost.execution_time);
EXPECT_EQ(cost.num_ops_total, 1);
EXPECT_FALSE(cost.inaccurate);
EXPECT_EQ(cost.num_ops_with_unknown_shapes, 0);
@ -993,8 +993,8 @@ TEST_F(OpLevelCostEstimatorTest,
16, 19, 19, 48, 48, 5, 5, 19, 19, 256, /* has_side_input = */ true,
"NCHW_VECT_C", "OIHW_VECT_I"));
EXPECT_EQ(Costs::Duration(1416808), cost.memory_time);
EXPECT_EQ(Costs::Duration(355616770), cost.compute_time);
EXPECT_EQ(Costs::Duration(357033578), cost.execution_time);
EXPECT_EQ(Costs::Duration(355616768), cost.compute_time);
EXPECT_EQ(Costs::Duration(357033576), cost.execution_time);
EXPECT_EQ(cost.num_ops_total, 1);
EXPECT_FALSE(cost.inaccurate);
EXPECT_EQ(cost.num_ops_with_unknown_shapes, 0);
@ -2255,9 +2255,14 @@ TEST_F(OpLevelCostEstimatorTest, CropAndResizeExecutionTime) {
DescribeTensor4D(kNumBoxes, kOutputImageDim, kOutputImageDim, kChannelSize,
op_context.op_info.add_outputs());
// Note this is time [ns, default in Duration in Costs], not bytes;
// whereas memory bandwidth from SetCpuDevice() is 10GB/s.
const int kExpectedMemoryTime =
(kImageDim * kImageDim + kNumBoxes * kOutputImageDim * kOutputImageDim) *
4;
(kImageDim * kImageDim * 4 + // input image in float.
kNumBoxes * 4 * 8 / 10 + // boxes (kNumBoxes x 4) in int64.
kNumBoxes * kOutputImageDim * kOutputImageDim * 4); // output in float.
// Note that both the input image and the output image have a kChannelSize
// dim, which is 10; hence, no need to divide by 10 (the bandwidth).
{
// Cost of CropAndResize with bilinear interpolation.