Internal change

PiperOrigin-RevId: 347508731 Change-Id: Id4cbecd128176c62878a9c153e79a4b2d64a52d9
2020-12-14 17:57:22 -08:00 · 2020-12-14 17:57:22 -08:00 · 78d7f8b2ef
commit 78d7f8b2ef
parent 3af08c5f47
5 changed files with 27 additions and 57 deletions
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@ -497,11 +497,6 @@ Status DirectSession::RunInternal(
    CallFrameInterface* call_frame, ExecutorsAndKeys* executors_and_keys,
    RunMetadata* run_metadata,
    const thread::ThreadPoolOptions& threadpool_options) {
-  // This is a temporary flag for controlling whether to always track the kernel
-  // execution cost. We will remove this once the feature is validated.
-  if (run_options.experimental().always_track_kernel_execution_cost())
-    EnableAlwaysTrackKernelExecutionCost();
-
  const uint64 start_time_usecs = options_.env->NowMicros();
  const int64 executor_step_count = executors_and_keys->step_count.fetch_add(1);
  RunState run_state(step_id, &devices_);
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@ -74,12 +74,6 @@ limitations under the License.

 namespace tensorflow {

-// Temporary flag for controlling whether to always track kernel execution
-// costs.
-static bool always_track_kernel_execution_cost = false;
-void EnableAlwaysTrackKernelExecutionCost() {
-  always_track_kernel_execution_cost = true;
-}
 namespace {

 // 1-D, 0 element tensor.
@ -162,7 +156,7 @@ class ExecutorImpl : public Executor {
    KernelStats() = default;

    void Initialize(const GraphView& gview) {
-      is_expensive_ = absl::make_unique<std::atomic<bool>[]>(gview.num_nodes());
+      is_expensive_.resize(gview.num_nodes());
      cost_estimates_ =
          absl::make_unique<std::atomic_uint_fast64_t[]>(gview.num_nodes());
      for (int32 i = 0; i < gview.num_nodes(); ++i) {
@ -183,23 +177,26 @@ class ExecutorImpl : public Executor {
              kOpIsExpensiveThresholdCycles);
    }

+    // Returns the value of kernel->IsExpensive().
+    bool HasExpensiveMarker(const NodeItem& node) const {
+      return is_expensive_[node.node_id];
+    }
+
    // Updates the dynamic cost estimate, which is used to determine whether the
    // given node is expensive. The new cost estimate is a weighted average of
-    // the old cost estimate and the latest cost.
+    // the old cost estimate and the latest cost. We only update cost estimates
+    // for kernels for which IsExpensive() returns true.
    void UpdateCostEstimate(const NodeItem& node, uint64 elapsed_cycles) {
      // N.B. Updates to `cost_estimate` are atomic but unlocked.  Simultaneous
      // updates may result in one or more updates being ignored.  This does not
      // affect correctness but may slow down the update frequency.
      std::atomic_uint_fast64_t& cost_estimate = cost_estimates_[node.node_id];
-      uint64 new_estimate = (kCostDecay - 1) *
-                                cost_estimate.load(std::memory_order_relaxed) /
-                                kCostDecay +
-                            (elapsed_cycles / kCostDecay);
-      cost_estimate.store(new_estimate, std::memory_order_relaxed);
+      auto prev_estimate = cost_estimate.load(std::memory_order_relaxed);

-      bool new_is_expensive = (new_estimate >= kOpIsExpensiveThresholdCycles);
-      is_expensive_[node.node_id].store(new_is_expensive,
-                                        std::memory_order_relaxed);
+      uint64 new_estimate =
+          ((kCostDecay - 1) * prev_estimate + elapsed_cycles) / kCostDecay;
+
+      cost_estimate.store(new_estimate, std::memory_order_relaxed);
    }

   private:
@ -207,10 +204,11 @@ class ExecutorImpl : public Executor {
    // determine whether an operation should be place in a threadpool.
    // Operations start out "expensive".
    static constexpr uint64 kInitialCostEstimateCycles = 100 * 1000 * 1000;
-    static constexpr uint64 kOpIsExpensiveThresholdCycles = 5000;
+    static constexpr uint64 kOpIsExpensiveThresholdCycles = 8000;
    static constexpr uint64 kCostDecay = 10;

-    std::unique_ptr<std::atomic<bool>[]> is_expensive_;
+    std::vector<bool> is_expensive_;
+    // std::unique_ptr<std::atomic<bool>[]> is_expensive_;
    std::unique_ptr<std::atomic_uint_fast64_t[]> cost_estimates_;
  };

@ -569,24 +567,19 @@ Status ExecutorState<PropagatorStateType>::ProcessSync(
        },
        profiler::GetTFTraceMeLevel(is_expensive));
    device->Compute(op_kernel, &ctx);
-  } else {
-    // In the common case, avoid creating any tracing objects.
-    if (is_expensive) {
-      KernelTimer timer;
-      device->Compute(op_kernel, &ctx);
+  } else if (kernel_stats_->HasExpensiveMarker(item)) {
+    KernelTimer timer;
+    device->Compute(op_kernel, &ctx);
+    // For expensive kernels, always update the cost estimate. For inexpensive
+    // kernels, update the cost estimate with ~1/16 probability. This assumes
+    // that the last 4 bits of the CPU cycle count are uniformly distributed.
+    constexpr int kKernelExecutionTrackingInvocationSkipCount = 16;
+    if (is_expensive ||
+        timer.start_cycles % kKernelExecutionTrackingInvocationSkipCount == 0) {
      kernel_stats_->UpdateCostEstimate(item, timer.ElapsedCycles());
-    } else if (always_track_kernel_execution_cost) {
-      KernelTimer timer;
-      device->Compute(op_kernel, &ctx);
-      // If always_track_kernel_execution_cost is set, update the cost estimate
-      // for inexpensive kernels with ~1/8 probability. This assumes that the
-      // last 3 bits of the CPU cycle count is uniformly distributed.
-      constexpr int kKernelExecutionTrackingInvocationSkipCount = 8;
-      if (timer.start_cycles % kKernelExecutionTrackingInvocationSkipCount == 0)
-        kernel_stats_->UpdateCostEstimate(item, timer.ElapsedCycles());
-    } else {
-      device->Compute(op_kernel, &ctx);
    }
+  } else {
+    device->Compute(op_kernel, &ctx);
  }
  nodestats::SetOpEnd(stats);
  if (outputs->size() < item.num_outputs) outputs->resize(item.num_outputs);
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@ -695,12 +695,6 @@ message RunOptions {
      int64 priority = 1;
    }
    RunHandlerPoolOptions run_handler_pool_options = 3;
-
-    // If true, always track kernel execution cost. This allows the executor to
-    // transit kernels from "inexpensive" to "expensive" during the execution.
-    // This is a temporary flag for validating this feature. We will remove this
-    // flag once the feature is validated.
-    bool always_track_kernel_execution_cost = 4;
  }

  Experimental experimental = 8;
--- a/tensorflow/tools/api/golden/v1/tensorflow.-run-options.-experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-run-options.-experimental.pbtxt
@ -21,12 +21,6 @@ tf_proto {
      type: TYPE_MESSAGE
      type_name: ".tensorflow.RunOptions.Experimental.RunHandlerPoolOptions"
    }
-    field {
-      name: "always_track_kernel_execution_cost"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
    nested_type {
      name: "RunHandlerPoolOptions"
      field {
--- a/tensorflow/tools/api/golden/v1/tensorflow.-run-options.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-run-options.pbtxt
@ -68,12 +68,6 @@ tf_proto {
        type: TYPE_MESSAGE
        type_name: ".tensorflow.RunOptions.Experimental.RunHandlerPoolOptions"
      }
-      field {
-        name: "always_track_kernel_execution_cost"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_BOOL
-      }
      nested_type {
        name: "RunHandlerPoolOptions"
        field {