From 5a85d44038b1b7a2c0d2e59614bea85b52cacdfe Mon Sep 17 00:00:00 2001
From: Jing Dong <jingdong@google.com>
Date: Thu, 10 Dec 2020 21:00:44 -0800
Subject: [PATCH] Internal change

PiperOrigin-RevId: 346924792
Change-Id: I835b6685484806dc9bebc462659013bcacde508b
---
 .../core/common_runtime/direct_session.cc     |  5 ++++
 tensorflow/core/common_runtime/executor.cc    | 29 +++++++++++++------
 tensorflow/core/common_runtime/executor.h     |  9 ++++++
 tensorflow/core/protobuf/config.proto         |  6 ++++
 ...ensorflow.-run-options.-experimental.pbtxt |  6 ++++
 .../golden/v1/tensorflow.-run-options.pbtxt   |  6 ++++
 6 files changed, 52 insertions(+), 9 deletions(-)

diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index c1313076600..5ff3f31cb04 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -497,6 +497,11 @@ Status DirectSession::RunInternal(
     CallFrameInterface* call_frame, ExecutorsAndKeys* executors_and_keys,
     RunMetadata* run_metadata,
     const thread::ThreadPoolOptions& threadpool_options) {
+  // This is a temporary flag for controlling whether to always track the kernel
+  // execution cost. We will remove this once the feature is validated.
+  if (run_options.experimental().always_track_kernel_execution_cost())
+    EnableAlwaysTrackKernelExecutionCost();
+
   const uint64 start_time_usecs = options_.env->NowMicros();
   const int64 executor_step_count = executors_and_keys->step_count.fetch_add(1);
   RunState run_state(step_id, &devices_);
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 03c23f32880..443d588c2d3 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -73,6 +73,13 @@ limitations under the License.
 #include "tensorflow/core/util/tensor_slice_reader_cache.h"
 
 namespace tensorflow {
+
+// Temporary process-wide switch for always tracking kernel execution costs.
+// Atomic so that setting it from a session run never races with executor reads.
+static std::atomic<bool> always_track_kernel_execution_cost{false};
+void EnableAlwaysTrackKernelExecutionCost() {
+  always_track_kernel_execution_cost.store(true, std::memory_order_relaxed);
+}
 namespace {
 
 // 1-D, 0 element tensor.
@@ -179,12 +186,6 @@ class ExecutorImpl : public Executor {
     // Updates the dynamic cost estimate, which is used to determine whether the
     // given node is expensive. The new cost estimate is a weighted average of
     // the old cost estimate and the latest cost.
-    //
-    // NOTE: We currently only expect updates to the cost estimate when
-    // `is_expensive_[node.node_id]` is true (or at least, it *was* true, when
-    // we started to execute the kernel. As a result, we expect that a kernel
-    // can only ever transition from "expensive" to "inexpensive", but not vice
-    // versa.
     void UpdateCostEstimate(const NodeItem& node, uint64 elapsed_cycles) {
       // N.B. Updates to `cost_estimate` are atomic but unlocked.  Simultaneous
       // updates may result in one or more updates being ignored.  This does not
@@ -195,9 +196,10 @@ class ExecutorImpl : public Executor {
                                 kCostDecay +
                             (elapsed_cycles / kCostDecay);
       cost_estimate.store(new_estimate, std::memory_order_relaxed);
-      if (new_estimate < kOpIsExpensiveThresholdCycles) {
-        is_expensive_[node.node_id].store(false, std::memory_order_relaxed);
-      }
+
+      bool new_is_expensive = (new_estimate >= kOpIsExpensiveThresholdCycles);
+      is_expensive_[node.node_id].store(new_is_expensive,
+                                        std::memory_order_relaxed);
     }
 
    private:
@@ -573,6 +575,15 @@ Status ExecutorState<PropagatorStateType>::ProcessSync(
       KernelTimer timer;
       device->Compute(op_kernel, &ctx);
       kernel_stats_->UpdateCostEstimate(item, timer.ElapsedCycles());
+    } else if (always_track_kernel_execution_cost) {
+      KernelTimer timer;
+      device->Compute(op_kernel, &ctx);
+      // If always_track_kernel_execution_cost is set, update the cost estimate
+      // for inexpensive kernels with ~1/8 probability. This assumes that the
+      // last 3 bits of the CPU cycle count are uniformly distributed.
+      constexpr int kKernelExecutionTrackingInvocationSkipCount = 8;
+      if (timer.start_cycles % kKernelExecutionTrackingInvocationSkipCount == 0)
+        kernel_stats_->UpdateCostEstimate(item, timer.ElapsedCycles());
     } else {
       device->Compute(op_kernel, &ctx);
     }
diff --git a/tensorflow/core/common_runtime/executor.h b/tensorflow/core/common_runtime/executor.h
index d590ae0f711..d8ea85f1955 100644
--- a/tensorflow/core/common_runtime/executor.h
+++ b/tensorflow/core/common_runtime/executor.h
@@ -33,6 +33,15 @@ namespace tensorflow {
 
 class StepStatsCollector;
 
+// Once this is called, execution cost is also sampled for "inexpensive"
+// kernels, and a kernel is switched to "expensive" when its estimated cost
+// reaches the expensiveness threshold.
+// This is a temporary flag for validating the performance impact of
+// this feature. For simplicity, a global flag is used and once the flag
+// is turned on, it cannot be turned off. We will remove this flag once this
+// feature is validated.
+void EnableAlwaysTrackKernelExecutionCost();
+
 // Executor runs a graph computation.
 // Example:
 //   Graph* graph = ...;
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 9b50d5ecc26..569fe929cee 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -695,6 +695,12 @@ message RunOptions {
       int64 priority = 1;
     }
     RunHandlerPoolOptions run_handler_pool_options = 3;
+
+    // If true, always track kernel execution cost. This allows the executor to
+    // transition kernels from "inexpensive" to "expensive" during execution.
+    // This is a temporary flag for validating this feature. We will remove this
+    // flag once the feature is validated.
+    bool always_track_kernel_execution_cost = 4;
   }
 
   Experimental experimental = 8;
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-run-options.-experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-run-options.-experimental.pbtxt
index 913d82f680c..9b2b7f7210a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-run-options.-experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-run-options.-experimental.pbtxt
@@ -21,6 +21,12 @@ tf_proto {
       type: TYPE_MESSAGE
       type_name: ".tensorflow.RunOptions.Experimental.RunHandlerPoolOptions"
     }
+    field {
+      name: "always_track_kernel_execution_cost"
+      number: 4
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
     nested_type {
       name: "RunHandlerPoolOptions"
       field {
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-run-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-run-options.pbtxt
index 9020b61d64f..d250ba35108 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-run-options.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-run-options.pbtxt
@@ -68,6 +68,12 @@ tf_proto {
         type: TYPE_MESSAGE
         type_name: ".tensorflow.RunOptions.Experimental.RunHandlerPoolOptions"
       }
+      field {
+        name: "always_track_kernel_execution_cost"
+        number: 4
+        label: LABEL_OPTIONAL
+        type: TYPE_BOOL
+      }
       nested_type {
         name: "RunHandlerPoolOptions"
         field {