From f386c885017188448062db1c37069ea47e86b55e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 14 Jun 2017 16:04:58 -0700 Subject: [PATCH] Make virtual scheduler potentially write summary information to stepstats_ variable. PiperOrigin-RevId: 159039540 --- .../core/grappler/costs/virtual_scheduler.cc | 32 ++++- .../core/grappler/costs/virtual_scheduler.h | 5 + .../grappler/costs/virtual_scheduler_test.cc | 130 +++++++++++++++++- 3 files changed, 159 insertions(+), 8 deletions(-) diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc index c68d4e31c46..e5073d63017 100644 --- a/tensorflow/core/grappler/costs/virtual_scheduler.cc +++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc @@ -244,8 +244,8 @@ string VirtualScheduler::DeviceName(const NodeDef* node) const { string VirtualScheduler::ChannelDeviceName(const NodeDef* from, const NodeDef* to) const { CHECK(!initialized_) << "ChannelDeviceName is called after Init()."; - - return kChannelDevice + ": " + DeviceName(from) + " to " + DeviceName(to); + return kChannelDevice + ": from " + DeviceName(from) + " to " + + DeviceName(to); } std::pair VirtualScheduler::CreateSendRecv( @@ -318,8 +318,8 @@ NodeInfo VirtualScheduler::GetCurrNodeInfo() const { } // Construct NodeInfo. - const auto& node_state = node_map_.at(node); NodeInfo node_info; + const auto& node_state = node_map_.at(node); node_info.name = node->name(); node_info.device_name = node_state.device_name; auto& op_info = node_info.op_info; @@ -577,6 +577,7 @@ Costs VirtualScheduler::Summary() const { << " GB, at the end: " << state.memory_usage << " B"; VLOG(1) << "Per-op execution time (and memory usage at peak memory usage):"; + // Profile non-persistent op memory usage. 
for (const auto& node_port : state.mem_usage_snapshot_at_peak) { const auto* node = node_port.first; @@ -617,5 +618,30 @@ Costs VirtualScheduler::Summary() const { return critical_path_costs; } +Costs VirtualScheduler::Summary(StepStats* stepstats) { + if (stepstats != nullptr) { + for (const auto& device : device_) { + DeviceStepStats* device_stepstats = stepstats->add_dev_stats(); + device_stepstats->set_device(device.first); + for (const auto& node_def : device.second.nodes_executed) { + const NodeState& nodestate = node_map_.at(node_def); + NodeExecStats* node_stats = device_stepstats->add_node_stats(); + node_stats->set_node_name(node_def->op()); + node_stats->set_timeline_label(node_def->name()); + node_stats->set_op_start_rel_micros(0); + node_stats->set_all_start_micros( + nodestate.time_scheduled.asMicroSeconds().count()); + node_stats->set_op_end_rel_micros( + nodestate.time_finished.asMicroSeconds().count() - + nodestate.time_scheduled.asMicroSeconds().count()); + node_stats->set_all_end_rel_micros( + nodestate.time_finished.asMicroSeconds().count() - + nodestate.time_scheduled.asMicroSeconds().count()); + } + } + } + return Summary(); +} + } // end namespace grappler } // end namespace tensorflow diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h index 1331e2ec8be..7ee3310e2f1 100644 --- a/tensorflow/core/grappler/costs/virtual_scheduler.h +++ b/tensorflow/core/grappler/costs/virtual_scheduler.h @@ -21,6 +21,7 @@ limitations under the License. #include #include +#include "tensorflow/core/framework/step_stats.pb.h" #include "tensorflow/core/grappler/costs/cost_estimator.h" #include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/grappler/costs/virtual_placer.h" @@ -170,6 +171,9 @@ class VirtualScheduler { // Prints out summary of execution (timing, memory usage, etc.) Costs Summary() const; + // Like the above, but writes detailed stats to stepstats. 
+ // If stepstats is nullptr, then just calls and returns Summary(). + Costs Summary(StepStats* stepstats); protected: // GetDeviceStates and GetNodeStates are currently for testing purpuse only. @@ -228,6 +232,7 @@ class VirtualScheduler { // Auxilliary data structures for constructing NodeState and DeviceState. GraphProperties graph_properties_; Cluster* cluster_; // Not owned. + const GrapplerItem* grappler_item_; // Not owned. bool use_static_shapes_; bool initialized_; diff --git a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc index 5af9effde04..c630f60311d 100644 --- a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc +++ b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc @@ -14,7 +14,6 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/grappler/costs/virtual_scheduler.h" - #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/core/grappler/clusters/virtual_cluster.h" #include "tensorflow/core/grappler/costs/virtual_placer.h" @@ -41,13 +40,28 @@ class VirtualSchedulerTest : public ::testing::Test { protected: const string kCPU0 = "/job:localhost/replica:0/task:0/cpu:0"; + DeviceProperties GetDummyCPUDevice() { + // Create CPU with 2 cores, 4 GHz freq, 2 GB/s mem bandwidth. + // - 8 Gflops + // - 2 GB/s + DeviceProperties cpu_device; + cpu_device.set_type("CPU"); + cpu_device.set_frequency(4000); + cpu_device.set_num_cores(2); + cpu_device.set_bandwidth(2000000); + return cpu_device; + } + void SetUp() override { // Initializes cluster_ and placer_. 
std::unordered_map devices; - DeviceProperties cpu_device; - cpu_device.set_type("CPU"); - devices[kCPU0] = cpu_device; + // Set some dummy CPU properties + DeviceProperties cpu_device = GetDummyCPUDevice(); + + // IMPORTANT: Device is not actually ever used in the test case since + // force_cpu_type is defaulted to "Haswell" + devices[kCPU0] = cpu_device; cluster_.reset(new VirtualCluster(devices)); placer_.reset(new VirtualPlacer(cluster_.get())); } @@ -102,6 +116,38 @@ class VirtualSchedulerTest : public ::testing::Test { dependency_["y"] = {"x", "f"}; } + void CreateGrapplerItemWithMatmulChain() { + tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice(kCPU0); + auto a = tensorflow::ops::RandomUniform(s.WithOpName("a"), {3200, 3200}, + DT_FLOAT); + auto b = tensorflow::ops::RandomUniform(s.WithOpName("b"), {3200, 3200}, + DT_FLOAT); + auto c = tensorflow::ops::RandomUniform(s.WithOpName("c"), {3200, 3200}, + DT_FLOAT); + auto d = tensorflow::ops::RandomUniform(s.WithOpName("d"), {3200, 3200}, + DT_FLOAT); + auto e = tensorflow::ops::RandomUniform(s.WithOpName("e"), {3200, 3200}, + DT_FLOAT); + + auto ab = tensorflow::ops::MatMul(s.WithOpName("ab"), a, b); + auto abc = tensorflow::ops::MatMul(s.WithOpName("abc"), ab, c); + auto abcd = tensorflow::ops::MatMul(s.WithOpName("abcd"), abc, d); + auto abcde = tensorflow::ops::MatMul(s.WithOpName("abcde"), abcd, e); + + GraphDef def; + TF_CHECK_OK(s.ToGraphDef(&def)); + + grappler_item_.reset(new GrapplerItem); + grappler_item_->id = "test_matmul_sequence_graph"; + grappler_item_->graph = def; + grappler_item_->fetch = {"abcde"}; + + dependency_["ab"] = {"a", "b"}; + dependency_["abc"] = {"ab", "c"}; + dependency_["abcd"] = {"abc", "d"}; + dependency_["abcde"] = {"abcd", "e"}; + } + // AddN that takes 4 tensors with 10x10x10x10. 
void CreateGrapplerItemWithAddN() { tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice(kCPU0); @@ -201,6 +247,20 @@ class VirtualSchedulerTest : public ::testing::Test { TF_CHECK_OK(scheduler_->Init()); } + // Returns cost based on op. + Costs SimplePredictCosts(const NodeInfo& info) const { + Costs c; + int64 exec_cost = 0; + if (info.op_info.op() == "MatMul") { + exec_cost = 2000000000; + } + if (info.op_info.op() == "RandomUniform") { + exec_cost = 1000000000; + } + c.execution_time = Costs::NanoSeconds(exec_cost); + return c; + } + // Call this after init scheduler_. Scheduler stops after executing // target_node. std::unordered_map RunScheduler(const string& target_node) { @@ -211,6 +271,8 @@ class VirtualSchedulerTest : public ::testing::Test { NodeInfo node_info = scheduler_->GetCurrNodeInfo(); ops_executed[node_info.name] = node_info; + Costs node_costs = SimplePredictCosts(node_info); + // Check scheduling order. auto it = dependency_.find(node_info.name); if (it != dependency_.end()) { @@ -218,7 +280,7 @@ class VirtualSchedulerTest : public ::testing::Test { EXPECT_GT(ops_executed.count(preceding_node), 0); } } - more_nodes = scheduler_->MarkCurrNodeExecuted(zero_costs); + more_nodes = scheduler_->MarkCurrNodeExecuted(node_costs); if (node_info.name == target_node) { // Scheduler has the state after executing the target node. @@ -312,6 +374,64 @@ class VirtualSchedulerTest : public ::testing::Test { const int depth_out_ = 16; }; +// Create small graph, run predict costs on it, make sure the costs from the +// summary match the hand-calculated costs. +TEST_F(VirtualSchedulerTest, SummaryCostTest) { + // Run matmul test. 
+ CreateGrapplerItemWithMatmulChain(); + InitScheduler(); + auto ops_executed = RunScheduler(""); + Costs c = scheduler_->Summary(); + + // RandomUniform - 5 + // Matmuls - 4 * 2 = 8 + // Total: 13 + EXPECT_EQ(13000000, c.execution_time.asMicroSeconds().count()); +} + +// Like the above SummaryCostTest, but makes sure the stepstats timeline is +// correct. +TEST_F(VirtualSchedulerTest, SummaryCostStepStatsTest) { + // Run matmul test. + CreateGrapplerItemWithMatmulChain(); + InitScheduler(); + auto ops_executed = RunScheduler(""); + StepStats stepstats; + Costs c = scheduler_->Summary(&stepstats); + EXPECT_EQ(13000000, c.execution_time.asMicroSeconds().count()); + + // Should only be 1 device! + EXPECT_EQ(1, stepstats.dev_stats().size()); + + // Create a map of op name -> start and end times (micros). + std::map> start_end_times; + for (const auto& device_step_stats : stepstats.dev_stats()) { + for (const auto& stats : device_step_stats.node_stats()) { + // The node name is actually in the timeline_label. + int64 start = stats.all_start_micros(); + int64 end = start + stats.all_end_rel_micros(); + start_end_times[stats.timeline_label()] = + std::pair(start, end); + } + } + + // The base start_time is the time to compute RandomUniforms + int64 cur_time = static_cast(5000000); + // The increment is the execution time of one matmul. See + // CreateGrapplerItemWithMatmulChain for details. + int64 increment = static_cast(2000000); + auto op_names = {"ab", "abc", "abcd", "abcde"}; + for (const auto& op_name : op_names) { + int64 actual_start = start_end_times[op_name].first; + int64 actual_end = start_end_times[op_name].second; + int64 expected_start = cur_time; + int64 expected_end = cur_time + increment; + EXPECT_EQ(expected_start, actual_start); + EXPECT_EQ(expected_end, actual_end); + cur_time += increment; + } +} + TEST_F(VirtualSchedulerTest, InitAndBasicScheduling) { // Init. CreateGrapplerItemWithConv2Ds();