From f386c885017188448062db1c37069ea47e86b55e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 14 Jun 2017 16:04:58 -0700 Subject: [PATCH] Make virtual scheduler potentially write summary information to stepstats_ variable. PiperOrigin-RevId: 159039540 --- .../core/grappler/costs/virtual_scheduler.cc | 32 ++++- .../core/grappler/costs/virtual_scheduler.h | 5 + .../grappler/costs/virtual_scheduler_test.cc | 130 +++++++++++++++++- 3 files changed, 159 insertions(+), 8 deletions(-) diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc index c68d4e31c46..e5073d63017 100644 --- a/tensorflow/core/grappler/costs/virtual_scheduler.cc +++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc @@ -244,8 +244,8 @@ string VirtualScheduler::DeviceName(const NodeDef* node) const { string VirtualScheduler::ChannelDeviceName(const NodeDef* from, const NodeDef* to) const { CHECK(!initialized_) << "ChannelDeviceName is called after Init()."; - - return kChannelDevice + ": " + DeviceName(from) + " to " + DeviceName(to); + return kChannelDevice + ": from " + DeviceName(from) + " to " + + DeviceName(to); } std::pair VirtualScheduler::CreateSendRecv( @@ -318,8 +318,8 @@ NodeInfo VirtualScheduler::GetCurrNodeInfo() const { } // Construct NodeInfo. - const auto& node_state = node_map_.at(node); NodeInfo node_info; + const auto& node_state = node_map_.at(node); node_info.name = node->name(); node_info.device_name = node_state.device_name; auto& op_info = node_info.op_info; @@ -577,6 +577,7 @@ Costs VirtualScheduler::Summary() const { << " GB, at the end: " << state.memory_usage << " B"; VLOG(1) << "Per-op execution time (and memory usage at peak memory usage):"; + // Profile non-persistent op memory usage. 
for (const auto& node_port : state.mem_usage_snapshot_at_peak) { const auto* node = node_port.first; @@ -617,5 +618,30 @@ Costs VirtualScheduler::Summary() const { return critical_path_costs; } +Costs VirtualScheduler::Summary(StepStats* stepstats) { + if (stepstats != nullptr) { + for (const auto& device : device_) { + DeviceStepStats* device_stepstats = stepstats->add_dev_stats(); + device_stepstats->set_device(device.first); + for (const auto& node_def : device.second.nodes_executed) { + const NodeState& nodestate = node_map_.at(node_def); + NodeExecStats* node_stats = device_stepstats->add_node_stats(); + node_stats->set_node_name(node_def->op()); + node_stats->set_timeline_label(node_def->name()); + node_stats->set_op_start_rel_micros(0); + node_stats->set_all_start_micros( + nodestate.time_scheduled.asMicroSeconds().count()); + node_stats->set_op_end_rel_micros( + nodestate.time_finished.asMicroSeconds().count() - + nodestate.time_scheduled.asMicroSeconds().count()); + node_stats->set_all_end_rel_micros( + nodestate.time_finished.asMicroSeconds().count() - + nodestate.time_scheduled.asMicroSeconds().count()); + } + } + } + return Summary(); +} + } // end namespace grappler } // end namespace tensorflow diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h index 1331e2ec8be..7ee3310e2f1 100644 --- a/tensorflow/core/grappler/costs/virtual_scheduler.h +++ b/tensorflow/core/grappler/costs/virtual_scheduler.h @@ -21,6 +21,7 @@ limitations under the License. #include #include +#include "tensorflow/core/framework/step_stats.pb.h" #include "tensorflow/core/grappler/costs/cost_estimator.h" #include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/grappler/costs/virtual_placer.h" @@ -170,6 +171,9 @@ class VirtualScheduler { // Prints out summary of execution (timing, memory usage, etc.) Costs Summary() const; + // Like the above, but writes detailed stats to stepstats. 
+ // If stepstats is nullptr, then just calls and returns Summary(). + Costs Summary(StepStats* stepstats); protected: // GetDeviceStates and GetNodeStates are currently for testing purpuse only. @@ -228,6 +232,7 @@ class VirtualScheduler { // Auxilliary data structures for constructing NodeState and DeviceState. GraphProperties graph_properties_; Cluster* cluster_; // Not owned. + const GrapplerItem* grappler_item_; // Not owned. bool use_static_shapes_; bool initialized_; diff --git a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc index 5af9effde04..c630f60311d 100644 --- a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc +++ b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc @@ -14,7 +14,6 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/grappler/costs/virtual_scheduler.h" - #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/core/grappler/clusters/virtual_cluster.h" #include "tensorflow/core/grappler/costs/virtual_placer.h" @@ -41,13 +40,28 @@ class VirtualSchedulerTest : public ::testing::Test { protected: const string kCPU0 = "/job:localhost/replica:0/task:0/cpu:0"; + DeviceProperties GetDummyCPUDevice() { + // Create CPU with 2 cores, 4 GHz freq, 2 GB/s mem bandwidth. + // - 8 Gflops + // - 2 GB/s + DeviceProperties cpu_device; + cpu_device.set_type("CPU"); + cpu_device.set_frequency(4000); + cpu_device.set_num_cores(2); + cpu_device.set_bandwidth(2000000); + return cpu_device; + } + void SetUp() override { // Initializes cluster_ and placer_. 
std::unordered_map devices; - DeviceProperties cpu_device; - cpu_device.set_type("CPU"); - devices[kCPU0] = cpu_device; + // Set some dummy CPU properties + DeviceProperties cpu_device = GetDummyCPUDevice(); + + // IMPORTANT: Device is not actually ever used in the test case since + // force_cpu_type is defaulted to "Haswell" + devices[kCPU0] = cpu_device; cluster_.reset(new VirtualCluster(devices)); placer_.reset(new VirtualPlacer(cluster_.get())); } @@ -102,6 +116,38 @@ class VirtualSchedulerTest : public ::testing::Test { dependency_["y"] = {"x", "f"}; } + void CreateGrapplerItemWithMatmulChain() { + tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice(kCPU0); + auto a = tensorflow::ops::RandomUniform(s.WithOpName("a"), {3200, 3200}, + DT_FLOAT); + auto b = tensorflow::ops::RandomUniform(s.WithOpName("b"), {3200, 3200}, + DT_FLOAT); + auto c = tensorflow::ops::RandomUniform(s.WithOpName("c"), {3200, 3200}, + DT_FLOAT); + auto d = tensorflow::ops::RandomUniform(s.WithOpName("d"), {3200, 3200}, + DT_FLOAT); + auto e = tensorflow::ops::RandomUniform(s.WithOpName("e"), {3200, 3200}, + DT_FLOAT); + + auto ab = tensorflow::ops::MatMul(s.WithOpName("ab"), a, b); + auto abc = tensorflow::ops::MatMul(s.WithOpName("abc"), ab, c); + auto abcd = tensorflow::ops::MatMul(s.WithOpName("abcd"), abc, d); + auto abcde = tensorflow::ops::MatMul(s.WithOpName("abcde"), abcd, e); + + GraphDef def; + TF_CHECK_OK(s.ToGraphDef(&def)); + + grappler_item_.reset(new GrapplerItem); + grappler_item_->id = "test_matmul_sequence_graph"; + grappler_item_->graph = def; + grappler_item_->fetch = {"abcde"}; + + dependency_["ab"] = {"a", "b"}; + dependency_["abc"] = {"ab", "c"}; + dependency_["abcd"] = {"abc", "d"}; + dependency_["abcde"] = {"abcd", "e"}; + } + // AddN that takes 4 tensors with 10x10x10x10. 
void CreateGrapplerItemWithAddN() { tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice(kCPU0); @@ -201,6 +247,20 @@ class VirtualSchedulerTest : public ::testing::Test { TF_CHECK_OK(scheduler_->Init()); } + // Returns cost based on op. + Costs SimplePredictCosts(const NodeInfo& info) const { + Costs c; + int64 exec_cost = 0; + if (info.op_info.op() == "MatMul") { + exec_cost = 2000000000; + } + if (info.op_info.op() == "RandomUniform") { + exec_cost = 1000000000; + } + c.execution_time = Costs::NanoSeconds(exec_cost); + return c; + } + // Call this after init scheduler_. Scheduler stops after executing // target_node. std::unordered_map RunScheduler(const string& target_node) { @@ -211,6 +271,8 @@ class VirtualSchedulerTest : public ::testing::Test { NodeInfo node_info = scheduler_->GetCurrNodeInfo(); ops_executed[node_info.name] = node_info; + Costs node_costs = SimplePredictCosts(node_info); + // Check scheduling order. auto it = dependency_.find(node_info.name); if (it != dependency_.end()) { @@ -218,7 +280,7 @@ class VirtualSchedulerTest : public ::testing::Test { EXPECT_GT(ops_executed.count(preceding_node), 0); } } - more_nodes = scheduler_->MarkCurrNodeExecuted(zero_costs); + more_nodes = scheduler_->MarkCurrNodeExecuted(node_costs); if (node_info.name == target_node) { // Scheduler has the state after executing the target node. @@ -312,6 +374,64 @@ class VirtualSchedulerTest : public ::testing::Test { const int depth_out_ = 16; }; +// Create small graph, run predict costs on it, make sure the costs from the +// summary match the hand-calculated costs. +TEST_F(VirtualSchedulerTest, SummaryCostTest) { + // Run matmul test. 
+ CreateGrapplerItemWithMatmulChain(); + InitScheduler(); + auto ops_executed = RunScheduler(""); + Costs c = scheduler_->Summary(); + + // RandomUniform - 5 + // Matmuls - 4 * 2 = 8 + // Total: 13 + EXPECT_EQ(13000000, c.execution_time.asMicroSeconds().count()); +} + +// Like the above SummaryCostTest, but makes sure the stepstats timeline is +// correct. +TEST_F(VirtualSchedulerTest, SummaryCostStepStatsTest) { + // Run matmul test. + CreateGrapplerItemWithMatmulChain(); + InitScheduler(); + auto ops_executed = RunScheduler(""); + StepStats stepstats; + Costs c = scheduler_->Summary(&stepstats); + EXPECT_EQ(13000000, c.execution_time.asMicroSeconds().count()); + + // Should only be 1 device! + EXPECT_EQ(1, stepstats.dev_stats().size()); + + // Create a map of op name -> start and end times (micros). + std::map> start_end_times; + for (const auto& device_step_stats : stepstats.dev_stats()) { + for (const auto& stats : device_step_stats.node_stats()) { + // The node name is actually in the timeline_label. + int64 start = stats.all_start_micros(); + int64 end = start + stats.all_end_rel_micros(); + start_end_times[stats.timeline_label()] = + std::pair(start, end); + } + } + + // The base start_time is the time to compute RandomUniforms + int64 cur_time = static_cast(5000000); + // The increment is the execution time of one matmul. See + // CreateGrapplerItemWithMatmulChain for details. + int64 increment = static_cast(2000000); + auto op_names = {"ab", "abc", "abcd", "abcde"}; + for (const auto& op_name : op_names) { + int64 actual_start = start_end_times[op_name].first; + int64 actual_end = start_end_times[op_name].second; + int64 expected_start = cur_time; + int64 expected_end = cur_time + increment; + EXPECT_EQ(expected_start, actual_start); + EXPECT_EQ(expected_end, actual_end); + cur_time += increment; + } +} + TEST_F(VirtualSchedulerTest, InitAndBasicScheduling) { // Init. CreateGrapplerItemWithConv2Ds();