Make virtual scheduler potentially write summary information to stepstats_ variable.

PiperOrigin-RevId: 159039540
This commit is contained in:
A. Unique TensorFlower 2017-06-14 16:04:58 -07:00 committed by TensorFlower Gardener
parent e16d717e0b
commit f386c88501
3 changed files with 159 additions and 8 deletions

View File

@ -244,8 +244,8 @@ string VirtualScheduler::DeviceName(const NodeDef* node) const {
string VirtualScheduler::ChannelDeviceName(const NodeDef* from, string VirtualScheduler::ChannelDeviceName(const NodeDef* from,
const NodeDef* to) const { const NodeDef* to) const {
CHECK(!initialized_) << "ChannelDeviceName is called after Init()."; CHECK(!initialized_) << "ChannelDeviceName is called after Init().";
return kChannelDevice + ": from " + DeviceName(from) + " to " +
return kChannelDevice + ": " + DeviceName(from) + " to " + DeviceName(to); DeviceName(to);
} }
std::pair<const NodeDef*, const NodeDef*> VirtualScheduler::CreateSendRecv( std::pair<const NodeDef*, const NodeDef*> VirtualScheduler::CreateSendRecv(
@ -318,8 +318,8 @@ NodeInfo VirtualScheduler::GetCurrNodeInfo() const {
} }
// Construct NodeInfo. // Construct NodeInfo.
const auto& node_state = node_map_.at(node);
NodeInfo node_info; NodeInfo node_info;
const auto& node_state = node_map_.at(node);
node_info.name = node->name(); node_info.name = node->name();
node_info.device_name = node_state.device_name; node_info.device_name = node_state.device_name;
auto& op_info = node_info.op_info; auto& op_info = node_info.op_info;
@ -577,6 +577,7 @@ Costs VirtualScheduler::Summary() const {
<< " GB, at the end: " << state.memory_usage << " B"; << " GB, at the end: " << state.memory_usage << " B";
VLOG(1) << "Per-op execution time (and memory usage at peak memory usage):"; VLOG(1) << "Per-op execution time (and memory usage at peak memory usage):";
// Profile non-persistent op memory usage. // Profile non-persistent op memory usage.
for (const auto& node_port : state.mem_usage_snapshot_at_peak) { for (const auto& node_port : state.mem_usage_snapshot_at_peak) {
const auto* node = node_port.first; const auto* node = node_port.first;
@ -617,5 +618,30 @@ Costs VirtualScheduler::Summary() const {
return critical_path_costs; return critical_path_costs;
} }
Costs VirtualScheduler::Summary(StepStats* stepstats) {
  // Like the const Summary() overload, but first writes per-device,
  // per-node execution timing into `stepstats` when it is non-null.
  // Returns the same critical-path costs as Summary().
  if (stepstats != nullptr) {
    for (const auto& device : device_) {
      DeviceStepStats* device_stepstats = stepstats->add_dev_stats();
      device_stepstats->set_device(device.first);
      for (const auto& node_def : device.second.nodes_executed) {
        const NodeState& nodestate = node_map_.at(node_def);
        NodeExecStats* node_stats = device_stepstats->add_node_stats();
        // NOTE: node_name carries the op type; the node's actual name is
        // stored in timeline_label (consumers read it from there).
        node_stats->set_node_name(node_def->op());
        node_stats->set_timeline_label(node_def->name());
        // Convert the scheduled/finished times once instead of repeating
        // the chrono conversion for each proto field.
        const auto start_micros =
            nodestate.time_scheduled.asMicroSeconds().count();
        const auto duration_micros =
            nodestate.time_finished.asMicroSeconds().count() - start_micros;
        // The simulated op runs for its entire scheduled window.
        node_stats->set_op_start_rel_micros(0);
        node_stats->set_all_start_micros(start_micros);
        node_stats->set_op_end_rel_micros(duration_micros);
        node_stats->set_all_end_rel_micros(duration_micros);
      }
    }
  }
  return Summary();
}
} // end namespace grappler } // end namespace grappler
} // end namespace tensorflow } // end namespace tensorflow

View File

@ -21,6 +21,7 @@ limitations under the License.
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/grappler/costs/cost_estimator.h" #include "tensorflow/core/grappler/costs/cost_estimator.h"
#include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/grappler/costs/graph_properties.h"
#include "tensorflow/core/grappler/costs/virtual_placer.h" #include "tensorflow/core/grappler/costs/virtual_placer.h"
@ -170,6 +171,9 @@ class VirtualScheduler {
// Prints out summary of execution (timing, memory usage, etc.) // Prints out summary of execution (timing, memory usage, etc.)
Costs Summary() const; Costs Summary() const;
// Like the above, but also writes detailed per-node execution stats into
// *stepstats. If stepstats is nullptr, this just calls and returns Summary().
Costs Summary(StepStats* stepstats);
protected: protected:
// GetDeviceStates and GetNodeStates are currently for testing purposes only. // GetDeviceStates and GetNodeStates are currently for testing purposes only.
@ -228,6 +232,7 @@ class VirtualScheduler {
// Auxilliary data structures for constructing NodeState and DeviceState. // Auxilliary data structures for constructing NodeState and DeviceState.
GraphProperties graph_properties_; GraphProperties graph_properties_;
Cluster* cluster_; // Not owned. Cluster* cluster_; // Not owned.
const GrapplerItem* grappler_item_; // Not owned. const GrapplerItem* grappler_item_; // Not owned.
bool use_static_shapes_; bool use_static_shapes_;
bool initialized_; bool initialized_;

View File

@ -14,7 +14,6 @@ limitations under the License.
==============================================================================*/ ==============================================================================*/
#include "tensorflow/core/grappler/costs/virtual_scheduler.h" #include "tensorflow/core/grappler/costs/virtual_scheduler.h"
#include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/cc/ops/standard_ops.h"
#include "tensorflow/core/grappler/clusters/virtual_cluster.h" #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
#include "tensorflow/core/grappler/costs/virtual_placer.h" #include "tensorflow/core/grappler/costs/virtual_placer.h"
@ -41,13 +40,28 @@ class VirtualSchedulerTest : public ::testing::Test {
protected: protected:
const string kCPU0 = "/job:localhost/replica:0/task:0/cpu:0"; const string kCPU0 = "/job:localhost/replica:0/task:0/cpu:0";
DeviceProperties GetDummyCPUDevice() {
  // Dummy CPU device: 2 cores at 4 GHz (-> 8 Gflops) with
  // 2 GB/s memory bandwidth.
  DeviceProperties cpu;
  cpu.set_type("CPU");
  cpu.set_num_cores(2);
  cpu.set_frequency(4000);
  cpu.set_bandwidth(2000000);
  return cpu;
}
void SetUp() override { void SetUp() override {
// Initializes cluster_ and placer_. // Initializes cluster_ and placer_.
std::unordered_map<string, DeviceProperties> devices; std::unordered_map<string, DeviceProperties> devices;
DeviceProperties cpu_device;
cpu_device.set_type("CPU");
devices[kCPU0] = cpu_device;
// Set some dummy CPU properties
DeviceProperties cpu_device = GetDummyCPUDevice();
// IMPORTANT: Device is not actually ever used in the test case since
// force_cpu_type is defaulted to "Haswell"
devices[kCPU0] = cpu_device;
cluster_.reset(new VirtualCluster(devices)); cluster_.reset(new VirtualCluster(devices));
placer_.reset(new VirtualPlacer(cluster_.get())); placer_.reset(new VirtualPlacer(cluster_.get()));
} }
@ -102,6 +116,38 @@ class VirtualSchedulerTest : public ::testing::Test {
dependency_["y"] = {"x", "f"}; dependency_["y"] = {"x", "f"};
} }
void CreateGrapplerItemWithMatmulChain() {
tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice(kCPU0);
auto a = tensorflow::ops::RandomUniform(s.WithOpName("a"), {3200, 3200},
DT_FLOAT);
auto b = tensorflow::ops::RandomUniform(s.WithOpName("b"), {3200, 3200},
DT_FLOAT);
auto c = tensorflow::ops::RandomUniform(s.WithOpName("c"), {3200, 3200},
DT_FLOAT);
auto d = tensorflow::ops::RandomUniform(s.WithOpName("d"), {3200, 3200},
DT_FLOAT);
auto e = tensorflow::ops::RandomUniform(s.WithOpName("e"), {3200, 3200},
DT_FLOAT);
auto ab = tensorflow::ops::MatMul(s.WithOpName("ab"), a, b);
auto abc = tensorflow::ops::MatMul(s.WithOpName("abc"), ab, c);
auto abcd = tensorflow::ops::MatMul(s.WithOpName("abcd"), abc, d);
auto abcde = tensorflow::ops::MatMul(s.WithOpName("abcde"), abcd, e);
GraphDef def;
TF_CHECK_OK(s.ToGraphDef(&def));
grappler_item_.reset(new GrapplerItem);
grappler_item_->id = "test_matmul_sequence_graph";
grappler_item_->graph = def;
grappler_item_->fetch = {"abcde"};
dependency_["ab"] = {"a", "b"};
dependency_["abc"] = {"ab", "c"};
dependency_["abcd"] = {"abc", "d"};
dependency_["abcde"] = {"abcd", "e"};
}
// AddN that takes 4 tensors with 10x10x10x10. // AddN that takes 4 tensors with 10x10x10x10.
void CreateGrapplerItemWithAddN() { void CreateGrapplerItemWithAddN() {
tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice(kCPU0); tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice(kCPU0);
@ -201,6 +247,20 @@ class VirtualSchedulerTest : public ::testing::Test {
TF_CHECK_OK(scheduler_->Init()); TF_CHECK_OK(scheduler_->Init());
} }
// Returns a fixed cost based on the op type: 2s per MatMul, 1s per
// RandomUniform, zero for anything else.
Costs SimplePredictCosts(const NodeInfo& info) const {
  const auto& op_type = info.op_info.op();
  int64 exec_nanos = 0;
  if (op_type == "MatMul") {
    exec_nanos = 2000000000;
  } else if (op_type == "RandomUniform") {
    exec_nanos = 1000000000;
  }
  Costs costs;
  costs.execution_time = Costs::NanoSeconds(exec_nanos);
  return costs;
}
// Call this after init scheduler_. Scheduler stops after executing // Call this after init scheduler_. Scheduler stops after executing
// target_node. // target_node.
std::unordered_map<string, NodeInfo> RunScheduler(const string& target_node) { std::unordered_map<string, NodeInfo> RunScheduler(const string& target_node) {
@ -211,6 +271,8 @@ class VirtualSchedulerTest : public ::testing::Test {
NodeInfo node_info = scheduler_->GetCurrNodeInfo(); NodeInfo node_info = scheduler_->GetCurrNodeInfo();
ops_executed[node_info.name] = node_info; ops_executed[node_info.name] = node_info;
Costs node_costs = SimplePredictCosts(node_info);
// Check scheduling order. // Check scheduling order.
auto it = dependency_.find(node_info.name); auto it = dependency_.find(node_info.name);
if (it != dependency_.end()) { if (it != dependency_.end()) {
@ -218,7 +280,7 @@ class VirtualSchedulerTest : public ::testing::Test {
EXPECT_GT(ops_executed.count(preceding_node), 0); EXPECT_GT(ops_executed.count(preceding_node), 0);
} }
} }
more_nodes = scheduler_->MarkCurrNodeExecuted(zero_costs); more_nodes = scheduler_->MarkCurrNodeExecuted(node_costs);
if (node_info.name == target_node) { if (node_info.name == target_node) {
// Scheduler has the state after executing the target node. // Scheduler has the state after executing the target node.
@ -312,6 +374,64 @@ class VirtualSchedulerTest : public ::testing::Test {
const int depth_out_ = 16; const int depth_out_ = 16;
}; };
// Create a small graph, run predict costs on it, and make sure the costs
// reported by Summary() match the hand-calculated total.
TEST_F(VirtualSchedulerTest, SummaryCostTest) {
  // Build and schedule the MatMul chain to completion.
  CreateGrapplerItemWithMatmulChain();
  InitScheduler();
  auto ops_executed = RunScheduler("");

  const Costs summary_costs = scheduler_->Summary();
  // Hand-computed total: 5 RandomUniform @ 1s + 4 MatMul @ 2s = 13s.
  EXPECT_EQ(13000000, summary_costs.execution_time.asMicroSeconds().count());
}
// Like the above SummaryCostTest, but also checks that the StepStats
// timeline produced by Summary(StepStats*) is correct.
TEST_F(VirtualSchedulerTest, SummaryCostStepStatsTest) {
  // Build and schedule the MatMul chain to completion.
  CreateGrapplerItemWithMatmulChain();
  InitScheduler();
  auto ops_executed = RunScheduler("");
  StepStats stepstats;
  Costs c = scheduler_->Summary(&stepstats);
  // 5 RandomUniform @ 1s + 4 MatMul @ 2s = 13s total.
  EXPECT_EQ(13000000, c.execution_time.asMicroSeconds().count());

  // The whole graph runs on kCPU0, so there should be exactly one device.
  EXPECT_EQ(1, stepstats.dev_stats().size());

  // Create a map of op name -> start and end times (micros).
  std::map<string, std::pair<int64, int64>> start_end_times;
  for (const auto& device_step_stats : stepstats.dev_stats()) {
    for (const auto& stats : device_step_stats.node_stats()) {
      // The node name is actually in the timeline_label.
      const int64 start = stats.all_start_micros();
      const int64 end = start + stats.all_end_rel_micros();
      start_end_times[stats.timeline_label()] =
          std::pair<int64, int64>(start, end);
    }
  }

  // The base start time is when all five RandomUniforms have finished
  // (5 x 1s). No cast needed: the literals convert exactly to int64.
  int64 cur_time = 5000000;
  // The increment is the execution time of one MatMul. See
  // CreateGrapplerItemWithMatmulChain for details.
  const int64 increment = 2000000;
  const auto op_names = {"ab", "abc", "abcd", "abcde"};
  for (const auto& op_name : op_names) {
    const int64 actual_start = start_end_times[op_name].first;
    const int64 actual_end = start_end_times[op_name].second;
    EXPECT_EQ(cur_time, actual_start);
    EXPECT_EQ(cur_time + increment, actual_end);
    cur_time += increment;
  }
}
TEST_F(VirtualSchedulerTest, InitAndBasicScheduling) { TEST_F(VirtualSchedulerTest, InitAndBasicScheduling) {
// Init. // Init.
CreateGrapplerItemWithConv2Ds(); CreateGrapplerItemWithConv2Ds();