Open sourced the cost analyzer

PiperOrigin-RevId: 157178951
2017-05-25 18:29:14 -07:00 · 2017-05-25 18:29:14 -07:00 · 2251633a50
commit 2251633a50
parent 3e767e9db0
12 changed files with 591 additions and 0 deletions
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@ -258,6 +258,7 @@ endif()
 # We include tf_cc_ops first, because tf_c depends on tf_cc.
 include(tf_cc_ops.cmake)
 include(tf_c.cmake)
 include(tf_grappler.cmake)
 if(tensorflow_BUILD_CC_EXAMPLE)
  include(tf_tutorials.cmake)
  include(tf_label_image_example.cmake)
--- a/tensorflow/contrib/cmake/tf_grappler.cmake
+++ b/tensorflow/contrib/cmake/tf_grappler.cmake
@ -0,0 +1,27 @@
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 ########################################################
 # tf_grappler library
 ########################################################
 # Sources that make up tf_grappler: the grappler single-machine cluster and
 # the cost analyzer that backs the Python wrapper.
 file(GLOB tf_grappler_srcs
   "${tensorflow_source_dir}/tensorflow/core/grappler/clusters/single_machine.cc"
   "${tensorflow_source_dir}/tensorflow/core/grappler/clusters/single_machine.h"
   "${tensorflow_source_dir}/tensorflow/python/grappler/cost_analyzer.cc"
   "${tensorflow_source_dir}/tensorflow/python/grappler/cost_analyzer.h"
 )
 # OBJECT library: nothing is linked here; the compiled objects are consumed by
 # other targets through $<TARGET_OBJECTS:tf_grappler>.
 add_library(tf_grappler OBJECT ${tf_grappler_srcs})
 # Make sure tf_core_cpu is built before tf_grappler.
 add_dependencies(tf_grappler tf_core_cpu)
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@ -721,6 +721,7 @@ if(WIN32)
        $<TARGET_OBJECTS:tf_cc_ops>
        $<TARGET_OBJECTS:tf_core_ops>
        $<TARGET_OBJECTS:tf_core_direct_session>
        $<TARGET_OBJECTS:tf_grappler>
        $<TARGET_OBJECTS:tf_tools_transform_graph_lib>
        $<$<BOOL:${tensorflow_ENABLE_GRPC_SUPPORT}>:$<TARGET_OBJECTS:tf_core_distributed_runtime>>
        $<TARGET_OBJECTS:tf_core_kernels>
@ -767,6 +768,7 @@ add_library(pywrap_tensorflow_internal SHARED
    $<TARGET_OBJECTS:tf_cc_ops>
    $<TARGET_OBJECTS:tf_core_ops>
    $<TARGET_OBJECTS:tf_core_direct_session>
    $<TARGET_OBJECTS:tf_grappler>
    $<TARGET_OBJECTS:tf_tools_transform_graph_lib>
    $<$<BOOL:${tensorflow_ENABLE_GRPC_SUPPORT}>:$<TARGET_OBJECTS:tf_core_distributed_runtime>>
    $<TARGET_OBJECTS:tf_core_kernels>
--- a/tensorflow/core/grappler/costs/utils.cc
+++ b/tensorflow/core/grappler/costs/utils.cc
@ -230,5 +230,60 @@ string GetOpDescription(const OpInfo& op_info) {
  return description;
 }
 // Converts the per-node statistics stored in `cost_graph` into an
 // OpPerformanceList. One entry is produced, in `graph` order, for every node
 // of `graph` that also has an entry in `cost_graph`.
 OpPerformanceList CostGraphToOpPerformanceData(const CostGraphDef& cost_graph,
                                                const GraphDef& graph) {
  // CostGraphDef::Node reports times in microseconds, while OpPerformance
  // stores them in nanoseconds.
  const int kMicrosToNanos = 1000;
  // Name-indexed views over both graphs.
  std::unordered_map<string, const CostGraphDef::Node*> name_to_cost;
  for (const auto& cost_entry : cost_graph.node()) {
    name_to_cost[cost_entry.name()] = &cost_entry;
  }
  std::unordered_map<string, const NodeDef*> name_to_node;
  for (const auto& graph_node : graph.node()) {
    name_to_node[graph_node.name()] = &graph_node;
  }
  OpPerformanceList result;
  for (const auto& node : graph.node()) {
    const auto cost_it = name_to_cost.find(node.name());
    if (cost_it == name_to_cost.end()) {
      // Nodes absent from the cost graph were not run: either they are not in
      // the intersection of the transitive fan-in of a fetch node and the
      // transitive fan-out of an input, or the optimizer removed them. They
      // do not contribute to the execution time, so they are dropped.
      continue;
    }
    const CostGraphDef::Node& cost_node = *cost_it->second;
    OpPerformance* perf = result.add_op_performance();
    perf->set_node(node.name());
    const std::vector<OpInfo::TensorProperties> inputs =
        FindInputFeatures(node, name_to_cost, name_to_node);
    *perf->mutable_op() =
        BuildOpInfo(node, cost_node.device(), name_to_node, inputs);
    perf->set_temporary_memory_size(cost_node.temporary_memory_size());
    perf->set_compute_cost(cost_node.compute_cost() * kMicrosToNanos);
    perf->set_compute_time(cost_node.compute_time() * kMicrosToNanos);
    perf->set_memory_time(cost_node.memory_time() * kMicrosToNanos);
    // Memory statistics are copied over unchanged.
    auto* op_memory = perf->mutable_op_memory();
    for (const auto& output_info : cost_node.output_info()) {
      op_memory->add_output_memory(output_info.size());
    }
    op_memory->set_host_temp_memory(cost_node.host_temp_memory_size());
    op_memory->set_device_temp_memory(cost_node.device_temp_memory_size());
    op_memory->set_host_persistent_memory(
        cost_node.host_persistent_memory_size());
    op_memory->set_device_persistent_memory(
        cost_node.device_persistent_memory_size());
  }
  return result;
 }
 }  // end namespace grappler
 }  // end namespace tensorflow
--- a/tensorflow/core/grappler/costs/utils.h
+++ b/tensorflow/core/grappler/costs/utils.h
@ -21,6 +21,7 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/core/framework/cost_graph.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/graph/types.h"
@ -56,6 +57,10 @@ OpInfo BuildOpInfo(
    const std::unordered_map<string, const NodeDef*>& name_to_node,
    const std::vector<OpInfo::TensorProperties>& inputs);
 // Gather performance data from a cost graph. Returns one OpPerformance entry
 // for each node of `graph` that is also present in `cost_graph`; nodes that
 // are missing from `cost_graph` are skipped. Compute cost, compute time and
 // memory time are converted from the cost graph's microseconds to
 // nanoseconds.
 OpPerformanceList CostGraphToOpPerformanceData(const CostGraphDef& cost_graph,
                                                const GraphDef& graph);
 }  // end namespace grappler
 }  // end namespace tensorflow
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@ -155,6 +155,22 @@ tf_py_test(
    ],
 )
 # C++ implementation of the grappler cost analyzer. It is linked into the
 # Python extension through the deps of the tf_py_wrap_cc rule in this file.
 cc_library(
    name = "cost_analyzer_lib",
    srcs = ["grappler/cost_analyzer.cc"],
    hdrs = ["grappler/cost_analyzer.h"],
    deps = [
        "//tensorflow/core:framework",
        "//tensorflow/core/grappler:grappler_item",
        "//tensorflow/core/grappler/clusters:cluster",
        "//tensorflow/core/grappler/clusters:single_machine",
        "//tensorflow/core/grappler/costs:analytical_cost_estimator",
        "//tensorflow/core/grappler/costs:measuring_cost_estimator",
        "//tensorflow/core/grappler/costs:op_performance_data_cc",
        "//tensorflow/core/grappler/costs:utils",
    ],
 )
 cc_library(
    name = "numpy_lib",
    srcs = ["lib/core/numpy.cc"],
@ -2644,6 +2660,7 @@ tf_py_wrap_cc(
        "client/tf_session.i",
        "framework/cpp_shape_inference.i",
        "framework/python_op_gen.i",
        "grappler/cost_analyzer.i",
        "grappler/tf_optimizer.i",
        "lib/core/py_func.i",
        "lib/core/strings.i",
@ -2660,6 +2677,7 @@ tf_py_wrap_cc(
        "util/transform_graph.i",
    ],
    deps = [
        ":cost_analyzer_lib",
        ":cpp_shape_inference",
        ":kernel_registry",
        ":numpy_lib",
@ -3673,3 +3691,28 @@ cuda_py_test(
        "//tensorflow/core:protos_all_py",
    ],
 )
 # Python API for the cost analyzer; a thin layer over the SWIG-generated
 # wrapper provided by :pywrap_tensorflow_internal.
 py_library(
    name = "cost_analyzer",
    srcs = [
        "grappler/cost_analyzer.py",
    ],
    srcs_version = "PY2AND3",
    deps = [":pywrap_tensorflow_internal"],
 )
 # Smoke test for the cost analyzer Python API: builds a small graph and
 # checks that the generated report contains the expected summary lines.
 py_test(
    name = "cost_analyzer_test",
    size = "small",
    srcs = ["grappler/cost_analyzer_test.py"],
    srcs_version = "PY2AND3",
    tags = ["no_pip"],
    deps = [
        ":client_testlib",
        ":cost_analyzer",
        ":framework_for_generated_wrappers",
        ":math_ops",
        "//tensorflow/core:protos_all_py",
        "//third_party/py/numpy",
    ],
 )
--- a/tensorflow/python/grappler/cost_analyzer.cc
+++ b/tensorflow/python/grappler/cost_analyzer.cc
@ -0,0 +1,224 @@
 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/python/grappler/cost_analyzer.h"
 #include <algorithm>
 #include <cmath>
 #include <iomanip>
 #include <map>
 #include <set>
 #include "tensorflow/core/grappler/costs/utils.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/lib/core/status.h"
 namespace tensorflow {
 namespace grappler {
 // Builds an analyzer for `item` whose cost estimators run against `cluster`.
 // Only the address of `item` is stored, so the caller must keep `item` alive
 // for as long as the analyzer is used. `suffix` is copied into suffix_.
 CostAnalyzer::CostAnalyzer(const GrapplerItem& item, Cluster* cluster,
                           const string& suffix)
    : item_(&item),
      // NOTE(review): 10 and 0 are presumably the number of measurement steps
      // and measurement threads -- confirm against MeasuringCostEstimator.
      measure_estimator_(cluster, 10, 0),
      // NOTE(review): the meaning of `false` is not visible from this file --
      // confirm against AnalyticalCostEstimator's constructor.
      analytical_estimator_(cluster, false),
      suffix_(suffix) {}
 // Runs the whole analysis and writes the resulting report to `os`.
 // The stages depend on each other and must run in this order. The returned
 // status is always OK: a failed cost prediction is only logged by
 // PredictCosts and is not propagated to the caller.
 Status CostAnalyzer::GenerateReport(std::ostream& os) {
  // Collect measured and analytical costs for the item.
  GatherCosts();
  // Merge the analytical times into the measured data and compute per-op
  // efficiencies.
  PreprocessCosts();
  // Aggregate per op type and compute the graph-level totals.
  AnalyzeCosts();
  PrintAnalysis(os);
  return Status::OK();
 }
 // Initializes `cost_estimator` with the analyzed item (a failure to
 // initialize is fatal) and asks it to predict the costs of the item's graph.
 // The per-node costs are written to `cost_graph` and the predicted execution
 // time to `total_time`. A failed prediction is logged, not returned.
 void CostAnalyzer::PredictCosts(CostEstimator* cost_estimator,
                                 CostGraphDef* cost_graph, int64* total_time) {
  TF_CHECK_OK(cost_estimator->Initialize(*item_));
  Costs predicted;
  const Status prediction_status =
      cost_estimator->PredictCosts(item_->graph, cost_graph, &predicted);
  // The total is stored whether or not the prediction succeeded.
  *total_time = predicted.execution_time.count();
  if (prediction_status.ok()) {
    return;
  }
  LOG(ERROR) << "Could not estimate the cost for item " << item_->id << ": "
             << prediction_status.error_message();
 }
 void CostAnalyzer::GatherCosts() {
  CostGraphDef cost_graph_measured;
  PredictCosts(&measure_estimator_, &cost_graph_measured,
               &total_time_measured_);
  VLOG(1) << "cost_graph_measured size: " << cost_graph_measured.node_size();
  op_perf_ = CostGraphToOpPerformanceData(cost_graph_measured, item_->graph);
  CostGraphDef cost_graph_analytical;
  PredictCosts(&analytical_estimator_, &cost_graph_analytical,
               &total_time_analytical_);
  VLOG(1) << "cost_graph_analytical size: "
          << cost_graph_analytical.node_size();
  CostGraphDef cost_graph_analytical_filtered;
  std::set<string> cost_nodes;
  for (auto& node : cost_graph_measured.node()) {
    cost_nodes.insert(node.name());
  }
  for (const auto& node : cost_graph_analytical.node()) {
    auto it = cost_nodes.find(node.name());
    // Filter the nodes that are not the cost nodes returned by
    // MeasuringCostEstimator.
    if (it == cost_nodes.end()) {
      continue;
    }
    auto added_node = cost_graph_analytical_filtered.add_node();
    *added_node = node;
  }
  VLOG(1) << "cost_graph_analytical_filtered size: "
          << cost_graph_analytical_filtered.node_size();
  op_perf_analytical_ = CostGraphToOpPerformanceData(
      cost_graph_analytical_filtered, item_->graph);
 }
 void CostAnalyzer::PreprocessCosts() {
  for (int i = 0; i < op_perf_.op_performance_size(); i++) {
    OpPerformance* perf = op_perf_.mutable_op_performance(i);
    const OpPerformance& analytical = op_perf_analytical_.op_performance(i);
    perf->set_compute_time(analytical.compute_time());
    perf->set_memory_time(analytical.memory_time());
    double measured_cost = perf->compute_cost();
    double analytical_compute_cost = analytical.compute_time();
    if (analytical_compute_cost == 0) {
      // Negative infinity indidates unavailable data.
      perf->set_compute_efficiency(-INFINITY);
    } else {
      perf->set_compute_efficiency(analytical_compute_cost / measured_cost);
    }
    double analytical_memory_cost = analytical.memory_time();
    if (analytical_memory_cost == 0) {
      // Negative infinity indidates unavailable data.
      perf->set_memory_efficiency(-INFINITY);
    } else {
      perf->set_memory_efficiency(analytical_memory_cost / measured_cost);
    }
  }
 }
 void CostAnalyzer::SortOpsByTime(std::map<string, OpPerfSummary> ops) {
  for (const auto& op : ops) {
    ops_.push_back(op.second);
  }
  struct CompareByTime {
    bool operator()(const OpPerfSummary& a, const OpPerfSummary& b) const {
      return a.time > b.time;
    }
  };
  std::stable_sort(ops_.begin(), ops_.end(), CompareByTime());
 }
 void CostAnalyzer::AnalyzeCosts() {
  std::map<string, OpPerfSummary> ops;
  for (const auto& op_perf : op_perf_.op_performance()) {
    string op_name = op_perf.op().op();
    ops[op_name].count++;
    ops[op_name].time += op_perf.compute_cost();
    ops[op_name].compute_time += op_perf.compute_time();
    ops[op_name].memory_time += op_perf.memory_time();
    ops[op_name].time_upper += op_perf.compute_time() + op_perf.memory_time();
    ops[op_name].time_lower +=
        std::max(op_perf.compute_time(), op_perf.memory_time());
    ops[op_name].name = op_name;
  }
  SortOpsByTime(ops);
  total_time_measured_serialized_ = 0;
  total_time_analytical_upper_ = 0;
  total_time_analytical_lower_ = 0;
  for (const auto& op : ops_) {
    total_time_measured_serialized_ += op.time;
    total_time_analytical_upper_ += op.time_upper;
    total_time_analytical_lower_ += op.time_lower;
  }
 }
 void CostAnalyzer::PrintAnalysis(std::ostream& os) const {
  os << std::endl;
  os << std::left << std::setw(50)
     << "Total time measured in ns (serialized): " << std::right
     << std::setw(20) << total_time_measured_serialized_ << std::endl;
  os << std::left << std::setw(50)
     << "Total time measured in ns (actual): " << std::right << std::setw(20)
     << total_time_measured_ << std::endl;
  os << std::left << std::setw(50)
     << "Total time analytical in ns (upper bound): " << std::right
     << std::setw(20) << total_time_analytical_upper_ << std::endl;
  os << std::left << std::setw(50)
     << "Total time analytical in ns (lower bound): " << std::right
     << std::setw(20) << total_time_analytical_lower_ << std::endl;
  double efficiency_upper = static_cast<double>(total_time_analytical_upper_) /
                            static_cast<double>(total_time_measured_);
  os << std::left << std::setw(50)
     << "Overall efficiency (analytical upper/actual): " << std::right
     << std::setw(20) << efficiency_upper << std::endl;
  double efficiency_lower = static_cast<double>(total_time_analytical_lower_) /
                            static_cast<double>(total_time_measured_);
  os << std::left << std::setw(50)
     << "Overall efficiency (analytical lower/actual): " << std::right
     << std::setw(20) << efficiency_lower << std::endl;
  os << std::endl;
  int width = 35;
  int width_narrow = 15;
  int width_wide = 20;
  os << std::setw(width + 1) << "Op,";
  os << std::setw(width_narrow + 1) << "Count,";
  os << std::setw(width_wide + 1) << "Measured time (ns),";
  os << std::setw(width_narrow + 2) << "Time percent,";
  os << std::setw(width_narrow + 2) << "Acc percent,";
  os << std::setw(width_wide + 1) << "Analytical upper,";
  os << std::setw(width_wide + 1) << "Analytical lower,";
  os << std::setw(width_narrow + 2) << "Overall eff";
  os << std::setw(width_narrow + 2) << "Compute eff";
  os << std::setw(width_narrow + 2) << "Memory eff" << std::endl;
  float acc_percent = 0;
  for (const auto& op : ops_) {
    double percent = static_cast<double>(op.time) /
                     static_cast<double>(total_time_measured_serialized_);
    double eff =
        static_cast<double>(op.time_upper) / static_cast<double>(op.time);
    double compute_eff =
        static_cast<double>(op.compute_time) / static_cast<double>(op.time);
    double memory_eff =
        static_cast<double>(op.memory_time) / static_cast<double>(op.time);
    os << std::setw(width) << op.name << ",";
    os << std::setw(width_narrow) << op.count << ",";
    os << std::setw(width_wide) << op.time << ",";
    os << std::setw(width_narrow) << std::setprecision(2) << percent * 100
       << "%,";
    acc_percent += percent;
    os << std::setw(width_narrow) << std::setprecision(2) << acc_percent * 100
       << "%,";
    os << std::setw(width_wide) << op.time_upper << ",";
    os << std::setw(width_wide) << op.time_lower << ",";
    os << std::setw(width_narrow) << std::setprecision(2) << eff * 100 << "%,";
    os << std::setw(width_narrow) << std::setprecision(2) << compute_eff * 100
       << "%,";
    os << std::setw(width_narrow) << std::setprecision(2) << memory_eff * 100
       << "%,";
    os << std::endl;
  }
  os << std::endl;
 }
 }  // end namespace grappler
 }  // end namespace tensorflow
--- a/tensorflow/python/grappler/cost_analyzer.h
+++ b/tensorflow/python/grappler/cost_analyzer.h
@ -0,0 +1,81 @@
 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 // The include guard is derived from this file's path,
 // tensorflow/python/grappler/cost_analyzer.h.
 #ifndef TENSORFLOW_PYTHON_GRAPPLER_COST_ANALYZER_H_
 #define TENSORFLOW_PYTHON_GRAPPLER_COST_ANALYZER_H_
 #include <iostream>
 #include <map>
 #include <vector>
 #include "tensorflow/core/framework/cost_graph.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/costs/analytical_cost_estimator.h"
 #include "tensorflow/core/grappler/costs/cost_estimator.h"
 #include "tensorflow/core/grappler/costs/measuring_cost_estimator.h"
 #include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 namespace tensorflow {
 class GraphDef;
 class CostGraphDef;
 namespace grappler {
 struct GrapplerItem;
 // Aggregated perf summary for ops of the same type in a graph.
 // All counters start at zero so that a default-constructed summary can be
 // accumulated into directly.
 struct OpPerfSummary {
  // Op type shared by the aggregated nodes.
  string name;
  // Number of nodes of this type.
  int64 count = 0;
  // Sum of the measured costs of the nodes.
  int64 time = 0;
  // Sums of the analytical compute and memory times of the nodes.
  int64 compute_time = 0;
  int64 memory_time = 0;
  // Upper and lower bound for estimated time.
  int64 time_upper = 0;
  int64 time_lower = 0;
 };
 // Generate op-level performance insights on compute/memory
 // efficiency, as well as graph-level aggregated performance statistics.
 class CostAnalyzer {
 public:
  // `item` is stored by address and must outlive the analyzer. `cluster` is
  // handed to the measuring and analytical cost estimators.
  explicit CostAnalyzer(const GrapplerItem& item, Cluster* cluster,
                        const string& suffix);
  // Gathers, analyzes and prints the costs of the item to `os`.
  Status GenerateReport(std::ostream& os);
 private:
  // Runs `cost_estimator` on the item; outputs the per-node costs in
  // `cost_graph` and the predicted execution time in `total_time`.
  void PredictCosts(CostEstimator* cost_estimator, CostGraphDef* cost_graph,
                    int64* total_time);
  // Fills op_perf_ (measured) and op_perf_analytical_ (analytical).
  void GatherCosts();
  // Merges the analytical times into op_perf_ and computes efficiencies.
  void PreprocessCosts();
  // Aggregates op_perf_ per op type into ops_ and computes the totals.
  void AnalyzeCosts();
  // Stores the summaries of `ops` in ops_, sorted by decreasing time.
  void SortOpsByTime(std::map<string, OpPerfSummary> ops);
  // Writes the report to `os`.
  void PrintAnalysis(std::ostream& os) const;
  const GrapplerItem* item_;
  MeasuringCostEstimator measure_estimator_;
  AnalyticalCostEstimator analytical_estimator_;
  OpPerformanceList op_perf_;
  OpPerformanceList op_perf_analytical_;
  // Totals are zero until the corresponding analysis stage has run.
  int64 total_time_measured_ = 0;
  int64 total_time_analytical_ = 0;
  std::vector<OpPerfSummary> ops_;
  int64 total_time_measured_serialized_ = 0;
  int64 total_time_analytical_upper_ = 0;
  int64 total_time_analytical_lower_ = 0;
  string suffix_;
 };
 }  // end namespace grappler
 }  // end namespace tensorflow
 #endif  // TENSORFLOW_PYTHON_GRAPPLER_COST_ANALYZER_H_
--- a/tensorflow/python/grappler/cost_analyzer.i
+++ b/tensorflow/python/grappler/cost_analyzer.i
@ -0,0 +1,67 @@
 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 %include "tensorflow/python/lib/core/strings.i"
 %include "tensorflow/python/platform/base.i"
 // Input typemap: converts a Python bytes object holding a serialized
 // MetaGraphDef into a `const tensorflow::MetaGraphDef&` argument. The proto
 // is parsed into the typemap-local `temp`, whose address is passed on to the
 // wrapped function. A Python error is raised if the argument is not a bytes
 // object or if it cannot be parsed.
 %typemap(in) const tensorflow::MetaGraphDef& (tensorflow::MetaGraphDef temp) {
  char* c_string;
  Py_ssize_t py_size;
  if (PyBytes_AsStringAndSize($input, &c_string, &py_size) == -1) {
    // Python has raised an error (likely TypeError or UnicodeEncodeError).
    SWIG_fail;
  }
  // The explicit size keeps embedded NUL bytes of the serialized proto.
  if (!temp.ParseFromString(string(c_string, py_size))) {
    PyErr_SetString(
        PyExc_TypeError,
        "The MetaGraphDef could not be parsed as a valid protocol buffer");
    SWIG_fail;
  }
  $1 = &temp;
 }
 %{
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/clusters/single_machine.h"
 #include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/grappler_item_builder.h"
 #include "tensorflow/python/grappler/cost_analyzer.h"
 %}
 %{
 // Builds a grappler item from `metagraph`, runs the cost analyzer on it
 // against a cluster made of the local machine, and returns the textual
 // report. If the item cannot be built or the report cannot be generated, a
 // string starting with "Error:" is returned instead.
 string GenerateCostReport(const tensorflow::MetaGraphDef& metagraph) {
  tensorflow::grappler::ItemConfig cfg;
  std::unique_ptr<tensorflow::grappler::GrapplerItem> item =
      tensorflow::grappler::GrapplerItemFromMetaGraphDef("metagraph", metagraph, cfg);
  if (!item) {
    // No item could be built from the metagraph; dereferencing the empty
    // pointer below would crash the Python process.
    return "Error: failed to preprocess metagraph: check your log file for errors";
  }
  // TODO(bsteiner): we should wrap the tf session instead to properly handle the case of a
  // distributed setup.
  const int timeout_s = 3600;
  int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
  int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
  tensorflow::grappler::SingleMachine cluster(timeout_s, num_cpu_cores, num_gpus);
  string suffix;
  tensorflow::grappler::CostAnalyzer analyzer(*item, &cluster, suffix);
  std::stringstream os;
  const tensorflow::Status status = analyzer.GenerateReport(os);
  if (!status.ok()) {
    return "Error: failed to generate the cost report: " + status.error_message();
  }
  return os.str();
 }
 %}
 // Expose GenerateCostReport to Python.
 string GenerateCostReport(const tensorflow::MetaGraphDef& metagraph);
--- a/tensorflow/python/grappler/cost_analyzer.py
+++ b/tensorflow/python/grappler/cost_analyzer.py
@ -0,0 +1,29 @@
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =============================================================================
 """Provides a proper python API for the symbols exported through swig."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 from tensorflow.python import pywrap_tensorflow as tf_wrap
 from tensorflow.python.framework import errors
 def GenerateCostReport(metagraph):
  """Analyze the cost of each TensorFlow operation in the provided metagraph.

  Args:
    metagraph: The metagraph to analyze. It is serialized with
      `SerializeToString()` before being handed to the wrapped C++ function.

  Returns:
    The cost report produced by the wrapped `GenerateCostReport` function.
  """
  serialized_metagraph = metagraph.SerializeToString()
  with errors.raise_exception_on_not_ok_status():
    return tf_wrap.GenerateCostReport(serialized_metagraph)
--- a/tensorflow/python/grappler/cost_analyzer_test.py
+++ b/tensorflow/python/grappler/cost_analyzer_test.py
@ -0,0 +1,56 @@
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 """Tests for the cost analyzer."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.grappler import cost_analyzer
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 class PyWrapOptimizeGraphTest(test.TestCase):
  """Smoke test for cost_analyzer.GenerateCostReport."""
  def testBasic(self):
    """Make sure a report with all the summary lines is generated."""
    a = constant_op.constant(10, name="a")
    b = constant_op.constant(20, name="b")
    c = math_ops.add_n([a, b], name="c")
    d = math_ops.add_n([b, c], name="d")
    # Register `d` in the TRAIN_OP collection of the default graph.
    train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
    train_op.append(d)
    mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
    report = cost_analyzer.GenerateCostReport(mg)
    # Check the report headers. assertIn reports the missing header and the
    # report itself on failure, unlike assertTrue(x in y).
    expected_headers = (
        b"Total time measured in ns (serialized):",
        b"Total time measured in ns (actual):",
        b"Total time analytical in ns (upper bound):",
        b"Total time analytical in ns (lower bound):",
        b"Overall efficiency (analytical upper/actual):",
        b"Overall efficiency (analytical lower/actual):",
    )
    for header in expected_headers:
      self.assertIn(header, report)
    # Also print the report to make it easier to debug. Decode it first so
    # that Python 3 prints the table rather than an escaped bytes literal.
    print(report.decode("utf-8") if isinstance(report, bytes) else report)
 if __name__ == "__main__":
  test.main()
--- a/tensorflow/python/tensorflow.i
+++ b/tensorflow/python/tensorflow.i
@ -42,3 +42,4 @@ limitations under the License.
 %include "tensorflow/python/util/transform_graph.i"
 %include "tensorflow/python/grappler/tf_optimizer.i"
 %include "tensorflow/python/grappler/cost_analyzer.i"