diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc
index 22ccf5208c1..b9ed7c0590c 100644
--- a/tensorflow/core/grappler/clusters/single_machine.cc
+++ b/tensorflow/core/grappler/clusters/single_machine.cc
@@ -36,6 +36,8 @@ SingleMachine::SingleMachine(int timeout_s, int num_cpu_cores, int num_gpus)
       num_gpus_(num_gpus),
       expected_init_time_s_(0),
       closing_(false) {
+  VLOG(1) << "Number of CPU cores: " << num_cpu_cores
+          << " Number of GPUs: " << num_gpus;
   thread_pool_.reset(new thread::ThreadPool(
       Env::Default(), SanitizeThreadSuffix("single_machine"), 2));
 
@@ -73,9 +75,12 @@ Status SingleMachine::Provision() {
   DeviceProperties attr = GetLocalCPUInfo();
   devices_["/job:localhost/replica:0/task:0/cpu:0"] = GetLocalCPUInfo();
 
+  VLOG(1) << "Number of GPUs: " << num_gpus_;
   for (int i = 0; i < num_gpus_; ++i) {
-    devices_[strings::StrCat("/job:localhost/replica:0/task:0/gpu:", i)] =
-        GetLocalGPUInfo(i);
+    string device_name =
+        strings::StrCat("/job:localhost/replica:0/task:0/gpu:", i);
+    VLOG(1) << "Adding GPU device " << device_name;
+    devices_[device_name] = GetLocalGPUInfo(i);
   }
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
index e4a0d6f1b86..8fd1801863a 100644
--- a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
@@ -101,6 +101,7 @@ Status MeasuringCostEstimator::PredictCosts(const GraphDef& optimized_graph,
   }
 
   // Run "measurement_steps_" and measure the time.
+  VLOG(1) << "Number of measurement steps: " << measurement_steps_;
   if (measurement_threads_ > 0) {
     for (int i = 0; i < measurement_steps_; ++i) {
       thread_pool_->Schedule([i, &measurement_fn]() { measurement_fn(i); });
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index d8b8a12eb29..ba6686e7df9 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -314,6 +314,8 @@ std::pair<double, double> OpLevelCostEstimator::GetDeviceInfo(
       bandwidth = 100;
     }
   }
+  VLOG(1) << "Device: " << device.type() << " GFLOPS: " << gflops
+          << " Bandwidth: " << bandwidth;
 
   return std::make_pair(gflops, bandwidth);
 }
@@ -461,7 +463,7 @@ int64 OpLevelCostEstimator::CountConv2DOperations(
   ops *= conv_dims.kx * conv_dims.ky;
   ops *= conv_dims.iz * conv_dims.oz;
   ops *= kOpsPerMac;
-  VLOG(1) << "Operations for Conv2D" << ops;
+  VLOG(1) << "Operations for Conv2D " << ops;
 
   if (conv_info != nullptr) {
     *conv_info = conv_dims;
@@ -679,7 +681,7 @@ int64 OpLevelCostEstimator::CountConv2DBackPropInputOperations(
   ops *= conv_dims.iz * conv_dims.oz;
   ops *= kOpsPerMac;
 
-  VLOG(1) << "Operations for Conv2DBackPropInput" << ops;
+  VLOG(1) << "Operations for Conv2DBackPropInput " << ops;
 
   if (returned_conv_dims != nullptr) {
     *returned_conv_dims = conv_dims;
diff --git a/tensorflow/core/grappler/costs/virtual_placer.cc b/tensorflow/core/grappler/costs/virtual_placer.cc
index 0291bd04909..a2d463e7652 100644
--- a/tensorflow/core/grappler/costs/virtual_placer.cc
+++ b/tensorflow/core/grappler/costs/virtual_placer.cc
@@ -36,6 +36,7 @@ VirtualPlacer::VirtualPlacer(const Cluster* cluster) {
 
   } else {
     default_device_ = devices_.begin()->first;
+    VLOG(1) << "Number of devices: " << devices_.size();
     for (const auto& device : devices_) {
       if (str_util::Lowercase(device.first).find("gpu") != string::npos) {
         default_device_ = device.first;
@@ -47,6 +48,7 @@ VirtualPlacer::VirtualPlacer(const Cluster* cluster) {
 
 const DeviceProperties& VirtualPlacer::get_device(const NodeDef& node) const {
   string device = get_canonical_device_name(node);
+  VLOG(3) << "Device name: " << device;
   auto it = devices_.find(device);
   DCHECK(it != devices_.end());
   return it->second;
diff --git a/tensorflow/core/grappler/grappler_item_builder.h b/tensorflow/core/grappler/grappler_item_builder.h
index 3aa1d2027f5..7135c83801a 100644
--- a/tensorflow/core/grappler/grappler_item_builder.h
+++ b/tensorflow/core/grappler/grappler_item_builder.h
@@ -31,7 +31,7 @@ struct ItemConfig {
       : ignore_user_placement(true),
         ignore_colocation(true),
         placeholder_unknown_output_shape_dim(-1),
-        apply_optimizations(true),
+        apply_optimizations(false),
         inline_functions(true) {}
 
   // If true, ignore all user specified node placement.
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 29b5a2574c8..c6b98e686ff 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -3832,6 +3832,19 @@ py_library(
     deps = [":pywrap_tensorflow_internal"],
 )
 
+py_binary(
+    name = "cost_analyzer_tool",
+    srcs = [
+        "grappler/cost_analyzer_tool.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":cost_analyzer",
+        ":framework_for_generated_wrappers",
+        "//tensorflow/core:protos_all_py",
+    ],
+)
+
 py_test(
     name = "cost_analyzer_test",
     size = "small",
diff --git a/tensorflow/python/grappler/cost_analyzer.cc b/tensorflow/python/grappler/cost_analyzer.cc
index 29976b79495..88bf900dca6 100644
--- a/tensorflow/python/grappler/cost_analyzer.cc
+++ b/tensorflow/python/grappler/cost_analyzer.cc
@@ -30,11 +30,11 @@ CostAnalyzer::CostAnalyzer(const GrapplerItem& item, Cluster* cluster,
       analytical_estimator_(cluster, false),
       suffix_(suffix) {}
 
-Status CostAnalyzer::GenerateReport(std::ostream& os) {
+Status CostAnalyzer::GenerateReport(std::ostream& os, bool per_node_report) {
   GatherCosts();
   PreprocessCosts();
   AnalyzeCosts();
-  PrintAnalysis(os);
+  PrintAnalysis(os, per_node_report);
   return Status::OK();
 }
 
@@ -158,7 +158,7 @@ void CostAnalyzer::AnalyzeCosts() {
   }
 }
 
-void CostAnalyzer::PrintAnalysis(std::ostream& os) const {
+void CostAnalyzer::PrintAnalysis(std::ostream& os, bool per_node_report) const {
   os << std::endl;
   os << std::left << std::setw(50)
      << "Total time measured in ns (serialized): " << std::right
@@ -225,6 +225,11 @@ void CostAnalyzer::PrintAnalysis(std::ostream& os) const {
     os << std::endl;
   }
   os << std::endl;
+
+  if (per_node_report) {
+    os << "Below is the per-node report:" << std::endl;
+    os << op_perf_.DebugString();
+  }
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/python/grappler/cost_analyzer.h b/tensorflow/python/grappler/cost_analyzer.h
index 3700bf5fb37..0e860e0fee9 100644
--- a/tensorflow/python/grappler/cost_analyzer.h
+++ b/tensorflow/python/grappler/cost_analyzer.h
@@ -50,7 +50,7 @@ class CostAnalyzer {
  public:
   explicit CostAnalyzer(const GrapplerItem& item, Cluster* cluster,
                         const string& suffix);
-  Status GenerateReport(std::ostream& os);
+  Status GenerateReport(std::ostream& os, bool per_node_report);
 
  private:
   void PredictCosts(CostEstimator* cost_estimator, CostGraphDef* cost_graph,
@@ -59,7 +59,7 @@ class CostAnalyzer {
   void PreprocessCosts();
   void AnalyzeCosts();
   void SortOpsByTime(std::map<string, OpPerfSummary> ops);
-  void PrintAnalysis(std::ostream& os) const;
+  void PrintAnalysis(std::ostream& os, bool per_node_report) const;
 
   const GrapplerItem* item_;
   MeasuringCostEstimator measure_estimator_;
diff --git a/tensorflow/python/grappler/cost_analyzer.i b/tensorflow/python/grappler/cost_analyzer.i
index a51d8673c99..6066b6131ff 100644
--- a/tensorflow/python/grappler/cost_analyzer.i
+++ b/tensorflow/python/grappler/cost_analyzer.i
@@ -42,8 +42,10 @@ limitations under the License.
 %}
 
 %{
-string GenerateCostReport(const tensorflow::MetaGraphDef& metagraph) {
+string GenerateCostReport(const tensorflow::MetaGraphDef& metagraph, bool
+per_node_report) {
   tensorflow::grappler::ItemConfig cfg;
+  cfg.apply_optimizations = false;
   std::unique_ptr<tensorflow::grappler::GrapplerItem> item =
       tensorflow::grappler::GrapplerItemFromMetaGraphDef("metagraph", metagraph, cfg);
 
@@ -53,16 +55,20 @@ string GenerateCostReport(const tensorflow::MetaGraphDef& metagraph) {
   int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
   int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
   tensorflow::grappler::SingleMachine cluster(timeout_s, num_cpu_cores, num_gpus);
+  cluster.SetNumWarmupSteps(10);
+  cluster.AllowSoftPlacement(true);
+  cluster.DisableDetailedStats(false);
   TF_CHECK_OK(cluster.Provision());
 
   string suffix;
   tensorflow::grappler::CostAnalyzer analyzer(*item, &cluster, suffix);
 
   std::stringstream os;
-  analyzer.GenerateReport(os);
+  analyzer.GenerateReport(os, per_node_report);
   return os.str();
 }
 
 %}
 
-string GenerateCostReport(const tensorflow::MetaGraphDef& metagraph);
+string GenerateCostReport(const tensorflow::MetaGraphDef& metagraph, bool
+per_node_report);
diff --git a/tensorflow/python/grappler/cost_analyzer.py b/tensorflow/python/grappler/cost_analyzer.py
index d16614c7c75..75c21e57271 100644
--- a/tensorflow/python/grappler/cost_analyzer.py
+++ b/tensorflow/python/grappler/cost_analyzer.py
@@ -22,8 +22,19 @@ from tensorflow.python import pywrap_tensorflow as tf_wrap
 from tensorflow.python.framework import errors
 
 
-def GenerateCostReport(metagraph):
-  """Analyze the cost of each TensorFlow operation in the provided metagraph."""
+def GenerateCostReport(metagraph, per_node_report=False):
+  """Analyze the cost of each TensorFlow op and node in the provided metagraph.
+
+  Args:
+    metagraph: A TensorFlow MetaGraphDef.
+    per_node_report: by default the report contains stats aggregated on a per op
+      type basis, setting per_node_report to True adds results for each
+      individual node to the report.
+
+  Returns:
+    A string containing the cost report.
+  """
   with errors.raise_exception_on_not_ok_status():
-    ret_from_swig = tf_wrap.GenerateCostReport(metagraph.SerializeToString())
+    ret_from_swig = tf_wrap.GenerateCostReport(metagraph.SerializeToString(),
+                                               per_node_report)
   return ret_from_swig
diff --git a/tensorflow/python/grappler/cost_analyzer_tool.py b/tensorflow/python/grappler/cost_analyzer_tool.py
new file mode 100644
index 00000000000..80c8970c0bb
--- /dev/null
+++ b/tensorflow/python/grappler/cost_analyzer_tool.py
@@ -0,0 +1,49 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""A tool for cost analysis."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import sys
+
+from tensorflow.core.protobuf import meta_graph_pb2
+from tensorflow.python.grappler import cost_analyzer
+from tensorflow.python.platform import app
+
+
+def main(_):
+  with open(FLAGS.input, "rb") as input_file:  # Serialized proto: read bytes.
+    metagraph = meta_graph_pb2.MetaGraphDef()
+    metagraph.ParseFromString(input_file.read())
+
+  report = cost_analyzer.GenerateCostReport(metagraph, FLAGS.per_node_report)
+  print(report)
+
+
+if __name__ == "__main__":
+  parser = argparse.ArgumentParser()
+  parser.add_argument(  # Required: main() opens this path unconditionally.
+      "--input", type=str, required=True, help="Input .meta file path.")
+  parser.add_argument(
+      "--per_node_report",
+      action="store_true",
+      help="Generate per-node report. By default the report contains stats "
+      "aggregated on a per op type basis, per_node_report adds results "
+      "for each individual node to the report.")
+  FLAGS, unparsed = parser.parse_known_args()
+  app.run(main=main, argv=[sys.argv[0]] + unparsed)