[tf.data] Adding a metric for bytes produced and consumed by individual transformations, refactoring infrastructure for recording tf.data metrics, and moving the metrics API and implementation from common_runtime to framework.

PiperOrigin-RevId: 305062865 Change-Id: I63911f00154baf36aa225f66dbef0843239b7392
2020-04-06 10:34:34 -07:00 · 2020-04-06 10:34:34 -07:00 · eabc157fd5
commit eabc157fd5
parent f87850a654
11 changed files with 319 additions and 186 deletions
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@ -2194,7 +2194,8 @@ filegroup(
 filegroup(
    name = "framework_internal_public_headers",
    srcs = [
-        "//tensorflow/core/framework:model.h",  # only needed for tests
+        "//tensorflow/core/framework:metrics.h",
        "//tensorflow/core/framework:model.h",
        "//tensorflow/core/framework:op_segment.h",
        "//tensorflow/core/framework:rendezvous.h",  # only needed for tests
        "//tensorflow/core/framework:resource_var.h",
--- a/tensorflow/core/common_runtime/BUILD
+++ b/tensorflow/core/common_runtime/BUILD
@ -303,7 +303,6 @@ tf_cuda_library(
        "lower_if_op.cc",
        "lower_while_op.cc",
        "memory_types.cc",
        "metrics.cc",
        "mkl_cpu_allocator.cc",
        "optimization_registry.cc",
        "parallel_concat_optimizer.cc",
--- a/tensorflow/core/common_runtime/metrics.h
+++ b/tensorflow/core/common_runtime/metrics.h
@ -16,93 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_METRICS_H_
 #define TENSORFLOW_CORE_COMMON_RUNTIME_METRICS_H_
-#include "tensorflow/core/lib/monitoring/counter.h"
+// TODO(jsimsa): Remove this forwarding header once all users are migrated to
-#include "tensorflow/core/platform/types.h"
+// using the one in framework.
-namespace tensorflow {
+#include "tensorflow/core/framework/metrics.h"
 namespace metrics {
 // Records that a tf.data.Dataset executed by the program used autotuning.
 //
 // The `name` argument identifies the Dataset type (e.g. "ParallelMap").
 void RecordTFDataAutotune(const string& name);
 // Returns a counter that can be used to record the number of bytes read from
 // the filesystem by a tf.data.Dataset source.
 //
 // The `name` argument identifies the Dataset type (e.g. "TFRecordDataset").
 monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name);
 // Records the number of bytes fetched from tf.data.Dataset iterator.
 void RecordTFDataBytesFetched(int64 num_bytes);
 // Records the time spent in IteratorResource::GetNext() in microseconds.
 void RecordTFDataGetNextDuration(uint64 duration_us);
 // Records the number of elements produced by a tf.data.Dataset.
 //
 // The `name` argument identifies the Dataset type (e.g. "Batch" or "Map").
 void RecordTFDataElements(const string& name, int64 num_elements);
 // Records the number of times each tf.data fingerprint is used
 // to measure duplicate pre-processing.
 //
 // The `name` argument identifies the Dataset graph fingerprint,
 // created using GraphHash().
 void RecordTFDataFingerprint(const string& name);
 // Records the number of independent graph changes resulting from the
 // application of a tf.data optimization.
 //
 // The `name` argument identifies the optimization (e.g. "noop_elimination").
 void RecordTFDataOptimization(const string& name, int64 num_changes);
 // Records parsing of dense tensor features.
 void RecordParseDenseFeature(int64 num_features);
 // Records parsing of sparse tensor features.
 void RecordParseSparseFeature(int64 num_features);
 // Records parsing of ragged tensor features.
 void RecordParseRaggedFeature(int64 num_features);
 // Records the size of input/output tensors in bytes.
 void RecordGraphInputTensors(const size_t size);
 void RecordGraphOutputTensors(const size_t size);
 void UpdateGraphExecTime(const uint64 running_time_usecs);
 // Records that one output of an op of type `op_name` was unused.
 void RecordUnusedOutput(const string& op_name);
 // Updates the metrics stored about time spent building graphs.
 //
 // By "GraphBuild", we refer to building a client graph, which is a sub-graph of
 // the full graph, induced by a set of options. In particular, these options
 // include the feeds and fetches requested.
 //
 // This includes time spent:
 //   * optimizing the graphs with Grappler
 //   * pruning the sub-graph (unless the place_pruned_graph option is set)
 //
 // When executing eagerly, this will not record any activity.
 //
 // TODO(jtkeeling): Should we record building/optimizing tf.functions?
 void UpdateGraphBuildTime(const uint64 running_time_usecs);
 // Updates the metrics stored about graph optimizations.
 void UpdateGraphOptimizationPassTime(const string& pass_name,
                                     const uint64 running_time_usecs);
 void UpdateGrapplerPassTime(const string& pass_name,
                            const uint64 running_time_usecs);
 // Updates the metrics stored about time XLA spends compiling graphs.
 void UpdateXlaCompilationTime(const uint64 compilation_time_usecs);
 // Increment the number of jobs that failed during import to mlir.
 void IncrementMLIRImportFailureCount();
 }  // namespace metrics
 }  // namespace tensorflow
 #endif  // TENSORFLOW_CORE_COMMON_RUNTIME_METRICS_H_
--- a/tensorflow/core/framework/BUILD
+++ b/tensorflow/core/framework/BUILD
@ -47,6 +47,7 @@ exports_files(
        "logging.h",
        "lookup_interface.h",
        "memory_types.h",
        "metrics.h",
        "model.h",
        "node_def_builder.h",
        "numeric_op.h",
@ -176,6 +177,7 @@ filegroup(
        "logging.h",
        "lookup_interface.h",
        "memory_types.h",
        "metrics.h",
        "model.h",
        "node_def_builder.h",
        "node_def_util.h",
@ -246,6 +248,7 @@ filegroup(
        "logging.cc",
        "lookup_interface.cc",
        "memory_types.cc",
        "metrics.cc",
        "model.cc",
        "node_def_builder.cc",
        "op_kernel.cc",
@ -346,6 +349,8 @@ filegroup(
        "lookup_interface.h",
        "memory_types.cc",
        "memory_types.h",
        "metrics.cc",
        "metrics.h",
        "model.cc",
        "model.h",
        "node_def_builder.cc",
--- a/tensorflow/core/framework/dataset.cc
+++ b/tensorflow/core/framework/dataset.cc
@ -484,7 +484,7 @@ Status DatasetBaseIterator::GetNext(IteratorContext* ctx,
  DVLOG(3) << prefix() << " GetNext enter";
  RecordStart(ctx, /*stop_output=*/true);
  Status s = GetNextInternal(ctx, out_tensors, end_of_sequence);
-  if (s.ok() && !*end_of_sequence) RecordElement(ctx);
+  if (s.ok() && !*end_of_sequence) RecordElement(ctx, out_tensors);
  RecordStop(ctx, /*start_output=*/true);
  if (TF_PREDICT_FALSE(errors::IsOutOfRange(s))) {
    s = errors::Internal("Iterator \"", params_.prefix,
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@ -962,10 +962,15 @@ class DatasetBaseIterator : public IteratorBase {
  }
  // When modeling is enabled, this method records the fact that this iterator
-  // has produced an element.
+  // has produced an element and its size in bytes.
-  void RecordElement(IteratorContext* ctx) {
+  void RecordElement(IteratorContext* ctx, std::vector<Tensor>* out_tensors) {
    // `ctx` is not used by this method. Nothing is recorded when `node_` is
    // null.
    if (node_) {
      // Size of the element just produced, as reported by GetAllocatedBytes.
      int64 num_bytes = GetAllocatedBytes(*out_tensors);
      node_->record_element();
      node_->record_bytes_produced(num_bytes);
      // The bytes produced by this node are also counted as bytes consumed by
      // its output node, when it has one.
      if (node_->output()) {
        node_->output()->record_bytes_consumed(num_bytes);
      }
    }
  }
--- a/tensorflow/core/common_runtime/metrics.cc
+++ b/tensorflow/core/common_runtime/metrics.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/common_runtime/metrics.h"
+#include "tensorflow/core/framework/metrics.h"
 #include "tensorflow/core/lib/monitoring/counter.h"
 #include "tensorflow/core/lib/monitoring/sampler.h"
@ -61,6 +61,14 @@ auto* graph_unused_outputs = monitoring::Counter<1>::New(
 auto* tf_data_autotune_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/autotune", "tf.data autotuning", "name");
 auto* tf_data_bytes_consumed_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/bytes_consumed",
    "The number of bytes consumed by a tf.data Dataset.", "name");
 auto* tf_data_bytes_produced_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/bytes_produced",
    "The number of bytes produced by a tf.data Dataset.", "name");
 auto* tf_data_bytes_read_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/bytes_read",
    "The number of bytes read by tf.data Dataset sources.", "name");
@ -69,18 +77,18 @@ auto* tf_data_bytes_fetched_counter = monitoring::Counter<0>::New(
    "/tensorflow/data/bytes_fetched",
    "The number of bytes fetched from tf.data Dataset iterator.");
 auto* tf_data_getnext_duration_counter = monitoring::Sampler<0>::New(
    {"/tensorflow/data/getnext_duration",
     "Microseconds spent fetching an element from tf.data Dataset iterator."},
    // Power of 2 with bucket count 10 (1024 ms)
    {monitoring::Buckets::Exponential(1, 2, 10)});
 auto* tf_data_elements_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/elements", "tf.data elements", "name");
 auto* tf_data_fingerprint_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/fingerprint", "tf.data fingerprint", "name");
 auto* tf_data_getnext_duration_counter = monitoring::Sampler<0>::New(
    {"/tensorflow/data/getnext_duration",
     "Microseconds spent fetching an element from tf.data Dataset iterator."},
    // Power of 2 with bucket count 10 (1024 ms)
    {monitoring::Buckets::Exponential(1, 2, 10)});
 auto* tf_data_optimization_counter = monitoring::Counter<1>::New(
    "/tensorflow/data/optimization", "tf.data optimization", "name");
@ -132,28 +140,36 @@ void RecordTFDataAutotune(const string& name) {
  tf_data_autotune_counter->GetCell(name)->IncrementBy(1);
 }
 monitoring::CounterCell* GetTFDataBytesConsumedCounter(const string& name) {
  return tf_data_bytes_consumed_counter->GetCell(name);
 }
 monitoring::CounterCell* GetTFDataBytesProducedCounter(const string& name) {
  return tf_data_bytes_produced_counter->GetCell(name);
 }
 monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name) {
  return tf_data_bytes_read_counter->GetCell(name);
 }
 monitoring::CounterCell* GetTFDataElementsCounter(const string& name) {
  return tf_data_elements_counter->GetCell(name);
 }
 void RecordTFDataBytesFetched(int64 num_bytes) {
  tf_data_bytes_fetched_counter->GetCell()->IncrementBy(num_bytes);
 }
 void RecordTFDataFingerprint(const string& name) {
  tf_data_fingerprint_counter->GetCell(name)->IncrementBy(1);
 }
 void RecordTFDataGetNextDuration(uint64 duration_us) {
  static auto* tfdata_getnext_duration_cell =
      tf_data_getnext_duration_counter->GetCell();
  tfdata_getnext_duration_cell->Add(duration_us);
 }
 void RecordTFDataElements(const string& name, int64 num_elements) {
  tf_data_elements_counter->GetCell(name)->IncrementBy(num_elements);
 }
 void RecordTFDataFingerprint(const string& name) {
  tf_data_fingerprint_counter->GetCell(name)->IncrementBy(1);
 }
 void RecordTFDataOptimization(const string& name, int64 num_changes) {
  tf_data_optimization_counter->GetCell(name)->IncrementBy(num_changes);
 }
--- a/tensorflow/core/framework/metrics.h
+++ b/tensorflow/core/framework/metrics.h
@ -0,0 +1,123 @@
 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #ifndef TENSORFLOW_CORE_FRAMEWORK_METRICS_H_
 #define TENSORFLOW_CORE_FRAMEWORK_METRICS_H_
 #include "tensorflow/core/lib/monitoring/counter.h"
 #include "tensorflow/core/platform/types.h"
 namespace tensorflow {
 namespace metrics {
 // Records that a tf.data.Dataset executed by the program used autotuning.
 //
 // The `name` argument identifies the Dataset type (e.g. "ParallelMap").
 void RecordTFDataAutotune(const string& name);
 // Returns a counter that can be used to record the number of bytes consumed by
 // a tf.data.Dataset.
 //
 // The `name` argument identifies the Dataset type (e.g. "Batch" or "Map").
 monitoring::CounterCell* GetTFDataBytesConsumedCounter(const string& name);
 // Returns a counter that can be used to record the number of bytes produced by
 // a tf.data.Dataset.
 //
 // The `name` argument identifies the Dataset type (e.g. "Batch" or "Map").
 monitoring::CounterCell* GetTFDataBytesProducedCounter(const string& name);
 // Returns a counter that can be used to record the number of bytes read from
 // the filesystem by a tf.data.Dataset source.
 //
 // The `name` argument identifies the Dataset type (e.g. "TFRecordDataset").
 //
 // TODO(jsimsa): Remove this now that we have GetTFDataBytesConsumedCounter?
 monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name);
 // Returns a counter that can be used to record the number of elements produced
 // by a tf.data.Dataset.
 //
 // The `name` argument identifies the Dataset type (e.g. "Batch" or "Map").
 monitoring::CounterCell* GetTFDataElementsCounter(const string& name);
 // Records the number of bytes fetched from tf.data.Dataset iterator.
 void RecordTFDataBytesFetched(int64 num_bytes);
 // Records the time spent in IteratorResource::GetNext() in microseconds.
 void RecordTFDataGetNextDuration(uint64 duration_us);
 // Records the number of times each tf.data fingerprint is used
 // to measure duplicate pre-processing.
 //
 // The `name` argument identifies the Dataset graph fingerprint,
 // created using GraphHash().
 void RecordTFDataFingerprint(const string& name);
 // Records the number of independent graph changes resulting from the
 // application of a tf.data optimization.
 //
 // The `name` argument identifies the optimization (e.g. "noop_elimination").
 void RecordTFDataOptimization(const string& name, int64 num_changes);
 // Records parsing of dense tensor features.
 void RecordParseDenseFeature(int64 num_features);
 // Records parsing of sparse tensor features.
 void RecordParseSparseFeature(int64 num_features);
 // Records parsing of ragged tensor features.
 void RecordParseRaggedFeature(int64 num_features);
 // Records the size of input/output tensors in bytes.
 void RecordGraphInputTensors(const size_t size);
 void RecordGraphOutputTensors(const size_t size);
 void UpdateGraphExecTime(const uint64 running_time_usecs);
 // Records that one output of an op of type `op_name` was unused.
 void RecordUnusedOutput(const string& op_name);
 // Updates the metrics stored about time spent building graphs.
 //
 // By "GraphBuild", we refer to building a client graph, which is a sub-graph of
 // the full graph, induced by a set of options. In particular, these options
 // include the feeds and fetches requested.
 //
 // This includes time spent:
 //   * optimizing the graphs with Grappler
 //   * pruning the sub-graph (unless the place_pruned_graph option is set)
 //
 // When executing eagerly, this will not record any activity.
 //
 // TODO(jtkeeling): Should we record building/optimizing tf.functions?
 void UpdateGraphBuildTime(const uint64 running_time_usecs);
 // Updates the metrics stored about graph optimizations.
 void UpdateGraphOptimizationPassTime(const string& pass_name,
                                     const uint64 running_time_usecs);
 void UpdateGrapplerPassTime(const string& pass_name,
                            const uint64 running_time_usecs);
 // Updates the metrics stored about time XLA spends compiling graphs.
 void UpdateXlaCompilationTime(const uint64 compilation_time_usecs);
 // Increment the number of jobs that failed during import to mlir.
 void IncrementMLIRImportFailureCount();
 }  // namespace metrics
 }  // namespace tensorflow
 #endif  // TENSORFLOW_CORE_FRAMEWORK_METRICS_H_
--- a/tensorflow/core/framework/model.cc
+++ b/tensorflow/core/framework/model.cc
@ -711,23 +711,10 @@ void Model::AddProcessingTime(const string& name, int64 delta) {
  }
 }
-void Model::Optimize(AutotuneAlgorithm algorithm, int64 cpu_budget,
+void Model::FlushMetrics() {
                     int64 ram_budget) {
  switch (algorithm) {
    case AutotuneAlgorithm::HILL_CLIMB:
      OptimizeHillClimb(cpu_budget, ram_budget);
      break;
    case AutotuneAlgorithm::GRADIENT_DESCENT:
      OptimizeGradientDescent(cpu_budget, ram_budget);
      break;
  }
 }
 void Model::RecordElement(const string& name) {
  tf_shared_lock l(mu_);
-  auto node = gtl::FindOrNull(lookup_table_, name);
+  for (const auto& pair : lookup_table_) {
-  if (node) {
+    pair.second->FlushMetrics();
    (*node)->record_element();
  }
 }
@ -740,6 +727,18 @@ int64 Model::NumElements(const string& name) {
  return 0;
 }
 void Model::Optimize(AutotuneAlgorithm algorithm, int64 cpu_budget,
                     int64 ram_budget) {
  switch (algorithm) {
    case AutotuneAlgorithm::HILL_CLIMB:
      OptimizeHillClimb(cpu_budget, ram_budget);
      break;
    case AutotuneAlgorithm::GRADIENT_DESCENT:
      OptimizeGradientDescent(cpu_budget, ram_budget);
      break;
  }
 }
 void Model::RecordStart(const string& name, bool stop_output) {
  tf_shared_lock l(mu_);
  auto node = gtl::FindOrNull(lookup_table_, name);
@ -772,7 +771,6 @@ void Model::RemoveNode(const string& name) {
      (*node)->output()->remove_input(*node);
    }
    VLOG(3) << "Removing " << (*node)->long_name();
    remove_node_hook_(*node);
  }
  lookup_table_.erase(name);
 }
--- a/tensorflow/core/framework/model.h
+++ b/tensorflow/core/framework/model.h
@ -23,6 +23,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 #include "tensorflow/core/framework/metrics.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
@ -124,9 +125,19 @@ class Node {
  using Factory = std::function<std::shared_ptr<Node>(Args)>;
  explicit Node(Args args)
-      : id_(args.id), name_(args.name), output_(args.output.get()) {}
+      : id_(args.id),
        name_(std::move(args.name)),
        autotune_(true),
        buffered_bytes_(0),
        buffered_elements_(0),
        bytes_consumed_(0),
        bytes_produced_(0),
        num_elements_(0),
        record_metrics_(true),
        metrics_(name_),
        output_(args.output.get()) {}
-  virtual ~Node() {}
+  virtual ~Node() { FlushMetrics(); }
  // Adds an input.
  void add_input(std::shared_ptr<Node> node) TF_LOCKS_EXCLUDED(mu_) {
@ -142,22 +153,29 @@ class Node {
  // Returns an indication whether autotuning is enabled for this node.
  bool autotune() const TF_LOCKS_EXCLUDED(mu_) {
    tf_shared_lock l(mu_);
    return autotune_;
  }
  // Returns the number of bytes stored in this node's buffer.
  int64 buffered_bytes() const TF_LOCKS_EXCLUDED(mu_) {
    tf_shared_lock l(mu_);
    return buffered_bytes_;
  }
  // Returns the number of elements stored in this node's buffer.
  int64 buffered_elements() const TF_LOCKS_EXCLUDED(mu_) {
    tf_shared_lock l(mu_);
    return buffered_elements_;
  }
  // Returns the number of bytes consumed by the node.
  int64 bytes_consumed() const TF_LOCKS_EXCLUDED(mu_) {
    return bytes_consumed_;
  }
  // Returns the number of bytes produced by the node.
  int64 bytes_produced() const TF_LOCKS_EXCLUDED(mu_) {
    return bytes_produced_;
  }
  // Indicates whether the node has tunable parameters.
  bool has_tunable_parameters() const TF_LOCKS_EXCLUDED(mu_) {
    tf_shared_lock l(mu_);
@ -184,7 +202,6 @@ class Node {
  // Returns the number of elements produced by the node.
  int64 num_elements() const TF_LOCKS_EXCLUDED(mu_) {
    tf_shared_lock l(mu_);
    return num_elements_;
  }
@ -197,17 +214,20 @@ class Node {
    return processing_time_;
  }
  // Records that the node consumed the given number of bytes.
  void record_bytes_consumed(int64 num_bytes) { bytes_consumed_ += num_bytes; }
  // Records that the node produced the given number of bytes.
  void record_bytes_produced(int64 num_bytes) { bytes_produced_ += num_bytes; }
  // Records the change in this node's buffer.
-  void record_buffer_event(int64 bytes_delta, int64 elements_delta)
+  void record_buffer_event(int64 bytes_delta, int64 elements_delta) {
      TF_LOCKS_EXCLUDED(mu_) {
    mutex_lock l(mu_);
    buffered_bytes_ += bytes_delta;
    buffered_elements_ += elements_delta;
  }
  // Records that the node produced an element.
  void record_element() TF_LOCKS_EXCLUDED(mu_) {
    mutex_lock l(mu_);
    num_elements_++;
  }
@ -226,8 +246,7 @@ class Node {
      processing_time_ += time_nanos - iter->second;
      work_start_.erase(iter);
    } else {
-      VLOG(1)
+      VLOG(1) << "Encountered a stop event without a matching start event.";
          << "Encountered a stop event that was not preceded by a start event.";
    }
  }
@ -239,18 +258,17 @@ class Node {
  // Sets the value that determines whether autotuning is enabled for this node.
  void set_autotune(bool autotune) TF_LOCKS_EXCLUDED(mu_) {
-    mutex_lock l(mu_);
+    autotune_.store(autotune);
    autotune_ = autotune;
  }
  // Collects tunable parameters in the subtree rooted in this node.
  void CollectTunableParameters(
      std::map<string, std::shared_ptr<Parameter>>* parameters) const
      TF_LOCKS_EXCLUDED(mu_) {
    tf_shared_lock l(mu_);
    if (!autotune_) {
      return;
    }
    tf_shared_lock l(mu_);
    for (auto& pair : parameters_) {
      if (pair.second->state->tunable) {
        parameters->insert(std::make_pair(long_name(), pair.second));
@ -266,10 +284,17 @@ class Node {
    tf_shared_lock l(mu_);
    string result;
    strings::StrAppend(&result, long_name(), ":\n");
-    strings::StrAppend(&result, "  autotune=", autotune_, "\n");
+    strings::StrAppend(&result, "  autotune=", autotune_.load(), "\n");
-    strings::StrAppend(&result, "  buffered_bytes=", buffered_bytes_, "\n");
+    strings::StrAppend(&result, "  buffered_bytes=", buffered_bytes_.load(),
                       "\n");
    strings::StrAppend(&result,
                       "  buffered_elements=", buffered_elements_.load(), "\n");
    strings::StrAppend(&result, "  bytes_consumed=", bytes_consumed_.load(),
                       "\n");
    strings::StrAppend(&result, "  bytes_produced=", bytes_produced_.load(),
                       "\n");
    strings::StrAppend(&result, "  processing_time=", processing_time_, "\n");
-    strings::StrAppend(&result, "  num_elements=", num_elements_, "\n");
+    strings::StrAppend(&result, "  num_elements=", num_elements_.load(), "\n");
    string inputs;
    for (auto& input : inputs_) {
      strings::StrAppend(&inputs, input->long_name(), ",");
@ -281,6 +306,16 @@ class Node {
    return result;
  }
  // Flushes the metrics recorded by this node.
  //
  // Hands the node's current cumulative totals (bytes consumed, bytes produced,
  // and number of elements) to `metrics_`. Does nothing when `record_metrics_`
  // is false.
  void FlushMetrics() TF_LOCKS_EXCLUDED(mu_) {
    if (!record_metrics_) {
      return;
    }
    // Each call passes a running total, not an increment.
    metrics_.record_bytes_consumed(bytes_consumed_);
    metrics_.record_bytes_produced(bytes_produced_);
    metrics_.record_num_elements(num_elements_);
  }
  // Returns the per-element output time for this node and if `gradient` is not
  // `nullptr`, collects the gradient of the output time w.r.t. tunable
  // parameters of the subtree rooted in this node and the last input time.
@ -301,13 +336,16 @@ class Node {
    tf_shared_lock l(mu_);
    std::shared_ptr<Node> result = Clone(output);
    {
      result->autotune_.store(autotune_);
      result->buffered_bytes_.store(buffered_bytes_);
      result->buffered_elements_.store(buffered_elements_);
      result->bytes_consumed_.store(bytes_consumed_);
      result->bytes_produced_.store(bytes_produced_);
      result->num_elements_.store(num_elements_);
      result->record_metrics_.store(false);
      mutex_lock l2(result->mu_);
      result->autotune_ = autotune_;
      result->buffered_bytes_ = buffered_bytes_;
      result->buffered_elements_ = buffered_elements_;
      result->processing_time_ = processing_time_;
      result->num_elements_ = num_elements_;
      result->parameters_ = parameters_;
      result->processing_time_ = processing_time_;
    }
    for (auto& input : inputs_) {
      result->add_input(input->Snapshot(result));
@ -324,10 +362,10 @@ class Node {
  // Returns the total number of bytes buffered in all nodes in the subtree for
  // which autotuning is enabled.
  double TotalBufferedBytes() const TF_LOCKS_EXCLUDED(mu_) {
    tf_shared_lock l(mu_);
    if (!autotune_) {
      return 0;
    }
    tf_shared_lock l(mu_);
    double result = 0;
    auto* parameter = gtl::FindOrNull(parameters_, kBufferSize);
    if (!parameter) {
@ -346,10 +384,10 @@ class Node {
  // autotuning is enabled. This number represents the amount of memory that
  // would be used by the subtree nodes if all of their buffers were full.
  double TotalMaximumBufferedBytes() const TF_LOCKS_EXCLUDED(mu_) {
    tf_shared_lock l(mu_);
    if (!autotune_) {
      return 0;
    }
    tf_shared_lock l(mu_);
    double result = 0;
    auto* parameter = gtl::FindOrNull(parameters_, kBufferSize);
    if (!parameter) {
@ -374,6 +412,50 @@ class Node {
  }
 protected:
  // Used for (incrementally) recording metrics. The class is thread-safe.
  //
  // Each `record_*` method takes a cumulative total, atomically swaps it with
  // the previously recorded total, and adds the difference to the matching
  // counter cell.
  //
  // NOTE(review): if two overlapping calls pass totals out of order (the larger
  // total is exchanged first), the later call computes a negative delta.
  // Confirm that CounterCell::IncrementBy tolerates negative values, or that
  // callers never flush the same node concurrently.
  class Metrics {
   public:
    // Obtains the counter cells for `name` once, at construction, and starts
    // every recorded total at zero.
    explicit Metrics(const string& name)
        : bytes_consumed_counter_(metrics::GetTFDataBytesConsumedCounter(name)),
          bytes_produced_counter_(metrics::GetTFDataBytesProducedCounter(name)),
          num_elements_counter_(metrics::GetTFDataElementsCounter(name)),
          recorded_bytes_consumed_(0),
          recorded_bytes_produced_(0),
          recorded_num_elements_(0) {}
    // Expects the total number of bytes consumed and records the delta since
    // last invocation.
    void record_bytes_consumed(int64 total_bytes) {
      // exchange() returns the previously recorded total and stores the new one
      // in a single atomic step.
      int64 delta =
          total_bytes - recorded_bytes_consumed_.exchange(total_bytes);
      bytes_consumed_counter_->IncrementBy(delta);
    }
    // Expects the total number of bytes produced and records the delta since
    // last invocation.
    void record_bytes_produced(int64 total_bytes) {
      int64 delta =
          total_bytes - recorded_bytes_produced_.exchange(total_bytes);
      bytes_produced_counter_->IncrementBy(delta);
    }
    // Expects the total number of elements produced and records the delta since
    // last invocation.
    void record_num_elements(int64 total_elements) {
      int64 delta =
          total_elements - recorded_num_elements_.exchange(total_elements);
      num_elements_counter_->IncrementBy(delta);
    }
   private:
    // Counter cells obtained in the constructor; the pointers never change.
    monitoring::CounterCell* const bytes_consumed_counter_;
    monitoring::CounterCell* const bytes_produced_counter_;
    monitoring::CounterCell* const num_elements_counter_;
    // Totals most recently passed to the corresponding `record_*` method.
    std::atomic<int64> recorded_bytes_consumed_;
    std::atomic<int64> recorded_bytes_produced_;
    std::atomic<int64> recorded_num_elements_;
  };
  // Returns the number of inputs.
  int64 num_inputs() const TF_SHARED_LOCKS_REQUIRED(mu_) {
    int64 num_inputs = 0;
@ -495,13 +577,17 @@ class Node {
  // Indicates whether the subtree rooted in this node should be included in
  // autotuning. In particular, if this is `false`, then the subtree is excluded
  // from computation of output time and processing time.
-  bool autotune_ TF_GUARDED_BY(mu_) = true;
+  std::atomic<bool> autotune_;
-  int64 buffered_bytes_ TF_GUARDED_BY(mu_) = 0;
+  std::atomic<int64> buffered_bytes_;
-  int64 buffered_elements_ TF_GUARDED_BY(mu_) = 0;
+  std::atomic<int64> buffered_elements_;
-  int64 processing_time_ TF_GUARDED_BY(mu_) = 0;
+  std::atomic<int64> bytes_consumed_;
-  int64 num_elements_ TF_GUARDED_BY(mu_) = 0;
+  std::atomic<int64> bytes_produced_;
-  std::map<std::thread::id, int64> work_start_ TF_GUARDED_BY(mu_);
+  std::atomic<int64> num_elements_;
  std::atomic<bool> record_metrics_;
  Metrics metrics_;
  std::map<string, std::shared_ptr<Parameter>> parameters_ TF_GUARDED_BY(mu_);
  int64 processing_time_ TF_GUARDED_BY(mu_) = 0;
  std::map<std::thread::id, int64> work_start_ TF_GUARDED_BY(mu_);
  // Statistic of inputs processing time history.
  double input_processing_time_sum_ = 0.0L;
@ -561,19 +647,8 @@ std::shared_ptr<Node> MakeUnknownNode(Node::Args args);
 // implementation of `DatasetBase` and `DatasetBaseIterator` respectively.
 class Model {
 public:
  using NodeHook = std::function<void(std::shared_ptr<Node>)>;
  // Creates a new model.
-  //
+  Model() : collect_resource_usage_(false) {}
  // The `remove_node_hook` argument can be used to specify functionality that
  // should be invoked before a node is removed from the model. The hook can be
  // used for dependency injection -- to allow the model to invoke functionality
  // from modules that it could not depend on statically.
  Model(NodeHook remove_node_hook)
      : collect_resource_usage_(false),
        remove_node_hook_(std::move(remove_node_hook)) {
    DCHECK(remove_node_hook_ != nullptr);
  }
  // Indicates whether to collect resource usage.
  bool collect_resource_usage() const { return collect_resource_usage_; }
@ -588,16 +663,16 @@ class Model {
  void AddProcessingTime(const string& name, int64 delta)
      TF_LOCKS_EXCLUDED(mu_);
-  // Uses the given algorithm to perform the autotuning optimization.
+  // Flushes metrics recorded by the model.
-  void Optimize(AutotuneAlgorithm algorithm, int64 cpu_budget, int64 ram_budget)
+  void FlushMetrics() TF_LOCKS_EXCLUDED(mu_);
      TF_LOCKS_EXCLUDED(mu_);
  // Records that a node has produced an element.
  void RecordElement(const string& name) TF_LOCKS_EXCLUDED(mu_);
  // Returns the number of elements that the input pipeline has produced.
  int64 NumElements(const string& name) TF_LOCKS_EXCLUDED(mu_);
  // Uses the given algorithm to perform the autotuning optimization.
  void Optimize(AutotuneAlgorithm algorithm, int64 cpu_budget, int64 ram_budget)
      TF_LOCKS_EXCLUDED(mu_);
  // Records that the given node has started work. If `stop_output` is set, it
  // also records that the output of the given node has stopped work.
  void RecordStart(const string& name, bool stop_output) TF_LOCKS_EXCLUDED(mu_);
@ -674,9 +749,6 @@ class Model {
  // tunable parameter (because the information is used for tuning the value
  // of the parameter) and never stops.
  std::atomic<bool> collect_resource_usage_;
  // A hook invoked immediately before a node is removed from the model.
  const NodeHook remove_node_hook_;
 };
 }  // namespace model
--- a/tensorflow/core/kernels/data/model_dataset_op.cc
+++ b/tensorflow/core/kernels/data/model_dataset_op.cc
@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #include "absl/memory/memory.h"
 #include "tensorflow/core/common_runtime/metrics.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/metrics.h"
 #include "tensorflow/core/framework/model.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
@ -110,10 +110,7 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
     public:
      explicit Iterator(const Params& params)
          : DatasetIterator<Dataset>(params) {
-        auto remove_node_hook = [](std::shared_ptr<model::Node> node) {
+        model_ = std::make_shared<model::Model>();
          metrics::RecordTFDataElements(node->name(), node->num_elements());
        };
        model_ = std::make_shared<model::Model>(std::move(remove_node_hook));
      }
      ~Iterator() override {
@ -168,16 +165,16 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
     private:
      Status EnsureOptimizeThreadStarted(IteratorContext* ctx)
          TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        if (!optimize_thread_) {
+        if (!model_thread_) {
          std::shared_ptr<IteratorContext> new_ctx =
              std::make_shared<IteratorContext>(*ctx);
-          optimize_thread_ = ctx->StartThread(
+          model_thread_ = ctx->StartThread(
-              "tf_data_model", [this, new_ctx]() { OptimizeThread(new_ctx); });
+              "tf_data_model", [this, new_ctx]() { ModelThread(new_ctx); });
        }
        return Status::OK();
      }
-      void OptimizeThread(const std::shared_ptr<IteratorContext>& ctx) {
+      void ModelThread(const std::shared_ptr<IteratorContext>& ctx) {
        int64 last_optimization_ms = 0;
        int64 optimization_period_ms = 10;
        int64 current_time_ms = EnvTime::NowMicros() / EnvTime::kMillisToMicros;
@ -205,13 +202,14 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
          }
          current_time_ms = EnvTime::NowMicros() / EnvTime::kMillisToMicros;
          last_optimization_ms = current_time_ms;
          model_->FlushMetrics();
        }
      }
      mutex mu_;
      condition_variable cond_var_;
      std::shared_ptr<model::Model> model_;
-      std::unique_ptr<Thread> optimize_thread_ TF_GUARDED_BY(mu_);
+      std::unique_ptr<Thread> model_thread_ TF_GUARDED_BY(mu_);
      bool cancelled_ TF_GUARDED_BY(mu_) = false;
      std::unique_ptr<IteratorBase> input_impl_;
    };