From bf3bd1c026d0ab751369392e7f09b76fc41df4dd Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Wed, 3 Apr 2019 16:10:14 -0700
Subject: [PATCH] [tf.data] Adjusting auto-tuning period to 1 minute (from
 previous incorrect value of 1000 minutes) and improving auto-tuning logging.

PiperOrigin-RevId: 241826050
---
 tensorflow/core/framework/model.cc            | 38 ++++++++++---------
 tensorflow/core/framework/model.h             | 12 ++++--
 .../core/kernels/data/model_dataset_op.cc     | 31 +++++++--------
 3 files changed, 44 insertions(+), 37 deletions(-)
diff --git a/tensorflow/core/framework/model.cc b/tensorflow/core/framework/model.cc
index 889fc5e1768..4a511f61c59 100644
--- a/tensorflow/core/framework/model.cc
+++ b/tensorflow/core/framework/model.cc
@@ -382,12 +382,11 @@ std::shared_ptr<Node> Model::AddNode(Node::Factory factory, const string& name,
     output_ = node;
   }
   if (output) {
-    VLOG(3) << "Adding " << node->name() << "(id:" << node->id()
-            << ") as input for " << output->name() << "(id:" << output->id()
-            << ")";
+    VLOG(3) << "Adding " << node->long_name() << " as input for "
+            << output->long_name();
     output->add_input(node);
   } else {
-    VLOG(3) << "Adding " << node->name() << "(id:" << node->id() << ")";
+    VLOG(3) << "Adding " << node->long_name();
   }
   collect_resource_usage_ =
       collect_resource_usage_ || node->has_tunable_parameters();
@@ -415,16 +414,17 @@ void Model::Optimize(int64 cpu_budget) {
     tf_shared_lock lock(mu_);
     snapshot = output_->Snapshot(nullptr);
   }
+  VLOG(2) << "Starting optimization of tunable parameters";
   const int64 processing_time = ProcessingTime(snapshot);
   auto parameters = CollectTunableParameters(snapshot);
-  for (auto& parameter : parameters) {
-    parameter->value = 1;
+  for (auto& pair : parameters) {
+    pair.second->value = 1;
   }
   while (true) {
     const int64 output_time = OutputTime(snapshot);
     bool all_max = true;
-    for (auto& parameter : parameters) {
-      if (parameter->value < parameter->max) {
+    for (auto& pair : parameters) {
+      if (pair.second->value < pair.second->max) {
         all_max = false;
         break;
       }
@@ -434,17 +434,17 @@ void Model::Optimize(int64 cpu_budget) {
     }
     int64 best_delta = -1;
     Parameter* best_parameter = nullptr;
-    for (auto& parameter : parameters) {
-      if (parameter->value == parameter->max) {
+    for (auto& pair : parameters) {
+      if (pair.second->value == pair.second->max) {
         continue;
       }
-      parameter->value++;
+      pair.second->value++;
       int64 delta = output_time - OutputTime(snapshot);
       if (delta > best_delta) {
         best_delta = delta;
-        best_parameter = parameter.get();
+        best_parameter = pair.second.get();
       }
-      parameter->value--;
+      pair.second->value--;
     }
     if (!best_parameter) {
       // This should never happen because we are using a model snapshot and
@@ -457,8 +457,10 @@ void Model::Optimize(int64 cpu_budget) {
     best_parameter->value++;
   }
   VLOG(2) << "Number of tunable parameters: " << parameters.size();
-  for (auto& parameter : parameters) {
-    VLOG(2) << "Setting tunable parameter: " << parameter->value;
+  for (auto& pair : parameters) {
+    auto& parameter = pair.second;
+    VLOG(2) << "Setting tunable parameter " << pair.first << " to "
+            << parameter->value;
     mutex_lock l(*parameter->state->mu);
     parameter->state->value = parameter->value;
     parameter->state->cond_var->notify_all();
@@ -513,15 +515,15 @@ void Model::RemoveNode(const string& name) {
     if ((*node)->output()) {
       (*node)->output()->remove_input(*node);
     }
-    VLOG(3) << "Removing " << (*node)->name() << "(id:" << (*node)->id() << ")";
+    VLOG(3) << "Removing " << (*node)->long_name();
     remove_node_hook_(*node);
   }
   lookup_table_.erase(name);
 }
 
-std::vector<std::shared_ptr<Parameter>> Model::CollectTunableParameters(
+std::map<string, std::shared_ptr<Parameter>> Model::CollectTunableParameters(
     std::shared_ptr<Node> node) {
-  std::vector<std::shared_ptr<Parameter>> parameters;
+  std::map<string, std::shared_ptr<Parameter>> parameters;
   node->CollectTunableParameters(&parameters);
   return parameters;
 }
diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h
index c11e7f2bb00..cf5019566a8 100644
--- a/tensorflow/core/framework/model.h
+++ b/tensorflow/core/framework/model.h
@@ -160,6 +160,9 @@ class Node {
     return inputs_;
   }
 
+  // Returns a longer node name that is guaranteed to be unique.
+  string long_name() const { return strings::StrCat(name_, "(id:", id_, ")"); }
+
   // Returns the node name.
   const string& name() const { return name_; }
 
@@ -212,12 +215,12 @@ class Node {
 
   // Collects tunable parameters in the subtree rooted in this node.
   void CollectTunableParameters(
-      std::vector<std::shared_ptr<Parameter>>* parameters) const
+      std::map<string, std::shared_ptr<Parameter>>* parameters) const
       LOCKS_EXCLUDED(mu_) {
     tf_shared_lock l(mu_);
     for (auto& pair : parameters_) {
       if (pair.second->state->tunable) {
-        parameters->push_back(pair.second);
+        parameters->insert(std::make_pair(long_name(), pair.second));
       }
     }
     for (auto& input : inputs_) {
@@ -407,8 +410,9 @@ class Model {
   void RemoveNode(const string& name) LOCKS_EXCLUDED(mu_);
 
  private:
-  // Collects tunable parameters in the tree rooted in the given node.
-  std::vector<std::shared_ptr<Parameter>> CollectTunableParameters(
+  // Collects tunable parameters in the tree rooted in the given node, returning
+  // a mapping from a (unique) node name to a tunable parameter.
+  std::map<string, std::shared_ptr<Parameter>> CollectTunableParameters(
       std::shared_ptr<Node> node);
 
   // Collects the output time for the given node.
diff --git a/tensorflow/core/kernels/data/model_dataset_op.cc b/tensorflow/core/kernels/data/model_dataset_op.cc
index 7c6af83cc7a..6e54ceedab1 100644
--- a/tensorflow/core/kernels/data/model_dataset_op.cc
+++ b/tensorflow/core/kernels/data/model_dataset_op.cc
@@ -26,7 +26,7 @@ namespace tensorflow {
 namespace data {
 namespace {
 
-constexpr int kOptimizationPeriodThresholdMs = 60 * EnvTime::kSecondsToMicros;
+constexpr int64 kOptimizationPeriodThresholdMs = 60 * EnvTime::kSecondsToMillis;
 
 class ModelDatasetOp : public UnaryDatasetOpKernel {
  public:
@@ -159,31 +159,32 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
       void OptimizeThread(const std::shared_ptr<IteratorContext>& ctx) {
         int64 last_optimization_ms = 0;
         int64 optimization_period_ms = 10;
+        int64 current_time_ms =
+            ctx->env()->NowMicros() / EnvTime::kMillisToMicros;
         while (true) {
           {
             mutex_lock l(mu_);
             while (!cancelled_ &&
-                   last_optimization_ms + optimization_period_ms >=
-                       ctx->env()->NowMicros() / EnvTime::kMillisToMicros) {
-              cond_var_.wait_for(
-                  l, std::chrono::milliseconds(
-                         last_optimization_ms + optimization_period_ms -
-                         ctx->env()->NowMicros() / EnvTime::kMillisToMicros));
+                   last_optimization_ms + optimization_period_ms >
+                       current_time_ms) {
+              auto wait_ms = last_optimization_ms + optimization_period_ms -
+                             current_time_ms;
+              VLOG(2) << "Waiting for " << wait_ms << " ms.";
+              cond_var_.wait_for(l, std::chrono::milliseconds(wait_ms));
+              current_time_ms =
+                  ctx->env()->NowMicros() / EnvTime::kMillisToMicros;
             }
             if (cancelled_) return;
           }
           model_->Optimize(dataset()->cpu_budget_);
           // Exponentially increase the period of running the optimization
           // until a threshold is reached.
-          if (optimization_period_ms < kOptimizationPeriodThresholdMs) {
-            if (optimization_period_ms << 1 < kOptimizationPeriodThresholdMs) {
-              optimization_period_ms <<= 1;
-            } else {
-              optimization_period_ms = kOptimizationPeriodThresholdMs;
-            }
+          if (optimization_period_ms != kOptimizationPeriodThresholdMs) {
+            optimization_period_ms = std::min(optimization_period_ms << 1,
+                                              kOptimizationPeriodThresholdMs);
           }
-          last_optimization_ms =
-              ctx->env()->NowMicros() / EnvTime::kMillisToMicros;
+          current_time_ms = ctx->env()->NowMicros() / EnvTime::kMillisToMicros;
+          last_optimization_ms = current_time_ms;
         }
       }