From bf3bd1c026d0ab751369392e7f09b76fc41df4dd Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Wed, 3 Apr 2019 16:10:14 -0700 Subject: [PATCH] [tf.data] Adjusting auto-tuning period to 1 minute (from previous incorrect value of 1000 minutes) and improving auto-tuning logging. PiperOrigin-RevId: 241826050 --- tensorflow/core/framework/model.cc | 38 ++++++++++--------- tensorflow/core/framework/model.h | 12 ++++-- .../core/kernels/data/model_dataset_op.cc | 31 +++++++-------- 3 files changed, 44 insertions(+), 37 deletions(-) diff --git a/tensorflow/core/framework/model.cc b/tensorflow/core/framework/model.cc index 889fc5e1768..4a511f61c59 100644 --- a/tensorflow/core/framework/model.cc +++ b/tensorflow/core/framework/model.cc @@ -382,12 +382,11 @@ std::shared_ptr Model::AddNode(Node::Factory factory, const string& name, output_ = node; } if (output) { - VLOG(3) << "Adding " << node->name() << "(id:" << node->id() - << ") as input for " << output->name() << "(id:" << output->id() - << ")"; + VLOG(3) << "Adding " << node->long_name() << " as input for " + << output->long_name(); output->add_input(node); } else { - VLOG(3) << "Adding " << node->name() << "(id:" << node->id() << ")"; + VLOG(3) << "Adding " << node->long_name(); } collect_resource_usage_ = collect_resource_usage_ || node->has_tunable_parameters(); @@ -415,16 +414,17 @@ void Model::Optimize(int64 cpu_budget) { tf_shared_lock lock(mu_); snapshot = output_->Snapshot(nullptr); } + VLOG(2) << "Starting optimization of tunable parameters"; const int64 processing_time = ProcessingTime(snapshot); auto parameters = CollectTunableParameters(snapshot); - for (auto& parameter : parameters) { - parameter->value = 1; + for (auto& pair : parameters) { + pair.second->value = 1; } while (true) { const int64 output_time = OutputTime(snapshot); bool all_max = true; - for (auto& parameter : parameters) { - if (parameter->value < parameter->max) { + for (auto& pair : parameters) { + if (pair.second->value < pair.second->max) 
{ all_max = false; break; } @@ -434,17 +434,17 @@ void Model::Optimize(int64 cpu_budget) { } int64 best_delta = -1; Parameter* best_parameter = nullptr; - for (auto& parameter : parameters) { - if (parameter->value == parameter->max) { + for (auto& pair : parameters) { + if (pair.second->value == pair.second->max) { continue; } - parameter->value++; + pair.second->value++; int64 delta = output_time - OutputTime(snapshot); if (delta > best_delta) { best_delta = delta; - best_parameter = parameter.get(); + best_parameter = pair.second.get(); } - parameter->value--; + pair.second->value--; } if (!best_parameter) { // This should never happen because we are using a model snapshot and @@ -457,8 +457,10 @@ void Model::Optimize(int64 cpu_budget) { best_parameter->value++; } VLOG(2) << "Number of tunable parameters: " << parameters.size(); - for (auto& parameter : parameters) { - VLOG(2) << "Setting tunable parameter: " << parameter->value; + for (auto& pair : parameters) { + auto& parameter = pair.second; + VLOG(2) << "Setting tunable parameter " << pair.first << " to " + << parameter->value; mutex_lock l(*parameter->state->mu); parameter->state->value = parameter->value; parameter->state->cond_var->notify_all(); @@ -513,15 +515,15 @@ void Model::RemoveNode(const string& name) { if ((*node)->output()) { (*node)->output()->remove_input(*node); } - VLOG(3) << "Removing " << (*node)->name() << "(id:" << (*node)->id() << ")"; + VLOG(3) << "Removing " << (*node)->long_name(); remove_node_hook_(*node); } lookup_table_.erase(name); } -std::vector<std::shared_ptr<Parameter>> Model::CollectTunableParameters( +std::map<string, std::shared_ptr<Parameter>> Model::CollectTunableParameters( std::shared_ptr<Node> node) { - std::vector<std::shared_ptr<Parameter>> parameters; + std::map<string, std::shared_ptr<Parameter>> parameters; node->CollectTunableParameters(&parameters); return parameters; } diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h index c11e7f2bb00..cf5019566a8 100644 --- a/tensorflow/core/framework/model.h +++ b/tensorflow/core/framework/model.h @@ -160,6 +160,9 @@ 
class Node { return inputs_; } + // Returns a longer node name that is guaranteed to be unique. + string long_name() const { return strings::StrCat(name_, "(id:", id_, ")"); } + // Returns the node name. const string& name() const { return name_; } @@ -212,12 +215,12 @@ class Node { // Collects tunable parameters in the subtree rooted in this node. void CollectTunableParameters( - std::vector<std::shared_ptr<Parameter>>* parameters) const + std::map<string, std::shared_ptr<Parameter>>* parameters) const LOCKS_EXCLUDED(mu_) { tf_shared_lock l(mu_); for (auto& pair : parameters_) { if (pair.second->state->tunable) { - parameters->push_back(pair.second); + parameters->insert(std::make_pair(long_name(), pair.second)); } } for (auto& input : inputs_) { @@ -407,8 +410,9 @@ class Model { void RemoveNode(const string& name) LOCKS_EXCLUDED(mu_); private: - // Collects tunable parameters in the tree rooted in the given node. - std::vector<std::shared_ptr<Parameter>> CollectTunableParameters( + // Collects tunable parameters in the tree rooted in the given node, returning + // a mapping from a (unique) node name to a tunable parameter. + std::map<string, std::shared_ptr<Parameter>> CollectTunableParameters( std::shared_ptr<Node> node); // Collects the output time for the given node. 
diff --git a/tensorflow/core/kernels/data/model_dataset_op.cc b/tensorflow/core/kernels/data/model_dataset_op.cc index 7c6af83cc7a..6e54ceedab1 100644 --- a/tensorflow/core/kernels/data/model_dataset_op.cc +++ b/tensorflow/core/kernels/data/model_dataset_op.cc @@ -26,7 +26,7 @@ namespace tensorflow { namespace data { namespace { -constexpr int kOptimizationPeriodThresholdMs = 60 * EnvTime::kSecondsToMicros; +constexpr int64 kOptimizationPeriodThresholdMs = 60 * EnvTime::kSecondsToMillis; class ModelDatasetOp : public UnaryDatasetOpKernel { public: @@ -159,31 +159,32 @@ class ModelDatasetOp : public UnaryDatasetOpKernel { void OptimizeThread(const std::shared_ptr& ctx) { int64 last_optimization_ms = 0; int64 optimization_period_ms = 10; + int64 current_time_ms = + ctx->env()->NowMicros() / EnvTime::kMillisToMicros; while (true) { { mutex_lock l(mu_); while (!cancelled_ && - last_optimization_ms + optimization_period_ms >= - ctx->env()->NowMicros() / EnvTime::kMillisToMicros) { - cond_var_.wait_for( - l, std::chrono::milliseconds( - last_optimization_ms + optimization_period_ms - - ctx->env()->NowMicros() / EnvTime::kMillisToMicros)); + last_optimization_ms + optimization_period_ms > + current_time_ms) { + auto wait_ms = last_optimization_ms + optimization_period_ms - + current_time_ms; + VLOG(2) << "Waiting for " << wait_ms << " ms."; + cond_var_.wait_for(l, std::chrono::milliseconds(wait_ms)); + current_time_ms = + ctx->env()->NowMicros() / EnvTime::kMillisToMicros; } if (cancelled_) return; } model_->Optimize(dataset()->cpu_budget_); // Exponentially increase the period of running the optimization // until a threshold is reached. 
- if (optimization_period_ms < kOptimizationPeriodThresholdMs) { - if (optimization_period_ms << 1 < kOptimizationPeriodThresholdMs) { - optimization_period_ms <<= 1; - } else { - optimization_period_ms = kOptimizationPeriodThresholdMs; - } + if (optimization_period_ms != kOptimizationPeriodThresholdMs) { + optimization_period_ms = std::min(optimization_period_ms << 1, + kOptimizationPeriodThresholdMs); } - last_optimization_ms = - ctx->env()->NowMicros() / EnvTime::kMillisToMicros; + current_time_ms = ctx->env()->NowMicros() / EnvTime::kMillisToMicros; + last_optimization_ms = current_time_ms; } }