Merge pull request from trentlo:no_mem_opt_if_jit_on

PiperOrigin-RevId: 270071858
TensorFlower Gardener committed 2019-09-19 10:46:18 -07:00
commit caa5a8ea6b
7 changed files with 171 additions and 12 deletions

tensorflow/compiler/jit/BUILD

@@ -629,6 +629,7 @@ cc_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_bounds_check",
+        "//tensorflow/core:framework_internal",
         "//tensorflow/core:graph",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/stream_executor/lib",
@@ -637,7 +638,6 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
-        "@com_google_absl//absl/types:span",
     ],
 )

tensorflow/compiler/jit/xla_cluster_util.cc

@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/xla_config_registry.h"
 
 namespace tensorflow {
@@ -218,19 +219,12 @@ void RemoveFromXlaCluster(NodeDef* node_def) {
 void RemoveFromXlaCluster(Node* node) { node->ClearAttr(kXlaClusterAttr); }
 
 namespace {
-struct XlaGlobalJitLevel {
-  OptimizerOptions::GlobalJitLevel single_gpu;
-  OptimizerOptions::GlobalJitLevel general;
-};
+typedef xla_config_registry::XlaGlobalJitLevel XlaGlobalJitLevel;
 
 XlaGlobalJitLevel GetXlaGlobalJitLevel(
-    const GraphOptimizationPassOptions& options) {
+    const OptimizerOptions::GlobalJitLevel& jit_level_in_session_opts) {
   XlaGlobalJitLevel result;
-  OptimizerOptions::GlobalJitLevel jit_level_in_session_opts =
-      options.session_options->config.graph_options()
-          .optimizer_options()
-          .global_jit_level();
   if (jit_level_in_session_opts == OptimizerOptions::DEFAULT) {
     // To set compilation to be on by default, change the following line.
     result.single_gpu = result.general = OptimizerOptions::OFF;
@@ -289,7 +283,12 @@ bool IsSingleGpuGraph(const Graph& g) {
 
 OptimizerOptions::GlobalJitLevel GetGlobalJitLevelForGraph(
     const GraphOptimizationPassOptions& options) {
-  XlaGlobalJitLevel xla_global_jit_level = GetXlaGlobalJitLevel(options);
+  OptimizerOptions::GlobalJitLevel jit_level_in_session_opts =
+      options.session_options->config.graph_options()
+          .optimizer_options()
+          .global_jit_level();
+  XlaGlobalJitLevel xla_global_jit_level =
+      GetXlaGlobalJitLevel(jit_level_in_session_opts);
   if (xla_global_jit_level.single_gpu == xla_global_jit_level.general) {
     VLOG(4) << "GetGlobalJitLevelForGraph returning "
             << xla_global_jit_level.single_gpu;
@@ -386,4 +385,8 @@ XlaAutoClusteringSummary GetXlaAutoClusteringSummary(const Graph& graph) {
   return result;
 }
 
+// Register a callback for querying XlaGlobalJitLevel.
+REGISTER_XLA_CONFIG_GETTER(GetXlaGlobalJitLevel);
+
 }  // namespace tensorflow
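The registration above is the bridge that lets code under //tensorflow/core observe XLA's effective JIT level (which, per the registry header, folds in the environment-variable flag overrides) without depending on //tensorflow/compiler/jit. A minimal sketch of a hypothetical caller — WillAutoClusterSingleGpuGraphs is an invented name, not part of this commit:

#include "tensorflow/core/protobuf/config.pb.h"
#include "tensorflow/core/public/session_options.h"
#include "tensorflow/core/util/xla_config_registry.h"

namespace tensorflow {

// Returns true if single-GPU graphs would be auto-clustered by XLA, taking
// the registered getter (and thus XLA's flag overrides) into account.
bool WillAutoClusterSingleGpuGraphs(const SessionOptions& session_options) {
  OptimizerOptions::GlobalJitLevel session_jit_level =
      session_options.config.graph_options()
          .optimizer_options()
          .global_jit_level();
  xla_config_registry::XlaGlobalJitLevel effective =
      xla_config_registry::GetGlobalJitLevel(session_jit_level);
  return effective.single_gpu == OptimizerOptions::ON_1 ||
         effective.single_gpu == OptimizerOptions::ON_2;
}

}  // namespace tensorflow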

tensorflow/core/BUILD

@@ -2810,6 +2810,7 @@ FRAMEWORK_INTERNAL_PUBLIC_HEADERS = [
     "util/presized_cuckoo_map.h",
     "util/tensor_slice_set.h",
     "util/tensor_slice_util.h",
+    "util/xla_config_registry.h",
 ]
 
 tf_cuda_library(

tensorflow/core/grappler/optimizers/BUILD

@@ -605,6 +605,7 @@ cc_library(
         ":shape_optimizer",
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
@@ -617,6 +618,7 @@ cc_library(
         "//tensorflow/core/grappler/utils:tpu",
         "//tensorflow/core/grappler/verifiers:graph_verifier",
         "//tensorflow/core/grappler/verifiers:structure_verifier",
+        "//tensorflow/core/lib/gtl:map_util",
         "@com_google_absl//absl/strings",
     ],
 )

tensorflow/core/grappler/optimizers/meta_optimizer.cc

@@ -51,6 +51,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/util/dump_graph.h"
 #include "tensorflow/core/util/ptr_util.h"
+#include "tensorflow/core/util/xla_config_registry.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -126,6 +127,38 @@ bool AutoMixedPrecisionEnabled(RewriterConfig::Toggle opt_level) {
   return false;
 }
 
+bool IsXlaGlobalJitOn(
+    const OptimizerOptions::GlobalJitLevel& jit_level_in_session_opts) {
+  xla_config_registry::XlaGlobalJitLevel xla_global_jit_level =
+      xla_config_registry::GetGlobalJitLevel(jit_level_in_session_opts);
+  // Return true only if XLA JIT is ON for both single-gpu and multi-gpu
+  // graphs. This is a conservative approach that turns off the memory
+  // optimizer only when we are sure that all graphs will be processed by
+  // XLA JIT.
+  bool is_on = (xla_global_jit_level.single_gpu == OptimizerOptions::ON_1 ||
+                xla_global_jit_level.single_gpu == OptimizerOptions::ON_2) &&
+               (xla_global_jit_level.general == OptimizerOptions::ON_1 ||
+                xla_global_jit_level.general == OptimizerOptions::ON_2);
+  return is_on;
+}
+
+// A helper function to decide whether to enable the memory optimizer.
+bool MemoryOptimizerEnabled(
+    RewriterConfig::MemOptType mem_opt_type,
+    OptimizerOptions::GlobalJitLevel jit_level_in_session_opts) {
+  // Disable the default memory optimizer when XLA JIT is ON, as it hurts
+  // XLA JIT performance. The (current) XLA clustering can result in loss of
+  // concurrency between kernel compute and memory copies, so it usually
+  // defeats the overlap needed to hide the latencies of the inserted
+  // swap-ins and swap-outs and incurs a large performance overhead. Remove
+  // this check when the XLA JIT can better deal with the concurrency.
+  if (mem_opt_type == RewriterConfig::DEFAULT_MEM_OPT &&
+      IsXlaGlobalJitOn(jit_level_in_session_opts)) {
+    return false;
+  }
+
+  return mem_opt_type != RewriterConfig::NO_MEM_OPT;
+}
+
 }  // namespace
 
 #define MK_OPT(NAME, VALUE) \
@@ -216,7 +249,9 @@ Status MetaOptimizer::InitializeOptimizers(
     optimizers->push_back(
         MakeUnique<DependencyOptimizer>(cfg_.dependency_optimization()));
   }
-  if (cfg_.memory_optimization() != RewriterConfig::NO_MEM_OPT) {
+  auto global_jit_level =
+      config_proto_.graph_options().optimizer_options().global_jit_level();
+  if (MemoryOptimizerEnabled(cfg_.memory_optimization(), global_jit_level)) {
     if (cfg_.memory_optimizer_target_node_name_scope().empty()) {
       optimizers->push_back(
           // Use the default target node name prefix "gradients/"
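The net effect on users: with memory optimization left at its default, turning global XLA JIT on now also turns the memory optimizer off, while an explicitly requested MemOptType is still honored. A sketch of the two configs (illustrative only, not code from this commit):

#include "tensorflow/core/protobuf/config.pb.h"

// Memory optimizer will be skipped: JIT is ON and mem opt is DEFAULT_MEM_OPT.
tensorflow::ConfigProto JitOnDefaultMemOpt() {
  tensorflow::ConfigProto config;
  config.mutable_graph_options()->mutable_optimizer_options()
      ->set_global_jit_level(tensorflow::OptimizerOptions::ON_1);
  return config;
}

// Memory optimizer still runs: an explicit MemOptType bypasses the new check.
tensorflow::ConfigProto JitOnExplicitMemOpt() {
  tensorflow::ConfigProto config = JitOnDefaultMemOpt();
  config.mutable_graph_options()->mutable_rewrite_options()
      ->set_memory_optimization(
          tensorflow::RewriterConfig::SWAPPING_HEURISTICS);
  return config;
}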

tensorflow/core/util/xla_config_registry.cc

@@ -0,0 +1,55 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/xla_config_registry.h"
+
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+namespace xla_config_registry {
+
+namespace {
+
+struct GlobalJitLevelState {
+  mutex mu;
+  GlobalJitLevelGetterTy getter GUARDED_BY(mu);
+};
+
+GlobalJitLevelState* GetSingletonState() {
+  static GlobalJitLevelState* state = new GlobalJitLevelState;
+  return state;
+}
+
+}  // namespace
+
+void RegisterGlobalJitLevelGetter(GlobalJitLevelGetterTy getter) {
+  GlobalJitLevelState* state = GetSingletonState();
+  mutex_lock l(state->mu);
+  CHECK(!state->getter);
+  state->getter = std::move(getter);
+}
+
+XlaGlobalJitLevel GetGlobalJitLevel(
+    OptimizerOptions::GlobalJitLevel jit_level_in_session_opts) {
+  GlobalJitLevelState* state = GetSingletonState();
+  mutex_lock l(state->mu);
+  if (!state->getter) {
+    return {jit_level_in_session_opts, jit_level_in_session_opts};
+  }
+  return state->getter(jit_level_in_session_opts);
+}
+
+}  // namespace xla_config_registry
+}  // namespace tensorflow
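Note the fallback in GetGlobalJitLevel: in a process where no getter was registered (e.g. a build that does not link the XLA JIT passes), both fields simply echo the session-level setting. A small sketch, assuming such a build (FallbackBehaviorSketch is an invented name):

#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/protobuf/config.pb.h"
#include "tensorflow/core/util/xla_config_registry.h"

namespace tensorflow {

void FallbackBehaviorSketch() {
  xla_config_registry::XlaGlobalJitLevel jit =
      xla_config_registry::GetGlobalJitLevel(OptimizerOptions::ON_2);
  // With no registered getter, both fields mirror the input jit level.
  CHECK_EQ(jit.single_gpu, OptimizerOptions::ON_2);
  CHECK_EQ(jit.general, OptimizerOptions::ON_2);
}

}  // namespace tensorflow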

tensorflow/core/util/xla_config_registry.h

@@ -0,0 +1,63 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_XLA_CONFIG_REGISTRY_H_
+#define TENSORFLOW_CORE_UTIL_XLA_CONFIG_REGISTRY_H_
+
+#include <functional>
+
+#include "tensorflow/core/framework/logging.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+namespace xla_config_registry {
+
+// XlaGlobalJitLevel is used by XLA to expose its JIT level for processing
+// single-gpu and general (multi-gpu) graphs.
+struct XlaGlobalJitLevel {
+  OptimizerOptions::GlobalJitLevel single_gpu;
+  OptimizerOptions::GlobalJitLevel general;
+};
+
+// Input is the jit_level in the session config, and the return value is the
+// jit_level from XLA, reflecting the effect of the environment variable flags.
+typedef std::function<XlaGlobalJitLevel(
+    const OptimizerOptions::GlobalJitLevel&)>
+    GlobalJitLevelGetterTy;
+
+void RegisterGlobalJitLevelGetter(GlobalJitLevelGetterTy getter);
+
+XlaGlobalJitLevel GetGlobalJitLevel(
+    OptimizerOptions::GlobalJitLevel jit_level_in_session_opts);
+
+#define REGISTER_XLA_CONFIG_GETTER(getter) \
+  REGISTER_XLA_CONFIG_GETTER_UNIQ_HELPER(__COUNTER__, getter)
+
+#define REGISTER_XLA_CONFIG_GETTER_UNIQ_HELPER(ctr, getter) \
+  REGISTER_XLA_CONFIG_GETTER_UNIQ(ctr, getter)
+
+#define REGISTER_XLA_CONFIG_GETTER_UNIQ(ctr, getter)                   \
+  static bool xla_config_registry_registration_##ctr =                 \
+      (::tensorflow::xla_config_registry::RegisterGlobalJitLevelGetter( \
+           getter),                                                    \
+       true)
+
+}  // namespace xla_config_registry
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_XLA_CONFIG_REGISTRY_H_
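A getter is registered at most once per process (the CHECK in RegisterGlobalJitLevelGetter fires on a second registration); in this commit the sole registrant is GetXlaGlobalJitLevel in xla_cluster_util.cc above. A hypothetical registration (names invented for illustration) would look like:

#include "tensorflow/core/util/xla_config_registry.h"

namespace tensorflow {
namespace {

// Invented example: honor the session setting for single-GPU graphs but keep
// multi-GPU (general) graphs out of XLA regardless of the session setting.
xla_config_registry::XlaGlobalJitLevel SingleGpuOnlyJitLevel(
    const OptimizerOptions::GlobalJitLevel& jit_level_in_session_opts) {
  xla_config_registry::XlaGlobalJitLevel result;
  result.single_gpu = jit_level_in_session_opts;
  result.general = OptimizerOptions::OFF;
  return result;
}

REGISTER_XLA_CONFIG_GETTER(SingleGpuOnlyJitLevel);

}  // namespace
}  // namespace tensorflow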