From a82ec8a0726adf2a180dc6bb746edc01b738644b Mon Sep 17 00:00:00 2001 From: Trent Lo Date: Wed, 4 Sep 2019 17:33:20 -0700 Subject: [PATCH 01/14] Disable the Grappler memory opt when GlobalJitLevel is on. --- .../grappler/optimizers/meta_optimizer.cc | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index a56b08d2e20..831ce5403be 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -126,6 +126,23 @@ bool AutoMixedPrecisionEnabled(RewriterConfig::Toggle opt_level) { return false; } +// A helper function to decide whether to enable the memory optimizer. +bool MemoryOptimizerEnabled(RewriterConfig::MemOptType mem_opt_type, + OptimizerOptions::GlobalJitLevel global_jit_level) { + // Disable the memory optimizer when GlobalJitLevel is ON as it hurts the + // XLA JIT performance. The (current) XLA clustering can result in loss of + // concurrency between kernel compute and memory copies. As such, it usually + // loses the concurrency needed to hide the latencies of the inserted swap-ins + // and swap-outs and incurs great performance overhead. Remove this check when + // the XLA JIT can better deal with the concurrency. + if (global_jit_level == OptimizerOptions::ON_1 || + global_jit_level == OptimizerOptions::ON_2) { + return false; + } + + return mem_opt_type != RewriterConfig::NO_MEM_OPT; +} + } // namespace #define MK_OPT(NAME, VALUE) \ @@ -216,7 +233,9 @@ Status MetaOptimizer::InitializeOptimizers( optimizers->push_back( MakeUnique(cfg_.auto_mixed_precision())); } - if (cfg_.memory_optimization() != RewriterConfig::NO_MEM_OPT) { + auto global_jit_level = + config_proto_.graph_options().optimizer_options().global_jit_level(); + if (MemoryOptimizerEnabled(cfg_.memory_optimization(), global_jit_level)) { if (cfg_.memory_optimizer_target_node_name_scope().empty()) { optimizers->push_back( // Use the default target node name prefix "gradients/" From cb3f976f5fe5886857c266f848977e2b5ff2e3da Mon Sep 17 00:00:00 2001 From: Trent Lo Date: Thu, 5 Sep 2019 11:14:15 -0700 Subject: [PATCH 02/14] Check GlobalJitLevel only for DEFAULT_MEM_OPT. This relaxes the disable check and should be a slightly better behavior, as users still have some ways to enable the memory optimizer when they want to. --- tensorflow/core/grappler/optimizers/meta_optimizer.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index 831ce5403be..262ed722a54 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -129,14 +129,15 @@ bool AutoMixedPrecisionEnabled(RewriterConfig::Toggle opt_level) { // A helper function to decide whether to enable the memory optimizer. bool MemoryOptimizerEnabled(RewriterConfig::MemOptType mem_opt_type, OptimizerOptions::GlobalJitLevel global_jit_level) { - // Disable the memory optimizer when GlobalJitLevel is ON as it hurts the - // XLA JIT performance. The (current) XLA clustering can result in loss of + // Disable the default memory optimizer when GlobalJitLevel is ON as it hurts + // the XLA JIT performance. The (current) XLA clustering can result in loss of // concurrency between kernel compute and memory copies. 
As such, it usually // loses the concurrency needed to hide the latencies of the inserted swap-ins // and swap-outs and incurs great performance overhead. Remove this check when // the XLA JIT can better deal with the concurrency. - if (global_jit_level == OptimizerOptions::ON_1 || - global_jit_level == OptimizerOptions::ON_2) { + if (mem_opt_type == RewriterConfig::DEFAULT_MEM_OPT && + (global_jit_level == OptimizerOptions::ON_1 || + global_jit_level == OptimizerOptions::ON_2)) { return false; } From 28d774bde1348d73f01d0de9a84ec3f06ae82b59 Mon Sep 17 00:00:00 2001 From: Trent Lo Date: Mon, 9 Sep 2019 14:55:10 -0700 Subject: [PATCH 03/14] Add a XlaConfigProxy class. Callbacks can be registered to this class so that runtime environment flags can be parsed to change configs in the Tensorflow core. A primary use of this is for the Tensorflow core to query the XLA JIT level, which can be configured by some runtime environment flags in addition to ConfigProto. --- tensorflow/compiler/jit/xla_cluster_util.cc | 34 ++++++-- tensorflow/core/BUILD | 1 + .../grappler/optimizers/meta_optimizer.cc | 8 +- tensorflow/core/util/xla_config_proxy.cc | 29 +++++++ tensorflow/core/util/xla_config_proxy.h | 84 +++++++++++++++++++ 5 files changed, 148 insertions(+), 8 deletions(-) create mode 100644 tensorflow/core/util/xla_config_proxy.cc create mode 100644 tensorflow/core/util/xla_config_proxy.h diff --git a/tensorflow/compiler/jit/xla_cluster_util.cc b/tensorflow/compiler/jit/xla_cluster_util.cc index 97737c5ee9c..42033d6bea1 100644 --- a/tensorflow/compiler/jit/xla_cluster_util.cc +++ b/tensorflow/compiler/jit/xla_cluster_util.cc @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/core/graph/control_flow.h" #include "tensorflow/core/public/session_options.h" #include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/core/util/xla_config_proxy.h" namespace tensorflow { @@ -224,13 +225,9 @@ struct XlaGlobalJitLevel { }; XlaGlobalJitLevel GetXlaGlobalJitLevel( - const GraphOptimizationPassOptions& options) { + const OptimizerOptions::GlobalJitLevel& jit_level_in_session_opts) { XlaGlobalJitLevel result; - OptimizerOptions::GlobalJitLevel jit_level_in_session_opts = - options.session_options->config.graph_options() - .optimizer_options() - .global_jit_level(); if (jit_level_in_session_opts == OptimizerOptions::DEFAULT) { // To set compilation to be on by default, change the following line. result.single_gpu = result.general = OptimizerOptions::OFF; @@ -289,7 +286,12 @@ bool IsSingleGpuGraph(const Graph& g) { OptimizerOptions::GlobalJitLevel GetGlobalJitLevelForGraph( const GraphOptimizationPassOptions& options) { - XlaGlobalJitLevel xla_global_jit_level = GetXlaGlobalJitLevel(options); + OptimizerOptions::GlobalJitLevel jit_level_in_session_opts = + options.session_options->config.graph_options() + .optimizer_options() + .global_jit_level(); + XlaGlobalJitLevel xla_global_jit_level = + GetXlaGlobalJitLevel(jit_level_in_session_opts); if (xla_global_jit_level.single_gpu == xla_global_jit_level.general) { VLOG(4) << "GetGlobalJitLevelForGraph returning " << xla_global_jit_level.single_gpu; @@ -386,4 +388,24 @@ XlaAutoClusteringSummary GetXlaAutoClusteringSummary(const Graph& graph) { return result; } + +// Implement and register a setter for OptimizerOptions::GlobalJitLevel, so that +// the Tensorflow core can query its value. 
+bool GlobalJitLevelSetter(OptimizerOptions::GlobalJitLevel& global_jit_level) { + XlaGlobalJitLevel xla_global_jit_level = + GetXlaGlobalJitLevel(global_jit_level); + // Take the general flag to avoid the dependency on Tensorflow::Graph. + OptimizerOptions::GlobalJitLevel new_jit_level = xla_global_jit_level.general; + + if (new_jit_level != global_jit_level) { + global_jit_level = new_jit_level; + return true; + } else { + return false; + } +} + +REGISTER_XLA_CONFIG_SETTER(OptimizerOptions::GlobalJitLevel, + GlobalJitLevelSetter); + } // namespace tensorflow diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index d1ab6df7a81..2ff04c2cf2b 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -2805,6 +2805,7 @@ FRAMEWORK_INTERNAL_PUBLIC_HEADERS = [ "util/presized_cuckoo_map.h", "util/tensor_slice_set.h", "util/tensor_slice_util.h", + "util/xla_config_proxy.h", ] tf_cuda_library( diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index 262ed722a54..a80fb09cf0a 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -51,6 +51,7 @@ limitations under the License. #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/util/dump_graph.h" #include "tensorflow/core/util/ptr_util.h" +#include "tensorflow/core/util/xla_config_proxy.h" namespace tensorflow { namespace grappler { @@ -127,14 +128,17 @@ bool AutoMixedPrecisionEnabled(RewriterConfig::Toggle opt_level) { } // A helper function to decide whether to enable the memory optimizer. -bool MemoryOptimizerEnabled(RewriterConfig::MemOptType mem_opt_type, - OptimizerOptions::GlobalJitLevel global_jit_level) { +bool MemoryOptimizerEnabled( + RewriterConfig::MemOptType mem_opt_type, + OptimizerOptions::GlobalJitLevel jit_level_in_session_opts) { // Disable the default memory optimizer when GlobalJitLevel is ON as it hurts // the XLA JIT performance. The (current) XLA clustering can result in loss of // concurrency between kernel compute and memory copies. As such, it usually // loses the concurrency needed to hide the latencies of the inserted swap-ins // and swap-outs and incurs great performance overhead. Remove this check when // the XLA JIT can better deal with the concurrency. + OptimizerOptions::GlobalJitLevel global_jit_level = + XlaConfigProxy::GetGlobalJitLevel(jit_level_in_session_opts); if (mem_opt_type == RewriterConfig::DEFAULT_MEM_OPT && (global_jit_level == OptimizerOptions::ON_1 || global_jit_level == OptimizerOptions::ON_2)) { diff --git a/tensorflow/core/util/xla_config_proxy.cc b/tensorflow/core/util/xla_config_proxy.cc new file mode 100644 index 00000000000..796b4b85bf1 --- /dev/null +++ b/tensorflow/core/util/xla_config_proxy.cc @@ -0,0 +1,29 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/util/xla_config_proxy.h" + +namespace tensorflow { + +/*static*/ +OptimizerOptions::GlobalJitLevel XlaConfigProxy::GetGlobalJitLevel( + OptimizerOptions::GlobalJitLevel jit_level_in_session_opts) { + OptimizerOptions::GlobalJitLevel global_jit_level = jit_level_in_session_opts; + ConfigSetterRegistry::Global()->Update( + jit_level_in_session_opts); + return global_jit_level; +} + +} // namespace tensorflow diff --git a/tensorflow/core/util/xla_config_proxy.h b/tensorflow/core/util/xla_config_proxy.h new file mode 100644 index 00000000000..3cd5146f3c2 --- /dev/null +++ b/tensorflow/core/util/xla_config_proxy.h @@ -0,0 +1,84 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_XLA_CONFIG_PROXY_H +#define TENSORFLOW_CORE_UTIL_XLA_CONFIG_PROXY_H + +#include +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/config.pb.h" + +namespace tensorflow { + +// A proxy class of XLA config. +class XlaConfigProxy { + public: + // This class provides a means for updating configs (mainly ConfigProto) + // according to runtime environment variable flags. As the flags may be + // maintained outside the Tensorflow core and they may or may not exist + // depending on the build configuration, here implements a registration + // mechanism to register their config setters for query. + template + class ConfigSetterRegistry { + public: + static ConfigSetterRegistry* Global() { + static ConfigSetterRegistry* config_registry = new ConfigSetterRegistry; + return config_registry; + } + + // Register a setter for ConfigType. + void Register(std::function setter) { + CHECK(!a_setter_); + a_setter_ = std::move(setter); + } + + // Invoke the registered setter to update the ConfigType value. Return true + // if the value is updated. 
+ bool Update(ConfigType& value) { + if (!a_setter_) { + return true; + } + return a_setter_(value); + } + + private: + ConfigSetterRegistry() = default; + ConfigSetterRegistry(const ConfigSetterRegistry&) = delete; + ConfigSetterRegistry& operator=(const ConfigSetterRegistry&) = delete; + + private: + std::function a_setter_; + }; + + template + class ConfigSetterRegistration { + public: + ConfigSetterRegistration(std::function setter) { + ConfigSetterRegistry::Global()->Register(std::move(setter)); + } + }; + + public: + static OptimizerOptions::GlobalJitLevel GetGlobalJitLevel( + OptimizerOptions::GlobalJitLevel jit_level_in_session_opts); +}; + +#define REGISTER_XLA_CONFIG_SETTER(ConfigType, setter) \ + static ::tensorflow::XlaConfigProxy::ConfigSetterRegistration \ + unique_xla_config_setter_ctr_##__COUNTER__(setter) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_XLA_CONFIG_PROXY_H From 348488b58902c7b71ae70bbba0cc805fd2a65092 Mon Sep 17 00:00:00 2001 From: Trent Lo Date: Mon, 9 Sep 2019 16:17:49 -0700 Subject: [PATCH 04/14] Add a unittest for XlaConfigProxy. --- tensorflow/core/BUILD | 1 + tensorflow/core/util/xla_config_proxy.h | 1 + tensorflow/core/util/xla_config_proxy_test.cc | 61 +++++++++++++++++++ 3 files changed, 63 insertions(+) create mode 100644 tensorflow/core/util/xla_config_proxy_test.cc diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 2ff04c2cf2b..b1baac5c4ef 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -4141,6 +4141,7 @@ tf_cc_tests( "util/tensor_slice_util_test.cc", "util/tensor_slice_writer_test.cc", "util/work_sharder_test.cc", + "util/xla_config_proxy_test.cc", ], linkopts = select({ "//tensorflow:macos": ["-headerpad_max_install_names"], diff --git a/tensorflow/core/util/xla_config_proxy.h b/tensorflow/core/util/xla_config_proxy.h index 3cd5146f3c2..d0caf2b8660 100644 --- a/tensorflow/core/util/xla_config_proxy.h +++ b/tensorflow/core/util/xla_config_proxy.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_CORE_UTIL_XLA_CONFIG_PROXY_H #include +#include "tensorflow/core/platform/default/logging.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/protobuf/config.pb.h" diff --git a/tensorflow/core/util/xla_config_proxy_test.cc b/tensorflow/core/util/xla_config_proxy_test.cc new file mode 100644 index 00000000000..848cff7e081 --- /dev/null +++ b/tensorflow/core/util/xla_config_proxy_test.cc @@ -0,0 +1,61 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/util/xla_config_proxy.h" + +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { + +namespace { + +enum ConfigA { A_ON, A_OFF }; + +enum ConfigB { B_ON, B_OFF }; + +}; // anonymous + +TEST(XlaConfigProxyTest, RegistrationsAndQueries) { + ConfigA config_a = A_ON; + ConfigB config_b = B_ON; + + XlaConfigProxy::ConfigSetterRegistry::Global()->Update(config_a); + XlaConfigProxy::ConfigSetterRegistry::Global()->Update(config_b); + + // Expect original values. + EXPECT_EQ(A_ON, config_a); + EXPECT_EQ(B_ON, config_b); + + // Register callbacks. + XlaConfigProxy::ConfigSetterRegistration + contrig_setter_registration_a([](ConfigA& config_a) -> bool { + config_a = A_OFF; + return true; + }); + + XlaConfigProxy::ConfigSetterRegistration + contrig_setter_registration_b([](ConfigB& config_b) -> bool { + config_b = B_OFF; + return true; + }); + + // Verify that callbacks work. + XlaConfigProxy::ConfigSetterRegistry::Global()->Update(config_a); + XlaConfigProxy::ConfigSetterRegistry::Global()->Update(config_b); + EXPECT_EQ(A_OFF, config_a); + EXPECT_EQ(B_OFF, config_b); +} + +} // namespace tensorflow From b77327e29523ef011ac9d8c3f73526245efd15f0 Mon Sep 17 00:00:00 2001 From: Trent Lo Date: Mon, 9 Sep 2019 16:54:29 -0700 Subject: [PATCH 05/14] Fix a typo in XlaConfigProxy. --- tensorflow/core/util/xla_config_proxy.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/util/xla_config_proxy.cc b/tensorflow/core/util/xla_config_proxy.cc index 796b4b85bf1..952ddb2f97c 100644 --- a/tensorflow/core/util/xla_config_proxy.cc +++ b/tensorflow/core/util/xla_config_proxy.cc @@ -22,7 +22,7 @@ OptimizerOptions::GlobalJitLevel XlaConfigProxy::GetGlobalJitLevel( OptimizerOptions::GlobalJitLevel jit_level_in_session_opts) { OptimizerOptions::GlobalJitLevel global_jit_level = jit_level_in_session_opts; ConfigSetterRegistry::Global()->Update( - jit_level_in_session_opts); + global_jit_level); return global_jit_level; } From 21d983f7617d2847a06ec387ae72df57a9438809 Mon Sep 17 00:00:00 2001 From: Trent Lo Date: Tue, 10 Sep 2019 21:42:35 -0700 Subject: [PATCH 06/14] Fix typo. --- tensorflow/core/util/xla_config_proxy_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/util/xla_config_proxy_test.cc b/tensorflow/core/util/xla_config_proxy_test.cc index 848cff7e081..e4ed8bcbb2b 100644 --- a/tensorflow/core/util/xla_config_proxy_test.cc +++ b/tensorflow/core/util/xla_config_proxy_test.cc @@ -40,13 +40,13 @@ TEST(XlaConfigProxyTest, RegistrationsAndQueries) { // Register callbacks. XlaConfigProxy::ConfigSetterRegistration - contrig_setter_registration_a([](ConfigA& config_a) -> bool { + config_setter_registration_a([](ConfigA& config_a) -> bool { config_a = A_OFF; return true; }); XlaConfigProxy::ConfigSetterRegistration - contrig_setter_registration_b([](ConfigB& config_b) -> bool { + config_setter_registration_b([](ConfigB& config_b) -> bool { config_b = B_OFF; return true; }); From 88572baddc066ac0d3c2f46b070c490ff89ac867 Mon Sep 17 00:00:00 2001 From: Trent Lo Date: Wed, 11 Sep 2019 11:13:14 -0700 Subject: [PATCH 07/14] Simplify XlaConfigProxy by removing the utililies. Also, rename XlaConfigProxy to XlaConfigRegistry. 
--- tensorflow/compiler/jit/xla_cluster_util.cc | 23 ++--- tensorflow/core/BUILD | 3 +- .../grappler/optimizers/meta_optimizer.cc | 4 +- tensorflow/core/util/xla_config_proxy.h | 85 ------------------- tensorflow/core/util/xla_config_proxy_test.cc | 61 ------------- ...config_proxy.cc => xla_config_registry.cc} | 11 +-- tensorflow/core/util/xla_config_registry.h | 59 +++++++++++++ 7 files changed, 74 insertions(+), 172 deletions(-) delete mode 100644 tensorflow/core/util/xla_config_proxy.h delete mode 100644 tensorflow/core/util/xla_config_proxy_test.cc rename tensorflow/core/util/{xla_config_proxy.cc => xla_config_registry.cc} (64%) create mode 100644 tensorflow/core/util/xla_config_registry.h diff --git a/tensorflow/compiler/jit/xla_cluster_util.cc b/tensorflow/compiler/jit/xla_cluster_util.cc index 42033d6bea1..6a0259f7b0e 100644 --- a/tensorflow/compiler/jit/xla_cluster_util.cc +++ b/tensorflow/compiler/jit/xla_cluster_util.cc @@ -31,7 +31,7 @@ limitations under the License. #include "tensorflow/core/graph/control_flow.h" #include "tensorflow/core/public/session_options.h" #include "tensorflow/core/util/device_name_utils.h" -#include "tensorflow/core/util/xla_config_proxy.h" +#include "tensorflow/core/util/xla_config_registry.h" namespace tensorflow { @@ -389,23 +389,18 @@ XlaAutoClusteringSummary GetXlaAutoClusteringSummary(const Graph& graph) { return result; } -// Implement and register a setter for OptimizerOptions::GlobalJitLevel, so that -// the Tensorflow core can query its value. -bool GlobalJitLevelSetter(OptimizerOptions::GlobalJitLevel& global_jit_level) { +// Implement and register a callback for querying GlobalJitLevel. +OptimizerOptions::GlobalJitLevel GlobalJitLevelGetter( + OptimizerOptions::GlobalJitLevel jit_level_in_session_opts) { XlaGlobalJitLevel xla_global_jit_level = - GetXlaGlobalJitLevel(global_jit_level); + GetXlaGlobalJitLevel(jit_level_in_session_opts); // Take the general flag to avoid the dependency on Tensorflow::Graph. - OptimizerOptions::GlobalJitLevel new_jit_level = xla_global_jit_level.general; + OptimizerOptions::GlobalJitLevel global_jit_level = + xla_global_jit_level.general; - if (new_jit_level != global_jit_level) { - global_jit_level = new_jit_level; - return true; - } else { - return false; - } + return global_jit_level; } -REGISTER_XLA_CONFIG_SETTER(OptimizerOptions::GlobalJitLevel, - GlobalJitLevelSetter); +REGISTER_XLA_CONFIG_GETTER(GlobalJitLevelGetter); } // namespace tensorflow diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index b1baac5c4ef..75f6d942dba 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -2805,7 +2805,7 @@ FRAMEWORK_INTERNAL_PUBLIC_HEADERS = [ "util/presized_cuckoo_map.h", "util/tensor_slice_set.h", "util/tensor_slice_util.h", - "util/xla_config_proxy.h", + "util/xla_config_registry.h", ] tf_cuda_library( @@ -4141,7 +4141,6 @@ tf_cc_tests( "util/tensor_slice_util_test.cc", "util/tensor_slice_writer_test.cc", "util/work_sharder_test.cc", - "util/xla_config_proxy_test.cc", ], linkopts = select({ "//tensorflow:macos": ["-headerpad_max_install_names"], diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index a80fb09cf0a..992c8e1e64c 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -51,7 +51,7 @@ limitations under the License. 
#include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/util/dump_graph.h" #include "tensorflow/core/util/ptr_util.h" -#include "tensorflow/core/util/xla_config_proxy.h" +#include "tensorflow/core/util/xla_config_registry.h" namespace tensorflow { namespace grappler { @@ -138,7 +138,7 @@ bool MemoryOptimizerEnabled( // and swap-outs and incurs great performance overhead. Remove this check when // the XLA JIT can better deal with the concurrency. OptimizerOptions::GlobalJitLevel global_jit_level = - XlaConfigProxy::GetGlobalJitLevel(jit_level_in_session_opts); + XlaConfigRegistry::GetGlobalJitLevel(jit_level_in_session_opts); if (mem_opt_type == RewriterConfig::DEFAULT_MEM_OPT && (global_jit_level == OptimizerOptions::ON_1 || global_jit_level == OptimizerOptions::ON_2)) { diff --git a/tensorflow/core/util/xla_config_proxy.h b/tensorflow/core/util/xla_config_proxy.h deleted file mode 100644 index d0caf2b8660..00000000000 --- a/tensorflow/core/util/xla_config_proxy.h +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_CORE_UTIL_XLA_CONFIG_PROXY_H -#define TENSORFLOW_CORE_UTIL_XLA_CONFIG_PROXY_H - -#include -#include "tensorflow/core/platform/default/logging.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow/core/protobuf/config.pb.h" - -namespace tensorflow { - -// A proxy class of XLA config. -class XlaConfigProxy { - public: - // This class provides a means for updating configs (mainly ConfigProto) - // according to runtime environment variable flags. As the flags may be - // maintained outside the Tensorflow core and they may or may not exist - // depending on the build configuration, here implements a registration - // mechanism to register their config setters for query. - template - class ConfigSetterRegistry { - public: - static ConfigSetterRegistry* Global() { - static ConfigSetterRegistry* config_registry = new ConfigSetterRegistry; - return config_registry; - } - - // Register a setter for ConfigType. - void Register(std::function setter) { - CHECK(!a_setter_); - a_setter_ = std::move(setter); - } - - // Invoke the registered setter to update the ConfigType value. Return true - // if the value is updated. 
- bool Update(ConfigType& value) { - if (!a_setter_) { - return true; - } - return a_setter_(value); - } - - private: - ConfigSetterRegistry() = default; - ConfigSetterRegistry(const ConfigSetterRegistry&) = delete; - ConfigSetterRegistry& operator=(const ConfigSetterRegistry&) = delete; - - private: - std::function a_setter_; - }; - - template - class ConfigSetterRegistration { - public: - ConfigSetterRegistration(std::function setter) { - ConfigSetterRegistry::Global()->Register(std::move(setter)); - } - }; - - public: - static OptimizerOptions::GlobalJitLevel GetGlobalJitLevel( - OptimizerOptions::GlobalJitLevel jit_level_in_session_opts); -}; - -#define REGISTER_XLA_CONFIG_SETTER(ConfigType, setter) \ - static ::tensorflow::XlaConfigProxy::ConfigSetterRegistration \ - unique_xla_config_setter_ctr_##__COUNTER__(setter) - -} // namespace tensorflow - -#endif // TENSORFLOW_CORE_UTIL_XLA_CONFIG_PROXY_H diff --git a/tensorflow/core/util/xla_config_proxy_test.cc b/tensorflow/core/util/xla_config_proxy_test.cc deleted file mode 100644 index e4ed8bcbb2b..00000000000 --- a/tensorflow/core/util/xla_config_proxy_test.cc +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/util/xla_config_proxy.h" - -#include "tensorflow/core/platform/test.h" - -namespace tensorflow { - -namespace { - -enum ConfigA { A_ON, A_OFF }; - -enum ConfigB { B_ON, B_OFF }; - -}; // anonymous - -TEST(XlaConfigProxyTest, RegistrationsAndQueries) { - ConfigA config_a = A_ON; - ConfigB config_b = B_ON; - - XlaConfigProxy::ConfigSetterRegistry::Global()->Update(config_a); - XlaConfigProxy::ConfigSetterRegistry::Global()->Update(config_b); - - // Expect original values. - EXPECT_EQ(A_ON, config_a); - EXPECT_EQ(B_ON, config_b); - - // Register callbacks. - XlaConfigProxy::ConfigSetterRegistration - config_setter_registration_a([](ConfigA& config_a) -> bool { - config_a = A_OFF; - return true; - }); - - XlaConfigProxy::ConfigSetterRegistration - config_setter_registration_b([](ConfigB& config_b) -> bool { - config_b = B_OFF; - return true; - }); - - // Verify that callbacks work. - XlaConfigProxy::ConfigSetterRegistry::Global()->Update(config_a); - XlaConfigProxy::ConfigSetterRegistry::Global()->Update(config_b); - EXPECT_EQ(A_OFF, config_a); - EXPECT_EQ(B_OFF, config_b); -} - -} // namespace tensorflow diff --git a/tensorflow/core/util/xla_config_proxy.cc b/tensorflow/core/util/xla_config_registry.cc similarity index 64% rename from tensorflow/core/util/xla_config_proxy.cc rename to tensorflow/core/util/xla_config_registry.cc index 952ddb2f97c..aa1a8bf127c 100644 --- a/tensorflow/core/util/xla_config_proxy.cc +++ b/tensorflow/core/util/xla_config_registry.cc @@ -13,17 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/core/util/xla_config_proxy.h" +#include "tensorflow/core/util/xla_config_registry.h" namespace tensorflow { /*static*/ -OptimizerOptions::GlobalJitLevel XlaConfigProxy::GetGlobalJitLevel( - OptimizerOptions::GlobalJitLevel jit_level_in_session_opts) { - OptimizerOptions::GlobalJitLevel global_jit_level = jit_level_in_session_opts; - ConfigSetterRegistry::Global()->Update( - global_jit_level); - return global_jit_level; -} +XlaConfigRegistry::global_jit_level_getter_t + XlaConfigRegistry::global_jit_level_getter_; } // namespace tensorflow diff --git a/tensorflow/core/util/xla_config_registry.h b/tensorflow/core/util/xla_config_registry.h new file mode 100644 index 00000000000..2835cee6c06 --- /dev/null +++ b/tensorflow/core/util/xla_config_registry.h @@ -0,0 +1,59 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_XLA_CONFIG_PROXY_H +#define TENSORFLOW_CORE_UTIL_XLA_CONFIG_PROXY_H + +#include +#include "tensorflow/core/platform/default/logging.h" +#include "tensorflow/core/protobuf/config.pb.h" + +namespace tensorflow { + +// A registry class where XLA can register callbacks for Tensorflow to query +// its status. +class XlaConfigRegistry { + public: + // Input is jit_level in session config, and return is the config from + // XLA, reflecting the effect of the environment variable flags. + typedef std::function + global_jit_level_getter_t; + + static bool Register(XlaConfigRegistry::global_jit_level_getter_t getter) { + CHECK(!global_jit_level_getter_); + global_jit_level_getter_ = std::move(getter); + return true; + } + + static OptimizerOptions::GlobalJitLevel GetGlobalJitLevel( + OptimizerOptions::GlobalJitLevel jit_level_in_session_opts) { + if (!global_jit_level_getter_) { + return jit_level_in_session_opts; + } + return global_jit_level_getter_(jit_level_in_session_opts); + } + + private: + static global_jit_level_getter_t global_jit_level_getter_; +}; + +#define REGISTER_XLA_CONFIG_GETTER(getter) \ + static bool registered_##__COUNTER__ = \ + ::tensorflow::XlaConfigRegistry::Register(getter) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_UTIL_XLA_CONFIG_PROXY_H From 11e4bbee8d780a71b431c2fca3b15a2162afa5d1 Mon Sep 17 00:00:00 2001 From: Trent Lo Date: Wed, 11 Sep 2019 12:44:11 -0700 Subject: [PATCH 08/14] Minor revision of header macros in xla_config_registry.h. --- tensorflow/core/util/xla_config_registry.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/util/xla_config_registry.h b/tensorflow/core/util/xla_config_registry.h index 2835cee6c06..17c1b27b10a 100644 --- a/tensorflow/core/util/xla_config_registry.h +++ b/tensorflow/core/util/xla_config_registry.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_CORE_UTIL_XLA_CONFIG_PROXY_H -#define TENSORFLOW_CORE_UTIL_XLA_CONFIG_PROXY_H +#ifndef TENSORFLOW_CORE_UTIL_XLA_CONFIG_REGISTRY_H +#define TENSORFLOW_CORE_UTIL_XLA_CONFIG_REGISTRY_H #include #include "tensorflow/core/platform/default/logging.h" @@ -56,4 +56,4 @@ class XlaConfigRegistry { } // namespace tensorflow -#endif // TENSORFLOW_CORE_UTIL_XLA_CONFIG_PROXY_H +#endif // TENSORFLOW_CORE_UTIL_XLA_CONFIG_REGISTRY_H From 9abe61a0bb18fe77a6cbf24fec4e587867f4503d Mon Sep 17 00:00:00 2001 From: Trent Lo Date: Wed, 11 Sep 2019 22:54:14 -0700 Subject: [PATCH 09/14] Revise XlaConfigRegistry. 1. Return XlaGlobalJitLevel instead of GlobalJitLevel in GetGlobalJitLevel(). 2. Turn off the memory optimizer only when JIT is on for both single-gpu and multi-gpu. This is a more conservative behavior. 3. Naming style conformance. --- tensorflow/compiler/jit/xla_cluster_util.cc | 20 +++-------------- .../grappler/optimizers/meta_optimizer.cc | 19 ++++++++++++---- tensorflow/core/util/xla_config_registry.cc | 2 +- tensorflow/core/util/xla_config_registry.h | 22 +++++++++++-------- 4 files changed, 32 insertions(+), 31 deletions(-) diff --git a/tensorflow/compiler/jit/xla_cluster_util.cc b/tensorflow/compiler/jit/xla_cluster_util.cc index 6a0259f7b0e..73e0ccf63bb 100644 --- a/tensorflow/compiler/jit/xla_cluster_util.cc +++ b/tensorflow/compiler/jit/xla_cluster_util.cc @@ -219,10 +219,7 @@ void RemoveFromXlaCluster(NodeDef* node_def) { void RemoveFromXlaCluster(Node* node) { node->ClearAttr(kXlaClusterAttr); } namespace { -struct XlaGlobalJitLevel { - OptimizerOptions::GlobalJitLevel single_gpu; - OptimizerOptions::GlobalJitLevel general; -}; +typedef XlaConfigRegistry::XlaGlobalJitLevel XlaGlobalJitLevel; XlaGlobalJitLevel GetXlaGlobalJitLevel( const OptimizerOptions::GlobalJitLevel& jit_level_in_session_opts) { @@ -389,18 +386,7 @@ XlaAutoClusteringSummary GetXlaAutoClusteringSummary(const Graph& graph) { return result; } -// Implement and register a callback for querying GlobalJitLevel. -OptimizerOptions::GlobalJitLevel GlobalJitLevelGetter( - OptimizerOptions::GlobalJitLevel jit_level_in_session_opts) { - XlaGlobalJitLevel xla_global_jit_level = - GetXlaGlobalJitLevel(jit_level_in_session_opts); - // Take the general flag to avoid the dependency on Tensorflow::Graph. - OptimizerOptions::GlobalJitLevel global_jit_level = - xla_global_jit_level.general; - - return global_jit_level; -} - -REGISTER_XLA_CONFIG_GETTER(GlobalJitLevelGetter); +// Register a callback for querying XlaGlobalJitLevel. +REGISTER_XLA_CONFIG_GETTER(GetXlaGlobalJitLevel); } // namespace tensorflow diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index 992c8e1e64c..4a071e2e6e2 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -127,6 +127,20 @@ bool AutoMixedPrecisionEnabled(RewriterConfig::Toggle opt_level) { return false; } +bool IsXlaGlobalJitOn( + const OptimizerOptions::GlobalJitLevel& jit_level_in_session_opts) { + XlaConfigRegistry::XlaGlobalJitLevel xla_global_jit_level = + XlaConfigRegistry::GetGlobalJitLevel(jit_level_in_session_opts); + // Return true only if XLA JIT is ON for both single-gpu and multi-gpu + // graphs. This is a conservative approach that turns off the memory optimizer + // when we are sure that all graphs will be processed by XLA JIT. 
+ bool is_on = (xla_global_jit_level.single_gpu == OptimizerOptions::ON_1 || + xla_global_jit_level.single_gpu == OptimizerOptions::ON_2) && + (xla_global_jit_level.general == OptimizerOptions::ON_1 || + xla_global_jit_level.general == OptimizerOptions::ON_2); + return is_on; +} + // A helper function to decide whether to enable the memory optimizer. bool MemoryOptimizerEnabled( RewriterConfig::MemOptType mem_opt_type, @@ -137,11 +151,8 @@ bool MemoryOptimizerEnabled( // loses the concurrency needed to hide the latencies of the inserted swap-ins // and swap-outs and incurs great performance overhead. Remove this check when // the XLA JIT can better deal with the concurrency. - OptimizerOptions::GlobalJitLevel global_jit_level = - XlaConfigRegistry::GetGlobalJitLevel(jit_level_in_session_opts); if (mem_opt_type == RewriterConfig::DEFAULT_MEM_OPT && - (global_jit_level == OptimizerOptions::ON_1 || - global_jit_level == OptimizerOptions::ON_2)) { + IsXlaGlobalJitOn(jit_level_in_session_opts)) { return false; } diff --git a/tensorflow/core/util/xla_config_registry.cc b/tensorflow/core/util/xla_config_registry.cc index aa1a8bf127c..1297265a89d 100644 --- a/tensorflow/core/util/xla_config_registry.cc +++ b/tensorflow/core/util/xla_config_registry.cc @@ -18,7 +18,7 @@ limitations under the License. namespace tensorflow { /*static*/ -XlaConfigRegistry::global_jit_level_getter_t +XlaConfigRegistry::GlobalJitLevelGetterTy XlaConfigRegistry::global_jit_level_getter_; } // namespace tensorflow diff --git a/tensorflow/core/util/xla_config_registry.h b/tensorflow/core/util/xla_config_registry.h index 17c1b27b10a..99bcff1e69c 100644 --- a/tensorflow/core/util/xla_config_registry.h +++ b/tensorflow/core/util/xla_config_registry.h @@ -26,33 +26,37 @@ namespace tensorflow { // its status. class XlaConfigRegistry { public: + struct XlaGlobalJitLevel { + OptimizerOptions::GlobalJitLevel single_gpu; + OptimizerOptions::GlobalJitLevel general; + }; + // Input is jit_level in session config, and return is the config from // XLA, reflecting the effect of the environment variable flags. - typedef std::function - global_jit_level_getter_t; + typedef std::function + GlobalJitLevelGetterTy; - static bool Register(XlaConfigRegistry::global_jit_level_getter_t getter) { + static void Register(XlaConfigRegistry::GlobalJitLevelGetterTy getter) { CHECK(!global_jit_level_getter_); global_jit_level_getter_ = std::move(getter); - return true; } - static OptimizerOptions::GlobalJitLevel GetGlobalJitLevel( + static XlaGlobalJitLevel GetGlobalJitLevel( OptimizerOptions::GlobalJitLevel jit_level_in_session_opts) { if (!global_jit_level_getter_) { - return jit_level_in_session_opts; + return {jit_level_in_session_opts, jit_level_in_session_opts}; } return global_jit_level_getter_(jit_level_in_session_opts); } private: - static global_jit_level_getter_t global_jit_level_getter_; + static GlobalJitLevelGetterTy global_jit_level_getter_; }; #define REGISTER_XLA_CONFIG_GETTER(getter) \ static bool registered_##__COUNTER__ = \ - ::tensorflow::XlaConfigRegistry::Register(getter) + (::tensorflow::XlaConfigRegistry::Register(getter), true) } // namespace tensorflow From 98c9fbdd1b2e6cd74bf60dfadcde4970ad13d106 Mon Sep 17 00:00:00 2001 From: Trent Lo Date: Wed, 11 Sep 2019 23:26:12 -0700 Subject: [PATCH 10/14] Minor comment polishing. 
--- tensorflow/core/grappler/optimizers/meta_optimizer.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index 4a071e2e6e2..c01818e370d 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -145,8 +145,8 @@ bool IsXlaGlobalJitOn( bool MemoryOptimizerEnabled( RewriterConfig::MemOptType mem_opt_type, OptimizerOptions::GlobalJitLevel jit_level_in_session_opts) { - // Disable the default memory optimizer when GlobalJitLevel is ON as it hurts - // the XLA JIT performance. The (current) XLA clustering can result in loss of + // Disable the default memory optimizer when XLA JIT is ON as it hurts the + // XLA JIT performance. The (current) XLA clustering can result in loss of // concurrency between kernel compute and memory copies. As such, it usually // loses the concurrency needed to hide the latencies of the inserted swap-ins // and swap-outs and incurs great performance overhead. Remove this check when From 2d229cd7f50746a3747f2b7bba36c7bfe9f5e1c3 Mon Sep 17 00:00:00 2001 From: Trent Lo Date: Thu, 12 Sep 2019 11:04:07 -0700 Subject: [PATCH 11/14] Address review comments about XlaConfigRegistry. 1. Fix an oversight that the macro does not get expanded. 2. Add a mutex to the Register() method. 3. Comment polishing. --- tensorflow/core/util/xla_config_registry.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/util/xla_config_registry.h b/tensorflow/core/util/xla_config_registry.h index 99bcff1e69c..7d1cb50225b 100644 --- a/tensorflow/core/util/xla_config_registry.h +++ b/tensorflow/core/util/xla_config_registry.h @@ -18,6 +18,7 @@ limitations under the License. #include #include "tensorflow/core/platform/default/logging.h" +#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/protobuf/config.pb.h" namespace tensorflow { @@ -26,18 +27,22 @@ namespace tensorflow { // its status. class XlaConfigRegistry { public: + // XlaGlobalJitLevel is used by XLA to expose its JIT level for processing + // single gpu and general (multi-gpu) graphs. struct XlaGlobalJitLevel { OptimizerOptions::GlobalJitLevel single_gpu; OptimizerOptions::GlobalJitLevel general; }; - // Input is jit_level in session config, and return is the config from - // XLA, reflecting the effect of the environment variable flags. + // Input is the jit_level in session config, and return value is the jit_level + // from XLA, reflecting the effect of the environment variable flags. 
typedef std::function GlobalJitLevelGetterTy; static void Register(XlaConfigRegistry::GlobalJitLevelGetterTy getter) { + static mutex mu(LINKER_INITIALIZED); + mutex_lock l(mu); CHECK(!global_jit_level_getter_); global_jit_level_getter_ = std::move(getter); } @@ -55,7 +60,13 @@ class XlaConfigRegistry { }; #define REGISTER_XLA_CONFIG_GETTER(getter) \ - static bool registered_##__COUNTER__ = \ + REGISTER_XLA_CONFIG_GETTER_UNIQ_HELPER(__COUNTER__, getter) + +#define REGISTER_XLA_CONFIG_GETTER_UNIQ_HELPER(ctr, getter) \ + REGISTER_XLA_CONFIG_GETTER_UNIQ(ctr, getter) + +#define REGISTER_XLA_CONFIG_GETTER_UNIQ(ctr, getter) \ + static bool xla_config_registry_registration_##ctr = \ (::tensorflow::XlaConfigRegistry::Register(getter), true) } // namespace tensorflow From a908920e93e194f4789df1c4e174d166ec605353 Mon Sep 17 00:00:00 2001 From: Trent Lo Date: Thu, 12 Sep 2019 11:45:57 -0700 Subject: [PATCH 12/14] Guard both getter and setter in XlaConfigRegistry. --- tensorflow/core/util/xla_config_registry.cc | 3 +++ tensorflow/core/util/xla_config_registry.h | 7 ++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/util/xla_config_registry.cc b/tensorflow/core/util/xla_config_registry.cc index 1297265a89d..3028845895c 100644 --- a/tensorflow/core/util/xla_config_registry.cc +++ b/tensorflow/core/util/xla_config_registry.cc @@ -17,6 +17,9 @@ limitations under the License. namespace tensorflow { +/*static*/ +mutex XlaConfigRegistry::mu_(LINKER_INITIALIZED); + /*static*/ XlaConfigRegistry::GlobalJitLevelGetterTy XlaConfigRegistry::global_jit_level_getter_; diff --git a/tensorflow/core/util/xla_config_registry.h b/tensorflow/core/util/xla_config_registry.h index 7d1cb50225b..b947396e9c1 100644 --- a/tensorflow/core/util/xla_config_registry.h +++ b/tensorflow/core/util/xla_config_registry.h @@ -41,14 +41,14 @@ class XlaConfigRegistry { GlobalJitLevelGetterTy; static void Register(XlaConfigRegistry::GlobalJitLevelGetterTy getter) { - static mutex mu(LINKER_INITIALIZED); - mutex_lock l(mu); + mutex_lock l(mu_); CHECK(!global_jit_level_getter_); global_jit_level_getter_ = std::move(getter); } static XlaGlobalJitLevel GetGlobalJitLevel( OptimizerOptions::GlobalJitLevel jit_level_in_session_opts) { + mutex_lock l(mu_); if (!global_jit_level_getter_) { return {jit_level_in_session_opts, jit_level_in_session_opts}; } @@ -56,7 +56,8 @@ class XlaConfigRegistry { } private: - static GlobalJitLevelGetterTy global_jit_level_getter_; + static mutex mu_; + static GlobalJitLevelGetterTy global_jit_level_getter_ GUARDED_BY(mu_); }; #define REGISTER_XLA_CONFIG_GETTER(getter) \ From 1d997791dcca369ebf1d823c3133d8bad5cfadb5 Mon Sep 17 00:00:00 2001 From: Trent Lo Date: Thu, 12 Sep 2019 15:52:11 -0700 Subject: [PATCH 13/14] Use `new` instead of static object to avoid destructor. By avoiding destructor, it should work better with multi-threaded program. 
--- tensorflow/compiler/jit/xla_cluster_util.cc | 2 +- .../grappler/optimizers/meta_optimizer.cc | 4 +- tensorflow/core/util/xla_config_registry.cc | 36 +++++++++-- tensorflow/core/util/xla_config_registry.h | 61 ++++++++----------- 4 files changed, 58 insertions(+), 45 deletions(-) diff --git a/tensorflow/compiler/jit/xla_cluster_util.cc b/tensorflow/compiler/jit/xla_cluster_util.cc index 73e0ccf63bb..3863bcf3131 100644 --- a/tensorflow/compiler/jit/xla_cluster_util.cc +++ b/tensorflow/compiler/jit/xla_cluster_util.cc @@ -219,7 +219,7 @@ void RemoveFromXlaCluster(NodeDef* node_def) { void RemoveFromXlaCluster(Node* node) { node->ClearAttr(kXlaClusterAttr); } namespace { -typedef XlaConfigRegistry::XlaGlobalJitLevel XlaGlobalJitLevel; +typedef xla_config_registry::XlaGlobalJitLevel XlaGlobalJitLevel; XlaGlobalJitLevel GetXlaGlobalJitLevel( const OptimizerOptions::GlobalJitLevel& jit_level_in_session_opts) { diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index c01818e370d..f0ded3b635e 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -129,8 +129,8 @@ bool AutoMixedPrecisionEnabled(RewriterConfig::Toggle opt_level) { bool IsXlaGlobalJitOn( const OptimizerOptions::GlobalJitLevel& jit_level_in_session_opts) { - XlaConfigRegistry::XlaGlobalJitLevel xla_global_jit_level = - XlaConfigRegistry::GetGlobalJitLevel(jit_level_in_session_opts); + xla_config_registry::XlaGlobalJitLevel xla_global_jit_level = + xla_config_registry::GetGlobalJitLevel(jit_level_in_session_opts); // Return true only if XLA JIT is ON for both single-gpu and multi-gpu // graphs. This is a conservative approach that turns off the memory optimizer // when we are sure that all graphs will be processed by XLA JIT. diff --git a/tensorflow/core/util/xla_config_registry.cc b/tensorflow/core/util/xla_config_registry.cc index 3028845895c..a3270620c02 100644 --- a/tensorflow/core/util/xla_config_registry.cc +++ b/tensorflow/core/util/xla_config_registry.cc @@ -17,11 +17,37 @@ limitations under the License. namespace tensorflow { -/*static*/ -mutex XlaConfigRegistry::mu_(LINKER_INITIALIZED); +namespace xla_config_registry { -/*static*/ -XlaConfigRegistry::GlobalJitLevelGetterTy - XlaConfigRegistry::global_jit_level_getter_; +namespace { +struct GlobalJitLevelState { + mutex mu; + GlobalJitLevelGetterTy getter; +}; + +GlobalJitLevelState* GetSingletonState() { + static GlobalJitLevelState* state = new GlobalJitLevelState; + return state; +} +} // anonymous + +void RegisterGlobalJitLevelGetter(GlobalJitLevelGetterTy getter) { + GlobalJitLevelState* state = GetSingletonState(); + mutex_lock l(state->mu); + CHECK(!state->getter); + state->getter = std::move(getter); +} + +XlaGlobalJitLevel GetGlobalJitLevel( + OptimizerOptions::GlobalJitLevel jit_level_in_session_opts) { + GlobalJitLevelState* state = GetSingletonState(); + mutex_lock l(state->mu); + if (!state->getter) { + return {jit_level_in_session_opts, jit_level_in_session_opts}; + } + return state->getter(jit_level_in_session_opts); +} + +} // namespace xla_config_registry } // namespace tensorflow diff --git a/tensorflow/core/util/xla_config_registry.h b/tensorflow/core/util/xla_config_registry.h index b947396e9c1..a7ad8d9fb61 100644 --- a/tensorflow/core/util/xla_config_registry.h +++ b/tensorflow/core/util/xla_config_registry.h @@ -23,52 +23,39 @@ limitations under the License. 
namespace tensorflow { -// A registry class where XLA can register callbacks for Tensorflow to query -// its status. -class XlaConfigRegistry { - public: - // XlaGlobalJitLevel is used by XLA to expose its JIT level for processing - // single gpu and general (multi-gpu) graphs. - struct XlaGlobalJitLevel { - OptimizerOptions::GlobalJitLevel single_gpu; - OptimizerOptions::GlobalJitLevel general; - }; +namespace xla_config_registry { - // Input is the jit_level in session config, and return value is the jit_level - // from XLA, reflecting the effect of the environment variable flags. - typedef std::function - GlobalJitLevelGetterTy; - - static void Register(XlaConfigRegistry::GlobalJitLevelGetterTy getter) { - mutex_lock l(mu_); - CHECK(!global_jit_level_getter_); - global_jit_level_getter_ = std::move(getter); - } - - static XlaGlobalJitLevel GetGlobalJitLevel( - OptimizerOptions::GlobalJitLevel jit_level_in_session_opts) { - mutex_lock l(mu_); - if (!global_jit_level_getter_) { - return {jit_level_in_session_opts, jit_level_in_session_opts}; - } - return global_jit_level_getter_(jit_level_in_session_opts); - } - - private: - static mutex mu_; - static GlobalJitLevelGetterTy global_jit_level_getter_ GUARDED_BY(mu_); +// XlaGlobalJitLevel is used by XLA to expose its JIT level for processing +// single gpu and general (multi-gpu) graphs. +struct XlaGlobalJitLevel { + OptimizerOptions::GlobalJitLevel single_gpu; + OptimizerOptions::GlobalJitLevel general; }; +// Input is the jit_level in session config, and return value is the jit_level +// from XLA, reflecting the effect of the environment variable flags. +typedef std::function + GlobalJitLevelGetterTy; + +void RegisterGlobalJitLevelGetter(GlobalJitLevelGetterTy getter); + +XlaGlobalJitLevel GetGlobalJitLevel( + OptimizerOptions::GlobalJitLevel jit_level_in_session_opts); + #define REGISTER_XLA_CONFIG_GETTER(getter) \ REGISTER_XLA_CONFIG_GETTER_UNIQ_HELPER(__COUNTER__, getter) #define REGISTER_XLA_CONFIG_GETTER_UNIQ_HELPER(ctr, getter) \ REGISTER_XLA_CONFIG_GETTER_UNIQ(ctr, getter) -#define REGISTER_XLA_CONFIG_GETTER_UNIQ(ctr, getter) \ - static bool xla_config_registry_registration_##ctr = \ - (::tensorflow::XlaConfigRegistry::Register(getter), true) +#define REGISTER_XLA_CONFIG_GETTER_UNIQ(ctr, getter) \ + static bool xla_config_registry_registration_##ctr = \ + (::tensorflow::xla_config_registry::RegisterGlobalJitLevelGetter( \ + getter), \ + true) + +} // namespace xla_config_registry } // namespace tensorflow From ebfad012e6bf73333003996c783461e37da2556e Mon Sep 17 00:00:00 2001 From: Trent Lo Date: Thu, 12 Sep 2019 16:37:19 -0700 Subject: [PATCH 14/14] Add GUARDED_BY for GlobalJitLevelState. --- tensorflow/core/util/xla_config_registry.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/util/xla_config_registry.cc b/tensorflow/core/util/xla_config_registry.cc index a3270620c02..6666f192098 100644 --- a/tensorflow/core/util/xla_config_registry.cc +++ b/tensorflow/core/util/xla_config_registry.cc @@ -22,7 +22,7 @@ namespace xla_config_registry { namespace { struct GlobalJitLevelState { mutex mu; - GlobalJitLevelGetterTy getter; + GlobalJitLevelGetterTy getter GUARDED_BY(mu); }; GlobalJitLevelState* GetSingletonState() {
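
For readers following the series, here is a minimal, self-contained sketch of the pattern the patches converge on by PATCH 13/14: a heap-allocated, mutex-guarded singleton holding a getter callback, a two-level __COUNTER__ registration macro, and a query that falls back to the session-level value when no getter is registered. This is not the TensorFlow code itself; the simplified enum, the names State and FakeGetXlaGlobalJitLevel, and the use of std::mutex/assert in place of TensorFlow's mutex, GUARDED_BY, and CHECK are stand-ins invented for illustration.

#include <cassert>
#include <functional>
#include <iostream>
#include <mutex>
#include <utility>

namespace xla_config_registry_sketch {

// Stand-in for OptimizerOptions::GlobalJitLevel.
enum class GlobalJitLevel { kDefault, kOff, kOn1, kOn2 };

// What the "XLA side" exposes: separate JIT levels for single-GPU and
// general (multi-GPU) graphs, mirroring XlaGlobalJitLevel in the patches.
struct XlaGlobalJitLevel {
  GlobalJitLevel single_gpu;
  GlobalJitLevel general;
};

using GlobalJitLevelGetter = std::function<XlaGlobalJitLevel(GlobalJitLevel)>;

namespace {
// Heap-allocated singleton that is intentionally never deleted, so no
// destructor runs at process exit (the rationale given in PATCH 13/14).
struct State {
  std::mutex mu;
  GlobalJitLevelGetter getter;  // guarded by mu
};

State* GetSingletonState() {
  static State* state = new State;
  return state;
}
}  // namespace

// Called once by the "XLA side" via a static initializer.
void RegisterGlobalJitLevelGetter(GlobalJitLevelGetter getter) {
  State* state = GetSingletonState();
  std::lock_guard<std::mutex> lock(state->mu);
  assert(!state->getter && "getter registered twice");
  state->getter = std::move(getter);
}

// Called by the "core side"; falls back to the session-config value when
// nothing was registered (e.g. XLA is not linked into the binary).
XlaGlobalJitLevel GetGlobalJitLevel(GlobalJitLevel jit_level_in_session_opts) {
  State* state = GetSingletonState();
  std::lock_guard<std::mutex> lock(state->mu);
  if (!state->getter) {
    return {jit_level_in_session_opts, jit_level_in_session_opts};
  }
  return state->getter(jit_level_in_session_opts);
}

}  // namespace xla_config_registry_sketch

// Two-level expansion so __COUNTER__ is pasted as its value, the oversight
// fixed in PATCH 11.
#define REGISTER_XLA_CONFIG_GETTER(getter) \
  REGISTER_XLA_CONFIG_GETTER_UNIQ_HELPER(__COUNTER__, getter)
#define REGISTER_XLA_CONFIG_GETTER_UNIQ_HELPER(ctr, getter) \
  REGISTER_XLA_CONFIG_GETTER_UNIQ(ctr, getter)
#define REGISTER_XLA_CONFIG_GETTER_UNIQ(ctr, getter)                        \
  static bool xla_config_registry_registration_##ctr =                     \
      (::xla_config_registry_sketch::RegisterGlobalJitLevelGetter(getter), \
       true)

namespace {
using xla_config_registry_sketch::GlobalJitLevel;
using xla_config_registry_sketch::XlaGlobalJitLevel;

// "XLA side": treat DEFAULT as OFF, pass explicit levels through.
XlaGlobalJitLevel FakeGetXlaGlobalJitLevel(GlobalJitLevel session_level) {
  if (session_level == GlobalJitLevel::kDefault) {
    return {GlobalJitLevel::kOff, GlobalJitLevel::kOff};
  }
  return {session_level, session_level};
}
REGISTER_XLA_CONFIG_GETTER(FakeGetXlaGlobalJitLevel);

// "Grappler side": report the JIT as on only when it is on for both the
// single-GPU and general cases, the conservative check from PATCH 09.
bool IsXlaGlobalJitOn(GlobalJitLevel session_level) {
  XlaGlobalJitLevel level =
      xla_config_registry_sketch::GetGlobalJitLevel(session_level);
  auto on = [](GlobalJitLevel l) {
    return l == GlobalJitLevel::kOn1 || l == GlobalJitLevel::kOn2;
  };
  return on(level.single_gpu) && on(level.general);
}
}  // namespace

int main() {
  std::cout << IsXlaGlobalJitOn(GlobalJitLevel::kOn1) << "\n";     // 1
  std::cout << IsXlaGlobalJitOn(GlobalJitLevel::kDefault) << "\n"; // 0
}

The split between a free function plus macro (rather than the earlier XlaConfigProxy/XlaConfigRegistry classes) and the leaked singleton are the two design choices the review converged on: the callback lives outside any class with static storage-duration members, and nothing is destroyed at shutdown, so late readers in a multi-threaded process never observe a torn-down registry.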