Add mixed precision Grappler optimizer

- Adds an opt-in Grappler pass to convert parts of a graph from float32 to float16 Co-authored-by: Ben Barsdell <bbarsdell@nvidia.com> Co-authored-by: Carl Case <carlc@nvidia.com> Co-authored-by: Trent Lo <trentl@nvidia.com> Co-authored-by: Nathan Luehr <nluehr@nvidia.com>
2019-03-04 17:59:12 -08:00 · 2019-03-04 17:59:12 -08:00 · 30d87b5299
commit 30d87b5299
parent 6a883e7f8d
11 changed files with 3203 additions and 10 deletions
--- a/tensorflow/core/grappler/devices.cc
+++ b/tensorflow/core/grappler/devices.cc
@ -27,7 +27,8 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {

-int GetNumAvailableGPUs() {
+int GetNumAvailableGPUs(
+    const std::pair<int, int>& min_cuda_compute_capability) {
  int num_eligible_gpus = 0;
 #if GOOGLE_CUDA
  if (ValidateGPUMachineManager().ok()) {
@ -39,20 +40,29 @@ int GetNumAvailableGPUs() {
        if (exec_status.ok()) {
          se::StreamExecutor* se = exec_status.ValueOrDie();
          const se::DeviceDescription& desc = se->GetDeviceDescription();
+          int cc_major = 0;
+          int cc_minor = 0;
+          desc.cuda_compute_capability(&cc_major, &cc_minor);
+          std::pair<int, int> cuda_compute_capability(cc_major, cc_minor);
          int min_gpu_core_count = 8;
-          if (desc.core_count() >= min_gpu_core_count) {
+          if (desc.core_count() >= min_gpu_core_count &&
+              cuda_compute_capability >= min_cuda_compute_capability) {
            num_eligible_gpus++;
          }
        }
      }
    }
  }
-  LOG(INFO) << "Number of eligible GPUs (core count >= 8): "
-            << num_eligible_gpus;
+  LOG(INFO)
+      << "Number of eligible GPUs (core count >= 8, compute capability >= "
+      << min_cuda_compute_capability.first << "."
+      << min_cuda_compute_capability.second << "): " << num_eligible_gpus;
 #else
-  LOG(INFO) << "Number of eligible GPUs (core count >= 8): "
-            << num_eligible_gpus
-            << " (Note: TensorFlow was not compiled with CUDA support)";
+  LOG(INFO)
+      << "Number of eligible GPUs (core count >= 8, compute capability >= "
+      << min_cuda_compute_capability.first << "."
+      << min_cuda_compute_capability.second << "): " << num_eligible_gpus
+      << " (Note: TensorFlow was not compiled with CUDA support)";
 #endif  // GOOGLE_CUDA
  return num_eligible_gpus;
 }
--- a/tensorflow/core/grappler/devices.h
+++ b/tensorflow/core/grappler/devices.h
@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_CORE_GRAPPLER_DEVICES_H_

 #include <functional>
+#include <utility>

 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/threadpool.h"
@ -26,8 +27,10 @@ namespace tensorflow {
 namespace grappler {

 // Get the number of available GPUs whose number of multiprocessors is no less
-// than 8.
-int GetNumAvailableGPUs();
+// than 8 and whose CUDA compute capability is no less than
+// min_cuda_compute_capability.
+int GetNumAvailableGPUs(
+    const std::pair<int, int>& min_cuda_compute_capability = {0, 0});

 // Maximum amount of gpu memory available per gpu. gpu_id must be in the range
 // [0, num_available_gpu)
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@ -523,6 +523,46 @@ tf_cuda_cc_test(
    ],
 )

+cc_library(
+    name = "auto_mixed_precision",
+    srcs = ["auto_mixed_precision.cc"],
+    hdrs = [
+        "auto_mixed_precision.h",
+        "auto_mixed_precision_lists.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":custom_graph_optimizer_registry",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:devices",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:mutable_graph_view",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/costs:virtual_placer",
+        "//tensorflow/core/grappler/utils:frame",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "auto_mixed_precision_test",
+    srcs = ["auto_mixed_precision_test.cc"],
+    deps = [
+        ":auto_mixed_precision",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler/clusters:single_machine",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
+        "//tensorflow/core/grappler/utils:grappler_test",
+    ],
+)
+
 cc_library(
    name = "meta_optimizer",
    srcs = ["meta_optimizer.cc"],
@ -531,6 +571,7 @@ cc_library(
    ],
    visibility = ["//visibility:public"],
    deps = [
+        ":auto_mixed_precision",
        ":arithmetic_optimizer",
        ":auto_parallel",
        ":constant_folding",
--- a/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc
+++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc
--- a/tensorflow/core/grappler/optimizers/auto_mixed_precision.h
+++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision.h
@ -0,0 +1,46 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_AUTO_MIXED_PRECISION_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_AUTO_MIXED_PRECISION_H_
+
+#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Convert data types to float16 where appropriate to improve performance on
+// GPUs.
+class AutoMixedPrecision : public GraphOptimizer {
+ public:
+  explicit AutoMixedPrecision(
+      RewriterConfig::Toggle opt_level = RewriterConfig::ON) {}
+
+  ~AutoMixedPrecision() override {}
+
+  string name() const override { return "auto_mixed_precision"; };
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* output) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimize_output, double result) override;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_AUTO_MIXED_PRECISION_H_
--- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h
+++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h
@ -0,0 +1,302 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_AUTO_MIXED_PRECISION_LISTS_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_AUTO_MIXED_PRECISION_LISTS_H_
+
+#include <set>
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/util/env_var.h"
+
+// TODO(benbarsdell): GOOGLE_CUDA doesn't seem to be working?
+//#if GOOGLE_CUDA
+// Needed for CUDA_VERSION macro.
+#include "cuda/include/cuda.h"
+//#endif  // GOOGLE_CUDA
+
+namespace tensorflow {
+namespace grappler {
+
+class AutoMixedPrecisionLists {
+ private:
+  static void UpdateList(std::set<string>* list, const string& to_add,
+                         const string& to_remove) {
+    for (auto x : str_util::Split(to_add, ",")) {
+      list->insert(x);
+    }
+    for (auto x : str_util::Split(to_remove, ",")) {
+      list->erase(x);
+    }
+  }
+
+  static bool IsPseudoFastMath() {
+    string optimization_level;
+    ReadStringFromEnvVar("TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_LEVEL", "",
+                         &optimization_level);
+    optimization_level = str_util::Uppercase(optimization_level);
+    return optimization_level == "TENSOR_CORES_ONLY";
+  }
+
+ public:
+  // Returns the set of ops that are considered numerically-safe (for execution
+  // in fp16) and performance-critical. These ops are always converted to fp16.
+  static std::set<string> WhiteList() {
+    string to_add, to_remove;
+    ReadStringFromEnvVar("TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_WHITELIST_ADD",
+                         "", &to_add);
+    ReadStringFromEnvVar(
+        "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_WHITELIST_REMOVE", "",
+        &to_remove);
+
+    auto list = std::set<string> {
+#if CUDA_VERSION >= 9010 // Fp16 BatchMatMul is slow before CUDA 9.1.
+        "BatchMatMul",
+#endif
+        "Conv2D",
+        "Conv2DBackpropFilter",
+        "Conv2DBackpropInput",
+        // TODO(benbarsdell): Enable these when Tensor Core kernels are
+        // available for 3D convolutions.
+        // "Conv3D",
+        // "Conv3DBackpropFilter",
+        // "Conv3DBackpropFilterV2",
+        // "Conv3DBackpropInput",
+        // "Conv3DBackpropInputV2",
+        "CudnnRNN",
+        "CudnnRNNBackprop",
+        "CudnnRNNBackpropV2",
+        "CudnnRNNBackpropV3",
+        "CudnnRNNV2",
+        "CudnnRNNV3",
+        // TODO(benbarsdell): Enable these when fast and safe fp16 kernels are
+        // available for depthwise convolutions.
+        // "DepthwiseConv2dNative",
+        // "DepthwiseConv2dNativeBackpropFilter",
+        // "DepthwiseConv2dNativeBackpropInput",
+        "MatMul",
+    };
+    UpdateList(&list, to_add, to_remove);
+    return list;
+  }
+
+  // Returns the set of ops that are considered numerically-safe (for execution
+  // in fp16), but which may be made unsafe by an upstream blacklist op.
+  static std::set<string> GrayList() {
+    if (IsPseudoFastMath()) {
+      return std::set<string>{};
+    }
+    string to_add, to_remove;
+    ReadStringFromEnvVar("TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_GRAYLIST_ADD",
+                         "", &to_add);
+    ReadStringFromEnvVar(
+        "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_GRAYLIST_REMOVE", "",
+        &to_remove);
+
+    auto list = std::set<string>{
+        "Add",
+        "AddN",
+        "AddV2",
+        "AvgPool",
+        "AvgPool3D",
+        "AvgPool3DGrad",
+        "AvgPoolGrad",
+        "BiasAdd",
+        "BiasAddGrad",
+        "BiasAddV1",
+        "Elu",
+        "EluGrad",
+        "Erf",
+        "Erfc",
+        "FloorDiv",
+        "FusedBatchNormV2",
+        "FusedBatchNormGradV2",
+        "Inv",
+        "LeakyRelu",
+        "LeakyReluGrad",
+        "Mul",
+        "Prod",
+        "RealDiv",
+        "Reciprocal",
+        "Sigmoid",
+        "SigmoidGrad",
+        "Softplus",
+        "SoftplusGrad",
+        "Sqrt",
+        "Sub",
+        "Sum",
+        "Tanh",
+        "TanhGrad",
+    };
+    UpdateList(&list, to_add, to_remove);
+    return list;
+  }
+
+  // Returns the set of ops that are considered numerically-dangerous (i.e.,
+  // unsafe for execution in fp16) and whose effects may also be observed in
+  // downstream nodes (e.g., in Exp -> Add, the Add is unsafe due to the Exp).
+  static std::set<string> BlackList() {
+    if (IsPseudoFastMath()) {
+      return std::set<string>{};
+    }
+    string to_add, to_remove;
+    ReadStringFromEnvVar("TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_BLACKLIST_ADD",
+                         "", &to_add);
+    ReadStringFromEnvVar(
+        "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_BLACKLIST_REMOVE", "",
+        &to_remove);
+
+    auto list = std::set<string>{
+        "Exp",
+        "Expm1",
+        "L2Loss",
+        "Log",
+        "Log1p",
+        "LogSoftmax",
+        "Mean",
+        "Pow",
+        "SaveV2",
+        "Softmax",
+        "SoftmaxCrossEntropyWithLogits",
+        "SparseSoftmaxCrossEntropyWithLogits",
+    };
+    UpdateList(&list, to_add, to_remove);
+    return list;
+  }
+
+  // Returns the set of ops that do not have numerically-significant effects
+  // (i.e., they are always considered safe for execution in fp16 precision).
+  static std::set<string> ClearList() {
+    if (IsPseudoFastMath()) {
+      return std::set<string>{};
+    }
+    string to_add, to_remove;
+    ReadStringFromEnvVar(
+        "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_CLEARLIST_ADD", "",
+        &to_add);
+    ReadStringFromEnvVar(
+        "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_CLEARLIST_REMOVE", "",
+        &to_remove);
+
+    auto list = std::set<string> {
+        "Abs",
+        "ArgMax",
+        "ArgMin",
+        "BatchToSpace",
+        "BatchToSpaceND",
+        "BroadcastTo",
+        "Ceil",
+        "CheckNumerics",
+        "ClipByValue",
+        "Concat",
+        "ConcatV2",
+        "DepthToSpace",
+        "DynamicPartition",
+        "DynamicStitch",
+        "Enter",
+        "EnsureShape",
+        "Equal",
+        "Exit",
+        "ExpandDims",
+        "Fill",
+        "Floor",
+        "Gather",
+        "GatherNd",
+        "GatherV2",
+        "Greater",
+        "GreaterEqual",
+        "Identity",
+        "IdentityN",
+        "IsFinite",
+        "IsInf",
+        "IsNan",
+        "Less",
+        "LessEqual",
+        "Max",
+        "MaxPool",
+        "MaxPool3D",
+        "MaxPool3DGrad",
+        "MaxPool3DGradGrad",
+        "MaxPoolGrad",
+        "MaxPoolGradGrad",
+        "MaxPoolGradGradV2",
+        "MaxPoolGradV2",
+        "MaxPoolV2",
+        "Maximum",
+        "Merge",
+        "Min",
+        "Minimum",
+        "MirrorPad",
+        "MirrorPadGrad",
+        "Neg",
+        "NextIteration",
+        "NotEqual",
+        "OnesLike",
+        "Pack",
+        "Pad",
+        "PadV2",
+        "PreventGradient",
+        "Rank",
+        "Relu",
+        "Relu6",
+        "Relu6Grad",
+        "ReluGrad",
+        "Reshape",
+        "ResizeNearestNeighbor",
+        "ResizeNearestNeighborGrad",
+        "Reverse",
+        "ReverseSequence",
+        "ReverseV2",
+        "Round",
+        "Select",
+        "Shape",
+        "ShapeN",
+        "Sign",
+        "Size",
+        "Slice",
+        "Snapshot",
+        "SpaceToBatch",
+        "SpaceToBatchND",
+        "SpaceToDepth",
+        "Split",
+        "SplitV",
+        "Squeeze",
+        "StackPopV2",
+        "StackPushV2",
+        "StopGradient",
+        "StridedSlice",
+        "StridedSliceGrad",
+        "Switch",
+        "TensorArrayConcatV3",
+        "TensorArrayGatherV3",
+        "TensorArrayReadV3",
+        "TensorArrayScatterV3",
+        "TensorArraySplitV3",
+        "TensorArrayWriteV3",
+        "Tile",
+        "TopK",
+        "TopKV2",
+        "Transpose",
+        "Where",
+        "ZerosLike",
+    };
+    UpdateList(&list, to_add, to_remove);
+    return list;
+  }
+};
+
+}
+}
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_AUTO_MIXED_PRECISION_LISTS_H_
--- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc
+++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc
@ -0,0 +1,479 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/grappler/clusters/single_machine.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/grappler/devices.h"
+#include "tensorflow/core/grappler/graph_view.h"
+#include "tensorflow/core/grappler/optimizers/auto_mixed_precision.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+const std::pair<int, int> kMinGPUArch = {7, 0};
+
+class AutoMixedPrecisionTest : public GrapplerTest {
+ protected:
+  void SetUp() override {
+    int num_gpus = GetNumAvailableGPUs();
+    // If GPUs are available, require that they all satisfy the min arch.
+    gpu_available_ =
+        num_gpus > 0 && num_gpus == GetNumAvailableGPUs(kMinGPUArch);
+
+    if (gpu_available_) {
+      virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 1));
+    } else {
+      DeviceProperties device_properties;
+      device_properties.set_type("GPU");
+      device_properties.mutable_environment()->insert({"architecture", "7"});
+      virtual_cluster_.reset(
+          new VirtualCluster({{"/GPU:1", device_properties}}));
+    }
+    TF_CHECK_OK(virtual_cluster_->Provision());
+  }
+
+  void TearDown() override { TF_CHECK_OK(virtual_cluster_->Shutdown()); }
+
+  NodeDef* AddSimpleNode(const string& name, const string& op,
+                         const std::vector<string>& inputs,
+                         GraphDef* graph) const {
+    std::vector<std::pair<string, AttrValue>> attributes;
+    if (op == "AddN" || op == "ShapeN") {
+      AttrValue num_inputs;
+      num_inputs.set_i(inputs.size());
+      attributes.emplace_back("N", num_inputs);
+    }
+    if (op == "ShapeN") {
+      AttrValue out_type;
+      out_type.set_type(DT_INT32);
+      attributes.emplace_back("out_type", out_type);
+    }
+    AttrValue type;
+    type.set_type(DT_FLOAT);
+    if (op == "Const" || op == "Placeholder" || op == "VariableV2" ||
+        op == "VarHandleOp" || op == "ReadVariableOp") {
+      attributes.emplace_back("dtype", type);
+    } else if (op == "SparseMatMul") {
+      attributes.emplace_back("Ta", type);
+      attributes.emplace_back("Tb", type);
+    } else if (op == "IdentityN") {
+      AttrValue type_list;
+      for (int i = 0; i < (int)inputs.size(); ++i) {
+        type_list.mutable_list()->add_type(DT_FLOAT);
+      }
+      attributes.emplace_back("T", type_list);
+    } else if (op == "StackV2" || op == "StackPopV2") {
+      attributes.emplace_back("elem_type", type);
+    } else if (op == "Cast") {
+      attributes.emplace_back("SrcT", type);
+      attributes.emplace_back("DstT", type);
+    } else {
+      attributes.emplace_back("T", type);
+    }
+    return AddNode(name, op, inputs, attributes, graph);
+  }
+
+  std::unique_ptr<Cluster> virtual_cluster_;
+  bool gpu_available_;
+};
+
+void VerifyGraphsEquivalent(const GraphDef& original_graph,
+                            const GraphDef& optimized_graph,
+                            const string& func) {
+  EXPECT_EQ(original_graph.node_size(), optimized_graph.node_size()) << func;
+  GraphView optimized_view(&optimized_graph);
+  for (int i = 0; i < original_graph.node_size(); ++i) {
+    const NodeDef& original = original_graph.node(i);
+    const NodeDef& optimized = *optimized_view.GetNode(original.name());
+    EXPECT_EQ(original.name(), optimized.name()) << func;
+    EXPECT_EQ(original.op(), optimized.op()) << func;
+    EXPECT_EQ(original.input_size(), optimized.input_size()) << func;
+    if (original.input_size() == optimized.input_size()) {
+      for (int j = 0; j < original.input_size(); ++j) {
+        EXPECT_EQ(original.input(j), optimized.input(j)) << func;
+      }
+    }
+  }
+}
+
+TEST_F(AutoMixedPrecisionTest, NoOp) {
+  GraphDef graph;
+  AddSimpleNode("In", "Placeholder", {}, &graph);
+  AddSimpleNode("B1", "Exp", {"In"}, &graph);
+  AddSimpleNode("C1", "Relu", {"B1"}, &graph);
+  AddSimpleNode("G1", "Sqrt", {"C1"}, &graph);
+  AddSimpleNode("C2", "Relu", {"G1"}, &graph);
+
+  GrapplerItem item;
+  item.graph = graph;
+  AutoMixedPrecision optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
+
+  VLOG(1) << output.DebugString();
+
+  VerifyGraphsEquivalent(item.graph, output, __FUNCTION__);
+
+  GraphView output_view(&output);
+  EXPECT_EQ(output_view.GetNode("In")->attr().at("dtype").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("B1")->attr().at("T").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("C1")->attr().at("T").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("G1")->attr().at("T").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("C2")->attr().at("T").type(), DT_FLOAT);
+}
+
+TEST_F(AutoMixedPrecisionTest, AlreadyFp16) {
+  GraphDef graph;
+  AddSimpleNode("In", "Placeholder", {}, &graph);
+  NodeDef* cast1 = AddSimpleNode("Cast1", "Cast", {"In"}, &graph);
+  cast1->mutable_attr()->at("DstT").set_type(DT_HALF);
+  NodeDef* w1 = AddSimpleNode("W1", "MatMul", {"Cast1", "Cast1"}, &graph);
+  w1->mutable_attr()->at("T").set_type(DT_HALF);
+  NodeDef* c1 = AddSimpleNode("C1", "Relu", {"W1"}, &graph);
+  c1->mutable_attr()->at("T").set_type(DT_HALF);
+  NodeDef* cast2 = AddSimpleNode("Cast2", "Cast", {"C1"}, &graph);
+  cast2->mutable_attr()->at("SrcT").set_type(DT_HALF);
+  AddSimpleNode("C2", "Relu", {"Cast2"}, &graph);
+
+  GrapplerItem item;
+  item.graph = graph;
+  AutoMixedPrecision optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
+
+  VLOG(1) << output.DebugString();
+
+  VerifyGraphsEquivalent(item.graph, output, __FUNCTION__);
+
+  GraphView output_view(&output);
+  EXPECT_EQ(output_view.GetNode("In")->attr().at("dtype").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("Cast1")->attr().at("DstT").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("W1")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("C1")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("Cast2")->attr().at("SrcT").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("Cast2")->attr().at("DstT").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("C2")->attr().at("T").type(), DT_FLOAT);
+}
+
+TEST_F(AutoMixedPrecisionTest, Simple) {
+  GraphDef graph;
+  AddSimpleNode("In", "Placeholder", {}, &graph);
+  AddSimpleNode("B1", "Exp", {"In"}, &graph);
+  AddSimpleNode("C1", "Relu", {"B1"}, &graph);
+  AddSimpleNode("G1", "Sqrt", {"C1"}, &graph);
+  AddSimpleNode("C2", "Relu", {"G1"}, &graph);
+  AddSimpleNode("W1", "MatMul", {"C2", "C2"}, &graph);
+  AddSimpleNode("C3", "Relu", {"W1"}, &graph);
+  AddSimpleNode("B2", "Exp", {"C3"}, &graph);
+  AddSimpleNode("C4", "Relu", {"B2"}, &graph);
+  AddSimpleNode("B4", "SparseMatMul", {"C4", "C4"}, &graph);
+  AddSimpleNode("C5", "Relu", {"B4"}, &graph);
+
+  GrapplerItem item;
+  item.graph = graph;
+  AutoMixedPrecision optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
+
+  VLOG(1) << output.DebugString();
+
+  GraphView output_view(&output);
+  EXPECT_EQ(output.node_size(), graph.node_size() + 2);
+  EXPECT_EQ(output_view.GetNode("In")->attr().at("dtype").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("B1")->attr().at("T").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("C1")->attr().at("T").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("G1")->attr().at("T").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("C2")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("W1")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("C3")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("B2")->attr().at("T").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("C4")->attr().at("T").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("B4")->attr().at("Ta").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("B4")->attr().at("Tb").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("C5")->attr().at("T").type(), DT_FLOAT);
+}
+
+TEST_F(AutoMixedPrecisionTest, BidirectionalClearChain) {
+  GraphDef graph;
+  AddSimpleNode("In", "Placeholder", {}, &graph);
+  AddSimpleNode("C1", "Relu", {"In"}, &graph);
+  AddSimpleNode("C2", "Relu", {"In"}, &graph);
+  AddSimpleNode("W1", "MatMul", {"C1", "C1"}, &graph);
+  AddSimpleNode("C3", "ShapeN", {"C1", "C2"}, &graph);
+  AddSimpleNode("C4", "Relu", {"C2"}, &graph);
+
+  GrapplerItem item;
+  item.graph = graph;
+  AutoMixedPrecision optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
+
+  VLOG(1) << output.DebugString();
+
+  GraphView output_view(&output);
+  EXPECT_EQ(output.node_size(), graph.node_size() + 1);
+  EXPECT_EQ(output_view.GetNode("In")->attr().at("dtype").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("C1")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("C2")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("W1")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("C3")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("C4")->attr().at("T").type(), DT_HALF);
+};
+
+TEST_F(AutoMixedPrecisionTest, PreserveFetches) {
+  GraphDef graph;
+  AddSimpleNode("In", "Placeholder", {}, &graph);
+  AddSimpleNode("Const1", "Const", {}, &graph);
+  AddSimpleNode("W1", "MatMul", {"In", "Const1"}, &graph);
+  AddSimpleNode("C1", "Relu", {"W1"}, &graph);
+  AddSimpleNode("G1", "Sqrt", {"C1"}, &graph);
+  AddSimpleNode("B1", "Exp", {"G1"}, &graph);
+  AddSimpleNode("C2", "Relu", {"B1"}, &graph);
+  AddSimpleNode("W2", "MatMul", {"C2", "C2"}, &graph);
+  AddSimpleNode("C3", "Relu", {"W2"}, &graph);
+  AddSimpleNode("B2", "Exp", {"C3"}, &graph);
+  AddSimpleNode("C4", "Relu", {"B2"}, &graph);
+
+  GrapplerItem item;
+  item.graph = graph;
+  item.fetch.push_back("W1");
+  item.fetch.push_back("C2");
+  item.fetch.push_back("C3");
+  AutoMixedPrecision optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
+
+  VLOG(1) << output.DebugString();
+
+  GraphView output_view(&output);
+  EXPECT_EQ(output.node_size(), graph.node_size() + 2);
+  EXPECT_EQ(output_view.GetNode("In")->attr().at("dtype").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("Const1")->attr().at("dtype").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("W1")->attr().at("T").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("C1")->attr().at("T").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("G1")->attr().at("T").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("B1")->attr().at("T").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("C2")->attr().at("T").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("W2")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("C3")->attr().at("T").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("B2")->attr().at("T").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("C4")->attr().at("T").type(), DT_FLOAT);
+}
+
+TEST_F(AutoMixedPrecisionTest, PreserveCPUNodes) {
+  GraphDef graph;
+  AddSimpleNode("In", "Placeholder", {}, &graph);
+  AddSimpleNode("C1", "Relu", {"In"}, &graph);
+  AddSimpleNode("W1", "MatMul", {"C1", "C1"}, &graph);
+  AddSimpleNode("G1", "Tanh", {"W1"}, &graph);
+  NodeDef* w2 = AddSimpleNode("W2", "MatMul", {"G1", "G1"}, &graph);
+  w2->set_device("/job:localhost/replica:0/task:0/device:CPU:0");
+  AddSimpleNode("C2", "Relu", {"W2"}, &graph);
+
+  GrapplerItem item;
+  item.graph = graph;
+  AutoMixedPrecision optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
+
+  VLOG(1) << output.DebugString();
+
+  GraphView output_view(&output);
+  EXPECT_EQ(output.node_size(), graph.node_size() + 2);
+  EXPECT_EQ(output_view.GetNode("In")->attr().at("dtype").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("C1")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("W1")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("G1")->attr().at("T").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("W2")->attr().at("T").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("C2")->attr().at("T").type(), DT_FLOAT);
+}
+
+TEST_F(AutoMixedPrecisionTest, PreserveIdentityAfterVariable) {
+  GraphDef graph;
+  AddSimpleNode("In", "Placeholder", {}, &graph);
+  AddSimpleNode("V1", "VariableV2", {}, &graph);
+  AddSimpleNode("C1", "Identity", {"V1"}, &graph);
+  AddSimpleNode("W1", "MatMul", {"In", "C1"}, &graph);
+  AddSimpleNode("VarHandle1", "VarHandleOp", {}, &graph);
+  AddSimpleNode("V2", "ReadVariableOp", {"VarHandle1"}, &graph);
+  AddSimpleNode("W2", "MatMul", {"In", "V2"}, &graph);
+  AddSimpleNode("Const1", "Const", {}, &graph);
+  AddSimpleNode("C2", "Identity", {"Const1"}, &graph);
+  AddSimpleNode("W3", "MatMul", {"In", "C2"}, &graph);
+
+  GrapplerItem item;
+  item.graph = graph;
+  AutoMixedPrecision optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
+
+  VLOG(1) << output.DebugString();
+
+  GraphView output_view(&output);
+  EXPECT_EQ(output.node_size(), graph.node_size() + 4);
+  EXPECT_EQ(output_view.GetNode("In")->attr().at("dtype").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("V1")->attr().at("dtype").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("C1")->attr().at("T").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("W1")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("VarHandle1")->attr().at("dtype").type(),
+            DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("V2")->attr().at("dtype").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("W2")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("Const1")->attr().at("dtype").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("C2")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("W3")->attr().at("T").type(), DT_HALF);
+}
+
+TEST_F(AutoMixedPrecisionTest, FusedBatchNorm) {
+  GraphDef graph;
+  AddSimpleNode("X", "Placeholder", {}, &graph);
+  AddSimpleNode("Const1", "Const", {}, &graph);
+  AddSimpleNode("Scale", "Placeholder", {}, &graph);
+  AddSimpleNode("Offset", "Placeholder", {}, &graph);
+  AddSimpleNode("Mean", "Placeholder", {}, &graph);
+  AddSimpleNode("Variance", "Placeholder", {}, &graph);
+  AddSimpleNode("W1", "Conv2D", {"X", "Const1"}, &graph);
+  AddSimpleNode("BN1", "FusedBatchNorm",
+                {"W1", "Scale", "Offset", "Mean", "Variance"}, &graph);
+  AddSimpleNode("BNG1", "FusedBatchNormGrad",
+                {"BN1", "W1", "Scale", "Mean", "Variance"}, &graph);
+  AddSimpleNode("G1", "Add", {"BN1", "BNG1"}, &graph);
+  AddSimpleNode("W2", "Conv2D", {"G1", "Const1"}, &graph);
+
+  GrapplerItem item;
+  item.graph = graph;
+  AutoMixedPrecision optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
+
+  VLOG(1) << output.DebugString();
+
+  GraphView output_view(&output);
+  EXPECT_EQ(output.node_size(), graph.node_size() + 2);
+  EXPECT_EQ(output_view.GetNode("W1")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("BN1")->op(), "FusedBatchNormV2");
+  EXPECT_EQ(output_view.GetNode("BN1")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("BN1")->attr().at("U").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("BNG1")->op(), "FusedBatchNormGradV2");
+  EXPECT_EQ(output_view.GetNode("BNG1")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("BNG1")->attr().at("U").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("G1")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("W2")->attr().at("T").type(), DT_HALF);
+}
+
+TEST_F(AutoMixedPrecisionTest, RepeatedAndListTypeAttrs) {
+  GraphDef graph;
+  AddSimpleNode("In", "Placeholder", {}, &graph);
+  AddSimpleNode("W1", "MatMul", {"In", "In"}, &graph);
+  AddSimpleNode("ID", "IdentityN", {"W1", "W1", "W1"}, &graph);
+  AddSimpleNode("G1", "AddN", {"ID:0", "ID:1", "ID:2"}, &graph);
+  AddSimpleNode("W2", "MatMul", {"G1", "G1"}, &graph);
+
+  GrapplerItem item;
+  item.graph = graph;
+  AutoMixedPrecision optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
+
+  VLOG(1) << output.DebugString();
+
+  GraphView output_view(&output);
+  EXPECT_EQ(output.node_size(), graph.node_size() + 1);
+  EXPECT_EQ(output_view.GetNode("In")->attr().at("dtype").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("W1")->attr().at("T").type(), DT_HALF);
+  for (auto type : output_view.GetNode("ID")->attr().at("T").list().type()) {
+    EXPECT_EQ(type, DT_HALF);
+  }
+  EXPECT_EQ(output_view.GetNode("G1")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("W2")->attr().at("T").type(), DT_HALF);
+}
+
+TEST_F(AutoMixedPrecisionTest, ExistingCast) {
+  GraphDef graph;
+  NodeDef* ph = AddSimpleNode("In", "Placeholder", {}, &graph);
+  ph->mutable_attr()->at("dtype").set_type(DT_BOOL);
+  NodeDef* cast = AddSimpleNode("Cast1", "Cast", {"In"}, &graph);
+  cast->mutable_attr()->at("SrcT").set_type(DT_BOOL);
+  AddSimpleNode("W1", "MatMul", {"Cast1", "Cast1"}, &graph);
+
+  GrapplerItem item;
+  item.graph = graph;
+  AutoMixedPrecision optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
+
+  VLOG(1) << output.DebugString();
+
+  GraphView output_view(&output);
+  EXPECT_EQ(output.node_size(), graph.node_size());
+  EXPECT_EQ(output_view.GetNode("Cast1")->attr().at("DstT").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("W1")->attr().at("T").type(), DT_HALF);
+}
+
+TEST_F(AutoMixedPrecisionTest, StackV2) {
+  GraphDef graph;
+  AddSimpleNode("Handle1", "Const", {}, &graph);
+  AddSimpleNode("Stack1", "StackV2", {"Handle1"}, &graph);
+  AddSimpleNode("In", "Placeholder", {}, &graph);
+  AddSimpleNode("Push1", "StackPushV2", {"Stack1", "In"}, &graph);
+  AddSimpleNode("W1", "MatMul", {"In", "In"}, &graph);
+  AddSimpleNode("Push2", "StackPushV2", {"Stack1", "W1"}, &graph);
+  AddSimpleNode("Pop1", "StackPopV2", {"Stack1"}, &graph);
+  AddSimpleNode("G1", "Tanh", {"Pop1"}, &graph);
+  AddSimpleNode("W2", "MatMul", {"G1", "G1"}, &graph);
+  AddSimpleNode("Push3", "StackPushV2", {"Stack1", "W2"}, &graph);
+  AddSimpleNode("Handle2", "Const", {}, &graph);
+  AddSimpleNode("Stack2", "StackV2", {"Handle2"}, &graph);
+  AddSimpleNode("Push1-2", "StackPushV2", {"Stack2", "In"}, &graph);
+  AddSimpleNode("Pop1-2", "StackPopV2", {"Stack2"}, &graph);
+
+  GrapplerItem item;
+  item.graph = graph;
+  AutoMixedPrecision optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
+
+  VLOG(1) << output.DebugString();
+
+  GraphView output_view(&output);
+  EXPECT_EQ(output.node_size(), graph.node_size() + 1);
+  EXPECT_EQ(output_view.GetNode("Stack1")->attr().at("elem_type").type(),
+            DT_HALF);
+  EXPECT_EQ(output_view.GetNode("Push1")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("W1")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("Push2")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("Pop1")->attr().at("elem_type").type(),
+            DT_HALF);
+  EXPECT_EQ(output_view.GetNode("G1")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("W2")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("Push3")->attr().at("T").type(), DT_HALF);
+  EXPECT_EQ(output_view.GetNode("Stack2")->attr().at("elem_type").type(),
+            DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("Push1-2")->attr().at("T").type(), DT_FLOAT);
+  EXPECT_EQ(output_view.GetNode("Pop1-2")->attr().at("elem_type").type(),
+            DT_FLOAT);
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/auto_mixed_precision.h"
 #include "tensorflow/core/grappler/optimizers/auto_parallel.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
@ -44,6 +45,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/util/dump_graph.h"
+#include "tensorflow/core/util/env_var.h"
 #include "tensorflow/core/util/ptr_util.h"

 namespace tensorflow {
@ -78,7 +80,7 @@ int NumIterations(const RewriterConfig& cfg) {
 // Check if optimizer is allowed to run only once.
 bool IsRunOnceOptimizer(const string& name) {
  return name == "layout" || name == "memory_optimizer" ||
-         name == "loop_optimizer";
+         name == "loop_optimizer" || name == "auto_mixed_precision";
 }

 // Check if the graphdef contains nodes that indicate TPU execution.
@ -115,6 +117,25 @@ Status CompressConstants(GraphDef* graph) {
  return Status::OK();
 }

+// A helper function to decide whether to enable the automatic mixed precision
+// optimizer.
+bool AutoMixedPrecisionEnabled(RewriterConfig::Toggle opt_level) {
+  if (opt_level == RewriterConfig::ON ||
+      opt_level == RewriterConfig::AGGRESSIVE) {
+    return true;
+  }
+  if (opt_level == RewriterConfig::OFF) return false;
+  // Default is to check env var, otherwise off.
+  static bool is_enabled = [] {
+    bool ret = false;
+    TF_CHECK_OK(
+        ReadBoolFromEnvVar("TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE",
+                           /*default_val=*/false, &ret));
+    return ret;
+  }();
+  return is_enabled;
+}
+
 }  // namespace

 #define MK_OPT(NAME, VALUE) \
@ -128,6 +149,8 @@ std::unique_ptr<GraphOptimizer> MetaOptimizer::MakeNewOptimizer(
  MK_OPT("shape", new ShapeOptimizer());
  MK_OPT("remap", new Remapper(cfg_.remapping()));
  MK_OPT("layout", new LayoutOptimizer());
+  MK_OPT("auto_mixed_precision",
+         new AutoMixedPrecision(cfg_.auto_mixed_precision()));
  MK_OPT("memory", new MemoryOptimizer(RewriterConfig::MANUAL));
  MK_OPT("arithmetic", new ArithmeticOptimizer(cfg_.arithmetic_optimization()));
  MK_OPT("autoparallel", new AutoParallel(cfg_.auto_parallel().num_replicas()));
@ -161,6 +184,10 @@ Status MetaOptimizer::InitializeOptimizers(
  if (!cfg_.disable_model_pruning()) {
    optimizers->push_back(MakeUnique<ModelPruner>());
  }
+  if (AutoMixedPrecisionEnabled(cfg_.auto_mixed_precision())) {
+    optimizers->push_back(
+        MakeUnique<AutoMixedPrecision>(cfg_.auto_mixed_precision()));
+  }
  if (cfg_.implementation_selector() != RewriterConfig::OFF) {
    optimizers->push_back(MakeUnique<ImplementationSelector>());
  }
@ -694,6 +721,7 @@ bool MetaOptimizerEnabled(const ConfigProto& cfg) {
         rewrite_cfg.debug_stripper() == RewriterConfig::ON ||
         rewrite_cfg.scoped_allocator_optimization() == RewriterConfig::ON ||
         rewrite_cfg.pin_to_host_optimization() == RewriterConfig::ON ||
+         AutoMixedPrecisionEnabled(rewrite_cfg.auto_mixed_precision()) ||
         !rewrite_cfg.optimizers().empty() ||
         !rewrite_cfg.custom_optimizers().empty();
 }
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@ -81,6 +81,11 @@ message RewriterConfig {
  // Enable the swap of kernel implementations based on the device placement
  // (default is ON).
  Toggle implementation_selector = 22;
+  // Optimize data types (default is OFF).
+  // e.g., This will try to use float16 on GPU which is faster.
+  // Note that this can change the numerical stability of the graph and may
+  // require the use of loss scaling to maintain model convergence.
+  Toggle auto_mixed_precision = 23;
  // Disable the entire meta optimizer (off by default).
  bool disable_meta_optimizer = 19;

--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@ -6096,6 +6096,32 @@ tf_py_test(
    ],
 )

+cuda_py_test(
+    name = "auto_mixed_precision_test",
+    size = "small",
+    srcs = [
+        "grappler/auto_mixed_precision_test.py",
+    ],
+    additional_deps = [
+        ":client_testlib",
+        ":framework_for_generated_wrappers",
+        ":array_ops",
+        ":constant_op",
+        ":dtypes",
+        ":math_ops",
+        ":nn",
+        ":ops",
+        ":random_ops",
+        ":control_flow_ops",
+        ":training",
+        "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+    ],
+    tags = [
+        "grappler",
+    ],
+)
+
 tf_gen_op_wrapper_private_py(
    name = "nccl_ops_gen",
    visibility = ["//tensorflow:internal"],
--- a/tensorflow/python/grappler/auto_mixed_precision_test.py
+++ b/tensorflow/python/grappler/auto_mixed_precision_test.py
@ -0,0 +1,533 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Grappler AutoMixedPrecision."""
+
+import numpy as np
+
+from tensorflow.core.framework import types_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2 as rwcpb2
+from tensorflow.core.protobuf import config_pb2 as cpb2
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_impl
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
+
+
+def _input(shape):
+  """Generates an input of a given shape."""
+  return variables.Variable(random_ops.truncated_normal(shape, seed=0))
+
+
+def _weight(shape):
+  """Generates a weight of a given shape."""
+  # Note that the lambda is needed to allow construction inside loops.
+  return variables.Variable(
+      lambda: init_ops.glorot_uniform_initializer(seed=0)(shape))
+
+
+def _bias(shape):
+  """Generates a bias of a given shape."""
+  return constant_op.constant(0.1, shape=shape)
+
+
+def _conv2d(x, w):
+  """Returns a 2d convolution layer with full stride."""
+  return nn.conv2d(x, w, strides=[1, 1, 1, 1], padding='SAME')
+
+
+def _max_pool_2x2(x):
+  """Downsamples a feature map by 2X."""
+  return nn.max_pool(
+      x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
+
+
+def _fused_batchnorm(x, scale, offset):
+  """Batchnorm."""
+  return nn_impl.fused_batch_norm(
+        x, scale=scale, offset=offset, is_training=True)
+
+
+def _conv_bn(x):
+  """Conv followed by batchnorm."""
+  i = array_ops.reshape(x, [-1, 8, 8, 1])
+  f = _weight([3, 3, 1, 6])
+  x = _conv2d(i, f)
+  s = _weight([6])
+  o = _weight([6])
+  y, m, v = _fused_batchnorm(x, s, o)
+  y = array_ops.identity(y)
+  return y
+
+
+def _matmul_act(x):
+  """Matmul followed by activation."""
+  i = array_ops.reshape(x, [8, 8])
+  f = _weight([8, 8])
+  x = math_ops.matmul(i, f)
+  y = nn.relu(x)
+  return y
+
+
+def _conv_pool(x):
+  """Conv followed by pooling."""
+  x_image = array_ops.reshape(x, [-1, 8, 8, 1])
+  w_conv1 = _weight([3, 3, 1, 6])
+  b_conv1 = _bias([6])
+  h_conv1 = nn.relu(nn.bias_add(_conv2d(x_image, w_conv1), b_conv1))
+  h_pool1 = _max_pool_2x2(h_conv1)
+  w_conv2 = _weight([3, 3, 6, 4])
+  b_conv2 = _bias([4])
+  h_conv2 = nn.relu(nn.bias_add(_conv2d(h_pool1, w_conv2), b_conv2))
+  h_pool2 = _max_pool_2x2(h_conv2)
+  return h_pool2
+
+
+def _simple_loop(x, functor):
+  """Simple loop whose body is provided by the functor."""
+  init = (constant_op.constant(0), x)
+  c = lambda i, j: i < 4
+  b = lambda i, j: (i+1, functor(j))
+  ij = control_flow_ops.while_loop(c, b, init)
+  return ij
+
+
+
+def _loop_vars_intertwined(x0, y0, functor_x, functor_y):
+  """Loop whose loop variables are intertwined."""
+  c = lambda i, j, x, y: j < 4
+  b = lambda i, j, x, y: (j+1, i+1, functor_y(y), functor_x(x))
+  init = (constant_op.constant(0), constant_op.constant(0), x0, y0)
+  ijzw = control_flow_ops.while_loop(c, b, init)
+  return ijzw
+
+
+def _lstm_cell(prev_c, prev_h, x):
+  """ LSTMCell
+  i: input gate
+  f: forget gate
+  o: output gate
+  c: cell state
+  x: input
+  h: embedding
+  """
+  bias = _bias([4])
+  w = _weight([8, 16])
+  ifoc = math_ops.matmul(array_ops.concat([x, prev_h], axis=1), w)
+  i, f, o, c = array_ops.split(ifoc, 4, axis=1)
+  i = math_ops.sigmoid(nn.bias_add(i, bias))
+  f = math_ops.sigmoid(nn.bias_add(f, bias))
+  o = math_ops.sigmoid(nn.bias_add(o, bias))
+  c = math_ops.tanh(nn.bias_add(c, bias))
+  next_c = f * prev_c + i * c
+  next_h = o * math_ops.tanh(next_c)
+  return next_c, next_h
+
+
+def _recurrent_lstm(c, h):
+  """ Dynamic single-layer LSTM with TensorArray """
+  def cond(i, c, h, ta_x):
+    return i<4
+
+  def body(i, c, h, ta_x):
+    x = ta_x.read(i)
+    next_c, next_h = _lstm_cell(c, h, x)
+    return (i+1, next_c, next_h, ta_x)
+
+  ta_x = tensor_array_ops.TensorArray(
+           dtype=dtypes.float32,
+           size=4)
+  for i in range(0, 4):
+    ta_x = ta_x.write(
+               i, constant_op.constant(0.1, shape=[8, 4],
+                                       dtype=dtypes.float32))
+  init = (constant_op.constant(0), c, h, ta_x)
+  r = control_flow_ops.while_loop(cond, body, init)
+  return r
+
+
+def _make_node_with_color(color, input_tensor, name=None):
+  color = color.lower()
+  if color == 'w': # White node
+    weights = _weight(input_tensor.get_shape().as_list())
+    return math_ops.matmul(input_tensor, weights, name=name)
+  elif color == 'g': # Gray node
+    return math_ops.sqrt(input_tensor, name=name)
+  elif color == 'c': # Clear node
+    return nn.relu(input_tensor, name=name)
+  elif color == 'b': # Black node
+    return math_ops.log(input_tensor, name=name)
+  else:
+    raise ValueError("Invalid node color: " + str(color))
+
+
+def _build_intertwined_loop_graph(inpA_colors, inpB_colors, bodyA_colors,
+                                  bodyB_colors, outA_colors, outB_colors):
+  a = _input([8, 8])
+  for i, color in enumerate(inpA_colors):
+    a = _make_node_with_color(color, a, 'inputA_%i' % i)
+  b = _input([8, 8])
+  for i, color in enumerate(inpB_colors):
+    b = _make_node_with_color(color, b, 'inputB_%i' % i)
+  def bodyA(x):
+    for i, color in enumerate(bodyA_colors):
+      x = _make_node_with_color(color, x, 'bodyA_%i' % i)
+    return x
+  def bodyB(x):
+    for i, color in enumerate(bodyB_colors):
+      x = _make_node_with_color(color, x, 'bodyB_%i' % i)
+    return x
+  a, b = _loop_vars_intertwined(a, b, bodyA, bodyB)[2:]
+  for i, color in enumerate(outA_colors):
+    a = _make_node_with_color(color, a, 'outputA_%i' % i)
+  for i, color in enumerate(outB_colors):
+    b = _make_node_with_color(color, b, 'outputB_%i' % i)
+  a = array_ops.identity(a)
+  b = array_ops.identity(b)
+  return a, b
+
+
+def _get_config(auto_mixed_precision=True):
+  if auto_mixed_precision:
+    rewrite_config = rwcpb2.RewriterConfig(
+        auto_mixed_precision=rwcpb2.RewriterConfig.ON,
+        # do not remove duplicated nodes
+        arithmetic_optimization=rwcpb2.RewriterConfig.OFF)
+  else:
+    rewrite_config = rwcpb2.RewriterConfig(
+        auto_mixed_precision=rwcpb2.RewriterConfig.OFF,
+        # do not remove duplicated nodes
+        arithmetic_optimization=rwcpb2.RewriterConfig.OFF)
+  rewrite_config.min_graph_nodes = -1
+  graph_options = cpb2.GraphOptions(
+      rewrite_options=rewrite_config, build_cost_model=1)
+  config = cpb2.ConfigProto(graph_options=graph_options)
+  config.graph_options.optimizer_options.opt_level = -1
+  return config
+
+
+def _is_cast_to_fp16(node_name):
+  return node_name.endswith('-CastToFp16-AutoMixedPrecision')
+
+
+def _is_cast_to_fp32(node_name):
+  return node_name.endswith('-CastToFp32-AutoMixedPrecision')
+
+
+def _count_casts(nodes):
+  num_to_fp16 = 0
+  num_to_fp32 = 0
+  for node in nodes:
+    if _is_cast_to_fp16(node.name):
+      num_to_fp16 += 1
+    elif _is_cast_to_fp32(node.name):
+      num_to_fp32 += 1
+  return num_to_fp16, num_to_fp32
+
+
+def _build_node_map(nodes):
+  node_map = {}
+  for node in nodes:
+    node_map[node.name] = node
+  return node_map
+
+
+class AutoMixedPrecisionTest(test.TestCase):
+  """Tests the Grappler auto mixed precision optimizer."""
+  MIN_GPU_ARCH = (7, 0)
+
+  def _assert_output_fp16(self, node_map, node_name, output_port=0):
+      self.assertEqual(node_map[node_name].output_info[output_port].dtype,
+                       types_pb2.DT_HALF)
+
+  def _run(self, fetches):
+    with session.Session(config=_get_config(False)) as sess:
+      sess.run(variables.global_variables_initializer())
+      output_val_ref = self.evaluate(fetches)
+
+    with session.Session(config=_get_config()) as sess:
+      sess.run(variables.global_variables_initializer())
+      metadata = cpb2.RunMetadata()
+      output_val = sess.run(fetches, run_metadata=metadata)
+
+    return output_val_ref, output_val, metadata.cost_graph
+
+  def _run_intertwined_loop_test(self, inpA, inpB, bodyA, bodyB, outA, outB,
+                                 expected_num_to_fp16, expected_num_to_fp32):
+    """Runs a test of an intertwined loop with different node colors in
+    different sections of the graph. The arguments must be strings where each
+    character represents the color of a node in that section of the graph:
+    w = white, g = gray, c = clear, b = black. CAPITALIZED characters indicate
+    that the node is expected to be changed to DT_HALF during graph
+    optimization.
+
+    inpA -> loop [ bodyA ] -> outA
+              :            |
+               =====<<=====
+              |            :
+    inpB -> loop [ bodyB ] -> outB
+    """
+    if test.is_gpu_available(cuda_only=True,
+                             min_cuda_compute_capability=self.MIN_GPU_ARCH):
+      random_seed.set_random_seed(0)
+      expected_types = []
+      for section in [inpA, inpB, bodyA, bodyB, outA, outB]:
+        section_expected_types = []
+        for color in section:
+          if color.isupper():
+            expected_type = types_pb2.DT_HALF
+          else:
+            expected_type = types_pb2.DT_FLOAT
+          section_expected_types.append(expected_type)
+        expected_types.append(section_expected_types)
+
+      a, b = _build_intertwined_loop_graph(inpA, inpB, bodyA, bodyB, outA, outB)
+      output_val_ref, output_val, cost_graph = self._run((a, b))
+      node_map = _build_node_map(cost_graph.node)
+      num_to_fp16, num_to_fp32 = _count_casts(cost_graph.node)
+
+      section_names = ['inputA', 'inputB', 'while/bodyA', 'while/bodyB',
+                       'outputA', 'outputB']
+      all_types_correct = True
+      for section_name, expected_types in zip(section_names, expected_types):
+        for i, expected_type in enumerate(expected_types):
+          node_name = section_name + '_%i' % i
+          output_port = 0
+          optimized_type = node_map[node_name].output_info[output_port].dtype
+          if (optimized_type != expected_type):
+            print("Expected node %s to have type %s but got type %s" %
+                  (node_name, expected_type, optimized_type))
+            all_types_correct = False
+      self.assertTrue(all_types_correct)
+      self.assertEqual(num_to_fp16, expected_num_to_fp16)
+      self.assertEqual(num_to_fp32, expected_num_to_fp32)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
+
+  def testConvBN(self):
+    if test.is_gpu_available(cuda_only=True,
+                             min_cuda_compute_capability=self.MIN_GPU_ARCH):
+      random_seed.set_random_seed(0)
+      x = _input([2, 8, 8, 1])
+      x = _conv_bn(x)
+      output = _conv_bn(x)
+
+      output_val_ref, output_val, cost_graph = self._run(output)
+      node_map = _build_node_map(cost_graph.node)
+      num_to_fp16, num_to_fp32 = _count_casts(cost_graph.node)
+
+      self._assert_output_fp16(node_map, 'Conv2D')
+      self._assert_output_fp16(node_map, 'FusedBatchNorm')
+      self._assert_output_fp16(node_map, 'Conv2D_1')
+      self.assertEqual(num_to_fp16, 3) # Before Conv2D:0, Conv2D:1, Conv2D_1:1
+      self.assertEqual(num_to_fp32, 1) # After FusedBatchNorm:0
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
+
+  def testConvBNDropout(self):
+    if test.is_gpu_available(cuda_only=True,
+                             min_cuda_compute_capability=self.MIN_GPU_ARCH):
+      random_seed.set_random_seed(0)
+      x = _input([2, 8, 8, 1])
+      y = _conv_bn(x)
+      y = nn.dropout(y, rate=0.5)
+      y = _conv_bn(y)
+      y = array_ops.identity(y)
+      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
+      g = optimizer.compute_gradients(y, [x])
+      output = (y, g)
+
+      output_val_ref, output_val, cost_graph = self._run(output)
+      node_map = _build_node_map(cost_graph.node)
+      num_to_fp16, num_to_fp32 = _count_casts(cost_graph.node)
+      self._assert_output_fp16(node_map, 'Conv2D')
+      self._assert_output_fp16(node_map, 'FusedBatchNorm')
+      self._assert_output_fp16(node_map, 'dropout/mul')
+      self._assert_output_fp16(node_map, 'dropout/Cast')
+      self._assert_output_fp16(node_map, 'dropout/mul_1')
+      self._assert_output_fp16(node_map, 'Conv2D_1')
+
+      output_val_ref, output_val, cost_graph = self._run(output)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
+
+  def testConvPool(self):
+    if test.is_gpu_available(cuda_only=True,
+                             min_cuda_compute_capability=self.MIN_GPU_ARCH):
+      random_seed.set_random_seed(0)
+      x = _input([2, 8, 8, 1])
+      output = _conv_pool(x)
+
+      output_val_ref, output_val, cost_graph = self._run(output)
+      node_map = _build_node_map(cost_graph.node)
+      num_to_fp16, num_to_fp32 = _count_casts(cost_graph.node)
+
+      self._assert_output_fp16(node_map, 'Conv2D')
+      self._assert_output_fp16(node_map, 'Relu')
+      self._assert_output_fp16(node_map, 'MaxPool')
+      self._assert_output_fp16(node_map, 'Conv2D_1')
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
+
+  def testSimpleLoop(self):
+    if test.is_gpu_available(cuda_only=True,
+                             min_cuda_compute_capability=self.MIN_GPU_ARCH):
+      random_seed.set_random_seed(0)
+      x = _input([8, 8])
+      y = _simple_loop(x, _matmul_act)[1]
+      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
+      g = optimizer.compute_gradients(y, [x])
+      output = (y, g)
+
+      output_val_ref, output_val, cost_graph = self._run(output)
+      node_map = _build_node_map(cost_graph.node)
+
+      self._assert_output_fp16(node_map, 'while/MatMul')
+      self._assert_output_fp16(node_map, 'while/Relu')
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
+
+  def testLoopWithVarsIntertwined(self):
+    if test.is_gpu_available(cuda_only=True,
+                             min_cuda_compute_capability=self.MIN_GPU_ARCH):
+      random_seed.set_random_seed(0)
+      x = _input([8, 8])
+      i, j, k, l = _loop_vars_intertwined(array_ops.ones(array_ops.shape(x)),
+                                             x, _matmul_act, _matmul_act)
+      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
+      g = optimizer.compute_gradients(k, [x])
+      output = (k, l, g)
+
+      output_val_ref, output_val, cost_graph = self._run(output)
+      node_map = _build_node_map(cost_graph.node)
+
+      self._assert_output_fp16(node_map, 'while/MatMul')
+      self._assert_output_fp16(node_map, 'while/Relu')
+      self._assert_output_fp16(node_map, 'while/MatMul_1')
+      self._assert_output_fp16(node_map, 'while/Relu_1')
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
+
+  def testMultiPaths(self):
+    if test.is_gpu_available(cuda_only=True,
+                             min_cuda_compute_capability=self.MIN_GPU_ARCH):
+      random_seed.set_random_seed(0)
+      x = _input([2, 8, 8, 3])
+      x1, x2, x3 = array_ops.split(x, num_or_size_splits=3, axis=3)
+      y1 = _conv_pool(x1)
+      y2 = _conv_pool(x2)
+      y3 = _conv_pool(x3)
+      y = array_ops.concat([y1, y2, y3], axis=3)
+      y = array_ops.identity(y)
+      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
+      g = optimizer.compute_gradients(y, [x])
+      output = (y, g)
+
+      output_val_ref, output_val, cost_graph = self._run(output)
+      node_map = _build_node_map(cost_graph.node)
+
+      self._assert_output_fp16(node_map, 'split')
+      for suffix in [''] + ['_%i' % i for i in range(1, 6)]:
+        self._assert_output_fp16(node_map, 'Conv2D' + suffix)
+        self._assert_output_fp16(node_map, 'Relu' + suffix)
+        self._assert_output_fp16(node_map, 'MaxPool' + suffix)
+      self._assert_output_fp16(node_map, 'concat')
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
+
+  def testMultiPaths2(self):
+    if test.is_gpu_available(cuda_only=True,
+                             min_cuda_compute_capability=self.MIN_GPU_ARCH):
+      random_seed.set_random_seed(0)
+      x = _input([8, 8])
+      y1 = _matmul_act(x)
+      y2 = _matmul_act(x)
+      y = y1 + y2 + x
+      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
+      g = optimizer.compute_gradients(y, [x])
+      output = (g, y)
+
+      output_val_ref, output_val, cost_graph = self._run(output)
+      node_map = _build_node_map(cost_graph.node)
+
+      self._assert_output_fp16(node_map, 'MatMul')
+      self._assert_output_fp16(node_map, 'Relu')
+      self._assert_output_fp16(node_map, 'MatMul_1')
+      self._assert_output_fp16(node_map, 'Relu_1')
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
+
+  def testRecurrentLSTM(self):
+    if test.is_gpu_available(cuda_only=True,
+                             min_cuda_compute_capability=self.MIN_GPU_ARCH):
+      random_seed.set_random_seed(0)
+      init_c = _input([8, 4])
+      init_h = _input([8, 4])
+      i, c, h, ta = _recurrent_lstm(init_c, init_h)
+      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
+      g = optimizer.compute_gradients(h, [init_c, init_h])
+      output = (h, g)
+
+      output_val_ref, output_val, cost_graph = self._run(output)
+      node_map = _build_node_map(cost_graph.node)
+
+      self._assert_output_fp16(node_map, 'while/concat')
+      self._assert_output_fp16(node_map, 'while/MatMul')
+      self._assert_output_fp16(node_map, 'while/split')
+      self._assert_output_fp16(node_map, 'while/Sigmoid')
+      self._assert_output_fp16(node_map, 'while/Sigmoid_1')
+      self._assert_output_fp16(node_map, 'while/Sigmoid_2')
+      self._assert_output_fp16(node_map, 'while/Tanh')
+      self._assert_output_fp16(node_map, 'while/Tanh_1')
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
+
+  def testPropagationThroughIntertwinedLoop1(self):
+    self._run_intertwined_loop_test('C', 'C', 'bgW', 'C', 'g', 'b', 4, 3)
+
+  def testPropagationThroughIntertwinedLoop2(self):
+    # Note that this results in NextIteration and Merge being painted different
+    # colors, requiring NextIteration to be forced to match.
+    self._run_intertwined_loop_test('b', 'g', 'gW', 'C', 'c', 'C', 3, 2)
+
+  def testPropagationThroughIntertwinedLoop3(self):
+    self._run_intertwined_loop_test('g', 'g', 'g', 'g', 'W', 'c', 3, 2)
+
+  def testPropagationThroughIntertwinedLoop4(self):
+    self._run_intertwined_loop_test('W', 'g', 'g', 'g', 'g', 'g', 3, 2)
+
+  def testPropagationThroughIntertwinedLoop5(self):
+    self._run_intertwined_loop_test('W', 'c', 'b', 'c', 'c', 'W', 4, 2)
+
+  def testPropagationThroughIntertwinedLoop6(self):
+    self._run_intertwined_loop_test('b', 'g', 'g', 'g', 'g', 'W', 2, 1)
+
+  def testPropagationThroughIntertwinedLoop7(self):
+    self._run_intertwined_loop_test('c', 'c', 'bWg', 'c', 'g', 'b', 2, 1)
+
+  def testPropagationThroughIntertwinedLoop8(self):
+    self._run_intertwined_loop_test('C', 'C', 'C', 'C', 'W', 'g', 3, 2)
+
+  def testPropagationThroughIntertwinedLoop9(self):
+    self._run_intertwined_loop_test('W', 'g', 'G', 'G', 'g', 'W', 4, 2)
+
+  def testPropagationThroughIntertwinedLoop10(self):
+    self._run_intertwined_loop_test('g', 'g', 'GWG', 'G', 'g', 'g', 3, 2)
+
+if __name__ == '__main__':
+  test.main()