Merge pull request #40596 from reedwm:auto_mp_mkl2

PiperOrigin-RevId: 317381920
Change-Id: I8e7fe93090dafeedba1e7dccfb093d16c6e5b742
TensorFlower Gardener 2020-06-19 14:39:31 -07:00
commit 8e88146931
8 changed files with 925 additions and 478 deletions

View File

@ -1,11 +1,14 @@
load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cc_test_mkl", "tf_kernel_library")
load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cc_test_mkl", "tf_copts", "tf_cuda_cc_test", "tf_kernel_library")
# Platform specific build config
load(
"//tensorflow/core/platform:build_config_root.bzl",
"if_static",
)
load(
"//third_party/mkl:build_defs.bzl",
"mkl_deps",
)
package(
licenses = ["notice"], # Apache 2.0
@ -610,6 +613,7 @@ cc_library(
"auto_mixed_precision.h",
"auto_mixed_precision_lists.h",
],
copts = tf_copts(),
visibility = ["//visibility:public"],
deps = [
":custom_graph_optimizer_registry",
@ -627,7 +631,7 @@ cc_library(
"//tensorflow/core/grappler/costs:virtual_placer",
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/container:flat_hash_set",
],
] + mkl_deps(),
)
tf_cuda_cc_test(

View File

@ -16,6 +16,7 @@ limitations under the License.
#include "tensorflow/core/grappler/optimizers/auto_mixed_precision.h"
#include <fstream>
#include <memory>
#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
@ -52,6 +53,7 @@ const std::pair<int, int> kMinGPUArch = {0, 0};
const char kSuffix[] = "AutoMixedPrecision";
const char kCastToFp16[] = "CastToFp16";
const char kCastToBf16[] = "CastToBf16";
const char kCastToFp32[] = "CastToFp32";
// Instances of this class represent unique type attribute identifiers within a
@ -840,22 +842,6 @@ DataTypeSet AllowedDataTypes(const OpDef& op_def, const TypeAttrId& t_attr_id) {
return AllowedDataTypes(*attr_def);
}
NodeDef BuildCastNode(const MutableGraphView::OutputPort& src, bool to_fp16,
const string& device) {
const char* cast_string = to_fp16 ? kCastToFp16 : kCastToFp32;
string name = strings::StrCat(src.node->name(), "-", src.port_id, "-",
cast_string, "-", kSuffix);
NodeDef node;
node.set_name(name);
node.set_op("Cast");
node.set_device(device);
node.add_input(strings::StrCat(src.node->name(), ":", src.port_id));
(*node.mutable_attr())["SrcT"].set_type(to_fp16 ? DT_FLOAT : DT_HALF);
(*node.mutable_attr())["DstT"].set_type(to_fp16 ? DT_HALF : DT_FLOAT);
(*node.mutable_attr())["Truncate"].set_b(false);
return node;
}
Status ValidateLists(const gtl::FlatSet<string>& white_list,
const gtl::FlatSet<string>& black_list,
const gtl::FlatSet<string>& gray_list,
@ -941,7 +927,8 @@ class AutoMixedPrecisionImpl {
public:
AutoMixedPrecisionImpl(Cluster* cluster,
const std::unordered_set<string>& nodes_to_preserve,
GraphDef* graph, string id)
GraphDef* graph, string id,
AutoMixedPrecisionMode mode)
: virtual_placer_(cluster->GetDevices()),
nodes_to_preserve_(nodes_to_preserve),
graph_(graph),
@ -949,23 +936,35 @@ class AutoMixedPrecisionImpl {
id_(id),
graph_view_(graph),
cuda_version_(GetCudaVersion(*cluster)),
cudnn_version_(GetCudnnVersion(*cluster)) {}
cudnn_version_(GetCudnnVersion(*cluster)),
mode_(mode),
target_dtype_(mode_ == AutoMixedPrecisionMode::CUDA ? DT_HALF
: DT_BFLOAT16) {}
Status Optimize();
private:
typedef absl::flat_hash_set<NodeTypeId> NodeTypeIdSet;
std::unique_ptr<AutoMixedPrecisionLists> get_mixed_precision_lists() const {
switch (mode_) {
case AutoMixedPrecisionMode::CUDA:
return std::make_unique<AutoMixedPrecisionListsCuda>(cuda_version_,
cudnn_version_);
case AutoMixedPrecisionMode::MKL:
return std::make_unique<AutoMixedPrecisionListsMkl>();
}
}
Status PrintDebugLogs(bool preop, size_t timestamp);
void LogSkippedNode(const NodeDef& node) const;
bool MustPreserve(const NodeDef& node) const;
bool IsOnGPU(const NodeDef& node) const;
bool IsOnDevice(const NodeDef& node, const string& device_type) const;
bool IsOnSuitableGPUArch(const NodeDef& node) const;
bool ShouldProcess(const NodeDef& node) const;
bool NodeHasFP16KernelForTypeAttr(const NodeDef& node, TypeAttrId taid) const;
bool NodeHasF16KernelForTypeAttr(const NodeDef& node, TypeAttrId taid) const;
bool NodeImplicitlyReadsNonResourceVariable(const NodeDef& node) const;
void ConvertBatchNormOpsToV2();
bool SupportsFloat16(const NodeTypeId& node_type) const;
bool SupportsF16(const NodeTypeId& node_type) const;
const NodeTypeId* GetTensorListFloat32NodeTypeId(const NodeDef& node) const;
bool IsSourceOrSinkOp(const string& op) const;
void FindFloat32TensorListOpClustersAndBlacklistUnsafe(
@ -990,6 +989,8 @@ class AutoMixedPrecisionImpl {
absl::flat_hash_set<int>* white_set) const;
void MakeCastsWhiteIfAllOutputsWhite(
absl::flat_hash_set<int>* white_set) const;
NodeDef BuildCastNode(const MutableGraphView::OutputPort& src, bool to_f16,
const string& device) const;
Status ChangeTypeAttrsAndAddCasts(const absl::flat_hash_set<int>& white_set);
VirtualPlacer virtual_placer_;
@ -1003,21 +1004,44 @@ class AutoMixedPrecisionImpl {
NodeTypeAttrMap node_type_map_;
GraphTypeTopologyView graph_type_view_;
bool force_all_fp16_;
gtl::FlatSet<string> fp16_whitelist_;
gtl::FlatSet<string> fp16_blacklist_;
gtl::FlatSet<string> fp16_graylist_;
gtl::FlatSet<string> fp16_clearlist_;
AutoMixedPrecisionMode mode_;
gtl::FlatSet<string> f16_whitelist_;
gtl::FlatSet<string> f16_blacklist_;
gtl::FlatSet<string> f16_graylist_;
gtl::FlatSet<string> f16_clearlist_;
absl::flat_hash_set<const NodeDef*> should_process_nodes_;
DataType target_dtype_; // Either DT_HALF or DT_BFLOAT16
};
bool AutoMixedPrecisionImpl::NodeHasFP16KernelForTypeAttr(
NodeDef AutoMixedPrecisionImpl::BuildCastNode(
const MutableGraphView::OutputPort& src, bool to_f16,
const string& device) const {
DataType src_type = to_f16 ? DT_FLOAT : target_dtype_;
DataType dst_type = to_f16 ? target_dtype_ : DT_FLOAT;
const char* cast_string =
!to_f16 ? kCastToFp32
: target_dtype_ == DT_HALF ? kCastToFp16 : kCastToBf16;
string name = strings::StrCat(src.node->name(), "-", src.port_id, "-",
cast_string, "-", kSuffix);
NodeDef node;
node.set_name(name);
node.set_op("Cast");
node.set_device(device);
node.add_input(strings::StrCat(src.node->name(), ":", src.port_id));
(*node.mutable_attr())["SrcT"].set_type(src_type);
(*node.mutable_attr())["DstT"].set_type(dst_type);
(*node.mutable_attr())["Truncate"].set_b(false);
return node;
}
bool AutoMixedPrecisionImpl::NodeHasF16KernelForTypeAttr(
const NodeDef& node, TypeAttrId taid) const {
NodeDef node_copy(node);
if (node.device().empty()) {
string device_name = virtual_placer_.get_canonical_device_name(node);
node_copy.set_device(device_name);
}
if (!SetDataType(&node_copy, taid, DataType::DT_HALF)) {
if (!SetDataType(&node_copy, taid, target_dtype_)) {
return false;
}
return IsKernelRegisteredForNode(node_copy).ok();
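For intuition, the NodeDef emitted by the BuildCastNode helper added above can be sketched in Python protobuf terms. This is an illustration only, assuming a hypothetical source tensor "conv1:0" on a CPU device with to_f16 == true on the MKL path (so SrcT is float32 and DstT is bfloat16); it is not code from this change.
from tensorflow.core.framework import node_def_pb2
from tensorflow.core.framework import types_pb2

cast = node_def_pb2.NodeDef()
# Name format: <src node>-<port>-<CastToBf16|CastToFp16|CastToFp32>-AutoMixedPrecision
cast.name = 'conv1-0-CastToBf16-AutoMixedPrecision'
cast.op = 'Cast'
cast.device = '/device:CPU:0'  # copied from the source node's device
cast.input.append('conv1:0')
cast.attr['SrcT'].type = types_pb2.DT_FLOAT
cast.attr['DstT'].type = types_pb2.DT_BFLOAT16
cast.attr['Truncate'].b = False
print(cast)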
@ -1053,21 +1077,22 @@ Status AutoMixedPrecisionImpl::PrintDebugLogs(bool preop, size_t timestamp) {
fname = io::JoinPath(prepend_path,
strings::StrCat("paintbuckets", suffix, ".txt"));
f.open(fname.c_str(), std::fstream::out);
std::unique_ptr<AutoMixedPrecisionLists> mp_lists =
get_mixed_precision_lists();
f << "WhiteList:\n";
for (const auto& x :
AutoMixedPrecisionLists::WhiteList(cuda_version_, cudnn_version_)) {
for (const auto& x : mp_lists->WhiteList()) {
f << x << "\n";
}
f << "\nBlackList:\n";
for (const auto& x : AutoMixedPrecisionLists::BlackList()) {
for (const auto& x : mp_lists->BlackList()) {
f << x << "\n";
}
f << "\nGrayList:\n";
for (const auto& x : AutoMixedPrecisionLists::GrayList()) {
for (const auto& x : mp_lists->GrayList()) {
f << x << "\n";
}
f << "\nClearList:\n";
for (const auto& x : AutoMixedPrecisionLists::ClearList()) {
for (const auto& x : mp_lists->ClearList()) {
f << x << "\n";
}
f.close();
@ -1088,7 +1113,8 @@ bool AutoMixedPrecisionImpl::MustPreserve(const NodeDef& node) const {
return nodes_to_preserve_.count(node.name());
}
bool AutoMixedPrecisionImpl::IsOnGPU(const NodeDef& node) const {
bool AutoMixedPrecisionImpl::IsOnDevice(const NodeDef& node,
const string& device_type) const {
string device_name;
if (node.device().empty()) {
device_name = virtual_placer_.get_canonical_device_name(node);
@ -1099,7 +1125,7 @@ bool AutoMixedPrecisionImpl::IsOnGPU(const NodeDef& node) const {
string not_used;
if (DeviceNameUtils::SplitDeviceName(device_name, &not_used, &device) &&
absl::StrContains(absl::AsciiStrToLower(device),
absl::AsciiStrToLower(DEVICE_GPU))) {
absl::AsciiStrToLower(device_type))) {
return true;
}
return false;
@ -1164,15 +1190,14 @@ bool IsTensorListWriterOp(const string& op) {
return tensor_list_writer_ops.count(op);
}
bool AutoMixedPrecisionImpl::SupportsFloat16(
const NodeTypeId& node_type) const {
bool AutoMixedPrecisionImpl::SupportsF16(const NodeTypeId& node_type) const {
const OpDef* op_def;
Status status =
OpRegistry::Global()->LookUpOpDef(node_type.node->op(), &op_def);
if (!status.ok()) return false;
return AllowedDataTypes(*op_def, node_type.type_attr)
.Contains(DataType::DT_HALF) &&
NodeHasFP16KernelForTypeAttr(*node_type.node, node_type.type_attr);
.Contains(target_dtype_) &&
NodeHasF16KernelForTypeAttr(*node_type.node, node_type.type_attr);
}
// TODO(mconley): Make this change the node's name (to aid debugging). Need to
@ -1219,22 +1244,40 @@ Status AutoMixedPrecisionImpl::Optimize() {
"TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_LEVEL", "", &optimization_level));
optimization_level = absl::AsciiStrToUpper(optimization_level);
force_all_fp16_ = optimization_level == "UNSAFE_FORCE_ALL";
if (force_all_fp16_ && mode_ == AutoMixedPrecisionMode::MKL) {
// Many ops do not support bfloat16 on the CPU, so we disallow forcing to
// bfloat16.
return errors::InvalidArgument(
"TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_LEVEL cannot be set to "
"UNSAFE_FORCE_ALL when MKL is used");
}
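For context, the unsafe mode rejected here under MKL is opted into through an environment variable that the pass reads at optimization time; a minimal sketch of how a user would set it for the CUDA path (the variable name and value come from the code above):
import os

# Force every candidate op to float16 on the CUDA path; the check above
# rejects this setting when the MKL (bfloat16) mode is active.
os.environ['TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_LEVEL'] = 'UNSAFE_FORCE_ALL'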
fp16_whitelist_ =
AutoMixedPrecisionLists::WhiteList(cuda_version_, cudnn_version_);
fp16_blacklist_ = AutoMixedPrecisionLists::BlackList();
fp16_graylist_ = AutoMixedPrecisionLists::GrayList();
fp16_clearlist_ = AutoMixedPrecisionLists::ClearList();
TF_RETURN_IF_ERROR(ValidateLists(fp16_whitelist_, fp16_blacklist_,
fp16_graylist_, fp16_clearlist_));
std::unique_ptr<AutoMixedPrecisionLists> mp_lists =
get_mixed_precision_lists();
f16_whitelist_ = mp_lists->WhiteList();
f16_blacklist_ = mp_lists->BlackList();
f16_graylist_ = mp_lists->GrayList();
f16_clearlist_ = mp_lists->ClearList();
TF_RETURN_IF_ERROR(ValidateLists(f16_whitelist_, f16_blacklist_,
f16_graylist_, f16_clearlist_));
size_t timestamp = Env::Default()->NowMicros() / 1000;
TF_RETURN_IF_ERROR(PrintDebugLogs(/* preop = */ true, timestamp));
VLOG(2) << "Identifying nodes that should be processed";
for (const NodeDef& node : graph_->node()) {
if (!MustPreserve(node) && IsOnGPU(node) &&
(ShouldIgnorePerformance() || IsOnSuitableGPUArch(node))) {
bool should_process;
switch (mode_) {
case AutoMixedPrecisionMode::CUDA:
should_process =
!MustPreserve(node) && IsOnDevice(node, DEVICE_GPU) &&
(ShouldIgnorePerformance() || IsOnSuitableGPUArch(node));
break;
case AutoMixedPrecisionMode::MKL:
should_process = !MustPreserve(node) && IsOnDevice(node, DEVICE_CPU);
break;
}
if (should_process) {
should_process_nodes_.insert(&node);
} else {
LogSkippedNode(node);
@ -1260,29 +1303,29 @@ Status AutoMixedPrecisionImpl::Optimize() {
for (const auto& cluster : tensor_list_clusters) {
VLOG(1) << "Found safe Tensor List cluster of size " << cluster.size();
for (const NodeDef* node : cluster) {
VLOG(2) << "Cluster member: " << node->op() << " node " << node->name();
VLOG(2) << " Cluster member: " << node->op() << " node " << node->name();
}
FindTensorListImplicitFloat32Edges(cluster, &ephemeral_edges);
}
TF_RETURN_IF_ERROR(graph_type_view_.AddEphemeralEdges(ephemeral_edges));
// The goal here is to change performance-critical ops to fp16, and to do so
// with the minimal number of casts, subject to the constraint that the
// The goal here is to change performance-critical ops to fp16 or bf16, and to
// do so with the minimal number of casts, subject to the constraint that the
// model's convergence is not affected. This is achieved by first identifying
// which nodes should be changed to fp16 and then inserting casts at the
// boundaries between fp16/non-fp16 nodes.
// which nodes should be changed to f16 and then inserting casts at the
// boundaries between f16/non-f16 nodes.
// The algorithm for deciding which nodes to change to fp16 is as follows:
// The algorithm for deciding which nodes to change to f16 is as follows:
// 1) Add all performance-critical ops (aka "whitelist" ops) to the white_set.
// This is done under the assumption that whitelist ops are always
// numerically-safe in fp16 and that they are the most important ops for
// numerically-safe in f16 and that they are the most important ops for
// improving performance.
// 2) Add nodes to the black_set iff they are numerically-dangerous (aka
// "blacklist" ops) or they are on a forward path from a blacklist node to
// a black/gray node (including the node at the end of the path) through
// non-numerically-dangerous ops (aka "greylist" and "clearlist" ops).
// This is done to prevent numerically-dangerous ops and their downstream
// effects from being changed to fp16, which would risk breaking the
// effects from being changed to f16, which would risk breaking the
// numerical accuracy of the model.
// 3) For all remaining nodes that are not considered dangerous (greylist
// and clearlist ops), find those that are between (i.e., both upstream
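As a toy illustration of the painting scheme described in the comment above (step 1 plus the later propagation of white through clearlist ops), the sketch below paints a linear chain of ops. It is a heavily simplified stand-in, not the grappler implementation: the black/gray handling, type attributes, and real graph topology are omitted, and the op lists are truncated examples.
# Toy white/clear painting on a linear chain of ops (illustrative only).
WHITE = {'MatMul', 'Conv2D'}   # performance-critical, always converted
CLEAR = {'Relu', 'Identity'}   # numerically neutral, converted if adjacent to white

def paint_chain(ops):
  """Returns the indices of ops that end up painted white (f16)."""
  white = {i for i, op in enumerate(ops) if op in WHITE}  # step 1: seed with whitelist
  changed = True
  while changed:  # propagate white through neighbouring clearlist ops
    changed = False
    for i, op in enumerate(ops):
      if i in white or op not in CLEAR:
        continue
      if (i - 1) in white or (i + 1) in white:
        white.add(i)
        changed = True
  return white

print(paint_chain(['Exp', 'Relu', 'MatMul', 'Relu', 'Log']))  # -> {1, 2, 3}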
@ -1480,7 +1523,7 @@ void AutoMixedPrecisionImpl::AddWhitelistOps(
const NodeTypeId& root = *graph_type_view_.GetNode(root_idx);
if (!ShouldProcess(*root.node)) continue;
bool force_white = force_all_fp16_ && CanForceFP16(*root.node);
if (fp16_whitelist_.count(root.node->op()) || force_white) {
if (f16_whitelist_.count(root.node->op()) || force_white) {
bool inserted = white_set->insert(root_idx).second;
if (VLOG_IS_ON(2) && inserted) {
VLOG(2) << "Painting type " << root.type_attr.DebugString()
@ -1504,8 +1547,8 @@ void AutoMixedPrecisionImpl::PropagateBlackFwdThroughClearAndGray(
absl::flat_hash_set<int> upstream_of_black_or_gray_set;
for (int root_idx = 0; root_idx < graph_type_view_.num_nodes(); ++root_idx) {
const NodeTypeId& root = *graph_type_view_.GetNode(root_idx);
if (!(fp16_blacklist_.count(root.node->op()) ||
fp16_graylist_.count(root.node->op()))) {
if (!(f16_blacklist_.count(root.node->op()) ||
f16_graylist_.count(root.node->op()))) {
continue;
}
DfsTypeTraversal(graph_type_view_, {&root},
@ -1514,7 +1557,7 @@ void AutoMixedPrecisionImpl::PropagateBlackFwdThroughClearAndGray(
const NodeTypeId& item = *graph_type_view_.GetNode(idx);
return idx == root_idx ||
(!upstream_of_black_or_gray_set.count(idx) &&
fp16_clearlist_.count(item.node->op()));
f16_clearlist_.count(item.node->op()));
}),
DfsTypeCallbacks::PreOrder([&](int idx) {
upstream_of_black_or_gray_set.insert(idx);
@ -1524,7 +1567,7 @@ void AutoMixedPrecisionImpl::PropagateBlackFwdThroughClearAndGray(
// Propagate black forward through nodes in upstream_of_black_or_gray_set.
for (int root_idx = 0; root_idx < graph_type_view_.num_nodes(); ++root_idx) {
const NodeTypeId& root = *graph_type_view_.GetNode(root_idx);
if (black_set->count(root_idx) || !fp16_blacklist_.count(root.node->op())) {
if (black_set->count(root_idx) || !f16_blacklist_.count(root.node->op())) {
continue;
}
DfsTypeTraversal(
@ -1552,7 +1595,7 @@ void AutoMixedPrecisionImpl::AddClearAndGrayToWhiteIfBetweenWhite(
absl::flat_hash_set<int> downstream_of_white_set;
for (int root_idx = 0; root_idx < graph_type_view_.num_nodes(); ++root_idx) {
const NodeTypeId& root = *graph_type_view_.GetNode(root_idx);
if (!ShouldProcess(*root.node) || !fp16_whitelist_.count(root.node->op())) {
if (!ShouldProcess(*root.node) || !f16_whitelist_.count(root.node->op())) {
continue;
}
DfsTypeTraversal(
@ -1561,14 +1604,14 @@ void AutoMixedPrecisionImpl::AddClearAndGrayToWhiteIfBetweenWhite(
const NodeTypeId& item = *graph_type_view_.GetNode(idx);
return idx == root_idx ||
(!downstream_of_white_set.count(idx) &&
!fp16_whitelist_.count(item.node->op()) &&
!f16_whitelist_.count(item.node->op()) &&
!black_set.count(idx) && ShouldProcess(*item.node) &&
// TODO(benbarsdell): Consider allowing propagation through
// ops that are already float16 in order to reduce the number
// of casts.
IsFloat32(item) && SupportsFloat16(item) &&
(fp16_clearlist_.count(item.node->op()) ||
fp16_graylist_.count(item.node->op())));
IsFloat32(item) && SupportsF16(item) &&
(f16_clearlist_.count(item.node->op()) ||
f16_graylist_.count(item.node->op())));
}),
DfsTypeCallbacks::PreOrder(
[&](int idx) { downstream_of_white_set.insert(idx); }));
@ -1579,7 +1622,7 @@ void AutoMixedPrecisionImpl::AddClearAndGrayToWhiteIfBetweenWhite(
for (int root_idx = 0; root_idx < graph_type_view_.num_nodes(); ++root_idx) {
const NodeTypeId& root = *graph_type_view_.GetNode(root_idx);
if (!ShouldProcess(*root.node) || upstream_of_white_set.count(root_idx) ||
!fp16_whitelist_.count(root.node->op())) {
!f16_whitelist_.count(root.node->op())) {
continue;
}
DfsTypeTraversal(
@ -1620,8 +1663,8 @@ void AutoMixedPrecisionImpl::PropagateWhiteThroughClear(
return idx == root_idx ||
(!white_set->count(idx) && !black_set.count(idx) &&
ShouldProcess(*item.node) && IsFloat32(item) &&
SupportsFloat16(item) &&
(fp16_clearlist_.count(item.node->op())) &&
SupportsF16(item) &&
(f16_clearlist_.count(item.node->op())) &&
// We don't propagate (backwards) through nodes that read
// Variables because it can break the behavior of TensorBoard
// visualization and/or (in the case of Enter nodes) the model
@ -1806,13 +1849,13 @@ void AutoMixedPrecisionImpl::MakeCastsWhiteIfAllOutputsWhite(
}
}
// Changes all white-painted type attributes to DT_HALF, and inserts Cast nodes
// at node outputs for all edges that connect white-painted <->
// non-white-painted type attributes.
// Changes all white-painted type attributes to DT_HALF or DT_BFLOAT16, and
// inserts Cast nodes at node outputs for all edges that connect
// white-painted <-> non-white-painted type attributes.
Status AutoMixedPrecisionImpl::ChangeTypeAttrsAndAddCasts(
const absl::flat_hash_set<int>& white_set) {
int num_nodes_changed = 0;
int num_nonvar_casts_to_fp16 = 0;
int num_nonvar_casts_to_f16 = 0;
int num_nodes_preop = graph_->node_size();
for (int node_idx = 0; node_idx < num_nodes_preop; ++node_idx) {
NodeDef* node = graph_->mutable_node(node_idx);
@ -1829,8 +1872,9 @@ Status AutoMixedPrecisionImpl::ChangeTypeAttrsAndAddCasts(
bool src_is_white = white_set.count(node_type_idx);
if (src_is_white) {
VLOG(1) << "Changing type " << type_attr.DebugString() << " of "
<< node->op() << " node " << node->name() << " to DT_HALF";
if (!SetDataType(node, type_attr, DT_HALF)) {
<< node->op() << " node " << node->name() << " to "
<< DataTypeString(target_dtype_);
if (!SetDataType(node, type_attr, target_dtype_)) {
return errors::Internal("Failed to set type attribute");
}
++num_nodes_changed;
@ -1855,16 +1899,16 @@ Status AutoMixedPrecisionImpl::ChangeTypeAttrsAndAddCasts(
bool dst_is_white = white_set.count(dst_type_idx);
if (src_is_white != dst_is_white) {
if (!added_cast_node) {
bool to_fp16 = dst_is_white;
bool to_f16 = dst_is_white;
VLOG(1) << "Inserting cast to "
<< (to_fp16 ? "DT_HALF" : "DT_FLOAT") << " at "
<< src.node->op() << " " << src.node->name() << ":"
<< src.port_id;
<< (to_f16 ? DataTypeString(target_dtype_) : "DT_FLOAT")
<< " at " << src.node->op() << " " << src.node->name()
<< ":" << src.port_id;
added_cast_node = graph_view_.AddNode(
BuildCastNode(src, to_fp16, src.node->device()));
if (to_fp16 && !IsConstant(*node) && !IsVariable(*node) &&
BuildCastNode(src, to_f16, src.node->device()));
if (to_f16 && !IsConstant(*node) && !IsVariable(*node) &&
!NodeImplicitlyReadsNonResourceVariable(*node)) {
++num_nonvar_casts_to_fp16;
++num_nonvar_casts_to_f16;
}
}
TF_RETURN_IF_ERROR(graph_view_.UpdateRegularFaninByPort(
@ -1874,9 +1918,13 @@ Status AutoMixedPrecisionImpl::ChangeTypeAttrsAndAddCasts(
}
}
}
// Use Python type names (e.g. float16) instead of C++ type names (e.g. half)
// since many Python users will see this message.
const char* type_str = target_dtype_ == DT_HALF ? "float16" : "bfloat16";
LOG(INFO) << "Converted " << num_nodes_changed << "/" << num_nodes_preop
<< " nodes to float16 precision using " << num_nonvar_casts_to_fp16
<< " cast(s) to float16 (excluding Const and Variable casts)";
<< " nodes to " << type_str << " precision using "
<< num_nonvar_casts_to_f16 << " cast(s) to " << type_str
<< " (excluding Const and Variable casts)";
return Status::OK();
}
@ -1902,12 +1950,24 @@ Status AutoMixedPrecision::Optimize(Cluster* cluster, const GrapplerItem& item,
return errors::InvalidArgument("cluster == nullptr");
}
#if !defined(INTEL_MKL) || !defined(ENABLE_INTEL_MKL_BFLOAT16)
if (mode_ == AutoMixedPrecisionMode::MKL) {
return errors::Unimplemented(
"The auto_mixed_precision_mkl optimizer cannot be used since "
"this build of TensorFlow is not compiled with MKL support for "
"bfloat16. "
"For information on MKL builds, see: "
"https://software.intel.com/en-us/articles/intel-optimization-for-"
"tensorflow-installation-guide");
}
#endif
// Start by copying input graph to output.
*output = item.graph;
int num_gpus = ShouldIgnorePerformance() ? GetNumGPUs(*cluster)
: GetNumGPUs(*cluster, kMinGPUArch);
if (num_gpus < 1) {
if (num_gpus < 1 && mode_ == AutoMixedPrecisionMode::CUDA) {
// AutoMixedPrecision is currently only tuned for GPU.
LOG(WARNING) << "No (suitable) GPUs detected, skipping " << name()
<< " graph optimizer";
@ -1916,7 +1976,7 @@ Status AutoMixedPrecision::Optimize(Cluster* cluster, const GrapplerItem& item,
// Optimize the output graph in-place.
AutoMixedPrecisionImpl optimizer(cluster, item.NodesToPreserve(), output,
item.id);
item.id, mode_);
if (item.id == "tf_graph") {
LOG(INFO) << "Running " << name() << " graph optimizer";
} else {

View File

@ -22,16 +22,25 @@ limitations under the License.
namespace tensorflow {
namespace grappler {
// Convert data types to float16 where appropriate to improve performance on
// GPUs.
enum class AutoMixedPrecisionMode { CUDA, MKL };
// Convert data types to float16 or bfloat16 where appropriate to improve
// performance on GPUs or CPUs.
class AutoMixedPrecision : public GraphOptimizer {
public:
// If 'mode' is CUDA, converts nodes to float16 on Nvidia GPUs. If MKL,
// converts nodes to bfloat16 on CPUs in order to take advantage of MKL
// performance improvements with bfloat16.
explicit AutoMixedPrecision(
RewriterConfig::Toggle opt_level = RewriterConfig::ON) {}
AutoMixedPrecisionMode mode = AutoMixedPrecisionMode::CUDA)
: mode_(mode) {}
~AutoMixedPrecision() override {}
string name() const override { return "auto_mixed_precision"; };
string name() const override {
return mode_ == AutoMixedPrecisionMode::CUDA ? "auto_mixed_precision_cuda"
: "auto_mixed_precision_mkl";
};
bool UsesFunctionLibrary() const override { return false; }
@ -40,6 +49,9 @@ class AutoMixedPrecision : public GraphOptimizer {
void Feedback(Cluster* cluster, const GrapplerItem& item,
const GraphDef& optimize_output, double result) override;
private:
const AutoMixedPrecisionMode mode_;
};
} // end namespace grappler

View File

@ -23,10 +23,43 @@ limitations under the License.
namespace tensorflow {
namespace grappler {
// Represents the four lists of ops: the white list, gray list, black list, and
// clear list. These lists determine which ops are converted to fp16/bf16
// (referred to as 'f16' for short) and which ops stay as fp32.
class AutoMixedPrecisionLists {
private:
static void UpdateList(gtl::FlatSet<string>* list, const string& to_add,
const string& to_remove) {
public:
virtual ~AutoMixedPrecisionLists() {}
// Returns the set of ops that are considered numerically-safe (for execution
// in f16), performance-critical, and can run in f16. These ops are always
// converted to f16.
virtual gtl::FlatSet<string> WhiteList() = 0;
// Returns the set of ops that can run in f16 and are considered numerically-
// safe (for execution in f16), but which may be made unsafe by an upstream
// blacklist op.
virtual gtl::FlatSet<string> GrayList() = 0;
// Returns the set of ops that are considered numerically-dangerous (i.e.,
// unsafe for execution in f16) and whose effects may also be observed in
// downstream nodes (e.g. for f16, in Exp -> Add, the Add is unsafe due to
// the Exp).
virtual gtl::FlatSet<string> BlackList() = 0;
// Returns the set of ops that do not have numerically-significant effects
// (i.e., they are always considered safe for execution in f16 precision), and
// can run in f16.
virtual gtl::FlatSet<string> ClearList() = 0;
protected:
// Adds or removes ops from the list if certain environment variables are set.
static void UpdateList(const string& list_name, gtl::FlatSet<string>* list) {
CHECK(list_name == "WHITELIST" || list_name == "GRAYLIST" || // Crash OK.
list_name == "BLACKLIST" || list_name == "CLEARLIST");
string add_env_var =
"TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_" + list_name + "_ADD";
string remove_env_var =
"TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_" + list_name + "_REMOVE";
string to_add, to_remove;
TF_CHECK_OK(ReadStringFromEnvVar(add_env_var, "", &to_add));
TF_CHECK_OK(ReadStringFromEnvVar(remove_env_var, "", &to_remove));
for (const auto& x : str_util::Split(to_add, ",")) {
list->insert(x);
}
@ -35,6 +68,29 @@ class AutoMixedPrecisionLists {
}
}
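A usage sketch of the environment variables consumed by UpdateList above; the variable names are built from the list name as shown in the code, while the op names below are placeholders chosen for illustration:
import os

# Add a (hypothetical) custom op to the clear list and drop 'Mul' from the
# gray list; values are comma-separated op names, read when the pass runs.
os.environ['TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_CLEARLIST_ADD'] = 'MyClearOp'
os.environ['TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_GRAYLIST_REMOVE'] = 'Mul'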
// Subclasses should include these on the ClearList.
static void AddTensorListOps(gtl::FlatSet<string>* list) {
// Note: if a data structure op (such as TensorListPopBack) is added here,
// IsTensorListReaderOp or IsTensorListWriterOp may need to be modified
// LINT.IfChange
constexpr const char* tensor_list_ops[] = {
"TensorListConcat", "TensorListConcatLists",
"TensorListConcatV2", "TensorListGather",
"TensorListGetItem", "TensorListPopBack",
"TensorListPushBack", "TensorListPushBackBatch",
"TensorListFromTensor", "TensorListScatter",
"TensorListScatterV2", "TensorListScatterIntoExistingList",
"TensorListSetItem", "TensorListSplit",
"TensorListStack"};
// LINT.ThenChange(//tensorflow/core/grappler/optimizers/auto_mixed_precision.cc)
for (auto op : tensor_list_ops) {
list->insert(op);
}
}
};
class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists {
private:
static bool IsPseudoFastMath() {
string optimization_level;
TF_CHECK_OK(
@ -45,16 +101,10 @@ class AutoMixedPrecisionLists {
}
public:
// Returns the set of ops that are considered numerically-safe (for execution
// in fp16) and performance-critical. These ops are always converted to fp16.
static gtl::FlatSet<string> WhiteList(int cuda_version, int cudnn_version) {
string to_add, to_remove;
TF_CHECK_OK(ReadStringFromEnvVar(
"TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_WHITELIST_ADD", "", &to_add));
TF_CHECK_OK(ReadStringFromEnvVar(
"TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_WHITELIST_REMOVE", "",
&to_remove));
AutoMixedPrecisionListsCuda(int cuda_version, int cudnn_version)
: cuda_version_(cuda_version), cudnn_version_(cudnn_version) {}
gtl::FlatSet<string> WhiteList() override {
auto list = gtl::FlatSet<string>{
"BlockLSTM",
"BlockLSTMV2",
@ -81,12 +131,12 @@ class AutoMixedPrecisionLists {
// "DepthwiseConv2dNativeBackpropInput",
"MatMul",
};
if (cuda_version >= 9010) {
if (cuda_version_ >= 9010) {
// Fp16 BatchMatMul is slow before CUDA 9.1.
list.insert("BatchMatMul");
list.insert("BatchMatMulV2");
}
if (cudnn_version >= 7602) {
if (cudnn_version_ >= 7602) {
// Fp16 3D conv is slow before CUDNN 7.6.2.
list.insert("Conv3D");
list.insert("Conv3DBackpropFilter");
@ -94,22 +144,14 @@ class AutoMixedPrecisionLists {
list.insert("Conv3DBackpropInput");
list.insert("Conv3DBackpropInputV2");
}
UpdateList(&list, to_add, to_remove);
UpdateList("WHITELIST", &list);
return list;
}
// Returns the set of ops that are considered numerically-safe (for execution
// in fp16), but which may be made unsafe by an upstream blacklist op.
static gtl::FlatSet<string> GrayList() {
gtl::FlatSet<string> GrayList() override {
if (IsPseudoFastMath()) {
return gtl::FlatSet<string>{};
}
string to_add, to_remove;
TF_CHECK_OK(ReadStringFromEnvVar(
"TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_GRAYLIST_ADD", "", &to_add));
TF_CHECK_OK(ReadStringFromEnvVar(
"TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_GRAYLIST_REMOVE", "",
&to_remove));
auto list = gtl::FlatSet<string>{
"Add",
@ -156,23 +198,14 @@ class AutoMixedPrecisionLists {
"Tanh",
"TanhGrad",
};
UpdateList(&list, to_add, to_remove);
UpdateList("GRAYLIST", &list);
return list;
}
// Returns the set of ops that are considered numerically-dangerous (i.e.,
// unsafe for execution in fp16) and whose effects may also be observed in
// downstream nodes (e.g., in Exp -> Add, the Add is unsafe due to the Exp).
static gtl::FlatSet<string> BlackList() {
gtl::FlatSet<string> BlackList() override {
if (IsPseudoFastMath()) {
return gtl::FlatSet<string>{};
}
string to_add, to_remove;
TF_CHECK_OK(ReadStringFromEnvVar(
"TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_BLACKLIST_ADD", "", &to_add));
TF_CHECK_OK(ReadStringFromEnvVar(
"TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_BLACKLIST_REMOVE", "",
&to_remove));
auto list = gtl::FlatSet<string>{
"Exp",
@ -185,22 +218,14 @@ class AutoMixedPrecisionLists {
"SparseSoftmaxCrossEntropyWithLogits",
"Sum",
};
UpdateList(&list, to_add, to_remove);
UpdateList("BLACKLIST", &list);
return list;
}
// Returns the set of ops that do not have numerically-significant effects
// (i.e., they are always considered safe for execution in fp16 precision).
static gtl::FlatSet<string> ClearList() {
gtl::FlatSet<string> ClearList() override {
if (IsPseudoFastMath()) {
return gtl::FlatSet<string>{};
}
string to_add, to_remove;
TF_CHECK_OK(ReadStringFromEnvVar(
"TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_CLEARLIST_ADD", "", &to_add));
TF_CHECK_OK(ReadStringFromEnvVar(
"TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_CLEARLIST_REMOVE", "",
&to_remove));
auto list = gtl::FlatSet<string>{
"Abs",
@ -291,21 +316,6 @@ class AutoMixedPrecisionLists {
"StridedSlice",
"StridedSliceGrad",
"Switch",
"TensorListConcat",
"TensorListConcatLists",
"TensorListConcatV2",
"TensorListGather",
"TensorListGetItem",
"TensorListPopBack",
"TensorListPushBack",
"TensorListPushBackBatch",
"TensorListFromTensor",
"TensorListScatter",
"TensorListScatterV2",
"TensorListScatterIntoExistingList",
"TensorListSetItem",
"TensorListSplit",
"TensorListStack",
"Tile",
"TopK",
"TopKV2",
@ -313,7 +323,96 @@ class AutoMixedPrecisionLists {
"Where",
"ZerosLike",
};
UpdateList(&list, to_add, to_remove);
AddTensorListOps(&list);
UpdateList("CLEARLIST", &list);
return list;
}
private:
int cuda_version_;
int cudnn_version_;
};
class AutoMixedPrecisionListsMkl : public AutoMixedPrecisionLists {
public:
AutoMixedPrecisionListsMkl() {}
// Only ops which are supported by MKL in bfloat16 should be added to the
// white list, gray list, or clear list.
gtl::FlatSet<string> WhiteList() override {
auto list = gtl::FlatSet<string>{"Conv2D",
"Conv2DBackpropFilter",
"Conv2DBackpropInput",
"Conv3D",
"Conv3DBackpropFilterV2",
"Conv3DBackpropInputV2",
"DepthwiseConv2dNative",
"DepthwiseConv2dNativeBackpropFilter",
"DepthwiseConv2dNativeBackpropInput",
"MatMul",
"BatchMatMul",
"BatchMatMulV2"};
UpdateList("WHITELIST", &list);
return list;
}
gtl::FlatSet<string> GrayList() override {
auto list = gtl::FlatSet<string>{
"Add",
"AddN",
"AddV2",
"AvgPool",
"AvgPool3D",
"AvgPool3DGrad",
"AvgPoolGrad",
"BiasAdd",
"BiasAddGrad",
"BiasAddV1",
"FusedBatchNormV2",
"FusedBatchNormGradV2",
"FusedBatchNormV3",
"FusedBatchNormGradV3",
"LeakyRelu",
"LeakyReluGrad",
"Mul",
"Sub",
};
UpdateList("GRAYLIST", &list);
return list;
}
gtl::FlatSet<string> BlackList() override {
auto list = gtl::FlatSet<string>{
"Exp",
"Expm1",
"L2Loss",
"Mean",
"Pow",
"SaveV2",
"Softmax",
"SoftmaxCrossEntropyWithLogits",
"SparseSoftmaxCrossEntropyWithLogits",
"Sum",
};
UpdateList("BLACKLIST", &list);
return list;
}
gtl::FlatSet<string> ClearList() override {
auto list = gtl::FlatSet<string>{
"Concat", "ConcatV2", "Enter", "EnsureShape",
"Equal", "Exit", "ExpandDims", "Identity",
"MaxPool", "MaxPool3D", "MaxPool3DGrad", "MaxPoolGrad",
"MaxPoolV2", "Maximum", "Merge", "NextIteration",
"PreventGradient", "Relu", "Relu6", "Relu6Grad",
"ReluGrad", "Reshape", "Select", "SelectV2",
"Shape", "ShapeN", "Slice", "Split",
"SplitV", "Squeeze", "StopGradient", "Switch",
"Transpose", "ZerosLike",
};
AddTensorListOps(&list);
UpdateList("CLEARLIST", &list);
return list;
}
};

View File

@ -13,11 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Currently, this test only passes when TensorFlow is built with CUDA, because
// otherwise the optimizer will not turn clearlist nodes to float16. When
// looking at clearlist nodes, this optimizer checks if the nodes have a float16
// GPU OpKernel, but without CUDA there are no GPU OpKernels at all.
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM || \
(INTEL_MKL && defined(ENABLE_INTEL_MKL_BFLOAT16))
#include "tensorflow/core/grappler/optimizers/auto_mixed_precision.h"
@ -70,6 +67,31 @@ Tensor GenerateRandomTensorInRange(const TensorShape& shape, double minval,
return tensor;
}
void VerifyGraphsEquivalent(const GraphDef& original_graph,
const GraphDef& optimized_graph,
const string& func) {
EXPECT_EQ(original_graph.node_size(), optimized_graph.node_size()) << func;
GraphView optimized_view(&optimized_graph);
for (int i = 0; i < original_graph.node_size(); ++i) {
const NodeDef& original = original_graph.node(i);
const NodeDef& optimized = *optimized_view.GetNode(original.name());
EXPECT_EQ(original.name(), optimized.name()) << func;
EXPECT_EQ(original.op(), optimized.op()) << func;
EXPECT_EQ(original.input_size(), optimized.input_size()) << func;
if (original.input_size() == optimized.input_size()) {
for (int j = 0; j < original.input_size(); ++j) {
EXPECT_EQ(original.input(j), optimized.input(j)) << func;
}
}
}
}
// Currently, this test suite only passes when TensorFlow is built with CUDA,
// because otherwise the optimizer will not turn clearlist nodes to float16.
// When looking at clearlist nodes, this optimizer checks if the nodes have a
// float16 GPU OpKernel, but without CUDA there are no GPU OpKernels at all.
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
const std::pair<int, int> kMinGPUArch = {7, 0};
class AutoMixedPrecisionTest : public GrapplerTest {
@ -184,25 +206,6 @@ class AutoMixedPrecisionTest : public GrapplerTest {
bool gpu_available_;
};
void VerifyGraphsEquivalent(const GraphDef& original_graph,
const GraphDef& optimized_graph,
const string& func) {
EXPECT_EQ(original_graph.node_size(), optimized_graph.node_size()) << func;
GraphView optimized_view(&optimized_graph);
for (int i = 0; i < original_graph.node_size(); ++i) {
const NodeDef& original = original_graph.node(i);
const NodeDef& optimized = *optimized_view.GetNode(original.name());
EXPECT_EQ(original.name(), optimized.name()) << func;
EXPECT_EQ(original.op(), optimized.op()) << func;
EXPECT_EQ(original.input_size(), optimized.input_size()) << func;
if (original.input_size() == optimized.input_size()) {
for (int j = 0; j < original.input_size(); ++j) {
EXPECT_EQ(original.input(j), optimized.input(j)) << func;
}
}
}
}
TEST_F(AutoMixedPrecisionTest, NoOp) {
tensorflow::Scope s = tensorflow::Scope::NewRootScope();
Output input = ops::Const(s.WithOpName("input"), 1.234f, {32});
@ -1164,8 +1167,191 @@ TEST_F(AutoMixedPrecisionTest, TanhOp) {
});
}
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#if INTEL_MKL
#ifdef ENABLE_INTEL_MKL_BFLOAT16
class AutoMixedPrecisionMklTest : public GrapplerTest {
protected:
void SetUp() override {
virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 0));
TF_CHECK_OK(virtual_cluster_->Provision());
}
void TearDown() override { TF_CHECK_OK(virtual_cluster_->Shutdown()); }
std::unique_ptr<Cluster> virtual_cluster_;
};
TEST_F(AutoMixedPrecisionMklTest, AlreadyBf16) {
tensorflow::Scope s = tensorflow::Scope::NewRootScope();
Output input = ops::Const(s.WithOpName("input"), 1.f, {32, 32});
Output cst1 = ops::Cast(s.WithOpName("cst1"), input, DT_BFLOAT16);
Output wht1 = ops::MatMul(s.WithOpName("wht1"), cst1, cst1);
Output clr1 = ops::Relu(s.WithOpName("clr1"), wht1);
Output cst2 = ops::Cast(s.WithOpName("cst2"), clr1, DT_FLOAT);
Output clr2 = ops::Relu(s.WithOpName("clr2"), cst2);
Output fetch = ops::Identity(s.WithOpName("fetch"), clr2);
GrapplerItem item;
item.fetch = {"fetch"};
TF_CHECK_OK(s.ToGraphDef(&item.graph));
auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
AutoMixedPrecision optimizer{AutoMixedPrecisionMode::MKL};
GraphDef output;
TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
VLOG(1) << output.DebugString();
VerifyGraphsEquivalent(item.graph, output, __FUNCTION__);
GraphView output_view(&output);
EXPECT_EQ(output_view.GetNode("input")->attr().at("dtype").type(), DT_FLOAT);
EXPECT_EQ(output_view.GetNode("cst1")->attr().at("DstT").type(), DT_BFLOAT16);
EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_BFLOAT16);
EXPECT_EQ(output_view.GetNode("clr1")->attr().at("T").type(), DT_BFLOAT16);
EXPECT_EQ(output_view.GetNode("cst2")->attr().at("SrcT").type(), DT_BFLOAT16);
EXPECT_EQ(output_view.GetNode("cst2")->attr().at("DstT").type(), DT_FLOAT);
EXPECT_EQ(output_view.GetNode("clr2")->attr().at("T").type(), DT_FLOAT);
auto tensors = EvaluateNodes(output, item.fetch);
EXPECT_EQ(tensors.size(), tensors_expected.size());
EXPECT_EQ(tensors.size(), item.fetch.size());
for (int i = 0; i < item.fetch.size(); ++i) {
test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-6);
}
}
TEST_F(AutoMixedPrecisionMklTest, Simple) {
tensorflow::Scope s = tensorflow::Scope::NewRootScope();
Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
Output blk1 = ops::Exp(s.WithOpName("blk1"), input);
Output clr1 = ops::Relu(s.WithOpName("clr1"), blk1);
Output gry1 = ops::Sqrt(s.WithOpName("gry1"), clr1);
Output clr2 = ops::Relu(s.WithOpName("clr2"), gry1);
Output wht1 = ops::MatMul(s.WithOpName("wht1"), clr2, clr2);
Output clr3 = ops::Relu(s.WithOpName("clr3"), wht1);
Output blk2 = ops::Log(s.WithOpName("blk2"), clr3);
Output clr4 = ops::Relu(s.WithOpName("clr4"), blk2);
Output blk3 = ops::SparseMatMul(s.WithOpName("blk3"), clr4, clr4);
Output clr5 = ops::Relu(s.WithOpName("clr5"), blk3);
Output fetch = ops::Identity(s.WithOpName("fetch"), clr5);
GrapplerItem item;
item.fetch = {"fetch"};
TF_CHECK_OK(s.ToGraphDef(&item.graph));
auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
AutoMixedPrecision optimizer{AutoMixedPrecisionMode::MKL};
GraphDef output;
TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
VLOG(1) << output.DebugString();
GraphView output_view(&output);
EXPECT_EQ(output.node_size(), item.graph.node_size() + 2);
EXPECT_EQ(output_view.GetNode("input")->attr().at("dtype").type(), DT_FLOAT);
EXPECT_EQ(output_view.GetNode("blk1")->attr().at("T").type(), DT_FLOAT);
EXPECT_EQ(output_view.GetNode("clr1")->attr().at("T").type(), DT_FLOAT);
EXPECT_EQ(output_view.GetNode("gry1")->attr().at("T").type(), DT_FLOAT);
EXPECT_EQ(output_view.GetNode("clr2")->attr().at("T").type(), DT_BFLOAT16);
EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_BFLOAT16);
EXPECT_EQ(output_view.GetNode("clr3")->attr().at("T").type(), DT_BFLOAT16);
EXPECT_EQ(output_view.GetNode("blk2")->attr().at("T").type(), DT_FLOAT);
EXPECT_EQ(output_view.GetNode("clr4")->attr().at("T").type(), DT_FLOAT);
EXPECT_EQ(output_view.GetNode("blk3")->attr().at("Ta").type(), DT_FLOAT);
EXPECT_EQ(output_view.GetNode("blk3")->attr().at("Tb").type(), DT_FLOAT);
EXPECT_EQ(output_view.GetNode("clr5")->attr().at("T").type(), DT_FLOAT);
auto tensors = EvaluateNodes(output, item.fetch);
EXPECT_EQ(tensors.size(), tensors_expected.size());
EXPECT_EQ(tensors.size(), item.fetch.size());
for (int i = 0; i < item.fetch.size(); ++i) {
test::ExpectClose(tensors_expected[i], tensors[i], -1, 5e-4);
}
}
TEST_F(AutoMixedPrecisionMklTest, TensorListSetGet) {
tensorflow::Scope s = tensorflow::Scope::NewRootScope();
tensorflow::Input shape = {32, 32};
auto tl1 = ops::TensorListReserve(s.WithOpName("tl1"), {32, 32}, 8, DT_FLOAT);
Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
Output idx1 = ops::Const(s.WithOpName("idx1"), 1);
Output idx2 = ops::Const(s.WithOpName("idx2"), 2);
Output idx3 = ops::Const(s.WithOpName("idx3"), 3);
auto tl1w1 =
ops::TensorListSetItem(s.WithOpName("tl1w1"), tl1.handle, idx1, input);
Output wht1 = ops::MatMul(s.WithOpName("wht1"), input, input);
auto tl1w2 =
ops::TensorListSetItem(s.WithOpName("tl1w2"), tl1.handle, idx2, wht1);
// Ensure that TensorListResize doesn't cause any problems.
Output tl1rs =
ops::TensorListResize(s.WithOpName("tl1rs"), tl1w2.output_handle, 6);
Output tl1r1 = ops::TensorListGetItem(s.WithOpName("tl1r1"), tl1rs, idx2,
shape, DT_FLOAT)
.item;
Output gry1 = ops::Mul(s.WithOpName("gry1"), tl1r1, tl1r1);
Output wht2 = ops::MatMul(s.WithOpName("wht2"), gry1, gry1);
auto tl1w3 =
ops::TensorListSetItem(s.WithOpName("tl1w3"), tl1.handle, idx3, wht2);
Output tl1r2 =
ops::TensorListGetItem(s.WithOpName("tl1r2"), tl1w3.output_handle, idx3,
shape, DT_FLOAT)
.item;
auto tl2 = ops::TensorListReserve(s.WithOpName("tl2"), shape, 8, DT_FLOAT);
auto tl2w1 =
ops::TensorListSetItem(s.WithOpName("tl2w1"), tl2.handle, idx1, input);
Output tl2r1 =
ops::TensorListGetItem(s.WithOpName("tl2r1"), tl2w1.output_handle, idx1,
shape, DT_FLOAT)
.item;
Output fetch1 = ops::Identity(s.WithOpName("fetch1"), tl1r2);
Output fetch2 = ops::Identity(s.WithOpName("fetch2"), tl2r1);
GrapplerItem item;
item.fetch = {"fetch1", "fetch2"};
TF_CHECK_OK(s.ToGraphDef(&item.graph));
auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
AutoMixedPrecision optimizer{AutoMixedPrecisionMode::MKL};
GraphDef output;
TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
VLOG(1) << output.DebugString();
GraphView output_view(&output);
EXPECT_EQ(output.node_size(), item.graph.node_size() + 2);
const char* type_key = "element_dtype";
EXPECT_EQ(output_view.GetNode("tl1")->attr().at(type_key).type(),
DT_BFLOAT16);
EXPECT_EQ(output_view.GetNode("tl1w1")->attr().at(type_key).type(),
DT_BFLOAT16);
EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_BFLOAT16);
EXPECT_EQ(output_view.GetNode("tl1w2")->attr().at(type_key).type(),
DT_BFLOAT16);
EXPECT_EQ(output_view.GetNode("tl1r1")->attr().at(type_key).type(),
DT_BFLOAT16);
EXPECT_EQ(output_view.GetNode("gry1")->attr().at("T").type(), DT_BFLOAT16);
EXPECT_EQ(output_view.GetNode("wht2")->attr().at("T").type(), DT_BFLOAT16);
EXPECT_EQ(output_view.GetNode("tl1w3")->attr().at(type_key).type(),
DT_BFLOAT16);
EXPECT_EQ(output_view.GetNode("tl2")->attr().at(type_key).type(), DT_FLOAT);
EXPECT_EQ(output_view.GetNode("tl2w1")->attr().at(type_key).type(), DT_FLOAT);
EXPECT_EQ(output_view.GetNode("tl2r1")->attr().at(type_key).type(), DT_FLOAT);
auto tensors = EvaluateNodes(output, item.fetch);
EXPECT_EQ(tensors.size(), tensors_expected.size());
EXPECT_EQ(tensors.size(), item.fetch.size());
for (int i = 0; i < item.fetch.size(); ++i) {
test::ExpectClose(tensors_expected[i], tensors[i], -1, 1e-2);
}
}
#endif // ENABLE_INTEL_MKL_BFLOAT16
#endif // INTEL_MKL
} // namespace
} // namespace grappler
} // namespace tensorflow
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM || (INTEL_MKL &&
// defined(ENABLE_INTEL_MKL_BFLOAT16))

View File

@ -188,7 +188,9 @@ std::unique_ptr<GraphOptimizer> MetaOptimizer::MakeNewOptimizer(
MK_OPT("remap", new Remapper(cfg_.remapping()));
MK_OPT("layout", new GenericLayoutOptimizer());
MK_OPT("auto_mixed_precision",
new AutoMixedPrecision(cfg_.auto_mixed_precision()));
new AutoMixedPrecision(AutoMixedPrecisionMode::CUDA));
MK_OPT("auto_mixed_precision_mkl",
new AutoMixedPrecision(AutoMixedPrecisionMode::MKL));
MK_OPT("memory", new MemoryOptimizer(RewriterConfig::MANUAL));
MK_OPT("common_subgraph_elimination",
new CommonSubgraphElimination(cfg_.common_subgraph_elimination()));
@ -249,7 +251,11 @@ Status MetaOptimizer::InitializeOptimizers(
}
if (AutoMixedPrecisionEnabled(cfg_.auto_mixed_precision())) {
optimizers->push_back(
MakeUnique<AutoMixedPrecision>(cfg_.auto_mixed_precision()));
MakeUnique<AutoMixedPrecision>(AutoMixedPrecisionMode::CUDA));
}
if (AutoMixedPrecisionEnabled(cfg_.auto_mixed_precision_mkl())) {
optimizers->push_back(
MakeUnique<AutoMixedPrecision>(AutoMixedPrecisionMode::MKL));
}
if (cfg_.pin_to_host_optimization() == RewriterConfig::ON) {
optimizers->push_back(MakeUnique<PinToHostOptimizer>());
@ -835,6 +841,7 @@ bool MetaOptimizerEnabled(const ConfigProto& cfg) {
rewrite_cfg.scoped_allocator_optimization() == RewriterConfig::ON ||
rewrite_cfg.pin_to_host_optimization() == RewriterConfig::ON ||
AutoMixedPrecisionEnabled(rewrite_cfg.auto_mixed_precision()) ||
AutoMixedPrecisionEnabled(rewrite_cfg.auto_mixed_precision_mkl()) ||
!rewrite_cfg.optimizers().empty() ||
!rewrite_cfg.custom_optimizers().empty();
}
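Besides the new toggle, the MK_OPT registration above also makes the pass requestable by its registered name through the RewriterConfig optimizers list; a hedged sketch (the repeated optimizers field exists in RewriterConfig, and this is an alternative to the auto_mixed_precision_mkl toggle defined in the proto change that follows):
from tensorflow.core.protobuf import rewriter_config_pb2

# Request the registered optimizer explicitly by name instead of (or in
# addition to) flipping the auto_mixed_precision_mkl toggle.
rewrite_config = rewriter_config_pb2.RewriterConfig(
    optimizers=['auto_mixed_precision_mkl'])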

View File

@ -85,11 +85,15 @@ message RewriterConfig {
// Enable the swap of kernel implementations based on the device placement
// (default is ON).
Toggle implementation_selector = 22;
// Optimize data types (default is OFF).
// e.g., This will try to use float16 on GPU which is faster.
// Optimize data types for CUDA (default is OFF).
// This will try to use float16 on GPUs, which is faster.
// Note that this can change the numerical stability of the graph and may
// require the use of loss scaling to maintain model convergence.
Toggle auto_mixed_precision = 23;
// Optimize data types for MKL (default is OFF).
// This will try to use bfloat16 on CPUs, which is faster.
// Note that this can change the numerical stability of the graph.
Toggle auto_mixed_precision_mkl = 25;
// Disable the entire meta optimizer (off by default).
bool disable_meta_optimizer = 19;
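A usage sketch of the new toggle, mirroring the _get_config helper in the Python test later in this change:
from tensorflow.core.protobuf import config_pb2
from tensorflow.core.protobuf import rewriter_config_pb2

rewrite_config = rewriter_config_pb2.RewriterConfig(
    auto_mixed_precision_mkl=rewriter_config_pb2.RewriterConfig.ON)
graph_options = config_pb2.GraphOptions(rewrite_options=rewrite_config)
config = config_pb2.ConfigProto(graph_options=graph_options)
# Pass `config` to a tf.compat.v1.Session so the meta optimizer runs the
# bfloat16 rewrite over that session's graphs.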

View File

@ -19,8 +19,8 @@ from __future__ import division
from __future__ import print_function
import os
import unittest
from absl.testing import parameterized
import numpy as np
from tensorflow.core.framework import types_pb2
@ -209,7 +209,7 @@ def _make_node_with_color(color, input_tensor, name=None):
if color == 'c': # Clear node
return nn.relu(input_tensor, name=name)
if color == 'b': # Black node
return math_ops.sqrt(math_ops.pow(input_tensor, 2.), name=name)
return math_ops.pow(math_ops.pow(input_tensor, 2.), 0.5, name=name)
raise ValueError('Invalid node color: ' + str(color))
@ -231,18 +231,21 @@ def _build_simple_loop_graph(inp_colors, body_colors, out_colors):
return a
def _get_config(auto_mixed_precision=True):
def _get_config(auto_mixed_precision_mode):
"""Returns a ConfigProto with auto mixed precision enabled if appropriate."""
if auto_mixed_precision:
rewrite_config = rewriter_config_pb2.RewriterConfig(
auto_mixed_precision=rewriter_config_pb2.RewriterConfig.ON,
# do not remove duplicated nodes
arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF)
rewrite_config = rewriter_config_pb2.RewriterConfig(
# do not remove duplicated nodes
arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
# do not turn Conv2D and other nodes into _FusedConv2D
remapping=rewriter_config_pb2.RewriterConfig.OFF,
)
if auto_mixed_precision_mode == 'cuda':
rewrite_config.auto_mixed_precision = rewriter_config_pb2.RewriterConfig.ON
elif auto_mixed_precision_mode == 'mkl':
rewrite_config.auto_mixed_precision_mkl = (
rewriter_config_pb2.RewriterConfig.ON)
else:
rewrite_config = rewriter_config_pb2.RewriterConfig(
auto_mixed_precision=rewriter_config_pb2.RewriterConfig.OFF,
# do not remove duplicated nodes
arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF)
assert auto_mixed_precision_mode is None
rewrite_config.min_graph_nodes = -1
graph_options = config_pb2.GraphOptions(
rewrite_options=rewrite_config, build_cost_model=1)
@ -255,19 +258,33 @@ def _is_cast_to_fp16(node_name):
return node_name.endswith('-CastToFp16-AutoMixedPrecision')
def _is_cast_to_bf16(node_name):
return node_name.endswith('-CastToBf16-AutoMixedPrecision')
def _is_cast_to_fp32(node_name):
return node_name.endswith('-CastToFp32-AutoMixedPrecision')
def _count_casts(nodes):
def _count_casts(mode, nodes):
"""Counts the number of casts to f16 and fp32."""
num_to_fp16 = 0
num_to_bf16 = 0
num_to_fp32 = 0
for node in nodes:
if _is_cast_to_fp16(node.name):
num_to_fp16 += 1
if _is_cast_to_bf16(node.name):
num_to_bf16 += 1
elif _is_cast_to_fp32(node.name):
num_to_fp32 += 1
return num_to_fp16, num_to_fp32
if mode == 'cuda':
assert num_to_bf16 == 0
return num_to_fp16, num_to_fp32
else:
assert mode == 'mkl'
assert num_to_fp16 == 0
return num_to_bf16, num_to_fp32
def _build_node_map(nodes):
@ -303,7 +320,7 @@ def _example_noninlined_funcdef(features):
return features * math_ops.sigmoid(features)
class AutoMixedPrecisionTest(test.TestCase):
class AutoMixedPrecisionTest(test.TestCase, parameterized.TestCase):
"""Tests the Grappler auto mixed precision optimizer."""
IGNORE_PERF_VAR = 'TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_IGNORE_PERFORMANCE'
@ -311,8 +328,8 @@ class AutoMixedPrecisionTest(test.TestCase):
def setUp(self):
super(AutoMixedPrecisionTest, self).setUp()
# Enable the tests to be run on pre-Volta GPUs by telling the grappler pass
# to ignore performance and always transform the graph.
# Enable the CUDA tests to be run on pre-Volta GPUs by telling the grappler
# pass to ignore performance and always transform the graph.
self._original_ignore_perf_value = os.getenv(self.IGNORE_PERF_VAR)
os.environ[self.IGNORE_PERF_VAR] = '1'
@ -323,24 +340,33 @@ class AutoMixedPrecisionTest(test.TestCase):
del os.environ[self.IGNORE_PERF_VAR]
super(AutoMixedPrecisionTest, self).tearDown()
def _assert_output_fp16(self, node_map, node_name, output_port=0):
self.assertEqual(node_map[node_name].output_info[output_port].dtype,
types_pb2.DT_HALF)
def _lower_precision_dtype(self, mode):
return dtypes.float16 if mode == 'cuda' else dtypes.bfloat16
def _run(self, fetches):
def _assert_output_f16(self, mode, node_map, node_name, output_port=0):
self.assertEqual(node_map[node_name].output_info[output_port].dtype,
self._lower_precision_dtype(mode).as_datatype_enum)
def _run(self, mode, fetches):
"""Runs the graph and returns the evaluation of the fetches."""
with session.Session(config=_get_config(False)) as sess:
with session.Session(config=_get_config(None)) as sess:
sess.run(variables.global_variables_initializer())
output_val_ref = self.evaluate(fetches)
with session.Session(config=_get_config()) as sess:
with session.Session(config=_get_config(mode)) as sess:
sess.run(variables.global_variables_initializer())
metadata = config_pb2.RunMetadata()
output_val = sess.run(fetches, run_metadata=metadata)
return output_val_ref, output_val, metadata.cost_graph
def _run_simple_loop_test(self, inp, body, out):
def _maybe_skip(self, mode):
if mode == 'cuda' and not test.is_gpu_available(cuda_only=True):
self.skipTest('No GPU is available')
if mode == 'mkl' and not test_util.IsMklEnabled():
self.skipTest('MKL is not enabled')
def _run_simple_loop_test(self, mode, inp, body, out):
"""Runs a test of a simple loop.
The loop has different node colors in different sections of the graph. The
@ -352,6 +378,7 @@ class AutoMixedPrecisionTest(test.TestCase):
inp -> loop [ body ] -> out.
Args:
mode: Either 'cuda' or 'mkl'.
inp: A string of letters indicating the colors and expected dtypes of the
input nodes.
body: A string of letters indicating the colors and expected dtypes of the
@ -359,398 +386,446 @@ class AutoMixedPrecisionTest(test.TestCase):
out: A string of letters indicating the colors and expected dtypes of the
output nodes.
"""
if test.is_gpu_available(cuda_only=True):
random_seed.set_random_seed(0)
expected_types = []
for section in [inp, body, out]:
section_expected_types = []
for color in section:
if color.isupper():
expected_type = types_pb2.DT_HALF
else:
expected_type = types_pb2.DT_FLOAT
section_expected_types.append(expected_type)
expected_types.append(section_expected_types)
self._maybe_skip(mode)
random_seed.set_random_seed(0)
expected_types = []
for section in [inp, body, out]:
section_expected_types = []
for color in section:
if color.isupper():
expected_type = self._lower_precision_dtype(mode).as_datatype_enum
else:
expected_type = types_pb2.DT_FLOAT
section_expected_types.append(expected_type)
expected_types.append(section_expected_types)
a = _build_simple_loop_graph(inp, body, out)
output_val_ref, output_val, cost_graph = self._run(a)
node_map = _build_node_map(cost_graph.node)
a = _build_simple_loop_graph(inp, body, out)
output_val_ref, output_val, cost_graph = self._run(mode, a)
node_map = _build_node_map(cost_graph.node)
section_names = ['input', 'while/body', 'output']
all_types_correct = True
for section_name, expected_types in zip(section_names, expected_types):
for i, expected_type in enumerate(expected_types):
node_name = section_name + '_%i' % i
output_port = 0
optimized_type = node_map[node_name].output_info[output_port].dtype
if optimized_type != expected_type:
print('Expected node %s to have type %s but got type %s' %
(node_name, expected_type, optimized_type))
all_types_correct = False
self.assertTrue(all_types_correct)
section_names = ['input', 'while/body', 'output']
all_types_correct = True
for section_name, expected_types in zip(section_names, expected_types):
for i, expected_type in enumerate(expected_types):
node_name = section_name + '_%i' % i
output_port = 0
optimized_type = node_map[node_name].output_info[output_port].dtype
if optimized_type != expected_type:
print('Expected node %s to have type %s but got type %s' %
(node_name, expected_type, optimized_type))
all_types_correct = False
self.assertTrue(all_types_correct)
if mode == 'mkl':
self.assertAllClose(output_val_ref, output_val, atol=2e-2, rtol=2e-2)
else:
self.assertAllClose(output_val_ref, output_val, atol=2e-3, rtol=1e-3)
@parameterized.parameters(['cuda', 'mkl'])
@test_util.run_deprecated_v1
@test_util.disable_xla('This test does not pass with XLA')
def test_conv_bn(self):
def test_conv_bn(self, mode):
"""Test graph with convolution followed by batch norm."""
if test.is_gpu_available(cuda_only=True):
random_seed.set_random_seed(0)
x = _input([2, 8, 8, 1])
x = _conv_bn(x)
output = _conv_bn(x)
self._maybe_skip(mode)
random_seed.set_random_seed(0)
x = _input([2, 8, 8, 1])
x = _conv_bn(x)
output = _conv_bn(x)
output_val_ref, output_val, cost_graph = self._run(output)
node_map = _build_node_map(cost_graph.node)
num_to_fp16, num_to_fp32 = _count_casts(cost_graph.node)
output_val_ref, output_val, cost_graph = self._run(mode, output)
node_map = _build_node_map(cost_graph.node)
num_to_f16, num_to_fp32 = _count_casts(mode, cost_graph.node)
self._assert_output_fp16(node_map, 'Conv2D')
self._assert_output_fp16(node_map, 'FusedBatchNormV3')
self._assert_output_fp16(node_map, 'Conv2D_1')
self.assertEqual(num_to_fp16,
3) # Before Conv2D:0, Conv2D:1, Conv2D_1:1
self.assertEqual(num_to_fp32, 1) # After FusedBatchNormV3:0
self._assert_output_f16(mode, node_map, 'Conv2D')
self._assert_output_f16(mode, node_map, 'FusedBatchNormV3')
self._assert_output_f16(mode, node_map, 'Conv2D_1')
self.assertEqual(num_to_f16, 3) # Before Conv2D:0, Conv2D:1, Conv2D_1:1
self.assertEqual(num_to_fp32, 1) # After FusedBatchNormV3:0
if mode == 'mkl':
tol = 1e-2
elif test.is_built_with_rocm():
# Bump up the tolerance for the ROCm platform
# The default tolerance (1e-3) results in a tiny fraction (<1%) of
# miscompares on ROCm platform, and hence the tolerance bump
tol = 2e-3 if test.is_built_with_rocm else 1e-3
self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)
tol = 2e-3
else:
tol = 1e-3
self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)
# TODO: enable these tests when cuDNN is upgraded to >= 7.6.2. Same with the
# test_conv3d() below.
@unittest.skip('Test case should be skipped when cuDNN < 7.6.2')
@parameterized.parameters(['cuda', 'mkl'])
@test_util.run_deprecated_v1
@test_util.disable_xla('This test does not pass with XLA')
def test_conv3d_bn(self):
def test_conv3d_bn(self, mode):
"""Test graph with convolution followed by batch norm."""
if test.is_gpu_available(cuda_only=True):
random_seed.set_random_seed(0)
x = _input([2, 8, 8, 8, 1])
x = _conv3d_bn(x)
output = _conv3d_bn(x)
self._maybe_skip(mode)
if mode == 'cuda':
# TODO(reedwm): enable these tests when cuDNN is upgraded to >= 7.6.2.
self.skipTest('Test case should be skipped when cuDNN < 7.6.2')
random_seed.set_random_seed(0)
x = _input([2, 8, 8, 8, 1])
x = _conv3d_bn(x)
output = _conv3d_bn(x)
output_val_ref, output_val, cost_graph = self._run(output)
node_map = _build_node_map(cost_graph.node)
num_to_fp16, num_to_fp32 = _count_casts(cost_graph.node)
output_val_ref, output_val, cost_graph = self._run(mode, output)
node_map = _build_node_map(cost_graph.node)
num_to_fp16, num_to_fp32 = _count_casts(mode, cost_graph.node)
self._assert_output_fp16(node_map, 'Conv3D')
self._assert_output_fp16(node_map, 'FusedBatchNormV3')
self._assert_output_fp16(node_map, 'Conv3D_1')
self.assertEqual(num_to_fp16, 3) # Before Conv3D:0, Conv3D:1, Conv3D_1:1
self.assertEqual(num_to_fp32, 1) # After FusedBatchNormV3:0
self.assertAllClose(output_val_ref, output_val, atol=1e-2, rtol=1e-2)
self._assert_output_f16(mode, node_map, 'Conv3D')
self._assert_output_f16(mode, node_map, 'FusedBatchNormV3')
self._assert_output_f16(mode, node_map, 'Conv3D_1')
self.assertEqual(num_to_fp16, 3) # Before Conv3D:0, Conv3D:1, Conv3D_1:1
self.assertEqual(num_to_fp32, 1) # After FusedBatchNormV3:0
self.assertAllClose(output_val_ref, output_val, atol=1e-2, rtol=1e-2)
@unittest.skip('Test case should be skipped when cuDNN < 7.6.2')
@parameterized.parameters(['cuda', 'mkl'])
@test_util.run_deprecated_v1
@test_util.disable_xla('This test does not pass with XLA')
def test_conv3d(self):
def test_conv3d(self, mode):
"""Test grad ops with convolution3d graph."""
if test.is_gpu_available(cuda_only=True):
random_seed.set_random_seed(0)
x = _input([2, 8, 8, 8, 1])
f = _weight([3, 3, 3, 1, 6])
y = _conv3d(x, f)
y = array_ops.identity(y)
optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
g = optimizer.compute_gradients(y, [x, f])
output = (y, g)
self._maybe_skip(mode)
if mode == 'cuda':
# TODO(reedwm): enable these tests when cuDNN is upgraded to >= 7.6.2.
self.skipTest('Test case should be skipped when cuDNN < 7.6.2')
random_seed.set_random_seed(0)
x = _input([2, 8, 8, 8, 1])
f = _weight([3, 3, 3, 1, 6])
y = _conv3d(x, f)
y = array_ops.identity(y)
optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
g = optimizer.compute_gradients(y, [x, f])
output = (y, g)
output_val_ref, output_val, cost_graph = self._run(output)
node_map = _build_node_map(cost_graph.node)
self._assert_output_fp16(node_map, 'Conv3D')
self._assert_output_fp16(node_map,
'gradients/Conv3D_grad/Conv3DBackpropInputV2')
self._assert_output_fp16(node_map,
'gradients/Conv3D_grad/Conv3DBackpropFilterV2')
output_val_ref, output_val, cost_graph = self._run(mode, output)
node_map = _build_node_map(cost_graph.node)
self._assert_output_f16(mode, node_map, 'Conv3D')
self._assert_output_f16(mode, node_map,
'gradients/Conv3D_grad/Conv3DBackpropInputV2')
self._assert_output_f16(mode, node_map,
'gradients/Conv3D_grad/Conv3DBackpropFilterV2')
output_val_ref, output_val, cost_graph = self._run(output)
self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
output_val_ref, output_val, cost_graph = self._run(mode, output)
tol = 5e-2 if mode == 'mkl' else 1e-3
self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)
# TODO(reedwm): Fix and enable this test with MKL. Currently this crashes with
# MKL
@parameterized.parameters(['cuda'])
@test_util.run_deprecated_v1
@test_util.disable_xla('This test does not pass with XLA')
def test_conv_bn_dropout(self):
def test_conv_bn_dropout(self, mode):
"""Test dropout precision of convolution batch norm graph."""
if test.is_gpu_available(cuda_only=True):
random_seed.set_random_seed(0)
x = _input([2, 8, 8, 1])
y = _conv_bn(x)
y = nn.dropout(y, rate=0.5)
y = math_ops.add(y, 1, name='addition')
y = _conv_bn(y)
y = array_ops.identity(y)
optimizer = gradient_descent.GradientDescentOptimizer(
learning_rate=0.01)
g = optimizer.compute_gradients(y, [x])
output = (y, g)
self._maybe_skip(mode)
random_seed.set_random_seed(0)
x = _input([2, 8, 8, 1])
y = _conv_bn(x)
y = nn.dropout(y, rate=0.5)
y = math_ops.add(y, 1, name='addition')
y = _conv_bn(y)
y = array_ops.identity(y)
optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
g = optimizer.compute_gradients(y, [x])
output = (y, g)
output_val_ref, output_val, cost_graph = self._run(output)
node_map = _build_node_map(cost_graph.node)
self._assert_output_fp16(node_map, 'Conv2D')
self._assert_output_fp16(node_map, 'FusedBatchNormV3')
# We do not assert dropout's dtype because we do not want to rely on the
# node names of dropout's internal implementation.
self._assert_output_fp16(node_map, 'addition')
self._assert_output_fp16(node_map, 'Conv2D_1')
output_val_ref, output_val, cost_graph = self._run(mode, output)
node_map = _build_node_map(cost_graph.node)
self._assert_output_f16(mode, node_map, 'Conv2D')
self._assert_output_f16(mode, node_map, 'FusedBatchNormV3')
# We do not assert dropout's dtype because we do not want to rely on the
# node names of dropout's internal implementation.
self._assert_output_f16(mode, node_map, 'addition')
self._assert_output_f16(mode, node_map, 'Conv2D_1')
output_val_ref, output_val, cost_graph = self._run(output)
# Bump up the tolerance for the ROCm platform
# The default tolerance (1e-3) results in a tiny fraction (<1%) of
# miscompares on ROCm platform, and hence the tolerance bump
tol = 2e-3 if test.is_built_with_rocm() else 1e-3
self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)
output_val_ref, output_val, cost_graph = self._run(mode, output)
# Bump up the tolerance for the ROCm platform
# The default tolerance (1e-3) results in a tiny fraction (<1%) of
# miscompares on ROCm platform, and hence the tolerance bump
tol = 2e-3 if test.is_built_with_rocm() else 1e-3
self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)
# TODO(reedwm): Fix and enable this test with MKL. Currently this crashes with
# MKL
@parameterized.parameters(['cuda'])
@test_util.run_deprecated_v1
@test_util.disable_xla('This test does not pass with XLA')
def test_conv_pool(self):
def test_conv_pool(self, mode):
"""Test graph with convolution followed by pooling."""
if test.is_gpu_available(cuda_only=True):
random_seed.set_random_seed(0)
x = _input([2, 8, 8, 1])
output = _conv_pool(x)
self._maybe_skip(mode)
random_seed.set_random_seed(0)
x = _input([2, 8, 8, 1])
output = _conv_pool(x)
output_val_ref, output_val, cost_graph = self._run(output)
node_map = _build_node_map(cost_graph.node)
num_to_fp16, num_to_fp32 = _count_casts(cost_graph.node)
output_val_ref, output_val, cost_graph = self._run(mode, output)
node_map = _build_node_map(cost_graph.node)
num_to_f16, num_to_fp32 = _count_casts(mode, cost_graph.node)
self._assert_output_fp16(node_map, 'Conv2D')
self._assert_output_fp16(node_map, 'Relu')
self._assert_output_fp16(node_map, 'MaxPool')
self._assert_output_fp16(node_map, 'Conv2D_1')
self.assertEqual(num_to_fp16, 4)
self.assertEqual(num_to_fp32, 1)
self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
self._assert_output_f16(mode, node_map, 'Conv2D')
self._assert_output_f16(mode, node_map, 'Relu')
self._assert_output_f16(mode, node_map, 'MaxPool')
self._assert_output_f16(mode, node_map, 'Conv2D_1')
self.assertEqual(num_to_f16, 4)
self.assertEqual(num_to_fp32, 1)
tol = 5e-3 if mode == 'mkl' else 1e-3
self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)
@parameterized.parameters(['cuda', 'mkl'])
@test_util.run_v1_only('b/138749235')
@test_util.disable_xla('This test does not pass with XLA')
def test_simple_loop(self):
def test_simple_loop(self, mode):
"""Test graph with while loop."""
if test.is_gpu_available(cuda_only=True):
random_seed.set_random_seed(0)
x = _input([8, 8])
y = _simple_loop(x, _matmul_act)[1]
optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
g = optimizer.compute_gradients(y, [x])
output = (y, g)
self._maybe_skip(mode)
random_seed.set_random_seed(0)
x = _input([8, 8])
y = _simple_loop(x, _matmul_act)[1]
optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
g = optimizer.compute_gradients(y, [x])
output = (y, g)
output_val_ref, output_val, cost_graph = self._run(output)
node_map = _build_node_map(cost_graph.node)
output_val_ref, output_val, cost_graph = self._run(mode, output)
node_map = _build_node_map(cost_graph.node)
self._assert_output_fp16(node_map, 'while/MatMul')
self._assert_output_fp16(node_map, 'while/Relu')
self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
self._assert_output_f16(mode, node_map, 'while/MatMul')
self._assert_output_f16(mode, node_map, 'while/Relu')
tol = 1e-2 if mode == 'mkl' else 1e-3
self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)
@parameterized.parameters(['cuda', 'mkl'])
@test_util.run_v1_only('b/138749235')
@test_util.disable_xla('This test does not pass with XLA')
def test_loop_with_vars_intertwined(self):
def test_loop_with_vars_intertwined(self, mode):
"""Test graph with intertwined while loops."""
if test.is_gpu_available(cuda_only=True):
random_seed.set_random_seed(0)
x = _input([8, 8])
_, _, k, l = _loop_vars_intertwined(
array_ops.ones(array_ops.shape(x)), x, _matmul_act, _matmul_act)
optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
g = optimizer.compute_gradients(k, [x])
output = (k, l, g)
self._maybe_skip(mode)
random_seed.set_random_seed(0)
x = _input([8, 8])
_, _, k, l = _loop_vars_intertwined(
array_ops.ones(array_ops.shape(x)), x, _matmul_act, _matmul_act)
optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
g = optimizer.compute_gradients(k, [x])
output = (k, l, g)
output_val_ref, output_val, cost_graph = self._run(output)
node_map = _build_node_map(cost_graph.node)
output_val_ref, output_val, cost_graph = self._run(mode, output)
node_map = _build_node_map(cost_graph.node)
self._assert_output_fp16(node_map, 'while/MatMul')
self._assert_output_fp16(node_map, 'while/Relu')
self._assert_output_fp16(node_map, 'while/MatMul_1')
self._assert_output_fp16(node_map, 'while/Relu_1')
self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
self._assert_output_f16(mode, node_map, 'while/MatMul')
self._assert_output_f16(mode, node_map, 'while/Relu')
self._assert_output_f16(mode, node_map, 'while/MatMul_1')
self._assert_output_f16(mode, node_map, 'while/Relu_1')
tol = 5e-3 if mode == 'mkl' else 1e-3
self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)
@parameterized.parameters(['cuda'])
@test_util.run_deprecated_v1
@test_util.disable_xla('This test does not pass with XLA')
def test_multi_paths(self):
def test_multi_paths(self, mode):
"""Test graph with multiple paths."""
if test.is_gpu_available(cuda_only=True):
random_seed.set_random_seed(0)
x = _input([2, 8, 8, 3])
x1, x2, x3 = array_ops.split(x, num_or_size_splits=3, axis=3)
y1 = _conv_pool(x1)
y2 = _conv_pool(x2)
y3 = _conv_pool(x3)
y = array_ops.concat([y1, y2, y3], axis=3)
y = array_ops.identity(y)
optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
g = optimizer.compute_gradients(y, [x])
output = (y, g)
self._maybe_skip(mode)
random_seed.set_random_seed(0)
x = _input([2, 8, 8, 3])
x1, x2, x3 = array_ops.split(x, num_or_size_splits=3, axis=3)
y1 = _conv_pool(x1)
y2 = _conv_pool(x2)
y3 = _conv_pool(x3)
y = array_ops.concat([y1, y2, y3], axis=3)
y = array_ops.identity(y)
optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
g = optimizer.compute_gradients(y, [x])
output = (y, g)
output_val_ref, output_val, cost_graph = self._run(output)
node_map = _build_node_map(cost_graph.node)
output_val_ref, output_val, cost_graph = self._run(mode, output)
node_map = _build_node_map(cost_graph.node)
self._assert_output_fp16(node_map, 'split')
for suffix in [''] + ['_%i' % i for i in range(1, 6)]:
self._assert_output_fp16(node_map, 'Conv2D' + suffix)
self._assert_output_fp16(node_map, 'Relu' + suffix)
self._assert_output_fp16(node_map, 'MaxPool' + suffix)
self._assert_output_fp16(node_map, 'concat')
self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
self._assert_output_f16(mode, node_map, 'split')
for suffix in [''] + ['_%i' % i for i in range(1, 6)]:
self._assert_output_f16(mode, node_map, 'Conv2D' + suffix)
self._assert_output_f16(mode, node_map, 'Relu' + suffix)
self._assert_output_f16(mode, node_map, 'MaxPool' + suffix)
self._assert_output_f16(mode, node_map, 'concat')
self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
@parameterized.parameters(['cuda', 'mkl'])
@test_util.run_deprecated_v1
@test_util.disable_xla('This test does not pass with XLA')
def test_multi_paths_2(self):
def test_multi_paths_2(self, mode):
"""Test graph with multiple paths."""
if test.is_gpu_available(cuda_only=True):
random_seed.set_random_seed(0)
x = _input([8, 8])
y1 = _matmul_act(x)
y2 = _matmul_act(x)
y = y1 + y2 + x
optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
g = optimizer.compute_gradients(y, [x])
output = (g, y)
self._maybe_skip(mode)
random_seed.set_random_seed(0)
x = _input([8, 8])
y1 = _matmul_act(x)
y2 = _matmul_act(x)
y = y1 + y2 + x
optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
g = optimizer.compute_gradients(y, [x])
output = (g, y)
output_val_ref, output_val, cost_graph = self._run(output)
node_map = _build_node_map(cost_graph.node)
output_val_ref, output_val, cost_graph = self._run(mode, output)
node_map = _build_node_map(cost_graph.node)
self._assert_output_fp16(node_map, 'MatMul')
self._assert_output_fp16(node_map, 'Relu')
self._assert_output_fp16(node_map, 'MatMul_1')
self._assert_output_fp16(node_map, 'Relu_1')
self._assert_output_f16(mode, node_map, 'MatMul')
self._assert_output_f16(mode, node_map, 'Relu')
self._assert_output_f16(mode, node_map, 'MatMul_1')
self._assert_output_f16(mode, node_map, 'Relu_1')
if mode == 'mkl':
tol = 2e-2
elif test.is_built_with_rocm():
# Bump up the tolerance for the ROCm platform
# The default tolerance (1e-3) results in a tiny fraction (<1%) of
# miscompares on ROCm platform, and hence the tolerance bump
tol = 2e-3 if test.is_built_with_rocm() else 1e-3
self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)
tol = 2e-3
else:
tol = 1e-3
self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)
@parameterized.parameters(['cuda']) # MKL doesn't support bf16 Sigmoid
@test_util.run_v1_only('b/138749235')
@test_util.disable_xla('This test does not pass with XLA')
def test_recurrent_lstm(self):
def test_recurrent_lstm(self, mode):
"""Test graph with recurrent lstm."""
if test.is_gpu_available(cuda_only=True):
random_seed.set_random_seed(0)
init_c = _input([8, 4])
init_h = _input([8, 4])
_, _, h, _ = _recurrent_lstm(init_c, init_h)
optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
g = optimizer.compute_gradients(h, [init_c, init_h])
output = (h, g)
self._maybe_skip(mode)
random_seed.set_random_seed(0)
init_c = _input([8, 4])
init_h = _input([8, 4])
_, _, h, _ = _recurrent_lstm(init_c, init_h)
optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
g = optimizer.compute_gradients(h, [init_c, init_h])
output = (h, g)
output_val_ref, output_val, cost_graph = self._run(output)
node_map = _build_node_map(cost_graph.node)
output_val_ref, output_val, cost_graph = self._run(mode, output)
node_map = _build_node_map(cost_graph.node)
self._assert_output_fp16(node_map, 'while/concat')
self._assert_output_fp16(node_map, 'while/MatMul')
self._assert_output_fp16(node_map, 'while/split')
self._assert_output_fp16(node_map, 'while/Sigmoid')
self._assert_output_fp16(node_map, 'while/Sigmoid_1')
self._assert_output_fp16(node_map, 'while/Sigmoid_2')
self._assert_output_fp16(node_map, 'while/Tanh')
self._assert_output_fp16(node_map, 'while/Tanh_1')
self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
self._assert_output_f16(mode, node_map, 'while/concat')
self._assert_output_f16(mode, node_map, 'while/MatMul')
self._assert_output_f16(mode, node_map, 'while/split')
self._assert_output_f16(mode, node_map, 'while/Sigmoid')
self._assert_output_f16(mode, node_map, 'while/Sigmoid_1')
self._assert_output_f16(mode, node_map, 'while/Sigmoid_2')
self._assert_output_f16(mode, node_map, 'while/Tanh')
self._assert_output_f16(mode, node_map, 'while/Tanh_1')
self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
@parameterized.parameters(['cuda', 'mkl'])
@test_util.run_v1_only('v1 loop test')
@test_util.disable_xla('This test does not pass with XLA')
def test_propagation_through_simple_loop_1(self):
self._run_simple_loop_test('W', 'C', 'C')
def test_propagation_through_simple_loop_1(self, mode):
self._run_simple_loop_test(mode, 'W', 'C', 'C')
@parameterized.parameters(['cuda', 'mkl'])
@test_util.run_v1_only('v1 loop test')
@test_util.disable_xla('This test does not pass with XLA')
def test_propagation_through_simple_loop_2(self):
self._run_simple_loop_test('C', 'C', 'W')
def test_propagation_through_simple_loop_2(self, mode):
self._run_simple_loop_test(mode, 'C', 'C', 'W')
@parameterized.parameters(['cuda', 'mkl'])
@test_util.run_v1_only('v1 loop test')
@test_util.disable_xla('This test does not pass with XLA')
def test_propagation_through_simple_loop_3(self):
self._run_simple_loop_test('W', 'G', 'W')
def test_propagation_through_simple_loop_3(self, mode):
self._run_simple_loop_test(mode, 'W', 'G', 'W')
@parameterized.parameters(['cuda', 'mkl'])
@test_util.run_v1_only('v1 loop test')
@test_util.disable_xla('This test does not pass with XLA')
def test_propagation_through_simple_loop_4(self):
self._run_simple_loop_test('W', 'gbg', 'W')
def test_propagation_through_simple_loop_4(self, mode):
self._run_simple_loop_test(mode, 'W', 'gbg', 'W')
@parameterized.parameters(['cuda', 'mkl'])
@test_util.run_v1_only('b/138749235')
@test_util.disable_xla('This test does not pass with XLA')
def test_propagation_through_simple_loop_5(self):
self._run_simple_loop_test('b', 'gWC', 'c')
def test_propagation_through_simple_loop_5(self, mode):
self._run_simple_loop_test(mode, 'b', 'gWC', 'c')
@parameterized.parameters(['cuda', 'mkl'])
@test_util.run_v1_only('b/138749235')
@test_util.disable_xla('This test does not pass with XLA')
def test_propagation_through_simple_loop_6(self):
self._run_simple_loop_test('b', 'CWCG', 'C')
def test_propagation_through_simple_loop_6(self, mode):
self._run_simple_loop_test(mode, 'b', 'CWCG', 'C')
@parameterized.parameters(['cuda', 'mkl'])
@test_util.run_v1_only('b/138749235')
@test_util.disable_xla('This test does not pass with XLA')
def test_propagation_through_simple_loop_7(self):
self._run_simple_loop_test('C', 'GWCG', 'C')
def test_propagation_through_simple_loop_7(self, mode):
self._run_simple_loop_test(mode, 'C', 'GWCG', 'C')
@parameterized.parameters(['cuda', 'mkl'])
@test_util.run_v1_only('b/138749235')
@test_util.disable_xla('This test does not pass with XLA')
def test_propagation_through_simple_loop_8(self):
self._run_simple_loop_test('C', 'CgbgWC', 'g')
def test_propagation_through_simple_loop_8(self, mode):
self._run_simple_loop_test(mode, 'C', 'CgbgWC', 'g')
@parameterized.parameters(['cuda', 'mkl'])
@test_util.run_deprecated_v1
@test_util.disable_xla('This test does not pass with XLA')
def test_noninlined_funcdef(self):
def test_noninlined_funcdef(self, mode):
"""Test graph with non-inlined function subgraph.
This requires the grappler pass to handle an OpDef that only appears in the
graph's function registry instead of the global op registry.
Args:
mode: Either 'cuda' or 'mkl'.
"""
if test.is_gpu_available(cuda_only=True):
random_seed.set_random_seed(0)
x = _input([8, 8])
y = _matmul_act(x)
y = _example_noninlined_funcdef(y)
optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
g = optimizer.compute_gradients(y, [x])
output = (g, y)
self._maybe_skip(mode)
random_seed.set_random_seed(0)
x = _input([8, 8])
y = _matmul_act(x)
y = _example_noninlined_funcdef(y)
optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
g = optimizer.compute_gradients(y, [x])
output = (g, y)
output_val_ref, output_val, cost_graph = self._run(output)
node_map = _build_node_map(cost_graph.node)
output_val_ref, output_val, cost_graph = self._run(mode, output)
node_map = _build_node_map(cost_graph.node)
self._assert_output_fp16(node_map, 'MatMul')
self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
self._assert_output_f16(mode, node_map, 'MatMul')
tol = 1e-2 if mode == 'mkl' else 1e-3
self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)
@parameterized.parameters(['cuda', 'mkl'])
@test_util.run_deprecated_v1
@test_util.disable_xla('This test does not pass with XLA')
def test_ingraph_train_loop(self):
def test_ingraph_train_loop(self, mode):
"""Tests a graph containing a while loop around a training update.
This requires the grappler pass to take special care with its handling of
Enter ops that appear in front of reads from non-resource variables. See
the use of NodeImplicitlyReadsVariable in auto_mixed_precision.cc.
Args:
mode: Either 'cuda' or 'mkl'.
"""
self._maybe_skip(mode)
if tf2.enabled():
# This test exercises non-resource variables, which are only used in TF1.
self.skipTest('TensorFlow 1 required')
if test.is_gpu_available(cuda_only=True):
random_seed.set_random_seed(1234)
np.random.seed(1234)
num_iter, bs, nchan, nclass = 100, 64, 32, 100
random_seed.set_random_seed(1234)
np.random.seed(1234)
num_iter, bs, nchan, nclass = 100, 64, 32, 100
data = np.random.normal(size=(bs * num_iter, nchan)).astype(np.float32)
labels = np.random.randint(nclass, size=(bs * num_iter,))
ds = dataset_ops.Dataset.from_tensor_slices((data, labels))
ds = ds.batch(bs).prefetch(3)
it = ds.make_one_shot_iterator()
data = np.random.normal(size=(bs * num_iter, nchan)).astype(np.float32)
labels = np.random.randint(nclass, size=(bs * num_iter,))
ds = dataset_ops.Dataset.from_tensor_slices((data, labels))
ds = ds.batch(bs).prefetch(3)
it = ds.make_one_shot_iterator()
def body(_, i):
i += 1
x, yt = it.get_next()
dense = layers.Dense(nclass)
y = dense(x)
loss = losses.sparse_softmax_cross_entropy(yt, y)
opt = adam.AdamOptimizer()
train_op = opt.minimize(loss, var_list=dense.trainable_weights)
with ops.control_dependencies([train_op]):
loss = array_ops.identity(loss)
return loss, i
def body(_, i):
i += 1
x, yt = it.get_next()
dense = layers.Dense(nclass)
y = dense(x)
loss = losses.sparse_softmax_cross_entropy(yt, y)
opt = adam.AdamOptimizer()
train_op = opt.minimize(loss, var_list=dense.trainable_weights)
with ops.control_dependencies([train_op]):
loss = array_ops.identity(loss)
return loss, i
begin, end = constant_op.constant(0), constant_op.constant(num_iter)
loss, _ = control_flow_ops.while_loop(
lambda loss, i: math_ops.less(i, end), body, [0.0, begin])
begin, end = constant_op.constant(0), constant_op.constant(num_iter)
loss, _ = control_flow_ops.while_loop(lambda loss, i: math_ops.less(i, end),
body, [0.0, begin])
output_val_ref, output_val, cost_graph = self._run(loss)
node_map = _build_node_map(cost_graph.node)
output_val_ref, output_val, cost_graph = self._run(mode, loss)
node_map = _build_node_map(cost_graph.node)
self._assert_output_fp16(node_map, 'while/dense/MatMul')
self._assert_output_fp16(
node_map, 'while/gradients/while/dense/MatMul_grad/MatMul_1')
self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
self._assert_output_f16(mode, node_map, 'while/dense/MatMul')
self._assert_output_f16(mode, node_map,
'while/gradients/while/dense/MatMul_grad/MatMul_1')
self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)
# TODO(benbarsdell): Add tests for list ops (TensorList*) that pass through
# graph source/sink nodes, similar to the TensorListThroughFunction C++ test.