From f0359d50ea4e65af72d458ff9c8ec938e6d8883c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 4 Mar 2021 23:01:25 -0800
Subject: [PATCH] Internal change

PiperOrigin-RevId: 361080379
Change-Id: Idb7c57795f47f3fdab9f6565f2c98e687ba5e15f
---
 .bazelrc                                                |  9 ++++++++-
 .../core/common_runtime/mkl_threadpool_device_test.cc   |  4 ++--
 tensorflow/core/common_runtime/process_util.cc          |  8 ++++----
 tensorflow/core/common_runtime/threadpool_device.cc     |  4 ++--
 tensorflow/core/kernels/mkl/mkl_concat_op.cc            |  4 ++--
 .../core/kernels/mkl/mkl_conv_grad_filter_ops.cc        |  4 ++--
 tensorflow/core/kernels/mkl/mkl_conv_grad_input_ops.cc  |  4 ++--
 tensorflow/core/kernels/mkl/mkl_conv_ops.cc             |  4 ++--
 tensorflow/core/kernels/mkl/mkl_fused_batch_norm_op.cc  |  8 ++++----
 tensorflow/core/kernels/mkl/mkl_fused_ops_test.cc       |  4 ++--
 tensorflow/core/kernels/mkl/mkl_matmul_op.cc            |  4 ++--
 tensorflow/core/kernels/mkl/mkl_matmul_ops_common.h     |  8 ++++----
 tensorflow/core/kernels/mkl/mkl_pooling_ops_common.cc   |  8 ++++----
 tensorflow/core/kernels/mkl/mkl_qmatmul_op.cc           |  4 ++--
 tensorflow/core/kernels/mkl/mkl_quantize_op.cc          |  4 ++--
 tensorflow/core/kernels/mkl/mkl_quantized_conv_ops.h    |  4 ++--
 tensorflow/core/kernels/mkl/mkl_relu_op.cc              |  8 ++++----
 .../mkl/mkl_requantization_range_per_channel_op.cc      |  4 ++--
 tensorflow/core/kernels/mkl/mkl_slice_op.cc             |  4 ++--
 tensorflow/core/kernels/mkl/mkl_softmax_op.cc           |  4 ++--
 tensorflow/core/util/mkl_threadpool.h                   |  4 ++--
 tensorflow/core/util/mkl_util.h                         |  8 ++++----
 tensorflow/tensorflow.bzl                               |  4 ++--
 third_party/mkl_dnn/BUILD                               |  5 +++--
 third_party/mkl_dnn/build_defs.bzl                      | 10 +++++-----
 third_party/mkl_dnn/mkldnn_v1.BUILD                     |  7 ++++---
 26 files changed, 76 insertions(+), 67 deletions(-)

diff --git a/.bazelrc b/.bazelrc
index 858ebc094a4..7ca8b50fb7a 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -172,13 +172,20 @@ build:mkl -c opt
 build:mkl_threadpool --define=build_with_mkl=true --define=enable_mkl=true
 build:mkl_threadpool --define=tensorflow_mkldnn_contraction_kernel=0
 build:mkl_threadpool --define=build_with_mkl_opensource=true
+build:mkl_threadpool --define=build_with_mkldnn_threadpool=true
 build:mkl_threadpool -c opt

+# Config setting to build with oneDNN and without the binary blob
+build:mkl_opensource_only --define=build_with_mkl=true --define=enable_mkl=true
+build:mkl_opensource_only --define=tensorflow_mkldnn_contraction_kernel=0
+build:mkl_opensource_only --define=build_with_mkl_opensource=true
+build:mkl_opensource_only --define=build_with_openmp=true
+build:mkl_opensource_only -c opt
+
 # Config setting to build with oneDNN for Arm.
 build:mkl_aarch64 --define=build_with_mkl_aarch64=true --define=enable_mkl=true
 build:mkl_aarch64 --define=tensorflow_mkldnn_contraction_kernel=0
 build:mkl_aarch64 --define=build_with_mkl_opensource=true
-build:mkl_aarch64 --define=build_with_openmp=true
 build:mkl_aarch64 -c opt

 # This config refers to building with CUDA available. It does not necessarily
diff --git a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc
index 8301790c078..8d64f6e69db 100644
--- a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc
+++ b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc
@@ -24,7 +24,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-#if defined(_OPENMP) && defined(ENABLE_ONEDNN_OPENMP)
+#if defined(_OPENMP) && !defined(ENABLE_MKLDNN_THREADPOOL)
 TEST(MKLThreadPoolDeviceTest, TestOmpDefaults) {
   SessionOptions options;
   unsetenv("OMP_NUM_THREADS");
@@ -36,7 +36,7 @@ TEST(MKLThreadPoolDeviceTest, TestOmpDefaults) {
   EXPECT_EQ(omp_get_max_threads(), (port::NumSchedulableCPUs() + ht - 1) / ht);
 }
 
-#endif  // defined(_OPENMP) && defined(ENABLE_ONEDNN_OPENMP)
+#endif  // defined(_OPENMP) && !defined(ENABLE_MKLDNN_THREADPOOL)
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc
index 2b32d2c54ed..8f87873a5bd 100644
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@@ -101,7 +101,7 @@ int32 NumIntraOpThreadsFromEnvironment() {
   const char* val = std::getenv("TF_NUM_INTRAOP_THREADS");
   return (val && strings::safe_strto32(val, &num)) ? num : 0;
 }
-#if defined(ENABLE_ONEDNN_OPENMP) && defined(INTEL_MKL)
+#if !defined(ENABLE_MKLDNN_THREADPOOL) && defined(INTEL_MKL)
 int32 OMPThreadsFromEnvironment() {
   // 1) std::getenv is thread-safe (as long as no other function modifies the
   // host env) from C++11 onward. 2) Most of TF code (except tests and
@@ -121,14 +121,14 @@ int32 DefaultNumIntraOpThreads() {
   // Default to the maximum parallelism for the current process.
   return port::MaxParallelism();
 }
-#endif  // defined(ENABLE_ONEDNN_OPENMP) && defined(INTEL_MKL)
+#endif  // !defined(ENABLE_MKLDNN_THREADPOOL) && defined(INTEL_MKL)
 
 int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
   const int32 inter_op = options.config.inter_op_parallelism_threads();
   if (inter_op > 0) return inter_op;
   const int32 env_inter_op = GetEnvNumInterOpThreads();
   if (env_inter_op > 0) return env_inter_op;
-#if defined(ENABLE_ONEDNN_OPENMP) && defined(INTEL_MKL)
+#if !defined(ENABLE_MKLDNN_THREADPOOL) && defined(INTEL_MKL)
   if (!DisableMKL()) {
     // MKL library executes ops in parallel using OMP threads.
     // Setting inter_op conservatively to avoid thread oversubscription that
@@ -149,7 +149,7 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
            << ". Tune using inter_op_parallelism_threads for best performance.";
     return mkl_inter_op;
   }
-#endif  // defined(ENABLE_ONEDNN_OPENMP) && defined(INTEL_MKL)
+#endif  // !defined(ENABLE_MKLDNN_THREADPOOL) && defined(INTEL_MKL)
   return DefaultNumInterOpThreads();
 }
 
diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc
index 12926cf4035..02cd53221d4 100644
--- a/tensorflow/core/common_runtime/threadpool_device.cc
+++ b/tensorflow/core/common_runtime/threadpool_device.cc
@@ -50,7 +50,7 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options,
                 name, DEVICE_CPU, memory_limit, locality)),
       allocator_(allocator),
       scoped_allocator_mgr_(new ScopedAllocatorMgr(name)) {
-#if defined(ENABLE_ONEDNN_OPENMP) && defined(INTEL_MKL)
+#if !defined(ENABLE_MKLDNN_THREADPOOL) && defined(INTEL_MKL)
   // Early return when MKL is disabled
   if (DisableMKL()) return;
 #ifdef _OPENMP
@@ -65,7 +65,7 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options,
                            (mkl_intra_op + ht - 1) / ht);
   }
 #endif  // _OPENMP
-#endif  // defined(ENABLE_ONEDNN_OPENMP) && defined(INTEL_MKL)
+#endif  // !defined(ENABLE_MKLDNN_THREADPOOL) && defined(INTEL_MKL)
 }
 
 ThreadPoolDevice::~ThreadPoolDevice() {}
diff --git a/tensorflow/core/kernels/mkl/mkl_concat_op.cc b/tensorflow/core/kernels/mkl/mkl_concat_op.cc
index 5e50b363a53..82208c2f64c 100644
--- a/tensorflow/core/kernels/mkl/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl/mkl_concat_op.cc
@@ -280,7 +280,7 @@ class MklConcatFwdPrimitive : public MklPrimitive {
                std::shared_ptr<stream> fwd_stream) {
     DCHECK_EQ(in_data.size(), context_.data_mem.size());
     for (size_t i = 0; i < concat_fwd_dims.num_inputs; i++) {
-#ifndef ENABLE_ONEDNN_OPENMP
+#ifdef ENABLE_MKLDNN_THREADPOOL
       context_.data_mem_shdptr[i]->set_data_handle(
          static_cast<void*>(in_data[i].get_data_handle()), *fwd_stream);
     }
@@ -292,7 +292,7 @@ class MklConcatFwdPrimitive : public MklPrimitive {
     }
     context_.dst_mem->set_data_handle(
         static_cast<void*>(dst_data.get_data_handle()));
-#endif  // !ENABLE_ONEDNN_OPENMP
+#endif  // ENABLE_MKLDNN_THREADPOOL
 
     for (size_t i = 0; i < concat_fwd_dims.num_inputs; i++) {
       context_.data_mem[i] = *context_.data_mem_shdptr[i];
diff --git a/tensorflow/core/kernels/mkl/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl/mkl_conv_grad_filter_ops.cc
index 77de34be289..bba1d167c82 100644
--- a/tensorflow/core/kernels/mkl/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl/mkl_conv_grad_filter_ops.cc
@@ -104,7 +104,7 @@ class MklConvBwdFilterPrimitive : public MklPrimitive {
   void Execute(const T* src_data, const T* diff_filter_data,
                const T* diff_bias_data, const T* diff_dst_data,
                std::shared_ptr<stream> bwd_filter_stream) {
-#ifndef ENABLE_ONEDNN_OPENMP
+#ifdef ENABLE_MKLDNN_THREADPOOL
     // TODO: Create a common function and avoid the duplicate code
     context_.src_mem->set_data_handle(
         static_cast<void*>(const_cast<T*>(src_data)), *bwd_filter_stream);
@@ -129,7 +129,7 @@ class MklConvBwdFilterPrimitive : public MklPrimitive {
     }
     context_.diff_dst_mem->set_data_handle(
         static_cast<void*>(const_cast<T*>(diff_dst_data)));
-#endif  // !ENABLE_ONEDNN_OPENMP
+#endif  // ENABLE_MKLDNN_THREADPOOL
 
     execute_primitives(context_.bwd_filter_primitives, bwd_filter_stream,
                        context_.bwd_filter_primitives_args);
diff --git a/tensorflow/core/kernels/mkl/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl/mkl_conv_grad_input_ops.cc
index f9b6578f943..97b7ac1d960 100644
--- a/tensorflow/core/kernels/mkl/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl/mkl_conv_grad_input_ops.cc
@@ -106,7 +106,7 @@ class MklConvBwdInputPrimitive : public MklPrimitive {
   void Execute(const T* diff_src_data, const T* filter_data,
                const T* diff_dst_data,
                std::shared_ptr<stream> bwd_input_stream) {
-#ifndef ENABLE_ONEDNN_OPENMP
+#ifdef ENABLE_MKLDNN_THREADPOOL
     // TODO: Create a common function and avoid the duplicate code
     context_.diff_src_mem->set_data_handle(
         static_cast<void*>(const_cast<T*>(diff_src_data)), *bwd_input_stream);
@@ -121,7 +121,7 @@ class MklConvBwdInputPrimitive : public MklPrimitive {
         static_cast<void*>(const_cast<T*>(filter_data)));
     context_.diff_dst_mem->set_data_handle(
         static_cast<void*>(const_cast<T*>(diff_dst_data)));
-#endif  // !ENABLE_ONEDNN_OPENMP
+#endif  // ENABLE_MKLDNN_THREADPOOL
 
     execute_primitives(context_.bwd_input_primitives, bwd_input_stream,
                        context_.bwd_input_primitives_args);
diff --git a/tensorflow/core/kernels/mkl/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl/mkl_conv_ops.cc
index 9d3ad960d89..f6d31a84e61 100644
--- a/tensorflow/core/kernels/mkl/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl/mkl_conv_ops.cc
@@ -114,7 +114,7 @@ class MklConvFwdPrimitive : public MklPrimitive {
   void Execute(const Tinput* src_data, const Tfilter* filter_data,
                const Tbias* bias_data, const Toutput* dst_data,
                std::shared_ptr<stream> fwd_stream) {
-#ifndef ENABLE_ONEDNN_OPENMP
+#ifdef ENABLE_MKLDNN_THREADPOOL
     // TODO: Create a common function and avoid the duplicate code
     context_.src_mem->set_data_handle(
         static_cast<void*>(const_cast<Tinput*>(src_data)), *fwd_stream);
@@ -137,7 +137,7 @@ class MklConvFwdPrimitive : public MklPrimitive {
     }
     context_.dst_mem->set_data_handle(
         static_cast<void*>(const_cast<Toutput*>(dst_data)));
-#endif  // !ENABLE_ONEDNN_OPENMP
+#endif  // ENABLE_MKLDNN_THREADPOOL
 
     DCHECK_EQ(context_.fwd_primitives.size(),
               context_.fwd_primitives_args.size());
diff --git a/tensorflow/core/kernels/mkl/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl/mkl_fused_batch_norm_op.cc
index e2d7588228b..e498651697d 100644
--- a/tensorflow/core/kernels/mkl/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl/mkl_fused_batch_norm_op.cc
@@ -78,7 +78,7 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive {
   void Execute(const T* src_data, const U* weights_data, T* dst_data,
                U* mean_data, U* variance_data,
                std::shared_ptr<stream> fwd_stream, U* workspace_data) {
-#ifndef ENABLE_ONEDNN_OPENMP
+#ifdef ENABLE_MKLDNN_THREADPOOL
     // TODO: Create a common function and avoid the duplicate code
     context_.src_mem->set_data_handle(
         static_cast<void*>(const_cast<T*>(src_data)), *fwd_stream);
@@ -116,7 +116,7 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive {
     if (workspace_data != nullptr) {
       context_.ws_mem->set_data_handle(workspace_data);
     }
-#endif  // !ENABLE_ONEDNN_OPENMP
+#endif  // ENABLE_MKLDNN_THREADPOOL
 
     // Execute batch-normalization forward primitives.
     execute_primitives(context_.fwd_primitives, fwd_stream, context_.net_args);
@@ -422,7 +422,7 @@ class MklFusedBatchNormBwdPrimitive : public MklPrimitive {
                const T* diff_dst_data, const U* weights_data, T* diff_src_data,
                U* diff_weights_data, U* res_space_data,
                std::shared_ptr<stream> bwd_stream) {
-#ifndef ENABLE_ONEDNN_OPENMP
+#ifdef ENABLE_MKLDNN_THREADPOOL
     // TODO: Create a common function and avoid the duplicate code
     context_.src_mem->set_data_handle(
         static_cast<void*>(const_cast<T*>(src_data)), *bwd_stream);
@@ -460,7 +460,7 @@ class MklFusedBatchNormBwdPrimitive : public MklPrimitive {
     }
     context_.diff_src_mem->set_data_handle(static_cast<void*>(diff_src_data));
-#endif  // !ENABLE_ONEDNN_OPENMP
+#endif  // ENABLE_MKLDNN_THREADPOOL
 
     // Execute backward batch-normalization primitives.
     DCHECK_EQ(context_.bwd_primitives.size(), context_.net_args.size());
     execute_primitives(context_.bwd_primitives, bwd_stream, context_.net_args);
diff --git a/tensorflow/core/kernels/mkl/mkl_fused_ops_test.cc b/tensorflow/core/kernels/mkl/mkl_fused_ops_test.cc
index 00fa4243a40..af2b61d4c86 100644
--- a/tensorflow/core/kernels/mkl/mkl_fused_ops_test.cc
+++ b/tensorflow/core/kernels/mkl/mkl_fused_ops_test.cc
@@ -1211,7 +1211,7 @@ INSTANTIATE_TYPED_TEST_SUITE_P(Test, MklFusedMatMulOpTest,
 // This test is flaky for --config=mkl_threadpool (The supposedly cached op
 // sometimes took longer than even 0.9 * original_time.)
 // TODO(intel-tf): Re-enable the test for --config=mkl_threadpool.
-#ifdef ENABLE_ONEDNN_OPENMP
+#ifndef ENABLE_MKLDNN_THREADPOOL
 // Test the performance of MklFusedMatMul weight cache.
 // For the first time B matrix will be reordered and cached which will be
 // used for subsequent runs
@@ -1314,7 +1314,7 @@ TEST_F(MklFusedMatMulCacheTest, WeightCached) {
     test::ExpectTensorNear<float>(expected, output_new, 1e-5);
   }
 }
-#endif  // ENABLE_ONEDNN_OPENMP
+#endif  // ENABLE_MKLDNN_THREADPOOL
 
 class BiasCacheTest : public OpsTestBase {
  public:
diff --git a/tensorflow/core/kernels/mkl/mkl_matmul_op.cc b/tensorflow/core/kernels/mkl/mkl_matmul_op.cc
index 98a3e57f239..2e11f9242b4 100644
--- a/tensorflow/core/kernels/mkl/mkl_matmul_op.cc
+++ b/tensorflow/core/kernels/mkl/mkl_matmul_op.cc
@@ -155,14 +155,14 @@ class MklMatMulOp : public OpKernel {
     char char_transa = transa ? 'T' : 'N';
     char char_transb = transb ? 'T' : 'N';
     VLOG(2) << "MKL DNN SGEMM called";
-#ifndef ENABLE_ONEDNN_OPENMP
+#ifdef ENABLE_MKLDNN_THREADPOOL
     MklDnnThreadPool eigen_tp(ctx);
     dnnl_sgemm_tp(char_transa, char_transb, m, n, k, alpha, a, lda, b, ldb,
                   beta, c, ldc, &eigen_tp);
 #else
     dnnl_sgemm(char_transa, char_transb, m, n, k, alpha, a, lda, b, ldb, beta,
                c, ldc);
-#endif  // !ENABLE_ONEDNN_OPENMP
+#endif  // ENABLE_MKLDNN_THREADPOOL
   }
 
   void MklBlasGemm(OpKernelContext* ctx, bool transa, bool transb, const int m,
diff --git a/tensorflow/core/kernels/mkl/mkl_matmul_ops_common.h b/tensorflow/core/kernels/mkl/mkl_matmul_ops_common.h
index fd7a9d78a0d..dc915eeb606 100644
--- a/tensorflow/core/kernels/mkl/mkl_matmul_ops_common.h
+++ b/tensorflow/core/kernels/mkl/mkl_matmul_ops_common.h
@@ -95,7 +95,7 @@ class MklDnnMatMulFwdPrimitive : public MklPrimitive {
   void Execute(const Tinput* src_data, const Tweight* weight_data,
                const Tbias* bias_data, Toutput* dst_data,
                std::shared_ptr<stream> fwd_stream) {
-#ifndef ENABLE_ONEDNN_OPENMP
+#ifdef ENABLE_MKLDNN_THREADPOOL
     context_.src_mem->set_data_handle(
         static_cast<void*>(const_cast<Tinput*>(src_data)), *fwd_stream);
     context_.weight_mem->set_data_handle(
@@ -112,7 +112,7 @@ class MklDnnMatMulFwdPrimitive : public MklPrimitive {
     context_.bias_mem->set_data_handle(
         static_cast<void*>(const_cast<Tbias*>(bias_data)));
     context_.dst_mem->set_data_handle(static_cast<void*>(dst_data));
-#endif  // !ENABLE_ONEDNN_OPENMP
+#endif  // ENABLE_MKLDNN_THREADPOOL
 
     execute_primitives(context_.fwd_primitives, fwd_stream, context_.net_args);
 
@@ -534,7 +534,7 @@ class MklMatMulPrimitive : public MklPrimitive {
 
   void Execute(const T* a_data, const T* b_data, T* c_data,
                std::shared_ptr<stream> stream) {
-#ifndef ENABLE_ONEDNN_OPENMP
+#ifdef ENABLE_MKLDNN_THREADPOOL
     context_.a_mem->set_data_handle(static_cast<void*>(const_cast<T*>(a_data)),
                                     *stream);
     context_.b_mem->set_data_handle(static_cast<void*>(const_cast<T*>(b_data)),
@@ -545,7 +545,7 @@ class MklMatMulPrimitive : public MklPrimitive {
     context_.a_mem->set_data_handle(static_cast<void*>(const_cast<T*>(a_data)));
     context_.b_mem->set_data_handle(static_cast<void*>(const_cast<T*>(b_data)));
     context_.c_mem->set_data_handle(static_cast<void*>(const_cast<T*>(c_data)));
-#endif  // !ENABLE_ONEDNN_OPENMP
+#endif  // ENABLE_MKLDNN_THREADPOOL
 
     execute_primitives(context_.matmul_primitives, stream, context_.net_args);
 
     // After execution, set data handle back
diff --git a/tensorflow/core/kernels/mkl/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl/mkl_pooling_ops_common.cc
index 466d20a687e..522005b71ef 100644
--- a/tensorflow/core/kernels/mkl/mkl_pooling_ops_common.cc
+++ b/tensorflow/core/kernels/mkl/mkl_pooling_ops_common.cc
@@ -86,7 +86,7 @@ template <typename T>
 void MklPoolingFwdPrimitive<T>::Execute(const T* src_data, T* dst_data,
                                         void* ws_data,
                                         std::shared_ptr<stream> fwd_stream) {
-#ifndef ENABLE_ONEDNN_OPENMP
+#ifdef ENABLE_MKLDNN_THREADPOOL
   context_.src_mem->set_data_handle(
       static_cast<void*>(const_cast<T*>(src_data)), *fwd_stream);
   context_.dst_mem->set_data_handle(static_cast<void*>(dst_data), *fwd_stream);
@@ -106,7 +106,7 @@ void MklPoolingFwdPrimitive<T>::Execute(const T* src_data, T* dst_data,
     DCHECK(ws_data != nullptr);
     context_.ws_mem->set_data_handle(ws_data);
   }
-#endif  // !ENABLE_ONEDNN_OPENMP
+#endif  // ENABLE_MKLDNN_THREADPOOL
   execute_primitives(context_.fwd_primitives, fwd_stream, context_.net_args);
 
   // Set back data handle.
@@ -188,7 +188,7 @@ template <typename T>
 void MklPoolingBwdPrimitive<T>::Execute(const T* diff_dst_data,
                                         T* diff_src_data, const void* ws_data,
                                         std::shared_ptr<stream> bwd_stream) {
-#ifndef ENABLE_ONEDNN_OPENMP
+#ifdef ENABLE_MKLDNN_THREADPOOL
   context_.diff_dst_mem->set_data_handle(
       static_cast<void*>(const_cast<T*>(diff_dst_data)), *bwd_stream);
   context_.diff_src_mem->set_data_handle(static_cast<void*>(diff_src_data),
@@ -205,7 +205,7 @@ void MklPoolingBwdPrimitive<T>::Execute(const T* diff_dst_data,
     DCHECK(ws_data != nullptr);
     context_.ws_mem->set_data_handle(const_cast<void*>(ws_data));
   }
-#endif  // !ENABLE_ONEDNN_OPENMP
+#endif  // ENABLE_MKLDNN_THREADPOOL
 
   execute_primitives(context_.bwd_primitives, bwd_stream, context_.net_args);
 
diff --git a/tensorflow/core/kernels/mkl/mkl_qmatmul_op.cc b/tensorflow/core/kernels/mkl/mkl_qmatmul_op.cc
index b7431d46789..5bfc2661e86 100644
--- a/tensorflow/core/kernels/mkl/mkl_qmatmul_op.cc
+++ b/tensorflow/core/kernels/mkl/mkl_qmatmul_op.cc
@@ -431,7 +431,7 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase<Tweight, Toutput> {
           ((max_input - min_input) *
            std::max(std::abs(max_weight), std::abs(min_weight)));
 
-#ifndef ENABLE_ONEDNN_OPENMP
+#ifdef ENABLE_MKLDNN_THREADPOOL
       auto parallel_func = [&](int64 start, int64 end) {
         for (int64 j = start; j < end; j++) {
           int x = 0;
@@ -460,7 +460,7 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase<Tweight, Toutput> {
         comp_bias[j] = ((bias_buf[j] * out_scale) + static_cast<float>(x * qa_amin));
       }
-#endif  // !ENABLE_ONEDNN_OPENMP
+#endif  // ENABLE_MKLDNN_THREADPOOL
 
       return reinterpret_cast<Tbias*>(comp_bias_);
 
    } else if (mode_ == QUANTIZE_MODE_SCALED) {
diff --git a/tensorflow/core/kernels/mkl/mkl_quantize_op.cc b/tensorflow/core/kernels/mkl/mkl_quantize_op.cc
index eff346f2c45..a96d1a59be0 100644
--- a/tensorflow/core/kernels/mkl/mkl_quantize_op.cc
+++ b/tensorflow/core/kernels/mkl/mkl_quantize_op.cc
@@ -87,13 +87,13 @@ class MklReorderWithScalePrimitive : public MklPrimitive {
 
   void Execute(void* src_data, void* dst_data,
                std::shared_ptr<stream> reorder_stream) {
-#ifndef ENABLE_ONEDNN_OPENMP
+#ifdef ENABLE_MKLDNN_THREADPOOL
     context_.src_mem->set_data_handle(src_data, *reorder_stream);
     context_.dst_mem->set_data_handle(dst_data, *reorder_stream);
 #else
     context_.src_mem->set_data_handle(src_data);
     context_.dst_mem->set_data_handle(dst_data);
-#endif  // !ENABLE_ONEDNN_OPENMP
+#endif  // ENABLE_MKLDNN_THREADPOOL
     context_.reorder_prim->execute(*reorder_stream, context_.prim_args);
     // After execution, set data handle back.
     context_.src_mem->set_data_handle(DummyData);
diff --git a/tensorflow/core/kernels/mkl/mkl_quantized_conv_ops.h b/tensorflow/core/kernels/mkl/mkl_quantized_conv_ops.h
index 88c48a6c93d..1624a00331a 100644
--- a/tensorflow/core/kernels/mkl/mkl_quantized_conv_ops.h
+++ b/tensorflow/core/kernels/mkl/mkl_quantized_conv_ops.h
@@ -70,9 +70,9 @@ void MklQuantizationRangeForMultiplication(float min_a, float max_a,
   float* min_c = (*min_c_vector)->flat<float>().data();
   float* max_c = (*max_c_vector)->flat<float>().data();
 
-#ifdef ENABLE_ONEDNN_OPENMP
+#ifndef ENABLE_MKLDNN_THREADPOOL
 #pragma omp parallel for
-#endif  // ENABLE_ONEDNN_OPENMP
+#endif  // !ENABLE_MKLDNN_THREADPOOL
   // TODO: Add eigen parallel_for
   for (int64_t n = 0; n < n_channel; ++n) {
     float a_float_for_one_quant_level =
diff --git a/tensorflow/core/kernels/mkl/mkl_relu_op.cc b/tensorflow/core/kernels/mkl/mkl_relu_op.cc
index 09cac9c2419..ffbc6697742 100644
--- a/tensorflow/core/kernels/mkl/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl/mkl_relu_op.cc
@@ -74,7 +74,7 @@ class MklEltwiseFwdPrimitive : public MklPrimitive {
   //   dst_data:  output data buffer of dst
   void Execute(const T* src_data, T* dst_data,
                std::shared_ptr<stream> fwd_stream) {
-#ifndef ENABLE_ONEDNN_OPENMP
+#ifdef ENABLE_MKLDNN_THREADPOOL
     context_.src_mem->set_data_handle(
         static_cast<void*>(const_cast<T*>(src_data)), *fwd_stream);
     context_.dst_mem->set_data_handle(static_cast<void*>(dst_data),
@@ -83,7 +83,7 @@ class MklEltwiseFwdPrimitive : public MklPrimitive {
     context_.src_mem->set_data_handle(
         static_cast<void*>(const_cast<T*>(src_data)));
     context_.dst_mem->set_data_handle(static_cast<void*>(dst_data));
-#endif  // !ENABLE_ONEDNN_OPENMP
+#endif  // ENABLE_MKLDNN_THREADPOOL
     DCHECK_EQ(context_.fwd_primitives.size(),
               context_.fwd_primitives_args.size());
     execute_primitives(context_.fwd_primitives, fwd_stream,
@@ -255,7 +255,7 @@ class MklEltwiseBwdPrimitive : public MklPrimitive {
   //   diff_src_data:  output data buffer of diff_src
   void Execute(const T* src_data, const T* diff_dst_data, T* diff_src_data,
                std::shared_ptr<stream> bwd_stream) {
-#ifndef ENABLE_ONEDNN_OPENMP
+#ifdef ENABLE_MKLDNN_THREADPOOL
     context_.src_mem->set_data_handle(
         static_cast<void*>(const_cast<T*>(src_data)), *bwd_stream);
     context_.diff_dst_mem->set_data_handle(
@@ -268,7 +268,7 @@ class MklEltwiseBwdPrimitive : public MklPrimitive {
     context_.diff_dst_mem->set_data_handle(
         static_cast<void*>(const_cast<T*>(diff_dst_data)));
     context_.diff_src_mem->set_data_handle(static_cast<void*>(diff_src_data));
-#endif  // !ENABLE_ONEDNN_OPENMP
+#endif  // ENABLE_MKLDNN_THREADPOOL
     DCHECK_EQ(context_.bwd_primitives.size(),
               context_.bwd_primitives_args.size());
     execute_primitives(context_.bwd_primitives, bwd_stream,
diff --git a/tensorflow/core/kernels/mkl/mkl_requantization_range_per_channel_op.cc b/tensorflow/core/kernels/mkl/mkl_requantization_range_per_channel_op.cc
index 24dabb07ca0..f6bc773de4f 100644
--- a/tensorflow/core/kernels/mkl/mkl_requantization_range_per_channel_op.cc
+++ b/tensorflow/core/kernels/mkl/mkl_requantization_range_per_channel_op.cc
@@ -76,13 +76,13 @@ class MklRequantizationRangePerChannelOp : public OpKernel {
 
   // Find the ranges of each channel in parallel.
   float out_min_max = std::numeric_limits<float>::min();
 
-#ifdef ENABLE_ONEDNN_OPENMP
+#ifndef ENABLE_MKLDNN_THREADPOOL
 #ifdef _MSC_VER
 #pragma omp parallel for
 #else
 #pragma omp parallel for reduction(max : out_min_max)
 #endif
-#endif  // ENABLE_ONEDNN_OPENMP
+#endif  // !ENABLE_MKLDNN_THREADPOOL
   // TODO: Add eigen parallel_for
   for (int64_t i = 0; i < depth; ++i) {
     Eigen::Tensor<float, 0, Eigen::RowMajor> min =
diff --git a/tensorflow/core/kernels/mkl/mkl_slice_op.cc b/tensorflow/core/kernels/mkl/mkl_slice_op.cc
index 4de404a8793..a956cf66d40 100644
--- a/tensorflow/core/kernels/mkl/mkl_slice_op.cc
+++ b/tensorflow/core/kernels/mkl/mkl_slice_op.cc
@@ -185,7 +185,7 @@ class MklSlicePrimitive : public MklPrimitive {
 
   void Execute(const MklSliceParams& sliceParams,
                std::shared_ptr<stream> slice_stream) {
-#ifndef ENABLE_ONEDNN_OPENMP
+#ifdef ENABLE_MKLDNN_THREADPOOL
     context_.src_mem->set_data_handle(sliceParams.from->get_data_handle(),
                                       *slice_stream);
     context_.dst_mem->set_data_handle(sliceParams.to->get_data_handle(),
@@ -193,7 +193,7 @@ class MklSlicePrimitive : public MklPrimitive {
 #else
     context_.src_mem->set_data_handle(sliceParams.from->get_data_handle());
     context_.dst_mem->set_data_handle(sliceParams.to->get_data_handle());
-#endif  // !ENABLE_ONEDNN_OPENMP
+#endif  // ENABLE_MKLDNN_THREADPOOL
 
     execute_primitives(context_.slice_primitives, slice_stream,
                        context_.slice_primitives_args);
diff --git a/tensorflow/core/kernels/mkl/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl/mkl_softmax_op.cc
index 37dee52f32f..f436f0feec8 100644
--- a/tensorflow/core/kernels/mkl/mkl_softmax_op.cc
+++ b/tensorflow/core/kernels/mkl/mkl_softmax_op.cc
@@ -58,7 +58,7 @@ class MklSoftmaxPrimitive : public MklPrimitive {
   //   dst_data:  output data buffer of dst
   void Execute(const T* src_data, T* dst_data,
                std::shared_ptr<stream> fwd_cpu_stream) {
-#ifndef ENABLE_ONEDNN_OPENMP
+#ifdef ENABLE_MKLDNN_THREADPOOL
     context_.src_mem->set_data_handle(
         static_cast<void*>(const_cast<T*>(src_data)), *fwd_cpu_stream);
     context_.dst_mem->set_data_handle(static_cast<void*>(dst_data),
@@ -67,7 +67,7 @@ class MklSoftmaxPrimitive : public MklPrimitive {
     context_.src_mem->set_data_handle(
         static_cast<void*>(const_cast<T*>(src_data)));
     context_.dst_mem->set_data_handle(static_cast<void*>(dst_data));
-#endif  // !ENABLE_ONEDNN_OPENMP
+#endif  // ENABLE_MKLDNN_THREADPOOL
 
     DCHECK_EQ(context_.fwd_primitives.size(), context_.fwd_net_args.size());
     execute_primitives(context_.fwd_primitives, fwd_cpu_stream,
diff --git a/tensorflow/core/util/mkl_threadpool.h b/tensorflow/core/util/mkl_threadpool.h
index 1184a55c5f9..713c8ea13fa 100644
--- a/tensorflow/core/util/mkl_threadpool.h
+++ b/tensorflow/core/util/mkl_threadpool.h
@@ -32,7 +32,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-#ifndef ENABLE_ONEDNN_OPENMP
+#ifdef ENABLE_MKLDNN_THREADPOOL
 using dnnl::stream_attr;
 using dnnl::threadpool_iface;
 
@@ -116,7 +116,7 @@ struct MklDnnThreadPool {
   MklDnnThreadPool(OpKernelContext* ctx) {}
 };
 
-#endif  // !ENABLE_ONEDNN_OPENMP
+#endif  // ENABLE_MKLDNN_THREADPOOL
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index eb9d4c349dd..0a997599357 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -224,7 +224,7 @@ inline bool array_cmp(const T* a1, const T* a2, size_t size) {
 
 inline mkldnn::stream* CreateStream(MklDnnThreadPool* eigen_tp,
                                     const engine& engine) {
-#ifndef ENABLE_ONEDNN_OPENMP
+#ifdef ENABLE_MKLDNN_THREADPOOL
   stream_attr tp_stream_attr(engine::kind::cpu);
   if (eigen_tp != nullptr) {
     tp_stream_attr.set_threadpool(eigen_tp);
@@ -238,7 +238,7 @@ inline mkldnn::stream* CreateStream(MklDnnThreadPool* eigen_tp,
 #else
   stream* tp_stream = new stream(engine);
   return tp_stream;
-#endif  // !ENABLE_ONEDNN_OPENMP
+#endif  // ENABLE_MKLDNN_THREADPOOL
 }
 
 class MklDnnShape {
@@ -1390,11 +1390,11 @@ class MklDnnData {
                                    std::shared_ptr<stream> t_stream = nullptr) {
     CHECK_NOTNULL(user_memory_);
     CHECK_NOTNULL(data_buffer);
-#ifndef ENABLE_ONEDNN_OPENMP
+#ifdef ENABLE_MKLDNN_THREADPOOL
     user_memory_->set_data_handle(data_buffer, *t_stream);
 #else
     user_memory_->set_data_handle(data_buffer);
-#endif  // !ENABLE_ONEDNN_OPENMP
+#endif  // ENABLE_MKLDNN_THREADPOOL
   }
 
   /// Set function for data buffer of user memory primitive.
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index d589714aae1..bac172990f1 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -39,7 +39,7 @@ load(
 load(
     "//third_party/mkl_dnn:build_defs.bzl",
     "if_mkl_open_source_only",
-    "if_mkldnn_openmp",
+    "if_mkldnn_threadpool",
 )
 load("@bazel_skylib//lib:new_sets.bzl", "sets")
 load("@bazel_skylib//rules:common_settings.bzl", "BuildSettingInfo")
@@ -364,7 +364,7 @@ def tf_copts(
         if_xla_available(["-DTENSORFLOW_USE_XLA=1"]) +
         if_tensorrt(["-DGOOGLE_TENSORRT=1"]) +
         if_mkl(["-DINTEL_MKL=1"]) +
-        if_mkldnn_openmp(["-DENABLE_ONEDNN_OPENMP"]) +
+        if_mkldnn_threadpool(["-DENABLE_MKLDNN_THREADPOOL"]) +
         if_enable_mkl(["-DENABLE_MKL"]) +
         if_android_arm(["-mfpu=neon"]) +
         if_linux_x86_64(["-msse3"]) +
diff --git a/third_party/mkl_dnn/BUILD b/third_party/mkl_dnn/BUILD
index 14988dae575..e7051774570 100644
--- a/third_party/mkl_dnn/BUILD
+++ b/third_party/mkl_dnn/BUILD
@@ -19,10 +19,11 @@ config_setting(
 )
 
 config_setting(
-    name = "build_with_mkldnn_openmp",
+    name = "build_with_mkldnn_threadpool",
     define_values = {
         "build_with_mkl": "true",
-        "build_with_openmp": "true",
+        "build_with_mkl_opensource": "true",
+        "build_with_mkldnn_threadpool": "true",
     },
     visibility = ["//visibility:public"],
 )
diff --git a/third_party/mkl_dnn/build_defs.bzl b/third_party/mkl_dnn/build_defs.bzl
index e24c6e3b018..b3bbd3b087c 100644
--- a/third_party/mkl_dnn/build_defs.bzl
+++ b/third_party/mkl_dnn/build_defs.bzl
@@ -14,18 +14,18 @@ def if_mkl_open_source_only(if_true, if_false = []):
         "//conditions:default": if_false,
     })
 
-def if_mkldnn_openmp(if_true, if_false = []):
-    """Returns `if_true` if OpenMP is used with oneDNN.
+def if_mkldnn_threadpool(if_true, if_false = []):
+    """Returns `if_true` if MKL-DNN v1.x is used.
 
     Shorthand for select()'ing on whether we're building with
-    oneDNN open source library only with openmp
+    MKL-DNN v1.x open source library only with user specified threadpool,
     without depending on MKL binary form.
 
     Returns a select statement which evaluates to if_true if we're building
-    with oneDNN open source library only with OpenMP. Otherwise, the
+    with MKL-DNN v1.x open source library only with user specified threadpool. Otherwise, the
     select statement evaluates to if_false.
     """
     return select({
-        "@org_tensorflow//third_party/mkl_dnn:build_with_mkldnn_openmp": if_true,
+        "@org_tensorflow//third_party/mkl_dnn:build_with_mkldnn_threadpool": if_true,
        "//conditions:default": if_false,
     })
diff --git a/third_party/mkl_dnn/mkldnn_v1.BUILD b/third_party/mkl_dnn/mkldnn_v1.BUILD
index 2eef9885f51..fee42ccb4ea 100644
--- a/third_party/mkl_dnn/mkldnn_v1.BUILD
+++ b/third_party/mkl_dnn/mkldnn_v1.BUILD
@@ -10,7 +10,8 @@ load(
 )
 load(
     "@org_tensorflow//third_party/mkl_dnn:build_defs.bzl",
-    "if_mkldnn_openmp",
+    "if_mkl_open_source_only",
+    "if_mkldnn_threadpool",
 )
 load(
     "@org_tensorflow//third_party/mkl:build_defs.bzl",
@@ -44,8 +45,8 @@ template_rule(
     src = "include/dnnl_config.h.in",
     out = "include/dnnl_config.h",
     substitutions = select({
-        "@org_tensorflow//third_party/mkl_dnn:build_with_mkldnn_openmp": _DNNL_RUNTIME_OMP,
-        "@org_tensorflow//third_party/mkl:build_with_mkl": _DNNL_RUNTIME_THREADPOOL,
+        "@org_tensorflow//third_party/mkl_dnn:build_with_mkldnn_threadpool": _DNNL_RUNTIME_THREADPOOL,
+        "@org_tensorflow//third_party/mkl:build_with_mkl": _DNNL_RUNTIME_OMP,
         "//conditions:default": _DNNL_RUNTIME_SEQ,
     }),
 )
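
A note on the pattern this change toggles, for reviewers: the ENABLE_MKLDNN_THREADPOOL
macro selects between two overloads of oneDNN's memory::set_data_handle(), which is why
every kernel hunk above flips "#ifndef ENABLE_ONEDNN_OPENMP" back to
"#ifdef ENABLE_MKLDNN_THREADPOOL" at the buffer-binding sites. Below is a minimal sketch
of the guard, assuming oneDNN v1.x headers; BindBuffer is a hypothetical helper for
illustration and is not part of this patch or of TensorFlow:

#include <memory>

#include "dnnl.hpp"  // oneDNN v1.x C++ API

// Binds a user buffer to a oneDNN memory object before primitive execution.
// With ENABLE_MKLDNN_THREADPOOL defined, streams are backed by a user-supplied
// threadpool, so the stream-aware overload is required: any preparation oneDNN
// performs on the buffer (e.g. zeroing padded areas) then runs on that
// threadpool. The one-argument overload remains for the OpenMP build, where
// oneDNN parallelizes internally via OMP threads.
inline void BindBuffer(dnnl::memory& mem, void* buffer,
                       const std::shared_ptr<dnnl::stream>& stream) {
#ifdef ENABLE_MKLDNN_THREADPOOL
  mem.set_data_handle(buffer, *stream);
#else
  mem.set_data_handle(buffer);
#endif
}

The preprocessor guard, not the call site, picks the overload, which is why the macro
must also be injected consistently at build time (the if_mkldnn_threadpool() change in
tensorflow.bzl above).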