diff --git a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc
index 5d583a8360b..c29752d3c2c 100644
--- a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc
+++ b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc
@@ -25,7 +25,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-#ifdef _OPENMP
+#if defined(_OPENMP) && !defined(ENABLE_MKLDNN_THREADPOOL)
 TEST(MKLThreadPoolDeviceTest, TestOmpDefaults) {
   SessionOptions options;
   unsetenv("OMP_NUM_THREADS");
@@ -46,7 +46,7 @@ TEST(MKLThreadPoolDeviceTest, TestOmpPreSets) {
   EXPECT_EQ(omp_get_max_threads(), 314);
 }
 
-#endif  // _OPENMP
+#endif  // defined(_OPENMP) && !defined(ENABLE_MKLDNN_THREADPOOL)
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/mkl_conv_ops_test.cc b/tensorflow/core/kernels/mkl_conv_ops_test.cc
index a055351337c..9d11b0fb006 100644
--- a/tensorflow/core/kernels/mkl_conv_ops_test.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops_test.cc
@@ -28,7 +28,7 @@ limitations under the License.
 #include "tensorflow/core/public/session.h"
 
 #if defined(INTEL_MKL_DNN_ONLY)
-#include "third_party/intel_mkl_dnn/include/mkldnn.h"
+#include "mkldnn.hpp"
 #include "tensorflow/core/util/mkl_util.h"
 #endif
 
diff --git a/tensorflow/core/kernels/mkl_qmatmul_op.cc b/tensorflow/core/kernels/mkl_qmatmul_op.cc
index e73f30db4da..b59612433e6 100644
--- a/tensorflow/core/kernels/mkl_qmatmul_op.cc
+++ b/tensorflow/core/kernels/mkl_qmatmul_op.cc
@@ -91,12 +91,15 @@ limitations under the License.
 // https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training
 
 #ifdef INTEL_MKL
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/kernels/mkl_matmul_ops_common.h"
 #include "tensorflow/core/kernels/mkl_quantized_conv_ops.h"
 #include "tensorflow/core/kernels/no_op.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/mkl_threadpool.h"
+#include "tensorflow/core/util/work_sharder.h"
 
 namespace {
 enum {
@@ -437,6 +440,26 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase {
           ((max_input - min_input) *
            std::max(std::abs(max_weight), std::abs(min_weight)));
 
+#ifdef ENABLE_MKLDNN_THREADPOOL
+      auto parallel_func = [&](int64 start, int64 end) {
+        for (int64 j = start; j < end; j++) {
+          int x = 0;
+          for (int64 i = 0; i < k; ++i) {
+            x += wt_buf[i * n + j];
+          }
+          comp_bias[j] =
+              ((bias_buf[j] * out_scale) + static_cast<float>(x * qa_amin));
+        }
+      };
+
+      const float kArithCost = 2.5f;
+      const float kMovCost = 1.0f;
+      float shard_cost = 4 * kArithCost + kMovCost;
+      const DeviceBase::CpuWorkerThreads& worker_threads =
+          *(context->device()->tensorflow_cpu_worker_threads());
+      Shard(worker_threads.num_threads, worker_threads.workers, n, shard_cost,
+            parallel_func);
+#else
 #pragma omp parallel for schedule(static)
       for (int j = 0; j < n; ++j) {
         int x = 0;
@@ -446,7 +469,7 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase {
         comp_bias[j] =
             ((bias_buf[j] * out_scale) + static_cast<float>(x * qa_amin));
       }
-
+#endif  // ENABLE_MKLDNN_THREADPOOL
       return reinterpret_cast<Tbias*>(comp_bias_);
 
     } else if (mode_ == QUANTIZE_MODE_SCALED) {
diff --git a/tensorflow/core/kernels/mkl_quantized_conv_ops.h b/tensorflow/core/kernels/mkl_quantized_conv_ops.h
index fef2d837cf2..4121c88fb83 100644
--- a/tensorflow/core/kernels/mkl_quantized_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_quantized_conv_ops.h
@@ -69,7 +69,11 @@ void MklQuantizationRangeForMultiplication(float min_a, float max_a,
   const float* max_b = max_b_vector.flat<float>().data();
   float* min_c = (*min_c_vector)->flat<float>().data();
   float* max_c = (*max_c_vector)->flat<float>().data();
+
+#ifndef ENABLE_MKLDNN_THREADPOOL
 #pragma omp parallel for
+#endif  // !ENABLE_MKLDNN_THREADPOOL
+  // TODO: Add eigen parallel_for
   for (size_t n = 0; n < n_channel; ++n) {
     float a_float_for_one_quant_level =
         MklFloatForOneQuantizedLevel(min_a, max_a);
diff --git a/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc b/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc
index 767a6f1c397..0cd4843c0d8 100644
--- a/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc
+++ b/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include <math.h>
+
 #include <limits>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -28,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/meta_support.h"
 #include "tensorflow/core/kernels/no_op.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/mkl_threadpool.h"
 #include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
@@ -73,7 +75,11 @@ class MklRequantizationRangePerChannelOp : public OpKernel {
 
     // Find the ranges of each channel in parallel.
     float out_min_max = std::numeric_limits<float>::min();
+
+#ifndef ENABLE_MKLDNN_THREADPOOL
 #pragma omp parallel for reduction(max : out_min_max)
+#endif  // !ENABLE_MKLDNN_THREADPOOL
+    // TODO: Add eigen parallel_for
     for (size_t i = 0; i < depth; ++i) {
       Eigen::Tensor<float, 0, Eigen::RowMajor> min =
           transposed_input.chip<0>(i).minimum();
@@ -92,6 +98,7 @@ class MklRequantizationRangePerChannelOp : public OpKernel {
       // Thread-local out_min_max.
       out_min_max = std::max(out_min_max, ranges[i]);
     }
+
     // All local out_min_max gets max-reduced into one global out_min_max at
     // the end of the loop by specifying reduction(max:out_min_max) along with
     // omp parallel for.
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 50548740157..b1eef004d43 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -354,9 +354,7 @@ def tf_copts(
     )
 
 def tf_openmp_copts():
-    # TODO(intel-mkl): Remove -fopenmp for threadpool after removing all
-    # omp pragmas in tensorflow/core.
-    return if_mkl_lnx_x64(["-fopenmp"]) + if_mkldnn_threadpool(["-fopenmp"])
+    return (if_mkl_lnx_x64(["-fopenmp"]) + if_mkldnn_threadpool(["-fno-openmp"]))
 
 def tfe_xla_copts():
     return select({
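
The functional core of this patch is the ENABLE_MKLDNN_THREADPOOL path in mkl_qmatmul_op.cc: the `#pragma omp parallel for` over the bias-compensation loop is replaced by a callback handed to TensorFlow's `Shard` utility, which splits the `[0, n)` output-column range across the device's CPU worker threads. The sketch below illustrates that sharding pattern in a self-contained form, not the actual TensorFlow code path: `ParallelFor`, the toy shapes, and the `out_scale`/`qa_amin` values are illustrative stand-ins, and `std::thread` replaces the Eigen-based pool returned by `tensorflow_cpu_worker_threads()`.

```cpp
// Standalone sketch (assumptions noted above): mimics how the patch shards the
// bias-compensation loop instead of relying on an OpenMP parallel-for pragma.
#include <algorithm>
#include <cstdint>
#include <functional>
#include <thread>
#include <vector>

// Splits [0, total) into contiguous shards and runs fn(start, end) on each.
// Stand-in for tensorflow::Shard, which additionally uses a per-element cost
// estimate (shard_cost in the patch) to choose the shard size.
void ParallelFor(int64_t total, int num_threads,
                 const std::function<void(int64_t, int64_t)>& fn) {
  num_threads = std::max(1, num_threads);
  const int64_t block = (total + num_threads - 1) / num_threads;
  std::vector<std::thread> workers;
  for (int64_t start = 0; start < total; start += block) {
    const int64_t end = std::min(total, start + block);
    workers.emplace_back(fn, start, end);
  }
  for (std::thread& t : workers) t.join();
}

int main() {
  // Toy buffers standing in for the k x n weight matrix and n-element bias of
  // the quantized matmul; values are arbitrary and only for illustration.
  const int64_t k = 4, n = 8;
  std::vector<int8_t> wt_buf(k * n, 1);
  std::vector<int32_t> bias_buf(n, 10);
  std::vector<int32_t> comp_bias(n, 0);
  const float out_scale = 2.0f;   // illustrative scale, not the real formula
  const float qa_amin = -128.0f;  // illustrative quantized-activation minimum

  // Same shape as the lambda the patch passes to Shard(): each shard handles a
  // contiguous range of output columns j and sums the corresponding weight
  // column before folding it into the compensated bias.
  auto parallel_func = [&](int64_t start, int64_t end) {
    for (int64_t j = start; j < end; ++j) {
      int x = 0;
      for (int64_t i = 0; i < k; ++i) {
        x += wt_buf[i * n + j];
      }
      comp_bias[j] =
          static_cast<int32_t>(bias_buf[j] * out_scale + x * qa_amin);
    }
  };

  ParallelFor(n, static_cast<int>(std::thread::hardware_concurrency()),
              parallel_func);
  return 0;
}
```

In the actual patch, the per-element `shard_cost` (four arithmetic operations plus one move) lets `Shard` balance shard size against scheduling overhead; the stand-in above ignores cost and simply splits the range evenly across threads.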