From d3dbe347fdd5c82fdd643fd231877832d4ec940c Mon Sep 17 00:00:00 2001 From: sshiddib <sharada.shiddibhavi@intel.com> Date: Tue, 26 May 2020 15:03:42 -0700 Subject: [PATCH 1/7] Removing OpenMP dependency from Mkl-dnn supporting threadpool --- .../mkl_threadpool_device_test.cc | 4 +-- tensorflow/core/kernels/mkl_conv_ops.cc | 5 ++-- tensorflow/core/kernels/mkl_conv_ops_test.cc | 2 +- tensorflow/core/kernels/mkl_qmatmul_op.cc | 27 +++++++++++++++++-- .../core/kernels/mkl_quantized_conv_ops.h | 21 +++++++++++++-- ...mkl_requantization_range_per_channel_op.cc | 22 +++++++++++++++ tensorflow/tensorflow.bzl | 4 +-- 7 files changed, 73 insertions(+), 12 deletions(-) diff --git a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc index 5d583a8360b..c29752d3c2c 100644 --- a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc +++ b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc @@ -25,7 +25,7 @@ limitations under the License. namespace tensorflow { -#ifdef _OPENMP +#if defined(_OPENMP) && !defined(ENABLE_MKLDNN_THREADPOOL) TEST(MKLThreadPoolDeviceTest, TestOmpDefaults) { SessionOptions options; unsetenv("OMP_NUM_THREADS"); @@ -46,7 +46,7 @@ TEST(MKLThreadPoolDeviceTest, TestOmpPreSets) { EXPECT_EQ(omp_get_max_threads(), 314); } -#endif // _OPENMP +#endif // defined(_OPENMP) && !defined(ENABLE_MKLDNN_THREADPOOL) } // namespace tensorflow diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index 59de3229211..2208401c7b3 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -1493,7 +1493,8 @@ class MklQuantizedConv2DOp float max_output_value; MklQuantizationRangeForMultiplication<Tinput, qint8, qint32>( min_input, max_input, min_filter.flat<float>()(0), - max_filter.flat<float>()(0), &min_output_value, &max_output_value); + max_filter.flat<float>()(0), &min_output_value, &max_output_value, + context); AllocateOutputSetMklShape(context, 1, &output_min, {}, output_min_mkl_shape); AllocateOutputSetMklShape(context, 2, &output_max, {}, @@ -1510,7 +1511,7 @@ class MklQuantizedConv2DOp output_max_mkl_shape); MklQuantizationRangeForMultiplication<Tinput, qint8, qint32>( min_input, max_input, min_filter, max_filter, &output_min, - &output_max); + &output_max, context); } } } diff --git a/tensorflow/core/kernels/mkl_conv_ops_test.cc b/tensorflow/core/kernels/mkl_conv_ops_test.cc index a055351337c..9d11b0fb006 100644 --- a/tensorflow/core/kernels/mkl_conv_ops_test.cc +++ b/tensorflow/core/kernels/mkl_conv_ops_test.cc @@ -28,7 +28,7 @@ limitations under the License. #include "tensorflow/core/public/session.h" #if defined(INTEL_MKL_DNN_ONLY) -#include "third_party/intel_mkl_dnn/include/mkldnn.h" +#include "mkldnn.hpp" #include "tensorflow/core/util/mkl_util.h" #endif diff --git a/tensorflow/core/kernels/mkl_qmatmul_op.cc b/tensorflow/core/kernels/mkl_qmatmul_op.cc index cc7127e0559..382566c4ab5 100644 --- a/tensorflow/core/kernels/mkl_qmatmul_op.cc +++ b/tensorflow/core/kernels/mkl_qmatmul_op.cc @@ -91,12 +91,15 @@ limitations under the License. 
// https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training #ifdef INTEL_MKL +#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/kernels/fill_functor.h" #include "tensorflow/core/kernels/mkl_matmul_ops_common.h" #include "tensorflow/core/kernels/mkl_quantized_conv_ops.h" #include "tensorflow/core/kernels/no_op.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/util/mkl_threadpool.h" +#include "tensorflow/core/util/work_sharder.h" namespace { enum { @@ -342,7 +345,7 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase<Tweight, Toutput> { const float max_weight = context->input(6).flat<float>()(0); MklQuantizationRangeForMultiplication<quint8, qint8, qint32>( min_input, max_input, min_weight, max_weight, min_output_value, - max_output_value); + max_output_value, context); } virtual void ExtendMklDnnMatMulFwdParams(OpKernelContext* context, @@ -428,6 +431,26 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase<Tweight, Toutput> { ((max_input - min_input) * std::max(std::abs(max_weight), std::abs(min_weight))); +#ifdef ENABLE_MKLDNN_THREADPOOL + auto parallel_func = [&](int64 start, int64 end) { + for (int64 j = start ; j < end; j++) { + int x = 0; + for (int64 i = 0; i < k; ++i) { + x += wt_buf[i * n + j]; + } + comp_bias[j] = + ((bias_buf[j] * out_scale) + static_cast<float>(x * qa_amin)); + } + }; + + const float kArithCost = 2.5f; + const float kMovCost = 1.0f; + float shard_cost = 4*kArithCost + kMovCost; + const DeviceBase::CpuWorkerThreads& worker_threads = + *(context->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, n, shard_cost, + parallel_func); +#else #pragma omp parallel for schedule(static) for (int j = 0; j < n; ++j) { int x = 0; @@ -437,7 +460,7 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase<Tweight, Toutput> { comp_bias[j] = ((bias_buf[j] * out_scale) + static_cast<float>(x * qa_amin)); } - +#endif // ENABLE_MKLDNN_THREADPOOL return reinterpret_cast<Tbias*>(comp_bias_); } else if (mode_ == QUANTIZE_MODE_SCALED) { diff --git a/tensorflow/core/kernels/mkl_quantized_conv_ops.h b/tensorflow/core/kernels/mkl_quantized_conv_ops.h index fef2d837cf2..a95f8d29bfc 100644 --- a/tensorflow/core/kernels/mkl_quantized_conv_ops.h +++ b/tensorflow/core/kernels/mkl_quantized_conv_ops.h @@ -39,7 +39,8 @@ float MklFloatForOneQuantizedLevel(float range_min, float range_max) { template <class T1, class T2, class T3> void MklQuantizationRangeForMultiplication(float min_a, float max_a, float min_b, float max_b, - float* min_c, float* max_c) { + float* min_c, float* max_c, + OpKernelContext* context) { const float a_float_for_one_quant_level = MklFloatForOneQuantizedLevel<T1>(min_a, max_a); const float b_float_for_one_quant_level = @@ -59,7 +60,8 @@ void MklQuantizationRangeForMultiplication(float min_a, float max_a, const Tensor& min_b_vector, const Tensor& max_b_vector, Tensor** min_c_vector, - Tensor** max_c_vector) { + Tensor** max_c_vector, + OpKernelContext* context) { DCHECK(min_b_vector.NumElements() == (*min_c_vector)->NumElements()); DCHECK(max_b_vector.NumElements() == (*max_c_vector)->NumElements()); size_t n_channel = min_b_vector.NumElements(); @@ -69,6 +71,20 @@ void MklQuantizationRangeForMultiplication(float min_a, float max_a, const float* max_b = max_b_vector.flat<float>().data(); float* min_c = (*min_c_vector)->flat<float>().data(); float* max_c = 
(*max_c_vector)->flat<float>().data(); +#ifdef ENABLE_MKLDNN_THREADPOOL + auto eigen_tp = + MklDnnThreadPoolWrapper::GetInstance().CreateThreadPoolPtr(context); + eigen_tp->parallel_for(n_channel, [&](int n, int n_channel) { + float a_float_for_one_quant_level = + MklFloatForOneQuantizedLevel<T1>(min_a, max_a); + float b_float_for_one_quant_level = + MklFloatForOneQuantizedLevel<T2>(min_b[n], max_b[n]); + float c_float_for_one_quant_level = + a_float_for_one_quant_level * b_float_for_one_quant_level; + min_c[n] = c_float_for_one_quant_level * c_lowest; + max_c[n] = c_float_for_one_quant_level * c_highest; + }); +#else #pragma omp parallel for for (size_t n = 0; n < n_channel; ++n) { float a_float_for_one_quant_level = @@ -80,6 +96,7 @@ void MklQuantizationRangeForMultiplication(float min_a, float max_a, min_c[n] = c_float_for_one_quant_level * c_lowest; max_c[n] = c_float_for_one_quant_level * c_highest; } +#endif // ENABLE_MKLDNN_THREADPOOL } } // namespace tensorflow diff --git a/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc b/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc index 767a6f1c397..0a19573d901 100644 --- a/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc +++ b/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/core/kernels/meta_support.h" #include "tensorflow/core/kernels/no_op.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/util/mkl_threadpool.h" #include "tensorflow/core/util/mkl_util.h" namespace tensorflow { @@ -73,6 +74,26 @@ class MklRequantizationRangePerChannelOp : public OpKernel { // Find the ranges of each channel in parallel. float out_min_max = std::numeric_limits<float>::min(); +#ifdef ENABLE_MKLDNN_THREADPOOL + // TODO: Add eigen parallel_for + for(size_t i = 0; i < depth; ++i) { + Eigen::Tensor<qint32, 0, Eigen::RowMajor> min = + transposed_input.chip<0>(i).minimum(); + Eigen::Tensor<qint32, 0, Eigen::RowMajor> max = + transposed_input.chip<0>(i).maximum(); + const int32_t min_per_channel = min(); + const int32_t max_per_channel = max(); + const int32_t abs_max = + std::max(std::abs(min_per_channel), std::abs(max_per_channel)); + float scale = + std::max(std::abs(input_min_data[i]), std::abs(input_max_data[i])); + ranges[i] = + scale * static_cast<float>(abs_max) / static_cast<float>(1L << 31); + if (min_per_channel < 0) is_non_negative = false; + + out_min_max = std::max(out_min_max, ranges[i]); + } +#else #pragma omp parallel for reduction(max : out_min_max) for (size_t i = 0; i < depth; ++i) { Eigen::Tensor<qint32, 0, Eigen::RowMajor> min = @@ -92,6 +113,7 @@ class MklRequantizationRangePerChannelOp : public OpKernel { // Thread-local out_min_max. out_min_max = std::max(out_min_max, ranges[i]); } +#endif // ENABLE_MKLDNN_THREADPOOL // All local out_min_max gets max-reduced into one global out_min_max at // the end of the loop by specifying reduction(max:out_min_max) along with // omp parallel for. diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 9a780839be3..5dc5877367b 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -354,9 +354,7 @@ def tf_copts( ) def tf_openmp_copts(): - # TODO(intel-mkl): Remove -fopenmp for threadpool after removing all - # omp pragmas in tensorflow/core. 
- return if_mkl_lnx_x64(["-fopenmp"]) + if_mkldnn_threadpool(["-fopenmp"]) + return (if_mkl_lnx_x64(["-fopenmp"]) + if_mkldnn_threadpool(["-fno-openmp"])) def tfe_xla_copts(): return select({ From 4da2360572103d436f9873f50cef8c940dc963b7 Mon Sep 17 00:00:00 2001 From: sshiddib <sharada.shiddibhavi@intel.com> Date: Tue, 26 May 2020 15:03:42 -0700 Subject: [PATCH 2/7] Removing OpenMP dependency from Mkl-dnn supporting threadpool --- .../mkl_threadpool_device_test.cc | 4 +-- tensorflow/core/kernels/mkl_conv_ops_test.cc | 2 +- tensorflow/core/kernels/mkl_qmatmul_op.cc | 25 ++++++++++++++++++- .../core/kernels/mkl_quantized_conv_ops.h | 14 +++++++++++ ...mkl_requantization_range_per_channel_op.cc | 22 ++++++++++++++++ tensorflow/tensorflow.bzl | 4 +-- 6 files changed, 64 insertions(+), 7 deletions(-) diff --git a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc index 5d583a8360b..c29752d3c2c 100644 --- a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc +++ b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc @@ -25,7 +25,7 @@ limitations under the License. namespace tensorflow { -#ifdef _OPENMP +#if defined(_OPENMP) && !defined(ENABLE_MKLDNN_THREADPOOL) TEST(MKLThreadPoolDeviceTest, TestOmpDefaults) { SessionOptions options; unsetenv("OMP_NUM_THREADS"); @@ -46,7 +46,7 @@ TEST(MKLThreadPoolDeviceTest, TestOmpPreSets) { EXPECT_EQ(omp_get_max_threads(), 314); } -#endif // _OPENMP +#endif // defined(_OPENMP) && !defined(ENABLE_MKLDNN_THREADPOOL) } // namespace tensorflow diff --git a/tensorflow/core/kernels/mkl_conv_ops_test.cc b/tensorflow/core/kernels/mkl_conv_ops_test.cc index a055351337c..9d11b0fb006 100644 --- a/tensorflow/core/kernels/mkl_conv_ops_test.cc +++ b/tensorflow/core/kernels/mkl_conv_ops_test.cc @@ -28,7 +28,7 @@ limitations under the License. #include "tensorflow/core/public/session.h" #if defined(INTEL_MKL_DNN_ONLY) -#include "third_party/intel_mkl_dnn/include/mkldnn.h" +#include "mkldnn.hpp" #include "tensorflow/core/util/mkl_util.h" #endif diff --git a/tensorflow/core/kernels/mkl_qmatmul_op.cc b/tensorflow/core/kernels/mkl_qmatmul_op.cc index cc7127e0559..d8bbc130c55 100644 --- a/tensorflow/core/kernels/mkl_qmatmul_op.cc +++ b/tensorflow/core/kernels/mkl_qmatmul_op.cc @@ -91,12 +91,15 @@ limitations under the License. 
// https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training #ifdef INTEL_MKL +#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/kernels/fill_functor.h" #include "tensorflow/core/kernels/mkl_matmul_ops_common.h" #include "tensorflow/core/kernels/mkl_quantized_conv_ops.h" #include "tensorflow/core/kernels/no_op.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/util/mkl_threadpool.h" +#include "tensorflow/core/util/work_sharder.h" namespace { enum { @@ -428,6 +431,26 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase<Tweight, Toutput> { ((max_input - min_input) * std::max(std::abs(max_weight), std::abs(min_weight))); +#ifdef ENABLE_MKLDNN_THREADPOOL + auto parallel_func = [&](int64 start, int64 end) { + for (int64 j = start ; j < end; j++) { + int x = 0; + for (int64 i = 0; i < k; ++i) { + x += wt_buf[i * n + j]; + } + comp_bias[j] = + ((bias_buf[j] * out_scale) + static_cast<float>(x * qa_amin)); + } + }; + + const float kArithCost = 2.5f; + const float kMovCost = 1.0f; + float shard_cost = 4*kArithCost + kMovCost; + const DeviceBase::CpuWorkerThreads& worker_threads = + *(context->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, n, shard_cost, + parallel_func); +#else #pragma omp parallel for schedule(static) for (int j = 0; j < n; ++j) { int x = 0; @@ -437,7 +460,7 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase<Tweight, Toutput> { comp_bias[j] = ((bias_buf[j] * out_scale) + static_cast<float>(x * qa_amin)); } - +#endif // ENABLE_MKLDNN_THREADPOOL return reinterpret_cast<Tbias*>(comp_bias_); } else if (mode_ == QUANTIZE_MODE_SCALED) { diff --git a/tensorflow/core/kernels/mkl_quantized_conv_ops.h b/tensorflow/core/kernels/mkl_quantized_conv_ops.h index fef2d837cf2..037a3a5f3ff 100644 --- a/tensorflow/core/kernels/mkl_quantized_conv_ops.h +++ b/tensorflow/core/kernels/mkl_quantized_conv_ops.h @@ -69,6 +69,19 @@ void MklQuantizationRangeForMultiplication(float min_a, float max_a, const float* max_b = max_b_vector.flat<float>().data(); float* min_c = (*min_c_vector)->flat<float>().data(); float* max_c = (*max_c_vector)->flat<float>().data(); +#ifdef ENABLE_MKLDNN_THREADPOOL + // TODO: Add eigen parallel_for + for(size_t n = 0; n < n_channel; ++n) { + float a_float_for_one_quant_level = + MklFloatForOneQuantizedLevel<T1>(min_a, max_a); + float b_float_for_one_quant_level = + MklFloatForOneQuantizedLevel<T2>(min_b[n], max_b[n]); + float c_float_for_one_quant_level = + a_float_for_one_quant_level * b_float_for_one_quant_level; + min_c[n] = c_float_for_one_quant_level * c_lowest; + max_c[n] = c_float_for_one_quant_level * c_highest; + } +#else #pragma omp parallel for for (size_t n = 0; n < n_channel; ++n) { float a_float_for_one_quant_level = @@ -80,6 +93,7 @@ void MklQuantizationRangeForMultiplication(float min_a, float max_a, min_c[n] = c_float_for_one_quant_level * c_lowest; max_c[n] = c_float_for_one_quant_level * c_highest; } +#endif // ENABLE_MKLDNN_THREADPOOL } } // namespace tensorflow diff --git a/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc b/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc index 767a6f1c397..0a19573d901 100644 --- a/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc +++ b/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc @@ -28,6 +28,7 @@ limitations under the License. 
#include "tensorflow/core/kernels/meta_support.h" #include "tensorflow/core/kernels/no_op.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/util/mkl_threadpool.h" #include "tensorflow/core/util/mkl_util.h" namespace tensorflow { @@ -73,6 +74,26 @@ class MklRequantizationRangePerChannelOp : public OpKernel { // Find the ranges of each channel in parallel. float out_min_max = std::numeric_limits<float>::min(); +#ifdef ENABLE_MKLDNN_THREADPOOL + // TODO: Add eigen parallel_for + for(size_t i = 0; i < depth; ++i) { + Eigen::Tensor<qint32, 0, Eigen::RowMajor> min = + transposed_input.chip<0>(i).minimum(); + Eigen::Tensor<qint32, 0, Eigen::RowMajor> max = + transposed_input.chip<0>(i).maximum(); + const int32_t min_per_channel = min(); + const int32_t max_per_channel = max(); + const int32_t abs_max = + std::max(std::abs(min_per_channel), std::abs(max_per_channel)); + float scale = + std::max(std::abs(input_min_data[i]), std::abs(input_max_data[i])); + ranges[i] = + scale * static_cast<float>(abs_max) / static_cast<float>(1L << 31); + if (min_per_channel < 0) is_non_negative = false; + + out_min_max = std::max(out_min_max, ranges[i]); + } +#else #pragma omp parallel for reduction(max : out_min_max) for (size_t i = 0; i < depth; ++i) { Eigen::Tensor<qint32, 0, Eigen::RowMajor> min = @@ -92,6 +113,7 @@ class MklRequantizationRangePerChannelOp : public OpKernel { // Thread-local out_min_max. out_min_max = std::max(out_min_max, ranges[i]); } +#endif // ENABLE_MKLDNN_THREADPOOL // All local out_min_max gets max-reduced into one global out_min_max at // the end of the loop by specifying reduction(max:out_min_max) along with // omp parallel for. diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 9a780839be3..5dc5877367b 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -354,9 +354,7 @@ def tf_copts( ) def tf_openmp_copts(): - # TODO(intel-mkl): Remove -fopenmp for threadpool after removing all - # omp pragmas in tensorflow/core. 
- return if_mkl_lnx_x64(["-fopenmp"]) + if_mkldnn_threadpool(["-fopenmp"]) + return (if_mkl_lnx_x64(["-fopenmp"]) + if_mkldnn_threadpool(["-fno-openmp"])) def tfe_xla_copts(): return select({ From 6ae6ef636d608822696508e9535cdb39d547a0c6 Mon Sep 17 00:00:00 2001 From: Sharada Shiddibhavi <sharada.shiddibhavi@intel.com> Date: Mon, 8 Jun 2020 10:02:22 -0700 Subject: [PATCH 3/7] Addressing review comments --- .../core/kernels/mkl_quantized_conv_ops.h | 20 +++++-------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/tensorflow/core/kernels/mkl_quantized_conv_ops.h b/tensorflow/core/kernels/mkl_quantized_conv_ops.h index 037a3a5f3ff..37022f46113 100644 --- a/tensorflow/core/kernels/mkl_quantized_conv_ops.h +++ b/tensorflow/core/kernels/mkl_quantized_conv_ops.h @@ -69,21 +69,12 @@ void MklQuantizationRangeForMultiplication(float min_a, float max_a, const float* max_b = max_b_vector.flat<float>().data(); float* min_c = (*min_c_vector)->flat<float>().data(); float* max_c = (*max_c_vector)->flat<float>().data(); -#ifdef ENABLE_MKLDNN_THREADPOOL - // TODO: Add eigen parallel_for - for(size_t n = 0; n < n_channel; ++n) { - float a_float_for_one_quant_level = - MklFloatForOneQuantizedLevel<T1>(min_a, max_a); - float b_float_for_one_quant_level = - MklFloatForOneQuantizedLevel<T2>(min_b[n], max_b[n]); - float c_float_for_one_quant_level = - a_float_for_one_quant_level * b_float_for_one_quant_level; - min_c[n] = c_float_for_one_quant_level * c_lowest; - max_c[n] = c_float_for_one_quant_level * c_highest; - } -#else + +#ifndef ENABLE_MKLDNN_THREADPOOL #pragma omp parallel for - for (size_t n = 0; n < n_channel; ++n) { +#endif // ENABLE_MKLDNN_THREADPOOL + // TODO: Add eigen parallel_for + for (size_t n = 0; n < n_channel; ++n) { float a_float_for_one_quant_level = MklFloatForOneQuantizedLevel<T1>(min_a, max_a); float b_float_for_one_quant_level = @@ -93,7 +84,6 @@ void MklQuantizationRangeForMultiplication(float min_a, float max_a, min_c[n] = c_float_for_one_quant_level * c_lowest; max_c[n] = c_float_for_one_quant_level * c_highest; } -#endif // ENABLE_MKLDNN_THREADPOOL } } // namespace tensorflow From f32fcb56ffd2f6cbd362e5accab39b264a7d169f Mon Sep 17 00:00:00 2001 From: Sharada Shiddibhavi <sharada.shiddibhavi@intel.com> Date: Mon, 8 Jun 2020 10:05:49 -0700 Subject: [PATCH 4/7] Addressing review comments --- ...mkl_requantization_range_per_channel_op.cc | 24 ++++--------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc b/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc index 0a19573d901..3aa744c0a5e 100644 --- a/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc +++ b/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc @@ -74,27 +74,11 @@ class MklRequantizationRangePerChannelOp : public OpKernel { // Find the ranges of each channel in parallel. 
float out_min_max = std::numeric_limits<float>::min(); -#ifdef ENABLE_MKLDNN_THREADPOOL - // TODO: Add eigen parallel_for - for(size_t i = 0; i < depth; ++i) { - Eigen::Tensor<qint32, 0, Eigen::RowMajor> min = - transposed_input.chip<0>(i).minimum(); - Eigen::Tensor<qint32, 0, Eigen::RowMajor> max = - transposed_input.chip<0>(i).maximum(); - const int32_t min_per_channel = min(); - const int32_t max_per_channel = max(); - const int32_t abs_max = - std::max(std::abs(min_per_channel), std::abs(max_per_channel)); - float scale = - std::max(std::abs(input_min_data[i]), std::abs(input_max_data[i])); - ranges[i] = - scale * static_cast<float>(abs_max) / static_cast<float>(1L << 31); - if (min_per_channel < 0) is_non_negative = false; - out_min_max = std::max(out_min_max, ranges[i]); - } -#else +#ifndef ENABLE_MKLDNN_THREADPOOL #pragma omp parallel for reduction(max : out_min_max) +#endif // ENABLE_MKLDNN_THREADPOOL + // TODO: Add eigen parallel_for for (size_t i = 0; i < depth; ++i) { Eigen::Tensor<qint32, 0, Eigen::RowMajor> min = transposed_input.chip<0>(i).minimum(); @@ -113,7 +97,7 @@ class MklRequantizationRangePerChannelOp : public OpKernel { // Thread-local out_min_max. out_min_max = std::max(out_min_max, ranges[i]); } -#endif // ENABLE_MKLDNN_THREADPOOL + // All local out_min_max gets max-reduced into one global out_min_max at // the end of the loop by specifying reduction(max:out_min_max) along with // omp parallel for. From e2b4994f450f5e08c3718bcd8fc0e8c3cf984780 Mon Sep 17 00:00:00 2001 From: Sharada Shiddibhavi <sharada.shiddibhavi@intel.com> Date: Mon, 8 Jun 2020 10:07:11 -0700 Subject: [PATCH 5/7] Update mkl_quantized_conv_ops.h --- tensorflow/core/kernels/mkl_quantized_conv_ops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/mkl_quantized_conv_ops.h b/tensorflow/core/kernels/mkl_quantized_conv_ops.h index 37022f46113..442f6a53047 100644 --- a/tensorflow/core/kernels/mkl_quantized_conv_ops.h +++ b/tensorflow/core/kernels/mkl_quantized_conv_ops.h @@ -73,8 +73,8 @@ void MklQuantizationRangeForMultiplication(float min_a, float max_a, #ifndef ENABLE_MKLDNN_THREADPOOL #pragma omp parallel for #endif // ENABLE_MKLDNN_THREADPOOL - // TODO: Add eigen parallel_for - for (size_t n = 0; n < n_channel; ++n) { + // TODO: Add eigen parallel_for + for (size_t n = 0; n < n_channel; ++n) { float a_float_for_one_quant_level = MklFloatForOneQuantizedLevel<T1>(min_a, max_a); float b_float_for_one_quant_level = From 7fc6bf6726cee3f6178caaafeedb7a98f776cb02 Mon Sep 17 00:00:00 2001 From: Sharada Shiddibhavi <sharada.shiddibhavi@intel.com> Date: Mon, 8 Jun 2020 10:30:03 -0700 Subject: [PATCH 6/7] Update tensorflow/core/kernels/mkl_quantized_conv_ops.h Co-authored-by: Penporn Koanantakool <38085909+penpornk@users.noreply.github.com> --- tensorflow/core/kernels/mkl_quantized_conv_ops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/mkl_quantized_conv_ops.h b/tensorflow/core/kernels/mkl_quantized_conv_ops.h index 442f6a53047..4121c88fb83 100644 --- a/tensorflow/core/kernels/mkl_quantized_conv_ops.h +++ b/tensorflow/core/kernels/mkl_quantized_conv_ops.h @@ -72,7 +72,7 @@ void MklQuantizationRangeForMultiplication(float min_a, float max_a, #ifndef ENABLE_MKLDNN_THREADPOOL #pragma omp parallel for -#endif // ENABLE_MKLDNN_THREADPOOL +#endif // !ENABLE_MKLDNN_THREADPOOL // TODO: Add eigen parallel_for for (size_t n = 0; n < n_channel; ++n) { float a_float_for_one_quant_level = From 
7b4b07c098170bd891f2426ee9c043249ac41983 Mon Sep 17 00:00:00 2001 From: Sharada Shiddibhavi <sharada.shiddibhavi@intel.com> Date: Mon, 8 Jun 2020 10:31:00 -0700 Subject: [PATCH 7/7] Update mkl_requantization_range_per_channel_op.cc --- .../core/kernels/mkl_requantization_range_per_channel_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc b/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc index 3aa744c0a5e..a43f6a71acf 100644 --- a/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc +++ b/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc @@ -77,7 +77,7 @@ class MklRequantizationRangePerChannelOp : public OpKernel { #ifndef ENABLE_MKLDNN_THREADPOOL #pragma omp parallel for reduction(max : out_min_max) -#endif // ENABLE_MKLDNN_THREADPOOL +#endif // !ENABLE_MKLDNN_THREADPOOL // TODO: Add eigen parallel_for for (size_t i = 0; i < depth; ++i) { Eigen::Tensor<qint32, 0, Eigen::RowMajor> min =
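
Notes on the parallelization patterns used in this series:

The recurring change in mkl_qmatmul_op.cc above swaps "#pragma omp parallel for" for TensorFlow's Shard() helper from tensorflow/core/util/work_sharder.h: the n bias-compensation columns are split into contiguous [start, end) ranges and parallel_func runs once per range on the device's intra-op worker pool, with shard_cost acting as a rough per-element cost estimate that the sharder uses to decide how finely to split the work. For readers without the TF sources handy, here is a minimal standalone sketch of that contract; ShardSketch and its even block split are illustrative assumptions, not the library's actual scheduling policy.

#include <algorithm>
#include <cstdint>
#include <functional>
#include <thread>
#include <vector>

// Illustrative sketch, not TensorFlow's implementation: run
// parallel_func(start, end) over disjoint ranges covering [0, total),
// one range per worker, and wait for all of them to finish.
void ShardSketch(int num_threads, int64_t total,
                 const std::function<void(int64_t, int64_t)>& parallel_func) {
  num_threads = std::max(1, num_threads);
  const int64_t block = (total + num_threads - 1) / num_threads;
  std::vector<std::thread> workers;
  for (int64_t start = 0; start < total; start += block) {
    workers.emplace_back(parallel_func, start, std::min(start + block, total));
  }
  for (std::thread& t : workers) t.join();
}

With this shape, the parallel_func lambda from the patch (the wt_buf/comp_bias loop) drops in unchanged as the last argument.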
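
Patch 1 also routes the per-channel quantization-range loop through eigen_tp->parallel_for, where eigen_tp comes from MklDnnThreadPoolWrapper and wraps the context's Eigen thread pool (patches 3 and 5-6 later back this out in favor of a conditionally compiled omp pragma plus a TODO). Below is a sketch of how a blocking parallel_for can be built directly on Eigen's non-blocking thread pool; PoolParallelFor is a hypothetical helper for illustration, not the wrapper's actual code.

#include <functional>
#include <unsupported/Eigen/CXX11/ThreadPool>

// Hypothetical helper: schedule fn(i) for each i in [0, total) on the pool,
// then block until every task has signalled the barrier.
void PoolParallelFor(Eigen::ThreadPool* pool, int total,
                     const std::function<void(int)>& fn) {
  Eigen::Barrier barrier(static_cast<unsigned int>(total));
  for (int i = 0; i < total; ++i) {
    pool->Schedule([&fn, &barrier, i] {
      fn(i);
      barrier.Notify();
    });
  }
  barrier.Wait();
}

For the loop above, fn(n) would compute min_c[n] and max_c[n] for one channel. One task per channel keeps the sketch short; since each channel does only a couple of multiplies, a production version would batch many channels into each scheduled task.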
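
The range propagation that these patches thread an OpKernelContext through is worth restating: MklFloatForOneQuantizedLevel gives the float value of one quantized step of each factor, the product of the two step sizes is the step size of the qint32 result, and min_c/max_c are that product scaled by the qint32 extremes. A self-contained numeric illustration with plain integer types standing in for quint8/qint8/qint32 follows; the helper body mirrors the formula the TF helper is assumed to use, including the symmetric-range adjustment for signed types.

#include <cstdint>
#include <cstdio>
#include <limits>

// Float value represented by one quantized step of T over [range_min, range_max].
// Assumed formula for illustration; see MklFloatForOneQuantizedLevel in the header.
template <typename T>
float FloatForOneQuantizedLevel(float range_min, float range_max) {
  int64_t highest = static_cast<int64_t>(std::numeric_limits<T>::max());
  int64_t lowest = static_cast<int64_t>(std::numeric_limits<T>::min());
  if (lowest < -highest) lowest = -highest;  // symmetric range for signed types
  return (range_max - range_min) / static_cast<float>(highest - lowest);
}

int main() {
  // quint8 activations in [0, 6] multiplied by qint8 weights in [-2, 2].
  const float a = FloatForOneQuantizedLevel<uint8_t>(0.0f, 6.0f);  // ~0.0235
  const float b = FloatForOneQuantizedLevel<int8_t>(-2.0f, 2.0f);  // ~0.0157
  const float c = a * b;  // float value of one qint32 output level
  const float min_c = c * static_cast<float>(std::numeric_limits<int32_t>::min());
  const float max_c = c * static_cast<float>(std::numeric_limits<int32_t>::max());
  std::printf("min_c = %f, max_c = %f\n", min_c, max_c);  // roughly +/-7.96e5
  return 0;
}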
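
One subtlety in the requantization-range kernel: once ENABLE_MKLDNN_THREADPOOL is defined, the guarded pragma, and with it the reduction(max : out_min_max) clause that the trailing comment describes, compiles away, so the loop runs sequentially and out_min_max needs no special handling. If the remaining TODO is later resolved with a sharded loop, the max-reduction has to be rebuilt by hand. A sketch under that assumption, reusing the illustrative ShardSketch helper above (MaxOfRanges is likewise hypothetical):

#include <algorithm>
#include <cstdint>
#include <limits>
#include <mutex>
#include <vector>

// Hand-rolled stand-in for 'omp parallel for reduction(max : out_min_max)':
// each shard folds its slice of ranges[] into a local maximum, then merges
// it into the shared result under a mutex taken once per shard.
float MaxOfRanges(int num_threads, const std::vector<float>& ranges) {
  float out_min_max = std::numeric_limits<float>::min();
  std::mutex mu;
  ShardSketch(num_threads, static_cast<int64_t>(ranges.size()),
              [&](int64_t start, int64_t end) {
                float local = std::numeric_limits<float>::min();
                for (int64_t i = start; i < end; ++i) {
                  local = std::max(local, ranges[i]);
                }
                std::lock_guard<std::mutex> lock(mu);
                out_min_max = std::max(out_min_max, local);
              });
  return out_min_max;
}

The is_non_negative flag in the kernel would need the same per-shard-then-merge treatment, since concurrent writes to a shared bool are otherwise a data race.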