From 2002d5e2834fe0150ac8007b620e1c33f462655a Mon Sep 17 00:00:00 2001
From: leslie-fang-intel <leslie.fang@intel.com>
Date: Wed, 20 Nov 2019 20:30:38 +0800
Subject: [PATCH] remove ParallelExecute and use d.parallelFor instead of Shard

---
 .../kernels/resize_nearest_neighbor_op.cc     | 29 +++++--------------
 .../core/kernels/resize_nearest_neighbor_op.h |  3 +-
 .../core/kernels/resize_op_benchmark_test.cc  |  2 +-
 3 files changed, 10 insertions(+), 24 deletions(-)

diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
index 998881defe0..71f964dc1f6 100644
--- a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
@@ -69,13 +69,13 @@ class ResizeNearestNeighborOp : public OpKernel {
                                                 /*half_pixe_centers=*/true,
                                                 /*align_corners=*/true>()(
             context->eigen_device<Device>(), input_data, st.height_scale,
-            st.width_scale, output_data, context);
+            st.width_scale, output_data);
       } else {
         status = functor::ResizeNearestNeighbor<Device, T,
                                                 /*half_pixe_centers=*/true,
                                                 /*align_corners=*/false>()(
             context->eigen_device<Device>(), input_data, st.height_scale,
-            st.width_scale, output_data, context);
+            st.width_scale, output_data);
       }
     } else {
       if (align_corners_) {
@@ -83,13 +83,13 @@ class ResizeNearestNeighborOp : public OpKernel {
                                                 /*half_pixe_centers=*/false,
                                                 /*align_corners=*/true>()(
             context->eigen_device<Device>(), input_data, st.height_scale,
-            st.width_scale, output_data, context);
+            st.width_scale, output_data);
       } else {
         status = functor::ResizeNearestNeighbor<Device, T,
                                                 /*half_pixe_centers=*/false,
                                                 /*align_corners=*/false>()(
             context->eigen_device<Device>(), input_data, st.height_scale,
-            st.width_scale, output_data, context);
+            st.width_scale, output_data);
       }
     }
     if (!status) {
@@ -131,13 +131,9 @@ struct BoolToScaler<false> {
 namespace functor {
 template <typename T, bool half_pixel_centers, bool align_corners>
 struct ResizeNearestNeighbor<CPUDevice, T, half_pixel_centers, align_corners> {
-  bool ParallelExecute(const CPUDevice& d,
-                       typename TTypes<T, 4>::ConstTensor input,
-                       const float height_scale, const float width_scale,
-                       typename TTypes<T, 4>::Tensor output,
-                       OpKernelContext* c) {
-    const DeviceBase::CpuWorkerThreads& worker_threads =
-        *(c->device()->tensorflow_cpu_worker_threads());
+  bool operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
+                  const float height_scale, const float width_scale,
+                  typename TTypes<T, 4>::Tensor output) {
     const Eigen::Index batch_size = input.dimension(0);
     const Eigen::Index in_height = input.dimension(1);
     const Eigen::Index in_width = input.dimension(2);
@@ -170,18 +166,9 @@ struct ResizeNearestNeighbor<CPUDevice, T, half_pixel_centers, align_corners> {
       }
     };
     Eigen::Index N = batch_size * out_height * out_width;
-    Shard(worker_threads.num_threads, worker_threads.workers, N, 1000.0,
-          ParallelResize);  // TODO: Come up with a good cost estimate:
-                            // 3500:26~27fps, 1000:27~28fps.
+    d.parallelFor(N, Eigen::TensorOpCost(0, 0, 1000.0), ParallelResize);
     return true;
   }
-  bool operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
-                  const float height_scale, const float width_scale,
-                  typename TTypes<T, 4>::Tensor output,
-                  OpKernelContext* context) {
-    return ParallelExecute(d, input, height_scale, width_scale, output,
-                           context);
-  }
 };
 }  // namespace functor
 
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op.h b/tensorflow/core/kernels/resize_nearest_neighbor_op.h
index 3f564bf23cc..46264790b34 100644
--- a/tensorflow/core/kernels/resize_nearest_neighbor_op.h
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op.h
@@ -28,8 +28,7 @@ template <typename Device, typename T, bool half_pixel_centers,
 struct ResizeNearestNeighbor {
   bool operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input,
                   const float height_scale, const float width_scale,
-                  typename TTypes<T, 4>::Tensor output,
-                  OpKernelContext* context = NULL);
+                  typename TTypes<T, 4>::Tensor output);
 };
 
 template <typename Device, typename T, bool half_pixel_centers,
diff --git a/tensorflow/core/kernels/resize_op_benchmark_test.cc b/tensorflow/core/kernels/resize_op_benchmark_test.cc
index 81564c7d7e0..e77cbf1c20a 100644
--- a/tensorflow/core/kernels/resize_op_benchmark_test.cc
+++ b/tensorflow/core/kernels/resize_op_benchmark_test.cc
@@ -51,7 +51,7 @@ static Graph* BM_Resize(const char* algorithm, int batches, int width,
 BM_ResizeDev(cpu, ResizeNearestNeighbor, 10, 499, 499);
 BM_ResizeDev(cpu, ResizeBilinear, 10, 499, 499);
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_ResizeDev(gpu, ResizeNearestNeighbor, 10, 499, 499);
 BM_ResizeDev(gpu, ResizeBilinear, 10, 499, 499);
 #endif