diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
index a99640f92a7..998881defe0 100644
--- a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/resize_nearest_neighbor_op.h"
 
 #include <memory>
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -28,6 +27,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/image_resizer_state.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/work_sharder.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 
@@ -68,13 +69,13 @@ class ResizeNearestNeighborOp : public OpKernel {
                                                 /*half_pixe_centers=*/true,
                                                 /*align_corners=*/true>()(
             context->eigen_device<Device>(), input_data, st.height_scale,
-            st.width_scale, output_data);
+            st.width_scale, output_data, context);
       } else {
         status = functor::ResizeNearestNeighbor<Device, T,
                                                 /*half_pixe_centers=*/true,
                                                 /*align_corners=*/false>()(
             context->eigen_device<Device>(), input_data, st.height_scale,
-            st.width_scale, output_data);
+            st.width_scale, output_data, context);
       }
     } else {
       if (align_corners_) {
@@ -82,13 +83,13 @@ class ResizeNearestNeighborOp : public OpKernel {
                                                 /*half_pixe_centers=*/false,
                                                 /*align_corners=*/true>()(
             context->eigen_device<Device>(), input_data, st.height_scale,
-            st.width_scale, output_data);
+            st.width_scale, output_data, context);
       } else {
         status = functor::ResizeNearestNeighbor<Device, T,
                                                 /*half_pixe_centers=*/false,
                                                 /*align_corners=*/false>()(
             context->eigen_device<Device>(), input_data, st.height_scale,
-            st.width_scale, output_data);
+            st.width_scale, output_data, context);
       }
     }
     if (!status) {
@@ -130,20 +131,28 @@ struct BoolToScaler<false> {
 namespace functor {
 template <typename T, bool half_pixel_centers, bool align_corners>
 struct ResizeNearestNeighbor<CPUDevice, T, half_pixel_centers, align_corners> {
-  bool operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
-                  const float height_scale, const float width_scale,
-                  typename TTypes<T, 4>::Tensor output) {
-    typename BoolToScaler<half_pixel_centers>::Scaler scaler;
+  // Copies, for every output pixel, the channel vector of its nearest input
+  // pixel. Output pixels are split across the CPU worker threads of `c`;
+  // when `c` is null the whole range is processed on the calling thread.
+  bool ParallelExecute(const CPUDevice& d,
+                       typename TTypes<T, 4>::ConstTensor input,
+                       const float height_scale, const float width_scale,
+                       typename TTypes<T, 4>::Tensor output,
+                       OpKernelContext* c) {
     const Eigen::Index batch_size = input.dimension(0);
     const Eigen::Index in_height = input.dimension(1);
     const Eigen::Index in_width = input.dimension(2);
     const Eigen::Index channels = input.dimension(3);
-
     const Eigen::Index out_height = output.dimension(1);
     const Eigen::Index out_width = output.dimension(2);
-
-    for (Eigen::Index b = 0; b < batch_size; ++b) {
-      for (Eigen::Index y = 0; y < out_height; ++y) {
+    typename BoolToScaler<half_pixel_centers>::Scaler scaler;
+    // Resizes output pixels [start, end), where each index is a flattened
+    // (batch, y, x) position with x varying fastest.
+    auto ParallelResize = [&](Eigen::Index start, Eigen::Index end) {
+      for (Eigen::Index b = start; b < end; ++b) {
+        Eigen::Index x = b % out_width;
+        Eigen::Index y = (b / out_width) % out_height;
+        Eigen::Index bs = (b / out_width) / out_height;
         Eigen::Index in_y = std::min(
             (align_corners)
                 ? static_cast<Eigen::Index>(roundf(scaler(y, height_scale)))
@@ -152,21 +161,39 @@ struct ResizeNearestNeighbor<CPUDevice, T, half_pixel_centers, align_corners> {
         if (half_pixel_centers) {
           in_y = std::max(static_cast<Eigen::Index>(0), in_y);
         }
-        for (Eigen::Index x = 0; x < out_width; ++x) {
-          Eigen::Index in_x = std::min(
-              (align_corners)
-                  ? static_cast<Eigen::Index>(roundf(scaler(x, width_scale)))
-                  : static_cast<Eigen::Index>(floorf(scaler(x, width_scale))),
-              in_width - 1);
-          if (half_pixel_centers) {
-            in_x = std::max(static_cast<Eigen::Index>(0), in_x);
-          }
-          std::copy_n(&input(b, in_y, in_x, 0), channels, &output(b, y, x, 0));
+        Eigen::Index in_x = std::min(
+            (align_corners)
+                ? static_cast<Eigen::Index>(roundf(scaler(x, width_scale)))
+                : static_cast<Eigen::Index>(floorf(scaler(x, width_scale))),
+            in_width - 1);
+        if (half_pixel_centers) {
+          in_x = std::max(static_cast<Eigen::Index>(0), in_x);
         }
+        std::copy_n(&input(bs, in_y, in_x, 0), channels, &output(bs, y, x, 0));
       }
-    }
+    };
+    const Eigen::Index N = batch_size * out_height * out_width;
+    if (c == nullptr) {
+      // Without a kernel context there is no worker pool to shard onto.
+      ParallelResize(0, N);
+      return true;
+    }
+    const DeviceBase::CpuWorkerThreads& worker_threads =
+        *(c->device()->tensorflow_cpu_worker_threads());
+    // TODO: Come up with a good cost estimate: 3500:26~27fps, 1000:27~28fps.
+    Shard(worker_threads.num_threads, worker_threads.workers, N, 1000.0,
+          ParallelResize);
     return true;
   }
+  // Functor entry point; forwards to ParallelExecute. `context` defaults to
+  // null to match the declaration in resize_nearest_neighbor_op.h.
+  bool operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
+                  const float height_scale, const float width_scale,
+                  typename TTypes<T, 4>::Tensor output,
+                  OpKernelContext* context = nullptr) {
+    return ParallelExecute(d, input, height_scale, width_scale, output,
+                           context);
+  }
 };
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op.h b/tensorflow/core/kernels/resize_nearest_neighbor_op.h
index d6b053180ce..3f564bf23cc 100644
--- a/tensorflow/core/kernels/resize_nearest_neighbor_op.h
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_RESIZE_NEAREST_NEIGHBOR_OP_H_
 #define TENSORFLOW_CORE_KERNELS_RESIZE_NEAREST_NEIGHBOR_OP_H_
 
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -27,7 +28,8 @@ template <typename Device, typename T, bool half_pixel_centers,
 struct ResizeNearestNeighbor {
   bool operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input,
                   const float height_scale, const float width_scale,
-                  typename TTypes<T, 4>::Tensor output);
+                  typename TTypes<T, 4>::Tensor output,
+                  OpKernelContext* context = nullptr);
 };
 
 template <typename Device, typename T, bool half_pixel_centers,
diff --git a/tensorflow/core/kernels/resize_op_benchmark_test.cc b/tensorflow/core/kernels/resize_op_benchmark_test.cc
index 6b424502f6f..81564c7d7e0 100644
--- a/tensorflow/core/kernels/resize_op_benchmark_test.cc
+++ b/tensorflow/core/kernels/resize_op_benchmark_test.cc
@@ -49,9 +49,11 @@ static Graph* BM_Resize(const char* algorithm, int batches, int width,
   BENCHMARK(BM_Resize_##ALGORITHM##_##DEVICE##_##B##_##W##_##H)
 
 BM_ResizeDev(cpu, ResizeNearestNeighbor, 10, 499, 499);
-BM_ResizeDev(gpu, ResizeNearestNeighbor, 10, 499, 499);
-
 BM_ResizeDev(cpu, ResizeBilinear, 10, 499, 499);
+
+#if GOOGLE_CUDA
+BM_ResizeDev(gpu, ResizeNearestNeighbor, 10, 499, 499);
 BM_ResizeDev(gpu, ResizeBilinear, 10, 499, 499);
+#endif
 
 }  // namespace tensorflow