Prevent integer truncation from 64 to 32 bits.
The last argument of the `tensorflow::Shard` functions must be a two-argument function in which both arguments are `int64` (`long long`, 64 bits). However, some call sites pass in a function whose arguments are `int` or `int32` (32 bits). In these cases, the integer truncation can later cause a segfault or other unexpected behavior.

PiperOrigin-RevId: 332560414
Change-Id: Ief649406babc8d4f60b3e7a9d573cbcc5ce5b767
parent b946521465
commit ca8c013b5e
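For illustration, here is a minimal self-contained sketch of the hazard the commit message describes. This is not TensorFlow source: `RunShard` is a hypothetical stand-in for the dispatch inside `tensorflow::Shard`, which invokes its callback with `int64` shard bounds. A lambda written with `int32` parameters still converts to the expected callback type, so the 64-bit bounds are silently narrowed on every call.

// Hedged sketch (not TensorFlow source). RunShard is a hypothetical
// stand-in for tensorflow::Shard's work dispatch.
#include <cstdint>
#include <functional>
#include <iostream>

using int32 = std::int32_t;
using int64 = std::int64_t;

// Stand-in for Shard's dispatch: the callback type takes 64-bit bounds.
void RunShard(int64 total, const std::function<void(int64, int64)>& work) {
  work(0, total);  // the real Shard splits [0, total) across workers
}

int main() {
  const int64 total = (int64{1} << 32) + 10;  // exceeds the int32 range

  // Buggy pattern (pre-fix): the int32 lambda still converts to the
  // std::function above, but each bound is narrowed (modulo 2^32 on
  // typical targets), so this callback sees [0, 10) and silently
  // skips roughly 4.3 billion elements.
  RunShard(total, [](int32 start, int32 end) {
    std::cout << "int32 callback sees [" << start << ", " << end << ")\n";
  });

  // Fixed pattern (this commit): int64 parameters keep the full range.
  RunShard(total, [](int64 start, int64 end) {
    std::cout << "int64 callback sees [" << start << ", " << end << ")\n";
  });
}

Depending on the value, the narrowed bound can also come out negative, which is how the truncation turns into out-of-bounds indexing and the segfaults mentioned above.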
@@ -121,7 +121,7 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
     auto do_work = [&resource, &bucketized_features, &cached_tree_ids,
                     &cached_node_ids, &output_partial_logits,
                     &output_node_ids, latest_tree,
-                    this](int32 start, int32 end) {
+                    this](int64 start, int64 end) {
       for (int32 i = start; i < end; ++i) {
         int32 tree_id = cached_tree_ids(i);
         int32 node_id = cached_node_ids(i);
@@ -237,7 +237,7 @@ class BoostedTreesPredictOp : public OpKernel {
 
     const int32 last_tree = resource->num_trees() - 1;
     auto do_work = [&resource, &bucketized_features, &output_logits, last_tree,
-                    this](int32 start, int32 end) {
+                    this](int64 start, int64 end) {
       for (int32 i = start; i < end; ++i) {
         std::vector<float> tree_logits(logits_dimension_, 0.0);
         int32 tree_id = 0;
@@ -340,7 +340,7 @@ class BoostedTreesExampleDebugOutputsOp : public OpKernel {
     // path. Note: feature_ids has one less value than logits_path because the
     // first value of each logit path will be the bias.
     auto do_work = [&resource, &bucketized_features, &output_debug_info,
-                    last_tree](int32 start, int32 end) {
+                    last_tree](int64 start, int64 end) {
       for (int32 i = start; i < end; ++i) {
         // Proto to store debug outputs, per example.
         boosted_trees::DebugOutput example_debug_info;
|
@@ -223,7 +223,7 @@ struct CropAndResize<CPUDevice, T> {
     const int depth = crops.dimension(3);
 
     // Sharding across boxes.
-    auto CropAndResizePerBox = [&](int start_box, int limit_box) {
+    auto CropAndResizePerBox = [&](int64 start_box, int64 limit_box) {
       for (int b = start_box; b < limit_box; ++b) {
         const float y1 = boxes(b, 0);
         const float x1 = boxes(b, 1);
@@ -449,7 +449,7 @@ struct CropAndResizeBackpropImage<CPUDevice, T> {
 
     grads_image.setZero();
 
-    auto CropAndResizeBackImgPerBox = [&](int start_box, int limit_box) {
+    auto CropAndResizeBackImgPerBox = [&](int64 start_box, int64 limit_box) {
       for (int b = start_box; b < limit_box; ++b) {
         const float y1 = boxes(b, 0);
         const float x1 = boxes(b, 1);
|
@@ -193,7 +193,8 @@ struct LaunchBatchBandedTriangularSolve {
 
     Shard(worker_threads.num_threads, worker_threads.workers, batch_size,
           cost_per_unit,
-          [&in_x, &in_y, adjoint, lower, &bcast, out](int start, int limit) {
+          [&in_x, &in_y, adjoint, lower, &bcast, out](int64 start,
+                                                      int64 limit) {
            SequentialBandedTriangularSolveKernel<Scalar>::Run(
                in_x, in_y, lower, adjoint, bcast, out, start, limit);
          });
|
@@ -95,7 +95,8 @@ struct NthElementFunctor<CPUDevice, T> {
     const int last_dim = input_tensor.dim_size(input_tensor.dims() - 1);
 
     // Allocate each row to different shard.
-    auto SubNthElement = [&, input, output, last_dim, n](int start, int limit) {
+    auto SubNthElement = [&, input, output, last_dim, n](int64 start,
+                                                         int64 limit) {
       // std::nth_element would rearrange the array, so we need a new buffer.
       std::vector<T> buf(last_dim);
 
|
@@ -70,8 +70,8 @@ struct TruncatedNormalFunctor<CPUDevice, T> {
 
     auto do_work = [samples_per_batch, num_elements, &ctx, &means, &stddevs,
                     &minvals, &maxvals, &gen, &output,
-                    kStdDevsInsideBoundsToUseRandnSampler](int start_batch,
-                                                           int limit_batch) {
+                    kStdDevsInsideBoundsToUseRandnSampler](int64 start_batch,
+                                                           int64 limit_batch) {
       // Capturing "gen" by-value would only make a copy for the _shared_
       // lambda. Since we want to let each worker have its own copy, we pass
       // "gen" by reference and explicitly do a copy assignment here.
@@ -333,8 +333,8 @@ struct TruncatedNormalFunctorV2<CPUDevice, T> {
 
     auto do_work = [num_batches, samples_per_batch, &ctx, &bcast, &means,
                     &stddevs, &minvals, &maxvals, &gen, &output,
-                    kStdDevsInsideBoundsToUseRandnSampler](int start_output,
-                                                           int limit_output) {
+                    kStdDevsInsideBoundsToUseRandnSampler](int64 start_output,
+                                                           int64 limit_output) {
       // Capturing "gen" by-value would only make a copy for the _shared_
       // lambda. Since we want to let each worker have its own copy, we pass
       // "gen" by reference and explicitly do a copy assignment here.
|
@@ -184,7 +184,7 @@ struct RandomBinomialFunctor<CPUDevice, T, U> {
     // the sample shape and [H1, ... Hm] for the batch shape of the samples.
     // We have B1 * ... * Bk samples per batch member we need.
     auto DoWork = [num_batches, samples_per_batch, &bcast, &counts, &probs,
-                   &gen, &output](int start_output, int limit_output) {
+                   &gen, &output](int64 start_output, int64 limit_output) {
       // Vectorized intermediate calculations for uniform rejection sampling.
       // We always generate at most 4 samples.
       Eigen::array<T, 4> z;
|
@@ -97,7 +97,7 @@ struct PoissonFunctor<CPUDevice, T, U> {
     typedef random::UniformDistribution<random::PhiloxRandom, CT> Uniform;
 
     auto DoWork = [num_samples, num_rate, &rng, samples_flat, rate_flat](
-                      int start_output, int limit_output) {
+                      int64 start_output, int64 limit_output) {
       // Capturing "rng" by value would only make a copy for the _shared_
       // lambda. Since we want to let each worker have its own copy, we pass
       // "rng" by reference and explicitly do a copy assignment.
|
@@ -252,7 +252,7 @@ class StatelessRandomGammaOp : public StatelessRandomOpBase {
     // avoid a couple flops which can be done on a per-alpha basis.
 
     auto DoWork = [samples_per_alpha, num_alphas, &random, samples_flat,
-                   alpha_flat](int start_output, int limit_output) {
+                   alpha_flat](int64 start_output, int64 limit_output) {
       // Capturing "random" by-value would only make a copy for the _shared_
       // lambda. Since we want to let each worker have its own copy, we pass
       // "random" by reference and explicitly do a copy assignment.
|
@@ -136,7 +136,7 @@ struct TopKFunctor<CPUDevice, T> {
       return Status::OK();
     }
 
-    auto SortIndices = [&](int start_batch, int limit_batch) {
+    auto SortIndices = [&](int64 start_batch, int64 limit_batch) {
      for (int32 b = start_batch; b < limit_batch; ++b) {
        const T* input_data = &input(b, 0);
        const auto stable_comp = [input_data](const int32 a, const int32 b) {