Preallocate vector storage when the ultimate vector size is known in advance

PiperOrigin-RevId: 157724431
2017-06-01 09:42:04 -07:00 · 2017-06-01 09:42:04 -07:00 · eb10a4c494
commit eb10a4c494
parent ce32228c49
69 changed files with 95 additions and 0 deletions
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@ -805,6 +805,7 @@ void TF_GraphSetTensorShape(TF_Graph* graph, TF_Output output,
  }

  std::vector<tensorflow::shape_inference::DimensionHandle> dim_vec;
+  dim_vec.reserve(num_dims);
  for (int i = 0; i < num_dims; ++i) {
    dim_vec.push_back(ic->MakeDim(dims[i]));
  }
--- a/tensorflow/cc/client/client_session.cc
+++ b/tensorflow/cc/client/client_session.cc
@ -113,10 +113,12 @@ Status ClientSession::Run(const RunOptions& run_options, const FeedType& inputs,
    feeds.emplace_back(feed.first.name(), feed.second.tensor);
  }
  std::vector<string> output_tensor_names;
+  output_tensor_names.reserve(fetch_outputs.size());
  for (auto const& output : fetch_outputs) {
    output_tensor_names.push_back(output.name());
  }
  std::vector<string> target_node_names;
+  target_node_names.reserve(run_outputs.size());
  for (auto const& output : run_outputs) {
    target_node_names.push_back(output.node()->name());
  }
--- a/tensorflow/cc/framework/gradient_checker.cc
+++ b/tensorflow/cc/framework/gradient_checker.cc
@ -44,6 +44,7 @@ Status ComputeTheoreticalJacobianTranspose(
  size_t x_num = x_shapes.size();
  // Call AddSymbolicGradients to get 'dxs' (we will feed 'dys').
  OutputList dys;
+  dys.reserve(y_shapes.size());
  for (const auto& y_shape : y_shapes) {
    // TODO(suharshs): This currently assumes that all x's are the same type.
    dys.push_back(Cast(scope, Const(scope, 1.0, y_shape), xs[0].type()));
--- a/tensorflow/compiler/aot/compile.cc
+++ b/tensorflow/compiler/aot/compile.cc
@ -350,6 +350,7 @@ Status CompileXla(xla::CompileOnlyClient* client,
  compile_result->program_shape = *pshape_or.ValueOrDie();
  xla::ProgramShape* pshape = &compile_result->program_shape;
  std::vector<const xla::Shape*> arg_layouts;
+  arg_layouts.reserve(pshape->parameters_size());
  for (int i = 0; i < pshape->parameters_size(); ++i) {
    arg_layouts.push_back(pshape->mutable_parameters(i));
  }
--- a/tensorflow/compiler/tf2xla/kernels/fill_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fill_op.cc
@ -50,6 +50,7 @@ class FillOp : public XlaOpKernel {
    // Convert the dims literal into a vector that we can pass to
    // ComputationBuilder.
    std::vector<int64> broadcast;
+    broadcast.reserve(dims_literal.shape().dimensions(0));
    for (int i = 0; i < dims_literal.shape().dimensions(0); ++i) {
      broadcast.push_back(xla::LiteralUtil::Get<int>(dims_literal, {i}));
    }
--- a/tensorflow/compiler/tf2xla/kernels/slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/slice_op.cc
@ -50,6 +50,7 @@ class SliceOp : public XlaOpKernel {
    // slice will be an empty handle if the output has no elements.
    CHECK_EQ(begin.size(), size.size());
    std::vector<int64> limits;
+    limits.reserve(begin.size());
    for (int i = 0; i < begin.size(); ++i) {
      limits.push_back(begin[i] + size[i]);
    }
--- a/tensorflow/compiler/xla/service/allocation_tracker.cc
+++ b/tensorflow/compiler/xla/service/allocation_tracker.cc
@ -171,6 +171,7 @@ StatusOr<std::vector<GlobalDataHandle>> AllocationTracker::DeconstructTuple(
          executor, allocation->device_memory(), allocation->shape()));

  std::vector<GlobalDataHandle> element_handles;
+  element_handles.reserve(element_bases.size());
  for (int i = 0; i < element_bases.size(); ++i) {
    element_handles.push_back(RegisterInternal(
        allocation->backend(), allocation->device_ordinal(), element_bases[i],
--- a/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc
@ -254,6 +254,7 @@ TEST_F(HloScheduleTest, LatticeMatMul) {
  //      d40      -- layer 4
  HloComputation::Builder builder("entry_computation");
  std::vector<HloInstruction*> params;
+  params.reserve(6);
  for (int i = 0; i < 6; ++i) {
    params.push_back(builder.AddInstruction(HloInstruction::CreateParameter(
        i, f32_2x2_, /*name=*/tensorflow::strings::Printf("param%d", i))));
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@ -1631,6 +1631,7 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildKernelThunk(

  // Compute the input buffer indices.
  std::vector<BufferAllocation::Slice> io_buffers;
+  io_buffers.reserve(io_hlos.size());
  for (const HloInstruction* io_hlo : io_hlos) {
    io_buffers.push_back(GetAllocationSlice(*LatestNonGteAncestor(io_hlo)));
  }
--- a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc
@ -86,6 +86,7 @@ TEST_F(StreamAssignmentTest, LatticeMatMul) {
  //      d40      -- layer 4
  HloComputation::Builder builder("entry_computation");
  std::vector<HloInstruction*> params;
+  params.reserve(6);
  for (int i = 0; i < 6; ++i) {
    params.push_back(builder.AddInstruction(HloInstruction::CreateParameter(
        i, f32_2x2_, /*name=*/tensorflow::strings::Printf("param%d", i))));
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@ -1484,6 +1484,7 @@ string HloInstruction::ToString(bool compact_operands,
  }
  if (!slice_starts_.empty() && !slice_limits_.empty()) {
    std::vector<string> bounds;
+    bounds.reserve(slice_starts_.size());
    for (int i = 0; i < slice_starts_.size(); ++i) {
      bounds.push_back(
          StrCat("[", slice_starts_[i], ":", slice_limits_[i], "]"));
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@ -649,6 +649,7 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
        ResolveAndValidateArguments(request.arguments(), execute_backend_.get(),
                                    executor->device_ordinal()));
    std::vector<se::DeviceMemoryBase> arguments;
+    arguments.reserve(arg_allocations.size());
    for (const Allocation* allocation : arg_allocations) {
      arguments.push_back(allocation->device_memory());
    }
@ -677,6 +678,7 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
      BuildExecutables(versioned_handles, std::move(module_configs),
                       execute_backend_.get(), executors));
  std::vector<Executable*> executable_ptrs;
+  executable_ptrs.reserve(executables.size());
  for (const auto& executable : executables) {
    executable_ptrs.push_back(executable.get());
  }
@ -752,6 +754,7 @@ tensorflow::Status Service::Execute(const ExecuteRequest* arg,
          << module_config->entry_computation_layout().ToString();

  std::vector<se::DeviceMemoryBase> arguments;
+  arguments.reserve(arg_allocations.size());
  for (const Allocation* allocation : arg_allocations) {
    arguments.push_back(allocation->device_memory());
  }
@ -820,6 +823,7 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
          << module_config->entry_computation_layout().ToString();

  std::vector<se::DeviceMemoryBase> arguments;
+  arguments.reserve(arg_allocations.size());
  for (const Allocation* allocation : arg_allocations) {
    arguments.push_back(allocation->device_memory());
  }
--- a/tensorflow/compiler/xla/service/user_computation.cc
+++ b/tensorflow/compiler/xla/service/user_computation.cc
@ -2467,6 +2467,7 @@ void ComputationLowerer::Visit(
      // to append dimensions on the left the broadcast_dimensions should just
      // be the n highest dimension numbers of the output shape where n is
      // the number of input dimensions.
+      broadcast_dimensions.reserve(ShapeUtil::Rank(operand->shape()));
      for (int i = 0; i < ShapeUtil::Rank(operand->shape()); ++i) {
        broadcast_dimensions.push_back(i +
                                       ShapeUtil::Rank(request.output_shape()) -
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@ -829,6 +829,7 @@ TEST_P(ArrayElementwiseOpTestParamCount, SquareManyValues) {
  const int count = GetParam();
  ComputationBuilder builder(client_, TestName());
  std::vector<float> values;
+  values.reserve(count);
  for (int i = 0; i < count; ++i) {
    values.push_back(i / static_cast<float>(count));
  }
@ -836,6 +837,7 @@ TEST_P(ArrayElementwiseOpTestParamCount, SquareManyValues) {
  auto exp = builder.Pow(x, builder.ConstantR0<float>(2.0f));

  std::vector<float> expected;
+  expected.reserve(values.size());
  for (float value : values) {
    expected.push_back(value * value);
  }
--- a/tensorflow/compiler/xla/tests/log_test.cc
+++ b/tensorflow/compiler/xla/tests/log_test.cc
@ -47,6 +47,7 @@ TEST_F(LogTest, LogTenValues) {
  builder.Log(x);

  std::vector<float> expected;
+  expected.reserve(input.size());
  for (float f : input) {
    expected.push_back(std::log(f));
  }
--- a/tensorflow/compiler/xla/tests/params_test.cc
+++ b/tensorflow/compiler/xla/tests/params_test.cc
@ -246,6 +246,7 @@ XLA_TEST_F(ParamsTest, HundredLargeR1Parameters) {
  }

  std::vector<GlobalData*> param_data;
+  param_data.reserve(param_data_owner.size());
  for (const std::unique_ptr<GlobalData>& data : param_data_owner) {
    param_data.push_back(data.get());
  }
--- a/tensorflow/compiler/xla/tests/slice_test.cc
+++ b/tensorflow/compiler/xla/tests/slice_test.cc
@ -37,6 +37,7 @@ class SliceTest : public ClientLibraryTestBase {
  template <typename NativeT>
  void RunSliceTenToTwo() {
    std::vector<NativeT> constant;
+    constant.reserve(10);
    for (int i = 0; i < 10; ++i) {
      constant.push_back(static_cast<NativeT>(i));
    }
--- a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
@ -64,6 +64,7 @@ TEST_F(VecOpsSimpleTest, ExpManyValues) {
  for (int count : {63, 64, 65, 127, 128, 129, 17 * 4096}) {
    ComputationBuilder builder(client_, TestName());
    std::vector<float> exponents;
+    exponents.reserve(count);
    for (int i = 0; i < count; ++i) {
      exponents.push_back(i / static_cast<float>(count));
    }
@ -71,6 +72,7 @@ TEST_F(VecOpsSimpleTest, ExpManyValues) {
    auto exp = builder.Exp(x);

    std::vector<float> expected;
+    expected.reserve(exponents.size());
    for (float exponent : exponents) {
      expected.push_back(std::exp(exponent));
    }
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
@ -81,6 +81,7 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
        client->GetComputationShape(computation).ConsumeValueOrDie();

    std::vector<const Shape*> layouts;
+    layouts.reserve(program_shape->parameters_size());
    for (int i = 0; i < program_shape->parameters_size(); ++i) {
      layouts.push_back(&program_shape->parameters(i));
    }
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
@ -56,6 +56,7 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args, bool compile) {
          client->GetComputationShape(computation).ConsumeValueOrDie();

      std::vector<const Shape*> layouts;
+      layouts.reserve(program_shape->parameters_size());
      for (int i = 0; i < program_shape->parameters_size(); ++i) {
        layouts.push_back(&program_shape->parameters(i));
      }
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@ -74,6 +74,7 @@ StatusOr<std::unique_ptr<Literal>> ReplayComputation(
  }

  std::vector<GlobalData*> execute_arguments;
+  execute_arguments.reserve(arguments.size());
  for (auto& argument : arguments) {
    execute_arguments.push_back(argument.get());
  }
--- a/tensorflow/contrib/batching/kernels/batch_kernels.cc
+++ b/tensorflow/contrib/batching/kernels/batch_kernels.cc
@ -347,6 +347,7 @@ class BatchResource : public ResourceBase {

      // Concatenate the tasks ith input tensors into a big output tensor.
      std::vector<Tensor> to_concatenate;
+      to_concatenate.reserve(batch->num_tasks());
      for (int task_idx = 0; task_idx < batch->num_tasks(); ++task_idx) {
        to_concatenate.push_back(batch->task(task_idx).inputs.at(i));
      }
--- a/tensorflow/contrib/batching/shared_batch_scheduler_test.cc
+++ b/tensorflow/contrib/batching/shared_batch_scheduler_test.cc
@ -139,6 +139,7 @@ TEST(SharedBatchSchedulerTest, ObeyBatchSizeConstraint) {
                   &callback_data](std::unique_ptr<Batch<FakeTask>> batch) {
    ASSERT_TRUE(batch->IsClosed());
    std::vector<size_t> batch_data;
+    batch_data.reserve(batch->num_tasks());
    for (int i = 0; i < batch->num_tasks(); ++i) {
      batch_data.push_back(batch->mutable_task(i)->size());
    }
--- a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc
@ -295,6 +295,7 @@ void ExpectVecsEquiv(const std::vector<float>& vec1,
 std::vector<float> GetWeightsByIndex(const std::vector<float>& weights,
                                     const std::vector<int>& indices) {
  std::vector<float> res;
+  res.reserve(indices.size());
  for (const int index : indices) {
    res.push_back(weights[index]);
  }
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc
@ -94,6 +94,7 @@ TEST(FfmpegLibTest, TestRoundTripGeneratedWav) {
  }

  std::vector<float> sine_wave;
+  sine_wave.reserve(20000);
  for (int i = 0; i < 20000; ++i) {
    sine_wave.push_back(std::sin(6.28 * 440.0 * i / 20000.0));
  }
--- a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc
+++ b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc
@ -494,6 +494,7 @@ class SparseFeatureCrossOp : public OpKernel {
    ExtractFeatureData(indices_list_in, batch_size, &feature_counts,
                       &feature_start_indices);

+    columns.reserve(values_list_in.size());
    for (int i = 0; i < values_list_in.size(); ++i) {
      columns.emplace_back(new SparseTensorColumn<InternalType>(
          values_list_in[i], std::move(feature_counts[i]),
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
@ -324,6 +324,7 @@ static void BM_AllocationDelayed(int iters, int delay) {
  int size_index = 0;

  std::vector<void*> ptrs;
+  ptrs.reserve(delay);
  for (int i = 0; i < delay; i++) {
    ptrs.push_back(nullptr);
  }
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
@ -123,10 +123,12 @@ void Benchmark::RunWithArgs(
  }
  // Gets inputs' and outputs' rendezvous keys.
  std::vector<std::pair<string, Tensor>> in;
+  in.reserve(inputs.size());
  for (const auto& p : inputs) {
    in.push_back({GetRendezvousKey(p.first), p.second});
  }
  std::vector<string> out;
+  out.reserve(outputs.size());
  for (const auto& n : outputs) {
    out.push_back(GetRendezvousKey(n));
  }
--- a/tensorflow/core/common_runtime/session_factory.cc
+++ b/tensorflow/core/common_runtime/session_factory.cc
@ -94,6 +94,7 @@ Status SessionFactory::GetFactory(const SessionOptions& options,
    // TODO(mrry): Consider providing a system-default fallback option
    // in this case.
    std::vector<string> factory_types;
+    factory_types.reserve(candidate_factories.size());
    for (const auto& candidate_factory : candidate_factories) {
      factory_types.push_back(candidate_factory.first);
    }
--- a/tensorflow/core/common_runtime/shape_refiner_test.cc
+++ b/tensorflow/core/common_runtime/shape_refiner_test.cc
@ -259,6 +259,7 @@ REGISTER_OP("ShapeData")
      }

      std::vector<shape_inference::DimensionHandle> dims;
+      dims.reserve(shape_data->NumElements());
      for (int i = 0; i < shape_data->NumElements(); ++i) {
        dims.emplace_back(c->MakeDim(shape_data->flat<int32>()(i)));
      }
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@ -746,6 +746,7 @@ Status ConcatShapeHelper(InferenceContext* c, int start_value_index,
    }
    // Build result of <rank> different unknown dims.
    std::vector<DimensionHandle> dims;
+    dims.reserve(rank);
    for (int i = 0; i < rank; ++i) dims.push_back(c->UnknownDim());
    c->set_output(0, c->MakeShape(dims));
    return Status::OK();
--- a/tensorflow/core/framework/resource_mgr.cc
+++ b/tensorflow/core/framework/resource_mgr.cc
@ -96,6 +96,7 @@ string ResourceMgr::DebugString() const {
    }
  }
  std::vector<string> text;
+  text.reserve(lines.size());
  for (const Line& line : lines) {
    text.push_back(strings::Printf(
        "%-20s | %-40s | %-40s | %-s", line.container->c_str(),
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@ -565,6 +565,7 @@ Status InferenceContext::MakeShapeFromTensor(const Tensor* t,
    }
    const auto num_dims = Value(shape_dim);
    std::vector<DimensionHandle> dims;
+    dims.reserve(num_dims);
    for (int i = 0; i < num_dims; i++) dims.push_back(UnknownDim());
    return ReturnCreatedShape(dims, out);
  }
--- a/tensorflow/core/framework/shape_inference_test.cc
+++ b/tensorflow/core/framework/shape_inference_test.cc
@ -783,6 +783,7 @@ TEST_F(ShapeInferenceTest, MakeShape) {
  std::vector<DimensionHandle> dims;
  auto in0 = c.input(0);
  const int rank = c.Rank(in0);
+  dims.reserve(rank);
  for (int i = 0; i < rank; ++i) {
    dims.push_back(c.Dim(in0, rank - i - 1));
  }
--- a/tensorflow/core/framework/shape_inference_testutil_test.cc
+++ b/tensorflow/core/framework/shape_inference_testutil_test.cc
@ -51,6 +51,7 @@ string RunInferShapes(const string& op_name, const string& ins,
  ShapeInferenceTestOp op(op_name);
  const int num_inputs = 1 + std::count(ins.begin(), ins.end(), ';');
  std::vector<NodeDefBuilder::NodeOut> src_list;
+  src_list.reserve(num_inputs);
  for (int i = 0; i < num_inputs; ++i) src_list.emplace_back("a", 0, DT_FLOAT);
  NodeDef node_def;
  TF_CHECK_OK(NodeDefBuilder("dummy", op_name)
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@ -496,6 +496,7 @@ Status GraphConstructor::ModifyNodeDefForImport(NodeDef* node_def) {
 void RemoveInputs(NodeDef* node_def, const std::vector<int>& inputs_to_remove) {
  // TODO(skyewm): is there a better way to do this?
  std::vector<string> inputs;
+  inputs.reserve(node_def->input_size());
  for (int i = 0; i < node_def->input_size(); ++i) {
    inputs.push_back(node_def->input(i));
  }
--- a/tensorflow/core/graph/graph_test.cc
+++ b/tensorflow/core/graph/graph_test.cc
@ -110,6 +110,7 @@ class GraphTest : public ::testing::Test {
  // are readable.
  static std::vector<string> Stringify(const std::vector<Node*>& nodes) {
    std::vector<string> result;
+    result.reserve(nodes.size());
    for (Node* n : nodes) {
      result.push_back(n->DebugString());
    }
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@ -536,6 +536,7 @@ class AddNProcessor : public AgnosticNodeProcessor {
 protected:
  std::vector<int> GetInputPos() const override {
    std::vector<int> input_pos;
+    input_pos.reserve(node_->input_size());
    for (int i = 0; i < node_->input_size(); i++) {
      input_pos.push_back(i);
    }
--- a/tensorflow/core/kernels/adjust_contrast_op_test.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op_test.cc
@ -73,6 +73,7 @@ TEST_F(AdjustContrastOpTest, Big_99x99x3) {
  TF_EXPECT_OK(InitOp());

  std::vector<float> values;
+  values.reserve(99 * 99 * 3);
  for (int i = 0; i < 99 * 99 * 3; ++i) {
    values.push_back(i % 255);
  }
--- a/tensorflow/core/kernels/dequantize_op_test.cc
+++ b/tensorflow/core/kernels/dequantize_op_test.cc
@ -105,6 +105,7 @@ static void BM_DequantizeMinCombinedCpu(int iters) {
  auto root = Scope::NewRootScope().ExitOnError();
  const int64 num_values = 1500 * 250;
  std::vector<T> inputs;
+  inputs.reserve(num_values);
  for (int i = 0; i < num_values; ++i) inputs.push_back(i);
  ops::Dequantize(root, test::AsTensor<T>(inputs),
                  test::AsTensor<float>({-1.5f}),
--- a/tensorflow/core/kernels/dynamic_partition_op.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op.cc
@ -104,6 +104,7 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared {
      const auto data_flat = data->flat<T>();
      std::vector<Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>,
                                   Eigen::Aligned> > out_vec;
+      out_vec.reserve(num_partitions_);
      for (int p = 0; p < num_partitions_; p++) {
        out_vec.push_back(outputs[p]->vec<T>());
      }
@ -124,6 +125,7 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared {
      // If data has extra dimensions, use Eigen slices
      std::vector<Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>,
                                   Eigen::Aligned> > out_flat;
+      out_flat.reserve(num_partitions_);
      for (int p = 0; p < num_partitions_; p++) {
        out_flat.push_back(outputs[p]->flat_outer_dims<T>());
      }
--- a/tensorflow/core/kernels/fractional_max_pool_op.cc
+++ b/tensorflow/core/kernels/fractional_max_pool_op.cc
@ -245,9 +245,11 @@ class FractionalMaxPoolGradOp : public OpKernel {
    constexpr int tensor_in_and_out_dims = 4;
    std::vector<int64> input_size;
    std::vector<int64> output_size;
+    input_size.reserve(tensor_in_and_out_dims);
    for (int i = 0; i < tensor_in_and_out_dims; ++i) {
      input_size.push_back(tensor_in.dim_size(i));
    }
+    output_size.reserve(tensor_in_and_out_dims);
    for (int i = 0; i < tensor_in_and_out_dims; ++i) {
      output_size.push_back(tensor_out.dim_size(i));
    }
--- a/tensorflow/core/kernels/gather_op_test.cc
+++ b/tensorflow/core/kernels/gather_op_test.cc
@ -164,6 +164,7 @@ static Graph* Gather(int dim) {
  random::PhiloxRandom philox(301, 17);
  random::SimplePhilox rnd(&philox);
  std::vector<Index> indices_vec;
+  indices_vec.reserve(kLookups);
  for (int i = 0; i < kLookups; i++) {
    indices_vec.push_back(rnd.Uniform(kRows));
  }
--- a/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc
+++ b/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc
@ -29,6 +29,7 @@ TEST(MfccMelFilterbankTest, AgreesWithPythonGoldenValues) {

  std::vector<double> input;
  const int kSampleCount = 513;
+  input.reserve(kSampleCount);
  for (int i = 0; i < kSampleCount; ++i) {
    input.push_back(i + 1);
  }
--- a/tensorflow/core/kernels/mfcc_test.cc
+++ b/tensorflow/core/kernels/mfcc_test.cc
@ -26,6 +26,7 @@ TEST(MfccTest, AgreesWithPythonGoldenValues) {
  Mfcc mfcc;
  std::vector<double> input;
  const int kSampleCount = 513;
+  input.reserve(kSampleCount);
  for (int i = 0; i < kSampleCount; ++i) {
    input.push_back(i + 1);
  }
@ -51,6 +52,7 @@ TEST(MfccTest, AvoidsNansWithZeroInput) {
  Mfcc mfcc;
  std::vector<double> input;
  const int kSampleCount = 513;
+  input.reserve(kSampleCount);
  for (int i = 0; i < kSampleCount; ++i) {
    input.push_back(0.0);
  }
--- a/tensorflow/core/kernels/quantization_utils_test.cc
+++ b/tensorflow/core/kernels/quantization_utils_test.cc
@ -37,6 +37,7 @@ void TestRequantizeMany(Eigen::ThreadPoolDevice* eigen_device, float input_min,
                        int tolerance = 1) {
  const int values_count = values_quantized.size();
  std::vector<quint8> expected_values;
+  expected_values.reserve(values_count);
  for (int value_index = 0; value_index < values_count; ++value_index) {
    expected_values.push_back(FloatToQuantized<quint8>(
        QuantizedToFloat(values_quantized[value_index], input_min, input_max),
@ -78,6 +79,7 @@ void TestRequantizeMany8To32Bit(float input_min, float input_max,
                                int tolerance = 256) {
  const int values_count = values_quantized.size();
  std::vector<qint32> expected_values;
+  expected_values.reserve(values_count);
  for (int value_index = 0; value_index < values_count; ++value_index) {
    expected_values.push_back(FloatToQuantized<qint32>(
        QuantizedToFloat(values_quantized[value_index], input_min, input_max),
--- a/tensorflow/core/kernels/sdca_ops_test.cc
+++ b/tensorflow/core/kernels/sdca_ops_test.cc
@ -57,6 +57,7 @@ Node* Var(Graph* const g, const int n) {
 std::vector<Node*> VarVector(Graph* const g, const int nodes,
                             const int node_size) {
  std::vector<Node*> result;
+  result.reserve(nodes);
  for (int i = 0; i < nodes; ++i) {
    result.push_back(Var(g, node_size));
  }
@ -164,6 +165,7 @@ void GetGraphs(const int32 num_examples, const int32 num_sparse_feature_groups,
      sparse_weights.push_back(NodeBuilder::NodeOut(n));
    }
    std::vector<NodeBuilder::NodeOut> dense_weights;
+    dense_weights.reserve(dense_weight_nodes.size());
    for (Node* n : dense_weight_nodes) {
      dense_weights.push_back(NodeBuilder::NodeOut(n));
    }
@ -171,20 +173,24 @@ void GetGraphs(const int32 num_examples, const int32 num_sparse_feature_groups,
    std::vector<NodeBuilder::NodeOut> sparse_example_indices;
    std::vector<NodeBuilder::NodeOut> sparse_feature_indices;
    std::vector<NodeBuilder::NodeOut> sparse_values;
+    sparse_example_indices.reserve(num_sparse_feature_groups);
    for (int i = 0; i < num_sparse_feature_groups; ++i) {
      sparse_example_indices.push_back(NodeBuilder::NodeOut(
          SparseExampleIndices(g, sparse_features_per_group, num_examples)));
    }
+    sparse_feature_indices.reserve(num_sparse_feature_groups);
    for (int i = 0; i < num_sparse_feature_groups; ++i) {
      sparse_feature_indices.push_back(NodeBuilder::NodeOut(
          SparseFeatureIndices(g, sparse_features_per_group, num_examples)));
    }
+    sparse_values.reserve(num_sparse_feature_groups);
    for (int i = 0; i < num_sparse_feature_groups; ++i) {
      sparse_values.push_back(
          NodeBuilder::NodeOut(RandomZeroOrOne(g, num_examples * 4)));
    }

    std::vector<NodeBuilder::NodeOut> dense_features;
+    dense_features.reserve(num_dense_feature_groups);
    for (int i = 0; i < num_dense_feature_groups; ++i) {
      dense_features.push_back(NodeBuilder::NodeOut(
          RandomZeroOrOneMatrix(g, num_examples, dense_features_per_group)));
--- a/tensorflow/core/kernels/serialize_sparse_op.cc
+++ b/tensorflow/core/kernels/serialize_sparse_op.cc
@ -361,6 +361,7 @@ class DeserializeManySparseOp : public OpKernel {
    std::iota(std_order.begin(), std_order.end(), 0);

    std::vector<SparseTensor> tensors_to_concat;
+    tensors_to_concat.reserve(num_sparse_tensors);
    for (int i = 0; i < num_sparse_tensors; ++i) {
      tensors_to_concat.emplace_back(indices_to_concat[i], values_to_concat[i],
                                     preconcat_shape, std_order);
--- a/tensorflow/core/kernels/sparse_cross_op.cc
+++ b/tensorflow/core/kernels/sparse_cross_op.cc
@ -452,6 +452,7 @@ class SparseCrossOp : public OpKernel {
    ExtractFeatureData(indices_list_in, batch_size, &feature_counts,
                       &feature_start_indices);

+    columns.reserve(values_list_in.size());
    for (int i = 0; i < values_list_in.size(); ++i) {
      columns.emplace_back(new SparseTensorColumn<InternalType>(
          values_list_in[i], std::move(feature_counts[i]),
--- a/tensorflow/core/kernels/sparse_tensors_map_ops.cc
+++ b/tensorflow/core/kernels/sparse_tensors_map_ops.cc
@ -463,6 +463,7 @@ class TakeManySparseFromTensorsMapOp : public SparseTensorAccessingOp {
    std::iota(std_order.begin(), std_order.end(), 0);

    std::vector<SparseTensor> tensors_to_concat;
+    tensors_to_concat.reserve(N);
    for (int i = 0; i < N; ++i) {
      tensors_to_concat.emplace_back(std::move(indices_to_concat[i]),
                                     std::move(values_to_concat[i]),
--- a/tensorflow/core/kernels/stage_op.cc
+++ b/tensorflow/core/kernels/stage_op.cc
@ -88,6 +88,7 @@ class StageOp : public OpKernel {
    OP_REQUIRES_OK(ctx, GetBuffer(ctx, def(), &buf));
    core::ScopedUnref scope(buf);
    Buffer::Tuple tuple;
+    tuple.reserve(ctx->num_inputs());
    for (int i = 0; i < ctx->num_inputs(); ++i) {
      tuple.push_back(ctx->input(i));
    }
--- a/tensorflow/core/lib/gtl/inlined_vector_test.cc
+++ b/tensorflow/core/lib/gtl/inlined_vector_test.cc
@ -778,6 +778,7 @@ BENCHMARK(BM_InlinedVectorFillRange)->Range(0, 1024);
 static void BM_StdVectorFill(int iters, int len) {
  for (int i = 0; i < iters; i++) {
    std::vector<int> v;
+    v.reserve(len);
    for (int j = 0; j < len; j++) {
      v.push_back(j);
    }
@ -810,6 +811,7 @@ static void BM_StdVectorFillString(int iters, int len) {
                       "012345678901234567", "to cause allocation"};
  for (int i = 0; i < iters; i++) {
    std::vector<string> v;
+    v.reserve(len);
    for (int j = 0; j < len; j++) {
      v.push_back(strings[j & 3]);
    }
--- a/tensorflow/core/lib/gtl/optional_test.cc
+++ b/tensorflow/core/lib/gtl/optional_test.cc
@ -1078,6 +1078,7 @@ TEST(optionalTest, NoExcept) {
  static_assert(
      !std::is_nothrow_move_constructible<optional<MoveMeThrow>>::value, "");
  std::vector<optional<MoveMeNoThrow>> v;
+  v.reserve(10);
  for (int i = 0; i < 10; ++i) v.emplace_back();
 }

--- a/tensorflow/core/ops/array_grad.cc
+++ b/tensorflow/core/ops/array_grad.cc
@ -248,6 +248,7 @@ Status ArrayToListGrad(const AttrSlice& attrs, FunctionDef* g) {
  int N;
  TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "N", &N));
  std::vector<string> dys;
+  dys.reserve(N);
  for (int i = 0; i < N; ++i) {
    dys.push_back(strings::StrCat("dy:", i));
  }
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@ -613,6 +613,7 @@ REGISTER_OP("Const")
      TF_RETURN_IF_ERROR(TensorShape::IsValidShape(proto->tensor_shape()));
      TensorShape shape(proto->tensor_shape());
      std::vector<DimensionHandle> dims;
+      dims.reserve(shape.dims());
      for (int i = 0; i < shape.dims(); ++i) {
        dims.push_back(c->MakeDim(shape.dim_size(i)));
      }
@ -894,6 +895,7 @@ REGISTER_OP("MatrixDiagPart")
      }
      const int32 rank = c->Rank(in);
      std::vector<DimensionHandle> dims;
+      dims.reserve(rank - 2);
      for (int i = 0; i < rank - 2; ++i) dims.push_back(c->Dim(in, i));

      DimensionHandle min_dim;
--- a/tensorflow/core/ops/array_ops_test.cc
+++ b/tensorflow/core/ops/array_ops_test.cc
@ -31,6 +31,7 @@ TEST(ArrayOpsTest, Pack_ShapeFn) {
  auto set_axis = [&op](int axis) {
    int n = 3;
    std::vector<NodeDefBuilder::NodeOut> src_list;
+    src_list.reserve(n);
    for (int i = 0; i < n; ++i) src_list.emplace_back("a", 0, DT_FLOAT);
    TF_ASSERT_OK(NodeDefBuilder("test", "Pack")
                     .Input(src_list)
@ -281,6 +282,7 @@ TEST(ArrayOpsTest, ShapeN_ShapeFn) {
  ShapeInferenceTestOp op("ShapeN");
  int n = 3;
  std::vector<NodeDefBuilder::NodeOut> src_list;
+  src_list.reserve(n);
  for (int i = 0; i < n; ++i) src_list.emplace_back("a", 0, DT_FLOAT);
  TF_ASSERT_OK(NodeDefBuilder("test", "ShapeN")
                   .Input(src_list)
@ -546,6 +548,7 @@ TEST(ArrayOpsTest, Concat_ShapeFn) {
  ShapeInferenceTestOp op("Concat");
  auto set_n = [&op](int n) {
    std::vector<NodeDefBuilder::NodeOut> src_list;
+    src_list.reserve(n);
    for (int i = 0; i < n; ++i) src_list.emplace_back("a", 0, DT_FLOAT);
    TF_ASSERT_OK(NodeDefBuilder("test", "Concat")
                     .Input({"concat_dim", 0, DT_INT32})
@ -619,6 +622,7 @@ TEST(ArrayOpsTest, ConcatV2_ShapeFn) {
  ShapeInferenceTestOp op("ConcatV2");
  auto set_n = [&op](int n) {
    std::vector<NodeDefBuilder::NodeOut> src_list;
+    src_list.reserve(n);
    for (int i = 0; i < n; ++i) src_list.emplace_back("a", 0, DT_FLOAT);
    TF_ASSERT_OK(NodeDefBuilder("test", "ConcatV2")
                     .Input(src_list)
@ -695,6 +699,7 @@ TEST(ArrayOpsTest, ConcatOffset_ShapeFn) {

  const int n = 4;
  std::vector<NodeDefBuilder::NodeOut> src_list;
+  src_list.reserve(n);
  for (int i = 0; i < n; ++i) src_list.emplace_back("a", 0, DT_INT32);
  TF_ASSERT_OK(NodeDefBuilder("test", "ConcatOffset")
                   .Input({"concat_dim", 0, DT_INT32})
--- a/tensorflow/core/ops/control_flow_ops_test.cc
+++ b/tensorflow/core/ops/control_flow_ops_test.cc
@ -28,6 +28,7 @@ TEST(ControlFlowOpsTest, Merge_ShapeFn) {

  int n = 3;
  std::vector<NodeDefBuilder::NodeOut> src_list;
+  src_list.reserve(n);
  for (int i = 0; i < n; ++i) src_list.emplace_back("a", 0, DT_FLOAT);
  TF_ASSERT_OK(NodeDefBuilder("test", "Merge")
                   .Input(src_list)
@ -54,6 +55,7 @@ TEST(ControlFlowOpsTest, RefSelect_ShapeFn) {

  int n = 3;
  std::vector<NodeDefBuilder::NodeOut> src_list;
+  src_list.reserve(n);
  for (int i = 0; i < n; ++i) src_list.emplace_back("a", 1, DT_FLOAT_REF);
  TF_ASSERT_OK(NodeDefBuilder("test", "RefSelect")
                   .Input("index", 0, DT_INT32)
--- a/tensorflow/core/ops/functional_ops_test.cc
+++ b/tensorflow/core/ops/functional_ops_test.cc
@ -33,6 +33,7 @@ TEST(FunctionalOpsTest, SymbolicGradient_ShapeFn) {
    in_type_list.emplace_back(DT_FLOAT);
    src_list.emplace_back("a", 0, DT_FLOAT);
  }
+  out_type_list.reserve(num_outputs);
  for (int i = 0; i < num_outputs; ++i) {
    out_type_list.emplace_back(DT_FLOAT);
  }
--- a/tensorflow/core/ops/math_ops_test.cc
+++ b/tensorflow/core/ops/math_ops_test.cc
@ -27,6 +27,7 @@ TEST(MathOpsTest, AddN_ShapeFn) {
  ShapeInferenceTestOp op("AddN");
  auto set_n = [&op](int n) {
    std::vector<NodeDefBuilder::NodeOut> src_list;
+    src_list.reserve(n);
    for (int i = 0; i < n; ++i) src_list.emplace_back("a", 0, DT_FLOAT);
    TF_ASSERT_OK(NodeDefBuilder("test", "AddN")
                     .Input(src_list)
--- a/tensorflow/core/ops/sparse_ops_test.cc
+++ b/tensorflow/core/ops/sparse_ops_test.cc
@ -255,6 +255,7 @@ TEST(SparseOpsTest, SparseConcat_ShapeFn) {
  ShapeInferenceTestOp op("SparseConcat");
  std::vector<NodeDefBuilder::NodeOut> src_list;
  int n = 2;
+  src_list.reserve(n);
  for (int i = 0; i < n; ++i) src_list.emplace_back("a", 0, DT_INT64);
  TF_ASSERT_OK(NodeDefBuilder("test", "SparseConcat")
                   .Input(src_list)
--- a/tensorflow/core/ops/string_ops_test.cc
+++ b/tensorflow/core/ops/string_ops_test.cc
@ -27,6 +27,7 @@ TEST(StringOpsTest, StringJoin_ShapeFn) {
  ShapeInferenceTestOp op("StringJoin");
  int n = 3;
  std::vector<NodeDefBuilder::NodeOut> src_list;
+  src_list.reserve(n);
  for (int i = 0; i < n; ++i) src_list.emplace_back("a", 0, DT_STRING);
  TF_ASSERT_OK(NodeDefBuilder("test", "StringJoin")
                   .Input(src_list)
--- a/tensorflow/core/platform/cloud/retrying_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/retrying_file_system_test.cc
@ -25,6 +25,7 @@ typedef std::vector<std::tuple<string, Status>> ExpectedCalls;

 ExpectedCalls CreateRetriableErrors(const string& method, int n) {
  ExpectedCalls expected_calls;
+  expected_calls.reserve(n);
  for (int i = 0; i < n; i++) {
    expected_calls.emplace_back(std::make_tuple(
        method, errors::Unavailable(strings::StrCat("Retriable error #", i))));
--- a/tensorflow/core/util/command_line_flags_test.cc
+++ b/tensorflow/core/util/command_line_flags_test.cc
@ -27,6 +27,7 @@ namespace {
 std::vector<char *> CharPointerVectorFromStrings(
    const std::vector<string> &strings) {
  std::vector<char *> result;
+  result.reserve(strings.size());
  for (const string &string : strings) {
    result.push_back(const_cast<char *>(string.c_str()));
  }
--- a/tensorflow/core/util/ctc/ctc_beam_search_test.cc
+++ b/tensorflow/core/util/ctc/ctc_beam_search_test.cc
@ -150,6 +150,7 @@ TEST(CtcBeamSearch, DecodingWithAndWithoutDictionary) {
  // using Eigen::Map.
  Eigen::Map<const Eigen::ArrayXi> seq_len(&sequence_lengths[0], batch_size);
  std::vector<Eigen::Map<const Eigen::MatrixXf>> inputs;
+  inputs.reserve(timesteps);
  for (int t = 0; t < timesteps; ++t) {
    inputs.emplace_back(&input_data_mat[t][0][0], batch_size, num_classes);
  }
@ -199,6 +200,7 @@ TEST(CtcBeamSearch, AllBeamElementsHaveFiniteScores) {
  // using Eigen::Map.
  Eigen::Map<const Eigen::ArrayXi> seq_len(&sequence_lengths[0], batch_size);
  std::vector<Eigen::Map<const Eigen::MatrixXf>> inputs;
+  inputs.reserve(timesteps);
  for (int t = 0; t < timesteps; ++t) {
    inputs.emplace_back(&input_data_mat[t][0][0], batch_size, num_classes);
  }
@ -293,6 +295,7 @@ TEST(CtcBeamSearch, LabelSelection) {
  // using Eigen::Map.
  Eigen::Map<const Eigen::ArrayXi> seq_len(&sequence_lengths[0], batch_size);
  std::vector<Eigen::Map<const Eigen::MatrixXf>> inputs;
+  inputs.reserve(timesteps);
  for (int t = 0; t < timesteps; ++t) {
    inputs.emplace_back(&input_data_mat[t][0][0], batch_size, num_classes);
  }
--- a/tensorflow/python/framework/cpp_shape_inference.cc
+++ b/tensorflow/python/framework/cpp_shape_inference.cc
@ -182,6 +182,7 @@ std::vector<string> RunCppShapeInference(

  std::vector<PyObject*> input_constant_tensor_values_v;
  int cnt = PyList_Size(input_constant_tensor_values);
+  input_constant_tensor_values_v.reserve(cnt);
  for (int i = 0; i < cnt; ++i) {
    input_constant_tensor_values_v.push_back(
        PyList_GetItem(input_constant_tensor_values, i));
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@ -347,6 +347,7 @@ Status ConvertTensorToNdarray(const Tensor& t, PyObject** ret) {
  PyArray_Descr* descr = PyArray_DescrFromType(typenum);
  CHECK(descr);
  std::vector<npy_intp> dims;
+  dims.reserve(t.dims());
  for (int i = 0; i < t.dims(); ++i) {
    dims.push_back(t.dim_size(i));
  }
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@ -2942,6 +2942,7 @@ bool CudnnSupport::DoMatMul(Stream* stream,
    }
    const auto toPtrs = [](std::vector<DeviceMemory<float>>& v) {
      std::vector<DeviceMemory<float>*> ptrs;
+      ptrs.reserve(v.size());
      for (auto& mem : v) {
        ptrs.push_back(&mem);
      }
--- a/tensorflow/tools/graph_transforms/summarize_graph_main.cc
+++ b/tensorflow/tools/graph_transforms/summarize_graph_main.cc
@ -80,6 +80,7 @@ void PrintBenchmarkUsage(const std::vector<const NodeDef*>& placeholders,
        shape = PartialTensorShape(shape_proto);
      }
    }
+    sizes.reserve(shape.dims());
    for (int i = 0; i < shape.dims(); ++i) {
      sizes.push_back(shape.dim_size(i));
    }
@ -87,6 +88,7 @@ void PrintBenchmarkUsage(const std::vector<const NodeDef*>& placeholders,
    input_layer_shapes.push_back(sizes_string);
  }
  std::vector<string> output_layers;
+  output_layers.reserve(outputs.size());
  for (const NodeDef* node : outputs) {
    output_layers.push_back(node->name());
  }
--- a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc
+++ b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc
@ -634,6 +634,7 @@ void Generator::AppendDebugStringFunctions(const Descriptor& md) {
  Print().Print("namespace internal {").Print();
  Print(sig, " {").Nest();
  std::vector<const FieldDescriptor*> fields;
+  fields.reserve(md.field_count());
  for (int i = 0; i < md.field_count(); ++i) {
    fields.push_back(md.field(i));
  }