TensorFlow: conv improvements, label_image example, and a few other changes.

Changes:
- Improvements to convolution by using 32-bit indices, by @benoitsteiner. Not all calls are converted yet. Also some improvements to pooling, by @benoitsteiner.
- Improvements to the sparse matmul CPU implementation, by Ashish.
- Fixes to warnings, by @vrv.
- Doc fixes to padding, by @Yangqing.
- Improvements to Tensor wrappers, by Eider.
- Speedup of matrix inverse on CPU, by Rasmus.
- Added an example of doing image inference from a pre-trained model, by @petewarden.
- Fixed a formula in the MNIST example, by nodir.
- Updates to the event accumulator, by Cassandra.
- Slight changes to the tensor C API, by @mrry.
- Handling of strings in listdiff, by Phil.
- Fixed negative fraction-of-queue-full stats, by Frank.
- Type-checking improvement to the importer, by Yaroslav.
- Recursive logdir search for TensorBoard, by @danmane.
- Session.run() now checks for an empty graph, by Manoj.

Base CL: 108013706
parent 56313def00
commit 4213ac97be
@@ -141,9 +141,9 @@ void TF_SetTarget(TF_SessionOptions* options, const char* target) {
   options->options.target = target;
 }
 
-void TF_SetConfig(TF_SessionOptions* options, const char* config,
-                  size_t config_len, TF_Status* status) {
-  if (!options->options.config.ParseFromArray(config, config_len)) {
+void TF_SetConfig(TF_SessionOptions* options, const void* proto,
+                  size_t proto_len, TF_Status* status) {
+  if (!options->options.config.ParseFromArray(proto, proto_len)) {
     status->status =
         tensorflow::errors::InvalidArgument("Unparseable ConfigProto");
   }
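With this change TF_SetConfig takes an opaque const void* buffer holding a serialized ConfigProto rather than a const char*. A minimal usage sketch; the serialized buffer (config_data, config_len) is assumed to come from the caller, e.g. ConfigProto::SerializeToString() on the C++ side, and is not part of this commit:

    // Sketch: configuring a session through the C API. `config_data` and
    // `config_len` are assumed to hold a serialized ConfigProto.
    TF_SessionOptions* opts = TF_NewSessionOptions();
    TF_Status* status = TF_NewStatus();
    TF_SetConfig(opts, config_data, config_len, status);
    if (TF_GetCode(status) != TF_OK) {
      // The buffer did not parse as a ConfigProto.
    }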
@@ -30,7 +30,7 @@ TEST(GPUBFCAllocatorTest, NoDups) {
   std::sort(ptrs.begin(), ptrs.end());
 
   // Make sure none of them are equal, and that none of them overlap.
-  for (int i = 0; i < ptrs.size(); i++) {
+  for (size_t i = 0; i < ptrs.size(); i++) {
     if (i > 0) {
       ASSERT_NE(ptrs[i], ptrs[i - 1]);  // No dups
       size_t req_size = a.RequestedSize(ptrs[i - 1]);
@@ -40,7 +40,7 @@ TEST(GPUBFCAllocatorTest, NoDups) {
     }
   }
 
-  for (int i = 0; i < ptrs.size(); i++) {
+  for (size_t i = 0; i < ptrs.size(); i++) {
     a.DeallocateRaw(ptrs[i]);
   }
 }
@@ -63,7 +63,7 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
 
   // Deallocate half of the memory, and keep track of the others.
   std::vector<void*> existing_ptrs;
-  for (int i = 0; i < initial_ptrs.size(); i++) {
+  for (size_t i = 0; i < initial_ptrs.size(); i++) {
     if (i % 2 == 1) {
       a.DeallocateRaw(initial_ptrs[i]);
     } else {
@@ -81,7 +81,7 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
 
   std::sort(existing_ptrs.begin(), existing_ptrs.end());
   // Make sure none of them are equal
-  for (int i = 0; i < existing_ptrs.size(); i++) {
+  for (size_t i = 0; i < existing_ptrs.size(); i++) {
     if (i > 0) {
       CHECK_NE(existing_ptrs[i], existing_ptrs[i - 1]);  // No dups
 
@@ -95,7 +95,7 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
     }
   }
 
-  for (int i = 0; i < existing_ptrs.size(); i++) {
+  for (size_t i = 0; i < existing_ptrs.size(); i++) {
     a.DeallocateRaw(existing_ptrs[i]);
   }
 }
@@ -6,66 +6,73 @@
 namespace tensorflow {
 
 // Helper to define Tensor types given that the scalar is of type T.
-template <typename T, int NDIMS = 1>
+template <typename T, int NDIMS = 1, typename IndexType = Eigen::DenseIndex>
 struct TTypes {
   // Rank-<NDIMS> tensor of scalar type T.
-  typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor>,
+  typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>,
                            Eigen::Aligned> Tensor;
-  typedef Eigen::TensorMap<Eigen::Tensor<const T, NDIMS, Eigen::RowMajor>,
-                           Eigen::Aligned> ConstTensor;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>, Eigen::Aligned>
+      ConstTensor;
 
   // Unaligned Rank-<NDIMS> tensor of scalar type T.
-  typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor> >
+  typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType> >
       UnalignedTensor;
-  typedef Eigen::TensorMap<Eigen::Tensor<const T, NDIMS, Eigen::RowMajor> >
-      UnalignedConstTensor;
+  typedef Eigen::TensorMap<Eigen::Tensor<const T, NDIMS, Eigen::RowMajor,
+                                         IndexType> > UnalignedConstTensor;
 
   typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, int>,
                            Eigen::Aligned> Tensor32Bit;
 
   // Scalar tensor (implemented as a rank-0 tensor) of scalar type T.
   typedef Eigen::TensorMap<
-      Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor>,
+      Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>,
       Eigen::Aligned> Scalar;
-  typedef Eigen::TensorMap<
-      Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor>,
+  typedef Eigen::TensorMap<Eigen::TensorFixedSize<const T, Eigen::Sizes<>,
+                                                  Eigen::RowMajor, IndexType>,
                            Eigen::Aligned> ConstScalar;
 
   // Unaligned Scalar tensor of scalar type T.
   typedef Eigen::TensorMap<Eigen::TensorFixedSize<
-      T, Eigen::Sizes<>, Eigen::RowMajor> > UnalignedScalar;
-  typedef Eigen::TensorMap<Eigen::TensorFixedSize<
-      const T, Eigen::Sizes<>, Eigen::RowMajor> > UnalignedConstScalar;
+      T, Eigen::Sizes<>, Eigen::RowMajor, IndexType> > UnalignedScalar;
+  typedef Eigen::TensorMap<Eigen::TensorFixedSize<const T, Eigen::Sizes<>,
+                                                  Eigen::RowMajor, IndexType> >
+      UnalignedConstScalar;
 
   // Rank-1 tensor (vector) of scalar type T.
-  typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>, Eigen::Aligned>
-      Flat;
-  typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor>,
-                           Eigen::Aligned> ConstFlat;
-  typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>, Eigen::Aligned>
-      Vec;
-  typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor>,
-                           Eigen::Aligned> ConstVec;
+  typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>,
+                           Eigen::Aligned> Flat;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned>
+      ConstFlat;
+  typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>,
+                           Eigen::Aligned> Vec;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned>
+      ConstVec;
 
   // Unaligned Rank-1 tensor (vector) of scalar type T.
-  typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor> > UnalignedFlat;
-  typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor> >
-      UnalignedConstFlat;
-  typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor> > UnalignedVec;
-  typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor> >
-      UnalignedConstVec;
+  typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType> >
+      UnalignedFlat;
+  typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor,
+                                         IndexType> > UnalignedConstFlat;
+  typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType> >
+      UnalignedVec;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType> > UnalignedConstVec;
 
   // Rank-2 tensor (matrix) of scalar type T.
-  typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned>
-      Matrix;
-  typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor>,
-                           Eigen::Aligned> ConstMatrix;
+  typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>,
+                           Eigen::Aligned> Matrix;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned>
+      ConstMatrix;
 
   // Unaligned Rank-2 tensor (matrix) of scalar type T.
-  typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor> >
+  typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType> >
       UnalignedMatrix;
-  typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor> >
-      UnalignedConstMatrix;
+  typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor,
+                                         IndexType> > UnalignedConstMatrix;
 };
 
 typedef typename TTypes<float, 1>::Tensor32Bit::Index Index32;
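The new IndexType parameter (defaulting to Eigen::DenseIndex) is what lets kernels opt into 32-bit indexing: TTypes<T, NDIMS, int> maps the same memory as TTypes<T, NDIMS> but performs all offset arithmetic in 32 bits, which is cheaper on GPU. A self-contained sketch of the underlying Eigen mechanism (illustrative only, not code from this commit):

    #include <vector>
    #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

    // The same buffer viewed through a 64-bit-index map and a 32-bit-index
    // map; in TensorFlow these correspond to TTypes<float, 2>::Tensor and
    // TTypes<float, 2, int>::Tensor respectively.
    int main() {
      std::vector<float> buf(6, 1.0f);
      Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor> > t64(
          buf.data(), 2, 3);
      Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor, int> > t32(
          buf.data(), 2, 3);
      // Both views address the same data; t32 does its index arithmetic in
      // 32 bits, which is what the conv kernels exploit via To32Bit().
      return t64(1, 2) == t32(1, 2) ? 0 : 1;
    }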
@@ -11,24 +11,25 @@ namespace functor {
 // TODO(yangke): revisit these operations and in particular, see if we can
 // combine all of them into just one operation without causing nvcc to
 // timeout.
-template <typename Device, typename T, int Dims>
+template <typename Device, typename T, int Dims, typename IndexType>
 struct ShuffleAndReverse {
-  void operator()(const Device& d, typename TTypes<T, Dims>::ConstTensor input,
-                  const Eigen::DSizes<Eigen::DenseIndex, Dims>& order,
+  void operator()(const Device& d,
+                  typename TTypes<T, Dims, IndexType>::ConstTensor input,
+                  const Eigen::DSizes<IndexType, Dims>& order,
                   const Eigen::array<bool, Dims>& reverse_dims,
-                  typename TTypes<T, Dims>::Tensor output) {
+                  typename TTypes<T, Dims, IndexType>::Tensor output) {
     output.device(d) = input.shuffle(order).reverse(reverse_dims);
   }
 };
 
-template <typename Device, typename T, int Dims>
+template <typename Device, typename T, int Dims, typename IndexType>
 struct InflatePadAndShuffle {
   void operator()(
-      const Device& d, typename TTypes<T, Dims>::ConstTensor input,
-      const Eigen::DSizes<Eigen::DenseIndex, Dims>& strides,
-      const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, Dims>& pad_dims,
-      const Eigen::DSizes<Eigen::DenseIndex, Dims>& order,
-      typename TTypes<T, Dims>::Tensor output) {
+      const Device& d, typename TTypes<T, Dims, IndexType>::ConstTensor input,
+      const Eigen::DSizes<IndexType, Dims>& strides,
+      const Eigen::array<Eigen::IndexPair<IndexType>, Dims>& pad_dims,
+      const Eigen::DSizes<IndexType, Dims>& order,
+      typename TTypes<T, Dims, IndexType>::Tensor output) {
     output.device(d) = input.inflate(strides).pad(pad_dims).shuffle(order);
   }
 };
@@ -89,30 +90,92 @@ struct MatMulConvFunctor {
   }
 };
 
-template <typename Device, typename T>
+template <typename Device, typename T, typename IndexType>
 struct TransformFilter {
-  void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor in,
-                  typename TTypes<T, 4>::Tensor out) {
-    out.device(d) = in.shuffle(Eigen::DSizes<Eigen::DenseIndex, 4>(3, 2, 0, 1));
+  void operator()(const Device& d,
+                  typename TTypes<T, 4, IndexType>::ConstTensor in,
+                  typename TTypes<T, 4, IndexType>::Tensor out) {
+    // We want a 3, 2, 0, 1 shuffle. We can merge dimensions 0 and 1 together
+    // to help speedup the shuffle operation.
+    Eigen::DSizes<IndexType, 3> merged_dims;
+    merged_dims[0] = in.dimension(0) * in.dimension(1);
+    merged_dims[1] = in.dimension(2);
+    merged_dims[2] = in.dimension(3);
+
+    Eigen::DSizes<IndexType, 4> expanded_dims;
+    expanded_dims[0] = in.dimension(3);
+    expanded_dims[1] = in.dimension(2);
+    expanded_dims[2] = in.dimension(0);
+    expanded_dims[3] = in.dimension(1);
+
+    out.device(d) = in.reshape(merged_dims)
+                        .shuffle(Eigen::DSizes<IndexType, 3>(2, 1, 0))
+                        .reshape(expanded_dims);
   }
 };
 
-template <typename Device, typename T>
+template <typename Device, typename T, typename IndexType>
 struct TransformDepth {
-  void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor in,
-                  const Eigen::DSizes<Eigen::DenseIndex, 4>& shuffle,
-                  typename TTypes<T, 4>::Tensor out) {
-    out.device(d) = in.shuffle(shuffle);
+  void operator()(const Device& d,
+                  typename TTypes<T, 4, IndexType>::ConstTensor in,
+                  const Eigen::DSizes<IndexType, 4>& shuffle,
+                  typename TTypes<T, 4, IndexType>::Tensor out) {
+    Eigen::DSizes<IndexType, 3> merged_dims;
+    Eigen::DSizes<IndexType, 4> expanded_dims;
+    Eigen::DSizes<IndexType, 3> new_shuffle;
+
+    // Merge dimensions that won't be shuffled together to speed things up.
+    if (shuffle[1] == 2 && shuffle[2] == 3) {
+      merged_dims[0] = in.dimension(0);
+      merged_dims[1] = in.dimension(1);
+      merged_dims[2] = in.dimension(2) * in.dimension(3);
+      new_shuffle[0] = shuffle[0];
+      new_shuffle[1] = 2;
+      new_shuffle[2] = shuffle[3];
+      expanded_dims[0] = in.dimension(shuffle[0]);
+      expanded_dims[1] = in.dimension(2);
+      expanded_dims[2] = in.dimension(3);
+      expanded_dims[3] = in.dimension(shuffle[3]);
+    } else if (shuffle[0] == 2 && shuffle[1] == 3) {
+      merged_dims[0] = in.dimension(0);
+      merged_dims[1] = in.dimension(1);
+      merged_dims[2] = in.dimension(2) * in.dimension(3);
+      new_shuffle[0] = 2;
+      new_shuffle[1] = shuffle[2];
+      new_shuffle[2] = shuffle[3];
+      expanded_dims[0] = in.dimension(2);
+      expanded_dims[1] = in.dimension(3);
+      expanded_dims[2] = in.dimension(shuffle[2]);
+      expanded_dims[3] = in.dimension(shuffle[3]);
+    } else if (shuffle[0] == 0 && shuffle[1] == 3 && shuffle[2] == 1 &&
+               shuffle[3] == 2) {
+      merged_dims[0] = in.dimension(0);
+      merged_dims[1] = in.dimension(1) * in.dimension(2);
+      merged_dims[2] = in.dimension(3);
+      new_shuffle[0] = 0;
+      new_shuffle[1] = 2;
+      new_shuffle[2] = 1;
+      expanded_dims[0] = in.dimension(0);
+      expanded_dims[1] = in.dimension(3);
+      expanded_dims[2] = in.dimension(1);
+      expanded_dims[3] = in.dimension(2);
+    } else {
+      assert(false && "unexpected shuffle");
+    }
+
+    out.device(d) =
+        in.reshape(merged_dims).shuffle(new_shuffle).reshape(expanded_dims);
   }
 };
 
-template <typename Device, typename T>
+template <typename Device, typename T, typename IndexType>
 struct PadInput {
-  void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor in,
+  void operator()(const Device& d,
+                  typename TTypes<T, 4, IndexType>::ConstTensor in,
                   int padding_rows_left, int padding_rows_right,
                   int padding_cols_left, int padding_cols_right,
-                  typename TTypes<T, 4>::Tensor out) {
-    Eigen::array<std::pair<ptrdiff_t, ptrdiff_t>, 4> padding;
+                  typename TTypes<T, 4, IndexType>::Tensor out) {
+    Eigen::array<std::pair<IndexType, IndexType>, 4> padding;
     padding[0] = std::make_pair(0, 0);
     padding[1] = std::make_pair(padding_rows_left, padding_rows_right);
     padding[2] = std::make_pair(padding_cols_left, padding_cols_right);
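The rewritten TransformFilter relies on an equivalence: a rank-4 (3, 2, 0, 1) shuffle equals merging dimensions 0 and 1, shuffling the resulting rank-3 tensor with (2, 1, 0), and reshaping back, and the rank-3 shuffle is cheaper. A standalone sketch checking that equivalence in plain Eigen (illustrative only, not part of the commit):

    #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

    // Verifies: shuffle(3, 2, 0, 1) == reshape + shuffle(2, 1, 0) + reshape.
    bool ShuffleEquivalence() {
      Eigen::Tensor<float, 4, Eigen::RowMajor> in(2, 3, 4, 5);
      in.setRandom();

      Eigen::Tensor<float, 4, Eigen::RowMajor> direct =
          in.shuffle(Eigen::DSizes<Eigen::DenseIndex, 4>(3, 2, 0, 1));

      // Merge dims 0 and 1, do a rank-3 shuffle, then expand back out.
      Eigen::DSizes<Eigen::DenseIndex, 3> merged(2 * 3, 4, 5);
      Eigen::DSizes<Eigen::DenseIndex, 4> expanded(5, 4, 2, 3);
      Eigen::Tensor<float, 4, Eigen::RowMajor> fused =
          in.reshape(merged)
              .shuffle(Eigen::DSizes<Eigen::DenseIndex, 3>(2, 1, 0))
              .reshape(expanded);

      // Both paths are pure permutations of the same data.
      Eigen::Tensor<bool, 0, Eigen::RowMajor> same = (direct == fused).all();
      return same();
    }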
@@ -783,9 +783,9 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
                        TensorShape({out_depth, in_depth, filter_rows, filter_cols}),
                        &transformed_filter));
 
-    functor::TransformFilter<Device, T>()(context->eigen_device<Device>(),
-                                          filter.tensor<T, 4>(),
-                                          transformed_filter.tensor<T, 4>());
+    functor::TransformFilter<Device, T, int>()(
+        context->eigen_device<Device>(), To32Bit(filter.tensor<T, 4>()),
+        To32Bit(transformed_filter.tensor<T, 4>()));
 
     Tensor transformed_out_backprop;
     OP_REQUIRES_OK(
@@ -795,10 +795,10 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
                        TensorShape({batch, out_depth, output_rows, output_cols}),
                        &transformed_out_backprop));
 
-    functor::TransformDepth<Device, T>()(
-        context->eigen_device<Device>(), out_backprop.tensor<T, 4>(),
-        Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2),
-        transformed_out_backprop.tensor<T, 4>());
+    functor::TransformDepth<Device, T, int>()(
+        context->eigen_device<Device>(), To32Bit(out_backprop.tensor<T, 4>()),
+        Eigen::DSizes<int, 4>(0, 3, 1, 2),
+        To32Bit(transformed_out_backprop.tensor<T, 4>()));
 
     Tensor pre_transformed_in_backprop;
     OP_REQUIRES_OK(context,
@@ -831,11 +831,12 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
       }
 
       auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
-      functor::TransformDepth<Device, T>()(
+      functor::TransformDepth<Device, T, int>()(
           context->eigen_device<Device>(),
-          toConstTensor(pre_transformed_in_backprop).template tensor<T, 4>(),
-          Eigen::DSizes<Eigen::DenseIndex, 4>(0, 2, 3, 1),
-          in_backprop->tensor<T, 4>());
+          To32Bit(toConstTensor(pre_transformed_in_backprop)
+                      .template tensor<T, 4>()),
+          Eigen::DSizes<int, 4>(0, 2, 3, 1),
+          To32Bit(in_backprop->tensor<T, 4>()));
     } else {
       // We fill out a padded out_backprop
       TensorShape padded_out_shape(
@@ -852,7 +853,7 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
                                 {left_pad_cols, right_pad_cols},
                                 {0, 0}}};
 
-      functor::InflatePadAndShuffle<Device, T, 4>()(
+      functor::InflatePadAndShuffle<Device, T, 4, Eigen::DenseIndex>()(
          context->eigen_device<Device>(), out_backprop.tensor<T, 4>(), strides,
          pad_dims, trivial_order, padded_output.tensor<T, 4>());
      const Tensor& padded_output_cref = padded_output;
@@ -869,7 +870,7 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
 
       Eigen::DSizes<Eigen::DenseIndex, 4> filter_order{0, 1, 3, 2};
       Eigen::array<bool, 4> filter_rev_dims{true, true, false, false};
-      functor::ShuffleAndReverse<Device, T, 4>()(
+      functor::ShuffleAndReverse<Device, T, 4, Eigen::DenseIndex>()(
          context->eigen_device<Device>(), filter.tensor<T, 4>(), filter_order,
          filter_rev_dims, r_filter.tensor<T, 4>());
      const Tensor& r_filter_cref = r_filter;
@@ -1033,10 +1034,10 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
                        TensorShape({batch, out_depth, output_rows, output_cols}),
                        &transformed_out_backprop));
 
-    functor::TransformDepth<Device, T>()(
-        context->eigen_device<Device>(), out_backprop.tensor<T, 4>(),
-        Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2),
-        transformed_out_backprop.tensor<T, 4>());
+    functor::TransformDepth<Device, T, int>()(
+        context->eigen_device<Device>(), To32Bit(out_backprop.tensor<T, 4>()),
+        Eigen::DSizes<int, 4>(0, 3, 1, 2),
+        To32Bit(transformed_out_backprop.tensor<T, 4>()));
 
     Tensor transformed_input;
     OP_REQUIRES_OK(context,
@@ -1045,10 +1046,10 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
                        TensorShape({batch, in_depth, input_rows, input_cols}),
                        &transformed_input));
 
-    functor::TransformDepth<Device, T>()(
-        context->eigen_device<Device>(), input.tensor<T, 4>(),
-        Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2),
-        transformed_input.tensor<T, 4>());
+    functor::TransformDepth<Device, T, int>()(
+        context->eigen_device<Device>(), To32Bit(input.tensor<T, 4>()),
+        Eigen::DSizes<int, 4>(0, 3, 1, 2),
+        To32Bit(transformed_input.tensor<T, 4>()));
 
     auto out_backprop_ptr =
         AsDeviceMemory(transformed_out_backprop.template flat<T>().data(),
@@ -1074,12 +1075,12 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
       }
 
       auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
-      functor::TransformDepth<Device, T>()(
+      functor::TransformDepth<Device, T, int>()(
           context->eigen_device<Device>(),
-          toConstTensor(pre_transformed_filter_backprop)
-              .template tensor<T, 4>(),
-          Eigen::DSizes<Eigen::DenseIndex, 4>(2, 3, 1, 0),
-          filter_backprop->tensor<T, 4>());
+          To32Bit(toConstTensor(pre_transformed_filter_backprop)
+                      .template tensor<T, 4>()),
+          Eigen::DSizes<int, 4>(2, 3, 1, 0),
+          To32Bit(filter_backprop->tensor<T, 4>()));
     } else {
       // Fall back to the non-cudnn code path
 
@@ -1102,7 +1103,7 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
                                 {top_pad_rows, bottom_pad_rows},
                                 {left_pad_cols, right_pad_cols},
                                 {0, 0}}};
-      functor::InflatePadAndShuffle<Device, T, 4>()(
+      functor::InflatePadAndShuffle<Device, T, 4, Eigen::DenseIndex>()(
          context->eigen_device<Device>(), out_backprop.tensor<T, 4>(), strides,
          pad_dims, out_order, padded_output.tensor<T, 4>());
      const Tensor& padded_output_cref = padded_output;
@@ -1121,7 +1122,7 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
 
       // No need for reversing this time.
       Eigen::array<bool, 4> trivial_dims{false, false, false, false};
-      functor::ShuffleAndReverse<Device, T, 4>()(
+      functor::ShuffleAndReverse<Device, T, 4, Eigen::DenseIndex>()(
          context->eigen_device<Device>(), input.tensor<T, 4>(), in_order,
          trivial_dims, in_shuffle.tensor<T, 4>());
      const Tensor& in_shuffle_cref = in_shuffle;
@@ -1149,7 +1150,7 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
       Eigen::DSizes<Eigen::DenseIndex, 4> filter_order{1, 2, 3, 0};
       Eigen::array<bool, 4> filter_rev_dims{true, true, false, false};
       const Tensor& filter_shuffle_cref = filter_shuffle;
-      functor::ShuffleAndReverse<Device, T, 4>()(
+      functor::ShuffleAndReverse<Device, T, 4, Eigen::DenseIndex>()(
          context->eigen_device<Device>(), filter_shuffle_cref.tensor<T, 4>(),
          filter_order, filter_rev_dims, filter_backprop->tensor<T, 4>());
     }
@@ -1165,46 +1166,65 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
 
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
 #define DECLARE_GPU_SPEC(T) \
   template <> \
-  void ShuffleAndReverse<GPUDevice, T, 4>::operator()( \
-      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \
-      const Eigen::DSizes<Eigen::DenseIndex, 4>& order, \
-      const Eigen::array<bool, 4>& reverse_dims, \
-      typename TTypes<T, 4>::Tensor output); \
-  extern template struct ShuffleAndReverse<GPUDevice, T, 4>; \
-  template <> \
-  void InflatePadAndShuffle<GPUDevice, T, 4>::operator()( \
-      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \
-      const Eigen::DSizes<Eigen::DenseIndex, 4>& strides, \
-      const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 4>& pad_dims, \
-      const Eigen::DSizes<Eigen::DenseIndex, 4>& order, \
-      typename TTypes<T, 4>::Tensor output); \
-  extern template struct InflatePadAndShuffle<GPUDevice, T, 4>; \
-  template <> \
-  void TransformFilter<GPUDevice, T>::operator()( \
-      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \
-      typename TTypes<T, 4>::Tensor out); \
-  extern template struct TransformFilter<GPUDevice, T>; \
+  void ShuffleAndReverse<GPUDevice, T, 4, Eigen::DenseIndex>::operator()( \
+      const GPUDevice& d, \
+      typename TTypes<T, 4, Eigen::DenseIndex>::ConstTensor input, \
+      const Eigen::DSizes<Eigen::DenseIndex, 4>& order, \
+      const Eigen::array<bool, 4>& reverse_dims, \
+      typename TTypes<T, 4, Eigen::DenseIndex>::Tensor output); \
+  extern template struct ShuffleAndReverse<GPUDevice, T, 4, \
+                                           Eigen::DenseIndex>; \
+  template <> \
+  void InflatePadAndShuffle<GPUDevice, T, 4, Eigen::DenseIndex>::operator()( \
+      const GPUDevice& d, \
+      typename TTypes<T, 4, Eigen::DenseIndex>::ConstTensor input, \
+      const Eigen::DSizes<Eigen::DenseIndex, 4>& strides, \
+      const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 4>& pad_dims, \
+      const Eigen::DSizes<Eigen::DenseIndex, 4>& order, \
+      typename TTypes<T, 4, Eigen::DenseIndex>::Tensor output); \
+  extern template struct InflatePadAndShuffle<GPUDevice, T, 4, \
+                                              Eigen::DenseIndex>; \
+  template <> \
+  void ShuffleAndReverse<GPUDevice, T, 4, int>::operator()( \
+      const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor input, \
+      const Eigen::DSizes<int, 4>& order, \
+      const Eigen::array<bool, 4>& reverse_dims, \
+      typename TTypes<T, 4, int>::Tensor output); \
+  extern template struct ShuffleAndReverse<GPUDevice, T, 4, int>; \
   template <> \
-  void TransformDepth<GPUDevice, T>::operator()( \
-      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \
-      const Eigen::DSizes<Eigen::DenseIndex, 4>& shuffle, \
-      typename TTypes<T, 4>::Tensor out); \
-  extern template struct TransformDepth<GPUDevice, T>; \
-  template <> \
-  void SpatialConvolution<GPUDevice, T>::operator()( \
-      const GPUDevice& d, typename TTypes<T, 4>::Tensor output, \
-      typename TTypes<T, 4>::ConstTensor input, \
-      typename TTypes<T, 4>::ConstTensor filter, int stride, \
-      const Eigen::PaddingType& padding); \
-  extern template struct SpatialConvolution<GPUDevice, T>; \
-  template <> \
-  void SpatialConvolutionBackwardInput<GPUDevice, T>::operator()( \
-      const GPUDevice& d, typename TTypes<T, 4>::Tensor in_backprop, \
-      typename TTypes<T, 4>::ConstTensor filter, \
-      typename TTypes<T, 4>::ConstTensor output_backprop, int input_rows, \
-      int input_cols, int stride); \
+  void InflatePadAndShuffle<GPUDevice, T, 4, int>::operator()( \
+      const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor input, \
+      const Eigen::DSizes<int, 4>& strides, \
+      const Eigen::array<Eigen::IndexPair<int>, 4>& pad_dims, \
+      const Eigen::DSizes<int, 4>& order, \
+      typename TTypes<T, 4, int>::Tensor output); \
+  extern template struct InflatePadAndShuffle<GPUDevice, T, 4, int>; \
+  template <> \
+  void TransformFilter<GPUDevice, T, int>::operator()( \
+      const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in, \
+      typename TTypes<T, 4, int>::Tensor out); \
+  extern template struct TransformFilter<GPUDevice, T, int>; \
+  template <> \
+  void TransformDepth<GPUDevice, T, int>::operator()( \
+      const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in, \
+      const Eigen::DSizes<int, 4>& shuffle, \
+      typename TTypes<T, 4, int>::Tensor out); \
+  extern template struct TransformDepth<GPUDevice, T, int>; \
+  template <> \
+  void SpatialConvolution<GPUDevice, T>::operator()( \
+      const GPUDevice& d, typename TTypes<T, 4>::Tensor output, \
+      typename TTypes<T, 4>::ConstTensor input, \
+      typename TTypes<T, 4>::ConstTensor filter, int stride, \
+      const Eigen::PaddingType& padding); \
+  extern template struct SpatialConvolution<GPUDevice, T>; \
+  template <> \
+  void SpatialConvolutionBackwardInput<GPUDevice, T>::operator()( \
+      const GPUDevice& d, typename TTypes<T, 4>::Tensor in_backprop, \
+      typename TTypes<T, 4>::ConstTensor filter, \
+      typename TTypes<T, 4>::ConstTensor output_backprop, int input_rows, \
+      int input_cols, int stride); \
   extern template struct SpatialConvolutionBackwardInput<GPUDevice, T>
 
 DECLARE_GPU_SPEC(float);
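The DECLARE_GPU_SPEC block above is the standard explicit-instantiation-declaration pattern: extern template struct Foo<...>; in a header promises that exactly one .cu.cc translation unit provides the instantiation, so every kernel including the header links against a single compiled copy instead of re-instantiating the functor under nvcc. A toy sketch of the mechanism (hypothetical names; both sides shown in one translation unit for brevity):

    // Header side: definition template plus an instantiation declaration.
    template <typename T, typename IndexType>
    struct Scale {
      void operator()(T* data, IndexType n, T factor) {
        for (IndexType i = 0; i < n; ++i) data[i] *= factor;
      }
    };
    extern template struct Scale<float, int>;  // promise: defined elsewhere

    // Source side (one .cc/.cu.cc in the build): the single instantiation.
    template struct Scale<float, int>;

    int main() {
      float v[3] = {1.f, 2.f, 3.f};
      Scale<float, int>()(v, 3, 2.f);
      return v[2] == 6.f ? 0 : 1;
    }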
@@ -167,6 +167,10 @@ class Conv2DOp : public BinaryOp<T> {
             << ", filter_rows = " << filter_rows << ", stride = " << stride
             << ", out_depth = " << out_depth;
 
+    // If there is nothing to compute, return.
+    if (out_shape.num_elements() == 0) {
+      return;
+    }
     LaunchConvOp<Device, T>::launch(context, use_cudnn_, input, filter, stride,
                                     BrainPadding2EigenPadding(padding_),
                                     output);
@@ -260,10 +264,11 @@ struct LaunchConvOp<GPUDevice, T> {
                          input.dim_size(2) + padding_cols, input.dim_size(3)}),
                      &transformed_input));
 
-      functor::PadInput<GPUDevice, T>()(
-          ctx->eigen_device<GPUDevice>(), input_param.tensor<T, 4>(),
+      functor::PadInput<GPUDevice, T, int>()(
+          ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 4>()),
           padding_rows / 2, padding_rows - padding_rows / 2, padding_cols / 2,
-          padding_cols - padding_cols / 2, transformed_input.tensor<T, 4>());
+          padding_cols - padding_cols / 2,
+          To32Bit(transformed_input.tensor<T, 4>()));
       input = transformed_input;
     }
 
@@ -296,9 +301,9 @@ struct LaunchConvOp<GPUDevice, T> {
                          filter.dim_size(0), filter.dim_size(1)}),
                      &transformed_filter));
 
-      functor::TransformFilter<GPUDevice, T>()(
-          ctx->eigen_device<GPUDevice>(), filter.tensor<T, 4>(),
-          transformed_filter.tensor<T, 4>());
+      functor::TransformFilter<GPUDevice, T, int>()(
+          ctx->eigen_device<GPUDevice>(), To32Bit(filter.tensor<T, 4>()),
+          To32Bit(transformed_filter.tensor<T, 4>()));
 
       auto input_ptr = AsDeviceMemory(input.template flat<T>().data(),
                                       input.template flat<T>().size());
@@ -346,16 +351,16 @@ namespace functor {
       const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair); \
   extern template struct MatMulConvFunctor<GPUDevice, T>; \
   template <> \
-  void TransformFilter<GPUDevice, T>::operator()( \
-      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \
-      typename TTypes<T, 4>::Tensor out); \
-  extern template struct TransformFilter<GPUDevice, T>; \
+  void TransformFilter<GPUDevice, T, int>::operator()( \
+      const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in, \
+      typename TTypes<T, 4, int>::Tensor out); \
+  extern template struct TransformFilter<GPUDevice, T, int>; \
   template <> \
-  void PadInput<GPUDevice, T>::operator()( \
-      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \
+  void PadInput<GPUDevice, T, int>::operator()( \
+      const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in, \
       int padding_rows_left, int padding_rows_right, int padding_cols_left, \
-      int padding_cols_right, typename TTypes<T, 4>::Tensor out); \
-  extern template struct PadInput<GPUDevice, T>
+      int padding_cols_right, typename TTypes<T, 4, int>::Tensor out); \
+  extern template struct PadInput<GPUDevice, T, int>
 
 DECLARE_GPU_SPEC(float);
 #undef DECLARE_GPU_SPEC
@@ -5,12 +5,14 @@
 #include "tensorflow/core/kernels/conv_2d.h"
 
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/public/tensor.h"
 
 namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
-template struct functor::InflatePadAndShuffle<GPUDevice, float, 4>;
+template struct functor::InflatePadAndShuffle<GPUDevice, float, 4, int>;
+template struct functor::InflatePadAndShuffle<GPUDevice, float, 4,
+                                              Eigen::DenseIndex>;
 } // namespace tensorflow
 
 #endif // GOOGLE_CUDA
@@ -9,13 +9,18 @@
 namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
-template struct functor::ShuffleAndReverse<GPUDevice, float, 4>;
+template struct functor::ShuffleAndReverse<GPUDevice, float, 4, int>;
+template struct functor::ShuffleAndReverse<GPUDevice, float, 4,
+                                           Eigen::DenseIndex>;
 
-template struct functor::TransformFilter<GPUDevice, float>;
+template struct functor::TransformFilter<GPUDevice, float, int>;
 
-template struct functor::PadInput<GPUDevice, float>;
+template struct functor::PadInput<GPUDevice, float, int>;
 
-template struct functor::TransformDepth<GPUDevice, float>;
+template struct functor::TransformDepth<GPUDevice, float, int>;
+// TODO(jiayq): currently pooling ops still use DenseIndex, so I am keeping it
+// here.
+template struct functor::TransformDepth<GPUDevice, float, Eigen::DenseIndex>;
 
 } // namespace tensorflow
@@ -86,11 +86,10 @@ class LinearAlgebraOp : public LinearAlgebraOpBase {
   explicit LinearAlgebraOp(OpKernelConstruction* context)
       : LinearAlgebraOpBase(context) {}
 
-  using ConstMatrixMap =
-      Eigen::Map<const Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic,
-                                     Eigen::RowMajor>>;
-  using MatrixMap = Eigen::Map<
-      Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
+  using Matrix =
+      Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+  using ConstMatrixMap = Eigen::Map<const Matrix>;
+  using MatrixMap = Eigen::Map<Matrix>;
 
   // Perform the actual computation on the input matrix, and store the results
   // in the output. This will be called repeatedly for a single call to
@@ -1,3 +1,4 @@
+#include <string>
 #include <unordered_set>
 #include <utility>
 
@@ -70,6 +71,7 @@ class ListDiffOp : public OpKernel {
                           ListDiffOp<type>)
 
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_LISTDIFF);
+REGISTER_LISTDIFF(string);
 #undef REGISTER_LISTDIFF
 
 } // namespace tensorflow
@@ -1,6 +1,7 @@
 // See docs in ../ops/linalg_ops.cc.
 #include <cmath>
 
+#include "third_party/eigen3/Eigen/Cholesky"
 #include "third_party/eigen3/Eigen/LU"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -35,6 +36,7 @@ class MatrixInverseOp
     }
   }
 
+  using typename LinearAlgebraOp<Scalar, SupportsBatchOperationT>::Matrix;
   using typename LinearAlgebraOp<Scalar, SupportsBatchOperationT>::MatrixMap;
   using
       typename LinearAlgebraOp<Scalar, SupportsBatchOperationT>::ConstMatrixMap;
@@ -44,15 +46,36 @@ class MatrixInverseOp
     OP_REQUIRES(context, input.rows() == input.cols(),
                 errors::InvalidArgument("Input matrix must be square."));
     if (input.rows() == 0) {
-      // By definition, an empty matrix's inverse is an emptry matrix.
+      // By definition, an empty matrix's inverse is an empty matrix.
       return;
     }
-    Eigen::FullPivLU<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic,
-                                   Eigen::RowMajor>> lu_decomposition(input);
-    OP_REQUIRES(context, lu_decomposition.isInvertible(),
+    if (input.isApprox(input.transpose())) {
+      // Matrix is symmetric, compute Cholesky factorization
+      // input = L * L^T.
+      Eigen::LLT<Matrix, Eigen::Lower> cholesky_decomposition(input);
+      if (cholesky_decomposition.info() == Eigen::Success) {
+        // Cholesky succeeded => Matrix was SPD.
+        output->noalias() = cholesky_decomposition.solve(
+            Matrix::Identity(input.rows(), input.cols()));
+        return;
+      }
+    }
+    Eigen::PartialPivLU<Matrix> lu_decomposition(input);
+    // While PartialPivLU cannot give strong guarantees on invertibility,
+    // we can at least guard against exact zero pivots. This can occur as
+    // a result of basic user mistakes, such as providing integer valued
+    // matrices that are exactly singular, or due to underflow if this
+    // code is run with denormals being flushed to zero.
+    // TODO(rmlarsen): Add check based on condition number estimation.
+    const Scalar min_abs_pivot =
+        lu_decomposition.matrixLU().diagonal().cwiseAbs().minCoeff();
+    OP_REQUIRES(context, min_abs_pivot > Scalar(0),
                 errors::InvalidArgument("Input is not invertible."));
-    *output = lu_decomposition.inverse();
+    output->noalias() = lu_decomposition.inverse();
   }
 
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(MatrixInverseOp);
 };
 
 REGISTER_LINALG_OP("MatrixInverse", (MatrixInverseOp<float, false>), float);
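The rewritten CPU path tries a Cholesky (LLT) factorization first whenever the input is symmetric, since for symmetric positive-definite matrices it is roughly twice as fast as LU, and otherwise falls back to PartialPivLU guarded against exact zero pivots. A standalone sketch of the same decision logic in plain Eigen (it mirrors the hunk above; illustrative only):

    #include "third_party/eigen3/Eigen/Cholesky"
    #include "third_party/eigen3/Eigen/LU"

    using Matrix =
        Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;

    // Returns false when the matrix has an exactly-zero pivot (singular).
    bool Invert(const Matrix& input, Matrix* output) {
      if (input.isApprox(input.transpose())) {
        Eigen::LLT<Matrix, Eigen::Lower> llt(input);
        if (llt.info() == Eigen::Success) {  // Cholesky succeeded => SPD.
          output->noalias() =
              llt.solve(Matrix::Identity(input.rows(), input.cols()));
          return true;
        }
      }
      Eigen::PartialPivLU<Matrix> lu(input);
      const float min_abs_pivot =
          lu.matrixLU().diagonal().cwiseAbs().minCoeff();
      if (!(min_abs_pivot > 0.0f)) return false;
      output->noalias() = lu.inverse();
      return true;
    }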
@@ -99,13 +99,13 @@ perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory,
 
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
 #define DECLARE_GPU_SPEC(T) \
   template <> \
-  void TransformDepth<GPUDevice, T>::operator()( \
+  void TransformDepth<GPUDevice, T, Eigen::DenseIndex>::operator()( \
       const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \
       const Eigen::DSizes<Eigen::DenseIndex, 4>& shuffle, \
       typename TTypes<T, 4>::Tensor out); \
-  extern template struct TransformDepth<GPUDevice, T>;
+  extern template struct TransformDepth<GPUDevice, T, Eigen::DenseIndex>;
 
 DECLARE_GPU_SPEC(float);
 #undef DECLARE_GPU_SPEC
@@ -172,7 +172,7 @@ void DnnPoolingGradOp<T>::Compute(
     // For AvgPoolGrad, the original input tensor is not necessary. However,
     // cudnn still requires them to run, although they do not affect the
     // results.
-    functor::TransformDepth<GPUDevice, T>()(
+    functor::TransformDepth<GPUDevice, T, Eigen::DenseIndex>()(
        context->eigen_device<Device>(), tensor_in->tensor<T, 4>(),
        nhwc_to_nchw, transformed_input.tensor<T, 4>());
   }
@@ -180,11 +180,11 @@ void DnnPoolingGradOp<T>::Compute(
     // For AvgPoolGrad, the original output tensor is not necessary. However,
     // cudnn still requires them to run, although they do not affect the
     // results.
-    functor::TransformDepth<GPUDevice, T>()(
+    functor::TransformDepth<GPUDevice, T, Eigen::DenseIndex>()(
        context->eigen_device<Device>(), tensor_out->tensor<T, 4>(),
        nhwc_to_nchw, transformed_output.tensor<T, 4>());
   }
-  functor::TransformDepth<GPUDevice, T>()(
+  functor::TransformDepth<GPUDevice, T, Eigen::DenseIndex>()(
      context->eigen_device<Device>(), out_backprop.tensor<T, 4>(),
      nhwc_to_nchw, transformed_output_backprop.tensor<T, 4>());
 
@@ -239,7 +239,7 @@ void DnnPoolingGradOp<T>::Compute(
   /// Transform the output data from NCHW back to NHWC
   auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
   auto nchw_to_nhwc = Eigen::DSizes<Eigen::DenseIndex, 4>(0, 2, 3, 1);
-  functor::TransformDepth<GPUDevice, T>()(
+  functor::TransformDepth<GPUDevice, T, Eigen::DenseIndex>()(
      context->eigen_device<Device>(),
      toConstTensor(transformed_input_backprop).template tensor<T, 4>(),
      nchw_to_nhwc, output->tensor<T, 4>());
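The sparse matmul hunk below introduces SparseSlice, which packs three nonzeros from the same row into one Index3 record (a shared uint8 row plus three uint8 columns) so the inner loop can issue one 3-way multiply-add per record. A scalar sketch of how one record is consumed; the buffers and their layout here are assumptions for illustration, and the real kernel uses the vectorized MULADD3WAY macro below:

    #include <cstdint>

    struct Index3 {
      uint8_t m;           // row within the slice
      uint8_t k1, k2, k3;  // three column indices in that row
    };

    // One Index3 plus its three values from data3 contributes
    // data3[j] * rhs_row(k_j) to the output row `m`, for j = 0..2.
    void ApplyIndex3(const Index3& idx, const float* data3,
                     const float* const* rhs_rows,  // rhs, indexed by row
                     float* out_row, int n) {
      const float* r1 = rhs_rows[idx.k1];
      const float* r2 = rhs_rows[idx.k2];
      const float* r3 = rhs_rows[idx.k3];
      for (int j = 0; j < n; ++j) {
        // Scalar form of SCALAR_MULADD3WAY from the hunk below.
        out_row[j] += data3[0] * r1[j] + data3[1] * r2[j] + data3[2] * r3[j];
      }
    }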
@@ -3,71 +3,427 @@
 #define EIGEN_USE_THREADS
 
 #include "tensorflow/core/common_runtime/device.h"
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/lib/core/blocking_counter.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/util/work_sharder.h"
+#include "tensorflow/core/platform/port.h"
 
 namespace tensorflow {
 
+namespace {
+
+typedef Eigen::Tensor<float, 2, Eigen::RowMajor> Matrix;
+typedef Eigen::DSizes<Eigen::DenseIndex, 2> DSizes;
+typedef Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor>,
+                         Eigen::Aligned> MatrixMap;
+typedef Eigen::TensorMap<Eigen::Tensor<const float, 2, Eigen::RowMajor>,
+                         Eigen::Aligned> ConstMatrixMap;
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-template <typename T>
-void PrefetchBlockNTA(const T& tensor, int si, int ei, int sj, int ej) {
-  for (int i = si; i < ei; ++i) {
-    for (int j = sj; j < ej; j = j + 16) {
-      port::prefetch<port::PREFETCH_HINT_NTA>(&tensor(i, j));
-    }
+// Blocksizes
+// TODO(agarwal): compute these sizes based on cache sizes.
+static const int K = 64;
+static const int M = 64;
+static const int N = 128;
+
+// This stores a sparse representation of a slice of a matrix with size
+// (num_rows, num_cols). The slice is represented as a series of blocks of size
+// (num_rows, b), where b = block_size for all but the last block, which may
+// have fewer columns.
+//
+// num_rows and block_size are assumed to be <= 256. This allows storing
+// different indices as uint8.
+//
+// For each block, we store all the non zero entries in data/data3 vector and
+// the corresponding coordinates of the element in index/index3 vectors. index3
+// vector stores index of 3 elements in the same row so that these elements can
+// share the same row coordinate. Each entry in Index3 corresponds to 3 entries
+// in data3.
+//
+// Note that all the data/indices of all the blocks are stored in the same
+// vectors respectively. To identify block boundaries, we store the block
+// offsets using index3_offset/index_offset. If there are n blocks in the slice,
+// index3_offset and index_offset have n entries. The indices for the ith block
+// are the values in the following range:
+// [index3[index3_offset[i-1]], index3[index3_offset[i]]). Similarly for
+// index_offset.
+struct SparseSlice {
+ public:
+  // Indices of three elements on the same row.
+  struct Index3 {
+    uint8 m;  // row
+    // columns
+    uint8 k1;
+    uint8 k2;
+    uint8 k3;
+  };
+
+  // Index of one element.
+  struct Index {
+    uint8 m;
+    uint8 k;
+  };
+
+  SparseSlice(int nrows, int ncols, int bsize)
+      : num_rows(nrows), num_cols(ncols), block_size(bsize) {
+    DCHECK_LE(nrows, 256);
+    DCHECK_LE(block_size, 256);
   }
-}
 
-template <typename T>
-void PrefetchBlockT1(const T& tensor, int si, int ei, int sj, int ej) {
-  for (int i = si; i < ei; ++i) {
-    for (int j = sj; j < ej; j = j + 16) {
-      port::prefetch<port::PREFETCH_HINT_T1>(&tensor(i, j));
-    }
-  }
-}
+  // Initializes the slice with data starting at mat(0, col_offset) and with
+  // size (num_rows, num_cols).
+  // If Transpose is true, implicitly transposes mat.
+  template <bool Transpose = false>
+  void Initialize(const ConstMatrixMap& mat, int col_offset);
 
-struct Block {
-  Block(int sm, int em, int sk, int ek, int sn, int en)
-      : startm(sm), endm(em), startk(sk), endk(ek), startn(sn), endn(en) {}
+  void Clear();
 
-  int startm;
-  int endm;
-  int startk;
-  int endk;
-  int startn;
-  int endn;
+  // See comments above.
+  std::vector<int> index3_offset;
+  std::vector<Index3> index3;
+  std::vector<float> data3;
+
+  // See comments above. Similar to "index3" except that each element in
+  // "index" corresponds to one element in data.
+  std::vector<int> index_offset;
+  std::vector<Index> index;
+  std::vector<float> data;
+
+  // Number of rows and columns for the slice.
+  const int num_rows;
+  const int num_cols;
+
+  // Block size used to initialize from a matrix.
+  const int block_size;
 };
 
-bool NextBlock(const int Bm, const int Bk, const int Bn, const int m_start,
-               const int m, const int k, const int n, const Block& b,
-               Block* next) {
-  *next = b;
-  if (b.endk < k) {
-    next->startk = b.endk;
-    next->endk = std::min(b.endk + Bk, k);
-  } else {
-    next->startk = 0;
-    next->endk = std::min(Bk, k);
-    if (b.endm < m) {
-      next->startm = b.endm;
-      next->endm = std::min(b.endm + Bm, m);
-    } else {
-      next->startm = m_start;
-      next->endm = std::min(m_start + Bm, m);
-      next->startn = b.endn;
-      next->endn = std::min(b.endn + Bn, n);
+template <bool Transpose>
+void SparseSlice::Initialize(const ConstMatrixMap& mat, int col_offset) {
+  const int mat_rows = Transpose ? mat.dimension(1) : mat.dimension(0);
+  const int mat_cols = Transpose ? mat.dimension(0) : mat.dimension(1);
+  DCHECK_LE(num_rows, mat_rows);
+  DCHECK_LE(num_cols + col_offset, mat_cols);
+
+  int num_blocks = (num_cols + block_size - 1) / block_size;
+  int mat_size = num_rows * num_cols;
+
+  index3_offset.reserve(num_blocks);
+  data3.reserve(mat_size);
+  index3.reserve(mat_size / 3);
+
+  index_offset.reserve(num_blocks);
+  data.reserve(num_blocks * num_rows * 2);
+  index.reserve(num_blocks * num_rows * 2);
+
+  Index3 idx3;
+  Index idx;
+  int data3_size = 0;
+  for (int i = 0; i < num_blocks; ++i) {
+    int num_block_cols =
+        std::min(block_size, num_cols - block_size * (num_blocks - 1));
+    for (int row = 0; row < num_rows; ++row) {
+      idx3.m = static_cast<uint8>(row);
+      const float* start =
+          Transpose ? &mat(col_offset, row) : &mat(row, col_offset);
+      const float* curr = start;
+      const int stride = Transpose ? mat.dimension(1) : 1;
+      const float* end = start + stride * num_block_cols;
+      uint8 k = 0;
+#define NEXT_ELEM \
+  curr += stride; \
+  ++k;
+      while (true) {
+        while (curr < end && (*curr == 0)) {
+          NEXT_ELEM;
+        }
+        if (curr >= end) break;
+        idx3.k1 = k;
+        data3.push_back(*curr);
+        NEXT_ELEM;
+
+        while (curr < end && (*curr == 0)) {
+          NEXT_ELEM;
+        }
+        if (curr >= end) break;
+        idx3.k2 = k;
+        data3.push_back(*curr);
+        NEXT_ELEM;
+
+        while (curr < end && (*curr == 0)) {
+          NEXT_ELEM;
+        }
+        if (curr >= end) break;
+        idx3.k3 = k;
+        data3.push_back(*curr);
+        NEXT_ELEM;
+        index3.push_back(idx3);
+#undef NEXT_ELEM
+      }
+      int num_inserted_mod = data3.size() % 3;
+      // Move some elements to index and data if needed.
+      data3_size = data3.size() - num_inserted_mod;
+      idx.m = idx3.m;
+      switch (num_inserted_mod) {
+        case 2:
+          idx.k = idx3.k2;
+          data.push_back(data3[data3_size + 1]);
+          index.push_back(idx);
+          TF_FALLTHROUGH_INTENDED;
+        case 1:
+          idx.k = idx3.k1;
+          data.push_back(data3[data3_size]);
+          index.push_back(idx);
+          data3.resize(data3_size);
+      }
+    }
+    col_offset += block_size;
+    index3_offset.push_back(index3.size());
+    index_offset.push_back(index.size());
+  }
+  DCHECK_EQ(index3_offset.size(), num_blocks);
+  DCHECK_EQ(index_offset.size(), num_blocks);
+  DCHECK_EQ(3 * index3.size(), data3.size());
+  DCHECK_EQ(index.size(), data.size());
+}
+
+void SparseSlice::Clear() {
+  index3_offset.clear();
+  index3.clear();
+  data3.clear();
+  index_offset.clear();
+  index.clear();
+  data.clear();
+}
+
+#define SCALAR_MULADD(a, inp, out) *out++ += *a * *inp++;
+
+#define SCALAR_MULADD3WAY(a1, a2, a3, inp1, inp2, inp3, out) \
+  *out++ += *a1 * *inp1++ + *a2 * *inp2++ + *a3 * *inp3++;
+
+typedef Eigen::internal::packet_traits<float>::type Packet;
+static const int kNumOperands = (sizeof(Packet) / sizeof(float));
+#define LOAD(x) Eigen::internal::pload<Packet>(x);
+#define STORE(x, y) Eigen::internal::pstore<float>(x, y);
+#define LOAD_SCALAR(x, y) const auto y = Eigen::internal::pload1<Packet>(x);
+#define FMA(a, b, c, d) d = Eigen::internal::pmadd<Packet>(a, b, c);
+
+// Vectorized version of SCALAR_MULADD.
+#define MULADD(a, inp, out) \
+  do { \
+    const auto b = LOAD(inp); \
+    inp += kNumOperands; \
+    auto c = LOAD(out); \
+    FMA(a, b, c, c); \
+    STORE(out, c); \
+    out += kNumOperands; \
+  } while (false)
+
+// Vectorized version of SCALAR_MULADD3WAY.
+#define MULADD3WAY(a1, a2, a3, inp1, inp2, inp3, out) \
+  do { \
+    auto c = LOAD(out); \
+    const auto b1 = LOAD(inp1); \
+    inp1 += kNumOperands; \
+    const auto b2 = LOAD(inp2); \
+    inp2 += kNumOperands; \
+    const auto b3 = LOAD(inp3); \
+    inp3 += kNumOperands; \
+    FMA(a1, b1, c, c); \
+    FMA(a2, b2, c, c); \
+    FMA(a3, b3, c, c); \
+    STORE(out, c); \
+    out += kNumOperands; \
+  } while (false)
+
+#ifdef EIGEN_VECTORIZE_AVX2
+// Unroll MULADD3WAY for two iterations
+#define MULADD3WAY_16(a1, a2, a3, inp1, inp2, inp3, out) \
+  do { \
+    auto c1 = LOAD(out); \
+    const auto b1 = LOAD(inp1); \
+    const auto b2 = LOAD(inp2); \
+    const auto b3 = LOAD(inp3); \
+    \
+    auto c2 = LOAD(out + kNumOperands); \
+    const auto b4 = LOAD(inp1 + kNumOperands); \
+    const auto b5 = LOAD(inp2 + kNumOperands); \
+    const auto b6 = LOAD(inp3 + kNumOperands); \
+    \
+    FMA(a1, b1, c1, c1); \
+    FMA(a1, b4, c2, c2); \
+    FMA(a2, b2, c1, c1); \
+    FMA(a2, b5, c2, c2); \
+    FMA(a3, b3, c1, c1); \
+    FMA(a3, b6, c2, c2); \
+    STORE(out, c1); \
+    STORE(out + kNumOperands, c2); \
+    out += 2 * kNumOperands; \
|
||||||
|
inp1 += 2 * kNumOperands; \
|
||||||
|
inp2 += 2 * kNumOperands; \
|
||||||
|
inp3 += 2 * kNumOperands; \
|
||||||
|
} while (false)
|
||||||
|
// Further unroll MULADD3WAY.
|
||||||
|
#define MULADD3WAY_32(a1, a2, a3, inp1, inp2, inp3, out) \
|
||||||
|
MULADD3WAY_16(a1, a2, a3, inp1, inp2, inp3, out); \
|
||||||
|
MULADD3WAY_16(a1, a2, a3, inp1, inp2, inp3, out);
|
||||||
|
#define MULADD3WAY_128(a1, a2, a3, inp1, inp2, inp3, out) \
|
||||||
|
MULADD3WAY_32(a1, a2, a3, inp1, inp2, inp3, out); \
|
||||||
|
MULADD3WAY_32(a1, a2, a3, inp1, inp2, inp3, out); \
|
||||||
|
MULADD3WAY_32(a1, a2, a3, inp1, inp2, inp3, out); \
|
||||||
|
MULADD3WAY_32(a1, a2, a3, inp1, inp2, inp3, out);
|
||||||
|
#else
|
||||||
|
#define MULADD3WAY_128(a1, a2, a3, inp1, inp2, inp3, out) \
|
||||||
|
for (int __i = 0; __i < 128 / (4 * kNumOperands); ++__i) { \
|
||||||
|
MULADD3WAY(a1, a2, a3, inp1, inp2, inp3, out); \
|
||||||
|
MULADD3WAY(a1, a2, a3, inp1, inp2, inp3, out); \
|
||||||
|
MULADD3WAY(a1, a2, a3, inp1, inp2, inp3, out); \
|
||||||
|
MULADD3WAY(a1, a2, a3, inp1, inp2, inp3, out); \
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Computes product of "left_slices" with "num_cols" columns of "right", and
|
||||||
|
// stores the output in *"output".
|
||||||
|
// Note that left_slices is a list of SparseSlices, which are conceptually
|
||||||
|
// assumed to be concatenated along the column dimension. Also each SparseSlice
|
||||||
|
// is encoded as a list of blocks with upto N columns. See SparseSlice for more
|
||||||
|
// details.
|
||||||
|
template <int Cols>
|
||||||
|
inline void GEPP(const std::vector<SparseSlice*>& left_slices,
|
||||||
|
const ConstMatrixMap& right, const int num_cols,
|
||||||
|
Matrix* output) {
|
||||||
|
const int cols = (Cols == -1) ? num_cols : Cols;
|
||||||
|
DCHECK_EQ(num_cols, cols);
|
||||||
|
const int right_num_cols = right.dimension(1);
|
||||||
|
const int output_num_cols = output->dimension(1);
|
||||||
|
const int cols_mod = cols % kNumOperands;
|
||||||
|
int k_offset = 0;
|
||||||
|
// Pre-compute pointers for output matrix.
|
||||||
|
float* out_ptrs[M];
|
||||||
|
float* const out_start = &(*output)(0, 0);
|
||||||
|
for (int j = 0; j < M; ++j) {
|
||||||
|
out_ptrs[j] = out_start + output_num_cols * j;
|
||||||
|
}
|
||||||
|
for (const auto* left_slice : left_slices) {
|
||||||
|
const auto& left = *left_slice;
|
||||||
|
const float* data3 = (left.data3.size() > 0) ? &left.data3[0] : nullptr;
|
||||||
|
const float* data = (left.data.size() > 0) ? &left.data[0] : nullptr;
|
||||||
|
const int num_blocks = left.index3_offset.size();
|
||||||
|
int begin3 = 0;
|
||||||
|
int begin = 0;
|
||||||
|
for (int i = 0; i < num_blocks; ++i) {
|
||||||
|
// Pre-compute pointers for right matrix
|
||||||
|
const float* right_ptrs[K];
|
||||||
|
const float* const right_start = &right(k_offset, 0);
|
||||||
|
DCHECK_LT(k_offset, right.dimension(0));
|
||||||
|
for (int j = 0; j < K; ++j) {
|
||||||
|
right_ptrs[j] = right_start + right_num_cols * j;
|
||||||
|
}
|
||||||
|
|
||||||
|
const int end3 = left.index3_offset[i];
|
||||||
|
int j = begin3;
|
||||||
|
// Loop unrolled for 2 iterations.
|
||||||
|
for (; j + 1 < end3; j += 2) {
|
||||||
|
const float* sl1 = data3++;
|
||||||
|
LOAD_SCALAR(sl1, l1);
|
||||||
|
const float* sl2 = data3++;
|
||||||
|
LOAD_SCALAR(sl2, l2);
|
||||||
|
const float* sl3 = data3++;
|
||||||
|
LOAD_SCALAR(sl3, l3);
|
||||||
|
const float* nsl1 = data3++;
|
||||||
|
LOAD_SCALAR(nsl1, nl1);
|
||||||
|
const float* nsl2 = data3++;
|
||||||
|
LOAD_SCALAR(nsl2, nl2);
|
||||||
|
const float* nsl3 = data3++;
|
||||||
|
LOAD_SCALAR(nsl3, nl3);
|
||||||
|
const SparseSlice::Index3& index = left.index3[j];
|
||||||
|
const SparseSlice::Index3& nindex = left.index3[j + 1];
|
||||||
|
float* out = out_ptrs[index.m];
|
||||||
|
float* nout = out_ptrs[nindex.m];
|
||||||
|
const float* r1 = right_ptrs[index.k1];
|
||||||
|
const float* r2 = right_ptrs[index.k2];
|
||||||
|
const float* r3 = right_ptrs[index.k3];
|
||||||
|
const float* nr1 = right_ptrs[nindex.k1];
|
||||||
|
const float* nr2 = right_ptrs[nindex.k2];
|
||||||
|
const float* nr3 = right_ptrs[nindex.k3];
|
||||||
|
if (cols == 128) {
|
||||||
|
MULADD3WAY_128(l1, l2, l3, r1, r2, r3, out);
|
||||||
|
MULADD3WAY_128(nl1, nl2, nl3, nr1, nr2, nr3, nout);
|
||||||
|
} else {
|
||||||
|
for (int n = 0; n < cols / kNumOperands; ++n) {
|
||||||
|
MULADD3WAY(l1, l2, l3, r1, r2, r3, out);
|
||||||
|
MULADD3WAY(nl1, nl2, nl3, nr1, nr2, nr3, nout);
|
||||||
|
}
|
||||||
|
for (int k = 0; k < cols_mod; ++k) {
|
||||||
|
SCALAR_MULADD3WAY(sl1, sl2, sl3, r1, r2, r3, out);
|
||||||
|
SCALAR_MULADD3WAY(nsl1, nsl2, nsl3, nr1, nr2, nr3, nout);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (j < end3) {
|
||||||
|
const float* sl1 = data3++;
|
||||||
|
LOAD_SCALAR(sl1, l1);
|
||||||
|
const float* sl2 = data3++;
|
||||||
|
LOAD_SCALAR(sl2, l2);
|
||||||
|
const float* sl3 = data3++;
|
||||||
|
LOAD_SCALAR(sl3, l3);
|
||||||
|
const SparseSlice::Index3& index = left.index3[j];
|
||||||
|
float* out = out_ptrs[index.m];
|
||||||
|
const float* r1 = right_ptrs[index.k1];
|
||||||
|
const float* r2 = right_ptrs[index.k2];
|
||||||
|
const float* r3 = right_ptrs[index.k3];
|
||||||
|
if (cols == 128) {
|
||||||
|
MULADD3WAY_128(l1, l2, l3, r1, r2, r3, out);
|
||||||
|
} else {
|
||||||
|
for (int n = 0; n < cols / kNumOperands; ++n) {
|
||||||
|
MULADD3WAY(l1, l2, l3, r1, r2, r3, out);
|
||||||
|
}
|
||||||
|
for (int k = 0; k < cols_mod; ++k) {
|
||||||
|
SCALAR_MULADD3WAY(sl1, sl2, sl3, r1, r2, r3, out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
begin3 = end3;
|
||||||
|
int end = left.index_offset[i];
|
||||||
|
for (int j = begin; j < end; ++j) {
|
||||||
|
const float* sl = data++;
|
||||||
|
LOAD_SCALAR(sl, l);
|
||||||
|
const SparseSlice::Index& index = left.index[j];
|
||||||
|
const float* r = right_ptrs[index.k];
|
||||||
|
float* out = out_ptrs[index.m];
|
||||||
|
for (int n = 0; n < cols / kNumOperands; ++n) {
|
||||||
|
MULADD(l, r, out);
|
||||||
|
}
|
||||||
|
for (int k = 0; k < cols_mod; ++k) {
|
||||||
|
SCALAR_MULADD(sl, r, out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
k_offset += left.block_size;
|
||||||
|
begin = end;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return next->startn == next->endn;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#undef SCALAR_MULADD
|
||||||
|
#undef SCALAR_MULADD3WAY
|
||||||
|
#undef LOAD
|
||||||
|
#undef STORE
|
||||||
|
#undef LOAD_SCALAR
|
||||||
|
#undef FMA
|
||||||
|
#undef MULADD
|
||||||
|
#undef MULADD3WAY
|
||||||
|
#undef MULADD3WAY_16
|
||||||
|
#undef MULADD3WAY_32
|
||||||
|
#undef MULADD3WAY_128
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
class SparseMatMulOp : public OpKernel {
|
class SparseMatMulOp : public OpKernel {
|
||||||
public:
|
public:
|
||||||
explicit SparseMatMulOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
|
explicit SparseMatMulOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
|
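Aside: the packing scheme above is easier to see in isolation. The following standalone sketch (`Triple`, `Single`, and `PackRow` are illustrative names, not part of the commit) mirrors what `SparseSlice::Initialize` does per row: non-zero columns are emitted in groups of three (`index3`/`data3`), and the zero to two leftovers per row spill into the singles lists (`index`/`data`) — which is why GEPP needs both a three-way and a one-way multiply-add path.

```cpp
// Standalone sketch of the triple-packing idea used by SparseSlice::Initialize.
// Types here are simplified stand-ins for Index3/Index.
#include <cstdint>
#include <vector>

struct Triple { uint8_t m, k1, k2, k3; };  // mirrors SparseSlice::Index3
struct Single { uint8_t m, k; };           // mirrors SparseSlice::Index

void PackRow(const float* row, int num_cols, uint8_t m,
             std::vector<Triple>* triples, std::vector<float>* data3,
             std::vector<Single>* singles, std::vector<float>* data) {
  std::vector<uint8_t> ks;
  std::vector<float> vs;
  for (int k = 0; k < num_cols; ++k) {
    if (row[k] != 0) {
      ks.push_back(static_cast<uint8_t>(k));
      vs.push_back(row[k]);
    }
  }
  size_t i = 0;
  for (; i + 3 <= ks.size(); i += 3) {  // full groups of three non-zeros
    triples->push_back({m, ks[i], ks[i + 1], ks[i + 2]});
    data3->insert(data3->end(), {vs[i], vs[i + 1], vs[i + 2]});
  }
  for (; i < ks.size(); ++i) {          // 0-2 stragglers become singles
    singles->push_back({m, ks[i]});
    data->push_back(vs[i]);
  }
}
```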
@@ -80,7 +436,6 @@ class SparseMatMulOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     const Tensor& a = ctx->input(0);
     const Tensor& b = ctx->input(1);
-
     OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a.shape()),
                 errors::InvalidArgument("a is not a matrix"));
     OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b.shape()),
@@ -115,10 +470,11 @@ class SparseMatMulOp : public OpKernel {
           left.contract(right_mat, dim_pair);
       return;
     }
-    typedef Eigen::Tensor<float, 2, Eigen::RowMajor> Matrix;
     std::unique_ptr<Matrix> right_tr_mat;
     std::unique_ptr<TTypes<float>::ConstMatrix> right_tr_map;
     if (transpose_b_) {
+      // TODO(agarwal): avoid transposing the matrix here and directly handle
+      // transpose in CreateDenseSlices.
       right_tr_mat.reset(new Matrix(k, n));
       Eigen::array<int, 2> perm({1, 0});
       right_tr_mat->device(ctx->template eigen_device<CPUDevice>()) =
@@ -129,56 +485,75 @@ class SparseMatMulOp : public OpKernel {
     TTypes<float>::ConstMatrix& right =
         transpose_b_ ? *right_tr_map : right_mat;
 
-    const bool transpose_a = transpose_a_;
-    typedef Eigen::TensorMap<Eigen::Tensor<float, 1, Eigen::RowMajor>,
-                             Eigen::Unaligned> TensorMap;
-    typedef Eigen::TensorMap<Eigen::Tensor<const float, 1, Eigen::RowMajor>,
-                             Eigen::Unaligned> ConstTensorMap;
-    typedef Eigen::DSizes<Eigen::DenseIndex, 1> DSizes;
-    const int Bm = 16;
-    const int Bk = 16;
-    const int Bn = 1024;
-
-    auto work_shard = [m, n, k, transpose_a, Bm, Bk, Bn, &left, &right, &out](
-        int64 start64, int64 end64) {
-      const int start = static_cast<int>(start64);
-      const int end = static_cast<int>(end64);
-      Block curr(start, std::min(start + Bm, end), 0, std::min(Bk, k), 0,
-                 std::min(Bn, n));
-      Block next(curr);
-      bool done = false;
-      for (int i = start; i < end; ++i) {
-        out.chip<0>(i).setZero();
-      }
-      while (true) {
-        done = NextBlock(Bm, Bk, Bn, start, end, k, n, curr, &next);
-
-        PrefetchBlockT1(right, curr.startk, curr.endk, curr.startn, curr.endn);
-
-        // Process current block
-        for (int i = curr.startm; i < curr.endm; ++i) {
-          PrefetchBlockNTA(left, i, i + 1, curr.startk, curr.endk);
-          PrefetchBlockNTA(out, i, i + 1, curr.startn, curr.endn);
-          DSizes out_slice_shape(curr.endn - curr.startn);
-          TensorMap out_i(&out(i, curr.startn), out_slice_shape);
-          for (int j = curr.startk; j < curr.endk; ++j) {
-            const float l = transpose_a ? left(j, i) : left(i, j);
-            if (l == 0) continue;
-            ConstTensorMap right_j(&right(j, curr.startn), out_slice_shape);
-            out_i += right_j * l;
-          }
-        }
-        if (done) break;
-        curr = next;
-      }
-    };
-    auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
-    Shard(worker_threads.num_threads, worker_threads.workers, m, 2 * k * n,
-          work_shard);
+    SparseMatMul(left, right, transpose_a_,
+                 ctx->device()->tensorflow_cpu_worker_threads(), &out);
   }
 
  private:
+  // Perform matrix multiplication of "left" and "right", and store the result
+  // in *"output".
+  static inline void SparseMatMul(
+      const ConstMatrixMap& left, const ConstMatrixMap& right,
+      bool transpose_left, const DeviceBase::CpuWorkerThreads* thread_pool,
+      MatrixMap* output);
+
+  // Computes multiplication of left and num_cols columns of right, and stores
+  // the output block in *"output" at offsets "output_row_offset" and
+  // "output_col_offset". If assign is true, assigns the value to that block,
+  // else adds the values to the existing values.
+  static inline void ComputeOutputBlock(const std::vector<SparseSlice*>& left,
+                                        const ConstMatrixMap& right,
+                                        const int num_cols,
+                                        int output_row_offset,
+                                        int output_col_offset, bool assign,
+                                        MatrixMap* output);
+
+  // Encodes "mat" using a sparse representation and stores that in
+  // "mat_slices". "mat" is broken into a grid with sizes "slice_num_rows" and
+  // "slice_num_cols", each grid element is converted into a SparseSlice and
+  // stored in mat_slices. "slice_block_size" is used to perform further column
+  // blocking of each slice.
+  static inline BlockingCounter* CreateSparseSlices(
+      const ConstMatrixMap& mat, bool transpose, int slice_num_rows,
+      int slice_block_size, int slice_num_cols,
+      std::vector<std::vector<SparseSlice*>>* mat_slices,
+      const DeviceBase::CpuWorkerThreads* thread_pool);
+
+  // This function chops "mat" along column dimension into pieces with at most N
+  // columns, and concatenates the pieces one after the other in "buffer". It
+  // returns the list of the pieces in "slices". It returns a BlockingCounter
+  // which should be used to wait for the shuffle operations to complete.
+  static inline BlockingCounter* CreateDenseSlices(
+      const ConstMatrixMap& mat, int row_start, int num_rows, int col_start,
+      int num_cols, const DeviceBase::CpuWorkerThreads* thread_pool,
+      Matrix* buffer, std::vector<ConstMatrixMap*>* slices);
+
+  // Helper function for CreateDenseSlices to move the data around. It returns a
+  // BlockingCounter which should be used to wait for the shuffle operations to
+  // complete.
+  static inline BlockingCounter* ShuffleMatrix(
+      const ConstMatrixMap& mat, int slice_row_start, int slice_num_rows,
+      int slice_col_start, int slice_num_cols, const int N,
+      const DeviceBase::CpuWorkerThreads* thread_pool, Matrix* buffer);
+
+  // Helper function for CreateDenseSlices to create slices.
+  static inline void SliceMatrix(const Matrix& mat, const int num_rows,
+                                 const int num_slices,
+                                 std::vector<ConstMatrixMap*>* slices);
+
+  // Heuristics to compute various block sizes.
+  // KR, NR: block sizes for "right". We run blocking iterations that operate on
+  // matrices with at most this size.
+  // KL: grid size along the column dimension used while encoding left.
+  // IB, JB: number of left and right slices to multiply together. This is used
+  // for ordering different ComputeBlockOutput operations inside each blocking
+  // iteration so as to potentially reduce the working set size.
+  static inline void ComputeBlockSizes(const ConstMatrixMap& left,
+                                       const ConstMatrixMap& right,
+                                       bool transpose_left, int num_threads,
+                                       int* KR, int* NR, int* KL, int* JB,
+                                       int* IB);
+
   bool transpose_a_;
   bool transpose_b_;
   bool a_is_sparse_;
@@ -186,6 +561,329 @@ class SparseMatMulOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(SparseMatMulOp);
 };
 
+inline void SparseMatMulOp::ComputeOutputBlock(
+    const std::vector<SparseSlice*>& left, const ConstMatrixMap& right,
+    const int num_cols, int output_row_offset, int output_col_offset,
+    bool assign, MatrixMap* output) {
+  const int num_rows = left[0]->num_rows;
+  const int rhs_num_cols = right.dimension(1);
+  DCHECK_LE(num_cols, rhs_num_cols);
+  Matrix out(num_rows, rhs_num_cols);
+  out.setZero();
+  if (num_cols == N) {
+    GEPP<N>(left, right, num_cols, &out);
+  } else {
+    GEPP<-1>(left, right, num_cols, &out);
+  }
+  if (!assign) {
+    const Eigen::array<int, 2> begin = {output_row_offset, output_col_offset};
+    const Eigen::array<int, 2> sizes = {num_rows, num_cols};
+    if (num_cols == rhs_num_cols) {
+      output->slice(begin, sizes) += out;
+    } else {
+      static const Eigen::array<int, 2> zero = {0, 0};
+      output->slice(begin, sizes) += out.slice(zero, sizes);
+    }
+  } else {
+    // output->slice(begin, sizes) = out.slice(zero, sizes), implemented
+    // using memcpy.
+    for (int i = 0; i < num_rows; ++i) {
+      memcpy(&(*output)(output_row_offset + i, output_col_offset), &out(i, 0),
+             num_cols * sizeof(float));
+    }
+  }
+}
+
+inline BlockingCounter* SparseMatMulOp::CreateSparseSlices(
+    const ConstMatrixMap& mat, bool transpose, int slice_num_rows,
+    int slice_block_size, int slice_num_cols,
+    std::vector<std::vector<SparseSlice*>>* mat_slices,
+    const DeviceBase::CpuWorkerThreads* thread_pool) {
+  const int mat_num_rows = transpose ? mat.dimension(1) : mat.dimension(0);
+  const int mat_num_cols = transpose ? mat.dimension(0) : mat.dimension(1);
+  const int num_slices_dim0 =
+      std::max(1, (mat_num_rows + slice_num_rows - 1) / slice_num_rows);
+  const int num_slices_dim1 =
+      std::max(1, (mat_num_cols + slice_num_cols - 1) / slice_num_cols);
+  mat_slices->resize(num_slices_dim0);
+  BlockingCounter* counter =
+      new BlockingCounter(num_slices_dim0 * num_slices_dim1);
+  auto work = [counter, transpose](SparseSlice* sparse_slice,
+                                   ConstMatrixMap* slice, int col_offset) {
+    if (transpose) {
+      sparse_slice->Initialize<true>(*slice, col_offset);
+    } else {
+      sparse_slice->Initialize<false>(*slice, col_offset);
+    }
+    delete slice;
+    counter->DecrementCount();
+  };
+  for (int i = 0; i < num_slices_dim0; ++i) {
+    (*mat_slices)[i].resize(num_slices_dim1);
+    int num_rows =
+        std::min<int>(slice_num_rows, mat_num_rows - i * slice_num_rows);
+    for (int j = 0; j < num_slices_dim1; ++j) {
+      int num_cols =
+          std::min<int>(slice_num_cols, mat_num_cols - j * slice_num_cols);
+      ConstMatrixMap* slice = nullptr;
+      if (transpose) {
+        slice =
+            new ConstMatrixMap(&mat(0, i * slice_num_rows), mat.dimensions());
+      } else {
+        DSizes d(num_rows, mat_num_cols);
+        slice = new ConstMatrixMap(&mat(i * slice_num_rows, 0), d);
+      }
+      SparseSlice* sparse_slice =
+          new SparseSlice(num_rows, num_cols, slice_block_size);
+      (*mat_slices)[i][j] = sparse_slice;
+      thread_pool->workers->Schedule(
+          std::bind(work, sparse_slice, slice, slice_num_cols * j));
+    }
+  }
+  return counter;
+}
+
+inline BlockingCounter* SparseMatMulOp::ShuffleMatrix(
+    const ConstMatrixMap& mat, int slice_row_start, int slice_num_rows,
+    int slice_col_start, int slice_num_cols, const int N,
+    const DeviceBase::CpuWorkerThreads* thread_pool, Matrix* buffer) {
+  int num_threads = std::min(thread_pool->num_threads, 16);
+  BlockingCounter* counter = new BlockingCounter(num_threads);
+  DCHECK_EQ(N, buffer->dimension(1));
+  auto shuffle_work = [&mat, slice_row_start, slice_num_rows, slice_col_start,
+                       slice_num_cols, N, buffer, counter](int s, int e) {
+    const int row_start = s % slice_num_rows + slice_row_start;
+    const int col_start = s / slice_num_rows * N + slice_col_start;
+    float* out_start = &(*buffer)(s, 0);
+    const float* input_start = &mat(row_start, col_start);
+    const float* input_end = &mat(slice_row_start + slice_num_rows - 1,
+                                  slice_col_start + slice_num_cols - 1);
+    const int mat_num_cols = mat.dimension(1);
+    const int row_slice_size = slice_num_rows * mat_num_cols;
+
+    const int aligned_end = slice_num_cols / N * slice_num_rows;
+    const int e1 = std::min(e, aligned_end);
+    while (s < e1) {
+      memcpy(out_start, input_start, N * sizeof(float));
+      out_start += N;
+      input_start += mat_num_cols;
+      if (input_start > input_end) {
+        input_start = input_start - row_slice_size + N;
+      }
+      ++s;
+    }
+    int s1 = std::max(s, aligned_end);
+    const int copy_num_cols = slice_num_cols % N;
+    while (s1 < e) {
+      memcpy(out_start, input_start, copy_num_cols * sizeof(float));
+      out_start += N;
+      input_start += mat_num_cols;
+      ++s1;
+    }
+    if (counter) counter->DecrementCount();
+  };
+
+  int start = 0;
+  int end = 0;
+  int num_out_rows = (slice_num_cols + N - 1) / N * slice_num_rows;
+  DCHECK_LE(num_out_rows, buffer->dimension(0));
+  for (int i = std::max(1, num_threads); i > 0; --i) {
+    end = start + num_out_rows / i;
+    thread_pool->workers->Schedule(std::bind(shuffle_work, start, end));
+    num_out_rows -= (end - start);
+    start = end;
+  }
+  return counter;
+}
+
+inline void SparseMatMulOp::SliceMatrix(const Matrix& mat, const int num_rows,
+                                        const int num_slices,
+                                        std::vector<ConstMatrixMap*>* slices) {
+  slices->resize(num_slices);
+  DSizes d(num_rows, mat.dimension(1));
+  DCHECK_LE(num_rows * num_slices, mat.dimension(0));
+  for (int i = 0; i < num_slices; ++i) {
+    (*slices)[i] = new ConstMatrixMap(&mat(i * num_rows, 0), d);
+  }
+}
+
+inline BlockingCounter* SparseMatMulOp::CreateDenseSlices(
+    const ConstMatrixMap& mat, int row_start, int num_rows, int col_start,
+    int num_cols, const DeviceBase::CpuWorkerThreads* thread_pool,
+    Matrix* buffer, std::vector<ConstMatrixMap*>* slices) {
+  BlockingCounter* shuffle_counter = ShuffleMatrix(
+      mat, row_start, num_rows, col_start, num_cols, N, thread_pool, buffer);
+  const int num_slices = (num_cols + N - 1) / N;
+  SliceMatrix(*buffer, num_rows, num_slices, slices);
+  return shuffle_counter;
+}
+
+inline void SparseMatMulOp::ComputeBlockSizes(const ConstMatrixMap& left,
+                                              const ConstMatrixMap& right,
+                                              bool transpose_left,
+                                              int num_threads, int* KR, int* NR,
+                                              int* KL, int* JB, int* IB) {
+  // Heuristics for calculating block sizes
+  // Assume two hyperthreads per core.
+  const int est_num_cores = std::max(1, (num_threads + 1) / 2);
+  // Use block of rhs with at most 128K floats per core.
+  const int mem = est_num_cores * 128 * 1024;
+  *KR = std::min(static_cast<int>(right.dimension(0)), mem / 256);
+  *NR = right.dimension(1);
+  if (*KR * *NR > mem) {
+    // 4096 may be enough to amortize the cost of writes.
+    *KR = std::min<int>(*KR, 4096);
+  }
+  // Use sizes that are multiples of K and 256.
+  *KR = std::max(1, *KR / K) * K;
+  *NR = std::max(1, *NR / 256) * 256;
+  if (*KR * *NR > mem) {
+    *NR = mem / *KR;
+  }
+  *NR = std::max(1, *NR / 256) * 256;
+
+  const int left_dim0 = transpose_left ? left.dimension(1) : left.dimension(0);
+  const int left_dim1 = transpose_left ? left.dimension(0) : left.dimension(1);
+  for (*KL = 1024; *KL > K; *KL /= 2) {
+    if (*KR % *KL == 0 &&
+        std::max<int>(1, left_dim0 / 64) * (left_dim1 / *KL) > est_num_cores) {
+      break;
+    }
+  }
+  DCHECK_EQ(*KL % K, 0);
+  DCHECK_GE(*KR, *KL);
+  if (*KR < right.dimension(0)) {
+    CHECK_EQ(*KR % *KL, 0);
+  }
+
+  *JB = std::max(1, static_cast<int>(sqrt(num_threads) / 2.0));
+  *IB = 8 * *JB;
+  DCHECK_EQ(N * sizeof(float) % 64, 0);
+}
+
+// Here is an overview of the SparseMatMul code. Note that we assume that the
+// left matrix is sparse.
+//
+// The matrix "left" is divided into a grid with blocksize of (M, KL). Each
+// block is encoded as a SparseSlice. These grid elements are stored as
+// std::vector<std::vector<SparseSlice>>. Each element of the outer vector
+// represents M rows of the left matrix. Let's call these elements l_i and
+// let's call each element of the inner vector L_mk.
+//
+// The matrix "right" is divided into a grid with block size KR * NR. Let's
+// denote the blocks on the right as R_kn. Note that we ensure that KL divides
+// KR so that for each element R_kn, we don't need to multiply it with any
+// partial L_mk blocks.
+//
+// We then multiply each right side block R_kn with the full "left" matrix and
+// update the output. These iterations are run sequentially since R_kn are
+// packed into the same underlying temporary buffer.
+//
+// In each iteration we do the following:
+// 1. Create slices r_j of R_kn: We split R_kn into vertical blocks with N
+//    (=128) columns and then concatenate these slices into a buffer. This is
+//    done so that each slice r_j of R_kn is stored contiguously in memory. Note
+//    that if R_kj has dimensions (KR, NR), we create NR / N slices, and the
+//    buffer has dimensions (KR * NR / N, N) (assuming N divides NR).
+// 2. For each (l_i, r_j), we compute the inner product using the GEPP function
+//    and update the output block o_ij. These calls are further blocked to
+//    reduce the working set size. In each iteration we take IB elements from
+//    {l_i} and JB elements from {r_j} and compute the IB * JB inner products.
+inline void SparseMatMulOp::SparseMatMul(
+    const ConstMatrixMap& left, const ConstMatrixMap& right,
+    bool transpose_left, const DeviceBase::CpuWorkerThreads* thread_pool,
+    MatrixMap* output) {
+  const int num_threads = thread_pool->num_threads;
+  int KR, NR, KL, JB, IB;
+  ComputeBlockSizes(left, right, transpose_left, num_threads, &KR, &NR, &KL,
+                    &JB, &IB);
+
+  // Slice the left matrix
+  std::vector<std::vector<SparseSlice*>> left_slices;
+  std::unique_ptr<BlockingCounter> sparse_slice_counter;
+  sparse_slice_counter.reset(
+      CreateSparseSlices(ConstMatrixMap(left.data(), left.dimensions()),
+                         transpose_left, M, K, KL, &left_slices, thread_pool));
+  const int num_left_slices = left_slices.size();
+
+  const int right_dim0 = right.dimension(0);
+  const int right_dim1 = right.dimension(1);
+  // Allocate buffer for storing slices of right matrix.
+  // Note buffer needs enough space to hold at most a KR * NR matrix since that
+  // is the block size per iteration.
+  const int buffer_num_rows =
+      std::min(KR, right_dim0) * (std::min(NR, right_dim1) + N - 1) / N;
+  Matrix buffer(buffer_num_rows, N);
+  std::vector<ConstMatrixMap*> right_slices;
+
+  std::vector<SparseSlice*> block_left_slices;
+  std::vector<std::function<void(void)>> tasks;
+  // Number of blocks based on block sizes of KR * NR.
+  const int num_k_blocks = (right_dim0 + KR - 1) / KR;
+  const int num_n_blocks = (right_dim1 + NR - 1) / NR;
+  std::unique_ptr<BlockingCounter> dense_slice_counter;
+
+  for (int nb = 0; nb < num_n_blocks; ++nb) {
+    const int right_num_cols =
+        std::min(NR, static_cast<int>(right_dim1 - NR * nb));
+    for (int kb = 0; kb < num_k_blocks; ++kb) {
+      const int right_num_rows =
+          std::min(KR, static_cast<int>(right_dim0 - KR * kb));
+      dense_slice_counter.reset(CreateDenseSlices(
+          right, kb * KR, right_num_rows, nb * NR, right_num_cols, thread_pool,
+          &buffer, &right_slices));
+      const int num_right_slices = right_slices.size();
+      tasks.reserve(num_left_slices * num_right_slices);
+      for (int j_outer = 0; j_outer < num_right_slices; j_outer += JB) {
+        for (int i_outer = 0; i_outer < num_left_slices; i_outer += IB) {
+          for (int j_inner = j_outer;
+               j_inner < std::min(num_right_slices, j_outer + JB); ++j_inner) {
+            const int num_cols = std::min(N, right_num_cols - N * j_inner);
+            for (int i_inner = i_outer;
+                 i_inner < std::min(num_left_slices, i_outer + IB); ++i_inner) {
+              // Figure out which left slices to use.
+              block_left_slices.clear();
+              int begin = kb * KR / KL;
+              int end = std::min<int>((kb + 1) * KR / KL,
+                                      (right.dimension(0) + KL - 1) / KL);
+              DCHECK_LT(begin, end);
+              block_left_slices.insert(block_left_slices.begin(),
+                                       left_slices[i_inner].begin() + begin,
+                                       left_slices[i_inner].begin() + end);
+              tasks.push_back(std::bind(
+                  &SparseMatMulOp::ComputeOutputBlock, block_left_slices,
+                  std::ref(*right_slices[j_inner]), num_cols, M * i_inner,
+                  N * j_inner + nb * NR, kb == 0, output));
+            }
+          }
+        }
+      }
+      if (sparse_slice_counter) {
+        sparse_slice_counter->Wait();
+        sparse_slice_counter.reset(nullptr);
+      }
+      if (dense_slice_counter) {
+        dense_slice_counter->Wait();
+        dense_slice_counter.reset(nullptr);
+      }
+      BlockingCounter bc(tasks.size());
+      for (const auto& t : tasks) {
+        thread_pool->workers->Schedule([&bc, &t]() {
+          t();
+          bc.DecrementCount();
+        });
+      }
+      bc.Wait();
+      tasks.clear();
+      gtl::STLDeleteElements(&right_slices);
+      right_slices.clear();
+    }
+  }
+  for (auto& left_slice : left_slices) {
+    gtl::STLDeleteElements(&left_slice);
+  }
+}
 
 REGISTER_KERNEL_BUILDER(Name("SparseMatMul").Device(DEVICE_CPU),
                         SparseMatMulOp);
 
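Aside: a minimal standalone sketch of the repacking that `ShuffleMatrix`/`CreateDenseSlices` perform (illustrative names and simplified types; assumes, as the overview comment does, that N divides NR). A KR x NR block of the right matrix is copied into a tall (KR * NR / N) x N buffer so that each N-column panel r_j becomes contiguous, letting GEPP stream right-hand rows with unit stride.

```cpp
// Repack a KR x NR block of a larger row-major matrix into contiguous
// N-column panels, one panel after the other.
#include <cstring>
#include <vector>

void PackPanels(const float* mat, int mat_num_cols,  // source matrix stride
                int row_start, int kr, int col_start, int nr, int n,
                std::vector<float>* buffer) {
  const int num_panels = nr / n;  // assume n divides nr
  buffer->resize(static_cast<size_t>(num_panels) * kr * n);
  float* out = buffer->data();
  for (int p = 0; p < num_panels; ++p) {
    for (int r = 0; r < kr; ++r) {
      const float* src =
          mat + static_cast<size_t>(row_start + r) * mat_num_cols +
          col_start + p * n;
      std::memcpy(out, src, n * sizeof(float));  // one panel row at a time
      out += n;
    }
  }
}
```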
@@ -20,6 +20,8 @@ void Sparsify(Tensor* t, float sparsity) {
   for (int64 i = 0; i < N; ++i) {
     if (rnd.Uniform(K) < sparsity * K) {
       flat(i) = 0;
+    } else if (flat(i) == 0) {
+      flat(i) = 0.1;
     }
   }
 }
@@ -86,19 +88,29 @@ static Graph* MultiSparseMatMul(int m, int n, int d, float sparsity_a,
   return g;
 }
 
 #define BM_SPARSE(M, K, N, S)                                                 \
   static void BM_Sparse##_##M##_##K##_##N##_##S(int iters) {                  \
-    testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2);       \
-    std::string label = strings::Printf("%d_%d_%d_%0.2f", M, K, N, S / 100.0); \
-    testing::SetLabel(label);                                                 \
-    test::Benchmark("cpu", SparseMatMul(M, N, K, S / 100.0, false, false))    \
-        .Run(iters);                                                          \
-  }                                                                           \
+    testing::StopTiming();                                                    \
+    testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2);       \
+    std::string label;                                                        \
+    if (S == 0) {                                                             \
+      label = strings::Printf("%d*%d*%d_Eigen", M, K, N);                     \
+    } else {                                                                  \
+      label = strings::Printf("%d*%d*%d_sparsity:%0.2f", M, K, N, S / 100.0); \
+    }                                                                         \
+    testing::SetLabel(label);                                                 \
+    testing::UseRealTime();                                                   \
+    auto g = SparseMatMul(M, N, K, S / 100.0, false, false);                  \
+    testing::StartTiming();                                                   \
+    test::Benchmark("cpu", g).Run(iters);                                     \
+  }                                                                           \
   BENCHMARK(BM_Sparse##_##M##_##K##_##N##_##S);
 
 BM_SPARSE(2048, 2048, 2048, 0);
 BM_SPARSE(2048, 2048, 2048, 1);
+BM_SPARSE(2048, 2048, 2048, 50);
 BM_SPARSE(2048, 2048, 2048, 85);
+BM_SPARSE(2048, 2048, 2048, 99);
 
 BM_SPARSE(1024, 1024, 1024, 0);
 BM_SPARSE(1024, 1024, 1024, 1);
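Aside: the new macro bodies all follow the same timing discipline — graph construction is excluded from the measurement. Written out without the macro, a sketch (assuming the `SparseMatMul` graph-building helper declared earlier in this test file, which returns a `Graph*`) looks like this:

```cpp
// Hypothetical benchmark following the same pattern as the updated macros:
// stop the timer during setup, measure only the Run() loop.
static void BM_SparseExample(int iters) {
  testing::StopTiming();  // don't charge graph construction to the benchmark
  testing::ItemsProcessed(static_cast<int64>(iters) * 1024 * 1024 * 1024 * 2);
  testing::UseRealTime();
  Graph* g = SparseMatMul(1024, 1024, 1024, 0.85 /* sparsity */, false, false);
  testing::StartTiming();  // measure only the session runs
  test::Benchmark("cpu", g).Run(iters);
}
BENCHMARK(BM_SparseExample);
```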
@@ -107,28 +119,34 @@ BM_SPARSE(1024, 1024, 1024, 85);
 BM_SPARSE(256, 256, 256, 1);
 BM_SPARSE(512, 512, 512, 1);
 
 #define BM_SPARSE_MULTI(M, K, N, S1, S2)                                      \
   static void BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2(int iters) {    \
-    testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2 * 3);   \
-    std::string label = strings::Printf("%d_%d_%d_%0.2f_%0.2f", M, K, N,      \
-                                        S1 / 100.0, S2 / 100.0);              \
-    testing::SetLabel(label);                                                 \
-    test::Benchmark("cpu", MultiSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0)) \
-        .Run(iters);                                                          \
-  }                                                                           \
+    testing::StopTiming();                                                    \
+    testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2 * 3);   \
+    std::string label = strings::Printf("%d_%d_%d_%0.2f_%0.2f", M, K, N,      \
+                                        S1 / 100.0, S2 / 100.0);              \
+    testing::SetLabel(label);                                                 \
+    testing::UseRealTime();                                                   \
+    auto g = MultiSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0);              \
+    testing::StartTiming();                                                   \
+    test::Benchmark("cpu", g).Run(iters);                                     \
+  }                                                                           \
   BENCHMARK(BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2);
 
-BM_SPARSE_MULTI(512, 2140, 4096, 0, 82);
-BM_SPARSE_MULTI(512, 4096, 2048, 83, 83);
+BM_SPARSE_MULTI(1024, 2140, 4096, 0, 82);
+BM_SPARSE_MULTI(1024, 4096, 2048, 83, 83);
 
 #define BM_SPARSE_TR(M, K, N, S, TA, TB)                                      \
   static void BM_Sparse##_##M##_##K##_##N##_##S##_##TA##_##TB(int iters) {    \
+    testing::StopTiming();                                                    \
     testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2);       \
     std::string label =                                                       \
         strings::Printf("%d_%d_%d_%d_%d_%0.2f", M, K, N, TA, TB, S / 100.0);  \
     testing::SetLabel(label);                                                 \
-    test::Benchmark("cpu", SparseMatMul(M, N, K, S / 100.0, TA, TB))          \
-        .Run(iters);                                                          \
+    testing::UseRealTime();                                                   \
+    auto g = SparseMatMul(M, N, K, S / 100.0, TA, TB);                        \
+    testing::StartTiming();                                                   \
+    test::Benchmark("cpu", g).Run(iters);                                     \
   }                                                                           \
   BENCHMARK(BM_Sparse##_##M##_##K##_##N##_##S##_##TA##_##TB);
 
@@ -25,7 +25,7 @@ class StringToHashBucketOp : public OpKernel {
                                              &output_tensor));
     auto output_flat = output_tensor->flat<int64>();
 
-    for (std::size_t i = 0; i < input_flat.size(); ++i) {
+    for (size_t i = 0; i < input_flat.size(); ++i) {
       const uint64 input_hash = Hash64(input_flat(i));
       const uint64 bucket_id = input_hash % num_buckets_;
       // The number of buckets is always in the positive range of int64 so is
@@ -17,6 +17,12 @@ bool StringToValue<int32>(const string& content, int* value) {
   return str_util::NumericParse32(content, value);
 }
 
+template <>
+bool StringToValue<string>(const string& content, string* value) {
+  *value = content;
+  return true;
+}
+
 // Parse a single argument by linearly searching through the command table.
 // The input format is: --argument=value.
 // Return OK if the argument is used. It stores the extracted value into the
@@ -27,7 +33,7 @@ bool StringToValue<int32>(const string& content, int* value) {
 template <typename T>
 Status ParseArgument(const string& argument) {
   for (auto& command :
-       internal::CommandLineFlagRegistry<int>::Instance()->commands) {
+       internal::CommandLineFlagRegistry<T>::Instance()->commands) {
     string prefix = strings::StrCat("--", command.name, "=");
     if (tensorflow::StringPiece(argument).starts_with(prefix)) {
       string content = argument.substr(prefix.length());
@@ -62,6 +68,7 @@ Status ParseArgument<bool>(const string& argument) {
   return Status(error::NOT_FOUND,
                 strings::StrCat("Unknown command: ", argument));
 }
+
 }  // namespace
 
 Status ParseCommandLineFlags(int* argc, char* argv[]) {
@@ -81,6 +88,11 @@ Status ParseCommandLineFlags(int* argc, char* argv[]) {
     if (s.ok()) {
       continue;
     }
+    // Search string commands.
+    s = ParseArgument<string>(argv[index]);
+    if (s.ok()) {
+      continue;
+    }
     if (s.code() != error::NOT_FOUND) {
       return s;
     }
@@ -43,11 +43,14 @@ struct CommandLineFlagRegister {
 }  // namespace internal
 
 #define TF_DEFINE_int32(name, default_value, text) \
-  TF_DEFINE_variable(int32, name, default_value, text);
+  TF_DEFINE_variable(tensorflow::int32, name, default_value, text);
 
 #define TF_DEFINE_bool(name, default_value, text) \
   TF_DEFINE_variable(bool, name, default_value, text);
 
+#define TF_DEFINE_string(name, default_value, text) \
+  TF_DEFINE_variable(string, name, default_value, text);
+
 // Parse argv[1]..argv[*argc-1] to options. Remove used arguments from the argv.
 // Returns the number of unused arguments in *argc.
 // Return error Status if the parsing encounters errors.
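Aside: a hypothetical usage sketch for the new string flags. It assumes the declarations live in namespace `tensorflow` and that `TF_DEFINE_variable` expands to a `FLAGS_`-prefixed global, which is how the label_image example added in this commit consumes its flags; the flag name here is made up for illustration.

```cpp
#include "tensorflow/core/lib/core/command_line_flags.h"
#include "tensorflow/core/platform/logging.h"

// Registers a string-typed flag named --data_dir (hypothetical example).
TF_DEFINE_string(data_dir, "/tmp/data", "Where the input files live.");

int main(int argc, char* argv[]) {
  tensorflow::Status s = tensorflow::ParseCommandLineFlags(&argc, argv);
  if (!s.ok()) {
    LOG(ERROR) << s.ToString();
    return -1;
  }
  // After parsing, e.g. --data_dir=/home/me/data has been consumed from argv
  // and the generated global holds "/home/me/data".
  LOG(INFO) << "data_dir: " << FLAGS_data_dir;
  return 0;
}
```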
@@ -859,10 +859,10 @@ REGISTER_OP("ListDiff")
     .Output("idx: int32")
     .Attr("T: type")
     .Doc(R"doc(
-Computes the difference between two lists of numbers.
+Computes the difference between two lists of numbers or strings.
 
 Given a list `x` and a list `y`, this operation returns a list `out` that
-represents all numbers that are in `x` but not in `y`. The returned list `out`
+represents all values that are in `x` but not in `y`. The returned list `out`
 is sorted in the same order that the numbers appear in `x` (duplicates are
 preserved). This operation also returns a list `idx` that represents the
 position of each `out` element in `x`. In other words:
@@ -35,7 +35,14 @@ REGISTER_OP("MatrixInverse")
     .Output("output: T")
     .Attr("T: {float, double}")
     .Doc(R"doc(
-Calculates the inverse of a square invertible matrix. Checks for invertibility.
+Calculates the inverse of a square invertible matrix.
+
+The op uses the Cholesky decomposition if the matrix is symmetric positive
+definite and LU decomposition with partial pivoting otherwise.
+
+If the matrix is not invertible there is no guarantee what the op does. It
+may detect the condition and raise an exception or it may simply return a
+garbage result.
 
 input: Shape is `[M, M]`.
 output: Shape is `[M, M]` containing the matrix inverse of the input.
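For reference, the two factorizations the revised doc names compute the inverse as follows (standard linear algebra, not text from the commit):

```latex
% Symmetric positive definite input: Cholesky factorization
A = L L^{T} \quad\Longrightarrow\quad A^{-1} = L^{-T} L^{-1}
% General invertible input: LU with partial pivoting
P A = L U \quad\Longrightarrow\quad A^{-1} = U^{-1} L^{-1} P
```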
@@ -47,12 +54,19 @@ REGISTER_OP("BatchMatrixInverse")
     .Output("output: T")
     .Attr("T: {float, double}")
     .Doc(R"doc(
-Calculates the inverse of square invertible matrices. Checks for invertibility.
+Calculates the inverse of square invertible matrices.
 
 The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
 form square matrices. The output is a tensor of the same shape as the input
 containing the inverse for all input submatrices `[..., :, :]`.
 
+The op uses the Cholesky decomposition if the matrices are symmetric positive
+definite and LU decomposition with partial pivoting otherwise.
+
+If a matrix is not invertible there is no guarantee what the op does. It
+may detect the condition and raise an exception or it may simply return a
+garbage result.
+
 input: Shape is `[..., M, M]`.
 output: Shape is `[..., M, M]`.
 T: The type of values in the input and output.
@@ -186,8 +186,8 @@ extern void TF_SetTarget(TF_SessionOptions* options, const char* target);
 // config should be a serialized brain.ConfigProto proto.
 // If config was not parsed successfully as a ConfigProto, record the
 // error information in *status.
-extern void TF_SetConfig(TF_SessionOptions* options, const char* config,
-                         size_t config_len, TF_Status* status);
+extern void TF_SetConfig(TF_SessionOptions* options, const void* proto,
+                         size_t proto_len, TF_Status* status);
 
 // Destroy an options object.
 extern void TF_DeleteSessionOptions(TF_SessionOptions*);
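Aside: a hypothetical caller sketch for the revised pointer-plus-length signature. The generated header path for the ConfigProto message is an assumption, as is which fields you set; the point is only that any serialized buffer can now be passed as `const void*` without a char-pointer cast.

```cpp
#include <string>

#include "tensorflow/core/framework/config.pb.h"  // ConfigProto; path assumed
#include "tensorflow/core/public/tensorflow_c_api_placeholder.h"  // c_api.h in-tree; path assumed

void ConfigureSession(TF_SessionOptions* opts, TF_Status* status) {
  tensorflow::ConfigProto config;
  // ... set whatever ConfigProto fields the session should use ...
  std::string serialized;
  config.SerializeToString(&serialized);
  // New signature: any serialized buffer plus its length.
  TF_SetConfig(opts, serialized.data(), serialized.size(), status);
}
```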
@@ -10,11 +10,11 @@
 #include <fstream>
 #include <sstream>
 
+#include "google/protobuf/io/coded_stream.h"
+#include "google/protobuf/io/zero_copy_stream_impl.h"
+#include "google/protobuf/io/zero_copy_stream_impl_lite.h"
+#include "google/protobuf/message_lite.h"
 #include "tensorflow/core/platform/logging.h"
-#include "google/protobuf/src/google/protobuf/io/zero_copy_stream_impl.h"
-#include "google/protobuf/src/google/protobuf/io/zero_copy_stream_impl_lite.h"
-#include "google/protobuf/src/google/protobuf/io/coded_stream.h"
-#include "google/protobuf/src/google/protobuf/message_lite.h"
 
 static const char* const ASSET_PREFIX = "file:///android_asset/";
 
 tensorflow/examples/label_image/BUILD  (new file, 30 lines)
@@ -0,0 +1,30 @@
+# Description:
+#   Tensorflow C++ inference example for labeling images.
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+cc_binary(
+    name = "label_image",
+    srcs = ["main.cc"],
+    linkopts = ["-lm"],
+    deps = [
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:tensorflow",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+            "bin/**",
+            "gen/**",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
 tensorflow/examples/label_image/README.md  (new file, 49 lines)
@@ -0,0 +1,49 @@
+# Tensorflow C++ Image Recognition Demo
+
+This example shows how you can load a pre-trained TensorFlow network and use it
+to recognize objects in images.
+
+## Description
+
+This demo uses a Google Inception model to classify image files that are passed
+in on the command line. See
+[`googlenet_labels.txt`](data/googlenet_labels.txt)
+for the possible classifications, which are the 1,000 categories used in the
+Imagenet competition.
+
+## To build/install/run
+
+As long as you've managed to build the main TensorFlow framework, you should
+have everything you need to run this example installed already.
+
+To build it, run this command:
+
+```bash
+$ bazel build tensorflow/examples/label_image/...
+```
+
+That should build a binary executable that you can then run like this:
+
+```bash
+$ bazel-bin/tensorflow/examples/label_image/label_image
+```
+
+This uses the default example image that ships with the framework, and should
+output something similar to this:
+
+```
+I tensorflow/examples/label_image/main.cc:200] military uniform (866): 0.902268
+I tensorflow/examples/label_image/main.cc:200] bow tie (817): 0.05407
+I tensorflow/examples/label_image/main.cc:200] suit (794): 0.0113195
+I tensorflow/examples/label_image/main.cc:200] bulletproof vest (833): 0.0100269
+I tensorflow/examples/label_image/main.cc:200] bearskin (849): 0.00649746
+```
+In this case, we're using the default image of Admiral Grace Hopper, and you can
+see the network correctly spots she's wearing a military uniform, with a high
+score of 0.9.
+
+Next, try it out on your own images by supplying the --image= argument, e.g.
+
+```bash
+$ bazel-bin/tensorflow/examples/label_image/label_image --image=my_image.png
+```
1001
tensorflow/examples/label_image/data/googlenet_labels.txt
Normal file
1001
tensorflow/examples/label_image/data/googlenet_labels.txt
Normal file
File diff suppressed because it is too large
Load Diff
BIN
tensorflow/examples/label_image/data/grace_hopper.jpg
Normal file
BIN
tensorflow/examples/label_image/data/grace_hopper.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 60 KiB |
295
tensorflow/examples/label_image/main.cc
Normal file
295
tensorflow/examples/label_image/main.cc
Normal file
@ -0,0 +1,295 @@
|
|||||||
|
// A minimal but useful C++ example showing how to load an Imagenet-style object
|
||||||
|
// recognition TensorFlow model, prepare input images for it, run them through
|
||||||
|
// the graph, and interpret the results.
|
||||||
|
//
|
||||||
|
// It's designed to have as few dependencies and be as clear as possible, so
|
||||||
|
// it's more verbose than it could be in production code. In particular, using
|
||||||
|
// auto for the types of a lot of the returned values from TensorFlow calls can
|
||||||
|
// remove a lot of boilerplate, but I find them explicit types useful in sample
|
||||||
|
// code to make it simple to look up the classes involved.
|
||||||
|
//
|
||||||
|
// To use it, compile and then run in a working directory with the
|
||||||
|
// learning/brain/tutorials/label_image/data/ folder below it, and you should
|
||||||
|
// see the top five labels for the example Lena image output. You can then
|
||||||
|
// customize it to use your own models or images by changing the file names at
|
||||||
|
// the top of the main() function.
|
||||||
|
//
|
||||||
|
// The googlenet_graph.pb file included by default is created from Inception.
|
||||||
|
|
||||||
|
#include <fstream>
|
||||||
|
|
||||||
|
#include "tensorflow/cc/ops/const_op.h"
|
||||||
|
#include "tensorflow/cc/ops/image_ops.h"
|
||||||
|
#include "tensorflow/cc/ops/standard_ops.h"
|
||||||
|
#include "tensorflow/core/framework/graph.pb.h"
|
||||||
|
#include "tensorflow/core/graph/default_device.h"
|
||||||
|
#include "tensorflow/core/graph/graph_def_builder.h"
|
||||||
|
#include "tensorflow/core/lib/core/command_line_flags.h"
|
||||||
|
#include "tensorflow/core/lib/core/errors.h"
|
||||||
|
#include "tensorflow/core/lib/core/stringpiece.h"
|
||||||
|
#include "tensorflow/core/lib/core/threadpool.h"
|
||||||
|
#include "tensorflow/core/lib/io/path.h"
|
||||||
|
#include "tensorflow/core/lib/strings/stringprintf.h"
|
||||||
|
#include "tensorflow/core/platform/init_main.h"
|
||||||
|
#include "tensorflow/core/platform/logging.h"
|
||||||
|
#include "tensorflow/core/public/session.h"
|
||||||
|
#include "tensorflow/core/public/tensor.h"
|
||||||
|
|
||||||
|
// These are all common classes it's handy to reference with no namespace.
using tensorflow::Tensor;
using tensorflow::Status;
using tensorflow::string;
using tensorflow::int32;

// These are the command-line flags the program can understand.
// They define where the graph and input data are located, and what kind of
// input the model expects. If you train your own model, or use something
// other than GoogLeNet you'll need to update these.
TF_DEFINE_string(image,
                 "tensorflow/examples/label_image/data/grace_hopper.jpg",
                 "The image to classify (JPEG or PNG).");
TF_DEFINE_string(graph,
                 "tensorflow/examples/label_image/data/googlenet_graph.pb",
                 "The location of the GraphDef file containing the protobuf"
                 " definition of the network.");
TF_DEFINE_string(labels,
                 "tensorflow/examples/label_image/data/googlenet_labels.txt",
                 "A text file containing the labels of all the categories, one"
                 " per line.");
TF_DEFINE_int32(input_width, 224, "Width of the image the network expects.");
TF_DEFINE_int32(input_height, 224, "Height of the image the network expects.");
TF_DEFINE_int32(input_mean, 117, "How much to subtract from input values.");
TF_DEFINE_int32(input_std, 1, "What to divide the input values by.");
TF_DEFINE_string(input_layer, "input", "The name of the input node.");
TF_DEFINE_string(output_layer, "softmax2", "The name of the output node.");
TF_DEFINE_bool(self_test, false, "Whether to run a sanity check on the results.");
TF_DEFINE_string(root_dir, "", "The directory at the root of the data files.");

// Takes a file name, and loads a list of labels from it, one per line, and
// returns a vector of the strings. It pads with empty strings so the length
// of the result is a multiple of 16, because our model expects that.
Status ReadLabelsFile(string file_name, std::vector<string>* result) {
  std::ifstream file(file_name);
  result->clear();
  string line;
  while (std::getline(file, line)) {
    result->push_back(line);
  }
  const int padding = 16;
  while (result->size() % padding) {
    result->emplace_back();
  }
  return Status::OK();
}

// Given an image file name, read in the data, try to decode it as an image,
// resize it to the requested size, and then scale the values as desired.
Status ReadTensorFromImageFile(string file_name, const int input_height,
                               const int input_width, const float input_mean,
                               const float input_std,
                               std::vector<Tensor>* out_tensors) {
  tensorflow::GraphDefBuilder b;
  string input_name = "file_reader";
  string output_name = "normalized";
  tensorflow::Node* file_reader =
      tensorflow::ops::ReadFile(tensorflow::ops::Const(file_name, b.opts()),
                                b.opts().WithName(input_name));
  // Now try to figure out what kind of file it is and decode it.
  const int wanted_channels = 3;
  tensorflow::Node* image_reader;
  if (tensorflow::StringPiece(file_name).ends_with(".png")) {
    image_reader = tensorflow::ops::DecodePng(
        file_reader,
        b.opts().WithAttr("channels", wanted_channels).WithName("png_reader"));
  } else {
    // Assume if it's not a PNG then it must be a JPEG.
    image_reader = tensorflow::ops::DecodeJpeg(
        file_reader,
        b.opts().WithAttr("channels", wanted_channels).WithName("jpeg_reader"));
  }
  // Now cast the image data to float so we can do normal math on it.
  tensorflow::Node* float_caster = tensorflow::ops::Cast(
      image_reader, tensorflow::DT_FLOAT, b.opts().WithName("float_caster"));
  // The convention for image ops in TensorFlow is that all images are expected
  // to be in batches, so that they're four-dimensional arrays with indices of
  // [batch, height, width, channel]. Because we only have a single image, we
  // have to add a batch dimension of 1 to the start with ExpandDims().
  tensorflow::Node* dims_expander = tensorflow::ops::ExpandDims(
      float_caster, tensorflow::ops::Const(0, b.opts()), b.opts());
  // Bilinearly resize the image to fit the required dimensions.
  tensorflow::Node* resized = tensorflow::ops::ResizeBilinear(
      dims_expander, tensorflow::ops::Const({input_height, input_width},
                                            b.opts().WithName("size")),
      b.opts());
  // Subtract the mean and divide by the scale.
  tensorflow::ops::Div(
      tensorflow::ops::Sub(
          resized, tensorflow::ops::Const({input_mean}, b.opts()), b.opts()),
      tensorflow::ops::Const({input_std}, b.opts()),
      b.opts().WithName(output_name));

  // This runs the GraphDef network definition that we've just constructed, and
  // returns the results in the output tensor.
  tensorflow::GraphDef graph;
  TF_RETURN_IF_ERROR(b.ToGraphDef(&graph));
  std::unique_ptr<tensorflow::Session> session(
      tensorflow::NewSession(tensorflow::SessionOptions()));
  TF_RETURN_IF_ERROR(session->Create(graph));
  TF_RETURN_IF_ERROR(session->Run({}, {output_name}, {}, out_tensors));
  return Status::OK();
}

// Reads a model graph definition from disk, and creates a session object you
// can use to run it.
Status LoadGraph(string graph_file_name,
                 std::unique_ptr<tensorflow::Session>* session) {
  tensorflow::GraphDef graph_def;
  Status load_graph_status =
      ReadBinaryProto(tensorflow::Env::Default(), graph_file_name, &graph_def);
  if (!load_graph_status.ok()) {
    return tensorflow::errors::NotFound("Failed to load compute graph at '",
                                        graph_file_name, "'");
  }

  session->reset(tensorflow::NewSession(tensorflow::SessionOptions()));
  Status session_create_status = (*session)->Create(graph_def);
  if (!session_create_status.ok()) {
    return session_create_status;
  }
  return Status::OK();
}

// Analyzes the output of the Inception graph to retrieve the highest scores
// and their positions in the tensor, which correspond to categories.
Status GetTopLabels(const std::vector<Tensor>& outputs, int how_many_labels,
                    Tensor* indices, Tensor* scores) {
  tensorflow::GraphDefBuilder b;
  string output_name = "top_k";
  tensorflow::ops::TopK(tensorflow::ops::Const(outputs[0], b.opts()),
                        how_many_labels, b.opts().WithName(output_name));
  // This runs the GraphDef network definition that we've just constructed, and
  // returns the results in the output tensors.
  tensorflow::GraphDef graph;
  TF_RETURN_IF_ERROR(b.ToGraphDef(&graph));
  std::unique_ptr<tensorflow::Session> session(
      tensorflow::NewSession(tensorflow::SessionOptions()));
  TF_RETURN_IF_ERROR(session->Create(graph));
  // The TopK node returns two outputs, the scores and their original indices,
  // so we have to append :0 and :1 to specify them both.
  std::vector<Tensor> out_tensors;
  TF_RETURN_IF_ERROR(session->Run({}, {output_name + ":0", output_name + ":1"},
                                  {}, &out_tensors));
  *scores = out_tensors[0];
  *indices = out_tensors[1];
  return Status::OK();
}

// Given the output of a model run, and the name of a file containing the
// labels, this prints out the top five highest-scoring values.
Status PrintTopLabels(const std::vector<Tensor>& outputs,
                      string labels_file_name) {
  std::vector<string> labels;
  Status read_labels_status = ReadLabelsFile(labels_file_name, &labels);
  if (!read_labels_status.ok()) {
    LOG(ERROR) << read_labels_status;
    return read_labels_status;
  }
  const int how_many_labels = 5;
  Tensor indices;
  Tensor scores;
  TF_RETURN_IF_ERROR(GetTopLabels(outputs, how_many_labels, &indices, &scores));
  tensorflow::TTypes<float>::Flat scores_flat = scores.flat<float>();
  tensorflow::TTypes<int32>::Flat indices_flat = indices.flat<int32>();
  for (int pos = 0; pos < how_many_labels; ++pos) {
    const int label_index = indices_flat(pos);
    const float score = scores_flat(pos);
    LOG(INFO) << labels[label_index] << " (" << label_index << "): " << score;
  }
  return Status::OK();
}

// This is a testing function that returns whether the top label index is the
// one that's expected.
Status CheckTopLabel(const std::vector<Tensor>& outputs, int expected,
                     bool* is_expected) {
  *is_expected = false;
  Tensor indices;
  Tensor scores;
  const int how_many_labels = 1;
  TF_RETURN_IF_ERROR(GetTopLabels(outputs, how_many_labels, &indices, &scores));
  tensorflow::TTypes<int32>::Flat indices_flat = indices.flat<int32>();
  if (indices_flat(0) != expected) {
    LOG(ERROR) << "Expected label #" << expected << " but got #"
               << indices_flat(0);
    *is_expected = false;
  } else {
    *is_expected = true;
  }
  return Status::OK();
}

int main(int argc, char* argv[]) {
  // We need to call this to set up global state for TensorFlow.
  tensorflow::port::InitMain(argv[0], &argc, &argv);
  Status s = tensorflow::ParseCommandLineFlags(&argc, argv);
  if (!s.ok()) {
    LOG(ERROR) << "Error parsing command line flags: " << s.ToString();
    return -1;
  }

  // First we load and initialize the model.
  std::unique_ptr<tensorflow::Session> session;
  string graph_path = tensorflow::io::JoinPath(FLAGS_root_dir, FLAGS_graph);
  Status load_graph_status = LoadGraph(graph_path, &session);
  if (!load_graph_status.ok()) {
    LOG(ERROR) << load_graph_status;
    return -1;
  }

  // Get the image from disk as a float array of numbers, resized and normalized
  // to the specifications the main graph expects.
  std::vector<Tensor> resized_tensors;
  string image_path = tensorflow::io::JoinPath(FLAGS_root_dir, FLAGS_image);
  Status read_tensor_status = ReadTensorFromImageFile(
      image_path, FLAGS_input_height, FLAGS_input_width, FLAGS_input_mean,
      FLAGS_input_std, &resized_tensors);
  if (!read_tensor_status.ok()) {
    LOG(ERROR) << read_tensor_status;
    return -1;
  }
  const Tensor& resized_tensor = resized_tensors[0];

  // Actually run the image through the model.
  std::vector<Tensor> outputs;
  Status run_status = session->Run({{FLAGS_input_layer, resized_tensor}},
                                   {FLAGS_output_layer}, {}, &outputs);
  if (!run_status.ok()) {
    LOG(ERROR) << "Running model failed: " << run_status;
    return -1;
  }

  // This is for automated testing to make sure we get the expected result with
  // the default settings. We know that label 866 (military uniform) should be
  // the top label for the Admiral Hopper image.
  if (FLAGS_self_test) {
    bool expected_matches;
    Status check_status = CheckTopLabel(outputs, 866, &expected_matches);
    if (!check_status.ok()) {
      LOG(ERROR) << "Running check failed: " << check_status;
      return -1;
    }
    if (!expected_matches) {
      LOG(ERROR) << "Self-test failed!";
      return -1;
    }
  }

  // Do something interesting with the results we've generated.
  Status print_status = PrintTopLabels(outputs, FLAGS_labels);
  if (!print_status.ok()) {
    LOG(ERROR) << "Running print failed: " << print_status;
    return -1;
  }

  return 0;
}
@ -14,3 +14,5 @@ Note: Many practical aspects of usage are covered in the Mechanics tab, and
some additional documentation not specific to any particular language API is
available in the Resources tab.

* [Python API](python/index.md)
* [C++ API](cc/index.md)
@ -245,31 +245,54 @@ strided according to the `strides` argument. `strides = [1, 1, 1, 1]` applies
the filter to a patch at every offset, `strides = [1, 2, 2, 1]` applies the
filter to every other image patch in each dimension, etc.

Ignoring channels for the moment, the spatial semantics of the convolution ops
are as follows. If the 4-D `input` has shape
Ignoring channels for the moment, assume that the 4-D `input` has shape
`[batch, in_height, in_width, ...]` and the 4-D `filter` has shape
`[filter_height, filter_width, ...]`, then
`[filter_height, filter_width, ...]`, then the spatial semantics of the
convolution ops are as follows: first, according to the padding scheme chosen
as `'SAME'` or `'VALID'`, the output size and the padding pixels are computed.
For the `'SAME'` padding, the output height and width are computed as:

    shape(output) = [batch,
                     (in_height - filter_height + 1) / strides[1],
                     (in_width - filter_width + 1) / strides[2],
                     ...]
    out_height = ceil(float(in_height) / float(strides[1]))
    out_width = ceil(float(in_width) / float(strides[2]))

and the padding on the top and left are computed as:

    pad_along_height = ((out_height - 1) * strides[1] +
                        filter_height - in_height)
    pad_along_width = ((out_width - 1) * strides[2] +
                       filter_width - in_width)
    pad_top = pad_along_height / 2
    pad_left = pad_along_width / 2

Note that the division by 2 means that there might be cases when the padding on
both sides (top vs bottom, right vs left) is off by one. In this case, the
bottom and right sides always get the one additional padded pixel. For example,
when `pad_along_height` is 5, we pad 2 pixels at the top and 3 pixels at the
bottom. Note that this is different from existing libraries such as cuDNN and
Caffe, which explicitly specify the number of padded pixels and always pad the
same number of pixels on both sides.

For the `'VALID'` padding, the output height and width are computed as:

    out_height = ceil(float(in_height - filter_height + 1) / float(strides[1]))
    out_width = ceil(float(in_width - filter_width + 1) / float(strides[2]))

and the padding values are always zero. The output is then computed as

    output[b, i, j, :] =
        sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, ...] *
                     filter[di, dj, ...]
    output[b, i, j, :] =
        sum_{di, dj} input[b, strides[1] * i + di - pad_top,
                           strides[2] * j + dj - pad_left, ...] *
                     filter[di, dj, ...]

where any value outside the original input image region is considered zero (
i.e. we pad zero values around the border of the image).
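To make the `'SAME'`/`'VALID'` arithmetic concrete, here is a small worked
sketch (an editorial illustration, not part of the changed docs; plain Python
with hypothetical helper names) for a 6-pixel dimension, a 3-pixel filter, and
a stride of 2:

    import math

    def same_output_and_padding(in_size, filter_size, stride):
      # 'SAME': the output size depends only on the input size and stride.
      out_size = int(math.ceil(float(in_size) / float(stride)))
      # Total padding needed so the last window fits; the clamp at zero is
      # left implicit by the formulas above.
      pad_along = max((out_size - 1) * stride + filter_size - in_size, 0)
      pad_before = pad_along // 2          # top (or left)
      pad_after = pad_along - pad_before   # bottom (or right) gets the extra
      return out_size, pad_before, pad_after

    def valid_output(in_size, filter_size, stride):
      # 'VALID': only full windows, no padding at all.
      return int(math.ceil(float(in_size - filter_size + 1) / float(stride)))

    print(same_output_and_padding(6, 3, 2))  # (3, 0, 1): extra pixel on bottom
    print(valid_output(6, 3, 2))             # 2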
Since `input` is 4-D, each `input[b, i, j, :]` is a vector. For `conv2d`, these
vectors are multiplied by the `filter[di, dj, :, :]` matrices to produce new
vectors. For `depthwise_conv_2d`, each scalar component `input[b, i, j, k]`
is multiplied by a vector `filter[di, dj, k]`, and all the vectors are
concatenated.

In the formula for `shape(output)`, the rounding direction depends on padding:

* `padding = 'SAME'`: Round down (only full size windows are considered).
* `padding = 'VALID'`: Round up (partial windows are included).

- - -

### `tf.nn.conv2d(input, filter, strides, padding, use_cudnn_on_gpu=None, name=None)` <a class="md-anchor" id="conv2d"></a>
@ -412,14 +435,8 @@ In detail, the output is

    output[i] = reduce(value[strides * i:strides * i + ksize])

for each tuple of indices `i`. The output shape is

    shape(output) = (shape(value) - ksize + 1) / strides

where the rounding direction depends on padding:

* `padding = 'SAME'`: Round down (only full size windows are considered).
* `padding = 'VALID'`: Round up (partial windows are included).

where the indices also take the padding values into consideration. Please refer
to the `Convolution` section for details about the padding calculation.

- - -
@ -136,7 +136,7 @@ that the evidence for a class \\(i\\) given an input \\(x\\) is:

$$\text{evidence}_i = \sum_j W_{i,~ j} x_j + b_i$$

where \\(W\_i\\) is the weights and \\(b\_i\\) is the bias for class \\(i\\),
where \\(W_i\\) is the weights and \\(b_i\\) is the bias for class \\(i\\),
and \\(j\\) is an index for summing over the pixels in our input image \\(x\\).
We then convert the evidence tallies into our predicted probabilities
\\(y\\) using the "softmax" function:
@ -839,6 +839,7 @@ cpu_only_kernel_test_list = glob([
    "kernel_tests/save_restore_ops_test.py",
    "kernel_tests/segment_reduction_ops_test.py",
    "kernel_tests/sparse_concat_op_test.py",
    "kernel_tests/sparse_matmul_op_test.py",
    "kernel_tests/sparse_reorder_op_test.py",
    "kernel_tests/sparse_to_dense_op_test.py",
    "kernel_tests/sparsemask_op_test.py",
@ -333,6 +333,9 @@ class BaseSession(SessionInterface):
    # Check session.
    if self._closed:
      raise RuntimeError('Attempted to use a closed Session.')
    if self.graph.version == 0:
      raise RuntimeError('The Session graph is empty. Add operations to the '
                         'graph before calling run().')

    # Validate and process fetches.
    is_list_fetch = isinstance(fetches, (list, tuple))
@ -445,6 +445,12 @@ class SessionTest(test_util.TensorFlowTestCase):
      sess.close()
      t.join()

  def testUseEmptyGraph(self):
    with session.Session() as sess:
      with self.assertRaisesWithPredicateMatch(
          RuntimeError, lambda e: 'The Session graph is empty.' in str(e)):
        sess.run([])

  def testNotEntered(self):
    # pylint: disable=protected-access
    self.assertEqual(ops._default_session_stack.get_default(), None)
@ -214,7 +214,7 @@ import_array();
                    "but got %s" % type(config))
  status = TF_NewStatus()
  config_str = config.SerializeToString()
  _TF_SetConfig(opts, config_str, len(config_str), status)
  _TF_SetConfig(opts, config_str, status)
  if TF_GetCode(status) != 0:
    raise ValueError(TF_Message(status))
  return opts
@ -167,7 +167,14 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
  """
  # Type checks for inputs.
  if not isinstance(graph_def, graph_pb2.GraphDef):
    raise TypeError('graph_def must be a GraphDef proto.')
    # `graph_def` could be a dynamically-created message, so try a duck-typed
    # approach
    try:
      old_graph_def = graph_def
      graph_def = graph_pb2.GraphDef()
      graph_def.MergeFrom(old_graph_def)
    except TypeError:
      raise TypeError('graph_def must be a GraphDef proto.')
  if input_map is None:
    input_map = {}
  else:
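The duck-typed copy above accepts any message that merges field-for-field into
a `GraphDef`, such as one built from a dynamically generated protobuf module.
A minimal sketch of the same idea in isolation (the `graph_pb2` import is the
real TensorFlow module; the helper name is illustrative):

    from tensorflow.core.framework import graph_pb2

    def coerce_to_graph_def(candidate):
      # Anything that merges cleanly is accepted; everything else raises.
      graph_def = graph_pb2.GraphDef()
      try:
        graph_def.MergeFrom(candidate)
      except TypeError:
        raise TypeError('graph_def must be a GraphDef proto.')
      return graph_def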
@ -2458,6 +2458,7 @@ class Graph(object):
    control_ops = []
    current = self._current_control_dependencies()
    for c in control_inputs:
      c = self.as_graph_element(c)
      if isinstance(c, Tensor):
        c = c.op
      elif not isinstance(c, Operation):
@ -632,6 +632,20 @@ class ControlDependenciesTest(test_util.TensorFlowTestCase):
    # e should be dominated by c.
    self.assertEqual(e.op.control_inputs, [])

  def testBasicWithConversion(self):
    g = ops.Graph()
    a = _apply_op(g, "const", [], [types.float32])

    class ConvertibleObj(object):

      def _as_graph_element(self):
        return a

    with g.control_dependencies([ConvertibleObj()]):
      c = _apply_op(g, "const", [], [types.float32])

    self.assertEqual(c.op.control_inputs, [a.op])

  def testNested(self):
    g = ops.Graph()
    a_1 = _apply_op(g, "const", [], [types.float32])
@ -183,6 +183,13 @@ class Conv2DTest(tf.test.TestCase):
                       stride=1, padding="VALID",
                       expected=expected_output)

  def testConv2DEmpty(self):
    expected_output = []
    self._VerifyValues(tensor_in_sizes=[0, 2, 3, 3],
                       filter_in_sizes=[1, 1, 3, 3],
                       stride=1, padding="VALID",
                       expected=expected_output)

  def testConv2D2x2Filter(self):
    # The outputs are computed using third_party/py/IPython/notebook.
    expected_output = [2271.0, 2367.0, 2463.0, 2901.0, 3033.0, 3165.0]
@ -1008,4 +1015,5 @@ if __name__ == "__main__":
    setattr(Conv2DTest, "testInceptionBackFilter_" + str(index),
            GetInceptionBackFilterTest(input_size_, filter_size_, output_size_,
                                       stride_, padding_))

  tf.test.main()
@ -10,57 +10,56 @@ import numpy as np
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

_TYPES = [tf.int32, tf.int64, tf.float32, tf.float64, tf.string]


class ListDiffTest(tf.test.TestCase):

  def _testListDiff(self, x, y, out, idx, dtype=np.int32):
    x = np.array(x, dtype=dtype)
    y = np.array(y, dtype=dtype)
    out = np.array(out, dtype=dtype)
    idx = np.array(idx, dtype=dtype)
  def _testListDiff(self, x, y, out, idx):
    for dtype in _TYPES:
      if dtype == tf.string:
        x = [str(a) for a in x]
        y = [str(a) for a in y]
        out = [str(a) for a in out]

      with self.test_session() as sess:
        x_tensor = tf.convert_to_tensor(x)
        x_tensor = tf.convert_to_tensor(x, dtype=dtype)
        y_tensor = tf.convert_to_tensor(y)
        y_tensor = tf.convert_to_tensor(y, dtype=dtype)
        out_tensor, idx_tensor = tf.listdiff(x_tensor, y_tensor)
        tf_out, tf_idx = sess.run([out_tensor, idx_tensor])

        self.assertAllEqual(tf_out, out)
        self.assertAllEqual(tf_idx, idx)
        self.assertEqual(1, out_tensor.get_shape().ndims)
        self.assertEqual(1, idx_tensor.get_shape().ndims)

  def testBasic1(self):
    x = [1, 2, 3, 4]
    y = [1, 2]
    out = [3, 4]
    idx = [2, 3]
    for t in [np.int32, np.int64, np.float, np.double]:
      self._testListDiff(x, y, out, idx, dtype=t)
    self._testListDiff(x, y, out, idx)

  def testBasic2(self):
    x = [1, 2, 3, 4]
    y = [2]
    out = [1, 3, 4]
    idx = [0, 2, 3]
    for t in [np.int32, np.int64, np.float, np.double]:
      self._testListDiff(x, y, out, idx, dtype=t)
    self._testListDiff(x, y, out, idx)

  def testBasic3(self):
    x = [1, 4, 3, 2]
    y = [4, 2]
    out = [1, 3]
    idx = [0, 2]
    for t in [np.int32, np.int64, np.float, np.double]:
      self._testListDiff(x, y, out, idx, dtype=t)
    self._testListDiff(x, y, out, idx)

  def testDuplicates(self):
    x = [1, 2, 4, 3, 2, 3, 3, 1]
    y = [4, 2]
    out = [1, 3, 3, 3, 1]
    idx = [0, 3, 5, 6, 7]
    for t in [np.int32, np.int64, np.float, np.double]:
      self._testListDiff(x, y, out, idx, dtype=t)
    self._testListDiff(x, y, out, idx)

  def testRandom(self):
    num_random_tests = 10
@ -78,38 +77,37 @@ class ListDiffTest(tf.test.TestCase):
      else:
        out = []
        idx = []
      for t in [np.int32, np.int64, np.float, np.double]:
        self._testListDiff(x, y, out, idx, dtype=t)
      self._testListDiff(list(x), list(y), out, idx)

  def testInt32FullyOverlapping(self):
  def testFullyOverlapping(self):
    x = [1, 2, 3, 4]
    y = [1, 2, 3, 4]
    out = []
    idx = []
    self._testListDiff(x, y, out, idx)

  def testInt32NonOverlapping(self):
  def testNonOverlapping(self):
    x = [1, 2, 3, 4]
    y = [5, 6]
    out = x
    idx = np.arange(len(x))
    self._testListDiff(x, y, out, idx)

  def testInt32EmptyX(self):
  def testEmptyX(self):
    x = []
    y = [1, 2]
    out = []
    idx = []
    self._testListDiff(x, y, out, idx)

  def testInt32EmptyY(self):
  def testEmptyY(self):
    x = [1, 2, 3, 4]
    y = []
    out = x
    idx = np.arange(len(x))
    self._testListDiff(x, y, out, idx)

  def testInt32EmptyXY(self):
  def testEmptyXY(self):
    x = []
    y = []
    out = []
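For reference, `tf.listdiff(x, y)` returns the elements of `x` that are not in
`y`, plus their positions in `x`. A small usage sketch with illustrative
values, mirroring the testBasic cases above:

    import tensorflow as tf

    x = tf.constant([1, 2, 3, 4])
    y = tf.constant([1, 3])
    out, idx = tf.listdiff(x, y)
    with tf.Session() as sess:
      print(sess.run([out, idx]))  # [array([2, 4]), array([1, 3])]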
@ -30,7 +30,7 @@ class InverseOpTest(tf.test.TestCase):
    self.assertAllClose(np_ans, out)
    self.assertShapeEqual(y, tf_ans)

  def testBasic(self):
  def testNonsymmetric(self):
    # 2x2 matrices
    matrix1 = np.array([[1., 2.], [3., 4.]])
    matrix2 = np.array([[1., 3.], [3., 5.]])
@ -42,6 +42,18 @@ class InverseOpTest(tf.test.TestCase):
    matrix_batch = np.tile(matrix_batch, [2, 3, 1, 1])
    self._verifyInverse(matrix_batch)

  def testSymmetricPositiveDefinite(self):
    # 2x2 matrices
    matrix1 = np.array([[2., 1.], [1., 2.]])
    matrix2 = np.array([[3., -1.], [-1., 3.]])
    self._verifyInverse(matrix1)
    self._verifyInverse(matrix2)
    # A multidimensional batch of 2x2 matrices
    matrix_batch = np.concatenate([np.expand_dims(matrix1, 0), np.expand_dims(
        matrix2, 0)])
    matrix_batch = np.tile(matrix_batch, [2, 3, 1, 1])
    self._verifyInverse(matrix_batch)

  def testNonSquareMatrix(self):
    # When the inverse of a non-square matrix is attempted we should return
    # an error
@ -58,22 +70,10 @@ class InverseOpTest(tf.test.TestCase):
    # The input should be invertible.
    with self.test_session():
      with self.assertRaisesOpError("Input is not invertible."):
        # All rows of the matrix below add to zero
        # All rows of the matrix below add to zero.
        tensor3 = tf.constant([[1., 0., -1.], [-1., 1., 0.], [0., -1., 1.]])
        tf.matrix_inverse(tensor3).eval()

    with self.test_session():
      with self.assertRaisesOpError("Input is not invertible."):
        # Determinant of the matrix below is zero
        tensor3 = tf.constant([[1., 1.], [1., 1.]])
        tf.matrix_inverse(tensor3).eval()

    with self.test_session():
      with self.assertRaisesOpError("Input is not invertible."):
        # Determinant of the matrix below is zero
        tensor3 = tf.constant([[np.inf, 1.], [1., 1.]])
        tf.matrix_inverse(tensor3).eval()

  def testEmpty(self):
    self._verifyInverse(np.empty([0, 2, 2]))
    self._verifyInverse(np.empty([2, 0, 0]))
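The new symmetric positive-definite cases pair with this commit's CPU
matrix-inverse speedup. A quick sanity check of one of them in plain NumPy
(an editorial illustration, not part of the test file):

    import numpy as np

    matrix1 = np.array([[2., 1.], [1., 2.]])
    # Symmetric positive definite: equal to its transpose, eigenvalues 1 and 3.
    assert np.allclose(matrix1, matrix1.T)
    assert np.all(np.linalg.eigvalsh(matrix1) > 0)
    print(np.linalg.inv(matrix1))  # [[ 0.667, -0.333], [-0.333, 0.667]]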
@ -175,7 +175,8 @@ def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None):
  with ops.op_scope(t_list + [clip_norm], name, "clip_by_global_norm") as name:
    # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm
    scale = clip_norm * math_ops.minimum(
        1.0 / use_norm, constant_op.constant(1.0 / clip_norm))
        1.0 / use_norm,
        constant_op.constant(1.0 / clip_norm, dtype=use_norm.dtype))

    values = [
        ops.convert_to_tensor(
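Numerically, `scale` works out to `clip_norm * min(1 / global_norm,
1 / clip_norm)`, i.e. `min(clip_norm / global_norm, 1.0)`. A worked instance
with illustrative values:

    # clip_norm = 5.0, global norm = 10.0:
    #   scale = 5.0 * min(1.0 / 10.0, 1.0 / 5.0) = 5.0 * 0.1 = 0.5
    # so every tensor in t_list is halved. With global norm = 4.0:
    #   scale = 5.0 * min(1.0 / 4.0, 1.0 / 5.0) = 5.0 * 0.2 = 1.0
    # and the tensors pass through unchanged.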
@ -89,8 +89,8 @@ def bias_add_shape(op):
  return [output_shape]


def _Get2DOutputSize(input_height, input_width, filter_height, filter_width,
                     row_stride, col_stride, padding_type):
def get2d_conv_output_size(input_height, input_width, filter_height,
                           filter_width, row_stride, col_stride, padding_type):
  """Returns the number of rows and columns in a convolution/pooling output."""
  input_height = tensor_shape.as_dimension(input_height)
  input_width = tensor_shape.as_dimension(input_width)
@ -184,7 +184,7 @@ def conv2d_shape(op):
  # in the kernel implementation.
  stride = stride_r
  padding = op.get_attr("padding")
  out_rows, out_cols = _Get2DOutputSize(
  out_rows, out_cols = get2d_conv_output_size(
      in_rows, in_cols, filter_rows, filter_cols, stride, stride, padding)

  return [tensor_shape.TensorShape([batch_size, out_rows, out_cols, depth_out])]
@ -246,7 +246,7 @@ def separable_conv2d_shape(op):
  # in the kernel implementation.
  stride = stride_r
  padding = op.get_attr("padding")
  out_rows, out_cols = _Get2DOutputSize(
  out_rows, out_cols = get2d_conv_output_size(
      in_rows, in_cols, filter_rows, filter_cols, stride, stride, padding)

  return [tensor_shape.TensorShape([batch_size, out_rows, out_cols, depth_out])]
@ -294,7 +294,7 @@ def avg_pool_shape(op):
  # in the kernel implementation.
  padding = op.get_attr("padding")

  out_rows, out_cols = _Get2DOutputSize(
  out_rows, out_cols = get2d_conv_output_size(
      in_rows, in_cols, ksize_r, ksize_c, stride_r, stride_c, padding)

  return [tensor_shape.TensorShape([batch_size, out_rows, out_cols, depth])]
@ -346,7 +346,7 @@ def max_pool_shape(op):
  # in the kernel implementation.
  if ksize_d == 1:
    padding = op.get_attr("padding")
    out_rows, out_cols = _Get2DOutputSize(
    out_rows, out_cols = get2d_conv_output_size(
        in_rows, in_cols, ksize_r, ksize_c, stride_r, stride_c, padding)
    return [tensor_shape.TensorShape([batch_size, out_rows, out_cols, depth])]
  else:
@ -42,7 +42,8 @@ def Print(input_, data, message=None, first_n=None, summarize=None,
    message: A string, prefix of the error message.
    first_n: Only log `first_n` number of times. Negative numbers log always;
             this is the default.
    summarize: Only print this many entries of each tensor.
    summarize: Only print this many entries of each tensor. If None, then a
               maximum of 3 elements are printed per input tensor.
    name: A name for the operation (optional).

  Returns:
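A short usage sketch of the documented default (this assumes the op is exposed
as `tf.Print`, as in contemporary releases; values are illustrative):

    import tensorflow as tf

    x = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0])
    # summarize=None: at most 3 elements of each tensor are logged when the
    # node is evaluated, e.g. "x is: [1 2 3...]".
    x = tf.Print(x, [x], message='x is: ')
    with tf.Session() as sess:
      sess.run(x)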
@ -38,31 +38,54 @@ strided according to the `strides` argument. `strides = [1, 1, 1, 1]` applies
the filter to a patch at every offset, `strides = [1, 2, 2, 1]` applies the
filter to every other image patch in each dimension, etc.

Ignoring channels for the moment, the spatial semantics of the convolution ops
are as follows. If the 4-D `input` has shape
Ignoring channels for the moment, assume that the 4-D `input` has shape
`[batch, in_height, in_width, ...]` and the 4-D `filter` has shape
`[filter_height, filter_width, ...]`, then
`[filter_height, filter_width, ...]`, then the spatial semantics of the
convolution ops are as follows: first, according to the padding scheme chosen
as `'SAME'` or `'VALID'`, the output size and the padding pixels are computed.
For the `'SAME'` padding, the output height and width are computed as:

    shape(output) = [batch,
                     (in_height - filter_height + 1) / strides[1],
                     (in_width - filter_width + 1) / strides[2],
                     ...]
    out_height = ceil(float(in_height) / float(strides[1]))
    out_width = ceil(float(in_width) / float(strides[2]))

and the padding on the top and left are computed as:

    pad_along_height = ((out_height - 1) * strides[1] +
                        filter_height - in_height)
    pad_along_width = ((out_width - 1) * strides[2] +
                       filter_width - in_width)
    pad_top = pad_along_height / 2
    pad_left = pad_along_width / 2

Note that the division by 2 means that there might be cases when the padding on
both sides (top vs bottom, right vs left) is off by one. In this case, the
bottom and right sides always get the one additional padded pixel. For example,
when `pad_along_height` is 5, we pad 2 pixels at the top and 3 pixels at the
bottom. Note that this is different from existing libraries such as cuDNN and
Caffe, which explicitly specify the number of padded pixels and always pad the
same number of pixels on both sides.

For the `'VALID'` padding, the output height and width are computed as:

    out_height = ceil(float(in_height - filter_height + 1) / float(strides[1]))
    out_width = ceil(float(in_width - filter_width + 1) / float(strides[2]))

and the padding values are always zero. The output is then computed as

    output[b, i, j, :] =
        sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, ...] *
                     filter[di, dj, ...]
    output[b, i, j, :] =
        sum_{di, dj} input[b, strides[1] * i + di - pad_top,
                           strides[2] * j + dj - pad_left, ...] *
                     filter[di, dj, ...]

where any value outside the original input image region is considered zero (
i.e. we pad zero values around the border of the image).

Since `input` is 4-D, each `input[b, i, j, :]` is a vector. For `conv2d`, these
vectors are multiplied by the `filter[di, dj, :, :]` matrices to produce new
vectors. For `depthwise_conv_2d`, each scalar component `input[b, i, j, k]`
is multiplied by a vector `filter[di, dj, k]`, and all the vectors are
concatenated.

In the formula for `shape(output)`, the rounding direction depends on padding:

* `padding = 'SAME'`: Round down (only full size windows are considered).
* `padding = 'VALID'`: Round up (partial windows are included).

@@conv2d
@@depthwise_conv2d
@@separable_conv2d
@ -79,14 +102,8 @@ In detail, the output is

    output[i] = reduce(value[strides * i:strides * i + ksize])

for each tuple of indices `i`. The output shape is

    shape(output) = (shape(value) - ksize + 1) / strides

where the rounding direction depends on padding:

* `padding = 'SAME'`: Round down (only full size windows are considered).
* `padding = 'VALID'`: Round up (partial windows are included).

where the indices also take the padding values into consideration. Please refer
to the `Convolution` section for details about the padding calculation.

@@avg_pool
@@max_pool
@ -143,6 +143,8 @@ class EventAccumulator(object):
    self._is_autoupdating = False
    self._activated = False
    self._compression_bps = compression_bps
    self.most_recent_step = -1
    self.most_recent_wall_time = -1

  def Reload(self):
    """Loads all events added since the last call to `Reload`.
@ -156,6 +158,31 @@ class EventAccumulator(object):
    self._activated = True
    with self._generator_mutex:
      for event in self._generator.Load():
        ## Check if the event happened after a crash
        if event.step < self.most_recent_step:

          ## Keep data in reservoirs that have a step less than event.step
          _NotExpired = lambda x: x.step < event.step
          num_expired_scalars = self._scalars.FilterItems(_NotExpired)
          num_expired_histograms = self._histograms.FilterItems(_NotExpired)
          num_expired_compressed_histograms = self._compressed_histograms.FilterItems(
              _NotExpired)
          num_expired_images = self._images.FilterItems(_NotExpired)

          purge_msg = (
              'Detected out of order event.step likely caused by a Tensorflow '
              'restart. Purging expired events from Tensorboard display '
              'between the previous step: {} (timestamp: {}) and current step:'
              ' {} (timestamp: {}). Removing {} scalars, {} histograms, {} '
              'compressed histograms, and {} images.').format(
                  self.most_recent_step, self.most_recent_wall_time, event.step,
                  event.wall_time, num_expired_scalars, num_expired_histograms,
                  num_expired_compressed_histograms, num_expired_images)
          logging.warn(purge_msg)
        else:
          self.most_recent_step = event.step
          self.most_recent_wall_time = event.wall_time
        ## Process the event
        if event.HasField('graph_def'):
          if self._graph is not None:
            logging.warn(('Found more than one graph event per run.'
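The purge rule above in one line: on an out-of-order step, keep only stored
items whose step is strictly below the incoming step. A minimal plain-Python
sketch (a list standing in for the real reservoirs):

    steps = [100, 200, 300]                       # steps already stored
    incoming = 101                                # restart: 101 < 300
    steps = [s for s in steps if s < incoming]    # -> [100]
    steps.append(incoming)                        # -> [100, 101]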
@ -102,8 +102,8 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):

  def testTags(self):
    gen = _EventGenerator()
    gen.AddScalar('sv1')
    gen.AddScalar('s1')
    gen.AddScalar('sv2')
    gen.AddScalar('s2')
    gen.AddHistogram('hst1')
    gen.AddHistogram('hst2')
    gen.AddImage('im1')
@ -113,7 +113,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
    self.assertTagsEqual(
        acc.Tags(), {
            ea.IMAGES: ['im1', 'im2'],
            ea.SCALARS: ['sv1', 'sv2'],
            ea.SCALARS: ['s1', 's2'],
            ea.HISTOGRAMS: ['hst1', 'hst2'],
            ea.COMPRESSED_HISTOGRAMS: ['hst1', 'hst2'],
            ea.GRAPH: False})
@ -123,8 +123,8 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
    acc = ea.EventAccumulator(gen)
    acc.Reload()
    self.assertEqual(acc.Tags(), self.empty)
    gen.AddScalar('sv1')
    gen.AddScalar('s1')
    gen.AddScalar('sv2')
    gen.AddScalar('s2')
    gen.AddHistogram('hst1')
    gen.AddHistogram('hst2')
    gen.AddImage('im1')
@ -133,7 +133,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
    acc.Reload()
    self.assertTagsEqual(acc.Tags(), {
        ea.IMAGES: ['im1', 'im2'],
        ea.SCALARS: ['sv1', 'sv2'],
        ea.SCALARS: ['s1', 's2'],
        ea.HISTOGRAMS: ['hst1', 'hst2'],
        ea.COMPRESSED_HISTOGRAMS: ['hst1', 'hst2'],
        ea.GRAPH: False})
@ -141,13 +141,13 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
  def testScalars(self):
    gen = _EventGenerator()
    acc = ea.EventAccumulator(gen)
    sv1 = ea.ScalarEvent(wall_time=1, step=10, value=32)
    s1 = ea.ScalarEvent(wall_time=1, step=10, value=32)
    sv2 = ea.ScalarEvent(wall_time=2, step=12, value=64)
    s2 = ea.ScalarEvent(wall_time=2, step=12, value=64)
    gen.AddScalar('sv1', wall_time=1, step=10, value=32)
    gen.AddScalar('s1', wall_time=1, step=10, value=32)
    gen.AddScalar('sv2', wall_time=2, step=12, value=64)
    gen.AddScalar('s2', wall_time=2, step=12, value=64)
    acc.Reload()
    self.assertEqual(acc.Scalars('sv1'), [sv1])
    self.assertEqual(acc.Scalars('s1'), [s1])
    self.assertEqual(acc.Scalars('sv2'), [sv2])
    self.assertEqual(acc.Scalars('s2'), [s2])

  def testHistograms(self):
    gen = _EventGenerator()
@ -311,7 +311,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
    with self.assertRaises(RuntimeError):
      acc.Tags()
    with self.assertRaises(RuntimeError):
      acc.Scalars('sv1')
      acc.Scalars('s1')
    acc.Reload()
    self.assertTrue(acc._activated)
    acc._activated = False
@ -321,17 +321,17 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
    acc = ea.EventAccumulator(gen)
    acc.Reload()
    with self.assertRaises(KeyError):
      acc.Scalars('sv1')
      acc.Scalars('s1')
    with self.assertRaises(KeyError):
      acc.Scalars('hst1')
    with self.assertRaises(KeyError):
      acc.Scalars('im1')
    with self.assertRaises(KeyError):
      acc.Histograms('sv1')
      acc.Histograms('s1')
    with self.assertRaises(KeyError):
      acc.Histograms('im1')
    with self.assertRaises(KeyError):
      acc.Images('sv1')
      acc.Images('s1')
    with self.assertRaises(KeyError):
      acc.Images('hst1')

@ -339,21 +339,43 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
    """Tests that non-value events in the generator don't cause early exits."""
    gen = _EventGenerator()
    acc = ea.EventAccumulator(gen)
    gen.AddScalar('sv1', wall_time=1, step=10, value=20)
    gen.AddScalar('s1', wall_time=1, step=10, value=20)
    gen.AddEvent(tf.Event(
        wall_time=2, step=20, file_version='notsv2'))
        wall_time=2, step=20, file_version='nots2'))
    gen.AddScalar('sv3', wall_time=3, step=100, value=1)
    gen.AddScalar('s3', wall_time=3, step=100, value=1)
    gen.AddHistogram('hst1')
    gen.AddImage('im1')

    acc.Reload()
    self.assertTagsEqual(acc.Tags(), {
        ea.IMAGES: ['im1'],
        ea.SCALARS: ['sv1', 'sv3'],
        ea.SCALARS: ['s1', 's3'],
        ea.HISTOGRAMS: ['hst1'],
        ea.COMPRESSED_HISTOGRAMS: ['hst1'],
        ea.GRAPH: False})

  def testExpiredDataDiscardedAfterRestart(self):
    """Tests that events are discarded after a restart is detected.

    If a step value is observed to be lower than what was previously seen,
    this should force a discard of all previous items that are outdated.
    """
    gen = _EventGenerator()
    acc = ea.EventAccumulator(gen)
    gen.AddScalar('s1', wall_time=1, step=100, value=20)
    gen.AddScalar('s1', wall_time=1, step=200, value=20)
    gen.AddScalar('s1', wall_time=1, step=300, value=20)
    acc.Reload()
    ## Check that the stored items are what they should be
    self.assertEqual([x.step for x in acc.Scalars('s1')], [100, 200, 300])

    gen.AddScalar('s1', wall_time=1, step=101, value=20)
    gen.AddScalar('s1', wall_time=1, step=201, value=20)
    gen.AddScalar('s1', wall_time=1, step=301, value=20)
    acc.Reload()
    ## Check that we have discarded 200 and 300
    self.assertEqual([x.step for x in acc.Scalars('s1')], [100, 101, 201, 301])


class RealisticEventAccumulatorTest(EventAccumulatorTest):

@ -128,13 +128,15 @@ class EventMultiplexer(object):
    return self

  def AddRunsFromDirectory(self, path, name=None):
    """Load runs from a directory, assuming each subdirectory is a run.
    """Load runs from a directory; recursively walks subdirectories.

    If path doesn't exist, no-op. This ensures that it is safe to call
    `AddRunsFromDirectory` multiple times, even before the directory is made.

    If the directory contains TensorFlow event files, it is itself treated as a
    run.
    If path is a directory, load event files in the directory (if any exist) and
    recursively call AddRunsFromDirectory on any subdirectories. This means you
    can call AddRunsFromDirectory at the root of a tree of event logs and
    TensorBoard will load them all.

    If the `EventMultiplexer` is already loaded or autoupdating, this will cause
    the newly created accumulators to also `Reload()` or `AutoUpdate()`.
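To illustrate the new recursive behavior on a hypothetical layout (names
invented for the example):

    # logs/
    #   run1/events.out.tfevents.123             -> run 'run1'
    #   experiment/run2/events.out.tfevents.456  -> run 'experiment/run2'
    multiplexer = event_multiplexer.EventMultiplexer()
    multiplexer.AddRunsFromDirectory('logs')
    # Previously only the immediate subdirectories of 'logs' became runs;
    # now every subdirectory containing event files does.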
@ -156,25 +158,16 @@ class EventMultiplexer(object):
    if not gfile.Exists(path):
      return  # Maybe it hasn't been created yet, fail silently to retry later
    if not gfile.IsDirectory(path):
      raise ValueError('Path exists and is not a directory, %s' % path)
    paths = gfile.ListDirectory(path)
    is_directory = lambda x: gfile.IsDirectory(os.path.join(path, x))
    subdirectories = filter(is_directory, paths)
    for s in subdirectories:
      if name:
        subname = '/'.join([name, s])
      else:
        subname = s
      self.AddRun(os.path.join(path, s), subname)

    if list(filter(event_accumulator.IsTensorFlowEventsFile, paths)):
      directory_name = os.path.split(path)[1]
      logging.info('Directory %s has event files; loading', directory_name)
      if name:
        dname = name
      else:
        dname = directory_name
      self.AddRun(path, dname)
      raise ValueError('AddRunsFromDirectory: path exists and is not a '
                       'directory, %s' % path)

    for (subdir, _, files) in os.walk(path):
      if list(filter(event_accumulator.IsTensorFlowEventsFile, files)):
        logging.info('Adding events from directory %s', subdir)
        rpath = os.path.relpath(subdir, path)
        subname = os.path.join(name, rpath) if name else rpath
        self.AddRun(subdir, name=subname)

    return self

  def Reload(self):
@ -3,6 +3,7 @@ from __future__ import division
from __future__ import print_function

import os
import os.path

import tensorflow.python.platform

@ -13,6 +14,20 @@ from tensorflow.python.summary import event_accumulator
from tensorflow.python.summary import event_multiplexer


def _AddEvents(path):
  if not gfile.IsDirectory(path):
    gfile.MakeDirs(path)
  fpath = os.path.join(path, 'hypothetical.tfevents.out')
  with gfile.GFile(fpath, 'w'):
    return fpath


def _CreateCleanDirectory(path):
  if gfile.IsDirectory(path):
    gfile.DeleteRecursively(path)
  gfile.MkDir(path)


class _FakeAccumulator(object):

  def __init__(self, path):
@ -122,34 +137,33 @@ class EventMultiplexerTest(test_util.TensorFlowTestCase):
    x.AddRunsFromDirectory(fakedir)
    self.assertEqual(x.Runs(), {}, 'loading fakedir had no effect')

    if gfile.IsDirectory(realdir):
      gfile.DeleteRecursively(realdir)
    gfile.MkDir(realdir)
    _CreateCleanDirectory(realdir)
    x.AddRunsFromDirectory(realdir)
    self.assertEqual(x.Runs(), {}, 'loading empty directory had no effect')

    path1 = join(realdir, 'path1')
    gfile.MkDir(path1)
    x.AddRunsFromDirectory(realdir)
    self.assertEqual(sorted(x.Runs().keys()), ['path1'], 'loaded run: path1')
    self.assertEqual(x.Runs(), {}, 'creating empty subdirectory had no effect')

    _AddEvents(path1)
    x.AddRunsFromDirectory(realdir)
    self.assertItemsEqual(x.Runs(), ['path1'], 'loaded run: path1')
    loader1 = x._GetAccumulator('path1')
    self.assertEqual(loader1._path, path1, 'has the correct path')

    path2 = join(realdir, 'path2')
    gfile.MkDir(path2)
    _AddEvents(path2)
    x.AddRunsFromDirectory(realdir)
    self.assertItemsEqual(sorted(x.Runs().keys()), ['path1', 'path2'])
    self.assertItemsEqual(x.Runs(), ['path1', 'path2'])
    self.assertEqual(x._GetAccumulator('path1'), loader1,
                     'loader1 not regenerated')
    loader2 = x._GetAccumulator('path2')

    path2_2 = join(path2, 'path2')
    gfile.MkDir(path2_2)
    _AddEvents(path2_2)
    x.AddRunsFromDirectory(path2)
    x.AddRunsFromDirectory(realdir)
    self.assertItemsEqual(sorted(x.Runs().keys()), ['path1', 'path2'])
    self.assertItemsEqual(x.Runs(), ['path1', 'path2', 'path2/path2'])
    self.assertNotEqual(loader2, x._GetAccumulator('path2'),
                        'loader2 regenerated')
    self.assertEqual(x._GetAccumulator('path2')._path, path2_2,
    self.assertEqual(x._GetAccumulator('path2/path2')._path, path2_2,
                     'loader2 path correct')

  def testAddRunsFromDirectoryThatContainsEvents(self):
@ -158,21 +172,18 @@ class EventMultiplexerTest(test_util.TensorFlowTestCase):
|
|||||||
join = os.path.join
|
join = os.path.join
|
||||||
realdir = join(tmpdir, 'event_containing_directory')
|
realdir = join(tmpdir, 'event_containing_directory')
|
||||||
|
|
||||||
if gfile.IsDirectory(realdir):
|
_CreateCleanDirectory(realdir)
|
||||||
gfile.DeleteRecursively(realdir)
|
|
||||||
gfile.MkDir(realdir)
|
|
||||||
|
|
||||||
self.assertEqual(x.Runs(), {})
|
self.assertEqual(x.Runs(), {})
|
||||||
|
|
||||||
with gfile.GFile(join(realdir, 'hypothetical.tfevents.out'), 'w'):
|
_AddEvents(realdir)
|
||||||
pass
|
|
||||||
x.AddRunsFromDirectory(realdir)
|
x.AddRunsFromDirectory(realdir)
|
||||||
self.assertItemsEqual(x.Runs(), ['event_containing_directory'])
|
self.assertItemsEqual(x.Runs(), ['.'])
|
||||||
|
|
||||||
subdir = join(realdir, 'subdir')
|
subdir = join(realdir, 'subdir')
|
||||||
gfile.MkDir(subdir)
|
_AddEvents(subdir)
|
||||||
x.AddRunsFromDirectory(realdir)
|
x.AddRunsFromDirectory(realdir)
|
||||||
self.assertItemsEqual(x.Runs(), ['event_containing_directory', 'subdir'])
|
self.assertItemsEqual(x.Runs(), ['.', 'subdir'])
|
||||||
|
|
||||||
def testAddRunsFromDirectoryWithRunNames(self):
|
def testAddRunsFromDirectoryWithRunNames(self):
|
||||||
x = event_multiplexer.EventMultiplexer()
|
x = event_multiplexer.EventMultiplexer()
|
||||||
@ -180,30 +191,45 @@ class EventMultiplexerTest(test_util.TensorFlowTestCase):
|
|||||||
join = os.path.join
|
join = os.path.join
|
||||||
realdir = join(tmpdir, 'event_containing_directory')
|
realdir = join(tmpdir, 'event_containing_directory')
|
||||||
|
|
||||||
if gfile.IsDirectory(realdir):
|
_CreateCleanDirectory(realdir)
|
||||||
gfile.DeleteRecursively(realdir)
|
|
||||||
gfile.MkDir(realdir)
|
|
||||||
|
|
||||||
self.assertEqual(x.Runs(), {})
|
self.assertEqual(x.Runs(), {})
|
||||||
|
|
||||||
with gfile.GFile(join(realdir, 'hypothetical.tfevents.out'), 'w'):
|
_AddEvents(realdir)
|
||||||
pass
|
|
||||||
x.AddRunsFromDirectory(realdir, 'foo')
|
x.AddRunsFromDirectory(realdir, 'foo')
|
||||||
self.assertItemsEqual(x.Runs(), ['foo'])
|
self.assertItemsEqual(x.Runs(), ['foo/.'])
|
||||||
|
|
||||||
subdir = join(realdir, 'subdir')
|
subdir = join(realdir, 'subdir')
|
||||||
gfile.MkDir(subdir)
|
_AddEvents(subdir)
|
||||||
x.AddRunsFromDirectory(realdir, 'foo')
|
x.AddRunsFromDirectory(realdir, 'foo')
|
||||||
self.assertItemsEqual(x.Runs(), ['foo', 'foo/subdir'])
|
self.assertItemsEqual(x.Runs(), ['foo/.', 'foo/subdir'])
|
||||||
|
|
||||||
|
def testAddRunsFromDirectoryWalksTree(self):
|
||||||
|
x = event_multiplexer.EventMultiplexer()
|
||||||
|
tmpdir = self.get_temp_dir()
|
||||||
|
join = os.path.join
|
||||||
|
realdir = join(tmpdir, 'event_containing_directory')
|
||||||
|
|
||||||
|
_CreateCleanDirectory(realdir)
|
||||||
|
_AddEvents(realdir)
|
||||||
|
sub = join(realdir, 'subdirectory')
|
||||||
|
sub1 = join(sub, '1')
|
||||||
|
sub2 = join(sub, '2')
|
||||||
|
sub1_1 = join(sub1, '1')
|
||||||
|
_AddEvents(sub1)
|
||||||
|
_AddEvents(sub2)
|
||||||
|
_AddEvents(sub1_1)
|
||||||
|
x.AddRunsFromDirectory(realdir)
|
||||||
|
|
||||||
|
self.assertItemsEqual(x.Runs(), ['.',
|
||||||
|
'subdirectory/1', 'subdirectory/2',
|
||||||
|
'subdirectory/1/1'])
|
||||||
|
|
||||||
def testAddRunsFromDirectoryThrowsException(self):
|
def testAddRunsFromDirectoryThrowsException(self):
|
||||||
x = event_multiplexer.EventMultiplexer()
|
x = event_multiplexer.EventMultiplexer()
|
||||||
tmpdir = self.get_temp_dir()
|
tmpdir = self.get_temp_dir()
|
||||||
|
|
||||||
filepath = os.path.join(tmpdir, 'bad_file')
|
filepath = _AddEvents(tmpdir)
|
||||||
with gfile.GFile(filepath, 'w'):
|
|
||||||
pass
|
|
||||||
|
|
||||||
with self.assertRaises(ValueError):
|
with self.assertRaises(ValueError):
|
||||||
x.AddRunsFromDirectory(filepath)
|
x.AddRunsFromDirectory(filepath)
|
||||||
|
|
||||||
|
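A detail worth noting in these tests: `_AddEvents` names its file 'hypothetical.tfevents.out' because run discovery keys off filenames matching the events pattern (spelled .*tfevents.* in the logdir flag text later in this commit). A hedged sketch of such a predicate, noting that the real `event_accumulator.IsTensorFlowEventsFile` may differ in detail:

    import os

    def is_tensorflow_events_file(path):
      # Sketch: accept any basename containing 'tfevents', e.g.
      # 'hypothetical.tfevents.out' or 'events.out.tfevents.1444088766'.
      return 'tfevents' in os.path.basename(path)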
@@ -77,7 +77,7 @@ class Reservoir(object):
       key: The key for which we are finding associated items.

     Raises:
-      KeyError: If the key is not ofund in the reservoir.
+      KeyError: If the key is not found in the reservoir.

     Returns:
       [list, of, items] associated with that key.
@@ -102,6 +102,19 @@ class Reservoir(object):
       bucket = self._buckets[key]
       bucket.AddItem(item)

+  def FilterItems(self, filterFn):
+    """Filter items within a Reservoir, using a filtering function.
+
+    Args:
+      filterFn: A function that returns True for the items to be kept.
+
+    Returns:
+      The number of items removed.
+    """
+    with self._mutex:
+      return sum(bucket.FilterItems(filterFn)
+                 for bucket in self._buckets.values())
+
+
 class _ReservoirBucket(object):
   """A container for items from a stream, that implements reservoir sampling.
@@ -128,7 +141,7 @@ class _ReservoirBucket(object):
     # AddItem are thread-safe
     self._mutex = threading.Lock()
     self._max_size = _max_size
-    self._count = 0
+    self._num_items_seen = 0
     if _random is not None:
       self._random = _random
     else:
@@ -139,13 +152,13 @@ class _ReservoirBucket(object):

     The new item is guaranteed to be added to the bucket, and to be the last
     element in the bucket. If the bucket has reached capacity, then an old item
-    will be replaced. With probability (_max_size/_count) a random item in the
-    bucket will be popped out and the new item will be appended to the end. With
-    probability (1 - _max_size/_count) the last item in the bucket will be
-    replaced.
+    will be replaced. With probability (_max_size/_num_items_seen) a random item
+    in the bucket will be popped out and the new item will be appended
+    to the end. With probability (1 - _max_size/_num_items_seen)
+    the last item in the bucket will be replaced.

-    Since the O(n) replacements occur with O(1/_count) liklihood, the amortized
-    runtime is O(1).
+    Since the O(n) replacements occur with O(1/_num_items_seen) likelihood,
+    the amortized runtime is O(1).

     Args:
       item: The item to add to the bucket.
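The replacement rule in this docstring is reservoir sampling with one twist: the newest item is always retained as the last element. A self-contained sketch of exactly that rule, outside the TensorFlow class (names here are illustrative only):

    import random

    def reservoir_sample(stream, max_size, rng=None):
      # Keep an approximately uniform sample of max_size items from a stream,
      # always holding the most recent item in the last slot.
      rng = rng or random.Random(0)
      items, num_items_seen = [], 0
      for item in stream:
        if len(items) < max_size or max_size == 0:
          items.append(item)
        else:
          r = rng.randint(0, num_items_seen)  # inclusive upper bound
          if r < max_size:
            items.pop(r)        # evict a random survivor...
            items.append(item)  # ...and append the new item last
          else:
            items[-1] = item    # otherwise replace the previous newest
        num_items_seen += 1
      return items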
@@ -154,13 +167,43 @@ class _ReservoirBucket(object):
       if len(self.items) < self._max_size or self._max_size == 0:
         self.items.append(item)
       else:
-        r = self._random.randint(0, self._count)
+        r = self._random.randint(0, self._num_items_seen)
         if r < self._max_size:
           self.items.pop(r)
           self.items.append(item)
         else:
           self.items[-1] = item
-      self._count += 1
+      self._num_items_seen += 1
+
+  def FilterItems(self, filterFn):
+    """Filter items in a ReservoirBucket, using a filtering function.
+
+    Filtering items from the reservoir bucket must update the
+    internal state variable self._num_items_seen, which is used for determining
+    the rate of replacement in reservoir sampling. Ideally, self._num_items_seen
+    would contain the exact number of items that have ever been seen by the
+    ReservoirBucket and satisfy filterFn. However, the ReservoirBucket does not
+    have access to all items seen -- it only has access to the subset of items
+    that have survived sampling (self.items). Therefore, we estimate
+    self._num_items_seen by scaling it by the same ratio as the ratio of items
+    not removed from self.items.
+
+    Args:
+      filterFn: A function that returns True for items to be kept.
+
+    Returns:
+      The number of items removed from the bucket.
+    """
+    with self._mutex:
+      size_before = len(self.items)
+      self.items = filter(filterFn, self.items)
+      size_diff = size_before - len(self.items)
+
+      # Estimate a correction to the number of items seen
+      prop_remaining = len(self.items) / float(
+          size_before) if size_before > 0 else 0
+      self._num_items_seen = int(round(self._num_items_seen * prop_remaining))
+      return size_diff

   def Items(self):
     """Get all the items in the bucket."""
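To make the estimate concrete with hypothetical numbers: suppose a bucket has seen 10000 items, currently holds 100, and a filter removes 93 of the held items. Then the surviving proportion is 7/100, and _num_items_seen is scaled down to round(10000 * 0.07) = 700 — the same relationship the test below checks with int(round(10000 * (1 - num_removed / 100))). As a plain arithmetic sketch:

    # Hypothetical figures, for illustration only
    size_before, num_items_seen = 100, 10000
    num_removed = 93
    prop_remaining = (size_before - num_removed) / float(size_before)  # 0.07
    num_items_seen = int(round(num_items_seen * prop_remaining))       # 700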
@@ -90,12 +90,14 @@ class ReservoirBucketTest(googletest.TestCase):
     for i in xrange(100):
       b.AddItem(i)
     self.assertEqual(b.Items(), list(xrange(100)))
+    self.assertEqual(b._num_items_seen, 100)

   def testDoesntOverfill(self):
     b = reservoir._ReservoirBucket(10)
     for i in xrange(1000):
       b.AddItem(i)
     self.assertEqual(len(b.Items()), 10)
+    self.assertEqual(b._num_items_seen, 1000)

   def testMaintainsOrder(self):
     b = reservoir._ReservoirBucket(100)
@@ -119,12 +121,14 @@ class ReservoirBucketTest(googletest.TestCase):
     for i in xrange(20):
       b.AddItem(i)
       self.assertEqual(b.Items(), [i])
+    self.assertEqual(b._num_items_seen, 20)

   def testSizeZeroBucket(self):
     b = reservoir._ReservoirBucket(0)
     for i in xrange(20):
       b.AddItem(i)
       self.assertEqual(b.Items(), list(range(i + 1)))
+    self.assertEqual(b._num_items_seen, 20)

   def testSizeRequirement(self):
     with self.assertRaises(ValueError):
@@ -132,6 +136,29 @@ class ReservoirBucketTest(googletest.TestCase):
     with self.assertRaises(ValueError):
       reservoir._ReservoirBucket(10.3)

+  def testRemovesItems(self):
+    b = reservoir._ReservoirBucket(100)
+    for i in xrange(10):
+      b.AddItem(i)
+    self.assertEqual(len(b.Items()), 10)
+    self.assertEqual(b._num_items_seen, 10)
+    self.assertEqual(b.FilterItems(lambda x: x <= 7), 2)
+    self.assertEqual(len(b.Items()), 8)
+    self.assertEqual(b._num_items_seen, 8)
+
+  def testRemovesItemsWhenItemsAreReplaced(self):
+    b = reservoir._ReservoirBucket(100)
+    for i in xrange(10000):
+      b.AddItem(i)
+    self.assertEqual(b._num_items_seen, 10000)
+
+    # Remove items
+    num_removed = b.FilterItems(lambda x: x <= 7)
+    self.assertGreater(num_removed, 92)
+    self.assertEqual([], [item for item in b.Items() if item > 7])
+    self.assertEqual(b._num_items_seen,
+                     int(round(10000 * (1 - float(num_removed) / 100))))
+
+
 class ReservoirBucketStatisticalDistributionTest(googletest.TestCase):
@@ -445,7 +445,8 @@ def shuffle_batch(tensor_list, batch_size, capacity, min_after_dequeue,
       capacity=capacity, min_after_dequeue=min_after_dequeue, seed=seed,
       dtypes=dtypes, shapes=shapes)
   _enqueue(queue, tensor_list, num_threads, enqueue_many)
-  full = (math_ops.cast(queue.size() - min_after_dequeue, types.float32) *
+  full = (math_ops.cast(math_ops.maximum(0, queue.size() - min_after_dequeue),
+                        types.float32) *
           (1. / (capacity - min_after_dequeue)))
   # Note that name contains a '/' at the end so we intentionally do not place
   # a '/' after %s below.
@@ -513,7 +514,8 @@ def shuffle_batch_join(tensor_list_list, batch_size, capacity,
       capacity=capacity, min_after_dequeue=min_after_dequeue, seed=seed,
       dtypes=dtypes, shapes=shapes)
   _enqueue_join(queue, tensor_list_list, enqueue_many)
-  full = (math_ops.cast(queue.size() - min_after_dequeue, types.float32) *
+  full = (math_ops.cast(math_ops.maximum(0, queue.size() - min_after_dequeue),
+                        types.float32) *
          (1. / (capacity - min_after_dequeue)))
   # Note that name contains a '/' at the end so we intentionally do not place
   # a '/' after %s below.
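Both hunks make the same fix to the fraction-of-queue-full summary: until the queue has filled past min_after_dequeue, queue.size() - min_after_dequeue is negative, so the reported fraction dipped below zero. Clamping the numerator at zero keeps the value in [0, 1]. In plain Python the corrected expression behaves like this (a sketch, not the actual TensorFlow op graph):

    def fraction_full(size, capacity, min_after_dequeue):
      # Corrected fraction-of-queue-full: clamp the numerator at zero.
      return max(0, size - min_after_dequeue) / float(capacity - min_after_dequeue)

    assert fraction_full(10, 100, 40) == 0.0   # previously reported as -0.5
    assert fraction_full(70, 100, 40) == 0.5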
@@ -4,7 +4,7 @@
 module TF {
   type TFDatum = [number, number, number];
   type tooltipMap = {[run: string]: string};
-  type TooltipUpdater = (tooltipMap, xValue, closestRun) => void;
+  export type TooltipUpdater = (tooltipMap, xValue, closestRun) => void;

   let Y_TOOLTIP_FORMATTER_PRECISION = 4;
   let STEP_AXIS_FORMATTER_PRECISION = 4;
@@ -39,13 +39,13 @@ export class SlimGraph {
   }
 }

-interface NormalizedInput {
+export interface NormalizedInput {
   name: string;
   hasNumberPart: boolean;
   isControlDependency: boolean;
 }

-interface BuildParams {
+export interface BuildParams {
   enableEmbedding: boolean;
   inEmbeddingTypes: string[];
   outEmbeddingTypes: string[];
@@ -352,7 +352,7 @@ export function joinStatsInfoWithGraph(graph: SlimGraph,
 /**
  * Execution stats for the node.
  */
-class NodeStats {
+export class NodeStats {
   constructor(totalBytes: number, totalMicros: number, outputSize: number[][]) {
     this.totalBytes = totalBytes;
     this.totalMicros = totalMicros;
@@ -11,7 +11,7 @@ const LOG_PREFIX_MSG = "Graph hierarchy: ";
 /**
  * Class used as output for getPredecessors and getSuccessors methods
  */
-interface Edges {
+export interface Edges {
   control: string[];
   regular: string[];
 }
@@ -370,7 +370,7 @@ function findEdgeTargetsInGraph(
   });
 }

-interface HierarchyParams {
+export interface HierarchyParams {
   verifyTemplate: boolean;
   seriesNodeMinSize: number;
 }
@@ -640,7 +640,7 @@ function detectSeries(clusters: {[clusterId: string]: string[]},
   * which is an array that contains objects with name, id, prefix, suffix,
   * and parent properties.
   */
-  let candidatesDict = {};
+  let candidatesDict: {[seriesName: string]: SeriesNode[]} = {};

   // Group all nodes that have the same name, with the exception of a
   // number at the end of the name after an underscore, which is allowed to
@@ -65,7 +65,7 @@ export let SeriesNodeColors = {
 /**
  * Parameters that affect how the graph is rendered on the screen.
  */
-interface RenderGraphParams {
+export interface RenderGraphParams {
   /**
    * Whether to extract high degree nodes from the core part of the graph.
    */
@@ -28,7 +28,7 @@ export function detect(h, verifyTemplate): {[templateId: string]: string[]} {
   // Sort the templates by minimum level in the graph at which they appear,
   // as this leads to optimal setting of the colors of each template for
   // maximum differentiation.
-  return _(templates).pairs()
+  return <{[templateId: string]: string[]}> _(templates).pairs()
     .sortBy(function(pair) {
       return pair[1].level;
     })
@@ -101,6 +101,7 @@ function clusterSimilarSubgraphs(h: hierarchy.Hierarchy) {
 function groupTemplateAndAssignId(nnGroups, verifyTemplate) {
   // For each metanode, compare its subgraph (starting from shallower groups)
   // and assign template id.
+  let result: {[templateId: string]: {level: number, nodes: string[]}} = {};
   return _.reduce(nnGroups, function(templates, nnGroupPair) {
     let signature = nnGroupPair[0],
         nnGroup = nnGroupPair[1].nodes,
@@ -137,7 +138,7 @@ function groupTemplateAndAssignId(nnGroups, verifyTemplate) {
       };
     });
     return templates;
-  }, {});
+  }, result);
 }

 function sortNodes(names: string[], graph: graphlib.Graph<Metanode|OpNode, Metaedge>,
@@ -1,6 +1,7 @@
 <script src="../../bower_components/d3/d3.js"></script>
 <script src="../../bower_components/lodash/lodash.js"></script>
 <script src="../../bower_components/graphlib/dist/graphlib.core.js"></script>
+<script src="../../bower_components/dagre/dist/dagre.core.js"></script>

 <script src="lib/common.js"></script>
 <script src="lib/graph.js"></script>
@@ -1,8 +1,7 @@
 <link rel="import" href="../../bower_components/polymer/polymer.html">
 <link rel="import" href="tf-graph-style.html">
 <link rel="import" href="tf-graph-minimap.html">
-<script src="../tf-graph-common/lib/layout.js"></script>
-<script src="../../bower_components/dagre/dist/dagre.core.js"></script>
 <!--
 A module that takes a render hierarchy as input and produces an SVG DOM using
 dagre and d3.
@@ -23,27 +23,25 @@ from tensorflow.python.summary import event_accumulator
 from tensorflow.python.summary import event_multiplexer
 from tensorflow.tensorboard import tensorboard_handler

-flags.DEFINE_string('logdir', None, """
-logdir specifies where TensorBoard will look to find TensorFlow event files
-that it can display. In the simplest case, logdir is a directory containing
-tfevents files. TensorBoard also supports comparing multiple TensorFlow
-executions: to do this, you can use directory whose subdirectories contain
-tfevents files, as in the following example:
-
-foo/bar/logdir/
-foo/bar/logdir/mnist_1/events.out.tfevents.1444088766
-foo/bar/logdir/mnist_2/events.out.tfevents.1444090064
-
-You may also pass a comma seperated list of log directories, and you can
-assign names to individual log directories by putting a colon between the name
-and the path, as in
+flags.DEFINE_string('logdir', None, """logdir specifies the directory where
+TensorBoard will look to find TensorFlow event files that it can display.
+TensorBoard will recursively walk the directory structure rooted at logdir,
+looking for .*tfevents.* files.
+
+You may also pass a comma separated list of log directories, and TensorBoard
+will watch each directory. You can also assign names to individual log
+directories by putting a colon between the name and the path, as in

 tensorboard --logdir=name1:/path/to/logs/1,name2:/path/to/logs/2
 """)

 flags.DEFINE_boolean('debug', False, 'Whether to run the app in debug mode. '
                      'This increases log verbosity to DEBUG.')


 flags.DEFINE_string('host', '127.0.0.1', 'What host to listen to. Defaults to '
                     'serving on localhost, set to 0.0.0.0 for remote access.')

 flags.DEFINE_integer('port', 6006, 'What port to serve TensorBoard on.')

 FLAGS = flags.FLAGS
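Given the recursive walk described in the new flag text, nested runs no longer need to be listed individually. Hypothetical invocations, for illustration only:

    tensorboard --logdir=/tmp/mnist_logs
    # every subdirectory of /tmp/mnist_logs containing a .*tfevents.* file
    # becomes a run, named by its path relative to /tmp/mnist_logs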
@@ -10,8 +10,10 @@ from setuptools.dist import Distribution
 _VERSION = '0.5.0'

 REQUIRED_PACKAGES = [
-    'numpy >= 1.9.2',
+    'numpy >= 1.8.2',
     'six >= 1.10.0',
+    'protobuf == 3.0.0a3',
+    'wheel',
 ]

 # pylint: disable=line-too-long