Merge branch 'master' into ENH/better_leaky_relu
commit 9d517206ed
@@ -890,21 +890,26 @@ void ProcessFunctionLibraryRuntime::RunMultiDevice(
    VLOG(1) << "Running component function on device " << target
            << " with handle " << handle;
    VLOG(4) << " with " << opts_copy.DebugString();
    flr->Run(
        opts_copy, handle, comp_args, comp_rets,
        [comp_rets, rets, comp_data, refcounted_done](const Status& status) {
          if (!status.ok()) {
            VLOG(2) << "Component function execution failed: " << status;
            refcounted_done->UpdateStatus(status);
          } else {
            for (int i = 0; i < comp_rets->size(); ++i) {
              (*rets)[comp_data.ret_indices_[i]] = (*comp_rets)[i];
            }
          }
          delete comp_rets;
          // refcounted_done is thread-safe
          refcounted_done->Unref();
        });

    flr->Run(opts_copy, handle, comp_args, comp_rets,
             [comp_rets, rets, comp_data, refcounted_done,
              data](const Status& status) {
               if (!status.ok()) {
                 VLOG(2) << "Component function execution failed: " << status;
                 const string function_and_msg = strings::StrCat(
                     errors::FormatFunctionForError(data->function_name_),
                     " ", status.error_message());
                 refcounted_done->UpdateStatus(
                     Status(status.code(), function_and_msg));
               } else {
                 for (int i = 0; i < comp_rets->size(); ++i) {
                   (*rets)[comp_data.ret_indices_[i]] = (*comp_rets)[i];
                 }
               }
               delete comp_rets;
               // refcounted_done is thread-safe
               refcounted_done->Unref();
             });
  } else {
    opts_copy.remote_execution = true;

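Review note: the second `flr->Run` call above is the replacement; on failure it now prefixes the component function's name to the error before propagating it. A minimal, self-contained sketch of that pattern (the `Status` struct and the helper below are stand-ins, not TensorFlow APIs; the `{{function_node ...}}` tag format is an assumption based on the `function_node` tag type parsed in the Python hunk later in this diff):

```cpp
#include <string>

// Stand-in for TensorFlow's Status; code 0 means OK.
struct Status {
  int code = 0;
  std::string message;
  bool ok() const { return code == 0; }
};

// Mirrors what the new callback builds with errors::FormatFunctionForError
// and strings::StrCat: a status whose message names the failed function.
Status AnnotateWithFunctionName(const Status& status,
                                const std::string& function_name) {
  if (status.ok()) return status;
  return {status.code,
          "{{function_node " + function_name + "}} " + status.message};
}
```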
@@ -1264,7 +1264,9 @@ tf_kernel_library(
tf_kernel_library(
    name = "unique_op",
    prefix = "unique_op",
    deps = ARRAY_DEPS,
    deps = ARRAY_DEPS + [
        "@com_google_absl//absl/container:flat_hash_map",
    ],
)

tf_kernel_library(

@@ -37,6 +37,21 @@ limitations under the License.

namespace tensorflow {

namespace {
template <typename T>
struct RawType {
  using type = T;
};

template <>
struct RawType<qint8> {
  // spacetodepth_op_gpu.cu.cc does not instantiate SpaceToDepthOpFunctor for
  // int8, so we map qint8 to uint8. Instantiating int8 could slow down
  // compilation and the code generated is almost the same as for uint8.
  using type = uint8;
};
}  // namespace

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

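Review note: `RawType` is a small type-mapping trait with an identity default and one redirecting specialization. The same pattern in a self-contained form (the `qint8` struct here is a stand-in for TensorFlow's type):

```cpp
#include <cstdint>
#include <type_traits>

struct qint8 { int8_t value; };  // stand-in for tensorflow::qint8

// Identity mapping by default...
template <typename T>
struct RawType { using type = T; };

// ...with one specialization that redirects qint8 to the type the GPU
// kernels actually instantiate (uint8, per the comment in the diff).
template <>
struct RawType<qint8> { using type = uint8_t; };

static_assert(std::is_same<RawType<float>::type, float>::value, "identity");
static_assert(std::is_same<RawType<qint8>::type, uint8_t>::value, "mapped");
```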
@@ -66,17 +81,17 @@ class SpaceToDepthOp : public OpKernel {
    const Tensor& input = context->input(0);
    const int dims = input.dims();

    // Assuming qint8 <--> NCHW_VECT_C, OIHW_VECT_I (int8x4) here.
    constexpr bool is_int8x4 = std::is_same<T, qint8>::value;
    OP_REQUIRES(context, (is_int8x4 == (data_format_ == FORMAT_NCHW_VECT_C)),
                errors::InvalidArgument(
                    "qint8 should be used with data_format NCHW_VECT_C."));

    constexpr int kVect = is_int8x4 ? 4 : 1;
    constexpr int kDims = is_int8x4 ? 5 : 4;
    OP_REQUIRES(context, kDims == dims,
                errors::InvalidArgument("Input rank should be: ", kDims,
                                        " instead of: ", dims));
    const bool is_int8x4 = (data_format_ == FORMAT_NCHW_VECT_C);
    const int vect = is_int8x4 ? 4 : 1;
    if (is_int8x4) {
      OP_REQUIRES(
          context, dims == 5,
          errors::InvalidArgument("Input rank should be 5 instead of ", dims));
    } else {
      OP_REQUIRES(
          context, dims == 4,
          errors::InvalidArgument("Input rank should be 4 instead of ", dims));
    }

    constexpr int kNumSpatialDims = 2;
    const int batch_size =
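Review note: the rank-5 requirement in the `is_int8x4` branch comes from the NCHW_VECT_C layout, which packs four int8 values into a trailing dimension. A toy illustration of the shape bookkeeping (the concrete numbers are made-up examples):

```cpp
#include <array>
#include <cstdio>

int main() {
  // An NCHW tensor of depth 8 stored as NCHW_VECT_C becomes
  // N x (C/4) x H x W x 4, hence rank 5 instead of rank 4.
  const std::array<int, 4> nchw = {2, 8, 6, 6};
  const std::array<int, 5> nchw_vect_c = {nchw[0], nchw[1] / 4, nchw[2],
                                          nchw[3], 4};
  std::printf("rank %zu -> rank %zu\n", nchw.size(), nchw_vect_c.size());
  return 0;
}
```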
@@ -87,7 +102,7 @@ class SpaceToDepthOp : public OpKernel {
        input.dim_size(GetTensorDimIndex<kNumSpatialDims>(data_format_, 'W'));
    const int input_depth =
        input.dim_size(GetTensorDimIndex<kNumSpatialDims>(data_format_, 'C')) *
        kVect;
        vect;

    // Both width and height must be divisible by block_size.
    OP_REQUIRES(context,
@@ -111,32 +126,32 @@ class SpaceToDepthOp : public OpKernel {
                                output_width, output_depth),
                    &outputs_tensor));

    auto Tinput = input.tensor<T, kDims>();
    auto Toutput = outputs_tensor->tensor<T, kDims>();

    if (std::is_same<Device, GPUDevice>::value) {
      if (is_int8x4) {
        using RT = typename RawType<T>::type;
      if (data_format_ == FORMAT_NCHW_VECT_C) {
        // NCHW_VECT_C with 4 x qint8 can be treated as NCHW int32.
        auto Tinput_v = input.template reinterpret_last_dimension<int32, 4>();
        auto Toutput_v = outputs_tensor->reinterpret_last_dimension<int32, 4>();
        functor::SpaceToDepthOpFunctor<GPUDevice, int32, FORMAT_NCHW> functor;
        functor(context->eigen_device<GPUDevice>(), Tinput_v, block_size_,
                Toutput_v);
        return;
      } else if (data_format_ == FORMAT_NCHW) {
        functor::SpaceToDepthOpFunctor<GPUDevice, T, FORMAT_NCHW> functor;
        functor(context->eigen_device<GPUDevice>(), Tinput, block_size_,
                Toutput);
        return;
        CHECK((std::is_same<T, RT>::value));
        functor::SpaceToDepthOpFunctor<GPUDevice, RT, FORMAT_NCHW> functor;
        functor(context->eigen_device<GPUDevice>(), input.tensor<RT, 4>(),
                block_size_, outputs_tensor->tensor<RT, 4>());
      } else {
        CHECK((std::is_same<T, RT>::value));
        functor::SpaceToDepthOpFunctor<GPUDevice, RT, FORMAT_NHWC> functor;
        functor(context->eigen_device<GPUDevice>(), input.tensor<RT, 4>(),
                block_size_, outputs_tensor->tensor<RT, 4>());
      }
    }

    // NOTE: Assumes data_format_ == FORMAT_NHWC here, since we have rejected
    // (CPU && data_format_ != FORMAT_NHWC) in the constructor.

    if (!is_int8x4) {
    } else {
      // NOTE: Assumes data_format_ == FORMAT_NHWC here, since we have rejected
      // (CPU && data_format_ != FORMAT_NHWC) in the constructor.
      functor::SpaceToDepthOpFunctor<Device, T, FORMAT_NHWC> functor;
      functor(context->eigen_device<Device>(), Tinput, block_size_, Toutput);
      functor(context->eigen_device<Device>(), input.tensor<T, 4>(),
              block_size_, outputs_tensor->tensor<T, 4>());
    }
  };

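Review note on "NCHW_VECT_C with 4 x qint8 can be treated as NCHW int32": four packed 8-bit lanes can be moved as one 32-bit word, which is why the kernel reinterprets the last dimension instead of instantiating a qint8 functor. A scalar sketch of that reinterpretation (endianness and alignment caveats apply; TensorFlow's `reinterpret_last_dimension` does the real work):

```cpp
#include <cstdint>
#include <cstring>

// Pack four int8 lanes into one int32 word, bit for bit.
int32_t PackFourInt8(const int8_t lanes[4]) {
  int32_t word;
  static_assert(sizeof(word) == 4 * sizeof(int8_t), "4 lanes per word");
  std::memcpy(&word, lanes, sizeof(word));  // well-defined type pun
  return word;
}
```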
@@ -181,6 +196,7 @@ struct SpaceToDepthOpFunctor<CPUDevice, T, FORMAT_NHWC> {
                          SpaceToDepthOp<CPUDevice, type>);

TF_CALL_ALL_TYPES(REGISTER);
TF_CALL_qint8(REGISTER);
#undef REGISTER

#if GOOGLE_CUDA
@@ -14,9 +14,9 @@ limitations under the License.
==============================================================================*/

#include <functional>
#include <unordered_map>
#include <utility>

#include "absl/container/flat_hash_map.h"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
@@ -106,7 +106,7 @@ class UniqueOp : public OpKernel {
    auto Tin = input.flat<T>();
    const int64 N = static_cast<int64>(Tin.size());

    std::unordered_map<T, TIndex> uniq;
    absl::flat_hash_map<T, TIndex> uniq;
    uniq.reserve(2 * N);
    for (Eigen::Index i = 0, j = 0; i < N; ++i) {
      auto it = uniq.insert(std::make_pair(Tin(i), j));
@@ -153,7 +153,8 @@ class UniqueOp : public OpKernel {
      return true;
    };

    std::unordered_map<int64, int64, decltype(hash_fn), decltype(equal_to_fn)>
    absl::flat_hash_map<int64, int64, decltype(hash_fn),
                        decltype(equal_to_fn)>
        uniq(0, hash_fn, equal_to_fn);

    uniq.reserve(2 * Tin.dimension(1));

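Review note: both hunks swap `std::unordered_map` for `absl::flat_hash_map`, which is API-compatible for this usage (`insert` returning an `<iterator, bool>` pair) but stores entries in a flat array for better cache behavior. A minimal sketch of the id-assignment loop at the core of `UniqueOp`, assuming the Abseil dependency added in the BUILD hunk above:

```cpp
#include <cstdint>

#include "absl/container/flat_hash_map.h"

// Assigns a dense id to each distinct value; writes one id per input.
int64_t AssignUniqueIds(const int64_t* vals, int64_t n, int64_t* out_idx) {
  absl::flat_hash_map<int64_t, int64_t> uniq;
  uniq.reserve(2 * n);  // same sizing heuristic as the diff
  int64_t next_id = 0;
  for (int64_t i = 0; i < n; ++i) {
    const auto result = uniq.insert({vals[i], next_id});
    if (result.second) ++next_id;  // first time we see this value
    out_idx[i] = result.first->second;
  }
  return next_id;  // number of unique values
}
```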
@@ -206,20 +206,49 @@ void CompareRoundingResults(int flat_size, const int depth_multiplier,
}
#endif

void TryTestOneDepthwiseConv3x3Filter() {
bool GenerateValidShapeConfigurations(
    int filter_width, int filter_height, int depth_multiplier,
    int dilation_width_factor, int dilation_height_factor,
    RuntimeShape* input_shape_inference, RuntimeShape* filter_shape_inference,
    RuntimeShape* output_shape_inference, int* pad_width, int* pad_height,
    int* stride) {
  const int batch = UniformRandomInt(1, 3);
  const int input_depth = 8 * ExponentialRandomPositiveInt(0.9f, 10, 50);
  int input_width = UniformRandomInt(5, 50);
  int input_height = UniformRandomInt(5, 50);
  const int input_width = UniformRandomInt(5, 50);
  const int input_height = UniformRandomInt(5, 50);
  *stride = UniformRandomInt(1, 2);
  const bool test_pad = UniformRandomInt(0, 1);
  const auto padding_type = test_pad ? PaddingType::kValid : PaddingType::kSame;

  const int output_depth = input_depth * depth_multiplier;

  input_shape_inference->BuildFrom(
      {batch, input_height, input_width, input_depth});

  filter_shape_inference->BuildFrom(
      {1, filter_height, filter_width, output_depth});

  EXPECT_TRUE(ComputeConvSizes(
      *input_shape_inference, output_depth, filter_width, filter_height,
      *stride, dilation_width_factor, dilation_height_factor, padding_type,
      output_shape_inference, pad_width, pad_height));

  // We just care about whether the shape is suitable so we use non-per-channel
  // case.
  return optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported<
      optimized_ops::depthwise_conv::QuantizationType::kNonPerChannelUint8>(
      *input_shape_inference, *filter_shape_inference, *stride, *stride,
      dilation_width_factor, dilation_height_factor, *pad_width, *pad_height,
      depth_multiplier, *output_shape_inference, 0);
}

void TryTestOneDepthwiseConv3x3Filter() {
  const int filter_width = 3;
  const int filter_height = 3;
  const int depth_multiplier = 1;
  const int stride = UniformRandomInt(1, 2);
  // We don't support dilations in the 3x3 filter.
  const int dilation_width_factor = 1;
  const int dilation_height_factor = 1;
  // Currently only support valid for per-channel fast kernel.
  const auto padding_type = PaddingType::kValid;

  const int output_activation_min = -128;
  const int output_activation_max = 127;
@@ -227,19 +256,25 @@ void TryTestOneDepthwiseConv3x3Filter() {
  const std::int32_t input_offset = UniformRandomInt(-25, 25);
  const std::int32_t output_offset = UniformRandomInt(-25, 25);

  const int output_depth = input_depth * depth_multiplier;

  RuntimeShape input_shape_inference(
      {batch, input_height, input_width, input_depth});
  RuntimeShape input_shape_inference;
  RuntimeShape filter_shape_inference;
  RuntimeShape output_shape_inference;
  int pad_width, pad_height;
  EXPECT_TRUE(ComputeConvSizes(
      input_shape_inference, output_depth, filter_width, filter_height, stride,
      dilation_width_factor, dilation_height_factor, padding_type,
      &output_shape_inference, &pad_width, &pad_height));
  int stride;

  // Keeps trying until we get valid shape/configurations for 3x3 filter case.
  bool generated_valid_configurations_for_3x3_kernel = false;
  while (!generated_valid_configurations_for_3x3_kernel) {
    generated_valid_configurations_for_3x3_kernel =
        GenerateValidShapeConfigurations(
            filter_width, filter_height, depth_multiplier,
            dilation_width_factor, dilation_height_factor,
            &input_shape_inference, &filter_shape_inference,
            &output_shape_inference, &pad_width, &pad_height, &stride);
  }

  const int output_depth = output_shape_inference.Dims(3);

  RuntimeShape filter_shape_inference(
      {1, filter_height, filter_width, output_depth});
  RuntimeShape bias_shape_inference({1, 1, 1, output_depth});
  const int input_buffer_size = input_shape_inference.FlatSize();
  const int filter_buffer_size = filter_shape_inference.FlatSize();
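Review note: the restructuring above turns one-shot shape generation into rejection sampling; `TryTestOneDepthwiseConv3x3Filter` now loops until `GenerateValidShapeConfigurations` returns a configuration the fast 3x3 kernel accepts. The pattern in isolation (the names below are illustrative, not from the test):

```cpp
#include <random>

// Keep drawing random configurations until one satisfies the predicate.
// Assumes, as the test does, that the predicate is satisfiable.
template <typename Config, typename Gen, typename Pred>
Config SampleUntilValid(Gen generate, Pred is_valid) {
  Config config = generate();
  while (!is_valid(config)) {
    config = generate();
  }
  return config;
}

int main() {
  std::mt19937 rng(42);
  // Toy usage: sample an even number in [0, 9].
  const int even = SampleUntilValid<int>(
      [&] { return static_cast<int>(rng() % 10); },
      [](int v) { return v % 2 == 0; });
  (void)even;
  return 0;
}
```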
@@ -422,7 +422,11 @@ inline bool Fast3x3FilterKernelSupported(

  // TODO(b/132878669): Support padding.
  if (pad_height == 1) {
    return false;
    for (int i = 0; i < output_depth; ++i) {
      if (output_shift_ptr[i] != output_shift_ptr[0]) {
        return false;
      }
    }
  }

  for (int i = 0; i < output_depth; ++i) {
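Review note: instead of rejecting every padded configuration outright, the gate above now accepts `pad_height == 1` as long as all channels share one output shift, i.e. the per-channel quantization is effectively uniform. That check in isolation:

```cpp
#include <cstdint>

// True when every channel uses the same output shift, so the padded path
// can treat the per-channel parameters as uniform.
bool AllOutputShiftsEqual(const int32_t* output_shift_ptr, int output_depth) {
  for (int i = 0; i < output_depth; ++i) {
    if (output_shift_ptr[i] != output_shift_ptr[0]) {
      return false;
    }
  }
  return true;
}
```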
@@ -1902,8 +1902,8 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
template <>
struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
                                      EdgeType::kCenter, 1, 1> {
  static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
                         const int32* bias_ptr, uint8* output_ptr,
  static inline void Run(const int8* input_ptr, const int8* filter_ptr,
                         const int32* bias_ptr, int8* output_ptr,
                         const DepthwiseConvParams* params_ptr) {
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
@@ -1931,9 +1931,9 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        "dup v25.8h, w9\n"

        "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
        "uaddw v8.8h, v26.8h, v8.8b\n"
        "saddw v8.8h, v26.8h, v8.8b\n"
        "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
        "uaddw v0.8h, v25.8h, v0.8b\n"
        "saddw v0.8h, v25.8h, v0.8b\n"

        "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"

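Review note: the `uaddw` to `saddw` swaps track the uint8 to int8 retyping of these kernels. Both instructions widen 8-bit lanes to 16 bits before adding to the offset vector; `uaddw` zero-extends, `saddw` sign-extends. A single-lane scalar model (the NEON instructions do this across 8 lanes at once):

```cpp
#include <cstdint>

// uaddw: widen an unsigned 8-bit lane, then add to a 16-bit accumulator.
int16_t UaddwLane(int16_t acc, uint8_t lane) {
  return static_cast<int16_t>(acc + static_cast<int16_t>(lane));
}

// saddw: widen a *signed* 8-bit lane, preserving its sign, then add.
int16_t SaddwLane(int16_t acc, int8_t lane) {
  return static_cast<int16_t>(acc + static_cast<int16_t>(lane));
}
```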
@@ -1953,13 +1953,13 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        "sqxtn v16.4h, v16.4s\n"
        "sqxtn2 v16.8h, v17.4s\n"
        "sqadd v16.8h, v16.8h, v28.8h\n"
        "sqxtun v16.8b, v16.8h\n"
        "umax v16.8b, v16.8b, v30.8b\n"
        "umin v16.8b, v16.8b, v31.8b\n"
        "sqxtn v16.8b, v16.8h\n"
        "smax v16.8b, v16.8b, v30.8b\n"
        "smin v16.8b, v16.8b, v31.8b\n"
        "st1 {v16.8b}, [%[output_ptr]], #8\n"
        "uaddw v8.8h, v26.8h, v8.8b\n"
        "saddw v8.8h, v26.8h, v8.8b\n"
        "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
        "uaddw v0.8h, v25.8h, v0.8b\n"
        "saddw v0.8h, v25.8h, v0.8b\n"
        "ld1 {v17.4s}, [%[bias_ptr]], #16\n"

        "bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
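Review note: the store path changes the same way; `sqxtun` saturates a signed 16-bit lane into the *unsigned* 8-bit range while `sqxtn` targets the *signed* range, and the clamps switch from `umax`/`umin` to `smax`/`smin` to match. Scalar model of one lane:

```cpp
#include <algorithm>
#include <cstdint>

// sqxtun: saturating narrow, signed 16-bit -> unsigned 8-bit [0, 255].
uint8_t SqxtunLane(int16_t v) {
  return static_cast<uint8_t>(std::min<int16_t>(std::max<int16_t>(v, 0), 255));
}

// sqxtn: saturating narrow, signed 16-bit -> signed 8-bit [-128, 127].
int8_t SqxtnLane(int16_t v) {
  return static_cast<int8_t>(
      std::min<int16_t>(std::max<int16_t>(v, -128), 127));
}
```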
@@ -1976,9 +1976,9 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        "sqxtn v16.4h, v16.4s\n"
        "sqxtn2 v16.8h, v17.4s\n"
        "sqadd v16.8h, v16.8h, v28.8h\n"
        "sqxtun v16.8b, v16.8h\n"
        "umax v16.8b, v16.8b, v30.8b\n"
        "umin v16.8b, v16.8b, v31.8b\n"
        "sqxtn v16.8b, v16.8h\n"
        "smax v16.8b, v16.8b, v30.8b\n"
        "smin v16.8b, v16.8b, v31.8b\n"
        "st1 {v16.8b}, [%[output_ptr]]\n"
        :
        // Outputs.
@@ -2003,8 +2003,8 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
template <>
struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
                                      EdgeType::kCorner, 1, 1> {
  static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
                         const int32* bias_ptr, uint8* output_ptr,
  static inline void Run(const int8* input_ptr, const int8* filter_ptr,
                         const int32* bias_ptr, int8* output_ptr,
                         const DepthwiseConvParams* params_ptr) {
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
@@ -2052,17 +2052,17 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        "dup v25.8h, w6\n"

        // Add input and filter offsets.
        "uaddw v8.8h, v26.8h, v8.8b\n"
        "saddw v8.8h, v26.8h, v8.8b\n"
        "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
        "uaddw v9.8h, v26.8h, v9.8b\n"
        "saddw v9.8h, v26.8h, v9.8b\n"
        "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
        "uaddw v10.8h, v26.8h, v10.8b\n"
        "uaddw v11.8h, v26.8h, v11.8b\n"
        "saddw v10.8h, v26.8h, v10.8b\n"
        "saddw v11.8h, v26.8h, v11.8b\n"

        "uaddw v0.8h, v25.8h, v0.8b\n"
        "uaddw v1.8h, v25.8h, v1.8b\n"
        "uaddw v2.8h, v25.8h, v2.8b\n"
        "uaddw v3.8h, v25.8h, v3.8b\n"
        "saddw v0.8h, v25.8h, v0.8b\n"
        "saddw v1.8h, v25.8h, v1.8b\n"
        "saddw v2.8h, v25.8h, v2.8b\n"
        "saddw v3.8h, v25.8h, v3.8b\n"

        "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"

@@ -2094,20 +2094,20 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        "sqxtn v16.4h, v16.4s\n"
        "sqxtn2 v16.8h, v17.4s\n"
        "sqadd v16.8h, v16.8h, v28.8h\n"
        "sqxtun v16.8b, v16.8h\n"
        "umax v16.8b, v16.8b, v30.8b\n"
        "umin v16.8b, v16.8b, v31.8b\n"
        "sqxtn v16.8b, v16.8h\n"
        "smax v16.8b, v16.8b, v30.8b\n"
        "smin v16.8b, v16.8b, v31.8b\n"
        "st1 {v16.8b}, [%[output_ptr]], #8\n"
        "uaddw v8.8h, v26.8h, v8.8b\n"
        "saddw v8.8h, v26.8h, v8.8b\n"
        "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
        "uaddw v9.8h, v26.8h, v9.8b\n"
        "saddw v9.8h, v26.8h, v9.8b\n"
        "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
        "uaddw v10.8h, v26.8h, v10.8b\n"
        "uaddw v11.8h, v26.8h, v11.8b\n"
        "uaddw v0.8h, v25.8h, v0.8b\n"
        "uaddw v1.8h, v25.8h, v1.8b\n"
        "uaddw v2.8h, v25.8h, v2.8b\n"
        "uaddw v3.8h, v25.8h, v3.8b\n"
        "saddw v10.8h, v26.8h, v10.8b\n"
        "saddw v11.8h, v26.8h, v11.8b\n"
        "saddw v0.8h, v25.8h, v0.8b\n"
        "saddw v1.8h, v25.8h, v1.8b\n"
        "saddw v2.8h, v25.8h, v2.8b\n"
        "saddw v3.8h, v25.8h, v3.8b\n"

        "bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"

@@ -2129,9 +2129,9 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        "sqxtn v16.4h, v16.4s\n"
        "sqxtn2 v16.8h, v17.4s\n"
        "sqadd v16.8h, v16.8h, v28.8h\n"
        "sqxtun v16.8b, v16.8h\n"
        "umax v16.8b, v16.8b, v30.8b\n"
        "umin v16.8b, v16.8b, v31.8b\n"
        "sqxtn v16.8b, v16.8h\n"
        "smax v16.8b, v16.8b, v30.8b\n"
        "smin v16.8b, v16.8b, v31.8b\n"
        "st1 {v16.8b}, [%[output_ptr]]\n"
        :
        // Outputs.
@@ -2156,8 +2156,8 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
template <>
struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
                                      EdgeType::kHorizontal, 1, 1> {
  static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
                         const int32* bias_ptr, uint8* output_ptr,
  static inline void Run(const int8* input_ptr, const int8* filter_ptr,
                         const int32* bias_ptr, int8* output_ptr,
                         const DepthwiseConvParams* params_ptr) {
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
@@ -2211,21 +2211,21 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        "dup v25.8h, w12\n"

        // Add input and filter offsets.
        "uaddw v8.8h, v26.8h, v8.8b\n"
        "saddw v8.8h, v26.8h, v8.8b\n"
        "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
        "uaddw v9.8h, v26.8h, v9.8b\n"
        "saddw v9.8h, v26.8h, v9.8b\n"
        "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
        "uaddw v10.8h, v26.8h, v10.8b\n"
        "uaddw v11.8h, v26.8h, v11.8b\n"
        "uaddw v12.8h, v26.8h, v12.8b\n"
        "uaddw v13.8h, v26.8h, v13.8b\n"
        "saddw v10.8h, v26.8h, v10.8b\n"
        "saddw v11.8h, v26.8h, v11.8b\n"
        "saddw v12.8h, v26.8h, v12.8b\n"
        "saddw v13.8h, v26.8h, v13.8b\n"

        "uaddw v0.8h, v25.8h, v0.8b\n"
        "uaddw v1.8h, v25.8h, v1.8b\n"
        "uaddw v2.8h, v25.8h, v2.8b\n"
        "uaddw v3.8h, v25.8h, v3.8b\n"
        "uaddw v4.8h, v25.8h, v4.8b\n"
        "uaddw v5.8h, v25.8h, v5.8b\n"
        "saddw v0.8h, v25.8h, v0.8b\n"
        "saddw v1.8h, v25.8h, v1.8b\n"
        "saddw v2.8h, v25.8h, v2.8b\n"
        "saddw v3.8h, v25.8h, v3.8b\n"
        "saddw v4.8h, v25.8h, v4.8b\n"
        "saddw v5.8h, v25.8h, v5.8b\n"

        "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"

@@ -2272,25 +2272,25 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        "sqxtn v16.4h, v16.4s\n"
        "sqxtn2 v16.8h, v17.4s\n"
        "sqadd v16.8h, v16.8h, v28.8h\n"
        "sqxtun v16.8b, v16.8h\n"
        "umax v16.8b, v16.8b, v30.8b\n"
        "umin v16.8b, v16.8b, v31.8b\n"
        "uaddw v8.8h, v26.8h, v8.8b\n"
        "sqxtn v16.8b, v16.8h\n"
        "smax v16.8b, v16.8b, v30.8b\n"
        "smin v16.8b, v16.8b, v31.8b\n"
        "saddw v8.8h, v26.8h, v8.8b\n"
        "st1 {v16.8b}, [%[output_ptr]], #8\n"
        "uaddw v9.8h, v26.8h, v9.8b\n"
        "uaddw v10.8h, v26.8h, v10.8b\n"
        "uaddw v11.8h, v26.8h, v11.8b\n"
        "uaddw v12.8h, v26.8h, v12.8b\n"
        "uaddw v13.8h, v26.8h, v13.8b\n"
        "saddw v9.8h, v26.8h, v9.8b\n"
        "saddw v10.8h, v26.8h, v10.8b\n"
        "saddw v11.8h, v26.8h, v11.8b\n"
        "saddw v12.8h, v26.8h, v12.8b\n"
        "saddw v13.8h, v26.8h, v13.8b\n"

        "uaddw v0.8h, v25.8h, v0.8b\n"
        "uaddw v1.8h, v25.8h, v1.8b\n"
        "uaddw v2.8h, v25.8h, v2.8b\n"
        "saddw v0.8h, v25.8h, v0.8b\n"
        "saddw v1.8h, v25.8h, v1.8b\n"
        "saddw v2.8h, v25.8h, v2.8b\n"
        "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
        "uaddw v3.8h, v25.8h, v3.8b\n"
        "saddw v3.8h, v25.8h, v3.8b\n"
        "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
        "uaddw v4.8h, v25.8h, v4.8b\n"
        "uaddw v5.8h, v25.8h, v5.8b\n"
        "saddw v4.8h, v25.8h, v4.8b\n"
        "saddw v5.8h, v25.8h, v5.8b\n"

        "bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"

@@ -2315,9 +2315,9 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        "sqxtn v16.4h, v16.4s\n"
        "sqxtn2 v16.8h, v17.4s\n"
        "sqadd v16.8h, v16.8h, v28.8h\n"
        "sqxtun v16.8b, v16.8h\n"
        "umax v16.8b, v16.8b, v30.8b\n"
        "umin v16.8b, v16.8b, v31.8b\n"
        "sqxtn v16.8b, v16.8h\n"
        "smax v16.8b, v16.8b, v30.8b\n"
        "smin v16.8b, v16.8b, v31.8b\n"
        "st1 {v16.8b}, [%[output_ptr]]\n"
        :
        // Outputs.
@@ -2342,8 +2342,8 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
template <>
struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
                                      EdgeType::kVertical, 1, 1> {
  static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
                         const int32* bias_ptr, uint8* output_ptr,
  static inline void Run(const int8* input_ptr, const int8* filter_ptr,
                         const int32* bias_ptr, int8* output_ptr,
                         const DepthwiseConvParams* params_ptr) {
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
@@ -2399,21 +2399,21 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        "dup v25.8h, w12\n"

        // Add input and filter offsets.
        "uaddw v8.8h, v26.8h, v8.8b\n"
        "saddw v8.8h, v26.8h, v8.8b\n"
        "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
        "uaddw v9.8h, v26.8h, v9.8b\n"
        "saddw v9.8h, v26.8h, v9.8b\n"
        "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
        "uaddw v10.8h, v26.8h, v10.8b\n"
        "uaddw v11.8h, v26.8h, v11.8b\n"
        "uaddw v12.8h, v26.8h, v12.8b\n"
        "uaddw v13.8h, v26.8h, v13.8b\n"
        "saddw v10.8h, v26.8h, v10.8b\n"
        "saddw v11.8h, v26.8h, v11.8b\n"
        "saddw v12.8h, v26.8h, v12.8b\n"
        "saddw v13.8h, v26.8h, v13.8b\n"

        "uaddw v0.8h, v25.8h, v0.8b\n"
        "uaddw v1.8h, v25.8h, v1.8b\n"
        "uaddw v2.8h, v25.8h, v2.8b\n"
        "uaddw v3.8h, v25.8h, v3.8b\n"
        "uaddw v4.8h, v25.8h, v4.8b\n"
        "uaddw v5.8h, v25.8h, v5.8b\n"
        "saddw v0.8h, v25.8h, v0.8b\n"
        "saddw v1.8h, v25.8h, v1.8b\n"
        "saddw v2.8h, v25.8h, v2.8b\n"
        "saddw v3.8h, v25.8h, v3.8b\n"
        "saddw v4.8h, v25.8h, v4.8b\n"
        "saddw v5.8h, v25.8h, v5.8b\n"

        "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"

@@ -2462,25 +2462,25 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        "sqxtn v16.4h, v16.4s\n"
        "sqxtn2 v16.8h, v17.4s\n"
        "sqadd v16.8h, v16.8h, v28.8h\n"
        "sqxtun v16.8b, v16.8h\n"
        "umax v16.8b, v16.8b, v30.8b\n"
        "umin v16.8b, v16.8b, v31.8b\n"
        "uaddw v8.8h, v26.8h, v8.8b\n"
        "sqxtn v16.8b, v16.8h\n"
        "smax v16.8b, v16.8b, v30.8b\n"
        "smin v16.8b, v16.8b, v31.8b\n"
        "saddw v8.8h, v26.8h, v8.8b\n"
        "st1 {v16.8b}, [%[output_ptr]], #8\n"
        "uaddw v9.8h, v26.8h, v9.8b\n"
        "uaddw v10.8h, v26.8h, v10.8b\n"
        "uaddw v11.8h, v26.8h, v11.8b\n"
        "uaddw v12.8h, v26.8h, v12.8b\n"
        "uaddw v13.8h, v26.8h, v13.8b\n"
        "saddw v9.8h, v26.8h, v9.8b\n"
        "saddw v10.8h, v26.8h, v10.8b\n"
        "saddw v11.8h, v26.8h, v11.8b\n"
        "saddw v12.8h, v26.8h, v12.8b\n"
        "saddw v13.8h, v26.8h, v13.8b\n"

        "uaddw v0.8h, v25.8h, v0.8b\n"
        "uaddw v1.8h, v25.8h, v1.8b\n"
        "uaddw v2.8h, v25.8h, v2.8b\n"
        "saddw v0.8h, v25.8h, v0.8b\n"
        "saddw v1.8h, v25.8h, v1.8b\n"
        "saddw v2.8h, v25.8h, v2.8b\n"
        "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
        "uaddw v3.8h, v25.8h, v3.8b\n"
        "saddw v3.8h, v25.8h, v3.8b\n"
        "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
        "uaddw v4.8h, v25.8h, v4.8b\n"
        "uaddw v5.8h, v25.8h, v5.8b\n"
        "saddw v4.8h, v25.8h, v4.8b\n"
        "saddw v5.8h, v25.8h, v5.8b\n"

        "bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"

@@ -2505,10 +2505,10 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        "sqxtn v16.4h, v16.4s\n"
        "sqxtn2 v16.8h, v17.4s\n"
        "sqadd v16.8h, v16.8h, v28.8h\n"
        "sqxtun v16.8b, v16.8h\n"
        "sqxtn v16.8b, v16.8h\n"
        // TODO(b/129852264): Improve testing coverage.
        "umax v16.8b, v16.8b, v30.8b\n"
        "umin v16.8b, v16.8b, v31.8b\n"
        "smax v16.8b, v16.8b, v30.8b\n"
        "smin v16.8b, v16.8b, v31.8b\n"
        "st1 {v16.8b}, [%[output_ptr]]\n"
        :
        // Outputs.
@@ -2690,10 +2690,10 @@ struct DepthwiseConvMultiRowPerChannel {
// * Vertical edges.
template <DepthwiseConvOutputRounding output_rounding>
inline void DepthwiseConvHandlePaddingPerChannel(
    const uint8* input_data, const uint8* filter_data, const int32* bias_data,
    uint8* output_data, const DepthwiseConvParams& params) {
    const int8* input_data, const int8* filter_data, const int32* bias_data,
    int8* output_data, const DepthwiseConvParams& params) {
  if (params.input_width == 1 && params.input_height == 1) {
    const uint8* filter_ptr =
    const int8* filter_ptr =
        filter_data + params.filter_row_size + params.output_depth;
    DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kCenter, 1,
                                   1>::Run(input_data, filter_ptr, bias_data,
@@ -2707,10 +2707,10 @@ inline void DepthwiseConvHandlePaddingPerChannel(
  const int32 out_y_end_corner = params.output_height - 1;

  // Handle top row.
  const uint8* input_ptr = input_data;
  const uint8* filter_ptr =
  const int8* input_ptr = input_data;
  const int8* filter_ptr =
      filter_data + params.filter_row_size + params.output_depth;
  uint8* output_ptr = output_data;
  int8* output_ptr = output_data;

  DepthwiseConvPartialPerChannel<output_rounding, EdgeType::kCorner, 1, 1>::Run(
      input_ptr, filter_ptr, bias_data, output_ptr, &params);
@@ -2911,16 +2911,16 @@ inline void DepthwiseConv3x3FilterPerChannel(
  int32 end_y = row_end;

  // TODO(b/132878669): Support padding.
  // if (pad_width == 1 && pad_height == 1) {
  //   DepthwiseConvHandlePaddingPerChannel<output_rounding>(
  //       input_ptr, filter_data, bias_data, output_ptr, params);
  //
  //   // Update extents now that the edges have been handled.
  //   out_x = 1;
  //   end_x = params.output_width - 1;
  //   out_y = std::max(1, out_y);
  //   end_y = std::min(params.output_height - 1, end_y);
  // }
  if (pad_width == 1 && pad_height == 1) {
    DepthwiseConvHandlePaddingPerChannel<output_rounding>(
        input_ptr, filter_data, bias_data, output_ptr, params);

    // Update extents now that the edges have been handled.
    out_x = 1;
    end_x = params.output_width - 1;
    out_y = std::max(1, out_y);
    end_y = std::min(params.output_height - 1, end_y);
  }

  // pad_width and pad_height can both be 0 or 1, depending on padding option,
  // such as Padding_VALID / Padding_SAME.

@@ -213,9 +213,10 @@ int Main(int argc, char* argv[]) {
  evaluator->EvaluateModel();

  if (!proto_output_file_path.empty()) {
    std::ofstream proto_out_file(proto_output_file_path, std::ios::out);
    std::ofstream proto_out_file(proto_output_file_path,
                                 std::ios::out | std::ios::binary);
    TopkAccuracyEvalMetrics metrics = results_writer.AggregatedMetrics();
    proto_out_file << metrics.DebugString();
    proto_out_file << metrics.SerializeAsString();
    proto_out_file.close();
  }

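Review note: the two changes above go together; once the stream is opened with `std::ios::binary`, it should receive the protobuf wire format (`SerializeAsString`) rather than the human-readable `DebugString()`. A sketch of the corrected write, with the message type left generic so the snippet stands alone:

```cpp
#include <fstream>
#include <string>

// Works for any protobuf message type, e.g. TopkAccuracyEvalMetrics.
template <typename Proto>
void WriteBinaryProto(const std::string& path, const Proto& message) {
  std::ofstream out(path, std::ios::out | std::ios::binary);
  out << message.SerializeAsString();  // wire format, not DebugString()
}
```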
@@ -256,21 +256,20 @@ class _InterpolateFunctionError(object):
    _, tags = error_interpolation.parse_message(message)
    g = None
    func_stack = []
    # pylint: disable=protected-access
    for t in tags:
      if t.type == "function_node":
        # TODO(mdan): Tests should cover this.
        if t.name == compat.as_str(self._func.name):
          g = self._func._graph
          g = self._func.graph
        elif g:
          next_func = g._get_function(t.name)
          if next_func is not None and isinstance(next_func,
                                                  _EagerDefinedFunction):
            g = next_func._graph
            g = next_func.graph
        if g:
          func_stack.append(g.name)
        else:
          func_stack.append("<unknown>")
    # pylint: enable=protected-access
    if g:
      message = error_interpolation.interpolate(message, g)
      message += "\n\nFunction call stack:\n"
@@ -18,12 +18,15 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow.core.framework import attr_value_pb2
from tensorflow.core.framework import graph_pb2
from tensorflow.core.framework import variable_pb2
from tensorflow.core.protobuf import config_pb2
from tensorflow.core.protobuf import meta_graph_pb2
from tensorflow.python.eager import wrap_function
from tensorflow.python.framework import tensor_util
from tensorflow.python.grappler import tf_optimizer
from tensorflow.python.ops import array_ops
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training.saver import export_meta_graph

@@ -43,6 +46,19 @@ def _run_inline_graph_optimization(func):
  meta_graph = export_meta_graph(
      graph_def=func.graph.as_graph_def(), graph=func.graph)

  # Clear the initializer_name for the variables collections, since they are not
  # needed after saved to saved_model.
  for name in [
      "variables", "model_variables", "trainable_variables", "local_variables"
  ]:
    raw_list = []
    for raw in meta_graph.collection_def["variables"].bytes_list.value:
      variable = variable_pb2.VariableDef()
      variable.ParseFromString(raw)
      variable.ClearField("initializer_name")
      raw_list.append(variable.SerializeToString())
    meta_graph.collection_def[name].bytes_list.value[:] = raw_list

  # Add a collection 'train_op' so that Grappler knows the outputs.
  fetch_collection = meta_graph_pb2.CollectionDef()
  for array in func.inputs + func.outputs:
@@ -123,6 +139,7 @@ def convert_variables_to_constants_v2(func):
  resource_identities = {}
  placeholders = {}
  converted_input_indices = set()
  reference_variables = []
  for node in graph_def.node:
    if node.name in map_name_to_value:
      # Get the dtype and data for the Placeholders whose values are stored as
@@ -134,6 +151,9 @@ def convert_variables_to_constants_v2(func):
      }
      converted_input_indices.add(
          func.captured_inputs.index(map_name_to_value[node.name]))
    # Collect the reference variables that cannot be lifted.
    if node.op == "VariableV2":
      reference_variables.append(node)
    if node.op == "ReadVariableOp":
      # Get name of Placeholder op associated with ReadVariableOp. There can be
      # an Identity in between the ReadVariableOp and Placeholder. Store the
@@ -158,7 +178,35 @@ def convert_variables_to_constants_v2(func):
  output_graph_def = graph_pb2.GraphDef()
  how_many_converted = 0

  # Add identity node after the reference variable and get the tensor values
  # for them.
  if reference_variables:
    reference_variable_tensors = []
    with func.graph.as_default():
      for node in reference_variables:
        identity_node = array_ops.identity(
            func.graph.as_graph_element(node.name + ":0"))
        reference_variable_tensors.append(identity_node.name)

    reference_variable_values = func.prune([], reference_variable_tensors)()

    # Add values of reference variables as constant nodes.
    for node, value in zip(reference_variables, reference_variable_values):
      output_node = output_graph_def.node.add()
      dtype = attr_value_pb2.AttrValue()
      dtype.type = value.dtype.as_datatype_enum

      output_node.op = "Const"
      output_node.name = node.name
      output_node.attr["dtype"].CopyFrom(dtype)
      output_node.attr["value"].tensor.CopyFrom(
          tensor_util.make_tensor_proto(value))
      how_many_converted += 1

  for input_node in graph_def.node:
    # Skip VariableV2 node, since their values are added by the identity nodes.
    if input_node.op == "VariableV2":
      continue
    output_node = output_graph_def.node.add()
    # Convert Placeholder ops to Const ops.
    if input_node.name in placeholders:
@@ -21,12 +21,17 @@ from __future__ import print_function
import os

from tensorflow.python import keras
from tensorflow.python.client import session as session_lib
from tensorflow.python.eager import def_function
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import convert_to_constants
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import test_util
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
from tensorflow.python.saved_model import simple_save
from tensorflow.python.saved_model.load import load
from tensorflow.python.saved_model.save import save
from tensorflow.python.training.tracking import tracking
@@ -51,9 +56,9 @@ class VariablesToConstantsTest(test.TestCase):
                             input_data):
    # Check that the converted ConcreteFunction produces the same result as the
    # original Function.
    expected_value = func(input_data)
    expected_value = nest.flatten(func(input_data))
    actual_value = nest.flatten(converted_concrete_func(input_data))
    self.assertEqual(expected_value.numpy(), actual_value)
    self.assertEqual(expected_value[0].numpy(), actual_value)

    # Ensure the shape is retained.
    self.assertEqual(converted_concrete_func.inputs[0].shape, input_data.shape)
@@ -65,7 +70,7 @@ class VariablesToConstantsTest(test.TestCase):
    # Load it back and make sure it works.
    loaded_obj = load(save_dir)
    actual_value = nest.flatten(loaded_obj.signatures["mykey"](input_data))
    self.assertEqual(expected_value.numpy(), actual_value)
    self.assertEqual(expected_value[0].numpy(), actual_value)

  @test_util.run_v2_only
  def testConstSavedModel(self):
@@ -231,6 +236,44 @@ class VariablesToConstantsTest(test.TestCase):
    actual_value = nest.flatten(output_func(input_data))
    self.assertEqual(expected_value.numpy(), actual_value)

  def _v1_single_metagraph_saved_model(self):
    export_graph = ops.Graph()
    with export_graph.as_default():
      start = array_ops.placeholder(
          shape=[1, 1], dtype=dtypes.float32, name="start")
      distractor = variables.RefVariable(-1., name="distractor")
      v = variables.RefVariable(3., name="v")
      local_variable = variables.VariableV1(
          1.,
          collections=[ops.GraphKeys.LOCAL_VARIABLES],
          trainable=False,
          use_resource=True)
      output = array_ops.identity(start * v * local_variable, name="output")
      with session_lib.Session() as session:
        session.run([v.initializer, distractor.initializer,
                     local_variable.initializer])
        path = os.path.join(self.get_temp_dir(), "saved_model", str(ops.uid()))
        simple_save.simple_save(
            session,
            path,
            inputs={"start": start},
            outputs={"output": output},
            legacy_init_op=local_variable.initializer)
    return path

  @test_util.run_v2_only
  def test_ref_variable_import(self):
    saved = self._v1_single_metagraph_saved_model()
    imported = load(saved)
    fn = imported.signatures["serving_default"]
    output_func = convert_to_constants.convert_variables_to_constants_v2(fn)
    constant_graph_def = output_func.graph.as_graph_def()
    self.assertEqual(0, self._getNumVariables(constant_graph_def))
    self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def))

    input_data = constant_op.constant(1., shape=[1, 1])
    root = tracking.AutoTrackable()
    self._testConvertedFunction(root, fn, output_func, input_data)

if __name__ == "__main__":
  test.main()

@@ -235,6 +235,11 @@ class SpaceToDepthTest(test.TestCase):

  def spaceToDepthUsingTranspose(self, tensor, block_size, data_format):
    block_size_sq = block_size * block_size

    dtype = tensor.dtype
    if dtype == dtypes.qint8:
      tensor = array_ops.bitcast(tensor, dtypes.int8)

    if data_format == "NHWC":
      b, ih, iw, ic = tensor.shape.as_list()
      assert ih % block_size == 0, (ih, block_size)
@@ -253,56 +258,87 @@ class SpaceToDepthTest(test.TestCase):
                                 [b, ic, oh, block_size, ow, block_size])
      tensor = array_ops.transpose(tensor, [0, 3, 5, 1, 2, 4])
      tensor = array_ops.reshape(tensor, [b, oc, oh, ow])

    if dtype == dtypes.qint8:
      tensor = array_ops.bitcast(tensor, dtype)
    return tensor

  def compareToTranspose(self, batch_size, out_height, out_width, in_channels,
                         block_size, data_format, use_gpu):
                         block_size, data_format, data_type, use_gpu):
    in_height = out_height * block_size
    in_width = out_width * block_size
    nhwc_input_shape = [batch_size, in_height, in_width, in_channels]
    nchw_input_shape = [batch_size, in_channels, in_height, in_width]
    total_size = np.prod(nhwc_input_shape)

    if data_format == "NCHW_VECT_C":
      # Initialize the input tensor with qint8 values that circle -127..127.
      x = [((f + 128) % 255) - 127 for f in range(total_size)]
      t = constant_op.constant(x, shape=nhwc_input_shape, dtype=dtypes.float32)
      expected = self.spaceToDepthUsingTranspose(t, block_size, "NHWC")
      t = test_util.NHWCToNCHW_VECT_C(t)
      t, _, _ = gen_array_ops.quantize_v2(t, -128.0, 127.0, dtypes.qint8)
      t = array_ops.space_to_depth(t, block_size, data_format="NCHW_VECT_C")
      t = gen_array_ops.dequantize(t, -128, 127)
      actual = test_util.NCHW_VECT_CToNHWC(t)
    else:
      # Initialize the input tensor with ascending whole numbers as floats.
      x = [f * 1.0 for f in range(total_size)]
      shape = nchw_input_shape if data_format == "NCHW" else nhwc_input_shape
      t = constant_op.constant(x, shape=shape, dtype=dtypes.float32)
      expected = self.spaceToDepthUsingTranspose(t, block_size, data_format)
      actual = array_ops.space_to_depth(t, block_size, data_format=data_format)
    # Construct the input tensor in data_type and NHWC.
    # force_cpu is needed because quantize_v2 runs on only CPU.
    with test_util.force_cpu():
      if data_type == dtypes.qint8:
        # Initialize the input tensor with qint8 values that circle -127..127.
        x = [((f + 128) % 255) - 127 for f in range(total_size)]
        t = constant_op.constant(
            x, shape=nhwc_input_shape, dtype=dtypes.float32)
        t, _, _ = gen_array_ops.quantize_v2(t, -128.0, 127.0, dtypes.qint8)
      else:
        assert data_type == dtypes.float32
        # Initialize the input tensor with ascending whole numbers as floats.
        x = [f * 1.0 for f in range(total_size)]
        shape = nchw_input_shape if data_format == "NCHW" else nhwc_input_shape
        t = constant_op.constant(x, shape=shape, dtype=dtypes.float32)

    with test_util.device(use_gpu):
      if data_format == "NCHW_VECT_C":
        assert data_type == dtypes.qint8

        # Convert to int8, then NHWCToNCHW_VECT_C, and then back to qint8.
        actual = array_ops.bitcast(t, dtypes.int8)
        actual = test_util.NHWCToNCHW_VECT_C(actual)
        actual = array_ops.bitcast(actual, dtypes.qint8)
        actual = array_ops.space_to_depth(
            actual, block_size, data_format=data_format)
        actual = array_ops.bitcast(actual, dtypes.int8)
        actual = test_util.NCHW_VECT_CToNHWC(actual)
        actual = array_ops.bitcast(actual, dtypes.qint8)

        expected = array_ops.bitcast(t, dtypes.int8)
        expected = math_ops.cast(expected, dtypes.float32)
        expected = self.spaceToDepthUsingTranspose(expected, block_size, "NHWC")
        expected = math_ops.cast(expected, dtypes.int8)
        expected = array_ops.bitcast(expected, dtypes.qint8)
      else:
        # Initialize the input tensor with ascending whole numbers as floats.
        actual = array_ops.space_to_depth(
            t, block_size, data_format=data_format)
        expected = self.spaceToDepthUsingTranspose(t, block_size, data_format)

    with self.cached_session(use_gpu=use_gpu) as sess:
      actual_vals, expected_vals = self.evaluate([actual, expected])
      self.assertTrue(np.array_equal(actual_vals, expected_vals))

  # TODO(jingyue): figure out why this test failed in eager mode.
  @test_util.run_deprecated_v1
  def testAgainstTranspose(self):
    self.compareToTranspose(3, 2, 3, 1, 2, "NHWC", False)
    self.compareToTranspose(1, 2, 3, 2, 2, "NHWC", False)
    self.compareToTranspose(1, 2, 3, 2, 3, "NHWC", False)
    self.compareToTranspose(3, 2, 3, 1, 2, "NHWC", dtypes.float32, False)
    self.compareToTranspose(1, 2, 3, 2, 2, "NHWC", dtypes.float32, False)
    self.compareToTranspose(1, 2, 3, 2, 3, "NHWC", dtypes.float32, False)

    self.compareToTranspose(3, 2, 3, 1, 2, "NHWC", dtypes.qint8, False)
    self.compareToTranspose(1, 2, 3, 2, 2, "NHWC", dtypes.qint8, False)
    self.compareToTranspose(1, 2, 3, 2, 3, "NHWC", dtypes.qint8, False)

    if not test.is_gpu_available():
      tf_logging.info("skipping gpu tests since gpu not available")
      return

    self.compareToTranspose(3, 2, 3, 1, 2, "NHWC", True)
    self.compareToTranspose(3, 2, 3, 2, 2, "NHWC", True)
    self.compareToTranspose(3, 2, 3, 1, 2, "NCHW", True)
    self.compareToTranspose(3, 2, 3, 2, 3, "NCHW", True)
    self.compareToTranspose(5, 7, 11, 3, 2, "NCHW", True)
    self.compareToTranspose(3, 2, 3, 1, 2, "NHWC", dtypes.float32, True)
    self.compareToTranspose(3, 2, 3, 2, 2, "NHWC", dtypes.float32, True)
    self.compareToTranspose(3, 2, 3, 1, 2, "NCHW", dtypes.float32, True)
    self.compareToTranspose(3, 2, 3, 2, 3, "NCHW", dtypes.float32, True)
    self.compareToTranspose(5, 7, 11, 3, 2, "NCHW", dtypes.float32, True)

    self.compareToTranspose(3, 2, 3, 4, 2, "NCHW_VECT_C", True)
    self.compareToTranspose(3, 2, 3, 8, 3, "NCHW_VECT_C", True)
    self.compareToTranspose(5, 7, 11, 12, 2, "NCHW_VECT_C", True)
    self.compareToTranspose(3, 2, 3, 4, 2, "NCHW_VECT_C", dtypes.qint8, True)
    self.compareToTranspose(3, 2, 3, 8, 3, "NCHW_VECT_C", dtypes.qint8, True)
    self.compareToTranspose(5, 7, 11, 12, 2, "NCHW_VECT_C", dtypes.qint8, True)


class SpaceToDepthGradientTest(test.TestCase):

@@ -1971,6 +1971,7 @@ def tf_py_wrap_cc(
# //third_party/tensorflow/tools/pip_package:win_pip_package_marker for specific reasons.
# 2. When --define=no_tensorflow_py_deps=false (by default), it's a normal py_test.
def py_test(deps = [], data = [], kernels = [], **kwargs):
    # Python version placeholder
    native.py_test(
        # TODO(jlebar): Ideally we'd use tcmalloc here.,
        deps = select({
@@ -1999,6 +2000,8 @@ def py_binary(name, deps = [], **kwargs):
        name = name + "_deps",
        deps = deps,
    )

    # Python version placeholder
    native.py_binary(
        name = name,
        deps = select({

|
@ -34,10 +34,6 @@ tf_class {
|
||||
name: "is_alive"
|
||||
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
|
||||
}
|
||||
member_method {
|
||||
name: "join"
|
||||
argspec: "args=[\'self\', \'timeout\'], varargs=None, keywords=None, defaults=[\'None\'], "
|
||||
}
|
||||
member_method {
|
||||
name: "loop"
|
||||
argspec: "args=[\'coord\', \'timer_interval_secs\', \'target\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
|
||||
|
@@ -41,6 +41,9 @@ _CORNER_CASES = {
    'estimator.NanLossDuringTrainingError': {
        'message': {}
    },
    'train.LooperThread': {
        'join': {}
    }
}

# Python 2 vs. 3 differences