Branch 183429339 (#16469)

* Change `reduce_logsumexp` to internally use `reshape` rather than `squeeze`
since the latter requires the `axis` arg to be a Python `list`.

PiperOrigin-RevId: 183396533

* Kernel utils to support broadcast add and mul.

PiperOrigin-RevId: 183397494

* Updating sparsify_gather.

PiperOrigin-RevId: 183402917

* [tf.data] Move slow-path-related code into the slow path in IteratorHandleOp::Compute().

This slightly reduces the amount of work performed when an iterator is accessed (after the first access), and potentially reduces contention if concurrent steps are accessing the same iterator.

PiperOrigin-RevId: 183406221

* Cleanup: Ran clang-format on all *.{cc,h} files under grappler.

PiperOrigin-RevId: 183406440

* Increase shard count of //third_party/tensorflow/python:nn_batchnorm_test to avoid timeouts

When run under asan, the test runs for about 5 minutes, and sometimes
longer, causing frequent timeouts.

This change increases the shard count of the test to 4, which brings the run time
of the longest-running shard under asan to about 2 minutes.

PiperOrigin-RevId: 183414888

* Add available choices to toco flags and fix minor formatting issues.

PiperOrigin-RevId: 183415713

* Improve performance of some GPU code by using shared locks instead of unique locks in some hotspot cases.

PiperOrigin-RevId: 183418559

* [XLA] Improve error message for bad slices.

PiperOrigin-RevId: 183420038

* Fix py3 build rules for all py tests under py2tf.

PiperOrigin-RevId: 183422144

* Fix bug with Operation._control_inputs setter.

PiperOrigin-RevId: 183422192

* Make softmax_op_test.py work with C API enabled.

PiperOrigin-RevId: 183422829

* Cleanup: Ran clang-format on all *.{cc,h} files in tensorflow/core/kernels.

PiperOrigin-RevId: 183423961

* Fix the dense layer documentation to describe how rank > 2 inputs are handled.

PiperOrigin-RevId: 183425868

* Cleanup: Ran clang-format on all *.{cc,h} files in tensorflow/core/ops.

PiperOrigin-RevId: 183429339
Rasmus Munk Larsen 2018-01-26 13:32:16 -08:00 committed by GitHub
parent f84623507b
commit 982549ea34
325 changed files with 4706 additions and 4373 deletions


@ -37,6 +37,9 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/protobuf.h"
using tensorflow::str_util::Join;
using tensorflow::strings::Printf;
namespace xla { namespace xla {
namespace { namespace {
@ -934,7 +937,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
"inferring shape for <%s>(%s, %s) with broadcast_dimensions={%s}", "inferring shape for <%s>(%s, %s) with broadcast_dimensions={%s}",
BinaryOperation_Name(operation).c_str(), BinaryOperation_Name(operation).c_str(),
ShapeUtil::HumanString(lhs).c_str(), ShapeUtil::HumanString(rhs).c_str(), ShapeUtil::HumanString(lhs).c_str(), ShapeUtil::HumanString(rhs).c_str(),
tensorflow::str_util::Join(broadcast_dimensions, ", ").c_str()); Join(broadcast_dimensions, ", ").c_str());
TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs)); TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs));
TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs)); TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs));
@ -1097,7 +1100,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
return InvalidArgument( return InvalidArgument(
"Map operation requires all operands to have the same shape; got: " "Map operation requires all operands to have the same shape; got: "
"%s", "%s",
tensorflow::str_util::Join(pieces, ", ").c_str()); Join(pieces, ", ").c_str());
} }
// Check that dimensions.size == arg_shape.dimensions_size() (we currently // Check that dimensions.size == arg_shape.dimensions_size() (we currently
@ -1114,7 +1117,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
if (dimensions[i] != i) { if (dimensions[i] != i) {
return InvalidArgument( return InvalidArgument(
"Map requires monotonically increasing dimension numbers, found: %s ", "Map requires monotonically increasing dimension numbers, found: %s ",
tensorflow::str_util::Join(dimensions, ", ").c_str()); Join(dimensions, ", ").c_str());
} }
} }
@ -1914,21 +1917,28 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
const Shape& arg, tensorflow::gtl::ArraySlice<int64> starts, const Shape& arg, tensorflow::gtl::ArraySlice<int64> starts,
tensorflow::gtl::ArraySlice<int64> limits, tensorflow::gtl::ArraySlice<int64> limits,
tensorflow::gtl::ArraySlice<int64> strides) { tensorflow::gtl::ArraySlice<int64> strides) {
auto error = [&](const string& message) {
return InvalidArgument(
"%s in slice operation; argument shape: %s; starts: {%s}; limits: "
"{%s}; strides: {%s}",
message.c_str(), ShapeUtil::HumanString(arg).c_str(),
Join(starts, ",").c_str(), Join(limits, ",").c_str(),
Join(strides, ",").c_str());
};
TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(arg, "operand of slice")); TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(arg, "operand of slice"));
VLOG(2) << tensorflow::strings::Printf( VLOG(2) << tensorflow::strings::Printf(
"slicing shape %s starts={%s} limits={%s}", "slicing shape %s starts={%s} limits={%s}",
ShapeUtil::HumanString(arg).c_str(), ShapeUtil::HumanString(arg).c_str(), Join(starts, ", ").c_str(),
tensorflow::str_util::Join(starts, ", ").c_str(), Join(limits, ", ").c_str());
tensorflow::str_util::Join(limits, ", ").c_str());
if (starts.size() != limits.size()) { if (starts.size() != limits.size()) {
return InvalidArgument("slice start and limit sizes differ: %zu vs %zu", return error(Printf("slice start and limit sizes differ: %zu vs %zu",
starts.size(), limits.size()); starts.size(), limits.size()));
} }
if (starts.size() != strides.size()) { if (starts.size() != strides.size()) {
return InvalidArgument("slice start and strides sizes differ: %zu vs %zu", return error(Printf("slice start and strides sizes differ: %zu vs %zu",
starts.size(), strides.size()); starts.size(), strides.size()));
} }
if (starts.size() != ShapeUtil::Rank(arg)) { if (starts.size() != ShapeUtil::Rank(arg)) {
@ -1947,20 +1957,20 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
start_index); start_index);
} }
if (limit_index > arg.dimensions(dimension)) { if (limit_index > arg.dimensions(dimension)) {
return InvalidArgument( return error(
"limit index (%lld) must be less than or equal to dimension " Printf("limit index (%lld) must be less than or equal to dimension "
"size (%lld)", "size (%lld)",
limit_index, arg.dimensions(dimension)); limit_index, arg.dimensions(dimension)));
} }
VLOG(2) << tensorflow::strings::Printf("starts[%lld] = %lld", dimension, VLOG(2) << tensorflow::strings::Printf("starts[%lld] = %lld", dimension,
start_index); start_index);
VLOG(2) << tensorflow::strings::Printf("limits[%lld] = %lld", dimension, VLOG(2) << tensorflow::strings::Printf("limits[%lld] = %lld", dimension,
limit_index); limit_index);
if (start_index > limit_index) { if (start_index > limit_index) {
return InvalidArgument( return error(
"limit index (%lld) must be greater or equal to " Printf("limit index (%lld) must be greater or equal to "
"start index (%lld) in slice with positive stride", "start index (%lld) in slice with positive stride",
limit_index, start_index); limit_index, start_index));
} }
if (stride <= 0) { if (stride <= 0) {
return InvalidArgument("stride (%lld) must be positive", stride); return InvalidArgument("stride (%lld) must be positive", stride);
@ -1983,7 +1993,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
"slicing shape %s at dynamic start_indices %s with slice_sizes={%s}", "slicing shape %s at dynamic start_indices %s with slice_sizes={%s}",
ShapeUtil::HumanString(operand_shape).c_str(), ShapeUtil::HumanString(operand_shape).c_str(),
ShapeUtil::HumanString(start_indices_shape).c_str(), ShapeUtil::HumanString(start_indices_shape).c_str(),
tensorflow::str_util::Join(slice_sizes, ", ").c_str()); Join(slice_sizes, ", ").c_str());
if (ShapeUtil::Rank(start_indices_shape) != 1) { if (ShapeUtil::Rank(start_indices_shape) != 1) {
return InvalidArgument( return InvalidArgument(
@ -2280,8 +2290,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
return InvalidArgument( return InvalidArgument(
"Reshape dimensions [%s] are not a permutation of the operand " "Reshape dimensions [%s] are not a permutation of the operand "
"dimensions (operand shape is %s).", "dimensions (operand shape is %s).",
tensorflow::str_util::Join(dimensions, ",").c_str(), Join(dimensions, ",").c_str(), ShapeUtil::HumanString(operand).c_str());
ShapeUtil::HumanString(operand).c_str());
} }
return inferred_shape; return inferred_shape;
@ -2373,8 +2382,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
// The applied function's arity equals the number of arguments. // The applied function's arity equals the number of arguments.
if (arg_shapes.size() != to_apply.parameters_size()) { if (arg_shapes.size() != to_apply.parameters_size()) {
string computation_signature = ShapeUtil::HumanString(to_apply); string computation_signature = ShapeUtil::HumanString(to_apply);
string argument_shapes = tensorflow::str_util::Join( string argument_shapes =
arg_shapes, ", ", [](string* out, const Shape* shape) { Join(arg_shapes, ", ", [](string* out, const Shape* shape) {
tensorflow::strings::StrAppend(out, ShapeUtil::HumanString(*shape)); tensorflow::strings::StrAppend(out, ShapeUtil::HumanString(*shape));
}); });
return InvalidArgument( return InvalidArgument(
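The `error` lambda added to `InferSliceShape` above is the heart of the "Improve error message for bad slices" change: every validation failure is routed through one helper that appends the argument shape, starts, limits, and strides, which is what the new BadSlice test below checks for. A rough standalone sketch of the same pattern, using exceptions instead of XLA's Status type (hypothetical names, not XLA code):

```cpp
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

// Validate a slice request and report the full slice context on any failure.
void ValidateSlice(const std::vector<int>& starts,
                   const std::vector<int>& limits,
                   const std::vector<int>& dims) {
  auto join = [](const std::vector<int>& v) {
    std::ostringstream os;
    for (size_t i = 0; i < v.size(); ++i) os << (i ? "," : "") << v[i];
    return os.str();
  };
  // Build the shared context once; every failure path reports it.
  auto error = [&](const std::string& message) {
    return std::runtime_error(message + " in slice operation; starts: {" +
                              join(starts) + "}; limits: {" + join(limits) +
                              "}");
  };
  if (starts.size() != limits.size())
    throw error("start and limit sizes differ");
  for (size_t d = 0; d < starts.size() && d < dims.size(); ++d) {
    if (limits[d] > dims[d])
      throw error("limit index must be less than or equal to dimension size");
  }
}
```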


@ -1512,5 +1512,20 @@ TEST_F(ShapeInferenceTest, Conditional) {
"must have the same shape")); "must have the same shape"));
} }
TEST_F(ShapeInferenceTest, BadSlice) {
auto arg = ShapeUtil::MakeShape(F32, {4});
StatusOr<Shape> statusor =
ShapeInference::InferSliceShape(arg, {0}, {5}, {1});
ASSERT_FALSE(statusor.ok());
LOG(INFO) << statusor.status();
EXPECT_THAT(statusor.status().error_message(),
HasSubstr("less than or equal to dimension size"))
<< statusor.status();
EXPECT_THAT(statusor.status().error_message(), HasSubstr("argument shape"))
<< statusor.status();
}
} // namespace } // namespace
} // namespace xla } // namespace xla


@ -71,6 +71,32 @@ cc_library(
], ],
) )
cc_library(
name = "kernel_util",
srcs = [
"kernel_util.cc",
],
hdrs = [
"kernel_util.h",
],
deps = [
"//tensorflow/contrib/lite:builtin_op_data",
"//tensorflow/contrib/lite:context",
"//tensorflow/contrib/lite/kernels/internal:round",
],
)
tf_cc_test(
name = "kernel_util_test",
size = "small",
srcs = ["kernel_util_test.cc"],
deps = [
":kernel_util",
"//tensorflow/contrib/lite/testing:util",
"@com_google_googletest//:gtest",
],
)
cc_library( cc_library(
name = "builtin_ops", name = "builtin_ops",
srcs = [ srcs = [
@ -87,7 +113,6 @@ cc_library(
"fully_connected.cc", "fully_connected.cc",
"gather.cc", "gather.cc",
"hashtable_lookup.cc", "hashtable_lookup.cc",
"kernel_util.cc",
"l2norm.cc", "l2norm.cc",
"local_response_norm.cc", "local_response_norm.cc",
"lsh_projection.cc", "lsh_projection.cc",
@ -111,7 +136,6 @@ cc_library(
"unidirectional_sequence_rnn.cc", "unidirectional_sequence_rnn.cc",
], ],
hdrs = [ hdrs = [
"kernel_util.h",
"padding.h", "padding.h",
"register.h", "register.h",
], ],
@ -125,6 +149,7 @@ cc_library(
}), }),
deps = [ deps = [
":activation_functor", ":activation_functor",
":kernel_util",
":op_macros", ":op_macros",
"//tensorflow/contrib/lite:builtin_op_data", "//tensorflow/contrib/lite:builtin_op_data",
"//tensorflow/contrib/lite:framework", "//tensorflow/contrib/lite:framework",


@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
==============================================================================*/ ==============================================================================*/
#include "tensorflow/contrib/lite/kernels/kernel_util.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h"
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
#include <memory>
#include "tensorflow/contrib/lite/kernels/internal/round.h" #include "tensorflow/contrib/lite/kernels/internal/round.h"
namespace tflite { namespace tflite {
@ -84,4 +87,27 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
} }
} }
bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2) {
return TfLiteIntArrayEqual(input1->dims, input2->dims);
}
TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
TfLiteTensor* input1,
TfLiteTensor* input2,
TfLiteIntArray** output_shape) {
int64_t dims1 = NumDimensions(input1);
int64_t dims2 = NumDimensions(input2);
int64_t out_dims = std::max(dims1, dims2);
std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> shape(
TfLiteIntArrayCreate(out_dims), TfLiteIntArrayFree);
for (int i = 0; i < out_dims; ++i) {
int64_t d1 = i >= dims1 ? 1 : SizeOfDimension(input1, dims1 - i - 1);
int64_t d2 = i >= dims2 ? 1 : SizeOfDimension(input2, dims2 - i - 1);
TF_LITE_ENSURE(context, d1 == d2 || d1 == 1 || d2 == 1);
shape->data[out_dims - i - 1] = std::max(d1, d2);
}
*output_shape = shape.release();
return kTfLiteOk;
}
} // namespace tflite } // namespace tflite
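`CalculateShapeForBroadcast` implements NumPy-style broadcasting: the two shapes are aligned from their trailing dimensions, each aligned pair must be equal or contain a 1, and the output dimension is the larger of the two. A standalone sketch of that rule on plain `std::vector<int>` shapes (illustrative only, not part of the TFLite API):

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Compute the broadcast shape of two shapes, e.g. {1, 2} and {3, 1, 1} -> {3, 1, 2}.
std::vector<int> BroadcastShape(const std::vector<int>& a,
                                const std::vector<int>& b) {
  const int out_dims = static_cast<int>(std::max(a.size(), b.size()));
  std::vector<int> out(out_dims);
  for (int i = 0; i < out_dims; ++i) {
    // Missing leading dimensions are treated as 1.
    const int d1 = i < static_cast<int>(a.size()) ? a[a.size() - 1 - i] : 1;
    const int d2 = i < static_cast<int>(b.size()) ? b[b.size() - 1 - i] : 1;
    assert(d1 == d2 || d1 == 1 || d2 == 1);  // otherwise the shapes are incompatible
    out[out_dims - 1 - i] = std::max(d1, d2);
  }
  return out;
}
```

The real implementation additionally wraps the freshly created TfLiteIntArray in a std::unique_ptr with TfLiteIntArrayFree as its deleter, so the array is released if TF_LITE_ENSURE rejects an incompatible dimension partway through; the {1, 2} vs {3, 1, 1} example above matches the BroadcastShapeDifferentSizes test later in this commit.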


@ -35,6 +35,14 @@ inline TfLiteTensor* GetOutput(TfLiteContext* context, TfLiteNode* node,
inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; } inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; }
inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; } inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; }
inline int64_t NumElements(const TfLiteTensor* t) {
int64_t count = 1;
for (int i = 0; i < NumDimensions(t); ++i) {
count *= SizeOfDimension(t, i);
}
return count;
}
inline TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context, inline TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context,
const TfLiteNode* node, int index) { const TfLiteNode* node, int index) {
const bool use_tensor = node->inputs->data[index] != kOptionalTensor; const bool use_tensor = node->inputs->data[index] != kOptionalTensor;
@ -76,6 +84,15 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
float* activation_min, float* activation_min,
float* activation_max); float* activation_max);
// Return true if the given tensors have the same shape.
bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2);
// Calculate the output_shape that is necessary for element-wise operations
// with broadcasting involving the two input tensors.
TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
TfLiteTensor* input1,
TfLiteTensor* input2,
TfLiteIntArray** output_shape);
} // namespace tflite } // namespace tflite
#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_ #endif // TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_


@ -0,0 +1,150 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/lite/kernels/kernel_util.h"
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/contrib/lite/testing/util.h"
namespace tflite {
namespace {
void ReportError(TfLiteContext* context, const char* format, ...) {}
class KernelUtilTest : public ::testing::Test {
public:
KernelUtilTest() {
context_.ReportError = ReportError;
tensor1_.dims = nullptr;
tensor2_.dims = nullptr;
}
~KernelUtilTest() {
TfLiteTensorFree(&tensor1_);
TfLiteTensorFree(&tensor2_);
}
void SetShape(TfLiteTensor* tensor, std::initializer_list<int> dims) {
TfLiteTensorFree(tensor);
tensor->dims = TfLiteIntArrayCreate(dims.size());
int i = 0;
for (int d : dims) {
tensor->dims->data[i] = d;
++i;
}
}
std::vector<int> GetShape(TfLiteIntArray* dims) {
std::vector<int> result;
for (int i = 0; i < dims->size; ++i) {
result.push_back(dims->data[i]);
}
return result;
}
protected:
TfLiteContext context_;
TfLiteTensor tensor1_;
TfLiteTensor tensor2_;
};
TEST_F(KernelUtilTest, SameShapeEmpty) {
EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor1_, {1, 2, 3});
EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor2_, {1, 2});
EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor2_, {1, 2, 3, 4});
EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor2_, {1, 2, 3});
EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor2_, {});
EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor1_, {});
EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
}
TEST_F(KernelUtilTest, BroadcastShapeIncompatibleDim) {
TfLiteIntArray* output = nullptr;
SetShape(&tensor1_, {1, 2});
SetShape(&tensor2_, {1, 3});
EXPECT_NE(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
EXPECT_EQ(output, nullptr);
}
TEST_F(KernelUtilTest, BroadcastShapeOnes) {
TfLiteIntArray* output = nullptr;
SetShape(&tensor1_, {1, 1});
SetShape(&tensor2_, {1, 3});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
TfLiteIntArrayFree(output);
SetShape(&tensor1_, {1, 2});
SetShape(&tensor2_, {1, 1});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
TfLiteIntArrayFree(output);
}
TEST_F(KernelUtilTest, BroadcastShapeScalars) {
TfLiteIntArray* output = nullptr;
SetShape(&tensor1_, {1, 2});
SetShape(&tensor2_, {});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2));
TfLiteIntArrayFree(output);
SetShape(&tensor1_, {});
SetShape(&tensor2_, {2});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
EXPECT_THAT(GetShape(output), ::testing::ElementsAre(2));
TfLiteIntArrayFree(output);
}
TEST_F(KernelUtilTest, BroadcastShapeDifferentSizes) {
TfLiteIntArray* output = nullptr;
SetShape(&tensor1_, {1, 2});
SetShape(&tensor2_, {3, 1, 1});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
EXPECT_THAT(GetShape(output), ::testing::ElementsAre(3, 1, 2));
TfLiteIntArrayFree(output);
SetShape(&tensor1_, {1, 2, 3, 4});
SetShape(&tensor2_, {1, 3, 1});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2, 3, 4));
TfLiteIntArrayFree(output);
}
} // namespace
} // namespace tflite
int main(int argc, char** argv) {
::tflite::LogToStderr();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}


@ -44,9 +44,11 @@ bool ParseTocoFlagsFromCommandLineFlags(
"For Protobuf formats, the binary format will be used."), "For Protobuf formats, the binary format will be used."),
Flag("input_format", parsed_flags.input_format.bind(), Flag("input_format", parsed_flags.input_format.bind(),
parsed_flags.input_format.default_value(), parsed_flags.input_format.default_value(),
"Input file format. One of: tensorflow_graphdef, "), "Input file format. One of: TENSORFLOW_GRAPHDEF, TFLITE."),
Flag("output_format", parsed_flags.output_format.bind(), Flag("output_format", parsed_flags.output_format.bind(),
parsed_flags.output_format.default_value(), "Output file format."), parsed_flags.output_format.default_value(),
"Output file format. "
"One of TENSORFLOW_GRAPHDEF, TFLITE, GRAPHVIZ_DOT."),
Flag("default_ranges_min", parsed_flags.default_ranges_min.bind(), Flag("default_ranges_min", parsed_flags.default_ranges_min.bind(),
parsed_flags.default_ranges_min.default_value(), parsed_flags.default_ranges_min.default_value(),
"If defined, will be used as the default value for the min bound " "If defined, will be used as the default value for the min bound "
@ -58,11 +60,13 @@ bool ParseTocoFlagsFromCommandLineFlags(
Flag("inference_type", parsed_flags.inference_type.bind(), Flag("inference_type", parsed_flags.inference_type.bind(),
parsed_flags.inference_type.default_value(), parsed_flags.inference_type.default_value(),
"Target data type of arrays in the output file (for input_arrays, " "Target data type of arrays in the output file (for input_arrays, "
"this may be overridden by inference_input_type)."), "this may be overridden by inference_input_type). "
"One of FLOAT, QUANTIZED_UINT8."),
Flag("inference_input_type", parsed_flags.inference_input_type.bind(), Flag("inference_input_type", parsed_flags.inference_input_type.bind(),
parsed_flags.inference_input_type.default_value(), parsed_flags.inference_input_type.default_value(),
"Target data type of input arrays. If not specified, inference_type " "Target data type of input arrays. "
"is used."), "If not specified, inference_type is used. "
"One of FLOAT, QUANTIZED_UINT8."),
Flag("input_type", parsed_flags.input_type.bind(), Flag("input_type", parsed_flags.input_type.bind(),
parsed_flags.input_type.default_value(), parsed_flags.input_type.default_value(),
"Deprecated ambiguous flag that set both --input_data_types and " "Deprecated ambiguous flag that set both --input_data_types and "
@ -76,35 +80,31 @@ bool ParseTocoFlagsFromCommandLineFlags(
Flag("drop_fake_quant", parsed_flags.drop_fake_quant.bind(), Flag("drop_fake_quant", parsed_flags.drop_fake_quant.bind(),
parsed_flags.drop_fake_quant.default_value(), parsed_flags.drop_fake_quant.default_value(),
"Ignore and discard FakeQuant nodes. For instance, that can be used " "Ignore and discard FakeQuant nodes. For instance, to "
"to "
"generate plain float code without fake-quantization from a " "generate plain float code without fake-quantization from a "
"quantized " "quantized graph."),
"graph."),
Flag( Flag(
"reorder_across_fake_quant", "reorder_across_fake_quant",
parsed_flags.reorder_across_fake_quant.bind(), parsed_flags.reorder_across_fake_quant.bind(),
parsed_flags.reorder_across_fake_quant.default_value(), parsed_flags.reorder_across_fake_quant.default_value(),
"Normally, FakeQuant nodes must be strict boundaries for graph " "Normally, FakeQuant nodes must be strict boundaries for graph "
"transformations, in order to ensure that quantized inference has " "transformations, in order to ensure that quantized inference has "
"the " "the exact same arithmetic behavior as quantized training --- which "
"exact same arithmetic behavior as quantized training --- which is " "is the whole point of quantized training and of FakeQuant nodes in "
"the " "the first place. "
"whole point of quantized training and of FakeQuant nodes in the " "However, that entails subtle requirements on where exactly "
"first "
"place. However, that entails subtle requirements on where exactly "
"FakeQuant nodes must be placed in the graph. Some quantized graphs " "FakeQuant nodes must be placed in the graph. Some quantized graphs "
"have FakeQuant nodes at unexpected locations, that prevent graph " "have FakeQuant nodes at unexpected locations, that prevent graph "
"transformations that are necessary in order to generate inference " "transformations that are necessary in order to generate inference "
"code for these graphs. Such graphs should be fixed, but as a " "code for these graphs. Such graphs should be fixed, but as a "
"temporary work-around, setting this reorder_across_fake_quant flag " "temporary work-around, setting this reorder_across_fake_quant flag "
"allows toco to perform necessary graph transformaitons on them, " "allows TOCO to perform necessary graph transformaitons on them, "
"at the cost of no longer faithfully matching inference and training " "at the cost of no longer faithfully matching inference and training "
"arithmetic."), "arithmetic."),
Flag("allow_custom_ops", parsed_flags.allow_custom_ops.bind(), Flag("allow_custom_ops", parsed_flags.allow_custom_ops.bind(),
parsed_flags.allow_custom_ops.default_value(), parsed_flags.allow_custom_ops.default_value(),
"If true, allow TOCO to create TF Lite Custom operators for all the " "If true, allow TOCO to create TF Lite Custom operators for all the "
"unsupported Tensorflow ops."), "unsupported TensorFlow ops."),
Flag( Flag(
"drop_control_dependency", "drop_control_dependency",
parsed_flags.drop_control_dependency.bind(), parsed_flags.drop_control_dependency.bind(),


@ -57,6 +57,7 @@ py_library(
py_test( py_test(
name = "api_test", name = "api_test",
srcs = ["api_test.py"], srcs = ["api_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":py2tf_internal", ":py2tf_internal",
"//tensorflow/python:client_testlib", "//tensorflow/python:client_testlib",
@ -66,6 +67,7 @@ py_test(
py_test( py_test(
name = "conversion_test", name = "conversion_test",
srcs = ["conversion_test.py"], srcs = ["conversion_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":py2tf_internal", ":py2tf_internal",
"//tensorflow/python:client_testlib", "//tensorflow/python:client_testlib",
@ -76,6 +78,7 @@ py_test(
py_test( py_test(
name = "naming_test", name = "naming_test",
srcs = ["naming_test.py"], srcs = ["naming_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":py2tf_internal", ":py2tf_internal",
"//tensorflow/python:client_testlib", "//tensorflow/python:client_testlib",


@ -52,6 +52,7 @@ py_library(
py_test( py_test(
name = "break_canonicalization_test", name = "break_canonicalization_test",
srcs = ["break_canonicalization_test.py"], srcs = ["break_canonicalization_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":test_lib", ":test_lib",
"//tensorflow/contrib/py2tf/pyct", "//tensorflow/contrib/py2tf/pyct",
@ -62,6 +63,7 @@ py_test(
py_test( py_test(
name = "call_trees_test", name = "call_trees_test",
srcs = ["call_trees_test.py"], srcs = ["call_trees_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":test_lib", ":test_lib",
"//tensorflow/contrib/py2tf/pyct", "//tensorflow/contrib/py2tf/pyct",
@ -72,6 +74,7 @@ py_test(
py_test( py_test(
name = "continue_canonicalization_test", name = "continue_canonicalization_test",
srcs = ["continue_canonicalization_test.py"], srcs = ["continue_canonicalization_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":test_lib", ":test_lib",
"//tensorflow/contrib/py2tf/pyct", "//tensorflow/contrib/py2tf/pyct",
@ -82,6 +85,7 @@ py_test(
py_test( py_test(
name = "control_flow_test", name = "control_flow_test",
srcs = ["control_flow_test.py"], srcs = ["control_flow_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":test_lib", ":test_lib",
"//tensorflow/contrib/py2tf/pyct", "//tensorflow/contrib/py2tf/pyct",
@ -92,6 +96,7 @@ py_test(
py_test( py_test(
name = "builtin_functions_test", name = "builtin_functions_test",
srcs = ["builtin_functions_test.py"], srcs = ["builtin_functions_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":test_lib", ":test_lib",
"//tensorflow/contrib/py2tf/pyct", "//tensorflow/contrib/py2tf/pyct",
@ -112,6 +117,7 @@ py_test(
py_test( py_test(
name = "logical_expressions_test", name = "logical_expressions_test",
srcs = ["logical_expressions_test.py"], srcs = ["logical_expressions_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":test_lib", ":test_lib",
"//tensorflow/contrib/py2tf/pyct", "//tensorflow/contrib/py2tf/pyct",
@ -122,6 +128,7 @@ py_test(
py_test( py_test(
name = "print_functions_test", name = "print_functions_test",
srcs = ["print_functions_test.py"], srcs = ["print_functions_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":test_lib", ":test_lib",
"//tensorflow/contrib/py2tf/pyct", "//tensorflow/contrib/py2tf/pyct",
@ -133,6 +140,7 @@ py_test(
py_test( py_test(
name = "side_effect_guards_test", name = "side_effect_guards_test",
srcs = ["side_effect_guards_test.py"], srcs = ["side_effect_guards_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":test_lib", ":test_lib",
"//tensorflow/contrib/py2tf/pyct", "//tensorflow/contrib/py2tf/pyct",


@ -38,6 +38,7 @@ py_library(
py_test( py_test(
name = "anno_test", name = "anno_test",
srcs = ["anno_test.py"], srcs = ["anno_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":pyct", ":pyct",
"//tensorflow/python:client_testlib", "//tensorflow/python:client_testlib",
@ -47,6 +48,7 @@ py_test(
py_test( py_test(
name = "compiler_test", name = "compiler_test",
srcs = ["compiler_test.py"], srcs = ["compiler_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":pyct", ":pyct",
"//tensorflow/python:client_testlib", "//tensorflow/python:client_testlib",
@ -57,6 +59,7 @@ py_test(
py_test( py_test(
name = "parser_test", name = "parser_test",
srcs = ["parser_test.py"], srcs = ["parser_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":pyct", ":pyct",
"//tensorflow/python:client_testlib", "//tensorflow/python:client_testlib",
@ -66,6 +69,7 @@ py_test(
py_test( py_test(
name = "pretty_printer_test", name = "pretty_printer_test",
srcs = ["pretty_printer_test.py"], srcs = ["pretty_printer_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":pyct", ":pyct",
"//tensorflow/python:client_testlib", "//tensorflow/python:client_testlib",
@ -75,6 +79,7 @@ py_test(
py_test( py_test(
name = "templates_test", name = "templates_test",
srcs = ["templates_test.py"], srcs = ["templates_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":pyct", ":pyct",
"//tensorflow/python:client_testlib", "//tensorflow/python:client_testlib",


@ -32,6 +32,7 @@ py_library(
py_test( py_test(
name = "access_test", name = "access_test",
srcs = ["access_test.py"], srcs = ["access_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":static_analysis", ":static_analysis",
"//tensorflow/contrib/py2tf/pyct", "//tensorflow/contrib/py2tf/pyct",
@ -43,6 +44,7 @@ py_test(
py_test( py_test(
name = "live_values_test", name = "live_values_test",
srcs = ["live_values_test.py"], srcs = ["live_values_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":static_analysis", ":static_analysis",
"//tensorflow/contrib/py2tf/pyct", "//tensorflow/contrib/py2tf/pyct",
@ -53,6 +55,7 @@ py_test(
py_test( py_test(
name = "type_info_test", name = "type_info_test",
srcs = ["type_info_test.py"], srcs = ["type_info_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":static_analysis", ":static_analysis",
"//tensorflow/contrib/py2tf/pyct", "//tensorflow/contrib/py2tf/pyct",


@ -230,8 +230,24 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
// TODO(tucker): actually maintain separate CPUAllocators for // TODO(tucker): actually maintain separate CPUAllocators for
// different numa_nodes. For now, just one. // different numa_nodes. For now, just one.
numa_node = 0; numa_node = 0;
mutex_lock lock(mu_);
{
// Here we optimize the most common use case where cuda_host_allocators_
// and cuda_al_ have already been populated and since we're only reading
// these vectors, we can get by with a shared lock. In the slower case,
// we take a unique lock and populate these vectors.
tf_shared_lock lock(mu_);
if (FLAGS_brain_gpu_record_mem_types &&
static_cast<int>(cuda_al_.size()) > 0) {
return cuda_al_[0];
}
if (static_cast<int>(cuda_host_allocators_.size()) > numa_node) {
return cuda_host_allocators_[0];
}
}
mutex_lock lock(mu_);
// Find the first valid StreamExecutor to request CUDA host memory // Find the first valid StreamExecutor to request CUDA host memory
// through, since any will work. // through, since any will work.
// //
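This hunk is the hotspot the "shared locks instead of unique locks" commit refers to: once `cuda_host_allocators_` has been populated, `GetCUDAHostAllocator` only reads the cached vectors, so a shared (reader) lock suffices, and only the first call falls through to the exclusive lock. A minimal standalone sketch of the same double-checked fast-path/slow-path pattern, using C++17 `std::shared_mutex` in place of TensorFlow's `tf_shared_lock`/`mutex_lock` wrappers (illustrative only):

```cpp
#include <mutex>
#include <shared_mutex>
#include <unordered_map>

class AllocatorCache {
 public:
  int* GetOrCreate(int numa_node) {
    {
      // Fast path: most calls find an existing entry and only need a shared lock.
      std::shared_lock<std::shared_mutex> lock(mu_);
      auto it = allocators_.find(numa_node);
      if (it != allocators_.end()) return it->second;
    }
    // Slow path: take the exclusive lock, re-check, then populate the cache.
    std::unique_lock<std::shared_mutex> lock(mu_);
    int*& slot = allocators_[numa_node];
    if (slot == nullptr) slot = new int(numa_node);  // stand-in for a real allocator
    return slot;
  }

 private:
  std::shared_mutex mu_;
  std::unordered_map<int, int*> allocators_;
};
```

The re-check under the exclusive lock matters: two threads can both miss during the shared-lock pass, and only one of them should construct the entry.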


@ -23,8 +23,7 @@ Cluster::Cluster(int timeout_s) : timeout_s_(timeout_s) {
DisableDetailedStats(false); DisableDetailedStats(false);
} }
Cluster::~Cluster() { Cluster::~Cluster() {}
}
void Cluster::AllowSoftPlacement(bool soft_placement_state) { void Cluster::AllowSoftPlacement(bool soft_placement_state) {
options_.config.set_allow_soft_placement(soft_placement_state); options_.config.set_allow_soft_placement(soft_placement_state);


@ -16,8 +16,8 @@ limitations under the License.
#ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_ #ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
#define TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_ #define TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
#include "tensorflow/core/framework/variable.pb.h" #include "tensorflow/core/framework/variable.pb.h"
#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status.h"
namespace tensorflow { namespace tensorflow {


@ -40,8 +40,8 @@ typedef Eigen::SyclDevice SYCLDevice;
template <typename Device, typename T> template <typename Device, typename T>
class AdjustContrastOp : public OpKernel { class AdjustContrastOp : public OpKernel {
public: public:
explicit AdjustContrastOp(OpKernelConstruction* context) : OpKernel(context) { explicit AdjustContrastOp(OpKernelConstruction* context)
} : OpKernel(context) {}
void Compute(OpKernelContext* context) override { void Compute(OpKernelContext* context) override {
const Tensor& input = context->input(0); const Tensor& input = context->input(0);


@ -29,8 +29,7 @@ limitations under the License.
namespace tensorflow { namespace tensorflow {
class AdjustContrastOpTest : public OpsTestBase { class AdjustContrastOpTest : public OpsTestBase {};
};
TEST_F(AdjustContrastOpTest, Simple_1113) { TEST_F(AdjustContrastOpTest, Simple_1113) {
TF_EXPECT_OK(NodeDefBuilder("adjust_contrast_op", "AdjustContrastv2") TF_EXPECT_OK(NodeDefBuilder("adjust_contrast_op", "AdjustContrastv2")


@ -192,7 +192,8 @@ class AdjustSaturationOp<CPUDevice> : public AdjustSaturationOpBase {
const DeviceBase::CpuWorkerThreads& worker_threads = const DeviceBase::CpuWorkerThreads& worker_threads =
*context->device()->tensorflow_cpu_worker_threads(); *context->device()->tensorflow_cpu_worker_threads();
Shard(worker_threads.num_threads, worker_threads.workers, channel_count, Shard(worker_threads.num_threads, worker_threads.workers, channel_count,
kCostPerChannel, [channel_count, &input_data, &output_data, scale_h]( kCostPerChannel,
[channel_count, &input_data, &output_data, scale_h](
int64 start_channel, int64 end_channel) { int64 start_channel, int64 end_channel) {
const float* p = input_data.data() + start_channel * kChannelSize; const float* p = input_data.data() + start_channel * kChannelSize;
float* q = output_data.data() + start_channel * kChannelSize; float* q = output_data.data() + start_channel * kChannelSize;


@ -52,7 +52,8 @@ class ExtractGlimpseOp : public OpKernel {
const int64 batch_size = input_shape.dim_size(0); const int64 batch_size = input_shape.dim_size(0);
const Tensor& window_size = context->input(1); const Tensor& window_size = context->input(1);
OP_REQUIRES(context, (window_size.shape().dims() == 1) && OP_REQUIRES(context,
(window_size.shape().dims() == 1) &&
window_size.shape().dim_size(0) == 2, window_size.shape().dim_size(0) == 2,
errors::InvalidArgument( errors::InvalidArgument(
"input must be a vector of size 2 (height, width)", "input must be a vector of size 2 (height, width)",


@ -48,9 +48,8 @@ struct SpatialAvgPooling {
typedef Eigen::GpuDevice GPUDevice; typedef Eigen::GpuDevice GPUDevice;
// Launch a custom GPU kernels from Yanqing for the avgpooling backward operation // Launch a custom GPU kernels from Yanqing for the avgpooling backward
// that works NHWC data formats. // operation that works NHWC data formats. Arguments:
// Arguments:
// top_diff: backprop to the output of the pooling layer // top_diff: backprop to the output of the pooling layer
// num: number of input batches // num: number of input batches
// height: input height // height: input height


@ -71,8 +71,8 @@ __global__ void AvePoolBackwardNHWC(const int nthreads,
hstart = max(hstart, 0); hstart = max(hstart, 0);
wstart = max(wstart, 0); wstart = max(wstart, 0);
int pool_size = (hend - hstart) * (wend - wstart); int pool_size = (hend - hstart) * (wend - wstart);
gradient += gradient += top_diff_slice[(ph * pooled_width + pw) * channels] /
top_diff_slice[(ph * pooled_width + pw) * channels] / dtype(pool_size); dtype(pool_size);
} }
} }
bottom_diff[index] = gradient; bottom_diff[index] = gradient;
@ -90,8 +90,8 @@ bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num,
const GPUDevice& d) { const GPUDevice& d) {
int x_size = num * height * width * channels; int x_size = num * height * width * channels;
CudaLaunchConfig config = GetCudaLaunchConfig(x_size, d); CudaLaunchConfig config = GetCudaLaunchConfig(x_size, d);
AvePoolBackwardNHWC< AvePoolBackwardNHWC<T>
T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>( <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
config.virtual_thread_count, top_diff, num, height, width, channels, config.virtual_thread_count, top_diff, num, height, width, channels,
pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w, pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w,
pad_t, pad_t, bottom_diff); pad_t, pad_t, bottom_diff);


@ -111,13 +111,14 @@ class Barrier : public ResourceBase {
mutex_lock lock(mu_); mutex_lock lock(mu_);
if (closed_) { if (closed_) {
OP_REQUIRES_ASYNC( OP_REQUIRES_ASYNC(
ctx, !cancel_pending_enqueues_ && ctx,
!cancel_pending_enqueues_ &&
(num_inserted == 0 || !incomplete_.empty()), (num_inserted == 0 || !incomplete_.empty()),
errors::Cancelled( errors::Cancelled(
"Barrier ", name_, " is closed. Pending enqueues cancelled: ", "Barrier ", name_, " is closed. Pending enqueues cancelled: ",
cancel_pending_enqueues_, ". Number of new insertions: ", cancel_pending_enqueues_,
num_inserted, ". Number of incomplete keys: ", ". Number of new insertions: ", num_inserted,
incomplete_.size(), "."), ". Number of incomplete keys: ", incomplete_.size(), "."),
callback); callback);
} }
@ -128,7 +129,8 @@ class Barrier : public ResourceBase {
for (int i = 0; i < num_inserted; ++i) { for (int i = 0; i < num_inserted; ++i) {
OP_REQUIRES_OK_ASYNC( OP_REQUIRES_OK_ASYNC(
ctx, InsertOneLocked<T>(ctx, keys, values, element_shape, ctx,
InsertOneLocked<T>(ctx, keys, values, element_shape,
component_index, i, &ready_tuples, component_index, i, &ready_tuples,
&new_elements), &new_elements),
callback); callback);
@ -317,8 +319,9 @@ class Barrier : public ResourceBase {
return errors::Cancelled( return errors::Cancelled(
"Barrier ", name_, "Barrier ", name_,
" is closed, but attempted to insert a brand new key: ", " is closed, but attempted to insert a brand new key: ",
keys_vec(i), ". Pending enqueues cancelled: ", keys_vec(i),
cancel_pending_enqueues_, ". Insertion index: ", i, ". Pending enqueues cancelled: ", cancel_pending_enqueues_,
". Insertion index: ", i,
". Number of incomplete keys: ", incomplete_.size(), "."); ". Number of incomplete keys: ", incomplete_.size(), ".");
} }
} else { } else {
@ -532,11 +535,12 @@ class InsertManyOp : public BarrierOpKernel {
OP_REQUIRES_ASYNC( OP_REQUIRES_ASYNC(
ctx, component_index_ < barrier->num_components(), ctx, component_index_ < barrier->num_components(),
errors::InvalidArgument("The component ID is out of range ", errors::InvalidArgument("The component ID is out of range ",
component_index_, " > num_components", " (= ", component_index_, " > num_components",
barrier->num_components(), ")"), " (= ", barrier->num_components(), ")"),
callback); callback);
OP_REQUIRES_OK_ASYNC( OP_REQUIRES_OK_ASYNC(
ctx, ctx->MatchSignature({DT_STRING_REF, DT_STRING, ctx,
ctx->MatchSignature({DT_STRING_REF, DT_STRING,
barrier->component_type(component_index_)}, barrier->component_type(component_index_)},
{}), {}),
callback); callback);


@ -13,22 +13,20 @@ See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
==============================================================================*/ ==============================================================================*/
#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/framework/resource_mgr.h"
#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_util.h" #include "tensorflow/core/framework/tensor_util.h"
#include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
#include "tensorflow/core/kernels/batching_util/periodic_function.h" #include "tensorflow/core/kernels/batching_util/periodic_function.h"
#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
#include "tensorflow/core/kernels/concat_lib.h" #include "tensorflow/core/kernels/concat_lib.h"
#include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/kernels/split_lib.h" #include "tensorflow/core/kernels/split_lib.h"
#include "tensorflow/core/lib/random/random.h" #include "tensorflow/core/lib/random/random.h"
#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/macros.h"
namespace tensorflow { namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::ThreadPoolDevice CPUDevice;


@ -429,11 +429,10 @@ template <typename Scalar>
struct LaunchBatchMatMul<SYCLDevice, Scalar> { struct LaunchBatchMatMul<SYCLDevice, Scalar> {
static void Launch(OpKernelContext* context, const Tensor& in_x, static void Launch(OpKernelContext* context, const Tensor& in_x,
const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) { const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
// Number of matrix multiplies i.e. size of the batch. // Number of matrix multiplies i.e. size of the batch.
const int64 batch_size = in_x.dim_size(0); const int64 batch_size = in_x.dim_size(0);
ParallelMatMulKernelSYCL<Scalar>::Run(context, in_x, in_y, adj_x, adj_y, out, ParallelMatMulKernelSYCL<Scalar>::Run(context, in_x, in_y, adj_x, adj_y,
0, batch_size); out, 0, batch_size);
} }
}; };
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
@ -462,9 +461,9 @@ class BatchMatMul : public OpKernel {
TensorShape out_shape; TensorShape out_shape;
for (int i = 0; i < ndims - 2; ++i) { for (int i = 0; i < ndims - 2; ++i) {
OP_REQUIRES(ctx, in0.dim_size(i) == in1.dim_size(i), OP_REQUIRES(ctx, in0.dim_size(i) == in1.dim_size(i),
errors::InvalidArgument("In[0].dim(", i, ") and In[1].dim(", errors::InvalidArgument(
i, ") must be the same: ", "In[0].dim(", i, ") and In[1].dim(", i,
in0.shape().DebugString(), " vs ", ") must be the same: ", in0.shape().DebugString(), " vs ",
in1.shape().DebugString())); in1.shape().DebugString()));
out_shape.AddDim(in0.dim_size(i)); out_shape.AddDim(in0.dim_size(i));
} }


@ -54,7 +54,8 @@ static Graph* BatchMatmul(int b, int m, int k, int n, bool adjoint_a,
// BM_BatchMatmulDev(B, M, K, N, TA, TB, std::complex<float>, DT_COMPLEX64, // BM_BatchMatmulDev(B, M, K, N, TA, TB, std::complex<float>, DT_COMPLEX64,
// gpu); // gpu);
// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, cpu); \ // BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, cpu); \
// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, cpu); \ // BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, cpu);
// \
// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, gpu); \ // BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, gpu); \
// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, gpu); // BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, gpu);


@ -56,7 +56,8 @@ static void BatchToSpaceOpCompute(OpKernelContext* context,
errors::InvalidArgument("input rank should be >= ", 1 + block_dims, errors::InvalidArgument("input rank should be >= ", 1 + block_dims,
" instead of ", orig_input_tensor.dims())); " instead of ", orig_input_tensor.dims()));
OP_REQUIRES(context, TensorShapeUtils::IsMatrix(orig_crops.shape()) && OP_REQUIRES(context,
TensorShapeUtils::IsMatrix(orig_crops.shape()) &&
block_dims == orig_crops.dim_size(0) && block_dims == orig_crops.dim_size(0) &&
2 == orig_crops.dim_size(1), 2 == orig_crops.dim_size(1),
errors::InvalidArgument("crops should have shape [", block_dims, errors::InvalidArgument("crops should have shape [", block_dims,


@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
==============================================================================*/ ==============================================================================*/
#include "tensorflow/core/util/bcast.h"
#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h" #include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/bcast.h"
namespace tensorflow { namespace tensorflow {


@ -77,12 +77,12 @@ void BiasGPU<T>::compute(const GPUDevice& d, const T* input, const T* bias,
} }
CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
if (data_format == FORMAT_NHWC) { if (data_format == FORMAT_NHWC) {
BiasNHWCKernel< BiasNHWCKernel<T>
T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>( <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
config.virtual_thread_count, input, bias, output, bias_size); config.virtual_thread_count, input, bias, output, bias_size);
} else { } else {
BiasNCHWKernel< BiasNCHWKernel<T>
T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>( <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
config.virtual_thread_count, input, bias, output, bias_size, config.virtual_thread_count, input, bias, output, bias_size,
image_size); image_size);
} }
@ -206,8 +206,8 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
// Check if we have enough shared memory. // Check if we have enough shared memory.
if (shared_memory_size <= max_shared_memory_size) { if (shared_memory_size <= max_shared_memory_size) {
if (data_format == FORMAT_NHWC) { if (data_format == FORMAT_NHWC) {
BiasGradNHWC_SharedAtomics< BiasGradNHWC_SharedAtomics<T>
T><<<config.block_count, config.thread_per_block, shared_memory_size, <<<config.block_count, config.thread_per_block, shared_memory_size,
d.stream()>>>(total_count, output_backprop, bias_backprop, d.stream()>>>(total_count, output_backprop, bias_backprop,
bias_size); bias_size);
} else { } else {
@ -217,8 +217,8 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
if (config.thread_per_block < kWarpSize) { if (config.thread_per_block < kWarpSize) {
config.thread_per_block = kWarpSize; config.thread_per_block = kWarpSize;
} }
BiasGradNCHW_SharedAtomics< BiasGradNCHW_SharedAtomics<T>
T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>( <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
output_backprop, bias_backprop, batch, bias_size, image_size, output_backprop, bias_backprop, batch, bias_size, image_size,
group_size); group_size);
} }
@ -227,13 +227,14 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
// output block, it is possible to process one group of elements at a time. // output block, it is possible to process one group of elements at a time.
// But for now, we simply fall back to the naive implementation. // But for now, we simply fall back to the naive implementation.
if (data_format == FORMAT_NHWC) { if (data_format == FORMAT_NHWC) {
BiasGradNHWC_Naive< BiasGradNHWC_Naive<T>
T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>( <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
total_count, output_backprop, bias_backprop, bias_size); total_count, output_backprop, bias_backprop, bias_size);
} else { } else {
BiasGradNCHW_Naive< BiasGradNCHW_Naive<T>
T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>( <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
total_count, output_backprop, bias_backprop, bias_size, image_size); total_count, output_backprop, bias_backprop, bias_size,
image_size);
} }
} }
} }


@ -48,7 +48,7 @@ EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC const T SubtleMustCopy(const T &x) {
auto *to_x = reinterpret_cast<const volatile T *>(&x); auto *to_x = reinterpret_cast<const volatile T *>(&x);
return *to_x; return *to_x;
} }
} // namespace tensorflow::internal } // namespace internal
} // namespace tensorflow } // namespace tensorflow
#endif // TENSORFLOW_UTIL_BOUNDS_CHECK_H_ #endif // TENSORFLOW_UTIL_BOUNDS_CHECK_H_


@ -126,12 +126,12 @@ REGISTER_KERNEL_BUILDER(Name("UniformCandidateSampler").Device(DEVICE_CPU),
REGISTER_KERNEL_BUILDER(Name("LogUniformCandidateSampler").Device(DEVICE_CPU), REGISTER_KERNEL_BUILDER(Name("LogUniformCandidateSampler").Device(DEVICE_CPU),
SimpleCandidateSamplerOp<LogUniformSampler>); SimpleCandidateSamplerOp<LogUniformSampler>);
REGISTER_KERNEL_BUILDER(Name("LearnedUnigramCandidateSampler") REGISTER_KERNEL_BUILDER(
.Device(DEVICE_CPU), Name("LearnedUnigramCandidateSampler").Device(DEVICE_CPU),
SimpleCandidateSamplerOp<UnigramSampler>); SimpleCandidateSamplerOp<UnigramSampler>);
REGISTER_KERNEL_BUILDER(Name("ThreadUnsafeUnigramCandidateSampler") REGISTER_KERNEL_BUILDER(
.Device(DEVICE_CPU), Name("ThreadUnsafeUnigramCandidateSampler").Device(DEVICE_CPU),
SimpleCandidateSamplerOp<ThreadUnsafeUnigramSampler>); SimpleCandidateSamplerOp<ThreadUnsafeUnigramSampler>);
class AllCandidateSamplerOp : public BaseCandidateSamplerOp { class AllCandidateSamplerOp : public BaseCandidateSamplerOp {
@ -197,7 +197,8 @@ class ComputeAccidentalHitsOp : public OpKernel {
void Compute(OpKernelContext* context) override { void Compute(OpKernelContext* context) override {
const Tensor& in_true_candidates = context->input(0); const Tensor& in_true_candidates = context->input(0);
const TensorShape& in_true_candidates_shape = in_true_candidates.shape(); const TensorShape& in_true_candidates_shape = in_true_candidates.shape();
OP_REQUIRES(context, TensorShapeUtils::IsMatrix(in_true_candidates_shape) && OP_REQUIRES(context,
TensorShapeUtils::IsMatrix(in_true_candidates_shape) &&
in_true_candidates_shape.dim_size(1) == num_true_, in_true_candidates_shape.dim_size(1) == num_true_,
errors::InvalidArgument( errors::InvalidArgument(
"true_candidates must be a batch_size * num_true matrix")); "true_candidates must be a batch_size * num_true matrix"));


@ -252,4 +252,3 @@ REGISTER_KERNEL_BUILDER(
CpuCastOp); CpuCastOp);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // end namespace tensorflow } // end namespace tensorflow


@ -131,7 +131,8 @@ struct scalar_cast_op<::tensorflow::bfloat16, float> {
p[0] = a.value; p[0] = a.value;
p[1] = 0; p[1] = 0;
#else #else
static_assert(::tensorflow::port::kLittleEndian, "Not a little endian system!"); static_assert(::tensorflow::port::kLittleEndian,
"Not a little endian system!");
p[0] = 0; p[0] = 0;
p[1] = a.value; p[1] = a.value;
#endif #endif


@ -108,12 +108,12 @@ class HSVToRGBOp : public OpKernel {
}; };
#define REGISTER_CPU(T) \ #define REGISTER_CPU(T) \
REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_CPU) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("RGBToHSV").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
RGBToHSVOp<CPUDevice, T>); \ RGBToHSVOp<CPUDevice, T>); \
template class RGBToHSVOp<CPUDevice, T>; \ template class RGBToHSVOp<CPUDevice, T>; \
REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_CPU) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("HSVToRGB").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
HSVToRGBOp<CPUDevice, T>); \ HSVToRGBOp<CPUDevice, T>); \
template class HSVToRGBOp<CPUDevice, T>; template class HSVToRGBOp<CPUDevice, T>;
TF_CALL_float(REGISTER_CPU); TF_CALL_float(REGISTER_CPU);
@ -125,25 +125,24 @@ TF_CALL_double(REGISTER_CPU);
namespace functor { namespace functor {
#define DECLARE_GPU(T) \ #define DECLARE_GPU(T) \
template <> \ template <> \
void RGBToHSV<GPUDevice, T>::operator()(const GPUDevice& d, \ void RGBToHSV<GPUDevice, T>::operator()( \
TTypes<T, 2>::ConstTensor input_data, \ const GPUDevice& d, TTypes<T, 2>::ConstTensor input_data, \
TTypes<T, 1>::Tensor range, \ TTypes<T, 1>::Tensor range, TTypes<T, 2>::Tensor output_data); \
TTypes<T, 2>::Tensor output_data); \
extern template struct RGBToHSV<GPUDevice, T>; \ extern template struct RGBToHSV<GPUDevice, T>; \
template <> \ template <> \
void HSVToRGB<GPUDevice, T>::operator()(const GPUDevice& d, \ void HSVToRGB<GPUDevice, T>::operator()( \
TTypes<T, 2>::ConstTensor input_data, \ const GPUDevice& d, TTypes<T, 2>::ConstTensor input_data, \
TTypes<T, 2>::Tensor output_data); \ TTypes<T, 2>::Tensor output_data); \
extern template struct HSVToRGB<GPUDevice, T>; extern template struct HSVToRGB<GPUDevice, T>;
TF_CALL_float(DECLARE_GPU); TF_CALL_float(DECLARE_GPU);
TF_CALL_double(DECLARE_GPU); TF_CALL_double(DECLARE_GPU);
} // namespace functor } // namespace functor
#define REGISTER_GPU(T) \ #define REGISTER_GPU(T) \
REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_GPU) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("RGBToHSV").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
RGBToHSVOp<GPUDevice, T>); \ RGBToHSVOp<GPUDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_GPU) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("HSVToRGB").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
HSVToRGBOp<GPUDevice, T>); HSVToRGBOp<GPUDevice, T>);
TF_CALL_float(REGISTER_GPU); TF_CALL_float(REGISTER_GPU);
TF_CALL_double(REGISTER_GPU); TF_CALL_double(REGISTER_GPU);
@ -151,11 +150,11 @@ TF_CALL_double(REGISTER_GPU);
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL(T) \ #define REGISTER_SYCL(T) \
REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_SYCL) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("RGBToHSV").Device(DEVICE_SYCL).TypeConstraint<T>("T"), \
RGBToHSVOp<SYCLDevice, T>); \ RGBToHSVOp<SYCLDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_SYCL) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("HSVToRGB").Device(DEVICE_SYCL).TypeConstraint<T>("T"), \
HSVToRGBOp<SYCLDevice, T>); HSVToRGBOp<SYCLDevice, T>);
TF_CALL_float(REGISTER_SYCL); TF_CALL_float(REGISTER_SYCL);
TF_CALL_double(REGISTER_SYCL); TF_CALL_double(REGISTER_SYCL);


@ -54,9 +54,8 @@ struct RGBToHSV {
// TODO(wicke): all these assignments are only necessary because a combined // TODO(wicke): all these assignments are only necessary because a combined
// expression is larger than kernel parameter space. A custom kernel is // expression is larger than kernel parameter space. A custom kernel is
// probably in order. // probably in order.
H.device(d) = (R == V).select(norm * (G - B), H.device(d) = (R == V).select(
(G == V).select( norm * (G - B), (G == V).select(norm * (B - R) + T(2) / T(6),
norm * (B - R) + T(2) / T(6),
norm * (R - G) + T(4) / T(6))); norm * (R - G) + T(4) / T(6)));
H.device(d) = (range > T(0)).select(H, H.constant(T(0))); H.device(d) = (range > T(0)).select(H, H.constant(T(0)));
H.device(d) = (H < T(0)).select(H + T(1), H); H.device(d) = (H < T(0)).select(H + T(1), H);


@ -17,8 +17,8 @@ limitations under the License.
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "tensorflow/core/kernels/colorspace_op.h"
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/colorspace_op.h"
namespace tensorflow { namespace tensorflow {
@ -29,6 +29,6 @@ typedef Eigen::GpuDevice GPUDevice;
template class functor::HSVToRGB<GPUDevice, T>; template class functor::HSVToRGB<GPUDevice, T>;
TF_CALL_float(INSTANTIATE_GPU); TF_CALL_float(INSTANTIATE_GPU);
TF_CALL_double(INSTANTIATE_GPU); TF_CALL_double(INSTANTIATE_GPU);
} } // namespace tensorflow
#endif // GOOGLE_CUDA #endif // GOOGLE_CUDA

@@ -41,9 +41,10 @@ namespace tensorflow {
// Assumes all inputs are nonempty // Assumes all inputs are nonempty
template <typename T> template <typename T>
void ConcatCPU(DeviceBase* d, void ConcatCPU(
const std::vector< DeviceBase* d,
std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs, const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
inputs,
typename TTypes<T, 2>::Matrix* output); typename TTypes<T, 2>::Matrix* output);
#if GOOGLE_CUDA #if GOOGLE_CUDA
template <typename T> template <typename T>
@@ -57,9 +58,10 @@ void ConcatGPU(
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
template <typename T> template <typename T>
void ConcatSYCL(const Eigen::SyclDevice& d, void ConcatSYCL(
const std::vector< const Eigen::SyclDevice& d,
std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs, const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
inputs,
typename TTypes<T, 2>::Matrix* output); typename TTypes<T, 2>::Matrix* output);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

@@ -48,9 +48,10 @@ struct MemCpyCopier<ResourceHandle> {
} // namespace } // namespace
template <typename T> template <typename T>
void ConcatCPU(DeviceBase* d, void ConcatCPU(
const std::vector< DeviceBase* d,
std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs, const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
inputs,
typename TTypes<T, 2>::Matrix* output) { typename TTypes<T, 2>::Matrix* output) {
if (std::is_same<T, string>::value) { if (std::is_same<T, string>::value) {
// use a large cost here to force strings to be handled by separate threads // use a large cost here to force strings to be handled by separate threads
@@ -86,9 +87,10 @@ TF_CALL_variant(REGISTER)
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
template <typename T> template <typename T>
void ConcatSYCL(const Eigen::SyclDevice& d, void ConcatSYCL(
const std::vector< const Eigen::SyclDevice& d,
std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs, const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
inputs,
typename TTypes<T, 2>::Matrix* output) { typename TTypes<T, 2>::Matrix* output) {
ConcatSYCLImpl<T>(d, inputs, sizeof(T) /* cost_per_unit */, MemCpyCopier<T>(), ConcatSYCLImpl<T>(d, inputs, sizeof(T) /* cost_per_unit */, MemCpyCopier<T>(),
output); output);

@@ -15,9 +15,9 @@ limitations under the License.
#define EIGEN_USE_THREADS #define EIGEN_USE_THREADS
#include "tensorflow/core/kernels/concat_lib.h"
#include <vector> #include <vector>
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/concat_lib.h"
#include "tensorflow/core/util/work_sharder.h" #include "tensorflow/core/util/work_sharder.h"
namespace tensorflow { namespace tensorflow {

@@ -71,7 +71,8 @@ class ConcatBaseOp : public OpKernel {
const TensorShape& input_shape = values[0].shape(); const TensorShape& input_shape = values[0].shape();
int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim; int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim;
OP_REQUIRES(c, (0 <= axis && axis < input_dims) || OP_REQUIRES(c,
(0 <= axis && axis < input_dims) ||
(allow_legacy_scalars() && concat_dim == 0), (allow_legacy_scalars() && concat_dim == 0),
errors::InvalidArgument( errors::InvalidArgument(
"ConcatOp : Expected concatenating dimensions in the range " "ConcatOp : Expected concatenating dimensions in the range "
@@ -97,8 +98,8 @@ class ConcatBaseOp : public OpKernel {
c, in.dims() == input_dims || (input_is_scalar && in_is_scalar), c, in.dims() == input_dims || (input_is_scalar && in_is_scalar),
errors::InvalidArgument( errors::InvalidArgument(
"ConcatOp : Ranks of all input tensors should match: shape[0] = ", "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
input_shape.DebugString(), " vs. shape[", i, "] = ", input_shape.DebugString(), " vs. shape[", i,
in.shape().DebugString())); "] = ", in.shape().DebugString()));
for (int j = 0; j < input_dims; ++j) { for (int j = 0; j < input_dims; ++j) {
if (j == axis) { if (j == axis) {
continue; continue;
@@ -107,8 +108,8 @@ class ConcatBaseOp : public OpKernel {
c, in.dim_size(j) == input_shape.dim_size(j), c, in.dim_size(j) == input_shape.dim_size(j),
errors::InvalidArgument( errors::InvalidArgument(
"ConcatOp : Dimensions of inputs should match: shape[0] = ", "ConcatOp : Dimensions of inputs should match: shape[0] = ",
input_shape.DebugString(), " vs. shape[", i, "] = ", input_shape.DebugString(), " vs. shape[", i,
in.shape().DebugString())); "] = ", in.shape().DebugString()));
} }
if (in.NumElements() > 0) { if (in.NumElements() > 0) {
int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0; int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0;

@@ -157,7 +157,8 @@ BENCHMARK(BM_MemcpyAlternativeDim0)->Arg(1000)->Arg(100000)->Arg(1000000);
BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000); BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000);
typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>, typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>,
Eigen::Unaligned> EigenMap; Eigen::Unaligned>
EigenMap;
static void MemcpyManyAlternative1(int iters, int dim2) { static void MemcpyManyAlternative1(int iters, int dim2) {
testing::StopTiming(); testing::StopTiming();

@@ -99,8 +99,9 @@ class AccumulatorTakeGradientOp
ConditionalAccumulatorBase* accumulator, ConditionalAccumulatorBase* accumulator,
DoneCallback callback) override { DoneCallback callback) override {
// Check signature // Check signature
OP_REQUIRES_OK_ASYNC(ctx, ctx->MatchSignature({DT_STRING_REF, DT_INT32}, OP_REQUIRES_OK_ASYNC(
{accumulator->dtype()}), ctx,
ctx->MatchSignature({DT_STRING_REF, DT_INT32}, {accumulator->dtype()}),
callback); callback);
} }
@@ -111,5 +112,4 @@ class AccumulatorTakeGradientOp
REGISTER_KERNEL_BUILDER(Name("AccumulatorTakeGradient").Device(DEVICE_CPU), REGISTER_KERNEL_BUILDER(Name("AccumulatorTakeGradient").Device(DEVICE_CPU),
AccumulatorTakeGradientOp); AccumulatorTakeGradientOp);
} // namespace tensorflow } // namespace tensorflow

@@ -146,7 +146,6 @@ typedef Eigen::GpuDevice GPUDevice;
typedef Eigen::SyclDevice SYCLDevice; typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
template <typename Device, typename T, typename Index> template <typename Device, typename T, typename Index>
class FillOp : public OpKernel { class FillOp : public OpKernel {
public: public:

@@ -91,6 +91,7 @@ class KilledBySignal {
public: public:
explicit KilledBySignal(int signum) : signum_(signum) {} explicit KilledBySignal(int signum) : signum_(signum) {}
bool operator()(int exit_status) const { return exit_status == signum_; } bool operator()(int exit_status) const { return exit_status == signum_; }
private: private:
const int signum_; const int signum_;
}; };

@@ -679,7 +679,8 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
const int dims = resized_shape.dims(); const int dims = resized_shape.dims();
OP_REQUIRES( OP_REQUIRES(
context, TensorShapeUtils::IsMatrix(paddings.shape()) && context,
TensorShapeUtils::IsMatrix(paddings.shape()) &&
paddings.dim_size(1) == 2, paddings.dim_size(1) == 2,
errors::InvalidArgument("paddings must be a matrix with 2 columns: ", errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
paddings.shape().DebugString())); paddings.shape().DebugString()));
@@ -715,11 +716,12 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
const int32 after = const int32 after =
paddings_matrix(d, 1); // Pad after existing elements. paddings_matrix(d, 1); // Pad after existing elements.
OP_REQUIRES(context, before >= 0 && after >= 0, OP_REQUIRES(context, before >= 0 && after >= 0,
errors::InvalidArgument("paddings must be non-negative: ", errors::InvalidArgument(
before, " ", after)); "paddings must be non-negative: ", before, " ", after));
if (offset_ == 0) { // SYMMETRIC mode. if (offset_ == 0) { // SYMMETRIC mode.
OP_REQUIRES( OP_REQUIRES(
context, before <= resized_shape.dim_size(d) && context,
before <= resized_shape.dim_size(d) &&
after <= resized_shape.dim_size(d), after <= resized_shape.dim_size(d),
errors::InvalidArgument("paddings must be no greater " errors::InvalidArgument("paddings must be no greater "
"than the dimension size: ", "than the dimension size: ",
@@ -727,7 +729,8 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
resized_shape.dim_size(d))); resized_shape.dim_size(d)));
} else if (offset_ == 1) { // REFLECT mode. } else if (offset_ == 1) { // REFLECT mode.
OP_REQUIRES( OP_REQUIRES(
context, before < resized_shape.dim_size(d) && context,
before < resized_shape.dim_size(d) &&
after < resized_shape.dim_size(d), after < resized_shape.dim_size(d),
errors::InvalidArgument("paddings must be less than" errors::InvalidArgument("paddings must be less than"
" the dimension size: ", " the dimension size: ",
@@ -767,18 +770,19 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
// We only check the first three dims, since the depth is accessed as an // We only check the first three dims, since the depth is accessed as an
// int64 below. // int64 below.
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i), OP_REQUIRES(
std::numeric_limits<int>::max()), context,
FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
errors::InvalidArgument("filter too large")); errors::InvalidArgument("filter too large"));
} }
// The last dimension for input is in_depth. It must be the same as the // The last dimension for input is in_depth. It must be the same as the
// filter's in_depth. // filter's in_depth.
const int64 in_depth = padded_shape.dim_size(3); const int64 in_depth = padded_shape.dim_size(3);
OP_REQUIRES( OP_REQUIRES(context, in_depth == filter.dim_size(2),
context, in_depth == filter.dim_size(2), errors::InvalidArgument(
errors::InvalidArgument("input and filter must have the same depth: ", "input and filter must have the same depth: ", in_depth,
in_depth, " vs ", filter.dim_size(2))); " vs ", filter.dim_size(2)));
// The last dimension for filter is out_depth. // The last dimension for filter is out_depth.
const int out_depth = static_cast<int>(filter.dim_size(3)); const int out_depth = static_cast<int>(filter.dim_size(3));
@@ -786,8 +790,9 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
// The second dimension for input is rows/height. // The second dimension for input is rows/height.
// The first dimension for filter is rows/height. // The first dimension for filter is rows/height.
const int64 padded_rows_raw = padded_shape.dim_size(1); const int64 padded_rows_raw = padded_shape.dim_size(1);
OP_REQUIRES(context, FastBoundsCheck(padded_rows_raw, OP_REQUIRES(
std::numeric_limits<int>::max()), context,
FastBoundsCheck(padded_rows_raw, std::numeric_limits<int>::max()),
errors::InvalidArgument("Input rows too large")); errors::InvalidArgument("Input rows too large"));
const int padded_rows = static_cast<int>(padded_rows_raw); const int padded_rows = static_cast<int>(padded_rows_raw);
const int filter_rows = static_cast<int>(filter.dim_size(0)); const int filter_rows = static_cast<int>(filter.dim_size(0));
@@ -796,8 +801,9 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
// The third dimension for input is columns/width. // The third dimension for input is columns/width.
// The second dimension for filter is columns/width. // The second dimension for filter is columns/width.
const int64 padded_cols_raw = padded_shape.dim_size(2); const int64 padded_cols_raw = padded_shape.dim_size(2);
OP_REQUIRES(context, FastBoundsCheck(padded_cols_raw, OP_REQUIRES(
std::numeric_limits<int>::max()), context,
FastBoundsCheck(padded_cols_raw, std::numeric_limits<int>::max()),
errors::InvalidArgument("Input cols too large")); errors::InvalidArgument("Input cols too large"));
const int padded_cols = static_cast<int>(padded_cols_raw); const int padded_cols = static_cast<int>(padded_cols_raw);
const int filter_cols = static_cast<int>(filter.dim_size(1)); const int filter_cols = static_cast<int>(filter.dim_size(1));
@@ -870,7 +876,8 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
.Device(DEVICE_CPU) \ .Device(DEVICE_CPU) \
.TypeConstraint<T>("T"), \ .TypeConstraint<T>("T"), \
FusedResizeConv2DUsingGemmOp< \ FusedResizeConv2DUsingGemmOp< \
T, FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \ T, \
FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
BILINEAR>, \ BILINEAR>, \
true>); true>);
@@ -880,7 +887,8 @@ TF_CALL_float(REGISTER_FUSED);
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
FusedResizeConv2DUsingGemmOp< \ FusedResizeConv2DUsingGemmOp< \
T, FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \ T, \
FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
NEAREST>, \ NEAREST>, \
false>); false>);

@@ -27,7 +27,6 @@ limitations under the License.
namespace tensorflow { namespace tensorflow {
// Get the Cudnn workspace limit from the environment variable, which is in MB. // Get the Cudnn workspace limit from the environment variable, which is in MB.
// Return the workspace memory limit in bytes. If no value is set, return the // Return the workspace memory limit in bytes. If no value is set, return the
// default value. // default value.

@@ -25,9 +25,9 @@ limitations under the License.
#include "cuda/include/cuda.h" #include "cuda/include/cuda.h"
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/conv_2d.h" #include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/lib/math/math_util.h"
#include "tensorflow/core/util/cuda_kernel_helper.h" #include "tensorflow/core/util/cuda_kernel_helper.h"
#include "tensorflow/core/util/tensor_format.h" #include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/lib/math/math_util.h"
namespace tensorflow { namespace tensorflow {
@@ -252,11 +252,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
int x = threadIdx.x; int x = threadIdx.x;
Dimension<3> output_dims = { Dimension<3> output_dims = {
input_dims[0], input_dims[2], input_dims[1], input_dims[0],
input_dims[2],
input_dims[1],
}; };
Dimension<3> input_dims_in_tiles = { Dimension<3> input_dims_in_tiles = {
input_dims[0], (input_dims[1] + TileSizeI - 1) / TileSizeI, input_dims[0],
(input_dims[1] + TileSizeI - 1) / TileSizeI,
(input_dims[2] + TileSizeJ - 1) / TileSizeJ, (input_dims[2] + TileSizeJ - 1) / TileSizeJ,
}; };
@@ -264,7 +267,8 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
FlatToTensorIndex(blockIdx.x, input_dims_in_tiles); FlatToTensorIndex(blockIdx.x, input_dims_in_tiles);
Index<3> input_tile_origin = { Index<3> input_tile_origin = {
input_tile_index[0], input_tile_index[1] * TileSizeI, input_tile_index[0],
input_tile_index[1] * TileSizeI,
input_tile_index[2] * TileSizeJ, input_tile_index[2] * TileSizeJ,
}; };
@@ -322,11 +326,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
__syncthreads(); __syncthreads();
Index<3> output_tile_index = { Index<3> output_tile_index = {
input_tile_index[0], input_tile_index[2], input_tile_index[1], input_tile_index[0],
input_tile_index[2],
input_tile_index[1],
}; };
Index<3> output_tile_origin = { Index<3> output_tile_origin = {
output_tile_index[0], output_tile_index[1] * TileSizeJ, output_tile_index[0],
output_tile_index[1] * TileSizeJ,
output_tile_index[2] * TileSizeI, output_tile_index[2] * TileSizeI,
}; };
@@ -902,19 +909,21 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input,
constexpr int kNumThreads = 256; constexpr int kNumThreads = 256;
Dimension<3> input_dims_in_tiles = { Dimension<3> input_dims_in_tiles = {
input_dims[0], MathUtil::CeilOfRatio<int>(input_dims[1], kTileSize), input_dims[0],
MathUtil::CeilOfRatio<int>(input_dims[1], kTileSize),
MathUtil::CeilOfRatio<int>(input_dims[2], kTileSize), MathUtil::CeilOfRatio<int>(input_dims[2], kTileSize),
}; };
int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] * int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] *
input_dims_in_tiles[2]; input_dims_in_tiles[2];
SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize, kTileSize, conjugate> SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize, kTileSize,
conjugate>
<<<total_tiles_count, kNumThreads, 0, d.stream()>>>(input, input_dims, <<<total_tiles_count, kNumThreads, 0, d.stream()>>>(input, input_dims,
output); output);
} else if (narrow_matrix) { } else if (narrow_matrix) {
SwapDimension1And2InTensor3WithNarrowMatrices<T, conjugate>(d, input, input_dims, output, SwapDimension1And2InTensor3WithNarrowMatrices<T, conjugate>(
kMinDimensionToUseTiles); d, input, input_dims, output, kMinDimensionToUseTiles);
} else { } else {
int total_element_count = input_dims[0] * input_dims[1] * input_dims[2]; int total_element_count = input_dims[0] * input_dims[1] * input_dims[2];
CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d); CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d);
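// --- Illustrative aside (not part of the diff above) ----------------------
// The tiled launch in RunSwapDimension1And2InTensor3 covers each [dim1, dim2]
// plane with ceil(dim1/kTileSize) x ceil(dim2/kTileSize) tiles and launches
// one CUDA block per tile. A minimal sketch of that tile-count arithmetic
// follows; the tile size of 32 is an illustrative value, not taken from this
// change.
#include <cstdint>

constexpr int64_t CeilOfRatio(int64_t a, int64_t b) { return (a + b - 1) / b; }

int64_t TotalTileCount(int64_t batch, int64_t dim1, int64_t dim2,
                       int64_t tile_size = 32) {
  const int64_t tiles_dim1 = CeilOfRatio(dim1, tile_size);
  const int64_t tiles_dim2 = CeilOfRatio(dim2, tile_size);
  return batch * tiles_dim1 * tiles_dim2;  // number of blocks in the grid
}
// ---------------------------------------------------------------------------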

@@ -468,18 +468,19 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
filter.shape().DebugString())); filter.shape().DebugString()));
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i), OP_REQUIRES(
std::numeric_limits<int>::max()), context,
FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
errors::InvalidArgument("filter too large")); errors::InvalidArgument("filter too large"));
} }
// The last dimension for input is in_depth. It must be the same as the // The last dimension for input is in_depth. It must be the same as the
// filter's in_depth. // filter's in_depth.
const int64 in_depth = GetTensorDim(input, data_format_, 'C'); const int64 in_depth = GetTensorDim(input, data_format_, 'C');
OP_REQUIRES( OP_REQUIRES(context, in_depth == filter.dim_size(2),
context, in_depth == filter.dim_size(2), errors::InvalidArgument(
errors::InvalidArgument("input and filter must have the same depth: ", "input and filter must have the same depth: ", in_depth,
in_depth, " vs ", filter.dim_size(2))); " vs ", filter.dim_size(2)));
// The last dimension for filter is out_depth. // The last dimension for filter is out_depth.
const int out_depth = static_cast<int>(filter.dim_size(3)); const int out_depth = static_cast<int>(filter.dim_size(3));
@@ -487,8 +488,9 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
// The second dimension for input is rows/height. // The second dimension for input is rows/height.
// The first dimension for filter is rows/height. // The first dimension for filter is rows/height.
const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H'); const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H');
OP_REQUIRES(context, FastBoundsCheck(input_rows_raw, OP_REQUIRES(
std::numeric_limits<int>::max()), context,
FastBoundsCheck(input_rows_raw, std::numeric_limits<int>::max()),
errors::InvalidArgument("Input rows too large")); errors::InvalidArgument("Input rows too large"));
const int input_rows = static_cast<int>(input_rows_raw); const int input_rows = static_cast<int>(input_rows_raw);
const int filter_rows = static_cast<int>(filter.dim_size(0)); const int filter_rows = static_cast<int>(filter.dim_size(0));
@@ -496,8 +498,9 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
// The third dimension for input is columns/width. // The third dimension for input is columns/width.
// The second dimension for filter is columns/width. // The second dimension for filter is columns/width.
const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W'); const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W');
OP_REQUIRES(context, FastBoundsCheck(input_cols_raw, OP_REQUIRES(
std::numeric_limits<int>::max()), context,
FastBoundsCheck(input_cols_raw, std::numeric_limits<int>::max()),
errors::InvalidArgument("Input cols too large")); errors::InvalidArgument("Input cols too large"));
const int input_cols = static_cast<int>(input_cols_raw); const int input_cols = static_cast<int>(input_cols_raw);
const int filter_cols = static_cast<int>(filter.dim_size(1)); const int filter_cols = static_cast<int>(filter.dim_size(1));

@@ -17,8 +17,8 @@ limitations under the License.
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "tensorflow/core/kernels/cross_op.h"
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/cross_op.h"
namespace tensorflow { namespace tensorflow {

@@ -19,13 +19,13 @@ limitations under the License.
#include <limits> #include <limits>
#include "tensorflow/core/util/ctc/ctc_beam_search.h"
#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/util/ctc/ctc_beam_search.h"
#include "tensorflow/core/util/sparse/sparse_tensor.h" #include "tensorflow/core/util/sparse/sparse_tensor.h"
namespace tensorflow { namespace tensorflow {
@@ -80,16 +80,17 @@ class CTCDecodeHelper {
if (!(batch_size == (*seq_len)->dim_size(0))) { if (!(batch_size == (*seq_len)->dim_size(0))) {
return errors::FailedPrecondition( return errors::FailedPrecondition(
"len(sequence_length) != batch_size. ", "len(sequence_length): ", "len(sequence_length) != batch_size. ",
(*seq_len)->dim_size(0), " batch_size: ", batch_size); "len(sequence_length): ", (*seq_len)->dim_size(0),
" batch_size: ", batch_size);
} }
auto seq_len_t = (*seq_len)->vec<int32>(); auto seq_len_t = (*seq_len)->vec<int32>();
for (int b = 0; b < batch_size; ++b) { for (int b = 0; b < batch_size; ++b) {
if (!(seq_len_t(b) <= max_time)) { if (!(seq_len_t(b) <= max_time)) {
return errors::FailedPrecondition("sequence_length(", b, ") <= ", return errors::FailedPrecondition("sequence_length(", b,
max_time); ") <= ", max_time);
} }
} }

@@ -113,8 +113,8 @@ class CTCLossOp : public OpKernel {
const int64 batch_indices = g.group()[0]; const int64 batch_indices = g.group()[0];
OP_REQUIRES(ctx, FastBoundsCheck(batch_indices, batch_size), OP_REQUIRES(ctx, FastBoundsCheck(batch_indices, batch_size),
errors::InvalidArgument("labels batch index must be between ", errors::InvalidArgument("labels batch index must be between ",
0, " and ", batch_size, " but saw: ", 0, " and ", batch_size,
batch_indices)); " but saw: ", batch_indices));
auto values = g.values<int32>(); auto values = g.values<int32>();
std::vector<int>* b_values = &labels_t[batch_indices]; std::vector<int>* b_values = &labels_t[batch_indices];

@@ -17,8 +17,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_gradients.h" #include "tensorflow/core/kernels/cwise_ops_gradients.h"
namespace tensorflow { namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double, REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double, complex64,
complex64, complex128); complex128);
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Acosh", functor::acosh, float, double); REGISTER2(UnaryOp, SYCL, "Acosh", functor::acosh, float, double);

@@ -44,7 +44,6 @@ REGISTER_KERNEL_BUILDER(Name("AddV2")
BinaryOp<CPUDevice, functor::add<int32>>); BinaryOp<CPUDevice, functor::add<int32>>);
#endif #endif
#if TENSORFLOW_USE_SYCL #if TENSORFLOW_USE_SYCL
#define REGISTER_KERNEL(type) \ #define REGISTER_KERNEL(type) \
REGISTER(BinaryOp, SYCL, "Add", functor::add, type); \ REGISTER(BinaryOp, SYCL, "Add", functor::add, type); \

@@ -22,8 +22,8 @@ namespace tensorflow {
// sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__. // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
#if !defined(__ANDROID_TYPES_SLIM__) #if !defined(__ANDROID_TYPES_SLIM__)
REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64, REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64, uint8,
uint8, complex128, string); complex128, string);
// Notice: String is excluded to allow marking AddV2 is_commutative and // Notice: String is excluded to allow marking AddV2 is_commutative and
// is_aggregate. // is_aggregate.
REGISTER5(BinaryOp, CPU, "AddV2", functor::add, int8, int16, complex64, uint8, REGISTER5(BinaryOp, CPU, "AddV2", functor::add, int8, int16, complex64, uint8,

@@ -17,8 +17,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_gradients.h" #include "tensorflow/core/kernels/cwise_ops_gradients.h"
namespace tensorflow { namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double, REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double, complex64,
complex64, complex128); complex128);
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Asinh", functor::asinh, float, double); REGISTER2(UnaryOp, SYCL, "Asinh", functor::asinh, float, double);

@@ -17,8 +17,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_gradients.h" #include "tensorflow/core/kernels/cwise_ops_gradients.h"
namespace tensorflow { namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double, REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double, complex64,
complex64, complex128); complex128);
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Atanh", functor::atanh, float, double); REGISTER2(UnaryOp, SYCL, "Atanh", functor::atanh, float, double);

@@ -16,15 +16,13 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h" #include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow { namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double, REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double, complex64,
complex64, complex128); complex128);
#if TENSORFLOW_USE_SYCL #if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \ #define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
Name("Cosh") \ Name("Cosh").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::cosh<TYPE>>); UnaryOp<SYCLDevice, functor::cosh<TYPE>>);
REGISTER_SYCL_KERNEL(float); REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double); REGISTER_SYCL_KERNEL(double);

@@ -15,8 +15,10 @@ limitations under the License.
#if GOOGLE_CUDA #if GOOGLE_CUDA
#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" #define EIGEN_USE_GPU
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
namespace tensorflow { namespace tensorflow {
namespace functor { namespace functor {
@@ -38,7 +40,6 @@ struct SelectScalarFunctor<GPUDevice, T> {
typename TTypes<bool>::ConstScalar cond, typename TTypes<bool>::ConstScalar cond,
typename TTypes<T>::ConstFlat then_flat, typename TTypes<T>::ConstFlat then_flat,
typename TTypes<T>::ConstFlat else_flat) { typename TTypes<T>::ConstFlat else_flat) {
#if !defined(EIGEN_HAS_INDEX_LIST) #if !defined(EIGEN_HAS_INDEX_LIST)
Eigen::array<int, 1> rank1{1}; Eigen::array<int, 1> rank1{1};
#else #else
@@ -50,7 +51,6 @@ struct SelectScalarFunctor<GPUDevice, T> {
To32Bit(out).device(d) = cond.reshape(rank1) To32Bit(out).device(d) = cond.reshape(rank1)
.broadcast(broadcast_dims) .broadcast(broadcast_dims)
.select(then_flat, else_flat); .select(then_flat, else_flat);
} }
}; };

@@ -35,7 +35,8 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
#endif #endif
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float, double); REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float,
double);
REGISTER_KERNEL_BUILDER(Name("GreaterEqual") REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
.Device(DEVICE_SYCL) .Device(DEVICE_SYCL)

@@ -17,8 +17,8 @@ limitations under the License.
namespace tensorflow { namespace tensorflow {
REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, uint8,
uint8, int32); int32);
#if defined(__ANDROID_TYPES_SLIM__) #if defined(__ANDROID_TYPES_SLIM__)
// We only register the first type when we have multi-argument calls in the // We only register the first type when we have multi-argument calls in the
// case where we're trying to reduce executable size, but it turns out that the // case where we're trying to reduce executable size, but it turns out that the

@@ -22,8 +22,8 @@ namespace tensorflow {
// sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__. // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
#if !defined(__ANDROID_TYPES_SLIM__) #if !defined(__ANDROID_TYPES_SLIM__)
REGISTER6(BinaryOp, CPU, "Mul", functor::mul, REGISTER6(BinaryOp, CPU, "Mul", functor::mul, int8, uint16, int16, int64,
int8, uint16, int16, int64, complex64, complex128); complex64, complex128);
#if GOOGLE_CUDA #if GOOGLE_CUDA
REGISTER6(BinaryOp, GPU, "Mul", functor::mul, int8, uint16, int16, int64, REGISTER6(BinaryOp, GPU, "Mul", functor::mul, int8, uint16, int16, int64,
complex64, complex128); complex64, complex128);

@@ -201,12 +201,10 @@ struct SelectFunctorBase {
}; };
template <typename T> template <typename T>
struct SelectFunctor<CPUDevice, T> struct SelectFunctor<CPUDevice, T> : SelectFunctorBase<CPUDevice, T> {};
: SelectFunctorBase<CPUDevice, T> {};
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
template <typename T> template <typename T>
struct SelectFunctor<SYCLDevice, T> struct SelectFunctor<SYCLDevice, T> : SelectFunctorBase<SYCLDevice, T> {};
: SelectFunctorBase<SYCLDevice, T> {};
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
template <typename Device, typename T> template <typename Device, typename T>
@@ -257,8 +255,8 @@ struct BatchSelectFunctorBase {
}; };
template <typename T> template <typename T>
struct BatchSelectFunctor<CPUDevice, T> struct BatchSelectFunctor<CPUDevice, T> : BatchSelectFunctorBase<CPUDevice, T> {
: BatchSelectFunctorBase<CPUDevice, T> {}; };
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
template <typename T> template <typename T>
struct BatchSelectFunctor<SYCLDevice, T> struct BatchSelectFunctor<SYCLDevice, T>

@@ -16,15 +16,13 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h" #include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow { namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double, REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double, complex64,
complex64, complex128); complex128);
#if TENSORFLOW_USE_SYCL #if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \ #define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
Name("Sinh") \ Name("Sinh").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::sinh<TYPE>>); UnaryOp<SYCLDevice, functor::sinh<TYPE>>);
REGISTER_SYCL_KERNEL(float); REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double); REGISTER_SYCL_KERNEL(double);

@@ -57,8 +57,8 @@ BinaryOpShared::BinaryOpState::BinaryOpState(OpKernelContext* ctx)
in1(ctx->input(1)), in1(ctx->input(1)),
bcast(BCast::FromShape(in0.shape()), BCast::FromShape(in1.shape())) { bcast(BCast::FromShape(in0.shape()), BCast::FromShape(in1.shape())) {
if (!bcast.IsValid()) { if (!bcast.IsValid()) {
ctx->SetStatus(errors::InvalidArgument("Incompatible shapes: ", ctx->SetStatus(errors::InvalidArgument(
in0.shape().DebugString(), " vs. ", "Incompatible shapes: ", in0.shape().DebugString(), " vs. ",
in1.shape().DebugString())); in1.shape().DebugString()));
return; return;
} }

@@ -171,7 +171,6 @@ struct SimpleBinaryFunctor<CPUDevice, Functor> {
} }
}; };
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
// Partial specialization of BinaryFunctor for SYCL devices // Partial specialization of BinaryFunctor for SYCL devices
typedef Eigen::SyclDevice SYCLDevice; typedef Eigen::SyclDevice SYCLDevice;

@@ -51,7 +51,8 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
void operator()(const SYCLDevice& d, typename Functor::tout_type out, void operator()(const SYCLDevice& d, typename Functor::tout_type out,
typename Functor::tin_type in0, typename Functor::tin_type in0,
typename Functor::tin_type in1, bool* error) { typename Functor::tin_type in1, bool* error) {
To32Bit(out).device(d) = To32Bit(in0).binaryExpr(To32Bit(in1), typename Functor::func()); To32Bit(out).device(d) =
To32Bit(in0).binaryExpr(To32Bit(in1), typename Functor::func());
} }
void Left(const SYCLDevice& d, typename Functor::tout_type out, void Left(const SYCLDevice& d, typename Functor::tout_type out,
@@ -61,7 +62,9 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
constexpr int NumDims = Functor::tin_type::NumDimensions; constexpr int NumDims = Functor::tin_type::NumDimensions;
static_assert(NumDims == 1, "Unexpected size"); static_assert(NumDims == 1, "Unexpected size");
Eigen::Sizes<1> scalar_dim; Eigen::Sizes<1> scalar_dim;
out.device(d) = scalar.reshape(scalar_dim).broadcast(in.dimensions()).binaryExpr(in, Binary()); out.device(d) = scalar.reshape(scalar_dim)
.broadcast(in.dimensions())
.binaryExpr(in, Binary());
} }
void Right(const SYCLDevice& d, typename Functor::tout_type out, void Right(const SYCLDevice& d, typename Functor::tout_type out,
@@ -71,7 +74,8 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
constexpr int NumDims = Functor::tin_type::NumDimensions; constexpr int NumDims = Functor::tin_type::NumDimensions;
static_assert(NumDims == 1, "Unexpected size"); static_assert(NumDims == 1, "Unexpected size");
Eigen::Sizes<1> scalar_dim; Eigen::Sizes<1> scalar_dim;
out.device(d) = in.binaryExpr(scalar.reshape(scalar_dim).broadcast(in.dimensions()), Binary()); out.device(d) = in.binaryExpr(
scalar.reshape(scalar_dim).broadcast(in.dimensions()), Binary());
} }
void BCast(const SYCLDevice& d, void BCast(const SYCLDevice& d,

@@ -430,13 +430,10 @@ class IteratorStateVariant {
REGISTER_UNARY_VARIANT_DECODE_FUNCTION(IteratorStateVariant, REGISTER_UNARY_VARIANT_DECODE_FUNCTION(IteratorStateVariant,
kIteratorVariantTypeName); kIteratorVariantTypeName);
// TODO(mrry): Can we simply use the template kernel here?
class IteratorHandleOp : public OpKernel { class IteratorHandleOp : public OpKernel {
public: public:
explicit IteratorHandleOp(OpKernelConstruction* ctx) explicit IteratorHandleOp(OpKernelConstruction* ctx)
: OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) { : OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) {
OP_REQUIRES_OK(ctx, ctx->allocate_persistent(DT_STRING, TensorShape({2}),
&handle_, nullptr));
OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_));
OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_));
@@ -460,20 +457,21 @@ class IteratorHandleOp : public OpKernel {
} }
void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) { void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) {
{
mutex_lock l(mu_); mutex_lock l(mu_);
if (resource_ == nullptr) {
FunctionLibraryRuntime* lib = context->function_library(); FunctionLibraryRuntime* lib = context->function_library();
std::unique_ptr<DeviceMgr> device_mgr(nullptr); std::unique_ptr<DeviceMgr> device_mgr(nullptr);
std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr); std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr); std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
// If the iterator is shared then we construct a new FLR, and pass that in. // If the iterator is shared then we construct a new FLR, and pass that
// NOTE(mrry,rohanj): In this case it is not possible to call remote // in. NOTE(mrry,rohanj): In this case it is not possible to call remote
// functions from the iterator. We may add this functionality if there // functions from the iterator. We may add this functionality if there
// is sufficient demand, but it will require a significant refactoring. // is sufficient demand, but it will require a significant refactoring.
if (!name_.empty()) { if (!name_.empty()) {
lib = CreateFLR(context, &device_mgr, &flib_def, &pflr); lib = CreatePrivateFLR(context, &device_mgr, &flib_def, &pflr);
} }
if (resource_ == nullptr) {
ResourceMgr* mgr = context->resource_manager(); ResourceMgr* mgr = context->resource_manager();
OP_REQUIRES_OK(context, cinfo_.Init(mgr, def())); OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
@@ -482,8 +480,8 @@ class IteratorHandleOp : public OpKernel {
context, context,
mgr->LookupOrCreate<IteratorResource>( mgr->LookupOrCreate<IteratorResource>(
cinfo_.container(), cinfo_.name(), &resource, cinfo_.container(), cinfo_.name(), &resource,
[lib, &device_mgr, &flib_def, &pflr, this](IteratorResource** ret) [lib, &device_mgr, &flib_def, &pflr,
EXCLUSIVE_LOCKS_REQUIRED(mu_) { this](IteratorResource** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
*ret = new IteratorResource( *ret = new IteratorResource(
output_dtypes_, output_shapes_, graph_def_version_, output_dtypes_, output_shapes_, graph_def_version_,
std::move(device_mgr), std::move(flib_def), std::move(device_mgr), std::move(flib_def),
@@ -498,18 +496,12 @@ class IteratorHandleOp : public OpKernel {
return; return;
} }
auto h = handle_.AccessTensor(context)->template flat<string>();
h(0) = cinfo_.container();
h(1) = cinfo_.name();
resource_ = resource; resource_ = resource;
} }
if (context->expected_output_dtype(0) == DT_RESOURCE) { }
OP_REQUIRES_OK(context, MakeResourceHandleToOutput( OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
context, 0, cinfo_.container(), cinfo_.name(), context, 0, cinfo_.container(), cinfo_.name(),
MakeTypeIndex<IteratorResource>())); MakeTypeIndex<IteratorResource>()));
} else {
context->set_output_ref(0, &mu_, handle_.AccessTensor(context));
}
} }
private: private:
@@ -526,7 +518,7 @@ class IteratorHandleOp : public OpKernel {
return Status::OK(); return Status::OK();
} }
FunctionLibraryRuntime* CreateFLR( FunctionLibraryRuntime* CreatePrivateFLR(
OpKernelContext* ctx, std::unique_ptr<DeviceMgr>* device_mgr, OpKernelContext* ctx, std::unique_ptr<DeviceMgr>* device_mgr,
std::unique_ptr<FunctionLibraryDefinition>* flib_def, std::unique_ptr<FunctionLibraryDefinition>* flib_def,
std::unique_ptr<ProcessFunctionLibraryRuntime>* pflr) { std::unique_ptr<ProcessFunctionLibraryRuntime>* pflr) {
@@ -546,9 +538,8 @@ class IteratorHandleOp : public OpKernel {
} }
mutex mu_; mutex mu_;
ContainerInfo cinfo_ GUARDED_BY(mu_); ContainerInfo cinfo_; // Written once under mu_ then constant afterwards.
IteratorResource* resource_ GUARDED_BY(mu_) = nullptr; IteratorResource* resource_ GUARDED_BY(mu_) = nullptr;
PersistentTensor handle_ GUARDED_BY(mu_);
DataTypeVector output_dtypes_; DataTypeVector output_dtypes_;
std::vector<PartialTensorShape> output_shapes_; std::vector<PartialTensorShape> output_shapes_;
const int graph_def_version_; const int graph_def_version_;
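// --- Illustrative aside (not part of the diff above) ----------------------
// The reworked Compute() above follows a "construct the resource at most once
// under the lock, emit a handle on every call" shape. A generic,
// self-contained sketch of that pattern follows; HandleOp, Resource, and the
// string handle are illustrative stand-ins, not the TensorFlow classes.
#include <iostream>
#include <mutex>
#include <string>

struct Resource {
  std::string name;
};

class HandleOp {
 public:
  void Compute() {
    Resource* r = nullptr;
    {
      std::lock_guard<std::mutex> lock(mu_);
      if (resource_ == nullptr) {
        // Slow path: runs only for the first call; racing callers are
        // serialized by the lock, so construction still happens once.
        resource_ = new Resource{"iterator_0"};
      }
      r = resource_;
    }
    // Fast path: every call ends here with an already-constructed resource.
    std::cout << "handle -> " << r->name << "\n";
  }

 private:
  std::mutex mu_;
  Resource* resource_ = nullptr;  // written once under mu_, never freed here
};
// ---------------------------------------------------------------------------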

@@ -91,9 +91,9 @@ class DecodeCSVOp : public OpKernel {
} else { } else {
int32 value; int32 value;
OP_REQUIRES(ctx, strings::safe_strto32(fields[f], &value), OP_REQUIRES(ctx, strings::safe_strto32(fields[f], &value),
errors::InvalidArgument("Field ", f, " in record ", i, errors::InvalidArgument(
" is not a valid int32: ", "Field ", f, " in record ", i,
fields[f])); " is not a valid int32: ", fields[f]));
output[f]->flat<int32>()(i) = value; output[f]->flat<int32>()(i) = value;
} }
break; break;
@@ -111,9 +111,9 @@ class DecodeCSVOp : public OpKernel {
} else { } else {
int64 value; int64 value;
OP_REQUIRES(ctx, strings::safe_strto64(fields[f], &value), OP_REQUIRES(ctx, strings::safe_strto64(fields[f], &value),
errors::InvalidArgument("Field ", f, " in record ", i, errors::InvalidArgument(
" is not a valid int64: ", "Field ", f, " in record ", i,
fields[f])); " is not a valid int64: ", fields[f]));
output[f]->flat<int64>()(i) = value; output[f]->flat<int64>()(i) = value;
} }
break; break;
@@ -130,9 +130,9 @@ class DecodeCSVOp : public OpKernel {
} else { } else {
float value; float value;
OP_REQUIRES(ctx, strings::safe_strtof(fields[f].c_str(), &value), OP_REQUIRES(ctx, strings::safe_strtof(fields[f].c_str(), &value),
errors::InvalidArgument("Field ", f, " in record ", i, errors::InvalidArgument(
" is not a valid float: ", "Field ", f, " in record ", i,
fields[f])); " is not a valid float: ", fields[f]));
output[f]->flat<float>()(i) = value; output[f]->flat<float>()(i) = value;
} }
break; break;
@@ -150,9 +150,9 @@ class DecodeCSVOp : public OpKernel {
} else { } else {
double value; double value;
OP_REQUIRES(ctx, strings::safe_strtod(fields[f].c_str(), &value), OP_REQUIRES(ctx, strings::safe_strtod(fields[f].c_str(), &value),
errors::InvalidArgument("Field ", f, " in record ", i, errors::InvalidArgument(
" is not a valid double: ", "Field ", f, " in record ", i,
fields[f])); " is not a valid double: ", fields[f]));
output[f]->flat<double>()(i) = value; output[f]->flat<double>()(i) = value;
} }
break; break;
@@ -208,7 +208,8 @@ class DecodeCSVOp : public OpKernel {
if (!quoted) { if (!quoted) {
while (static_cast<size_t>(current_idx) < input.size() && while (static_cast<size_t>(current_idx) < input.size() &&
input[current_idx] != delim_) { input[current_idx] != delim_) {
OP_REQUIRES(ctx, (!use_quote_delim_ || input[current_idx] != '"') && OP_REQUIRES(ctx,
(!use_quote_delim_ || input[current_idx] != '"') &&
input[current_idx] != '\n' && input[current_idx] != '\n' &&
input[current_idx] != '\r', input[current_idx] != '\r',
errors::InvalidArgument( errors::InvalidArgument(
@@ -238,7 +239,8 @@ class DecodeCSVOp : public OpKernel {
} }
OP_REQUIRES( OP_REQUIRES(
ctx, (static_cast<size_t>(current_idx) < input.size() && ctx,
(static_cast<size_t>(current_idx) < input.size() &&
input[current_idx] == '"' && input[current_idx] == '"' &&
(static_cast<size_t>(current_idx) == input.size() - 1 || (static_cast<size_t>(current_idx) == input.size() - 1 ||
input[current_idx + 1] == delim_)), input[current_idx + 1] == delim_)),

@@ -87,10 +87,11 @@ class DecodeImageOp : public OpKernel {
channels_ = 3; channels_ = 3;
} else { } else {
OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_)); OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_));
OP_REQUIRES(context, channels_ == 0 || channels_ == 1 || channels_ == 3 || OP_REQUIRES(
channels_ == 4, context,
errors::InvalidArgument( channels_ == 0 || channels_ == 1 || channels_ == 3 || channels_ == 4,
"channels must be 0, 1, 3, or 4, got ", channels_)); errors::InvalidArgument("channels must be 0, 1, 3, or 4, got ",
channels_));
} }
flags_.components = channels_; flags_.components = channels_;
@@ -114,8 +115,9 @@ class DecodeImageOp : public OpKernel {
if (format_ == kJpgFormat) { if (format_ == kJpgFormat) {
OP_REQUIRES_OK(context, context->GetAttr("ratio", &flags_.ratio)); OP_REQUIRES_OK(context, context->GetAttr("ratio", &flags_.ratio));
OP_REQUIRES(context, flags_.ratio == 1 || flags_.ratio == 2 || OP_REQUIRES(context,
flags_.ratio == 4 || flags_.ratio == 8, flags_.ratio == 1 || flags_.ratio == 2 || flags_.ratio == 4 ||
flags_.ratio == 8,
errors::InvalidArgument("ratio must be 1, 2, 4, or 8, got ", errors::InvalidArgument("ratio must be 1, 2, 4, or 8, got ",
flags_.ratio)); flags_.ratio));
OP_REQUIRES_OK(context, context->GetAttr("fancy_upscaling", OP_REQUIRES_OK(context, context->GetAttr("fancy_upscaling",
@@ -130,7 +132,8 @@ class DecodeImageOp : public OpKernel {
string dct_method; string dct_method;
OP_REQUIRES_OK(context, context->GetAttr("dct_method", &dct_method)); OP_REQUIRES_OK(context, context->GetAttr("dct_method", &dct_method));
OP_REQUIRES( OP_REQUIRES(
context, (dct_method.empty() || dct_method == "INTEGER_FAST" || context,
(dct_method.empty() || dct_method == "INTEGER_FAST" ||
dct_method == "INTEGER_ACCURATE"), dct_method == "INTEGER_ACCURATE"),
errors::InvalidArgument("dct_method must be one of " errors::InvalidArgument("dct_method must be one of "
"{'', 'INTEGER_FAST', 'INTEGER_ACCURATE'}")); "{'', 'INTEGER_FAST', 'INTEGER_ACCURATE'}"));
@@ -157,9 +160,9 @@ class DecodeImageOp : public OpKernel {
errors::InvalidArgument("Expected image (JPEG, PNG, or GIF), got ", errors::InvalidArgument("Expected image (JPEG, PNG, or GIF), got ",
FileFormatString(magic, input))); FileFormatString(magic, input)));
OP_REQUIRES(context, input.size() <= std::numeric_limits<int>::max(), OP_REQUIRES(context, input.size() <= std::numeric_limits<int>::max(),
errors::InvalidArgument(FileFormatString(magic, input), errors::InvalidArgument(
" contents are too large for int: ", FileFormatString(magic, input),
input.size())); " contents are too large for int: ", input.size()));
OP_REQUIRES(context, magic == kPngFormat || channel_bits_ == 8, OP_REQUIRES(context, magic == kPngFormat || channel_bits_ == 8,
errors::InvalidArgument(FileFormatString(magic, input), errors::InvalidArgument(FileFormatString(magic, input),
" does not support uint16 output")); " does not support uint16 output"));
@@ -212,7 +215,8 @@ class DecodeImageOp : public OpKernel {
input.data(), input.size(), flags, nullptr /* nwarn */, input.data(), input.size(), flags, nullptr /* nwarn */,
[=, &output](int width, int height, int channels) -> uint8* { [=, &output](int width, int height, int channels) -> uint8* {
Status status(context->allocate_output( Status status(context->allocate_output(
0, format_ == kGifFormat 0,
format_ == kGifFormat
? TensorShape({1, height, width, channels}) ? TensorShape({1, height, width, channels})
: TensorShape({height, width, channels}), : TensorShape({height, width, channels}),
&output)); &output));

@@ -120,8 +120,8 @@ bool CanUseDeepConv2D(int stride_rows, int stride_cols, int filter_rows,
VLOG(2) << "CanUseDeepConv2D" VLOG(2) << "CanUseDeepConv2D"
<< " deep_conv_cost: " << deep_conv_cost << " deep_conv_cost: " << deep_conv_cost
<< " direct_conv_cost: " << direct_conv_cost << " direct_conv_cost: " << direct_conv_cost << " deep_direct_ratio: "
<< " deep_direct_ratio: " << (static_cast<float>(deep_conv_cost) / << (static_cast<float>(deep_conv_cost) /
static_cast<float>(direct_conv_cost)) static_cast<float>(direct_conv_cost))
<< " use_deep_conv: " << (deep_conv_cost < direct_conv_cost); << " use_deep_conv: " << (deep_conv_cost < direct_conv_cost);
return deep_conv_cost < direct_conv_cost; return deep_conv_cost < direct_conv_cost;

@@ -308,10 +308,10 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
// in_depth for input and filter must match. // in_depth for input and filter must match.
const int64 in_depth = GetTensorDim(input, data_format_, 'C'); const int64 in_depth = GetTensorDim(input, data_format_, 'C');
OP_REQUIRES( OP_REQUIRES(context, in_depth == filter.dim_size(2),
context, in_depth == filter.dim_size(2), errors::InvalidArgument(
errors::InvalidArgument("input and filter must have the same depth: ", "input and filter must have the same depth: ", in_depth,
in_depth, " vs ", filter.dim_size(2))); " vs ", filter.dim_size(2)));
// The last dimension for filter is depth multiplier. // The last dimension for filter is depth multiplier.
const int32 depth_multiplier = filter.dim_size(3); const int32 depth_multiplier = filter.dim_size(3);
@@ -430,8 +430,9 @@ TF_CALL_double(REGISTER_CPU_KERNEL);
#endif #endif
#if GOOGLE_CUDA #if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER( REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative")
Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"), .Device(DEVICE_GPU)
.TypeConstraint<Eigen::half>("T"),
DepthwiseConv2dNativeOp<GPUDevice, Eigen::half>); DepthwiseConv2dNativeOp<GPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER( REGISTER_KERNEL_BUILDER(

@@ -17,12 +17,12 @@ limitations under the License.
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "external/cub_archive/cub/util_ptx.cuh"
#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/kernels/depthwise_conv_op.h" #include "tensorflow/core/kernels/depthwise_conv_op.h"
#include "tensorflow/core/platform/types.h" #include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h" #include "tensorflow/core/util/cuda_kernel_helper.h"
#include "tensorflow/core/util/tensor_format.h" #include "tensorflow/core/util/tensor_format.h"
#include "external/cub_archive/cub/util_ptx.cuh"
#if !defined(_MSC_VER) #if !defined(_MSC_VER)
#define UNROLL _Pragma("unroll") #define UNROLL _Pragma("unroll")

@@ -29,8 +29,8 @@ limitations under the License.
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/work_sharder.h" #include "tensorflow/core/util/work_sharder.h"
namespace tensorflow { namespace tensorflow {
@@ -47,8 +47,9 @@ class DiagOp : public OpKernel {
void Compute(OpKernelContext* context) override { void Compute(OpKernelContext* context) override {
const Tensor& diagonal = context->input(0); const Tensor& diagonal = context->input(0);
const int num_dims = diagonal.dims(); const int num_dims = diagonal.dims();
OP_REQUIRES(context, 0 != num_dims, errors::InvalidArgument( OP_REQUIRES(
"Input must be at least rank 1, got 0")); context, 0 != num_dims,
errors::InvalidArgument("Input must be at least rank 1, got 0"));
TensorShape out_shape; TensorShape out_shape;
for (int i = 0; i < num_dims; ++i) { for (int i = 0; i < num_dims; ++i) {
out_shape.AddDim(diagonal.dim_size(i)); out_shape.AddDim(diagonal.dim_size(i));
@@ -60,9 +61,8 @@ class DiagOp : public OpKernel {
OP_REQUIRES_OK(context, OP_REQUIRES_OK(context,
context->allocate_output(0, out_shape, &output_tensor)); context->allocate_output(0, out_shape, &output_tensor));
functor::DiagFunctor<Device, T> diagFunc; functor::DiagFunctor<Device, T> diagFunc;
Status s = diagFunc(context, Status s =
diagonal.NumElements(), diagFunc(context, diagonal.NumElements(), diagonal.flat<T>().data(),
diagonal.flat<T>().data(),
output_tensor->flat<T>().data()); output_tensor->flat<T>().data());
OP_REQUIRES_OK(context, s); OP_REQUIRES_OK(context, s);
} }
@@ -83,11 +83,11 @@ class DiagPartOp : public OpKernel {
even and positive, got shape ", even and positive, got shape ",
tensor.shape().DebugString())); tensor.shape().DebugString()));
for (int i = 0; i < out_dims; i++) { for (int i = 0; i < out_dims; i++) {
OP_REQUIRES(context, tensor.dim_size(i) == tensor.dim_size(i + out_dims), OP_REQUIRES(
errors::InvalidArgument( context, tensor.dim_size(i) == tensor.dim_size(i + out_dims),
"Invalid shape ", tensor.shape().DebugString(), errors::InvalidArgument("Invalid shape ",
": dimensions ", i, " and ", i + out_dims, " do not match.") tensor.shape().DebugString(), ": dimensions ",
); i, " and ", i + out_dims, " do not match."));
} }
TensorShape out_shape; TensorShape out_shape;
@@ -96,13 +96,10 @@ class DiagPartOp : public OpKernel {
} }
Tensor* output = nullptr; Tensor* output = nullptr;
OP_REQUIRES_OK(context, OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
context->allocate_output(0, out_shape, &output));
functor::DiagPartFunctor<Device, T> diagPartFunc; functor::DiagPartFunctor<Device, T> diagPartFunc;
Status s = diagPartFunc(context, Status s = diagPartFunc(context, out_shape.num_elements(),
out_shape.num_elements(), tensor.flat<T>().data(), output->flat<T>().data());
tensor.flat<T>().data(),
output->flat<T>().data());
OP_REQUIRES_OK(context, s); OP_REQUIRES_OK(context, s);
} }
}; };
@@ -129,9 +126,8 @@ class DiagPartOp : public OpKernel {
namespace functor { namespace functor {
template <typename T> template <typename T>
struct DiagFunctor<CPUDevice, T> { struct DiagFunctor<CPUDevice, T> {
EIGEN_ALWAYS_INLINE Status EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
operator() (OpKernelContext* context, const int64 size, const int64 size, const T* in, T* out) {
const T* in, T* out) {
// This subprocess is responsible for writing values in index range // This subprocess is responsible for writing values in index range
// [start*size, limit*size) // [start*size, limit*size)
auto subDiag = [in, out, size](int64 start, int64 limit) { auto subDiag = [in, out, size](int64 start, int64 limit) {
@@ -143,17 +139,16 @@ struct DiagFunctor<CPUDevice, T> {
// Here, 5 is a empirical factor of cost_per_unit. // Here, 5 is a empirical factor of cost_per_unit.
auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
Shard(worker_threads.num_threads, worker_threads.workers, size, Shard(worker_threads.num_threads, worker_threads.workers, size, 5 * size,
5 * size, subDiag); subDiag);
return Status::OK(); return Status::OK();
} }
}; };
template <typename T> template <typename T>
struct DiagPartFunctor<CPUDevice, T> { struct DiagPartFunctor<CPUDevice, T> {
EIGEN_ALWAYS_INLINE Status EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
operator() (OpKernelContext* context, const int64 size, const int64 size, const T* in, T* out) {
const T* in, T* out) {
// This subprocess is responsible for extracting values in index range // This subprocess is responsible for extracting values in index range
// [start, limit) // [start, limit)
auto subDiagPart = [in, out, size](int64 start, int64 limit) { auto subDiagPart = [in, out, size](int64 start, int64 limit) {
@@ -164,14 +159,13 @@ struct DiagPartFunctor<CPUDevice, T> {
// Here, 5 is a empirical factor of cost_per_unit. // Here, 5 is a empirical factor of cost_per_unit.
auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
Shard(worker_threads.num_threads, worker_threads.workers, size, Shard(worker_threads.num_threads, worker_threads.workers, size, 5,
5, subDiagPart); subDiagPart);
return Status::OK(); return Status::OK();
} }
}; };
} // namespace functor } // namespace functor
// Register the CPU kernels. // Register the CPU kernels.
#define REGISTER_DIAGOP(T) \ #define REGISTER_DIAGOP(T) \
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
@@ -250,6 +244,4 @@ TF_CALL_complex128(REGISTER_DIAGPARTOP_GPU);
#endif // GOOGLE_CUDA #endif // GOOGLE_CUDA
} // namespace tensorflow } // namespace tensorflow
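// --- Illustrative aside (not part of the diff above) ----------------------
// Each shard handed to Shard() by DiagFunctor<CPUDevice, T> writes the
// diagonal entries for its index range [start, limit): the output is a
// size x size matrix stored contiguously, so input element i lands at flat
// offset i * (size + 1), matching the `index % (1 + size) == 0` test in the
// GPU kernel below. A minimal sketch of that indexing follows (the zero-fill
// of off-diagonal entries and the Shard() call itself are omitted);
// FillDiagonalRange is an illustrative name.
#include <cstdint>
#include <vector>

template <typename T>
void FillDiagonalRange(const std::vector<T>& in, std::vector<T>* out,
                       int64_t size, int64_t start, int64_t limit) {
  for (int64_t i = start; i < limit; ++i) {
    (*out)[i * (size + 1)] = in[i];  // row i, column i of the flattened matrix
  }
}
// ---------------------------------------------------------------------------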

@@ -26,14 +26,14 @@ namespace functor {
template <typename Device, typename T> template <typename Device, typename T>
struct DiagFunctor { struct DiagFunctor {
Status operator() (OpKernelContext* context, const int64 size, Status operator()(OpKernelContext* context, const int64 size, const T* in,
const T* in, T* out); T* out);
}; };
template <typename Device, typename T> template <typename Device, typename T>
struct DiagPartFunctor { struct DiagPartFunctor {
Status operator() (OpKernelContext* context, const int64 size, Status operator()(OpKernelContext* context, const int64 size, const T* in,
const T* in, T* out); T* out);
}; };
} // namespace functor } // namespace functor

View File

@@ -19,8 +19,8 @@ limitations under the License.
 
 #include <complex>
 
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
 #include "tensorflow/core/kernels/diag_op.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
 namespace functor {
@@ -28,10 +28,8 @@ namespace functor {
 typedef Eigen::GpuDevice GPUDevice;
 
 template <typename T>
-__global__ void DiagCudaKernel(const int num_threads,
-                               const int64 size,
-                               const T* in,
-                               T* out) {
+__global__ void DiagCudaKernel(const int num_threads, const int64 size,
+                               const T* in, T* out) {
   CUDA_1D_KERNEL_LOOP(index, num_threads) {
     // Fill the diagonal elements or set to zero in other place.
     if (index % (1 + size) == 0) {
@@ -44,9 +42,8 @@ __global__ void DiagCudaKernel(const int num_threads,
 template <typename T>
 struct DiagFunctor<GPUDevice, T> {
-  EIGEN_ALWAYS_INLINE Status
-  operator() (OpKernelContext* context, const int64 size,
-              const T* in, T* out) {
+  EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+                                        const int64 size, const T* in, T* out) {
     // Empty tensor couldn't launch the kernel.
     if (size == 0) {
       return Status::OK();
@@ -56,25 +53,22 @@ struct DiagFunctor<GPUDevice, T> {
     // so this may overflow for `size*size` in extreme cases,
     // here is checking the multiplication overflow for integer.
     if (size && (int(size * size) / size) != size) {
-      return errors::Internal(
-          "DiagOp got input size too large.");
+      return errors::Internal("DiagOp got input size too large.");
     }
     int virtual_thread_count = int(size * size);
 
     // Launch the GPU kernel.
     const GPUDevice& device = context->eigen_device<GPUDevice>();
-    CudaLaunchConfig diag_config = GetCudaLaunchConfig(
-        virtual_thread_count, device);
-    DiagCudaKernel<<<diag_config.block_count,
-                     diag_config.thread_per_block,
-                     0, device.stream()>>>(
-        diag_config.virtual_thread_count, size, in, out);
+    CudaLaunchConfig diag_config =
+        GetCudaLaunchConfig(virtual_thread_count, device);
+    DiagCudaKernel<<<diag_config.block_count, diag_config.thread_per_block, 0,
+                     device.stream()>>>(diag_config.virtual_thread_count, size,
+                                        in, out);
 
     auto err = cudaGetLastError();
     if (err != cudaSuccess) {
-      return errors::Internal(
-          "Could not launch DiagOp kernel: ",
-          cudaGetErrorString(err), ".");
+      return errors::Internal(
+          "Could not launch DiagOp kernel: ", cudaGetErrorString(err), ".");
     }
     return Status::OK();
   }
@@ -87,12 +81,9 @@ template struct DiagFunctor<GPUDevice, int64>;
 template struct DiagFunctor<GPUDevice, complex64>;
 template struct DiagFunctor<GPUDevice, complex128>;
 
 template <typename T>
-__global__ void DiagPartCudaKernel(const int num_threads,
-                                   const int64 size,
-                                   const T* in,
-                                   T* out) {
+__global__ void DiagPartCudaKernel(const int num_threads, const int64 size,
+                                   const T* in, T* out) {
   CUDA_1D_KERNEL_LOOP(index, num_threads) {
     out[index] = in[(1 + size) * index];
   }
@@ -100,9 +91,8 @@ __global__ void DiagPartCudaKernel(const int num_threads,
 template <typename T>
 struct DiagPartFunctor<GPUDevice, T> {
-  EIGEN_ALWAYS_INLINE Status
-  operator() (OpKernelContext* context, const int64 size,
-              const T* in, T* out) {
+  EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+                                        const int64 size, const T* in, T* out) {
     // Empty tensor couldn't launch the kernel.
     if (size == 0) {
       return Status::OK();
@@ -111,16 +101,14 @@ struct DiagPartFunctor<GPUDevice, T> {
     // Extract the diagonal elements.
     CudaLaunchConfig diag_config = GetCudaLaunchConfig(size, device);
-    DiagPartCudaKernel<<<diag_config.block_count,
-                         diag_config.thread_per_block,
-                         0, device.stream()>>>(
-        diag_config.virtual_thread_count, size, in, out);
+    DiagPartCudaKernel<<<diag_config.block_count, diag_config.thread_per_block,
+                         0, device.stream()>>>(diag_config.virtual_thread_count,
+                                               size, in, out);
 
     auto err = cudaGetLastError();
     if (err != cudaSuccess) {
-      return errors::Internal(
-          "Could not launch DiagPartOp kernel: ",
-          cudaGetErrorString(err), ".");
+      return errors::Internal(
+          "Could not launch DiagPartOp kernel: ", cudaGetErrorString(err), ".");
     }
     return Status::OK();
   }
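The GPU path computes a CudaLaunchConfig and iterates with CUDA_1D_KERNEL_LOOP, which expands to a grid-stride loop. Below is a hedged standalone CUDA sketch of that pattern for the diagonal extraction (plain CUDA, no TensorFlow helpers; the fixed <<<32, 256>>> launch shape merely stands in for GetCudaLaunchConfig's heuristics):

// Standalone CUDA sketch (hypothetical names): the grid-stride pattern that
// CUDA_1D_KERNEL_LOOP expands to, extracting the diagonal with
// out[i] = in[(1 + size) * i] as DiagPartCudaKernel does above.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void DiagPartSketch(int num_threads, long long size,
                               const float* in, float* out) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num_threads;
       i += gridDim.x * blockDim.x) {
    out[i] = in[(1 + size) * i];
  }
}

int main() {
  const long long size = 256;
  float *in = nullptr, *out = nullptr;
  cudaMalloc(&in, size * size * sizeof(float));
  cudaMalloc(&out, size * sizeof(float));
  // A fixed launch shape stands in for GetCudaLaunchConfig's heuristics.
  DiagPartSketch<<<32, 256>>>(static_cast<int>(size), size, in, out);
  if (cudaGetLastError() != cudaSuccess) printf("launch failed\n");
  cudaFree(in);
  cudaFree(out);
  return 0;
}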

View File

@@ -51,4 +51,3 @@ BM_Diag(128);
 BM_Diag(512);
 
 }  // end namespace tensorflow

View File

@ -91,10 +91,10 @@ void ParseSizes(OpKernelContext* context, const std::vector<int32>& strides,
filter.shape().DebugString())); filter.shape().DebugString()));
const int filter_rows = filter.dim_size(0); const int filter_rows = filter.dim_size(0);
const int filter_cols = filter.dim_size(1); const int filter_cols = filter.dim_size(1);
OP_REQUIRES( OP_REQUIRES(context, depth == filter.dim_size(2),
context, depth == filter.dim_size(2), errors::InvalidArgument(
errors::InvalidArgument("input and filter must have the same depth: ", "input and filter must have the same depth: ", depth, " vs ",
depth, " vs ", filter.dim_size(2))); filter.dim_size(2)));
// Effective filter size, after introducing rate - 1 zeros between each // Effective filter size, after introducing rate - 1 zeros between each
// non-zero filter element. // non-zero filter element.
@ -234,7 +234,8 @@ class DilationBackpropInputOp : public OpKernel {
// [ batch, out_rows, out_cols, depth ] // [ batch, out_rows, out_cols, depth ]
const int batch = input.dim_size(0); const int batch = input.dim_size(0);
const int depth = input.dim_size(3); const int depth = input.dim_size(3);
OP_REQUIRES(context, batch == out_backprop.dim_size(0) && OP_REQUIRES(context,
batch == out_backprop.dim_size(0) &&
out_rows == out_backprop.dim_size(1) && out_rows == out_backprop.dim_size(1) &&
out_cols == out_backprop.dim_size(2) && out_cols == out_backprop.dim_size(2) &&
depth == out_backprop.dim_size(3), depth == out_backprop.dim_size(3),
@ -353,7 +354,8 @@ class DilationBackpropFilterOp : public OpKernel {
// [ batch, out_rows, out_cols, depth ] // [ batch, out_rows, out_cols, depth ]
const int batch = input.dim_size(0); const int batch = input.dim_size(0);
const int depth = input.dim_size(3); const int depth = input.dim_size(3);
OP_REQUIRES(context, batch == out_backprop.dim_size(0) && OP_REQUIRES(context,
batch == out_backprop.dim_size(0) &&
out_rows == out_backprop.dim_size(1) && out_rows == out_backprop.dim_size(1) &&
out_cols == out_backprop.dim_size(2) && out_cols == out_backprop.dim_size(2) &&
depth == out_backprop.dim_size(3), depth == out_backprop.dim_size(3),

View File

@ -61,9 +61,8 @@ __global__ void DilationKernel(const int32 nthreads, const T* input_ptr,
const int w_in = w_beg + w * rate_cols; const int w_in = w_beg + w * rate_cols;
if (w_in >= 0 && w_in < input_cols) { if (w_in >= 0 && w_in < input_cols) {
const T val = const T val =
input_ptr[d + input_ptr[d + depth * (w_in +
depth * input_cols * (h_in + input_rows * b))] +
(w_in + input_cols * (h_in + input_rows * b))] +
filter_ptr[d + depth * (w + filter_cols * h)]; filter_ptr[d + depth * (w + filter_cols * h)];
if (val > cur_val) { if (val > cur_val) {
cur_val = val; cur_val = val;
@ -106,9 +105,8 @@ __global__ void DilationBackpropInputKernel(
const int w_in = w_beg + w * rate_cols; const int w_in = w_beg + w * rate_cols;
if (w_in >= 0 && w_in < input_cols) { if (w_in >= 0 && w_in < input_cols) {
const T val = const T val =
input_ptr[d + input_ptr[d + depth * (w_in +
depth * input_cols * (h_in + input_rows * b))] +
(w_in + input_cols * (h_in + input_rows * b))] +
filter_ptr[d + depth * (w + filter_cols * h)]; filter_ptr[d + depth * (w + filter_cols * h)];
if (val > cur_val) { if (val > cur_val) {
cur_val = val; cur_val = val;
@ -156,9 +154,8 @@ __global__ void DilationBackpropFilterKernel(
const int w_in = w_beg + w * rate_cols; const int w_in = w_beg + w * rate_cols;
if (w_in >= 0 && w_in < input_cols) { if (w_in >= 0 && w_in < input_cols) {
const T val = const T val =
input_ptr[d + input_ptr[d + depth * (w_in +
depth * input_cols * (h_in + input_rows * b))] +
(w_in + input_cols * (h_in + input_rows * b))] +
filter_ptr[d + depth * (w + filter_cols * h)]; filter_ptr[d + depth * (w + filter_cols * h)];
if (val > cur_val) { if (val > cur_val) {
cur_val = val; cur_val = val;

View File

@@ -29,8 +29,7 @@ template <class T>
 class DrawBoundingBoxesOp : public OpKernel {
  public:
   explicit DrawBoundingBoxesOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-  }
+      : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
     const Tensor& images = context->input(0);
@@ -94,35 +93,28 @@ class DrawBoundingBoxesOp : public OpKernel {
       int64 color_index = bb % color_table_length;
       const int64 min_box_row =
           static_cast<float>(tboxes(b, bb, 0)) * (height - 1);
-      const int64 min_box_row_clamp =
-          std::max<int64>(min_box_row, 0);
+      const int64 min_box_row_clamp = std::max<int64>(min_box_row, 0);
       const int64 max_box_row =
           static_cast<float>(tboxes(b, bb, 2)) * (height - 1);
       const int64 max_box_row_clamp =
           std::min<int64>(max_box_row, height - 1);
       const int64 min_box_col =
           static_cast<float>(tboxes(b, bb, 1)) * (width - 1);
-      const int64 min_box_col_clamp =
-          std::max<int64>(min_box_col, 0);
+      const int64 min_box_col_clamp = std::max<int64>(min_box_col, 0);
       const int64 max_box_col =
           static_cast<float>(tboxes(b, bb, 3)) * (width - 1);
-      const int64 max_box_col_clamp =
-          std::min<int64>(max_box_col, width - 1);
+      const int64 max_box_col_clamp = std::min<int64>(max_box_col, width - 1);
 
       if (min_box_row > max_box_row || min_box_col > max_box_col) {
-        LOG(WARNING) << "Bounding box (" << min_box_row
-                     << "," << min_box_col
-                     << "," << max_box_row
-                     << "," << max_box_col
+        LOG(WARNING) << "Bounding box (" << min_box_row << "," << min_box_col
+                     << "," << max_box_row << "," << max_box_col
                      << ") is inverted and will not be drawn.";
         continue;
       }
-      if (min_box_row >= height || max_box_row < 0 ||
-          min_box_col >= width || max_box_col < 0) {
-        LOG(WARNING) << "Bounding box (" << min_box_row
-                     << "," << min_box_col
-                     << "," << max_box_row
-                     << "," << max_box_col
+      if (min_box_row >= height || max_box_row < 0 || min_box_col >= width ||
+          max_box_col < 0) {
+        LOG(WARNING) << "Bounding box (" << min_box_row << "," << min_box_col
+                     << "," << max_box_row << "," << max_box_col
                      << ") is completely outside the image"
                      << " and will not be drawn.";
         continue;
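The coordinate math above scales each normalized box corner by (dim - 1) and clamps it to the image, so inverted or fully-outside boxes can be detected and skipped. A standalone sketch of the row computation, with hypothetical names, assuming boxes arrive as [y_min, x_min, y_max, x_max] in [0, 1]:

// Standalone sketch (hypothetical names): scale normalized row coordinates to
// pixel rows by (height - 1) and clamp them to the image bounds.
#include <algorithm>
#include <cstdint>

struct BoxRows {
  int64_t min_clamped;
  int64_t max_clamped;
};

BoxRows ScaleAndClampRows(float y_min, float y_max, int64_t height) {
  const int64_t min_box_row = static_cast<int64_t>(y_min * (height - 1));
  const int64_t max_box_row = static_cast<int64_t>(y_max * (height - 1));
  return {std::max<int64_t>(min_box_row, 0),
          std::min<int64_t>(max_box_row, height - 1)};
}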

View File

@ -103,7 +103,8 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared {
// Walk through data and copy the data to the appropriate output tensor // Walk through data and copy the data to the appropriate output tensor
const auto data_flat = data->flat<T>(); const auto data_flat = data->flat<T>();
std::vector<Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>, std::vector<Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>,
Eigen::Aligned> > out_vec; Eigen::Aligned> >
out_vec;
out_vec.reserve(num_partitions_); out_vec.reserve(num_partitions_);
for (int p = 0; p < num_partitions_; p++) { for (int p = 0; p < num_partitions_; p++) {
out_vec.push_back(outputs[p]->vec<T>()); out_vec.push_back(outputs[p]->vec<T>());
@ -124,7 +125,8 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared {
} else { } else {
// If data has extra dimensions, use Eigen slices // If data has extra dimensions, use Eigen slices
std::vector<Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, std::vector<Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>,
Eigen::Aligned> > out_flat; Eigen::Aligned> >
out_flat;
out_flat.reserve(num_partitions_); out_flat.reserve(num_partitions_);
for (int p = 0; p < num_partitions_; p++) { for (int p = 0; p < num_partitions_; p++) {
out_flat.push_back(outputs[p]->flat_outer_dims<T>()); out_flat.push_back(outputs[p]->flat_outer_dims<T>());

View File

@ -79,8 +79,8 @@ template <typename T>
void RangeInit(const GPUDevice& d, const T start, const T delta, void RangeInit(const GPUDevice& d, const T start, const T delta,
const int32 size, typename TTypes<T>::Flat out) { const int32 size, typename TTypes<T>::Flat out) {
CudaLaunchConfig config = GetCudaLaunchConfig(size, d); CudaLaunchConfig config = GetCudaLaunchConfig(size, d);
RangeInitKernel< RangeInitKernel<T>
T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>( <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
start, delta, size, out.data()); start, delta, size, out.data());
} }
@ -103,9 +103,8 @@ void CallGatherKernel(const GPUDevice& d, const T* params, const int32* indices,
T* out, int64 gather_dim_size, int64 indices_size, T* out, int64 gather_dim_size, int64 indices_size,
int64 slice_size, int64 out_size) { int64 slice_size, int64 out_size) {
CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d); CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d);
GatherOpKernel< GatherOpKernel<T, int32, true>
T, int32, <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
true><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
params, indices, out, gather_dim_size, indices_size, slice_size, params, indices, out, gather_dim_size, indices_size, slice_size,
out_size); out_size);
} }
@ -231,10 +230,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
OP_REQUIRES_ASYNC( OP_REQUIRES_ASYNC(
c, TensorShapeUtils::StartsWith(data.shape(), partitions.shape()), c, TensorShapeUtils::StartsWith(data.shape(), partitions.shape()),
errors::InvalidArgument("data.shape must start with partitions.shape, ", errors::InvalidArgument(
"data.shape must start with partitions.shape, ",
"got data.shape = ", data.shape().DebugString(), "got data.shape = ", data.shape().DebugString(),
", partitions.shape = ", ", partitions.shape = ", partitions.shape().DebugString()),
partitions.shape().DebugString()),
done); done);
Tensor partition_count; Tensor partition_count;
@ -245,7 +244,8 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
AllocatorAttributes alloc_attr; AllocatorAttributes alloc_attr;
alloc_attr.set_on_host(true); alloc_attr.set_on_host(true);
OP_REQUIRES_OK_ASYNC( OP_REQUIRES_OK_ASYNC(
c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), c,
c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
&partition_count, alloc_attr), &partition_count, alloc_attr),
done); done);
auto e_part_count = partition_count.flat<int32>(); auto e_part_count = partition_count.flat<int32>();
@ -259,7 +259,8 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
// Prepare for counting. // Prepare for counting.
OP_REQUIRES_OK_ASYNC( OP_REQUIRES_OK_ASYNC(
c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), c,
c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
&partition_count), &partition_count),
done); done);
Tensor indices_out; Tensor indices_out;
@ -280,7 +281,8 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
alloc_attr.set_on_host(true); alloc_attr.set_on_host(true);
alloc_attr.set_gpu_compatible(true); alloc_attr.set_gpu_compatible(true);
OP_REQUIRES_OK_ASYNC( OP_REQUIRES_OK_ASYNC(
c, c->allocate_temp(partition_count.dtype(), partition_count.shape(), c,
c->allocate_temp(partition_count.dtype(), partition_count.shape(),
&cpu_tensor, alloc_attr), &cpu_tensor, alloc_attr),
done); done);
perftools::gputools::DeviceMemoryBase wrapped( perftools::gputools::DeviceMemoryBase wrapped(
@ -340,8 +342,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
indices_in_ptr, indices_out_ptr, N, 0, sizeof(int32) * 8, cu_stream); indices_in_ptr, indices_out_ptr, N, 0, sizeof(int32) * 8, cu_stream);
// Allocate temporary storage. // Allocate temporary storage.
OP_REQUIRES_OK_ASYNC( OP_REQUIRES_OK_ASYNC(
c, c->allocate_temp( c,
DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}), c->allocate_temp(DT_INT8,
TensorShape({static_cast<int64>(temp_storage_bytes)}),
&cub_temp_storage), &cub_temp_storage),
done); done);
// Radix-sort the partition information. // Radix-sort the partition information.
@ -376,7 +379,8 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
zero_functor(device, partition_count->flat<int32>()); zero_functor(device, partition_count->flat<int32>());
// Allocate memory for aggregates_out. // Allocate memory for aggregates_out.
OP_REQUIRES_OK_ASYNC( OP_REQUIRES_OK_ASYNC(
c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), c,
c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
&aggregates_out), &aggregates_out),
done); done);
// Obtain the pointers to inner buffers. // Obtain the pointers to inner buffers.
@ -408,8 +412,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
num_runs_ptr, reduction_op, N, cu_stream); num_runs_ptr, reduction_op, N, cu_stream);
// Allocate temporary storage. // Allocate temporary storage.
OP_REQUIRES_OK_ASYNC( OP_REQUIRES_OK_ASYNC(
c, c->allocate_temp( c,
DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}), c->allocate_temp(DT_INT8,
TensorShape({static_cast<int64>(temp_storage_bytes)}),
&cub_temp_storage), &cub_temp_storage),
done); done);
// Run reduce-by-key. The effect is that we count how many times // Run reduce-by-key. The effect is that we count how many times
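The allocate_temp calls above follow CUB's two-phase convention: a first call with a null workspace only reports temp_storage_bytes, the caller allocates that many bytes (here as a DT_INT8 scratch tensor), and an identical second call does the real work. A hedged standalone sketch of the same pattern with cub::DeviceRadixSort::SortPairs, using plain cudaMalloc instead of allocate_temp (all pointers are assumed to be device memory):

// Standalone sketch (hypothetical names): CUB's two-phase size-query /
// execute pattern for a key-value radix sort on the GPU.
#include <cub/cub.cuh>
#include <cuda_runtime.h>

void SortPairsSketch(const int* keys_in, int* keys_out, const int* vals_in,
                     int* vals_out, int n, cudaStream_t stream) {
  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  // Phase 1: size query only; no sorting happens, temp_storage_bytes is set.
  cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, keys_in,
                                  keys_out, vals_in, vals_out, n, 0,
                                  sizeof(int) * 8, stream);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  // Phase 2: the real sort, reusing the exact same argument list.
  cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, keys_in,
                                  keys_out, vals_in, vals_out, n, 0,
                                  sizeof(int) * 8, stream);
  cudaFree(d_temp_storage);
}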

View File

@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenBackwardSpatialConvolutionsTest, SigmoidFastDerivative) {
   const ptrdiff_t depth = 3;

View File

@@ -25,31 +25,43 @@ namespace Eigen {
  *
  * \brief Extract glimpses from an input tensor.
  *
- * The input parameter is expected to be a col-major tensor with a rank of 4 (depth, x, y, and batch).
- * The width and height parameters specify the extension of the returned glimpses.
- * The offsets parameter specifies the x, y locations of the center of the glimpses relative to the center of the input image. The vector is expected to contain one IndexPair for each image in the batch dimension.
- * The normalized boolean indicates if incoming coordinates are normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each height and width dimension.
- * The centered boolean indicates if incoming coordinates are centered relative to the image, in which case -1.0 and 1.0 correspond to minimum and maximum of each dimension while 0.0 corresponds to the center.
+ * The input parameter is expected to be a col-major tensor with a rank of 4
+ * (depth, x, y, and batch). The width and height parameters specify the
+ * extension of the returned glimpses. The offsets parameter specifies the x, y
+ * locations of the center of the glimpses relative to the center of the input
+ * image. The vector is expected to contain one IndexPair for each image in the
+ * batch dimension. The normalized boolean indicates if incoming coordinates are
+ * normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each
+ * height and width dimension. The centered boolean indicates if incoming
+ * coordinates are centered relative to the image, in which case -1.0 and 1.0
+ * correspond to minimum and maximum of each dimension while 0.0 corresponds to
+ * the center.
  *
- * The result can be assigned to a tensor of rank equal to that of the input. The result will be laid out in col-major order (depth, x, y, batch).
- * The dimensions of the result will be equal to the dimensions of the input except for width and height which will be equal to the requested glimpse size.
+ * The result can be assigned to a tensor of rank equal to that of the input.
+ * The result will be laid out in col-major order (depth, x, y, batch). The
+ * dimensions of the result will be equal to the dimensions of the input except
+ * for width and height which will be equal to the requested glimpse size.
  */
 namespace {
 template <typename Index>
 struct GlimpseExtractionOp {
   GlimpseExtractionOp(const Index width, const Index height,
                       const std::vector<IndexPair<float> >& offsets,
-                      const bool normalized,
-                      const bool centered,
-                      const bool uniform_noise) :
-    width_(width), height_(height), offsets_(offsets),
-    normalized_(normalized), centered_(centered), uniform_noise_(uniform_noise) { }
+                      const bool normalized, const bool centered,
+                      const bool uniform_noise)
+      : width_(width),
+        height_(height),
+        offsets_(offsets),
+        normalized_(normalized),
+        centered_(centered),
+        uniform_noise_(uniform_noise) {}
 
   template <typename Input>
   DSizes<Index, 4> dimensions(const Input& input) const {
     typedef typename internal::traits<Input>::Index IndexType;
     typedef TensorRef<Tensor<typename internal::traits<Input>::Scalar, 4,
-                             internal::traits<Input>::Layout, IndexType> > Ref;
+                             internal::traits<Input>::Layout, IndexType> >
+        Ref;
     Ref in(input);
 
     DSizes<Index, 4> dims = in.dimensions();
@@ -62,12 +74,12 @@ struct GlimpseExtractionOp {
   }
 
   template <typename Input, typename Output, typename Device>
-  EIGEN_DEVICE_FUNC
-  void eval(const Input& input, Output& output, const Device& device) const
-  {
+  EIGEN_DEVICE_FUNC void eval(const Input& input, Output& output,
+                              const Device& device) const {
     typedef typename internal::traits<Input>::Index IndexType;
     typedef TensorRef<Tensor<typename internal::traits<Input>::Scalar, 4,
-                             internal::traits<Input>::Layout, IndexType> > Ref;
+                             internal::traits<Input>::Layout, IndexType> >
+        Ref;
     Ref in(input);
     const Index num_channels = in.dimension(0);
     const Index input_width = in.dimension(1);
@@ -215,21 +227,22 @@ struct GlimpseExtractionOp {
   const bool centered_;
   const bool uniform_noise_;
 };
-}
+}  // namespace
 
 template <typename Input>
-EIGEN_ALWAYS_INLINE
-static const TensorCustomUnaryOp<const GlimpseExtractionOp<typename internal::traits<Input>::Index>, const Input>
+EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp<
+    const GlimpseExtractionOp<typename internal::traits<Input>::Index>,
+    const Input>
 ExtractGlimpses(const Input& input,
                 const typename internal::traits<Input>::Index width,
                 const typename internal::traits<Input>::Index height,
                 const std::vector<IndexPair<float> >& offsets,
                 const bool normalized = true, const bool centered = true,
-                const bool uniform_noise = true)
-{
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
+                const bool uniform_noise = true) {
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
 
   typedef typename internal::traits<Input>::Index Index;
   const GlimpseExtractionOp<Index> op(width, height, offsets, normalized,
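A hedged usage sketch of the ExtractGlimpses helper documented above. The include path and the concrete sizes are assumptions; the argument order follows the signature shown in this diff:

// Hedged usage sketch; the header path below is an assumption, since the file
// name is not shown in this diff.
#include <vector>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/kernels/eigen_attention.h"

void GlimpseExample() {
  // Col-major input with layout (depth, width, height, batch).
  Eigen::Tensor<float, 4> input(3, 64, 64, 2);
  input.setRandom();
  // One (x, y) offset per batch image; centered coordinates live in [-1, 1].
  std::vector<Eigen::IndexPair<float>> offsets = {{0.0f, 0.0f}, {0.5f, -0.5f}};
  // The result keeps depth and batch but uses the requested 16 x 16 extent.
  Eigen::Tensor<float, 4> glimpses =
      Eigen::ExtractGlimpses(input, 16, 16, offsets,
                             /*normalized=*/true, /*centered=*/true,
                             /*uniform_noise=*/true);
}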

View File

@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenAttentionTest, Simple) {
   const ptrdiff_t depth = 3;

View File

@@ -25,7 +25,7 @@ void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
 static int ceil_div(int a, int b) { return (a + b - 1) / b; }
-}
+}  // namespace
 
 TEST(EigenBackwardSpatialConvolutionsTest,
      test_simple_spatial_convolution_backward_input_valid) {

View File

@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenPoolingTest, Simple) {
   const int depth = 10;

View File

@@ -25,9 +25,11 @@ namespace Eigen {
  *
  * \brief Applies a softmax
  *
- * The input parameter is expected to be a col-major tensor with a rank of 2 (depth and other).
+ * The input parameter is expected to be a col-major tensor with a rank of 2
+ * (depth and other).
  *
- * The result can be assigned to a tensor of rank and dimensions equal to that of the input. The result will be laid out in col-major order.
+ * The result can be assigned to a tensor of rank and dimensions equal to that
+ * of the input. The result will be laid out in col-major order.
  *
  */
@@ -41,8 +43,7 @@ struct SoftmaxOp {
   }
 
   template <typename Input, typename Output, typename Device>
-  void eval(const Input& input, Output& output, const Device& device) const
-  {
+  void eval(const Input& input, Output& output, const Device& device) const {
 #if !defined(EIGEN_HAS_INDEX_LIST)
     // nvcc doesn't support cxx11
     Eigen::array<typename internal::traits<Input>::Index, 1> depth_dim;
@@ -59,27 +60,35 @@ struct SoftmaxOp {
     Eigen::IndexList<Eigen::type2index<0> > depth_dim;
     Eigen::IndexList<int, Eigen::type2index<1> > bcast;
     bcast.set(0, dimensions(input)[0]);
-    Eigen::IndexList<Eigen::type2index<1>, typename internal::traits<Input>::Index> dims2d;
+    Eigen::IndexList<Eigen::type2index<1>,
+                     typename internal::traits<Input>::Index>
+        dims2d;
     dims2d.set(1, dimensions(input)[1]);
 #endif
 
-    output.device(device) = ((input - input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) * beta_).exp();
-    output.device(device) = output / (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
+    output.device(device) =
+        ((input -
+          input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) *
+         beta_)
+            .exp();
+    output.device(device) =
+        output /
+        (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
   }
 
  private:
   const float beta_;
 };
-}
+}  // namespace
 
 template <typename Input>
-EIGEN_ALWAYS_INLINE
-static const TensorCustomUnaryOp<const SoftmaxOp, const Input>
-SoftMax(const Input& input, const float beta)
-{
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
+EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp<const SoftmaxOp,
+                                                     const Input>
+SoftMax(const Input& input, const float beta) {
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 2,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
 
   const SoftmaxOp op(beta);
   return input.customOp(op);
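A hedged usage sketch of the SoftMax helper above. The include path is an assumption; beta = 1.0f gives the plain softmax over the depth dimension described in the comment:

// Hedged usage sketch; the header path below is an assumption, since the file
// name is not shown in this diff.
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/kernels/eigen_softmax.h"

void SoftmaxExample() {
  // Col-major rank-2 input with layout (depth, other), as the comment requires.
  Eigen::Tensor<float, 2> logits(10, 32);
  logits.setRandom();
  // beta scales the logits before exponentiation; 1.0f is the plain softmax.
  Eigen::Tensor<float, 2> probs = Eigen::SoftMax(logits, 1.0f);
}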

View File

@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenSoftmaxTest, Simple) {
   const int depth = 1024;

View File

@ -80,8 +80,9 @@ class EncodeJpegOp : public OpKernel {
errors::InvalidArgument("image must be 3-dimensional", errors::InvalidArgument("image must be 3-dimensional",
image.shape().DebugString())); image.shape().DebugString()));
OP_REQUIRES(context, FastBoundsCheck(image.NumElements(), OP_REQUIRES(
std::numeric_limits<int32>::max()), context,
FastBoundsCheck(image.NumElements(), std::numeric_limits<int32>::max()),
errors::InvalidArgument( errors::InvalidArgument(
"Cannot encode images with >= max int32 elements")); "Cannot encode images with >= max int32 elements"));
@ -100,8 +101,9 @@ class EncodeJpegOp : public OpKernel {
} else if (channels == 3) { } else if (channels == 3) {
adjusted_flags.format = jpeg::FORMAT_RGB; adjusted_flags.format = jpeg::FORMAT_RGB;
} else { } else {
OP_REQUIRES(context, false, errors::InvalidArgument( OP_REQUIRES(
"image must have 1 or 3 channels, got ", context, false,
errors::InvalidArgument("image must have 1 or 3 channels, got ",
image.shape().DebugString())); image.shape().DebugString()));
} }
} else { } else {

View File

@ -346,7 +346,8 @@ class SingleSequenceExampleParserOp : public OpKernel {
feature_list_sparse_keys[di].scalar<string>()(); feature_list_sparse_keys[di].scalar<string>()();
} }
OP_REQUIRES( OP_REQUIRES(
ctx, TensorShapeUtils::IsVector( ctx,
TensorShapeUtils::IsVector(
feature_list_dense_missing_assumed_empty->shape()), feature_list_dense_missing_assumed_empty->shape()),
errors::InvalidArgument( errors::InvalidArgument(
"Expected feature_list_dense_missing_assumed_empty ", "Expected feature_list_dense_missing_assumed_empty ",
@ -386,12 +387,12 @@ class SingleSequenceExampleParserOp : public OpKernel {
required[d] = (def_value.NumElements() == 0); // No default provided. required[d] = (def_value.NumElements() == 0); // No default provided.
if (def_value.NumElements() > 0) { if (def_value.NumElements() > 0) {
OP_REQUIRES( OP_REQUIRES(ctx, def_value.shape() == attrs_.context_dense_shapes[d],
ctx, def_value.shape() == attrs_.context_dense_shapes[d],
errors::InvalidArgument( errors::InvalidArgument(
"def_value[", d, "].shape() == ", "def_value[", d,
def_value.shape().DebugString(), " != context_dense_shapes_[", "].shape() == ", def_value.shape().DebugString(),
d, "] == ", attrs_.context_dense_shapes[d].DebugString())); " != context_dense_shapes_[", d,
"] == ", attrs_.context_dense_shapes[d].DebugString()));
OP_REQUIRES( OP_REQUIRES(
ctx, def_value.dtype() == attrs_.context_dense_types[d], ctx, def_value.dtype() == attrs_.context_dense_types[d],
errors::InvalidArgument( errors::InvalidArgument(
@ -576,12 +577,12 @@ class SingleSequenceExampleParserOp : public OpKernel {
const Feature& f = fl.feature(t); const Feature& f = fl.feature(t);
bool types_match; bool types_match;
OP_REQUIRES_OK(ctx, CheckTypesMatch(f, dtype, &types_match)); OP_REQUIRES_OK(ctx, CheckTypesMatch(f, dtype, &types_match));
OP_REQUIRES( OP_REQUIRES(ctx, types_match,
ctx, types_match,
errors::InvalidArgument( errors::InvalidArgument(
"Name: ", name, ", Feature list: ", key, ", Index: ", t, "Name: ", name, ", Feature list: ", key, ", Index: ", t,
". Data types don't match. ", "Expected type: ", ". Data types don't match. ",
DataTypeString(dtype), " Feature is: ", ProtoDebugString(f))); "Expected type: ", DataTypeString(dtype),
" Feature is: ", ProtoDebugString(f)));
OP_REQUIRES_OK(ctx, FeatureDenseCopy(t, name, key, dtype, shape, f, OP_REQUIRES_OK(ctx, FeatureDenseCopy(t, name, key, dtype, shape, f,
feature_list_dense_values[d])); feature_list_dense_values[d]));
} }

View File

@@ -122,13 +122,9 @@ static string D(const char* s) {
   return ret;
 }
 
-REGISTER_KERNEL_BUILDER(Name("Fact")
-                            .Device(DEVICE_CPU)
-                            .Label(D("Yoxmos").c_str()),
-                        FactOpKernel2);
-REGISTER_KERNEL_BUILDER(Name("Fact")
-                            .Device(DEVICE_CPU)
-                            .Label(D("yoxmos").c_str()),
-                        FactOpKernel2);
+REGISTER_KERNEL_BUILDER(
+    Name("Fact").Device(DEVICE_CPU).Label(D("Yoxmos").c_str()), FactOpKernel2);
+REGISTER_KERNEL_BUILDER(
+    Name("Fact").Device(DEVICE_CPU).Label(D("yoxmos").c_str()), FactOpKernel2);
 
 }  // namespace tensorflow

View File

@ -378,8 +378,7 @@ TEST_F(QuantOpsTest, WithArgsGradient_RegularRange) {
Tensor* output = GetOutput(0); Tensor* output = GetOutput(0);
auto input_flat = GetInput(0).flat<float>(); auto input_flat = GetInput(0).flat<float>();
Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3})); Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
FillValues<float>(&expected, FillValues<float>(&expected, {0.0f, input_flat(1), input_flat(2),
{0.0f, input_flat(1), input_flat(2),
input_flat(3), input_flat(4), 0.0f}); input_flat(3), input_flat(4), 0.0f});
ExpectClose(expected, *output); ExpectClose(expected, *output);
} }
@ -2167,21 +2166,19 @@ TEST_F(QuantOpsTest,
Tensor* output_bprop_wrt_input = GetOutput(0); Tensor* output_bprop_wrt_input = GetOutput(0);
Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3})); Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
auto grad_flat = GetInput(0).flat<float>(); auto grad_flat = GetInput(0).flat<float>();
FillValues<float>(&expected_bprop_wrt_input, FillValues<float>(
{0.0f, grad_flat(1), grad_flat(2), &expected_bprop_wrt_input,
grad_flat(3), grad_flat(4), 0.0f}); {0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f});
ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input); ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
Tensor* output_bprop_wrt_min = GetOutput(1); Tensor* output_bprop_wrt_min = GetOutput(1);
Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3})); Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3}));
FillValues<float>(&expected_bprop_wrt_min, FillValues<float>(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f});
{grad_flat(0), 0.0f, 0.0f});
ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min); ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
Tensor* output_bprop_wrt_max = GetOutput(2); Tensor* output_bprop_wrt_max = GetOutput(2);
Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3})); Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3}));
FillValues<float>(&expected_bprop_wrt_max, FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)});
{0.0f, 0.0f, grad_flat(5)});
ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max); ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
} }
@ -2215,21 +2212,19 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedUp_4Bits_NarrowRange) {
Tensor* output_bprop_wrt_input = GetOutput(0); Tensor* output_bprop_wrt_input = GetOutput(0);
Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3})); Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
auto grad_flat = GetInput(0).flat<float>(); auto grad_flat = GetInput(0).flat<float>();
FillValues<float>(&expected_bprop_wrt_input, FillValues<float>(
{0.0f, grad_flat(1), grad_flat(2), &expected_bprop_wrt_input,
grad_flat(3), grad_flat(4), 0.0f}); {0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f});
ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input); ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
Tensor* output_bprop_wrt_min = GetOutput(1); Tensor* output_bprop_wrt_min = GetOutput(1);
Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3})); Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3}));
FillValues<float>(&expected_bprop_wrt_min, FillValues<float>(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f});
{grad_flat(0), 0.0f, 0.0f});
ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min); ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
Tensor* output_bprop_wrt_max = GetOutput(2); Tensor* output_bprop_wrt_max = GetOutput(2);
Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3})); Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3}));
FillValues<float>(&expected_bprop_wrt_max, FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)});
{0.0f, 0.0f, grad_flat(5)});
ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max); ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
} }
@ -2270,8 +2265,7 @@ TEST_F(QuantOpsTest,
Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT,
TensorShape({1, 2, 3, 4})); TensorShape({1, 2, 3, 4}));
auto grad_flat = GetInput(0).flat<float>(); auto grad_flat = GetInput(0).flat<float>();
FillValues<float>( FillValues<float>(&expected_bprop_wrt_input,
&expected_bprop_wrt_input,
{0.0f, grad_flat(1), grad_flat(2), 0.0f, {0.0f, grad_flat(1), grad_flat(2), 0.0f,
0.0f, grad_flat(5), grad_flat(6), 0.0f, 0.0f, grad_flat(5), grad_flat(6), 0.0f,
0.0f, grad_flat(9), grad_flat(10), 0.0f, 0.0f, grad_flat(9), grad_flat(10), 0.0f,

View File

@ -255,8 +255,8 @@ void FIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
// TODO(josh11b): This makes two copies of callback, avoid this if possible. // TODO(josh11b): This makes two copies of callback, avoid this if possible.
dequeue_attempts_.emplace_back( dequeue_attempts_.emplace_back(
num_elements, [callback]() { callback(Tuple()); }, ctx, cm, token, num_elements, [callback]() { callback(Tuple()); }, ctx, cm, token,
[callback, allow_small_batch, this](Attempt* attempt) [callback, allow_small_batch,
EXCLUSIVE_LOCKS_REQUIRED(mu_) { this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
int64 queue_size = queues_[0].size(); int64 queue_size = queues_[0].size();
if (closed_ && queue_size < attempt->elements_requested) { if (closed_ && queue_size < attempt->elements_requested) {
@ -317,9 +317,8 @@ void FIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
const TensorShape shape = const TensorShape shape =
ManyOutShape(i, attempt->elements_requested); ManyOutShape(i, attempt->elements_requested);
Tensor element; Tensor element;
attempt->context->SetStatus( attempt->context->SetStatus(attempt->context->allocate_temp(
attempt->context->allocate_temp(component_dtypes_[i], component_dtypes_[i], shape, &element));
shape, &element));
if (!attempt->context->status().ok()) return kComplete; if (!attempt->context->status().ok()) return kComplete;
attempt->tuple.emplace_back(element); attempt->tuple.emplace_back(element);
} }
@ -327,8 +326,8 @@ void FIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
result = kProgress; result = kProgress;
Tuple tuple; Tuple tuple;
DequeueLocked(attempt->context, &tuple); DequeueLocked(attempt->context, &tuple);
const int64 index = attempt->tuple[0].dim_size(0) - const int64 index =
attempt->elements_requested; attempt->tuple[0].dim_size(0) - attempt->elements_requested;
for (int i = 0; i < num_components(); ++i) { for (int i = 0; i < num_components(); ++i) {
attempt->context->SetStatus(batch_util::CopyElementToSlice( attempt->context->SetStatus(batch_util::CopyElementToSlice(
std::move(tuple[i]), &attempt->tuple[i], index)); std::move(tuple[i]), &attempt->tuple[i], index));

View File

@ -18,8 +18,8 @@ limitations under the License.
#define EIGEN_USE_THREADS #define EIGEN_USE_THREADS
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.h"
#include "tensorflow/core/framework/variant_encode_decode.h" #include "tensorflow/core/framework/variant_encode_decode.h"
@ -118,7 +118,8 @@ DEFINE_SETONE_SYCL(double);
template <typename T> template <typename T>
struct FillFunctor<Eigen::ThreadPoolDevice, T> { struct FillFunctor<Eigen::ThreadPoolDevice, T> {
void operator()(const Eigen::ThreadPoolDevice& d, typename TTypes<T>::Flat out, void operator()(const Eigen::ThreadPoolDevice& d,
typename TTypes<T>::Flat out,
typename TTypes<T>::ConstScalar in) { typename TTypes<T>::ConstScalar in) {
out.device(d) = out.constant(in()); out.device(d) = out.constant(in());
} }
@ -150,8 +151,7 @@ struct FillFunctor<Eigen::SyclDevice, T> {
} }
}; };
#define DEFINE_FILL_SYCL(T) \ #define DEFINE_FILL_SYCL(T) template struct FillFunctor<Eigen::SyclDevice, T>;
template struct FillFunctor<Eigen::SyclDevice, T>;
DEFINE_FILL_SYCL(float); DEFINE_FILL_SYCL(float);
DEFINE_FILL_SYCL(double); DEFINE_FILL_SYCL(double);
TF_CALL_INTEGRAL_TYPES(DEFINE_FILL_SYCL) TF_CALL_INTEGRAL_TYPES(DEFINE_FILL_SYCL)

View File

@@ -232,7 +232,8 @@ class FractionalAvgPoolGradOp : public OpKernel {
     // Grab the inputs.
     const Tensor& orig_input_tensor_shape = context->input(0);
-    OP_REQUIRES(context, orig_input_tensor_shape.dims() == 1 &&
+    OP_REQUIRES(context,
+                orig_input_tensor_shape.dims() == 1 &&
                     orig_input_tensor_shape.NumElements() == 4,
                 errors::InvalidArgument("original input tensor shape must be"
                                         "1-dimensional and 4 elements"));

View File

@@ -253,8 +253,7 @@ class SymbolicGradientOp : public AsyncOpKernel {
       args.push_back(ctx->input(i));
     }
     std::vector<Tensor>* rets = new std::vector<Tensor>;
-    lib->Run(
-        opts, handle, args, rets, [ctx, done, rets](const Status& status) {
+    lib->Run(opts, handle, args, rets, [ctx, done, rets](const Status& status) {
       if (!status.ok()) {
         ctx->SetStatus(status);
       } else if (rets->size() != ctx->num_outputs()) {

View File

@@ -68,7 +68,8 @@ void InvVarianceToVariance<T>::operator()(const Eigen::GpuDevice& d,
 template <class T>
 void SetNanFunctor<T>::operator()(const Eigen::GpuDevice& d,
                                   typename TTypes<T>::Flat out) {
-  To32Bit(out).device(d) = To32Bit(out).constant(Eigen::NumTraits<T>::quiet_NaN());
+  To32Bit(out).device(d) =
+      To32Bit(out).constant(Eigen::NumTraits<T>::quiet_NaN());
 }
 
 template class VarianceToInvVariance<float>;

View File

@@ -18,12 +18,12 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/prefetch.h"
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
@@ -52,7 +52,8 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
   const size_t slice_bytes = slice_elems * sizeof(T);
   auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
   mutex mu;
-  // Store the value of invalidate index for printing error information, it's a shared variable.
+  // Store the value of invalidate index for printing error information, it's a
+  // shared variable.
   SliceIndex result = -1;
   auto work = [&](int64 start, int64 end) {
     SliceIndex batch_idx = static_cast<SliceIndex>(start / indices_size);
@@ -66,7 +67,8 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
       SliceIndex b_next = batch_idx + 1;
       if ((batch_idx == batch_idx_end && i_next < indices_idx_end) ||
           (i_next < indices_size)) {
-        port::prefetch<port::PREFETCH_HINT_T0>(&params(batch_idx, indices(i_next), 0));
+        port::prefetch<port::PREFETCH_HINT_T0>(
+            &params(batch_idx, indices(i_next), 0));
         port::prefetch<port::PREFETCH_HINT_T0>(&out(batch_idx, i_next, 0));
         b_next = batch_idx;
       } else if (b_next <= batch_idx_end) {
@@ -85,7 +87,8 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
       // ahead-of-time compilation binary size).
       if (is_simple_type<T>::value) {
         // Avoid auto-promotion to Index from SliceIndex by casting.
-        memcpy(out_base + (batch_idx * indices_size + indices_idx) * slice_elems,
+        memcpy(
+            out_base + (batch_idx * indices_size + indices_idx) * slice_elems,
                params_base + (batch_idx * static_cast<SliceIndex>(limit) +
                               static_cast<SliceIndex>(index)) *
                                  slice_elems,
@@ -99,8 +102,8 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
     }
   };
 
-  Shard(worker_threads->num_threads, worker_threads->workers, batch_size*indices_size,
-        slice_elems * sizeof(T), work);
+  Shard(worker_threads->num_threads, worker_threads->workers,
+        batch_size * indices_size, slice_elems * sizeof(T), work);
 
   return result;
 }
@@ -143,7 +146,8 @@ struct GatherFunctorCPU {
 
 template <typename Device, typename T, typename Index>
 struct GatherFunctor {
-  int64 operator()(OpKernelContext* ctx, typename TTypes<T, 3>::ConstTensor params,
+  int64 operator()(OpKernelContext* ctx,
+                   typename TTypes<T, 3>::ConstTensor params,
                    typename TTypes<Index>::ConstFlat indices,
                    typename TTypes<T, 3>::Tensor out);
 };
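HandleCopies shards batch_size * indices_size units of work, prefetches the next params and output slices, and for simple types copies one slice per index with memcpy. A standalone sketch of that core per-index copy (hypothetical names, not the TensorFlow functor):

// Standalone sketch (hypothetical names): for each gathered index, copy one
// slice of `slice_elems` elements from params into the packed output buffer.
#include <cstdint>
#include <cstring>
#include <vector>

template <typename T>
void GatherSlices(const T* params, const int64_t* indices,
                  int64_t indices_size, int64_t slice_elems, T* out) {
  for (int64_t i = 0; i < indices_size; ++i) {
    const int64_t index = indices[i];  // bounds-checked in the real kernel
    std::memcpy(out + i * slice_elems, params + index * slice_elems,
                slice_elems * sizeof(T));
  }
}

int main() {
  std::vector<float> params = {0, 0, 1, 1, 2, 2, 3, 3};  // 4 rows x 2 columns
  std::vector<int64_t> indices = {3, 1, 0};
  std::vector<float> out(indices.size() * 2);
  GatherSlices(params.data(), indices.data(), 3, 2, out.data());
  return 0;
}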

View File

@@ -106,8 +106,7 @@ class GatherOp : public OpKernel {
     auto out_flat = out->shaped<T, 3>({outer_size, N, inner_size});
 
     functor::GatherFunctor<Device, T, Index> functor;
-    int64 bad_i = functor(c, params_flat,
-                          indices_flat, out_flat);
+    int64 bad_i = functor(c, params_flat, indices_flat, out_flat);
 
     OP_REQUIRES(
         c, bad_i < 0,

View File

@@ -50,9 +50,8 @@ class HingeLossUpdater : public DualLossUpdater {
     // valid value for new dual = 0
     // c. new optimal value > 1.0. Then new optimal value should be set to 1.0.
     const double candidate_optimal_dual =
-        current_dual +
-        (label - wx) /
-            (num_loss_partitions * example_weight * weighted_example_norm);
+        current_dual + (label - wx) / (num_loss_partitions * example_weight *
+                                       weighted_example_norm);
     if (label * candidate_optimal_dual < 0) {
       return 0.0;
     }

View File

@ -17,16 +17,16 @@ limitations under the License.
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "tensorflow/core/kernels/histogram_op.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "external/cub_archive/cub/device/device_histogram.cuh" #include "external/cub_archive/cub/device/device_histogram.cuh"
#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/kernels/histogram_op.h"
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h" #include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h" #include "tensorflow/core/util/cuda_kernel_helper.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
namespace tensorflow { namespace tensorflow {
@ -104,8 +104,8 @@ struct HistogramFixedWidthFunctor<GPUDevice, T, Tout> {
/* num_samples */ num_samples, /* num_samples */ num_samples,
/* stream */ stream); /* stream */ stream);
if (err != cudaSuccess) { if (err != cudaSuccess) {
return errors::Internal("Could not launch HistogramRange: ", return errors::Internal(
cudaGetErrorString(err), "."); "Could not launch HistogramRange: ", cudaGetErrorString(err), ".");
} }
return Status::OK(); return Status::OK();

View File

@@ -109,7 +109,8 @@ struct ImageResizerState {
     ValidateAndCalculateOutputSize(context, input);
     if (!context->status().ok()) return;
     OP_REQUIRES_OK(context, context->allocate_output(
-                                0, TensorShape({input.dim_size(0), out_height,
+                                0,
+                                TensorShape({input.dim_size(0), out_height,
                                                 out_width, input.dim_size(3)}),
                                 &output));
   }
@@ -168,7 +169,8 @@ struct ImageResizerGradientState {
         CalculateResizeScale(original_width, resized_width, align_corners_);
     output = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(
-                                0, TensorShape({batch_size, original_height,
+                                0,
+                                TensorShape({batch_size, original_height,
                                                 original_width, channels}),
                                 &output));
   }
