Branch 183429339 (#16469)

* Change `reduce_logsumexp` to internally use `reshape` rather than `squeeze`
since the latter requires the `axis` arg to be a Python `list`.

PiperOrigin-RevId: 183396533

* Kernel utils to support broadcast add and mul.

PiperOrigin-RevId: 183397494

* Updating sparsify_gather.

PiperOrigin-RevId: 183402917

* [tf.data] Move slow-path-related code into the slow path in IteratorHandleOp::Compute().

This slightly reduces the amount of work performed when an iterator is accessed (after the first access), and potentially reduces contention if concurrent steps are accessing the same iterator.

PiperOrigin-RevId: 183406221
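As a rough illustration of this restructuring (a generic sketch, not the actual IteratorHandleOp code; all names below are invented), the idea is to keep the per-access path trivial and confine creation and validation work to a branch that only runs on first access:

#include <memory>
#include <mutex>
#include <string>

// Illustrative stand-ins for the op's context and resource types.
struct IteratorResource {
  std::string handle = "iterator_handle:0";
};

class HandleOpSketch {
 public:
  // Fast path: after the first call, Compute() only reads the cached resource.
  // Slow path: creation and validation run once, inside the branch.
  const std::string& Compute() {
    std::call_once(init_once_, [this] {
      resource_ = std::make_unique<IteratorResource>();  // expensive setup
    });
    return resource_->handle;
  }

 private:
  std::once_flag init_once_;
  std::unique_ptr<IteratorResource> resource_;
};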

* Cleanup: Ran clang-format on all *.{cc,h} under grappler.

PiperOrigin-RevId: 183406440

* Increase shard count of //third_party/tensorflow/python:nn_batchnorm_test to avoid timeouts

When run under asan, the test runs for about 5 minutes, and sometimes
longer, causing frequent timeouts.

This change increases the shard count of the test to 4, which brings the run time
of the longest running shard under asan to about 2 minutes.

PiperOrigin-RevId: 183414888

* Add available choices to toco flags and fix minor formatting issues.

PiperOrigin-RevId: 183415713

* Performance improvements to some GPU code to use shared locks instead of unique locks for some hotspot cases.

PiperOrigin-RevId: 183418559
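The pattern behind this change (visible later in this commit in the ProcessState::GetCUDAHostAllocator hunk, which takes tf_shared_lock on the read path and mutex_lock on the populate path) is the usual reader/writer split. A minimal portable sketch, with invented names and std::shared_mutex standing in for TensorFlow's mutex types:

#include <shared_mutex>
#include <string>
#include <unordered_map>

// Reads vastly outnumber writes, so the common case takes a shared (reader)
// lock; only a miss takes the exclusive (writer) lock and populates the map.
class SharedLockCacheSketch {
 public:
  int GetOrCreate(const std::string& key) {
    {
      std::shared_lock<std::shared_mutex> lock(mu_);  // fast path, concurrent
      auto it = ids_.find(key);
      if (it != ids_.end()) return it->second;
    }
    std::unique_lock<std::shared_mutex> lock(mu_);  // slow path, exclusive
    auto it = ids_.find(key);                       // re-check after upgrading
    if (it == ids_.end()) it = ids_.emplace(key, next_id_++).first;
    return it->second;
  }

 private:
  std::shared_mutex mu_;
  std::unordered_map<std::string, int> ids_;
  int next_id_ = 0;
};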

* [XLA] Improve error message for bad slices.

PiperOrigin-RevId: 183420038

* Fix py3 build rules for all py tests under py2tf.

PiperOrigin-RevId: 183422144

* Fix bug with Operation._control_inputs setter.

PiperOrigin-RevId: 183422192

* Make softmax_op_test.py work with C API enabled.

PiperOrigin-RevId: 183422829

* Cleanup: Ran clang-format on all *.{cc,h} files in tensorflow/core/kernels.

PiperOrigin-RevId: 183423961

* Fix the dense layer documentation to describe how rank > 2 inputs are handled.

PiperOrigin-RevId: 183425868

* Cleanup: Ran clang-format on all *.{cc,h} in tensorflow/core/ops.

PiperOrigin-RevId: 183429339
Rasmus Munk Larsen, 2018-01-26 13:32:16 -08:00 (committed by GitHub)
commit 982549ea34, parent f84623507b
325 changed files with 4706 additions and 4373 deletions

@@ -37,6 +37,9 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
+using tensorflow::str_util::Join;
+using tensorflow::strings::Printf;
 namespace xla {
 namespace {
@@ -934,7 +937,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       "inferring shape for <%s>(%s, %s) with broadcast_dimensions={%s}",
       BinaryOperation_Name(operation).c_str(),
       ShapeUtil::HumanString(lhs).c_str(), ShapeUtil::HumanString(rhs).c_str(),
-      tensorflow::str_util::Join(broadcast_dimensions, ", ").c_str());
+      Join(broadcast_dimensions, ", ").c_str());
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs));
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs));
@@ -1097,7 +1100,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "Map operation requires all operands to have the same shape; got: "
         "%s",
-        tensorflow::str_util::Join(pieces, ", ").c_str());
+        Join(pieces, ", ").c_str());
   }
   // Check that dimensions.size == arg_shape.dimensions_size() (we currently
@@ -1114,7 +1117,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     if (dimensions[i] != i) {
       return InvalidArgument(
           "Map requires monotonically increasing dimension numbers, found: %s ",
-          tensorflow::str_util::Join(dimensions, ", ").c_str());
+          Join(dimensions, ", ").c_str());
     }
   }
@@ -1914,21 +1917,28 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     const Shape& arg, tensorflow::gtl::ArraySlice<int64> starts,
     tensorflow::gtl::ArraySlice<int64> limits,
     tensorflow::gtl::ArraySlice<int64> strides) {
+  auto error = [&](const string& message) {
+    return InvalidArgument(
+        "%s in slice operation; argument shape: %s; starts: {%s}; limits: "
+        "{%s}; strides: {%s}",
+        message.c_str(), ShapeUtil::HumanString(arg).c_str(),
+        Join(starts, ",").c_str(), Join(limits, ",").c_str(),
+        Join(strides, ",").c_str());
+  };
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(arg, "operand of slice"));
   VLOG(2) << tensorflow::strings::Printf(
       "slicing shape %s starts={%s} limits={%s}",
-      ShapeUtil::HumanString(arg).c_str(),
-      tensorflow::str_util::Join(starts, ", ").c_str(),
-      tensorflow::str_util::Join(limits, ", ").c_str());
+      ShapeUtil::HumanString(arg).c_str(), Join(starts, ", ").c_str(),
+      Join(limits, ", ").c_str());
   if (starts.size() != limits.size()) {
-    return InvalidArgument("slice start and limit sizes differ: %zu vs %zu",
-                           starts.size(), limits.size());
+    return error(Printf("slice start and limit sizes differ: %zu vs %zu",
+                        starts.size(), limits.size()));
   }
   if (starts.size() != strides.size()) {
-    return InvalidArgument("slice start and strides sizes differ: %zu vs %zu",
-                           starts.size(), strides.size());
+    return error(Printf("slice start and strides sizes differ: %zu vs %zu",
+                        starts.size(), strides.size()));
   }
   if (starts.size() != ShapeUtil::Rank(arg)) {
@@ -1947,20 +1957,20 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
           start_index);
     }
     if (limit_index > arg.dimensions(dimension)) {
-      return InvalidArgument(
-          "limit index (%lld) must be less than or equal to dimension "
-          "size (%lld)",
-          limit_index, arg.dimensions(dimension));
+      return error(
+          Printf("limit index (%lld) must be less than or equal to dimension "
+                 "size (%lld)",
+                 limit_index, arg.dimensions(dimension)));
     }
     VLOG(2) << tensorflow::strings::Printf("starts[%lld] = %lld", dimension,
                                            start_index);
     VLOG(2) << tensorflow::strings::Printf("limits[%lld] = %lld", dimension,
                                            limit_index);
     if (start_index > limit_index) {
-      return InvalidArgument(
-          "limit index (%lld) must be greater or equal to "
-          "start index (%lld) in slice with positive stride",
-          limit_index, start_index);
+      return error(
+          Printf("limit index (%lld) must be greater or equal to "
                 "start index (%lld) in slice with positive stride",
                 limit_index, start_index));
     }
     if (stride <= 0) {
       return InvalidArgument("stride (%lld) must be positive", stride);
@@ -1983,7 +1993,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       "slicing shape %s at dynamic start_indices %s with slice_sizes={%s}",
       ShapeUtil::HumanString(operand_shape).c_str(),
       ShapeUtil::HumanString(start_indices_shape).c_str(),
-      tensorflow::str_util::Join(slice_sizes, ", ").c_str());
+      Join(slice_sizes, ", ").c_str());
   if (ShapeUtil::Rank(start_indices_shape) != 1) {
     return InvalidArgument(
@@ -2280,8 +2290,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "Reshape dimensions [%s] are not a permutation of the operand "
         "dimensions (operand shape is %s).",
-        tensorflow::str_util::Join(dimensions, ",").c_str(),
-        ShapeUtil::HumanString(operand).c_str());
+        Join(dimensions, ",").c_str(), ShapeUtil::HumanString(operand).c_str());
   }
   return inferred_shape;
@@ -2373,8 +2382,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   // The applied function's arity equals the number of arguments.
   if (arg_shapes.size() != to_apply.parameters_size()) {
     string computation_signature = ShapeUtil::HumanString(to_apply);
-    string argument_shapes = tensorflow::str_util::Join(
-        arg_shapes, ", ", [](string* out, const Shape* shape) {
+    string argument_shapes =
+        Join(arg_shapes, ", ", [](string* out, const Shape* shape) {
           tensorflow::strings::StrAppend(out, ShapeUtil::HumanString(*shape));
         });
     return InvalidArgument(

@@ -1512,5 +1512,20 @@ TEST_F(ShapeInferenceTest, Conditional) {
                        "must have the same shape"));
 }
+TEST_F(ShapeInferenceTest, BadSlice) {
+  auto arg = ShapeUtil::MakeShape(F32, {4});
+  StatusOr<Shape> statusor =
+      ShapeInference::InferSliceShape(arg, {0}, {5}, {1});
+  ASSERT_FALSE(statusor.ok());
+  LOG(INFO) << statusor.status();
+  EXPECT_THAT(statusor.status().error_message(),
+              HasSubstr("less than or equal to dimension size"))
+      << statusor.status();
+  EXPECT_THAT(statusor.status().error_message(), HasSubstr("argument shape"))
+      << statusor.status();
+}
 }  // namespace
 }  // namespace xla

@@ -71,6 +71,32 @@ cc_library(
     ],
 )
+cc_library(
+    name = "kernel_util",
+    srcs = [
+        "kernel_util.cc",
+    ],
+    hdrs = [
+        "kernel_util.h",
+    ],
+    deps = [
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite:context",
+        "//tensorflow/contrib/lite/kernels/internal:round",
+    ],
+)
+
+tf_cc_test(
+    name = "kernel_util_test",
+    size = "small",
+    srcs = ["kernel_util_test.cc"],
+    deps = [
+        ":kernel_util",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_library(
     name = "builtin_ops",
     srcs = [
@@ -87,7 +113,6 @@ cc_library(
         "fully_connected.cc",
         "gather.cc",
         "hashtable_lookup.cc",
-        "kernel_util.cc",
         "l2norm.cc",
         "local_response_norm.cc",
         "lsh_projection.cc",
@@ -111,7 +136,6 @@ cc_library(
         "unidirectional_sequence_rnn.cc",
     ],
     hdrs = [
-        "kernel_util.h",
         "padding.h",
         "register.h",
     ],
@@ -125,6 +149,7 @@ cc_library(
     }),
     deps = [
         ":activation_functor",
+        ":kernel_util",
         ":op_macros",
         "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite:framework",

@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/kernels/kernel_util.h"
 #include <algorithm>
 #include <cmath>
+#include <memory>
 #include "tensorflow/contrib/lite/kernels/internal/round.h"
 namespace tflite {
@@ -84,4 +87,27 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
   }
 }
+bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2) {
+  return TfLiteIntArrayEqual(input1->dims, input2->dims);
+}
+
+TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
+                                        TfLiteTensor* input1,
+                                        TfLiteTensor* input2,
+                                        TfLiteIntArray** output_shape) {
+  int64_t dims1 = NumDimensions(input1);
+  int64_t dims2 = NumDimensions(input2);
+  int64_t out_dims = std::max(dims1, dims2);
+  std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> shape(
+      TfLiteIntArrayCreate(out_dims), TfLiteIntArrayFree);
+  for (int i = 0; i < out_dims; ++i) {
+    int64_t d1 = i >= dims1 ? 1 : SizeOfDimension(input1, dims1 - i - 1);
+    int64_t d2 = i >= dims2 ? 1 : SizeOfDimension(input2, dims2 - i - 1);
+    TF_LITE_ENSURE(context, d1 == d2 || d1 == 1 || d2 == 1);
+    shape->data[out_dims - i - 1] = std::max(d1, d2);
+  }
+  *output_shape = shape.release();
+  return kTfLiteOk;
+}
+
 }  // namespace tflite
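A sketch of how a broadcasting binary kernel's Prepare() might use the new helpers; the op and function below are hypothetical and not part of this commit, and only HaveSameShapes and CalculateShapeForBroadcast come from the change above:

// Hypothetical Prepare() for an element-wise op with broadcasting support.
TfLiteStatus PrepareBroadcastableOp(TfLiteContext* context, TfLiteNode* node) {
  TfLiteTensor* input1 = GetInput(context, node, 0);
  TfLiteTensor* input2 = GetInput(context, node, 1);
  TfLiteTensor* output = GetOutput(context, node, 0);

  TfLiteIntArray* output_shape = nullptr;
  if (HaveSameShapes(input1, input2)) {
    output_shape = TfLiteIntArrayCopy(input1->dims);  // no broadcasting needed
  } else {
    // Reports an error on the context and fails if the shapes cannot broadcast.
    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
                                   context, input1, input2, &output_shape));
  }
  // ResizeTensor takes ownership of output_shape.
  return context->ResizeTensor(context, output, output_shape);
}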

@@ -35,6 +35,14 @@ inline TfLiteTensor* GetOutput(TfLiteContext* context, TfLiteNode* node,
 inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; }
 inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; }
+inline int64_t NumElements(const TfLiteTensor* t) {
+  int64_t count = 1;
+  for (int i = 0; i < NumDimensions(t); ++i) {
+    count *= SizeOfDimension(t, i);
+  }
+  return count;
+}
+
 inline TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context,
                                             const TfLiteNode* node, int index) {
   const bool use_tensor = node->inputs->data[index] != kOptionalTensor;
@@ -76,6 +84,15 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
                                    float* activation_min,
                                    float* activation_max);
+// Return true if the given tensors have the same shape.
+bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2);
+
+// Calculate the output_shape that is necessary for element-wise operations
+// with broadcasting involving the two input tensors.
+TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
+                                        TfLiteTensor* input1,
+                                        TfLiteTensor* input2,
+                                        TfLiteIntArray** output_shape);
+
 }  // namespace tflite
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_

@@ -0,0 +1,150 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/lite/kernels/kernel_util.h"
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/contrib/lite/testing/util.h"
namespace tflite {
namespace {
void ReportError(TfLiteContext* context, const char* format, ...) {}
class KernelUtilTest : public ::testing::Test {
public:
KernelUtilTest() {
context_.ReportError = ReportError;
tensor1_.dims = nullptr;
tensor2_.dims = nullptr;
}
~KernelUtilTest() {
TfLiteTensorFree(&tensor1_);
TfLiteTensorFree(&tensor2_);
}
void SetShape(TfLiteTensor* tensor, std::initializer_list<int> dims) {
TfLiteTensorFree(tensor);
tensor->dims = TfLiteIntArrayCreate(dims.size());
int i = 0;
for (int d : dims) {
tensor->dims->data[i] = d;
++i;
}
}
std::vector<int> GetShape(TfLiteIntArray* dims) {
std::vector<int> result;
for (int i = 0; i < dims->size; ++i) {
result.push_back(dims->data[i]);
}
return result;
}
protected:
TfLiteContext context_;
TfLiteTensor tensor1_;
TfLiteTensor tensor2_;
};
TEST_F(KernelUtilTest, SameShapeEmpty) {
EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor1_, {1, 2, 3});
EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor2_, {1, 2});
EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor2_, {1, 2, 3, 4});
EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor2_, {1, 2, 3});
EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor2_, {});
EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor1_, {});
EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
}
TEST_F(KernelUtilTest, BroadcastShapeIncompatibleDim) {
TfLiteIntArray* output = nullptr;
SetShape(&tensor1_, {1, 2});
SetShape(&tensor2_, {1, 3});
EXPECT_NE(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
EXPECT_EQ(output, nullptr);
}
TEST_F(KernelUtilTest, BroadcastShapeOnes) {
TfLiteIntArray* output = nullptr;
SetShape(&tensor1_, {1, 1});
SetShape(&tensor2_, {1, 3});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
TfLiteIntArrayFree(output);
SetShape(&tensor1_, {1, 2});
SetShape(&tensor2_, {1, 1});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
TfLiteIntArrayFree(output);
}
TEST_F(KernelUtilTest, BroadcastShapeScalars) {
TfLiteIntArray* output = nullptr;
SetShape(&tensor1_, {1, 2});
SetShape(&tensor2_, {});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2));
TfLiteIntArrayFree(output);
SetShape(&tensor1_, {});
SetShape(&tensor2_, {2});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
EXPECT_THAT(GetShape(output), ::testing::ElementsAre(2));
TfLiteIntArrayFree(output);
}
TEST_F(KernelUtilTest, BroadcastShapeDifferentSizes) {
TfLiteIntArray* output = nullptr;
SetShape(&tensor1_, {1, 2});
SetShape(&tensor2_, {3, 1, 1});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
EXPECT_THAT(GetShape(output), ::testing::ElementsAre(3, 1, 2));
TfLiteIntArrayFree(output);
SetShape(&tensor1_, {1, 2, 3, 4});
SetShape(&tensor2_, {1, 3, 1});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2, 3, 4));
TfLiteIntArrayFree(output);
}
} // namespace
} // namespace tflite
int main(int argc, char** argv) {
::tflite::LogToStderr();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

@@ -44,9 +44,11 @@ bool ParseTocoFlagsFromCommandLineFlags(
            "For Protobuf formats, the binary format will be used."),
       Flag("input_format", parsed_flags.input_format.bind(),
            parsed_flags.input_format.default_value(),
-           "Input file format. One of: tensorflow_graphdef, "),
+           "Input file format. One of: TENSORFLOW_GRAPHDEF, TFLITE."),
       Flag("output_format", parsed_flags.output_format.bind(),
-           parsed_flags.output_format.default_value(), "Output file format."),
+           parsed_flags.output_format.default_value(),
+           "Output file format. "
+           "One of TENSORFLOW_GRAPHDEF, TFLITE, GRAPHVIZ_DOT."),
       Flag("default_ranges_min", parsed_flags.default_ranges_min.bind(),
            parsed_flags.default_ranges_min.default_value(),
            "If defined, will be used as the default value for the min bound "
@@ -58,11 +60,13 @@ bool ParseTocoFlagsFromCommandLineFlags(
       Flag("inference_type", parsed_flags.inference_type.bind(),
            parsed_flags.inference_type.default_value(),
            "Target data type of arrays in the output file (for input_arrays, "
-           "this may be overridden by inference_input_type)."),
+           "this may be overridden by inference_input_type). "
+           "One of FLOAT, QUANTIZED_UINT8."),
       Flag("inference_input_type", parsed_flags.inference_input_type.bind(),
            parsed_flags.inference_input_type.default_value(),
-           "Target data type of input arrays. If not specified, inference_type "
-           "is used."),
+           "Target data type of input arrays. "
+           "If not specified, inference_type is used. "
+           "One of FLOAT, QUANTIZED_UINT8."),
       Flag("input_type", parsed_flags.input_type.bind(),
            parsed_flags.input_type.default_value(),
            "Deprecated ambiguous flag that set both --input_data_types and "
@@ -76,35 +80,31 @@ bool ParseTocoFlagsFromCommandLineFlags(
       Flag("drop_fake_quant", parsed_flags.drop_fake_quant.bind(),
            parsed_flags.drop_fake_quant.default_value(),
-           "Ignore and discard FakeQuant nodes. For instance, that can be used "
-           "to "
+           "Ignore and discard FakeQuant nodes. For instance, to "
            "generate plain float code without fake-quantization from a "
-           "quantized "
-           "graph."),
+           "quantized graph."),
       Flag(
           "reorder_across_fake_quant",
           parsed_flags.reorder_across_fake_quant.bind(),
          parsed_flags.reorder_across_fake_quant.default_value(),
          "Normally, FakeQuant nodes must be strict boundaries for graph "
          "transformations, in order to ensure that quantized inference has "
-          "the "
-          "exact same arithmetic behavior as quantized training --- which is "
-          "the "
-          "whole point of quantized training and of FakeQuant nodes in the "
-          "first "
-          "place. However, that entails subtle requirements on where exactly "
+          "the exact same arithmetic behavior as quantized training --- which "
+          "is the whole point of quantized training and of FakeQuant nodes in "
+          "the first place. "
+          "However, that entails subtle requirements on where exactly "
          "FakeQuant nodes must be placed in the graph. Some quantized graphs "
          "have FakeQuant nodes at unexpected locations, that prevent graph "
          "transformations that are necessary in order to generate inference "
          "code for these graphs. Such graphs should be fixed, but as a "
          "temporary work-around, setting this reorder_across_fake_quant flag "
-          "allows toco to perform necessary graph transformaitons on them, "
+          "allows TOCO to perform necessary graph transformaitons on them, "
          "at the cost of no longer faithfully matching inference and training "
          "arithmetic."),
       Flag("allow_custom_ops", parsed_flags.allow_custom_ops.bind(),
            parsed_flags.allow_custom_ops.default_value(),
-           "If true, allow TOCO to create TF Lite Custom operators for all the"
-           "unsupported Tensorflow ops."),
+           "If true, allow TOCO to create TF Lite Custom operators for all the "
+           "unsupported TensorFlow ops."),
       Flag(
           "drop_control_dependency",
           parsed_flags.drop_control_dependency.bind(),

@@ -57,6 +57,7 @@ py_library(
 py_test(
     name = "api_test",
     srcs = ["api_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":py2tf_internal",
         "//tensorflow/python:client_testlib",
@@ -66,6 +67,7 @@ py_test(
 py_test(
     name = "conversion_test",
     srcs = ["conversion_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":py2tf_internal",
         "//tensorflow/python:client_testlib",
@@ -76,6 +78,7 @@ py_test(
 py_test(
     name = "naming_test",
     srcs = ["naming_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":py2tf_internal",
         "//tensorflow/python:client_testlib",

@@ -52,6 +52,7 @@ py_library(
 py_test(
     name = "break_canonicalization_test",
     srcs = ["break_canonicalization_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -62,6 +63,7 @@ py_test(
 py_test(
     name = "call_trees_test",
     srcs = ["call_trees_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -72,6 +74,7 @@ py_test(
 py_test(
     name = "continue_canonicalization_test",
     srcs = ["continue_canonicalization_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -82,6 +85,7 @@ py_test(
 py_test(
     name = "control_flow_test",
     srcs = ["control_flow_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -92,6 +96,7 @@ py_test(
 py_test(
     name = "builtin_functions_test",
     srcs = ["builtin_functions_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -112,6 +117,7 @@ py_test(
 py_test(
     name = "logical_expressions_test",
     srcs = ["logical_expressions_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -122,6 +128,7 @@ py_test(
 py_test(
     name = "print_functions_test",
     srcs = ["print_functions_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -133,6 +140,7 @@ py_test(
 py_test(
     name = "side_effect_guards_test",
     srcs = ["side_effect_guards_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",

@@ -38,6 +38,7 @@ py_library(
 py_test(
     name = "anno_test",
     srcs = ["anno_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -47,6 +48,7 @@ py_test(
 py_test(
     name = "compiler_test",
     srcs = ["compiler_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -57,6 +59,7 @@ py_test(
 py_test(
     name = "parser_test",
     srcs = ["parser_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -66,6 +69,7 @@ py_test(
 py_test(
     name = "pretty_printer_test",
     srcs = ["pretty_printer_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -75,6 +79,7 @@ py_test(
 py_test(
     name = "templates_test",
     srcs = ["templates_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",

@@ -32,6 +32,7 @@ py_library(
 py_test(
     name = "access_test",
     srcs = ["access_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":static_analysis",
         "//tensorflow/contrib/py2tf/pyct",
@@ -43,6 +44,7 @@ py_test(
 py_test(
     name = "live_values_test",
     srcs = ["live_values_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":static_analysis",
         "//tensorflow/contrib/py2tf/pyct",
@@ -53,6 +55,7 @@ py_test(
 py_test(
     name = "type_info_test",
     srcs = ["type_info_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":static_analysis",
         "//tensorflow/contrib/py2tf/pyct",

@@ -230,8 +230,24 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
   // TODO(tucker): actually maintain separate CPUAllocators for
   // different numa_nodes. For now, just one.
   numa_node = 0;
-  mutex_lock lock(mu_);
+  {
+    // Here we optimize the most common use case where cuda_host_allocators_
+    // and cuda_al_ have already been populated and since we're only reading
+    // these vectors, we can get by with a shared lock. In the slower case,
+    // we take a unique lock and populate these vectors.
+    tf_shared_lock lock(mu_);
+
+    if (FLAGS_brain_gpu_record_mem_types &&
+        static_cast<int>(cuda_al_.size()) > 0) {
+      return cuda_al_[0];
+    }
+    if (static_cast<int>(cuda_host_allocators_.size()) > numa_node) {
+      return cuda_host_allocators_[0];
+    }
+  }
+
+  mutex_lock lock(mu_);
+
   // Find the first valid StreamExecutor to request CUDA host memory
   // through, since any will work.
   //

@@ -23,8 +23,7 @@ Cluster::Cluster(int timeout_s) : timeout_s_(timeout_s) {
   DisableDetailedStats(false);
 }
-Cluster::~Cluster() {
-}
+Cluster::~Cluster() {}
 void Cluster::AllowSoftPlacement(bool soft_placement_state) {
   options_.config.set_allow_soft_placement(soft_placement_state);

@@ -325,7 +325,7 @@ class VirtualScheduler {
   // Boolean field for whether the cost is accurate.
   std::map<string, std::pair<int, bool>> op_costs_;
-  Costs graph_costs_;  // Graph cost.
+  Costs graph_costs_;                   // Graph cost.
   std::map<string, Costs> op_to_cost_;  // Per-op cost.
   // Auxilliary data structures for constructing NodeState and DeviceState.

@@ -16,8 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
 #define TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
-#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/framework/variable.pb.h"
+#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/lib/core/status.h"
 namespace tensorflow {

@@ -40,8 +40,8 @@ typedef Eigen::SyclDevice SYCLDevice;
 template <typename Device, typename T>
 class AdjustContrastOp : public OpKernel {
  public:
-  explicit AdjustContrastOp(OpKernelConstruction* context) : OpKernel(context) {
-  }
+  explicit AdjustContrastOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
   void Compute(OpKernelContext* context) override {
     const Tensor& input = context->input(0);

@@ -29,8 +29,7 @@ limitations under the License.
 namespace tensorflow {
-class AdjustContrastOpTest : public OpsTestBase {
-};
+class AdjustContrastOpTest : public OpsTestBase {};
 TEST_F(AdjustContrastOpTest, Simple_1113) {
   TF_EXPECT_OK(NodeDefBuilder("adjust_contrast_op", "AdjustContrastv2")

@@ -192,8 +192,9 @@ class AdjustSaturationOp<CPUDevice> : public AdjustSaturationOpBase {
     const DeviceBase::CpuWorkerThreads& worker_threads =
         *context->device()->tensorflow_cpu_worker_threads();
     Shard(worker_threads.num_threads, worker_threads.workers, channel_count,
-          kCostPerChannel, [channel_count, &input_data, &output_data, scale_h](
-                               int64 start_channel, int64 end_channel) {
+          kCostPerChannel,
+          [channel_count, &input_data, &output_data, scale_h](
+              int64 start_channel, int64 end_channel) {
             const float* p = input_data.data() + start_channel * kChannelSize;
             float* q = output_data.data() + start_channel * kChannelSize;
             for (int i = start_channel; i < end_channel; i++) {

@@ -25,7 +25,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 namespace tensorflow {
@@ -201,7 +201,7 @@ struct Add7Functor<SYCLDevice, T> {
                   typename TTypes<T>::ConstFlat in6,
                   typename TTypes<T>::ConstFlat in7) {
     Add7EigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-        in7);
+                                          in7);
   }
 };
@@ -214,7 +214,7 @@ struct Add8Functor<SYCLDevice, T> {
       typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
       typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
     Add8EigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-        in7, in8);
+                                          in7, in8);
   }
 };
@@ -227,7 +227,7 @@ struct Add8pFunctor<SYCLDevice, T> {
      typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
      typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
    Add8pEigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-        in7, in8);
+                                           in7, in8);
   }
 };
@@ -241,10 +241,10 @@ struct Add9Functor<SYCLDevice, T> {
      typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8,
      typename TTypes<T>::ConstFlat in9) {
    Add9EigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-        in7, in8, in9);
+                                          in7, in8, in9);
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 } // namespace functor

@@ -52,8 +52,9 @@ class ExtractGlimpseOp : public OpKernel {
     const int64 batch_size = input_shape.dim_size(0);
     const Tensor& window_size = context->input(1);
-    OP_REQUIRES(context, (window_size.shape().dims() == 1) &&
-                             window_size.shape().dim_size(0) == 2,
+    OP_REQUIRES(context,
+                (window_size.shape().dims() == 1) &&
+                    window_size.shape().dim_size(0) == 2,
                 errors::InvalidArgument(
                     "input must be a vector of size 2 (height, width)",
                     window_size.shape().DebugString()));

@@ -48,9 +48,8 @@ struct SpatialAvgPooling {
 typedef Eigen::GpuDevice GPUDevice;
-// Launch a custom GPU kernels from Yanqing for the avgpooling backward operation
-// that works NHWC data formats.
-// Arguments:
+// Launch a custom GPU kernels from Yanqing for the avgpooling backward
+// operation that works NHWC data formats. Arguments:
 //    top_diff: backprop to the output of the pooling layer
 //    num: number of input batches
 //    height: input height

@@ -71,8 +71,8 @@ __global__ void AvePoolBackwardNHWC(const int nthreads,
         hstart = max(hstart, 0);
         wstart = max(wstart, 0);
         int pool_size = (hend - hstart) * (wend - wstart);
-        gradient +=
-            top_diff_slice[(ph * pooled_width + pw) * channels] / dtype(pool_size);
+        gradient += top_diff_slice[(ph * pooled_width + pw) * channels] /
+                    dtype(pool_size);
       }
     }
     bottom_diff[index] = gradient;
@@ -90,11 +90,11 @@ bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num,
                             const GPUDevice& d) {
   int x_size = num * height * width * channels;
   CudaLaunchConfig config = GetCudaLaunchConfig(x_size, d);
-  AvePoolBackwardNHWC<
-      T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-      config.virtual_thread_count, top_diff, num, height, width, channels,
-      pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w,
-      pad_t, pad_t, bottom_diff);
+  AvePoolBackwardNHWC<T>
+      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+          config.virtual_thread_count, top_diff, num, height, width, channels,
+          pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w,
+          pad_t, pad_t, bottom_diff);
   return d.ok();
 }

@@ -111,13 +111,14 @@ class Barrier : public ResourceBase {
     mutex_lock lock(mu_);
     if (closed_) {
       OP_REQUIRES_ASYNC(
-          ctx, !cancel_pending_enqueues_ &&
-                   (num_inserted == 0 || !incomplete_.empty()),
+          ctx,
+          !cancel_pending_enqueues_ &&
+              (num_inserted == 0 || !incomplete_.empty()),
           errors::Cancelled(
               "Barrier ", name_, " is closed. Pending enqueues cancelled: ",
-              cancel_pending_enqueues_, ". Number of new insertions: ",
-              num_inserted, ". Number of incomplete keys: ",
-              incomplete_.size(), "."),
+              cancel_pending_enqueues_,
+              ". Number of new insertions: ", num_inserted,
+              ". Number of incomplete keys: ", incomplete_.size(), "."),
           callback);
     }
@@ -128,9 +129,10 @@ class Barrier : public ResourceBase {
     for (int i = 0; i < num_inserted; ++i) {
       OP_REQUIRES_OK_ASYNC(
-          ctx, InsertOneLocked<T>(ctx, keys, values, element_shape,
-                                  component_index, i, &ready_tuples,
-                                  &new_elements),
+          ctx,
+          InsertOneLocked<T>(ctx, keys, values, element_shape,
+                             component_index, i, &ready_tuples,
+                             &new_elements),
          callback);
     }
@@ -317,8 +319,9 @@ class Barrier : public ResourceBase {
        return errors::Cancelled(
            "Barrier ", name_,
            " is closed, but attempted to insert a brand new key: ",
-            keys_vec(i), ". Pending enqueues cancelled: ",
-            cancel_pending_enqueues_, ". Insertion index: ", i,
+            keys_vec(i),
+            ". Pending enqueues cancelled: ", cancel_pending_enqueues_,
+            ". Insertion index: ", i,
            ". Number of incomplete keys: ", incomplete_.size(), ".");
      }
    } else {
@@ -532,13 +535,14 @@ class InsertManyOp : public BarrierOpKernel {
    OP_REQUIRES_ASYNC(
        ctx, component_index_ < barrier->num_components(),
        errors::InvalidArgument("The component ID is out of range ",
-                                component_index_, " > num_components", " (= ",
-                                barrier->num_components(), ")"),
+                                component_index_, " > num_components",
+                                " (= ", barrier->num_components(), ")"),
        callback);
    OP_REQUIRES_OK_ASYNC(
-        ctx, ctx->MatchSignature({DT_STRING_REF, DT_STRING,
-                                  barrier->component_type(component_index_)},
-                                 {}),
+        ctx,
+        ctx->MatchSignature({DT_STRING_REF, DT_STRING,
+                             barrier->component_type(component_index_)},
+                            {}),
        callback);
    const Tensor* keys;

@@ -13,22 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_util.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
 #include "tensorflow/core/kernels/batching_util/periodic_function.h"
+#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/split_lib.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/macros.h"
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;

@@ -41,7 +41,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 namespace {
@@ -429,14 +429,13 @@ template <typename Scalar>
 struct LaunchBatchMatMul<SYCLDevice, Scalar> {
   static void Launch(OpKernelContext* context, const Tensor& in_x,
                      const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
-
-    // Number of matrix multiplies i.e. size of the batch.
-    const int64 batch_size = in_x.dim_size(0);
-    ParallelMatMulKernelSYCL<Scalar>::Run(context, in_x, in_y, adj_x, adj_y, out,
-                                          0, batch_size);
+    // Number of matrix multiplies i.e. size of the batch.
+    const int64 batch_size = in_x.dim_size(0);
+    ParallelMatMulKernelSYCL<Scalar>::Run(context, in_x, in_y, adj_x, adj_y,
+                                          out, 0, batch_size);
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 template <typename Device, typename Scalar>
 class BatchMatMul : public OpKernel {
@@ -462,10 +461,10 @@ class BatchMatMul : public OpKernel {
     TensorShape out_shape;
     for (int i = 0; i < ndims - 2; ++i) {
       OP_REQUIRES(ctx, in0.dim_size(i) == in1.dim_size(i),
-                  errors::InvalidArgument("In[0].dim(", i, ") and In[1].dim(",
-                                          i, ") must be the same: ",
-                                          in0.shape().DebugString(), " vs ",
-                                          in1.shape().DebugString()));
+                  errors::InvalidArgument(
+                      "In[0].dim(", i, ") and In[1].dim(", i,
+                      ") must be the same: ", in0.shape().DebugString(), " vs ",
+                      in1.shape().DebugString()));
       out_shape.AddDim(in0.dim_size(i));
     }
     auto n = (ndims == 2) ? 1 : out_shape.num_elements();
@@ -507,12 +506,12 @@ class BatchMatMul : public OpKernel {
   bool adj_y_;
 };
 #define REGISTER_BATCH_MATMUL_CPU(TYPE) \
   REGISTER_KERNEL_BUILDER( \
       Name("BatchMatMul").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
       BatchMatMul<CPUDevice, TYPE>)
 #define REGISTER_BATCH_MATMUL_GPU(TYPE) \
   REGISTER_KERNEL_BUILDER( \
       Name("BatchMatMul").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
       BatchMatMul<GPUDevice, TYPE>)
@@ -522,5 +521,5 @@ class BatchMatMul : public OpKernel {
   REGISTER_KERNEL_BUILDER( \
       Name("BatchMatMul").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
       BatchMatMul<SYCLDevice, TYPE>)
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 } // end namespace tensorflow

@@ -35,5 +35,5 @@ TF_CALL_half(REGISTER_BATCH_MATMUL_GPU);
 #ifdef TENSORFLOW_USE_SYCL
 TF_CALL_float(REGISTER_BATCH_MATMUL_SYCL);
 TF_CALL_double(REGISTER_BATCH_MATMUL_SYCL);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 } // namespace tensorflow

@@ -53,9 +53,10 @@ static Graph* BatchMatmul(int b, int m, int k, int n, bool adjoint_a,
 /* Uncomment to enable benchmarks for double & complex types: */
 // BM_BatchMatmulDev(B, M, K, N, TA, TB, std::complex<float>, DT_COMPLEX64,
 // gpu);
 // BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, cpu); \
-// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, cpu); \
+// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, cpu);
+// \
 // BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, gpu); \
 // BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, gpu);
 // Typical fully connected layers

@@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 template <typename Device, typename T>
 class BatchNormOp : public OpKernel {

@@ -54,7 +54,7 @@ TEST_F(BatchNormOpTest, Simple) {
   Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
   test::FillValues<float>(
       &expected, {-17.86f, -22.00f, -15.87f, -20.59f, -13.87f, -19.18f, -21.86f,
-                  -33.31f, -23.85f, -34.72f, -25.85f, -36.13f });
+                  -33.31f, -23.85f, -34.72f, -25.85f, -36.13f});
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.01);
 }

@@ -56,9 +56,10 @@ static void BatchToSpaceOpCompute(OpKernelContext* context,
               errors::InvalidArgument("input rank should be >= ", 1 + block_dims,
                                       " instead of ", orig_input_tensor.dims()));
-  OP_REQUIRES(context, TensorShapeUtils::IsMatrix(orig_crops.shape()) &&
-                           block_dims == orig_crops.dim_size(0) &&
-                           2 == orig_crops.dim_size(1),
+  OP_REQUIRES(context,
+              TensorShapeUtils::IsMatrix(orig_crops.shape()) &&
+                  block_dims == orig_crops.dim_size(0) &&
+                  2 == orig_crops.dim_size(1),
               errors::InvalidArgument("crops should have shape [", block_dims,
                                       ", 2] instead of ",
                                       orig_crops.shape().DebugString()));

@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/util/bcast.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/bcast.h"
 namespace tensorflow {

@@ -77,14 +77,14 @@ void BiasGPU<T>::compute(const GPUDevice& d, const T* input, const T* bias,
   }
   CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
   if (data_format == FORMAT_NHWC) {
-    BiasNHWCKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, input, bias, output, bias_size);
+    BiasNHWCKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, input, bias, output, bias_size);
   } else {
-    BiasNCHWKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, input, bias, output, bias_size,
-        image_size);
+    BiasNCHWKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, input, bias, output, bias_size,
+            image_size);
   }
 }
@@ -206,10 +206,10 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
   // Check if we have enough shared memory.
   if (shared_memory_size <= max_shared_memory_size) {
     if (data_format == FORMAT_NHWC) {
-      BiasGradNHWC_SharedAtomics<
-          T><<<config.block_count, config.thread_per_block, shared_memory_size,
-               d.stream()>>>(total_count, output_backprop, bias_backprop,
-                             bias_size);
+      BiasGradNHWC_SharedAtomics<T>
+          <<<config.block_count, config.thread_per_block, shared_memory_size,
+             d.stream()>>>(total_count, output_backprop, bias_backprop,
+                           bias_size);
     } else {
       // Round up the block count to multiple of bias_size.
       int group_size = (config.block_count + bias_size - 1) / bias_size;
@@ -217,23 +217,24 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
       if (config.thread_per_block < kWarpSize) {
         config.thread_per_block = kWarpSize;
       }
-      BiasGradNCHW_SharedAtomics<
-          T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          output_backprop, bias_backprop, batch, bias_size, image_size,
-          group_size);
+      BiasGradNCHW_SharedAtomics<T>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              output_backprop, bias_backprop, batch, bias_size, image_size,
+              group_size);
     }
   } else {
     // Note that even if we don't have enough shared memory to fit the entire
     // output block, it is possible to process one group of elements at a time.
     // But for now, we simply fall back to the naive implementation.
     if (data_format == FORMAT_NHWC) {
-      BiasGradNHWC_Naive<
-          T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          total_count, output_backprop, bias_backprop, bias_size);
+      BiasGradNHWC_Naive<T>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              total_count, output_backprop, bias_backprop, bias_size);
     } else {
-      BiasGradNCHW_Naive<
-          T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          total_count, output_backprop, bias_backprop, bias_size, image_size);
+      BiasGradNCHW_Naive<T>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              total_count, output_backprop, bias_backprop, bias_size,
+              image_size);
     }
   }
 }

@@ -48,7 +48,7 @@ EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC const T SubtleMustCopy(const T &x) {
   auto *to_x = reinterpret_cast<const volatile T *>(&x);
   return *to_x;
 }
-} // namespace tensorflow::internal
+} // namespace internal
 } // namespace tensorflow
 #endif // TENSORFLOW_UTIL_BOUNDS_CHECK_H_

@@ -126,13 +126,13 @@ REGISTER_KERNEL_BUILDER(Name("UniformCandidateSampler").Device(DEVICE_CPU),
 REGISTER_KERNEL_BUILDER(Name("LogUniformCandidateSampler").Device(DEVICE_CPU),
                         SimpleCandidateSamplerOp<LogUniformSampler>);
-REGISTER_KERNEL_BUILDER(Name("LearnedUnigramCandidateSampler")
-                            .Device(DEVICE_CPU),
-                        SimpleCandidateSamplerOp<UnigramSampler>);
+REGISTER_KERNEL_BUILDER(
+    Name("LearnedUnigramCandidateSampler").Device(DEVICE_CPU),
+    SimpleCandidateSamplerOp<UnigramSampler>);
-REGISTER_KERNEL_BUILDER(Name("ThreadUnsafeUnigramCandidateSampler")
-                            .Device(DEVICE_CPU),
-                        SimpleCandidateSamplerOp<ThreadUnsafeUnigramSampler>);
+REGISTER_KERNEL_BUILDER(
+    Name("ThreadUnsafeUnigramCandidateSampler").Device(DEVICE_CPU),
+    SimpleCandidateSamplerOp<ThreadUnsafeUnigramSampler>);
 class AllCandidateSamplerOp : public BaseCandidateSamplerOp {
  public:
@@ -197,8 +197,9 @@ class ComputeAccidentalHitsOp : public OpKernel {
   void Compute(OpKernelContext* context) override {
     const Tensor& in_true_candidates = context->input(0);
     const TensorShape& in_true_candidates_shape = in_true_candidates.shape();
-    OP_REQUIRES(context, TensorShapeUtils::IsMatrix(in_true_candidates_shape) &&
-                             in_true_candidates_shape.dim_size(1) == num_true_,
+    OP_REQUIRES(context,
+                TensorShapeUtils::IsMatrix(in_true_candidates_shape) &&
+                    in_true_candidates_shape.dim_size(1) == num_true_,
                 errors::InvalidArgument(
                     "true_candidates must be a batch_size * num_true matrix"));


@ -36,7 +36,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice; typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice; typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
#define CURRY_TYPES2(FN, arg0) \ #define CURRY_TYPES2(FN, arg0) \
FN(arg0, bool); \ FN(arg0, bool); \
@ -223,11 +223,11 @@ class SyclCastOp : public CastOpBase {
} }
}; };
#define REGISTER_CAST_SYCL(srctype, dsttype) \ #define REGISTER_CAST_SYCL(srctype, dsttype) \
REGISTER_KERNEL_BUILDER(Name("Cast") \ REGISTER_KERNEL_BUILDER(Name("Cast") \
.TypeConstraint<srctype>("SrcT") \ .TypeConstraint<srctype>("SrcT") \
.TypeConstraint<dsttype>("DstT") \ .TypeConstraint<dsttype>("DstT") \
.Device(DEVICE_SYCL), \ .Device(DEVICE_SYCL), \
SyclCastOp) SyclCastOp)
CURRY_TYPES2(REGISTER_CAST_SYCL, bool); CURRY_TYPES2(REGISTER_CAST_SYCL, bool);
CURRY_TYPES2(REGISTER_CAST_SYCL, int32); CURRY_TYPES2(REGISTER_CAST_SYCL, int32);
@ -237,7 +237,7 @@ CURRY_TYPES2(REGISTER_CAST_SYCL, double);
#undef REGISTER_CAST_SYCL #undef REGISTER_CAST_SYCL
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
#undef CURRY_TYPES2 #undef CURRY_TYPES2
@ -250,6 +250,5 @@ REGISTER_KERNEL_BUILDER(
REGISTER_KERNEL_BUILDER( REGISTER_KERNEL_BUILDER(
Name("_HostCast").Device(DEVICE_SYCL).HostMemory("x").HostMemory("y"), Name("_HostCast").Device(DEVICE_SYCL).HostMemory("x").HostMemory("y"),
CpuCastOp); CpuCastOp);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // end namespace tensorflow } // end namespace tensorflow


@ -131,7 +131,8 @@ struct scalar_cast_op<::tensorflow::bfloat16, float> {
p[0] = a.value; p[0] = a.value;
p[1] = 0; p[1] = 0;
#else #else
static_assert(::tensorflow::port::kLittleEndian, "Not a little endian system!"); static_assert(::tensorflow::port::kLittleEndian,
"Not a little endian system!");
p[0] = 0; p[0] = 0;
p[1] = a.value; p[1] = a.value;
#endif #endif
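Aside, not part of the patch: the hunk above only re-wraps a static_assert inside scalar_cast_op<bfloat16, float>, and the intent of that cast is easy to lose in diff form. bfloat16 stores the top 16 bits of an IEEE-754 binary32 value, so widening it to float simply places those bits in the high half of a 32-bit word. Below is a minimal standalone sketch of the same conversion using an explicit shift instead of per-endianness uint16 writes; the function name is illustrative only.

#include <cstdint>
#include <cstring>

// Widen a bfloat16 bit pattern to float: the 16 stored bits become the high
// half of the binary32 representation and the low 16 bits are zero-filled.
float BFloat16BitsToFloat(uint16_t bits) {
  uint32_t widened = static_cast<uint32_t>(bits) << 16;
  float result;
  std::memcpy(&result, &widened, sizeof(result));  // bit-exact reinterpretation
  return result;
}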


@ -41,25 +41,25 @@ struct CastFunctor<Eigen::SyclDevice, O, I> {
o.device(d) = i.template cast<O>(); o.device(d) = i.template cast<O>();
} }
}; };
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace functor } // namespace functor
#define CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \ #define CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \
FN(arg0, arg1, bool); \ FN(arg0, arg1, bool); \
FN(arg0, arg1, uint8); \ FN(arg0, arg1, uint8); \
FN(arg0, arg1, int8); \ FN(arg0, arg1, int8); \
FN(arg0, arg1, uint16); \ FN(arg0, arg1, uint16); \
FN(arg0, arg1, int16); \ FN(arg0, arg1, int16); \
FN(arg0, arg1, int32); \ FN(arg0, arg1, int32); \
FN(arg0, arg1, int64); \ FN(arg0, arg1, int64); \
FN(arg0, arg1, float); \ FN(arg0, arg1, float); \
FN(arg0, arg1, double); \ FN(arg0, arg1, double); \
FN(arg0, arg1, std::complex<float>); \ FN(arg0, arg1, std::complex<float>); \
FN(arg0, arg1, std::complex<double>) FN(arg0, arg1, std::complex<double>)
#define CURRY_TYPES3(FN, arg0, arg1) \ #define CURRY_TYPES3(FN, arg0, arg1) \
CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \ CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \
FN(arg0, arg1, Eigen::half); FN(arg0, arg1, Eigen::half);
#define CAST_CASE(DEVICE, IN, OUT) \ #define CAST_CASE(DEVICE, IN, OUT) \


@ -107,10 +107,10 @@ static void BM_gpu_float_int64(int iters, int num) {
testing::UseRealTime(); testing::UseRealTime();
#if GOOGLE_CUDA #if GOOGLE_CUDA
test::Benchmark("gpu", Cast<float, int64>(num)).Run(iters); test::Benchmark("gpu", Cast<float, int64>(num)).Run(iters);
#endif // GOOGLE_CUDA #endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
test::Benchmark("sycl", Cast<float, int64>(num)).Run(iters); test::Benchmark("sycl", Cast<float, int64>(num)).Run(iters);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} }
BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20); BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20);
@ -130,10 +130,10 @@ static void BM_gpu_bool_float(int iters, int num) {
testing::UseRealTime(); testing::UseRealTime();
#if GOOGLE_CUDA #if GOOGLE_CUDA
test::Benchmark("gpu", Cast<bool, float>(num)).Run(iters); test::Benchmark("gpu", Cast<bool, float>(num)).Run(iters);
#endif // GOOGLE_CUDA #endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
test::Benchmark("sycl", Cast<bool, float>(num)).Run(iters); test::Benchmark("sycl", Cast<bool, float>(num)).Run(iters);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} }
BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20); BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20);
@ -180,7 +180,7 @@ static void BM_gpu_float_half(int iters, int num) {
testing::UseRealTime(); testing::UseRealTime();
#if GOOGLE_CUDA #if GOOGLE_CUDA
test::Benchmark("gpu", Cast<float, Eigen::half>(num)).Run(iters); test::Benchmark("gpu", Cast<float, Eigen::half>(num)).Run(iters);
#endif // GOOGLE_CUDA #endif // GOOGLE_CUDA
} }
BENCHMARK(BM_gpu_float_half)->Arg(64 << 10)->Arg(32 << 20); BENCHMARK(BM_gpu_float_half)->Arg(64 << 10)->Arg(32 << 20);
@ -191,7 +191,7 @@ static void BM_gpu_half_float(int iters, int num) {
testing::UseRealTime(); testing::UseRealTime();
#if GOOGLE_CUDA #if GOOGLE_CUDA
test::Benchmark("gpu", Cast<Eigen::half, float>(num)).Run(iters); test::Benchmark("gpu", Cast<Eigen::half, float>(num)).Run(iters);
#endif // GOOGLE_CUDA #endif // GOOGLE_CUDA
} }
BENCHMARK(BM_gpu_half_float)->Arg(64 << 10)->Arg(32 << 20); BENCHMARK(BM_gpu_half_float)->Arg(64 << 10)->Arg(32 << 20);


@ -107,14 +107,14 @@ class HSVToRGBOp : public OpKernel {
} }
}; };
#define REGISTER_CPU(T) \ #define REGISTER_CPU(T) \
REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_CPU) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("RGBToHSV").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
RGBToHSVOp<CPUDevice, T>); \ RGBToHSVOp<CPUDevice, T>); \
template class RGBToHSVOp<CPUDevice, T>; \ template class RGBToHSVOp<CPUDevice, T>; \
REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_CPU) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("HSVToRGB").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
HSVToRGBOp<CPUDevice, T>); \ HSVToRGBOp<CPUDevice, T>); \
template class HSVToRGBOp<CPUDevice, T>; template class HSVToRGBOp<CPUDevice, T>;
TF_CALL_float(REGISTER_CPU); TF_CALL_float(REGISTER_CPU);
TF_CALL_double(REGISTER_CPU); TF_CALL_double(REGISTER_CPU);
@ -123,40 +123,39 @@ TF_CALL_double(REGISTER_CPU);
// Forward declarations of the function specializations for GPU (to prevent // Forward declarations of the function specializations for GPU (to prevent
// building the GPU versions here, they will be built compiling _gpu.cu.cc). // building the GPU versions here, they will be built compiling _gpu.cu.cc).
namespace functor { namespace functor {
#define DECLARE_GPU(T) \ #define DECLARE_GPU(T) \
template <> \ template <> \
void RGBToHSV<GPUDevice, T>::operator()(const GPUDevice& d, \ void RGBToHSV<GPUDevice, T>::operator()( \
TTypes<T, 2>::ConstTensor input_data, \ const GPUDevice& d, TTypes<T, 2>::ConstTensor input_data, \
TTypes<T, 1>::Tensor range, \ TTypes<T, 1>::Tensor range, TTypes<T, 2>::Tensor output_data); \
TTypes<T, 2>::Tensor output_data); \ extern template struct RGBToHSV<GPUDevice, T>; \
extern template struct RGBToHSV<GPUDevice, T>; \ template <> \
template <> \ void HSVToRGB<GPUDevice, T>::operator()( \
void HSVToRGB<GPUDevice, T>::operator()(const GPUDevice& d, \ const GPUDevice& d, TTypes<T, 2>::ConstTensor input_data, \
TTypes<T, 2>::ConstTensor input_data, \ TTypes<T, 2>::Tensor output_data); \
TTypes<T, 2>::Tensor output_data); \
extern template struct HSVToRGB<GPUDevice, T>; extern template struct HSVToRGB<GPUDevice, T>;
TF_CALL_float(DECLARE_GPU); TF_CALL_float(DECLARE_GPU);
TF_CALL_double(DECLARE_GPU); TF_CALL_double(DECLARE_GPU);
} // namespace functor } // namespace functor
#define REGISTER_GPU(T) \ #define REGISTER_GPU(T) \
REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_GPU) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("RGBToHSV").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
RGBToHSVOp<GPUDevice, T>); \ RGBToHSVOp<GPUDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_GPU) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("HSVToRGB").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
HSVToRGBOp<GPUDevice, T>); HSVToRGBOp<GPUDevice, T>);
TF_CALL_float(REGISTER_GPU); TF_CALL_float(REGISTER_GPU);
TF_CALL_double(REGISTER_GPU); TF_CALL_double(REGISTER_GPU);
#endif #endif
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL(T) \ #define REGISTER_SYCL(T) \
REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_SYCL) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("RGBToHSV").Device(DEVICE_SYCL).TypeConstraint<T>("T"), \
RGBToHSVOp<SYCLDevice, T>); \ RGBToHSVOp<SYCLDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_SYCL) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("HSVToRGB").Device(DEVICE_SYCL).TypeConstraint<T>("T"), \
HSVToRGBOp<SYCLDevice, T>); HSVToRGBOp<SYCLDevice, T>);
TF_CALL_float(REGISTER_SYCL); TF_CALL_float(REGISTER_SYCL);
TF_CALL_double(REGISTER_SYCL); TF_CALL_double(REGISTER_SYCL);
#endif #endif


@ -54,10 +54,9 @@ struct RGBToHSV {
// TODO(wicke): all these assignments are only necessary because a combined // TODO(wicke): all these assignments are only necessary because a combined
// expression is larger than kernel parameter space. A custom kernel is // expression is larger than kernel parameter space. A custom kernel is
// probably in order. // probably in order.
H.device(d) = (R == V).select(norm * (G - B), H.device(d) = (R == V).select(
(G == V).select( norm * (G - B), (G == V).select(norm * (B - R) + T(2) / T(6),
norm * (B - R) + T(2) / T(6), norm * (R - G) + T(4) / T(6)));
norm * (R - G) + T(4) / T(6)));
H.device(d) = (range > T(0)).select(H, H.constant(T(0))); H.device(d) = (range > T(0)).select(H, H.constant(T(0)));
H.device(d) = (H < T(0)).select(H + T(1), H); H.device(d) = (H < T(0)).select(H + T(1), H);
} }
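Aside, not from this commit: the re-wrapped Eigen select chain above is dense, so here is a scalar sketch of the per-pixel hue it evaluates, assuming (as in the surrounding kernel) that V is the channel maximum, range is V minus the channel minimum, and norm equals 1 / (6 * range). The helper name is illustrative only.

#include <algorithm>

// Hue in [0, 1) for one RGB pixel, mirroring the branch structure of the
// (R == V) / (G == V) select chain above.
float HueFromRGB(float r, float g, float b) {
  const float v = std::max({r, g, b});
  const float range = v - std::min({r, g, b});
  if (range <= 0.0f) return 0.0f;        // grey pixel: hue is defined as 0
  const float norm = 1.0f / (6.0f * range);
  float h;
  if (r == v) {
    h = norm * (g - b);                  // red is the maximum channel
  } else if (g == v) {
    h = norm * (b - r) + 2.0f / 6.0f;    // green is the maximum channel
  } else {
    h = norm * (r - g) + 4.0f / 6.0f;    // blue is the maximum channel
  }
  return h < 0.0f ? h + 1.0f : h;        // wrap negative hues into [0, 1)
}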


@ -17,8 +17,8 @@ limitations under the License.
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "tensorflow/core/kernels/colorspace_op.h"
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/colorspace_op.h"
namespace tensorflow { namespace tensorflow {
@ -29,6 +29,6 @@ typedef Eigen::GpuDevice GPUDevice;
template class functor::HSVToRGB<GPUDevice, T>; template class functor::HSVToRGB<GPUDevice, T>;
TF_CALL_float(INSTANTIATE_GPU); TF_CALL_float(INSTANTIATE_GPU);
TF_CALL_double(INSTANTIATE_GPU); TF_CALL_double(INSTANTIATE_GPU);
} } // namespace tensorflow
#endif // GOOGLE_CUDA #endif // GOOGLE_CUDA


@ -224,34 +224,34 @@ class HSVToRGBOpTest : public OpsTestBase {
} }
}; };
#define TEST_COLORSPACE(test, dt) \ #define TEST_COLORSPACE(test, dt) \
TEST_F(test, CheckBlack) { \ TEST_F(test, CheckBlack) { \
MakeOp(dt); \ MakeOp(dt); \
CheckBlack(dt); \ CheckBlack(dt); \
} \ } \
TEST_F(test, CheckGray) { \ TEST_F(test, CheckGray) { \
MakeOp(dt); \ MakeOp(dt); \
CheckGray(dt); \ CheckGray(dt); \
} \ } \
TEST_F(test, CheckWhite) { \ TEST_F(test, CheckWhite) { \
MakeOp(dt); \ MakeOp(dt); \
CheckWhite(dt); \ CheckWhite(dt); \
} \ } \
TEST_F(test, CheckRedMax) { \ TEST_F(test, CheckRedMax) { \
MakeOp(dt); \ MakeOp(dt); \
CheckRedMax(dt); \ CheckRedMax(dt); \
} \ } \
TEST_F(test, CheckGreenMax) { \ TEST_F(test, CheckGreenMax) { \
MakeOp(dt); \ MakeOp(dt); \
CheckGreenMax(dt); \ CheckGreenMax(dt); \
} \ } \
TEST_F(test, CheckBlueMax) { \ TEST_F(test, CheckBlueMax) { \
MakeOp(dt); \ MakeOp(dt); \
CheckBlueMax(dt); \ CheckBlueMax(dt); \
} \ } \
TEST_F(test, CheckNegativeDifference) { \ TEST_F(test, CheckNegativeDifference) { \
MakeOp(dt); \ MakeOp(dt); \
CheckNegativeDifference(dt); \ CheckNegativeDifference(dt); \
} }
typedef RGBToHSVOpTest<float> rgb_to_hsv_float; typedef RGBToHSVOpTest<float> rgb_to_hsv_float;


@ -41,10 +41,11 @@ namespace tensorflow {
// Assumes all inputs are nonempty // Assumes all inputs are nonempty
template <typename T> template <typename T>
void ConcatCPU(DeviceBase* d, void ConcatCPU(
const std::vector< DeviceBase* d,
std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs, const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
typename TTypes<T, 2>::Matrix* output); inputs,
typename TTypes<T, 2>::Matrix* output);
#if GOOGLE_CUDA #if GOOGLE_CUDA
template <typename T> template <typename T>
void ConcatGPU( void ConcatGPU(
@ -57,11 +58,12 @@ void ConcatGPU(
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
template <typename T> template <typename T>
void ConcatSYCL(const Eigen::SyclDevice& d, void ConcatSYCL(
const std::vector< const Eigen::SyclDevice& d,
std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs, const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
typename TTypes<T, 2>::Matrix* output); inputs,
#endif // TENSORFLOW_USE_SYCL typename TTypes<T, 2>::Matrix* output);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow
#endif // TENSORFLOW_KERNELS_CONCAT_LIB_H_ #endif // TENSORFLOW_KERNELS_CONCAT_LIB_H_


@ -48,10 +48,11 @@ struct MemCpyCopier<ResourceHandle> {
} // namespace } // namespace
template <typename T> template <typename T>
void ConcatCPU(DeviceBase* d, void ConcatCPU(
const std::vector< DeviceBase* d,
std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs, const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
typename TTypes<T, 2>::Matrix* output) { inputs,
typename TTypes<T, 2>::Matrix* output) {
if (std::is_same<T, string>::value) { if (std::is_same<T, string>::value) {
// use a large cost here to force strings to be handled by separate threads // use a large cost here to force strings to be handled by separate threads
ConcatCPUImpl<T>(d, inputs, 100000, MemCpyCopier<T>(), output); ConcatCPUImpl<T>(d, inputs, 100000, MemCpyCopier<T>(), output);
@ -86,21 +87,22 @@ TF_CALL_variant(REGISTER)
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
template <typename T> template <typename T>
void ConcatSYCL(const Eigen::SyclDevice& d, void ConcatSYCL(
const std::vector< const Eigen::SyclDevice& d,
std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs, const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
typename TTypes<T, 2>::Matrix* output) { inputs,
typename TTypes<T, 2>::Matrix* output) {
ConcatSYCLImpl<T>(d, inputs, sizeof(T) /* cost_per_unit */, MemCpyCopier<T>(), ConcatSYCLImpl<T>(d, inputs, sizeof(T) /* cost_per_unit */, MemCpyCopier<T>(),
output); output);
} }
#define REGISTER_SYCL(T) \ #define REGISTER_SYCL(T) \
template void ConcatSYCL<T>( \ template void ConcatSYCL<T>( \
const Eigen::SyclDevice&, \ const Eigen::SyclDevice&, \
const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&, \ const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&, \
typename TTypes<T, 2>::Matrix* output); typename TTypes<T, 2>::Matrix* output);
TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL) TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL)
#undef REGISTER_SYCL #undef REGISTER_SYCL
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow


@ -15,9 +15,9 @@ limitations under the License.
#define EIGEN_USE_THREADS #define EIGEN_USE_THREADS
#include "tensorflow/core/kernels/concat_lib.h"
#include <vector> #include <vector>
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/concat_lib.h"
#include "tensorflow/core/util/work_sharder.h" #include "tensorflow/core/util/work_sharder.h"
namespace tensorflow { namespace tensorflow {
@ -73,7 +73,7 @@ void ConcatCPUImpl(
// Sharded mode. // Sharded mode.
auto work = [&row_size, &sizes, &inputs, &output, &copier, &num_inputs]( auto work = [&row_size, &sizes, &inputs, &output, &copier, &num_inputs](
int64 start, int64 end) { int64 start, int64 end) {
int64 skipped_rows = start / row_size; int64 skipped_rows = start / row_size;
T* out = output->data() + skipped_rows * row_size; T* out = output->data() + skipped_rows * row_size;
T* out_start = output->data() + start; T* out_start = output->data() + start;
@ -160,5 +160,5 @@ void ConcatSYCLImpl(
} }
} }
} }
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow


@ -37,7 +37,7 @@ typedef Eigen::GpuDevice GPUDevice;
#endif // GOOGLE_CUDA #endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice; typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM }; enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM };
@ -71,8 +71,9 @@ class ConcatBaseOp : public OpKernel {
const TensorShape& input_shape = values[0].shape(); const TensorShape& input_shape = values[0].shape();
int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim; int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim;
OP_REQUIRES(c, (0 <= axis && axis < input_dims) || OP_REQUIRES(c,
(allow_legacy_scalars() && concat_dim == 0), (0 <= axis && axis < input_dims) ||
(allow_legacy_scalars() && concat_dim == 0),
errors::InvalidArgument( errors::InvalidArgument(
"ConcatOp : Expected concatenating dimensions in the range " "ConcatOp : Expected concatenating dimensions in the range "
"[", "[",
@ -97,8 +98,8 @@ class ConcatBaseOp : public OpKernel {
c, in.dims() == input_dims || (input_is_scalar && in_is_scalar), c, in.dims() == input_dims || (input_is_scalar && in_is_scalar),
errors::InvalidArgument( errors::InvalidArgument(
"ConcatOp : Ranks of all input tensors should match: shape[0] = ", "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
input_shape.DebugString(), " vs. shape[", i, "] = ", input_shape.DebugString(), " vs. shape[", i,
in.shape().DebugString())); "] = ", in.shape().DebugString()));
for (int j = 0; j < input_dims; ++j) { for (int j = 0; j < input_dims; ++j) {
if (j == axis) { if (j == axis) {
continue; continue;
@ -107,8 +108,8 @@ class ConcatBaseOp : public OpKernel {
c, in.dim_size(j) == input_shape.dim_size(j), c, in.dim_size(j) == input_shape.dim_size(j),
errors::InvalidArgument( errors::InvalidArgument(
"ConcatOp : Dimensions of inputs should match: shape[0] = ", "ConcatOp : Dimensions of inputs should match: shape[0] = ",
input_shape.DebugString(), " vs. shape[", i, "] = ", input_shape.DebugString(), " vs. shape[", i,
in.shape().DebugString())); "] = ", in.shape().DebugString()));
} }
if (in.NumElements() > 0) { if (in.NumElements() > 0) {
int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0; int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0;
@ -142,7 +143,7 @@ class ConcatBaseOp : public OpKernel {
ConcatSYCL<T>(c->eigen_sycl_device(), inputs_flat, &output_flat); ConcatSYCL<T>(c->eigen_sycl_device(), inputs_flat, &output_flat);
return; return;
} }
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
ConcatCPU<T>(c->device(), inputs_flat, &output_flat); ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
} }
} }
@ -252,7 +253,7 @@ REGISTER_KERNEL_BUILDER(Name("ConcatV2")
ConcatV2Op<CPUDevice, int32>); ConcatV2Op<CPUDevice, int32>);
#undef REGISTER_SYCL #undef REGISTER_SYCL
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
class ConcatOffsetOp : public OpKernel { class ConcatOffsetOp : public OpKernel {
public: public:
@ -347,5 +348,5 @@ REGISTER_KERNEL_BUILDER(Name("ConcatOffset")
.HostMemory("shape") .HostMemory("shape")
.HostMemory("offset"), .HostMemory("offset"),
ConcatOffsetOp); ConcatOffsetOp);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow


@ -157,7 +157,8 @@ BENCHMARK(BM_MemcpyAlternativeDim0)->Arg(1000)->Arg(100000)->Arg(1000000);
BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000); BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000);
typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>, typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>,
Eigen::Unaligned> EigenMap; Eigen::Unaligned>
EigenMap;
static void MemcpyManyAlternative1(int iters, int dim2) { static void MemcpyManyAlternative1(int iters, int dim2) {
testing::StopTiming(); testing::StopTiming();


@ -160,7 +160,7 @@ class ConditionalAccumulatorBase : public ResourceBase {
* Modifications to convenience macros defined in core/framework/op_kernel.h. * Modifications to convenience macros defined in core/framework/op_kernel.h.
* The below macros return a boolean if the test fails, so that the calling * The below macros return a boolean if the test fails, so that the calling
* function can get an indication that a failure has occurred. * function can get an indication that a failure has occurred.
*/ */
#define OP_REQUIRES_BOOLEAN(CTX, EXP, STATUS) \ #define OP_REQUIRES_BOOLEAN(CTX, EXP, STATUS) \
do { \ do { \
if (!TF_PREDICT_TRUE(EXP)) { \ if (!TF_PREDICT_TRUE(EXP)) { \


@ -99,9 +99,10 @@ class AccumulatorTakeGradientOp
ConditionalAccumulatorBase* accumulator, ConditionalAccumulatorBase* accumulator,
DoneCallback callback) override { DoneCallback callback) override {
// Check signature // Check signature
OP_REQUIRES_OK_ASYNC(ctx, ctx->MatchSignature({DT_STRING_REF, DT_INT32}, OP_REQUIRES_OK_ASYNC(
{accumulator->dtype()}), ctx,
callback); ctx->MatchSignature({DT_STRING_REF, DT_INT32}, {accumulator->dtype()}),
callback);
} }
private: private:
@ -111,5 +112,4 @@ class AccumulatorTakeGradientOp
REGISTER_KERNEL_BUILDER(Name("AccumulatorTakeGradient").Device(DEVICE_CPU), REGISTER_KERNEL_BUILDER(Name("AccumulatorTakeGradient").Device(DEVICE_CPU),
AccumulatorTakeGradientOp); AccumulatorTakeGradientOp);
} // namespace tensorflow } // namespace tensorflow


@ -146,7 +146,6 @@ typedef Eigen::GpuDevice GPUDevice;
typedef Eigen::SyclDevice SYCLDevice; typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
template <typename Device, typename T, typename Index> template <typename Device, typename T, typename Index>
class FillOp : public OpKernel { class FillOp : public OpKernel {
public: public:


@ -113,47 +113,47 @@ REGISTER_GPU_HOST_REF_KERNEL(string);
#undef REGISTER_GPU_HOST_REF_KERNEL #undef REGISTER_GPU_HOST_REF_KERNEL
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_SWITCH(type) \ #define REGISTER_SYCL_SWITCH(type) \
REGISTER_KERNEL_BUILDER(Name("Switch") \ REGISTER_KERNEL_BUILDER(Name("Switch") \
.Device(DEVICE_SYCL) \ .Device(DEVICE_SYCL) \
.HostMemory("pred") \ .HostMemory("pred") \
.TypeConstraint<type>("T"),\ .TypeConstraint<type>("T"), \
SwitchOp) SwitchOp)
TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_SWITCH); TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_SWITCH);
#define REGISTER_SYCL_REF_SWITCH(type) \ #define REGISTER_SYCL_REF_SWITCH(type) \
REGISTER_KERNEL_BUILDER(Name("RefSwitch") \ REGISTER_KERNEL_BUILDER(Name("RefSwitch") \
.Device(DEVICE_SYCL) \ .Device(DEVICE_SYCL) \
.HostMemory("pred") \ .HostMemory("pred") \
.TypeConstraint<type>("T"), \ .TypeConstraint<type>("T"), \
SwitchOp) SwitchOp)
TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_SWITCH); TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_SWITCH);
#undef REGISTER_SYCL_SWITCH #undef REGISTER_SYCL_SWITCH
#undef REGISTER_SYCL_REF_SWITCH #undef REGISTER_SYCL_REF_SWITCH
#define REGISTER_SYCL_HOST_KERNEL(type) \ #define REGISTER_SYCL_HOST_KERNEL(type) \
REGISTER_KERNEL_BUILDER(Name("Switch") \ REGISTER_KERNEL_BUILDER(Name("Switch") \
.Device(DEVICE_SYCL) \ .Device(DEVICE_SYCL) \
.HostMemory("data") \ .HostMemory("data") \
.HostMemory("pred") \ .HostMemory("pred") \
.HostMemory("output_false")\ .HostMemory("output_false") \
.HostMemory("output_true") \ .HostMemory("output_true") \
.TypeConstraint<type>("T"),\ .TypeConstraint<type>("T"), \
SwitchOp) SwitchOp)
REGISTER_SYCL_HOST_KERNEL(bool); REGISTER_SYCL_HOST_KERNEL(bool);
REGISTER_SYCL_HOST_KERNEL(string); REGISTER_SYCL_HOST_KERNEL(string);
REGISTER_SYCL_HOST_KERNEL(int32); REGISTER_SYCL_HOST_KERNEL(int32);
#define REGISTER_SYCL_HOST_REF_KERNEL(type) \ #define REGISTER_SYCL_HOST_REF_KERNEL(type) \
REGISTER_KERNEL_BUILDER(Name("RefSwitch") \ REGISTER_KERNEL_BUILDER(Name("RefSwitch") \
.Device(DEVICE_SYCL) \ .Device(DEVICE_SYCL) \
.HostMemory("data") \ .HostMemory("data") \
.HostMemory("pred") \ .HostMemory("pred") \
.HostMemory("output_false") \ .HostMemory("output_false") \
.HostMemory("output_true") \ .HostMemory("output_true") \
.TypeConstraint<type>("T"), \ .TypeConstraint<type>("T"), \
SwitchOp) SwitchOp)
REGISTER_SYCL_HOST_REF_KERNEL(int32); REGISTER_SYCL_HOST_REF_KERNEL(int32);
@ -162,7 +162,7 @@ REGISTER_SYCL_HOST_REF_KERNEL(string);
#undef REGISTER_SYCL_HOST_KERNEL #undef REGISTER_SYCL_HOST_KERNEL
#undef REGISTER_SYCL_HOST_REF_KERNEL #undef REGISTER_SYCL_HOST_REF_KERNEL
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
class RefSelectOp : public OpKernel { class RefSelectOp : public OpKernel {
public: public:
@ -282,7 +282,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
#undef REGISTER_SYCL_KERNEL #undef REGISTER_SYCL_KERNEL
#undef REGISTER_SYCL_REF_KERNEL #undef REGISTER_SYCL_REF_KERNEL
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
// Special GPU kernels for int32 and string. // Special GPU kernels for int32 and string.
// TODO(b/25387198): Also enable int32 in device memory. This kernel // TODO(b/25387198): Also enable int32 in device memory. This kernel
@ -331,7 +331,7 @@ REGISTER_SYCL_HOST_KERNEL(string);
REGISTER_SYCL_HOST_KERNEL(ResourceHandle); REGISTER_SYCL_HOST_KERNEL(ResourceHandle);
#undef REGISTER_SYCL_HOST_KERNEL #undef REGISTER_SYCL_HOST_KERNEL
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
void EnterOp::Compute(OpKernelContext* context) { void EnterOp::Compute(OpKernelContext* context) {
if (IsRefType(context->input_dtype(0))) { if (IsRefType(context->input_dtype(0))) {
@ -360,14 +360,14 @@ REGISTER_GPU_REF_KERNEL(bool);
#undef REGISTER_GPU_REF_KERNEL #undef REGISTER_GPU_REF_KERNEL
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(type) \ #define REGISTER_SYCL_KERNEL(type) \
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
Name("Enter").Device(DEVICE_SYCL).TypeConstraint<type>("T"), EnterOp) Name("Enter").Device(DEVICE_SYCL).TypeConstraint<type>("T"), EnterOp)
REGISTER_SYCL_KERNEL(bool); REGISTER_SYCL_KERNEL(bool);
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
#define REGISTER_SYCL_REF_KERNEL(type) \ #define REGISTER_SYCL_REF_KERNEL(type) \
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
Name("RefEnter").Device(DEVICE_SYCL).TypeConstraint<type>("T"), EnterOp) Name("RefEnter").Device(DEVICE_SYCL).TypeConstraint<type>("T"), EnterOp)
REGISTER_SYCL_REF_KERNEL(bool); REGISTER_SYCL_REF_KERNEL(bool);
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
@ -398,7 +398,7 @@ REGISTER_SYCL_HOST_KERNEL(ResourceHandle);
#undef REGISTER_SYCL_HOST_KERNEL #undef REGISTER_SYCL_HOST_KERNEL
#undef REGISTER_SYCL_HOST_REF_KERNEL #undef REGISTER_SYCL_HOST_REF_KERNEL
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
// Special GPU kernels for int32 and string. // Special GPU kernels for int32 and string.
// TODO(b/25387198): Also enable int32 in device memory. This kernel // TODO(b/25387198): Also enable int32 in device memory. This kernel
@ -455,10 +455,10 @@ REGISTER_GPU_REF_KERNEL(bool);
#undef REGISTER_GPU_REF_KERNEL #undef REGISTER_GPU_REF_KERNEL
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(type) \ #define REGISTER_SYCL_KERNEL(type) \
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
Name("Exit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp); \ Name("Exit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp); \
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
Name("RefExit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp); Name("RefExit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp);
REGISTER_SYCL_KERNEL(bool); REGISTER_SYCL_KERNEL(bool);
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
@ -483,7 +483,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
REGISTER_SYCL_HOST_KERNEL(int32); REGISTER_SYCL_HOST_KERNEL(int32);
REGISTER_SYCL_HOST_KERNEL(string); REGISTER_SYCL_HOST_KERNEL(string);
#undef REGISTER_SYCL_HOST_KERNEL #undef REGISTER_SYCL_HOST_KERNEL
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
// Special GPU kernels for int32 and string. // Special GPU kernels for int32 and string.
// TODO(b/25387198): Also enable int32 in device memory. This kernel // TODO(b/25387198): Also enable int32 in device memory. This kernel
@ -556,12 +556,12 @@ REGISTER_GPU_HOST_KERNEL(string);
#undef REGISTER_GPU_HOST_KERNEL #undef REGISTER_GPU_HOST_KERNEL
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(type) \ #define REGISTER_SYCL_KERNEL(type) \
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \ Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
NextIterationOp); \ NextIterationOp); \
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
Name("RefNextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\ Name("RefNextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
NextIterationOp) NextIterationOp)
REGISTER_SYCL_KERNEL(bool); REGISTER_SYCL_KERNEL(bool);
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
@ -585,7 +585,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
REGISTER_SYCL_HOST_KERNEL(int32); REGISTER_SYCL_HOST_KERNEL(int32);
REGISTER_SYCL_HOST_KERNEL(string); REGISTER_SYCL_HOST_KERNEL(string);
#undef REGISTER_SYCL_HOST_KERNEL #undef REGISTER_SYCL_HOST_KERNEL
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
// A LoopCond op has one input and one output. The input is a boolean // A LoopCond op has one input and one output. The input is a boolean
// scalar representing the taken branches of the "pivot" Switch that // scalar representing the taken branches of the "pivot" Switch that
@ -619,7 +619,7 @@ REGISTER_KERNEL_BUILDER(Name("LoopCond")
.HostMemory("input") .HostMemory("input")
.HostMemory("output"), .HostMemory("output"),
LoopCondOp); LoopCondOp);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
// ControlTrigger kernels // ControlTrigger kernels
REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_CPU), REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_CPU),
@ -631,7 +631,7 @@ REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_GPU),
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_SYCL), REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_SYCL),
ControlTriggerOp); ControlTriggerOp);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
// When called, abort op will abort the current process. This can be used to // When called, abort op will abort the current process. This can be used to
// abort remote PSs when needed. // abort remote PSs when needed.


@ -91,6 +91,7 @@ class KilledBySignal {
public: public:
explicit KilledBySignal(int signum) : signum_(signum) {} explicit KilledBySignal(int signum) : signum_(signum) {}
bool operator()(int exit_status) const { return exit_status == signum_; } bool operator()(int exit_status) const { return exit_status == signum_; }
private: private:
const int signum_; const int signum_;
}; };


@ -688,7 +688,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit( static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit(
// default value is in bytes despite the name of the environment variable // default value is in bytes despite the name of the environment variable
"TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB
); );
int device_id = stream->parent()->device_ordinal(); int device_id = stream->parent()->device_ordinal();
DataType dtype = input.dtype(); DataType dtype = input.dtype();


@ -679,8 +679,9 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
const int dims = resized_shape.dims(); const int dims = resized_shape.dims();
OP_REQUIRES( OP_REQUIRES(
context, TensorShapeUtils::IsMatrix(paddings.shape()) && context,
paddings.dim_size(1) == 2, TensorShapeUtils::IsMatrix(paddings.shape()) &&
paddings.dim_size(1) == 2,
errors::InvalidArgument("paddings must be a matrix with 2 columns: ", errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
paddings.shape().DebugString())); paddings.shape().DebugString()));
const int fixed_dims = const int fixed_dims =
@ -715,20 +716,22 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
const int32 after = const int32 after =
paddings_matrix(d, 1); // Pad after existing elements. paddings_matrix(d, 1); // Pad after existing elements.
OP_REQUIRES(context, before >= 0 && after >= 0, OP_REQUIRES(context, before >= 0 && after >= 0,
errors::InvalidArgument("paddings must be non-negative: ", errors::InvalidArgument(
before, " ", after)); "paddings must be non-negative: ", before, " ", after));
if (offset_ == 0) { // SYMMETRIC mode. if (offset_ == 0) { // SYMMETRIC mode.
OP_REQUIRES( OP_REQUIRES(
context, before <= resized_shape.dim_size(d) && context,
after <= resized_shape.dim_size(d), before <= resized_shape.dim_size(d) &&
after <= resized_shape.dim_size(d),
errors::InvalidArgument("paddings must be no greater " errors::InvalidArgument("paddings must be no greater "
"than the dimension size: ", "than the dimension size: ",
before, ", ", after, " greater than ", before, ", ", after, " greater than ",
resized_shape.dim_size(d))); resized_shape.dim_size(d)));
} else if (offset_ == 1) { // REFLECT mode. } else if (offset_ == 1) { // REFLECT mode.
OP_REQUIRES( OP_REQUIRES(
context, before < resized_shape.dim_size(d) && context,
after < resized_shape.dim_size(d), before < resized_shape.dim_size(d) &&
after < resized_shape.dim_size(d),
errors::InvalidArgument("paddings must be less than" errors::InvalidArgument("paddings must be less than"
" the dimension size: ", " the dimension size: ",
before, ", ", after, " not less than ", before, ", ", after, " not less than ",
@ -767,18 +770,19 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
// We only check the first three dims, since the depth is accessed as an // We only check the first three dims, since the depth is accessed as an
// int64 below. // int64 below.
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i), OP_REQUIRES(
std::numeric_limits<int>::max()), context,
errors::InvalidArgument("filter too large")); FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
errors::InvalidArgument("filter too large"));
} }
// The last dimension for input is in_depth. It must be the same as the // The last dimension for input is in_depth. It must be the same as the
// filter's in_depth. // filter's in_depth.
const int64 in_depth = padded_shape.dim_size(3); const int64 in_depth = padded_shape.dim_size(3);
OP_REQUIRES( OP_REQUIRES(context, in_depth == filter.dim_size(2),
context, in_depth == filter.dim_size(2), errors::InvalidArgument(
errors::InvalidArgument("input and filter must have the same depth: ", "input and filter must have the same depth: ", in_depth,
in_depth, " vs ", filter.dim_size(2))); " vs ", filter.dim_size(2)));
// The last dimension for filter is out_depth. // The last dimension for filter is out_depth.
const int out_depth = static_cast<int>(filter.dim_size(3)); const int out_depth = static_cast<int>(filter.dim_size(3));
@ -786,9 +790,10 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
// The second dimension for input is rows/height. // The second dimension for input is rows/height.
// The first dimension for filter is rows/height. // The first dimension for filter is rows/height.
const int64 padded_rows_raw = padded_shape.dim_size(1); const int64 padded_rows_raw = padded_shape.dim_size(1);
OP_REQUIRES(context, FastBoundsCheck(padded_rows_raw, OP_REQUIRES(
std::numeric_limits<int>::max()), context,
errors::InvalidArgument("Input rows too large")); FastBoundsCheck(padded_rows_raw, std::numeric_limits<int>::max()),
errors::InvalidArgument("Input rows too large"));
const int padded_rows = static_cast<int>(padded_rows_raw); const int padded_rows = static_cast<int>(padded_rows_raw);
const int filter_rows = static_cast<int>(filter.dim_size(0)); const int filter_rows = static_cast<int>(filter.dim_size(0));
const int resized_rows = static_cast<int>(resized_shape.dim_size(1)); const int resized_rows = static_cast<int>(resized_shape.dim_size(1));
@ -796,9 +801,10 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
// The third dimension for input is columns/width. // The third dimension for input is columns/width.
// The second dimension for filter is columns/width. // The second dimension for filter is columns/width.
const int64 padded_cols_raw = padded_shape.dim_size(2); const int64 padded_cols_raw = padded_shape.dim_size(2);
OP_REQUIRES(context, FastBoundsCheck(padded_cols_raw, OP_REQUIRES(
std::numeric_limits<int>::max()), context,
errors::InvalidArgument("Input cols too large")); FastBoundsCheck(padded_cols_raw, std::numeric_limits<int>::max()),
errors::InvalidArgument("Input cols too large"));
const int padded_cols = static_cast<int>(padded_cols_raw); const int padded_cols = static_cast<int>(padded_cols_raw);
const int filter_cols = static_cast<int>(filter.dim_size(1)); const int filter_cols = static_cast<int>(filter.dim_size(1));
const int resized_cols = static_cast<int>(resized_shape.dim_size(2)); const int resized_cols = static_cast<int>(resized_shape.dim_size(2));
@ -864,24 +870,26 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
TF_DISALLOW_COPY_AND_ASSIGN(FusedResizeConv2DUsingGemmOp); TF_DISALLOW_COPY_AND_ASSIGN(FusedResizeConv2DUsingGemmOp);
}; };
#define REGISTER_FUSED(T) \ #define REGISTER_FUSED(T) \
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
Name("FusedResizeAndPadConv2D") \ Name("FusedResizeAndPadConv2D") \
.Device(DEVICE_CPU) \ .Device(DEVICE_CPU) \
.TypeConstraint<T>("T"), \ .TypeConstraint<T>("T"), \
FusedResizeConv2DUsingGemmOp< \ FusedResizeConv2DUsingGemmOp< \
T, FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \ T, \
BILINEAR>, \ FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
BILINEAR>, \
true>); true>);
TF_CALL_float(REGISTER_FUSED); TF_CALL_float(REGISTER_FUSED);
#define REGISTER_PAD_ONLY_FUSED(T) \ #define REGISTER_PAD_ONLY_FUSED(T) \
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
FusedResizeConv2DUsingGemmOp< \ FusedResizeConv2DUsingGemmOp< \
T, FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \ T, \
NEAREST>, \ FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
NEAREST>, \
false>); false>);
TF_CALL_float(REGISTER_PAD_ONLY_FUSED); TF_CALL_float(REGISTER_PAD_ONLY_FUSED);


@ -27,7 +27,6 @@ limitations under the License.
namespace tensorflow { namespace tensorflow {
// Get the Cudnn workspace limit from the environment variable, which is in MB. // Get the Cudnn workspace limit from the environment variable, which is in MB.
// Return the workspace memory limit in bytes. If no value is set, return the // Return the workspace memory limit in bytes. If no value is set, return the
// default value. // default value.
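Aside, not the TensorFlow implementation: the comment above describes a lookup whose unit handling is easy to misread, since the environment variable is given in megabytes while the default and the return value are in bytes. A minimal sketch of that contract, with illustrative names:

#include <cstdint>
#include <cstdlib>

// Read a limit expressed in MB from the environment and return it in bytes;
// if the variable is unset, fall back to default_bytes (already in bytes).
int64_t WorkspaceLimitBytes(const char* env_var_name, int64_t default_bytes) {
  const char* value = std::getenv(env_var_name);
  if (value == nullptr) return default_bytes;
  const int64_t megabytes = std::strtoll(value, nullptr, 10);
  return megabytes * (1LL << 20);  // MB -> bytes
}

// e.g. WorkspaceLimitBytes("TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 /* 4GB */);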


@ -25,9 +25,9 @@ limitations under the License.
#include "cuda/include/cuda.h" #include "cuda/include/cuda.h"
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/conv_2d.h" #include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/lib/math/math_util.h"
#include "tensorflow/core/util/cuda_kernel_helper.h" #include "tensorflow/core/util/cuda_kernel_helper.h"
#include "tensorflow/core/util/tensor_format.h" #include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/lib/math/math_util.h"
namespace tensorflow { namespace tensorflow {
@ -252,11 +252,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
int x = threadIdx.x; int x = threadIdx.x;
Dimension<3> output_dims = { Dimension<3> output_dims = {
input_dims[0], input_dims[2], input_dims[1], input_dims[0],
input_dims[2],
input_dims[1],
}; };
Dimension<3> input_dims_in_tiles = { Dimension<3> input_dims_in_tiles = {
input_dims[0], (input_dims[1] + TileSizeI - 1) / TileSizeI, input_dims[0],
(input_dims[1] + TileSizeI - 1) / TileSizeI,
(input_dims[2] + TileSizeJ - 1) / TileSizeJ, (input_dims[2] + TileSizeJ - 1) / TileSizeJ,
}; };
@ -264,7 +267,8 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
FlatToTensorIndex(blockIdx.x, input_dims_in_tiles); FlatToTensorIndex(blockIdx.x, input_dims_in_tiles);
Index<3> input_tile_origin = { Index<3> input_tile_origin = {
input_tile_index[0], input_tile_index[1] * TileSizeI, input_tile_index[0],
input_tile_index[1] * TileSizeI,
input_tile_index[2] * TileSizeJ, input_tile_index[2] * TileSizeJ,
}; };
@ -322,11 +326,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
__syncthreads(); __syncthreads();
Index<3> output_tile_index = { Index<3> output_tile_index = {
input_tile_index[0], input_tile_index[2], input_tile_index[1], input_tile_index[0],
input_tile_index[2],
input_tile_index[1],
}; };
Index<3> output_tile_origin = { Index<3> output_tile_origin = {
output_tile_index[0], output_tile_index[1] * TileSizeJ, output_tile_index[0],
output_tile_index[1] * TileSizeJ,
output_tile_index[2] * TileSizeI, output_tile_index[2] * TileSizeI,
}; };
@ -799,7 +806,7 @@ struct TransposeElemType<16> {
// A helper function to make RunSwapDimension1And2InTensor3 concise. This // A helper function to make RunSwapDimension1And2InTensor3 concise. This
// helper function looks at the data type and input matrix sizes and decides // helper function looks at the data type and input matrix sizes and decides
// the thread numbers and tile sizes to use. // the thread numbers and tile sizes to use.
template <typename T, bool conjugate = false > template <typename T, bool conjugate = false>
void SwapDimension1And2InTensor3WithNarrowMatrices( void SwapDimension1And2InTensor3WithNarrowMatrices(
const GPUDevice& d, const T* input, const Dimension<3>& input_dims, const GPUDevice& d, const T* input, const Dimension<3>& input_dims,
T* output, const int kMinDimensionToUseTiles) { T* output, const int kMinDimensionToUseTiles) {
@ -902,19 +909,21 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input,
constexpr int kNumThreads = 256; constexpr int kNumThreads = 256;
Dimension<3> input_dims_in_tiles = { Dimension<3> input_dims_in_tiles = {
input_dims[0], MathUtil::CeilOfRatio<int>(input_dims[1], kTileSize), input_dims[0],
MathUtil::CeilOfRatio<int>(input_dims[1], kTileSize),
MathUtil::CeilOfRatio<int>(input_dims[2], kTileSize), MathUtil::CeilOfRatio<int>(input_dims[2], kTileSize),
}; };
int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] * int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] *
input_dims_in_tiles[2]; input_dims_in_tiles[2];
SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize, kTileSize, conjugate> SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize, kTileSize,
conjugate>
<<<total_tiles_count, kNumThreads, 0, d.stream()>>>(input, input_dims, <<<total_tiles_count, kNumThreads, 0, d.stream()>>>(input, input_dims,
output); output);
} else if (narrow_matrix) { } else if (narrow_matrix) {
SwapDimension1And2InTensor3WithNarrowMatrices<T, conjugate>(d, input, input_dims, output, SwapDimension1And2InTensor3WithNarrowMatrices<T, conjugate>(
kMinDimensionToUseTiles); d, input, input_dims, output, kMinDimensionToUseTiles);
} else { } else {
int total_element_count = input_dims[0] * input_dims[1] * input_dims[2]; int total_element_count = input_dims[0] * input_dims[1] * input_dims[2];
CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d); CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d);
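Aside, not from this commit: the reformatted initializer above sizes the CUDA grid by counting kTileSize x kTileSize tiles with a ceiling division. A small self-contained sketch of that arithmetic, with illustrative names:

// Ceiling division for positive integers, the same quantity
// MathUtil::CeilOfRatio<int> produces above.
int CeilOfRatioInt(int a, int b) { return (a + b - 1) / b; }

// Number of tile_size x tile_size tiles needed to cover each of `planes`
// matrices of shape rows x cols; this becomes the CUDA grid size.
int TotalTileCount(int planes, int rows, int cols, int tile_size) {
  return planes * CeilOfRatioInt(rows, tile_size) * CeilOfRatioInt(cols, tile_size);
}

// e.g. TotalTileCount(2, 70, 33, /*tile_size=*/32) == 2 * 3 * 2 == 12 blocks.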


@ -468,18 +468,19 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
filter.shape().DebugString())); filter.shape().DebugString()));
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i), OP_REQUIRES(
std::numeric_limits<int>::max()), context,
errors::InvalidArgument("filter too large")); FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
errors::InvalidArgument("filter too large"));
} }
// The last dimension for input is in_depth. It must be the same as the // The last dimension for input is in_depth. It must be the same as the
// filter's in_depth. // filter's in_depth.
const int64 in_depth = GetTensorDim(input, data_format_, 'C'); const int64 in_depth = GetTensorDim(input, data_format_, 'C');
OP_REQUIRES( OP_REQUIRES(context, in_depth == filter.dim_size(2),
context, in_depth == filter.dim_size(2), errors::InvalidArgument(
errors::InvalidArgument("input and filter must have the same depth: ", "input and filter must have the same depth: ", in_depth,
in_depth, " vs ", filter.dim_size(2))); " vs ", filter.dim_size(2)));
// The last dimension for filter is out_depth. // The last dimension for filter is out_depth.
const int out_depth = static_cast<int>(filter.dim_size(3)); const int out_depth = static_cast<int>(filter.dim_size(3));
@ -487,18 +488,20 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
// The second dimension for input is rows/height. // The second dimension for input is rows/height.
// The first dimension for filter is rows/height. // The first dimension for filter is rows/height.
const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H'); const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H');
OP_REQUIRES(context, FastBoundsCheck(input_rows_raw, OP_REQUIRES(
std::numeric_limits<int>::max()), context,
errors::InvalidArgument("Input rows too large")); FastBoundsCheck(input_rows_raw, std::numeric_limits<int>::max()),
errors::InvalidArgument("Input rows too large"));
const int input_rows = static_cast<int>(input_rows_raw); const int input_rows = static_cast<int>(input_rows_raw);
const int filter_rows = static_cast<int>(filter.dim_size(0)); const int filter_rows = static_cast<int>(filter.dim_size(0));
// The third dimension for input is columns/width. // The third dimension for input is columns/width.
// The second dimension for filter is columns/width. // The second dimension for filter is columns/width.
const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W'); const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W');
OP_REQUIRES(context, FastBoundsCheck(input_cols_raw, OP_REQUIRES(
std::numeric_limits<int>::max()), context,
errors::InvalidArgument("Input cols too large")); FastBoundsCheck(input_cols_raw, std::numeric_limits<int>::max()),
errors::InvalidArgument("Input cols too large"));
const int input_cols = static_cast<int>(input_cols_raw); const int input_cols = static_cast<int>(input_cols_raw);
const int filter_cols = static_cast<int>(filter.dim_size(1)); const int filter_cols = static_cast<int>(filter.dim_size(1));


@ -17,8 +17,8 @@ limitations under the License.
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "tensorflow/core/kernels/cross_op.h"
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/cross_op.h"
namespace tensorflow { namespace tensorflow {


@ -19,13 +19,13 @@ limitations under the License.
#include <limits> #include <limits>
#include "tensorflow/core/util/ctc/ctc_beam_search.h"
#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/util/ctc/ctc_beam_search.h"
#include "tensorflow/core/util/sparse/sparse_tensor.h" #include "tensorflow/core/util/sparse/sparse_tensor.h"
namespace tensorflow { namespace tensorflow {
@ -80,16 +80,17 @@ class CTCDecodeHelper {
if (!(batch_size == (*seq_len)->dim_size(0))) { if (!(batch_size == (*seq_len)->dim_size(0))) {
return errors::FailedPrecondition( return errors::FailedPrecondition(
"len(sequence_length) != batch_size. ", "len(sequence_length): ", "len(sequence_length) != batch_size. ",
(*seq_len)->dim_size(0), " batch_size: ", batch_size); "len(sequence_length): ", (*seq_len)->dim_size(0),
" batch_size: ", batch_size);
} }
auto seq_len_t = (*seq_len)->vec<int32>(); auto seq_len_t = (*seq_len)->vec<int32>();
for (int b = 0; b < batch_size; ++b) { for (int b = 0; b < batch_size; ++b) {
if (!(seq_len_t(b) <= max_time)) { if (!(seq_len_t(b) <= max_time)) {
return errors::FailedPrecondition("sequence_length(", b, ") <= ", return errors::FailedPrecondition("sequence_length(", b,
max_time); ") <= ", max_time);
} }
} }


@ -113,8 +113,8 @@ class CTCLossOp : public OpKernel {
const int64 batch_indices = g.group()[0]; const int64 batch_indices = g.group()[0];
OP_REQUIRES(ctx, FastBoundsCheck(batch_indices, batch_size), OP_REQUIRES(ctx, FastBoundsCheck(batch_indices, batch_size),
errors::InvalidArgument("labels batch index must be between ", errors::InvalidArgument("labels batch index must be between ",
0, " and ", batch_size, " but saw: ", 0, " and ", batch_size,
batch_indices)); " but saw: ", batch_indices));
auto values = g.values<int32>(); auto values = g.values<int32>();
std::vector<int>* b_values = &labels_t[batch_indices]; std::vector<int>* b_values = &labels_t[batch_indices];


@ -45,5 +45,5 @@ REGISTER_KERNEL_BUILDER(Name("Abs")
.HostMemory("y") .HostMemory("y")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
UnaryOp<CPUDevice, functor::abs<int32>>); UnaryOp<CPUDevice, functor::abs<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow


@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Acos", functor::acos, float, double);
#if TENSORFLOW_USE_SYCL #if TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Acos", functor::acos, float, double); REGISTER2(UnaryOp, SYCL, "Acos", functor::acos, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow


@ -17,12 +17,12 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_gradients.h" #include "tensorflow/core/kernels/cwise_ops_gradients.h"
namespace tensorflow { namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double, REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double, complex64,
complex64, complex128); complex128);
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Acosh", functor::acosh, float, double); REGISTER2(UnaryOp, SYCL, "Acosh", functor::acosh, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA #if GOOGLE_CUDA
REGISTER2(UnaryOp, GPU, "Acosh", functor::acosh, float, double); REGISTER2(UnaryOp, GPU, "Acosh", functor::acosh, float, double);


@ -44,7 +44,6 @@ REGISTER_KERNEL_BUILDER(Name("AddV2")
BinaryOp<CPUDevice, functor::add<int32>>); BinaryOp<CPUDevice, functor::add<int32>>);
#endif #endif
#if TENSORFLOW_USE_SYCL #if TENSORFLOW_USE_SYCL
#define REGISTER_KERNEL(type) \ #define REGISTER_KERNEL(type) \
REGISTER(BinaryOp, SYCL, "Add", functor::add, type); \ REGISTER(BinaryOp, SYCL, "Add", functor::add, type); \
@ -66,5 +65,5 @@ REGISTER_KERNEL_BUILDER(Name("AddV2")
.HostMemory("z") .HostMemory("z")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::add<int32>>); BinaryOp<CPUDevice, functor::add<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow


@ -22,8 +22,8 @@ namespace tensorflow {
// sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__. // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
#if !defined(__ANDROID_TYPES_SLIM__) #if !defined(__ANDROID_TYPES_SLIM__)
REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64, REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64, uint8,
uint8, complex128, string); complex128, string);
// Notice: String is excluded to allow marking AddV2 is_commutative and // Notice: String is excluded to allow marking AddV2 is_commutative and
// is_aggregate. // is_aggregate.
REGISTER5(BinaryOp, CPU, "AddV2", functor::add, int8, int16, complex64, uint8, REGISTER5(BinaryOp, CPU, "AddV2", functor::add, int8, int16, complex64, uint8,


@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Asin", functor::asin, float, double);
#if TENSORFLOW_USE_SYCL #if TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Asin", functor::asin, float, double); REGISTER2(UnaryOp, SYCL, "Asin", functor::asin, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow


@@ -1,10 +1,10 @@
 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -17,8 +17,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_gradients.h"
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double, complex64,
+          complex128);
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Asinh", functor::asinh, float, double);

View File

@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Atan", functor::atan, float, double);
#if TENSORFLOW_USE_SYCL #if TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Atan", functor::atan, float, double); REGISTER2(UnaryOp, SYCL, "Atan", functor::atan, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@@ -17,8 +17,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_gradients.h"
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double, complex64,
+          complex128);
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Atanh", functor::atanh, float, double);

View File

@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "Ceil", functor::ceil, float, Eigen::half, double);
#if TENSORFLOW_USE_SYCL #if TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Ceil", functor::ceil, float, double); REGISTER2(UnaryOp, SYCL, "Ceil", functor::ceil, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Cos", functor::cos, float, Eigen::half, double);
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Cos", functor::cos, float, double); REGISTER2(UnaryOp, SYCL, "Cos", functor::cos, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@@ -16,20 +16,18 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double, complex64,
+          complex128);
 #if TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(TYPE) \
   REGISTER_KERNEL_BUILDER( \
-      Name("Cosh") \
-          .Device(DEVICE_SYCL) \
-          .TypeConstraint<TYPE>("T"), \
-      UnaryOp<SYCLDevice, functor::cosh<TYPE>>);
+      Name("Cosh").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
+      UnaryOp<SYCLDevice, functor::cosh<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
 REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 #endif // TENSORFLOW_USE_SYCL
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Cosh", functor::cosh, float, double);

View File

@ -54,5 +54,5 @@ REGISTER_KERNEL_BUILDER(Name("Div")
.HostMemory("z") .HostMemory("z")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::safe_div<int32>>); BinaryOp<CPUDevice, functor::safe_div<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -26,5 +26,5 @@ REGISTER5(UnaryOp, GPU, "Exp", functor::exp, float, Eigen::half, double,
#if TENSORFLOW_USE_SYCL #if TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Exp", functor::exp, float, double); REGISTER2(UnaryOp, SYCL, "Exp", functor::exp, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -23,5 +23,5 @@ REGISTER3(UnaryOp, GPU, "Expm1", functor::expm1, float, Eigen::half, double);
#endif #endif
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Expm1", functor::expm1, float, double); REGISTER2(UnaryOp, SYCL, "Expm1", functor::expm1, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -23,5 +23,5 @@ REGISTER3(UnaryOp, GPU, "Floor", functor::floor, float, Eigen::half, double);
#endif #endif
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Floor", functor::floor, float, double); REGISTER2(UnaryOp, SYCL, "Floor", functor::floor, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -49,5 +49,5 @@ REGISTER_KERNEL_BUILDER(Name("FloorDiv")
.HostMemory("z") .HostMemory("z")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::safe_floor_div<int32>>); BinaryOp<CPUDevice, functor::safe_floor_div<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -40,5 +40,5 @@ REGISTER_KERNEL_BUILDER(Name("FloorMod")
.HostMemory("z") .HostMemory("z")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::safe_floor_mod<int32>>); BinaryOp<CPUDevice, functor::safe_floor_mod<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@@ -19,8 +19,8 @@ limitations under the License.
 namespace tensorflow {
 namespace functor {
 DEFINE_UNARY1(conj, complex64);
 DEFINE_UNARY1(conj, complex128);
 } // namespace functor
 } // namespace tensorflow

View File

@@ -20,7 +20,7 @@ limitations under the License.
 namespace tensorflow {
 namespace functor {
 DEFINE_BINARY10(equal_to, float, Eigen::half, double, uint8, int8, int16, int64,
                 complex64, complex128, bool);
 DEFINE_APPROXIMATE_EQUAL2(float, double);
 } // namespace functor
 } // namespace tensorflow

View File

@@ -15,8 +15,10 @@ limitations under the License.
 #if GOOGLE_CUDA
-#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+#define EIGEN_USE_GPU
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
 namespace tensorflow {
 namespace functor {
@@ -38,19 +40,17 @@ struct SelectScalarFunctor<GPUDevice, T> {
                   typename TTypes<bool>::ConstScalar cond,
                   typename TTypes<T>::ConstFlat then_flat,
                   typename TTypes<T>::ConstFlat else_flat) {
 #if !defined(EIGEN_HAS_INDEX_LIST)
     Eigen::array<int, 1> rank1{1};
 #else
-    Eigen::IndexList<Eigen::type2index<1>> rank1;
+    Eigen::IndexList<Eigen::type2index<1> > rank1;
 #endif
     const int size = then_flat.dimension(0);
     Eigen::array<int, 1> broadcast_dims{size};
     To32Bit(out).device(d) = cond.reshape(rank1)
                                  .broadcast(broadcast_dims)
                                  .select(then_flat, else_flat);
   }
 };
@@ -89,8 +89,8 @@ struct BatchSelectFunctor<GPUDevice, T> {
   }
 };
 #define SELECT_FUNCTOR(T) \
   template struct SelectFunctor<GPUDevice, T>; \
   template struct SelectScalarFunctor<GPUDevice, T>; \
   template struct BatchSelectFunctor<GPUDevice, T>;
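In the SelectScalarFunctor hunk above, a scalar condition is reshaped to rank 1, broadcast to the length of the flattened value tensors, and then used in an elementwise select. The plain-C++ sketch below is illustrative only; it stands in for the Eigen expression cond.reshape(rank1).broadcast(broadcast_dims).select(then_flat, else_flat) and shows the semantics that expression computes.

#include <cassert>
#include <iostream>
#include <vector>

// Elementwise equivalent of broadcasting a scalar condition over the two
// flattened value tensors and selecting between them.
std::vector<float> SelectScalar(bool cond,
                                const std::vector<float>& then_vals,
                                const std::vector<float>& else_vals) {
  assert(then_vals.size() == else_vals.size());
  std::vector<float> out(then_vals.size());
  for (size_t i = 0; i < out.size(); ++i) {
    out[i] = cond ? then_vals[i] : else_vals[i];
  }
  return out;
}

int main() {
  std::vector<float> t = {1, 2, 3}, e = {10, 20, 30};
  for (float v : SelectScalar(false, t, e)) std::cout << v << " ";  // 10 20 30
  std::cout << "\n";
  return 0;
}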

View File

@ -43,5 +43,5 @@ REGISTER_KERNEL_BUILDER(Name("Greater")
.HostMemory("z") .HostMemory("z")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::greater<int32>>); BinaryOp<CPUDevice, functor::greater<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -35,7 +35,8 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
#endif #endif
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float, double); REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float,
double);
REGISTER_KERNEL_BUILDER(Name("GreaterEqual") REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
.Device(DEVICE_SYCL) .Device(DEVICE_SYCL)
@ -44,5 +45,5 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
.HostMemory("z") .HostMemory("z")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::greater_equal<int32>>); BinaryOp<CPUDevice, functor::greater_equal<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -21,7 +21,7 @@ REGISTER6(UnaryOp, CPU, "Invert", functor::invert, int8, int16, int32, int64,
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER6(UnaryOp, SYCL, "Invert", functor::invert, int8, int16, int32, int64, REGISTER6(UnaryOp, SYCL, "Invert", functor::invert, int8, int16, int32, int64,
uint8, uint16); uint8, uint16);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA #if GOOGLE_CUDA

View File

@ -26,5 +26,5 @@ REGISTER3(UnaryOp, GPU, "IsFinite", functor::isfinite, float, Eigen::half,
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "IsFinite", functor::isfinite, float, double); REGISTER2(UnaryOp, SYCL, "IsFinite", functor::isfinite, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "IsInf", functor::isinf, float, Eigen::half, double);
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "IsInf", functor::isinf, float, double); REGISTER2(UnaryOp, SYCL, "IsInf", functor::isinf, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "IsNan", functor::isnan, float, Eigen::half, double);
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "IsNan", functor::isnan, float, double); REGISTER2(UnaryOp, SYCL, "IsNan", functor::isnan, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -42,5 +42,5 @@ REGISTER_KERNEL_BUILDER(Name("Less")
.HostMemory("z") .HostMemory("z")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::less<int32>>); BinaryOp<CPUDevice, functor::less<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -44,5 +44,5 @@ REGISTER_KERNEL_BUILDER(Name("LessEqual")
.HostMemory("z") .HostMemory("z")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::less_equal<int32>>); BinaryOp<CPUDevice, functor::less_equal<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Log", functor::log, float, Eigen::half, double);
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Log", functor::log, float, double); REGISTER2(UnaryOp, SYCL, "Log", functor::log, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Log1p", functor::log1p, float, Eigen::half, double);
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Log1p", functor::log1p, float, double); REGISTER2(UnaryOp, SYCL, "Log1p", functor::log1p, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -43,5 +43,5 @@ REGISTER_KERNEL_BUILDER(Name("Maximum")
.HostMemory("z") .HostMemory("z")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::maximum<int32>>); BinaryOp<CPUDevice, functor::maximum<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -43,6 +43,6 @@ REGISTER_KERNEL_BUILDER(Name("Minimum")
.HostMemory("z") .HostMemory("z")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::minimum<int32>>); BinaryOp<CPUDevice, functor::minimum<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@@ -17,8 +17,8 @@ limitations under the License.
 namespace tensorflow {
-REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double,
-          uint8, int32);
+REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, uint8,
+          int32);
 #if defined(__ANDROID_TYPES_SLIM__)
 // We only register the first type when we have multi-argument calls in the
 // case where we're trying to reduce executable size, but it turns out that the
@@ -28,7 +28,7 @@ REGISTER(BinaryOp, CPU, "Mul", functor::mul, int32);
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "Mul", functor::mul, float, Eigen::half, double,
           uint8);
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
 // registration requires all int32 inputs and outputs to be in host memory.
@@ -50,5 +50,5 @@ REGISTER_KERNEL_BUILDER(Name("Mul")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::mul<int32>>);
 #endif // TENSORFLOW_USE_SYCL
 } // namespace tensorflow
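The __ANDROID_TYPES_SLIM__ comment in the hunk above notes that slim builds register "Mul" for int32 only, trading dtype coverage for a smaller executable. As a rough, standalone illustration of that trade-off (the SLIM_BUILD flag and Mul function are hypothetical, not the TensorFlow macros), explicit template instantiation under a preprocessor switch controls how many per-type kernels end up in the binary:

#include <iostream>

template <typename T>
T Mul(T a, T b) {
  return a * b;
}

// Hypothetical flag standing in for __ANDROID_TYPES_SLIM__: slim builds keep
// only the int kernel, full builds instantiate the whole dtype list.
#if defined(SLIM_BUILD)
template int Mul<int>(int, int);
#else
template int Mul<int>(int, int);
template float Mul<float>(float, float);
template double Mul<double>(double, double);
#endif

int main() {
  std::cout << Mul(6, 7) << "\n";  // prints 42
  return 0;
}

Building with -DSLIM_BUILD emits code for one instantiation instead of three; the real mechanism works the same way, just with kernel registrations rather than bare instantiations.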

View File

@@ -22,11 +22,11 @@ namespace tensorflow {
 // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
 #if !defined(__ANDROID_TYPES_SLIM__)
-REGISTER6(BinaryOp, CPU, "Mul", functor::mul,
-          int8, uint16, int16, int64, complex64, complex128);
+REGISTER6(BinaryOp, CPU, "Mul", functor::mul, int8, uint16, int16, int64,
+          complex64, complex128);
 #if GOOGLE_CUDA
 REGISTER6(BinaryOp, GPU, "Mul", functor::mul, int8, uint16, int16, int64,
           complex64, complex128);
 #endif // GOOGLE_CUDA

View File

@ -27,7 +27,7 @@ REGISTER_KERNEL_BUILDER(Name("Neg")
.HostMemory("y") .HostMemory("y")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
UnaryOp<CPUDevice, functor::neg<int32>>); UnaryOp<CPUDevice, functor::neg<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA #if GOOGLE_CUDA
REGISTER6(UnaryOp, GPU, "Neg", functor::neg, float, Eigen::half, double, int64, REGISTER6(UnaryOp, GPU, "Neg", functor::neg, float, Eigen::half, double, int64,

View File

@@ -17,7 +17,7 @@ limitations under the License.
 namespace tensorflow {
 REGISTER6(BinaryOp, CPU, "NotEqual", functor::not_equal_to, float, Eigen::half,
           double, uint8, int8, int16);
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "NotEqual", functor::not_equal_to, float, Eigen::half,
           double, uint8);

View File

@ -30,5 +30,5 @@ REGISTER6(BinaryOp, GPU, "NotEqual", functor::not_equal_to, int8, int16, int64,
#endif // GOOGLE_CUDA #endif // GOOGLE_CUDA
#endif // !defined(__ANDROID_TYPES_SLIM__) #endif // !defined(__ANDROID_TYPES_SLIM__)
} // namespace tensorflow } // namespace tensorflow

View File

@ -38,7 +38,7 @@ REGISTER4(UnaryOp, GPU, "Reciprocal", functor::inverse, float, Eigen::half,
#endif #endif
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER(UnaryOp, SYCL, "Reciprocal", functor::inverse, float); REGISTER(UnaryOp, SYCL, "Reciprocal", functor::inverse, float);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
REGISTER5(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float, REGISTER5(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float,
Eigen::half, double, complex64, complex128); Eigen::half, double, complex64, complex128);
@ -48,5 +48,5 @@ REGISTER3(SimpleBinaryOp, GPU, "ReciprocalGrad", functor::inverse_grad, float,
#endif #endif
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER(SimpleBinaryOp, SYCL, "ReciprocalGrad", functor::inverse_grad, float); REGISTER(SimpleBinaryOp, SYCL, "ReciprocalGrad", functor::inverse_grad, float);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@@ -30,7 +30,7 @@ typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
 #endif // TENSORFLOW_USE_SYCL
 template <typename Device, typename T>
 class SelectOp : public OpKernel {
@@ -185,7 +185,7 @@ REGISTER_SELECT_SYCL(double);
 REGISTER_SELECT_SYCL(int32);
 REGISTER_SELECT_SYCL(int64);
 #undef REGISTER_SELECT_SYCL
 #endif // TENSORFLOW_USE_SYCL
 namespace functor {
@@ -201,13 +201,11 @@ struct SelectFunctorBase {
 };
 template <typename T>
-struct SelectFunctor<CPUDevice, T>
-    : SelectFunctorBase<CPUDevice, T> {};
+struct SelectFunctor<CPUDevice, T> : SelectFunctorBase<CPUDevice, T> {};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
-struct SelectFunctor<SYCLDevice, T>
-    : SelectFunctorBase<SYCLDevice, T> {};
+struct SelectFunctor<SYCLDevice, T> : SelectFunctorBase<SYCLDevice, T> {};
 #endif // TENSORFLOW_USE_SYCL
 template <typename Device, typename T>
 struct SelectScalarFunctorBase {
@@ -222,12 +220,12 @@ struct SelectScalarFunctorBase {
 // CPU Specializations of Select functors with scalar
 template <typename T>
 struct SelectScalarFunctor<CPUDevice, T>
     : SelectScalarFunctorBase<CPUDevice, T> {};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct SelectScalarFunctor<SYCLDevice, T>
     : SelectScalarFunctorBase<SYCLDevice, T> {};
 #endif // TENSORFLOW_USE_SYCL
@@ -240,8 +238,8 @@ struct BatchSelectFunctorBase {
   const Eigen::DenseIndex all_but_batch = then_flat_outer_dims.dimension(1);
 #if !defined(EIGEN_HAS_INDEX_LIST)
-  Eigen::array<Eigen::DenseIndex, 2> broadcast_dims{{ 1, all_but_batch }};
-  Eigen::Tensor<Eigen::DenseIndex, 2>::Dimensions reshape_dims{{ batch, 1 }};
+  Eigen::array<Eigen::DenseIndex, 2> broadcast_dims{{1, all_but_batch}};
+  Eigen::Tensor<Eigen::DenseIndex, 2>::Dimensions reshape_dims{{batch, 1}};
 #else
   Eigen::IndexList<Eigen::type2index<1>, Eigen::DenseIndex> broadcast_dims;
   broadcast_dims.set(1, all_but_batch);
@@ -257,13 +255,13 @@ struct BatchSelectFunctorBase {
 };
 template <typename T>
-struct BatchSelectFunctor<CPUDevice, T>
-    : BatchSelectFunctorBase<CPUDevice, T> {};
+struct BatchSelectFunctor<CPUDevice, T> : BatchSelectFunctorBase<CPUDevice, T> {
+};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct BatchSelectFunctor<SYCLDevice, T>
     : BatchSelectFunctorBase<SYCLDevice, T> {};
 #endif // TENSORFLOW_USE_SYCL
 } // namespace functor
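In the BatchSelectFunctorBase hunk above, reshape_dims is {batch, 1} and broadcast_dims is {1, all_but_batch}, so the per-row condition is stretched across every element of its row before the select. The following self-contained sketch (plain loops instead of the Eigen reshape/broadcast expression, function and parameter names chosen here for illustration) shows that row-wise behaviour:

#include <cassert>
#include <iostream>
#include <vector>

// cond has one entry per batch row; then_vals/else_vals are flattened to
// [batch, all_but_batch]. Row i of the output comes from then_vals if
// cond[i] is true, otherwise from else_vals - the effect of reshaping cond
// to {batch, 1} and broadcasting it to {batch, all_but_batch}.
std::vector<float> BatchSelect(const std::vector<bool>& cond,
                               const std::vector<float>& then_vals,
                               const std::vector<float>& else_vals,
                               size_t all_but_batch) {
  const size_t batch = cond.size();
  assert(then_vals.size() == batch * all_but_batch);
  assert(else_vals.size() == batch * all_but_batch);
  std::vector<float> out(batch * all_but_batch);
  for (size_t i = 0; i < batch; ++i) {
    for (size_t j = 0; j < all_but_batch; ++j) {
      const size_t k = i * all_but_batch + j;
      out[k] = cond[i] ? then_vals[k] : else_vals[k];
    }
  }
  return out;
}

int main() {
  // batch = 2, all_but_batch = 3
  std::vector<float> t = {1, 2, 3, 4, 5, 6}, e = {9, 9, 9, 8, 8, 8};
  for (float v : BatchSelect({true, false}, t, e, 3)) std::cout << v << " ";
  std::cout << "\n";  // prints 1 2 3 8 8 8
  return 0;
}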

Some files were not shown because too many files have changed in this diff.