diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index a6d6c8b27f8..4ba6da6ccc4 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -37,6 +37,9 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" +using tensorflow::str_util::Join; +using tensorflow::strings::Printf; + namespace xla { namespace { @@ -934,7 +937,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( "inferring shape for <%s>(%s, %s) with broadcast_dimensions={%s}", BinaryOperation_Name(operation).c_str(), ShapeUtil::HumanString(lhs).c_str(), ShapeUtil::HumanString(rhs).c_str(), - tensorflow::str_util::Join(broadcast_dimensions, ", ").c_str()); + Join(broadcast_dimensions, ", ").c_str()); TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs)); TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs)); @@ -1097,7 +1100,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "Map operation requires all operands to have the same shape; got: " "%s", - tensorflow::str_util::Join(pieces, ", ").c_str()); + Join(pieces, ", ").c_str()); } // Check that dimensions.size == arg_shape.dimensions_size() (we currently @@ -1114,7 +1117,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( if (dimensions[i] != i) { return InvalidArgument( "Map requires monotonically increasing dimension numbers, found: %s ", - tensorflow::str_util::Join(dimensions, ", ").c_str()); + Join(dimensions, ", ").c_str()); } } @@ -1914,21 +1917,28 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( const Shape& arg, tensorflow::gtl::ArraySlice starts, tensorflow::gtl::ArraySlice limits, tensorflow::gtl::ArraySlice strides) { + auto error = [&](const string& message) { + return InvalidArgument( + "%s in slice operation; argument shape: %s; starts: {%s}; limits: " + "{%s}; strides: {%s}", + message.c_str(), ShapeUtil::HumanString(arg).c_str(), + Join(starts, ",").c_str(), Join(limits, ",").c_str(), + Join(strides, ",").c_str()); + }; TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(arg, "operand of slice")); VLOG(2) << tensorflow::strings::Printf( "slicing shape %s starts={%s} limits={%s}", - ShapeUtil::HumanString(arg).c_str(), - tensorflow::str_util::Join(starts, ", ").c_str(), - tensorflow::str_util::Join(limits, ", ").c_str()); + ShapeUtil::HumanString(arg).c_str(), Join(starts, ", ").c_str(), + Join(limits, ", ").c_str()); if (starts.size() != limits.size()) { - return InvalidArgument("slice start and limit sizes differ: %zu vs %zu", - starts.size(), limits.size()); + return error(Printf("slice start and limit sizes differ: %zu vs %zu", + starts.size(), limits.size())); } if (starts.size() != strides.size()) { - return InvalidArgument("slice start and strides sizes differ: %zu vs %zu", - starts.size(), strides.size()); + return error(Printf("slice start and strides sizes differ: %zu vs %zu", + starts.size(), strides.size())); } if (starts.size() != ShapeUtil::Rank(arg)) { @@ -1947,20 +1957,20 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( start_index); } if (limit_index > arg.dimensions(dimension)) { - return InvalidArgument( - "limit index (%lld) must be less than or equal to dimension " - "size (%lld)", - limit_index, arg.dimensions(dimension)); + return error( + Printf("limit index (%lld) must be less than or equal to dimension " + "size (%lld)", + limit_index, 
arg.dimensions(dimension))); } VLOG(2) << tensorflow::strings::Printf("starts[%lld] = %lld", dimension, start_index); VLOG(2) << tensorflow::strings::Printf("limits[%lld] = %lld", dimension, limit_index); if (start_index > limit_index) { - return InvalidArgument( - "limit index (%lld) must be greater or equal to " - "start index (%lld) in slice with positive stride", - limit_index, start_index); + return error( + Printf("limit index (%lld) must be greater or equal to " + "start index (%lld) in slice with positive stride", + limit_index, start_index)); } if (stride <= 0) { return InvalidArgument("stride (%lld) must be positive", stride); @@ -1983,7 +1993,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( "slicing shape %s at dynamic start_indices %s with slice_sizes={%s}", ShapeUtil::HumanString(operand_shape).c_str(), ShapeUtil::HumanString(start_indices_shape).c_str(), - tensorflow::str_util::Join(slice_sizes, ", ").c_str()); + Join(slice_sizes, ", ").c_str()); if (ShapeUtil::Rank(start_indices_shape) != 1) { return InvalidArgument( @@ -2280,8 +2290,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "Reshape dimensions [%s] are not a permutation of the operand " "dimensions (operand shape is %s).", - tensorflow::str_util::Join(dimensions, ",").c_str(), - ShapeUtil::HumanString(operand).c_str()); + Join(dimensions, ",").c_str(), ShapeUtil::HumanString(operand).c_str()); } return inferred_shape; @@ -2373,8 +2382,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( // The applied function's arity equals the number of arguments. if (arg_shapes.size() != to_apply.parameters_size()) { string computation_signature = ShapeUtil::HumanString(to_apply); - string argument_shapes = tensorflow::str_util::Join( - arg_shapes, ", ", [](string* out, const Shape* shape) { + string argument_shapes = + Join(arg_shapes, ", ", [](string* out, const Shape* shape) { tensorflow::strings::StrAppend(out, ShapeUtil::HumanString(*shape)); }); return InvalidArgument( diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc index 99d87f3b550..026c0211657 100644 --- a/tensorflow/compiler/xla/service/shape_inference_test.cc +++ b/tensorflow/compiler/xla/service/shape_inference_test.cc @@ -1512,5 +1512,20 @@ TEST_F(ShapeInferenceTest, Conditional) { "must have the same shape")); } +TEST_F(ShapeInferenceTest, BadSlice) { + auto arg = ShapeUtil::MakeShape(F32, {4}); + StatusOr statusor = + ShapeInference::InferSliceShape(arg, {0}, {5}, {1}); + ASSERT_FALSE(statusor.ok()); + + LOG(INFO) << statusor.status(); + + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("less than or equal to dimension size")) + << statusor.status(); + EXPECT_THAT(statusor.status().error_message(), HasSubstr("argument shape")) + << statusor.status(); +} + } // namespace } // namespace xla diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD index 4195e7553c4..b5428d32460 100644 --- a/tensorflow/contrib/lite/kernels/BUILD +++ b/tensorflow/contrib/lite/kernels/BUILD @@ -71,6 +71,32 @@ cc_library( ], ) +cc_library( + name = "kernel_util", + srcs = [ + "kernel_util.cc", + ], + hdrs = [ + "kernel_util.h", + ], + deps = [ + "//tensorflow/contrib/lite:builtin_op_data", + "//tensorflow/contrib/lite:context", + "//tensorflow/contrib/lite/kernels/internal:round", + ], +) + +tf_cc_test( + name = "kernel_util_test", + size = "small", + srcs = ["kernel_util_test.cc"], + deps = [ + ":kernel_util", + 
"//tensorflow/contrib/lite/testing:util", + "@com_google_googletest//:gtest", + ], +) + cc_library( name = "builtin_ops", srcs = [ @@ -87,7 +113,6 @@ cc_library( "fully_connected.cc", "gather.cc", "hashtable_lookup.cc", - "kernel_util.cc", "l2norm.cc", "local_response_norm.cc", "lsh_projection.cc", @@ -111,7 +136,6 @@ cc_library( "unidirectional_sequence_rnn.cc", ], hdrs = [ - "kernel_util.h", "padding.h", "register.h", ], @@ -125,6 +149,7 @@ cc_library( }), deps = [ ":activation_functor", + ":kernel_util", ":op_macros", "//tensorflow/contrib/lite:builtin_op_data", "//tensorflow/contrib/lite:framework", diff --git a/tensorflow/contrib/lite/kernels/kernel_util.cc b/tensorflow/contrib/lite/kernels/kernel_util.cc index b0546c00cf9..955e8c5764c 100644 --- a/tensorflow/contrib/lite/kernels/kernel_util.cc +++ b/tensorflow/contrib/lite/kernels/kernel_util.cc @@ -13,8 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include "tensorflow/contrib/lite/kernels/kernel_util.h" + #include #include +#include + #include "tensorflow/contrib/lite/kernels/internal/round.h" namespace tflite { @@ -84,4 +87,27 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation, } } +bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2) { + return TfLiteIntArrayEqual(input1->dims, input2->dims); +} + +TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context, + TfLiteTensor* input1, + TfLiteTensor* input2, + TfLiteIntArray** output_shape) { + int64_t dims1 = NumDimensions(input1); + int64_t dims2 = NumDimensions(input2); + int64_t out_dims = std::max(dims1, dims2); + std::unique_ptr shape( + TfLiteIntArrayCreate(out_dims), TfLiteIntArrayFree); + for (int i = 0; i < out_dims; ++i) { + int64_t d1 = i >= dims1 ? 1 : SizeOfDimension(input1, dims1 - i - 1); + int64_t d2 = i >= dims2 ? 1 : SizeOfDimension(input2, dims2 - i - 1); + TF_LITE_ENSURE(context, d1 == d2 || d1 == 1 || d2 == 1); + shape->data[out_dims - i - 1] = std::max(d1, d2); + } + *output_shape = shape.release(); + return kTfLiteOk; +} + } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/kernel_util.h b/tensorflow/contrib/lite/kernels/kernel_util.h index bfdfba00f51..3cfa72615a9 100644 --- a/tensorflow/contrib/lite/kernels/kernel_util.h +++ b/tensorflow/contrib/lite/kernels/kernel_util.h @@ -35,6 +35,14 @@ inline TfLiteTensor* GetOutput(TfLiteContext* context, TfLiteNode* node, inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; } inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; } +inline int64_t NumElements(const TfLiteTensor* t) { + int64_t count = 1; + for (int i = 0; i < NumDimensions(t); ++i) { + count *= SizeOfDimension(t, i); + } + return count; +} + inline TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context, const TfLiteNode* node, int index) { const bool use_tensor = node->inputs->data[index] != kOptionalTensor; @@ -76,6 +84,15 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation, float* activation_min, float* activation_max); +// Return true if the given tensors have the same shape. +bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2); + +// Calculate the output_shape that is necessary for element-wise operations +// with broadcasting involving the two input tensors. 
+TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context, + TfLiteTensor* input1, + TfLiteTensor* input2, + TfLiteIntArray** output_shape); } // namespace tflite #endif // TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_ diff --git a/tensorflow/contrib/lite/kernels/kernel_util_test.cc b/tensorflow/contrib/lite/kernels/kernel_util_test.cc new file mode 100644 index 00000000000..63a317f338c --- /dev/null +++ b/tensorflow/contrib/lite/kernels/kernel_util_test.cc @@ -0,0 +1,150 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/contrib/lite/kernels/kernel_util.h" + +#include +#include +#include "tensorflow/contrib/lite/testing/util.h" + +namespace tflite { +namespace { + +void ReportError(TfLiteContext* context, const char* format, ...) {} + +class KernelUtilTest : public ::testing::Test { + public: + KernelUtilTest() { + context_.ReportError = ReportError; + + tensor1_.dims = nullptr; + tensor2_.dims = nullptr; + } + ~KernelUtilTest() { + TfLiteTensorFree(&tensor1_); + TfLiteTensorFree(&tensor2_); + } + + void SetShape(TfLiteTensor* tensor, std::initializer_list dims) { + TfLiteTensorFree(tensor); + tensor->dims = TfLiteIntArrayCreate(dims.size()); + int i = 0; + for (int d : dims) { + tensor->dims->data[i] = d; + ++i; + } + } + + std::vector GetShape(TfLiteIntArray* dims) { + std::vector result; + for (int i = 0; i < dims->size; ++i) { + result.push_back(dims->data[i]); + } + return result; + } + + protected: + TfLiteContext context_; + TfLiteTensor tensor1_; + TfLiteTensor tensor2_; +}; + +TEST_F(KernelUtilTest, SameShapeEmpty) { + EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_)); + + SetShape(&tensor1_, {1, 2, 3}); + EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_)); + + SetShape(&tensor2_, {1, 2}); + EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_)); + + SetShape(&tensor2_, {1, 2, 3, 4}); + EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_)); + + SetShape(&tensor2_, {1, 2, 3}); + EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_)); + + SetShape(&tensor2_, {}); + EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_)); + + SetShape(&tensor1_, {}); + EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_)); +} + +TEST_F(KernelUtilTest, BroadcastShapeIncompatibleDim) { + TfLiteIntArray* output = nullptr; + SetShape(&tensor1_, {1, 2}); + SetShape(&tensor2_, {1, 3}); + EXPECT_NE(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_, + &tensor2_, &output)); + EXPECT_EQ(output, nullptr); +} + +TEST_F(KernelUtilTest, BroadcastShapeOnes) { + TfLiteIntArray* output = nullptr; + SetShape(&tensor1_, {1, 1}); + SetShape(&tensor2_, {1, 3}); + EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_, + &tensor2_, &output)); + TfLiteIntArrayFree(output); + + SetShape(&tensor1_, {1, 2}); + SetShape(&tensor2_, {1, 1}); + EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_, + &tensor2_, &output)); + TfLiteIntArrayFree(output); +} + 
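The tests above and the scalar/different-size tests that follow all exercise CalculateShapeForBroadcast, which matches dimensions right-aligned in the NumPy style. As a minimal sketch outside the patch itself, the same rule can be restated over plain std::vector shapes instead of TfLiteIntArray; the helper name BroadcastShape is illustrative only.

#include <algorithm>
#include <cstdint>
#include <optional>
#include <vector>

// Right-aligned broadcast of two shapes, mirroring CalculateShapeForBroadcast:
// walk dimensions from the innermost outward, treat a missing dimension as 1,
// and require each pair of sizes to be equal or to contain a 1.
std::optional<std::vector<int64_t>> BroadcastShape(
    const std::vector<int64_t>& a, const std::vector<int64_t>& b) {
  const size_t out_dims = std::max(a.size(), b.size());
  std::vector<int64_t> out(out_dims, 1);
  for (size_t i = 0; i < out_dims; ++i) {
    const int64_t d1 = i < a.size() ? a[a.size() - 1 - i] : 1;
    const int64_t d2 = i < b.size() ? b[b.size() - 1 - i] : 1;
    if (d1 != d2 && d1 != 1 && d2 != 1) return std::nullopt;  // incompatible
    out[out_dims - 1 - i] = std::max(d1, d2);
  }
  return out;
}

Under that rule, BroadcastShape({1, 2}, {3, 1, 1}) yields {3, 1, 2} and BroadcastShape({1, 2}, {1, 3}) reports incompatibility, matching the expectations in the surrounding tests.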
+TEST_F(KernelUtilTest, BroadcastShapeScalars) { + TfLiteIntArray* output = nullptr; + SetShape(&tensor1_, {1, 2}); + SetShape(&tensor2_, {}); + EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_, + &tensor2_, &output)); + EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2)); + TfLiteIntArrayFree(output); + + SetShape(&tensor1_, {}); + SetShape(&tensor2_, {2}); + EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_, + &tensor2_, &output)); + EXPECT_THAT(GetShape(output), ::testing::ElementsAre(2)); + TfLiteIntArrayFree(output); +} + +TEST_F(KernelUtilTest, BroadcastShapeDifferentSizes) { + TfLiteIntArray* output = nullptr; + SetShape(&tensor1_, {1, 2}); + SetShape(&tensor2_, {3, 1, 1}); + EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_, + &tensor2_, &output)); + EXPECT_THAT(GetShape(output), ::testing::ElementsAre(3, 1, 2)); + TfLiteIntArrayFree(output); + + SetShape(&tensor1_, {1, 2, 3, 4}); + SetShape(&tensor2_, {1, 3, 1}); + EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_, + &tensor2_, &output)); + EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2, 3, 4)); + TfLiteIntArrayFree(output); +} + +} // namespace +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc index f8281f3a572..c5a62fdb620 100644 --- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc +++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc @@ -44,9 +44,11 @@ bool ParseTocoFlagsFromCommandLineFlags( "For Protobuf formats, the binary format will be used."), Flag("input_format", parsed_flags.input_format.bind(), parsed_flags.input_format.default_value(), - "Input file format. One of: tensorflow_graphdef, "), + "Input file format. One of: TENSORFLOW_GRAPHDEF, TFLITE."), Flag("output_format", parsed_flags.output_format.bind(), - parsed_flags.output_format.default_value(), "Output file format."), + parsed_flags.output_format.default_value(), + "Output file format. " + "One of TENSORFLOW_GRAPHDEF, TFLITE, GRAPHVIZ_DOT."), Flag("default_ranges_min", parsed_flags.default_ranges_min.bind(), parsed_flags.default_ranges_min.default_value(), "If defined, will be used as the default value for the min bound " @@ -58,11 +60,13 @@ bool ParseTocoFlagsFromCommandLineFlags( Flag("inference_type", parsed_flags.inference_type.bind(), parsed_flags.inference_type.default_value(), "Target data type of arrays in the output file (for input_arrays, " - "this may be overridden by inference_input_type)."), + "this may be overridden by inference_input_type). " + "One of FLOAT, QUANTIZED_UINT8."), Flag("inference_input_type", parsed_flags.inference_input_type.bind(), parsed_flags.inference_input_type.default_value(), - "Target data type of input arrays. If not specified, inference_type " - "is used."), + "Target data type of input arrays. " + "If not specified, inference_type is used. " + "One of FLOAT, QUANTIZED_UINT8."), Flag("input_type", parsed_flags.input_type.bind(), parsed_flags.input_type.default_value(), "Deprecated ambiguous flag that set both --input_data_types and " @@ -76,35 +80,31 @@ bool ParseTocoFlagsFromCommandLineFlags( Flag("drop_fake_quant", parsed_flags.drop_fake_quant.bind(), parsed_flags.drop_fake_quant.default_value(), - "Ignore and discard FakeQuant nodes. 
For instance, that can be used " - "to " + "Ignore and discard FakeQuant nodes. For instance, to " "generate plain float code without fake-quantization from a " - "quantized " - "graph."), + "quantized graph."), Flag( "reorder_across_fake_quant", parsed_flags.reorder_across_fake_quant.bind(), parsed_flags.reorder_across_fake_quant.default_value(), "Normally, FakeQuant nodes must be strict boundaries for graph " "transformations, in order to ensure that quantized inference has " - "the " - "exact same arithmetic behavior as quantized training --- which is " - "the " - "whole point of quantized training and of FakeQuant nodes in the " - "first " - "place. However, that entails subtle requirements on where exactly " + "the exact same arithmetic behavior as quantized training --- which " + "is the whole point of quantized training and of FakeQuant nodes in " + "the first place. " + "However, that entails subtle requirements on where exactly " "FakeQuant nodes must be placed in the graph. Some quantized graphs " "have FakeQuant nodes at unexpected locations, that prevent graph " "transformations that are necessary in order to generate inference " "code for these graphs. Such graphs should be fixed, but as a " "temporary work-around, setting this reorder_across_fake_quant flag " - "allows toco to perform necessary graph transformaitons on them, " + "allows TOCO to perform necessary graph transformaitons on them, " "at the cost of no longer faithfully matching inference and training " "arithmetic."), Flag("allow_custom_ops", parsed_flags.allow_custom_ops.bind(), parsed_flags.allow_custom_ops.default_value(), - "If true, allow TOCO to create TF Lite Custom operators for all the" - "unsupported Tensorflow ops."), + "If true, allow TOCO to create TF Lite Custom operators for all the " + "unsupported TensorFlow ops."), Flag( "drop_control_dependency", parsed_flags.drop_control_dependency.bind(), diff --git a/tensorflow/contrib/py2tf/BUILD b/tensorflow/contrib/py2tf/BUILD index d395de986d2..3e846aefeb3 100644 --- a/tensorflow/contrib/py2tf/BUILD +++ b/tensorflow/contrib/py2tf/BUILD @@ -57,6 +57,7 @@ py_library( py_test( name = "api_test", srcs = ["api_test.py"], + srcs_version = "PY2AND3", deps = [ ":py2tf_internal", "//tensorflow/python:client_testlib", @@ -66,6 +67,7 @@ py_test( py_test( name = "conversion_test", srcs = ["conversion_test.py"], + srcs_version = "PY2AND3", deps = [ ":py2tf_internal", "//tensorflow/python:client_testlib", @@ -76,6 +78,7 @@ py_test( py_test( name = "naming_test", srcs = ["naming_test.py"], + srcs_version = "PY2AND3", deps = [ ":py2tf_internal", "//tensorflow/python:client_testlib", diff --git a/tensorflow/contrib/py2tf/converters/BUILD b/tensorflow/contrib/py2tf/converters/BUILD index 2b0a1234e69..4f90f94e096 100644 --- a/tensorflow/contrib/py2tf/converters/BUILD +++ b/tensorflow/contrib/py2tf/converters/BUILD @@ -52,6 +52,7 @@ py_library( py_test( name = "break_canonicalization_test", srcs = ["break_canonicalization_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -62,6 +63,7 @@ py_test( py_test( name = "call_trees_test", srcs = ["call_trees_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -72,6 +74,7 @@ py_test( py_test( name = "continue_canonicalization_test", srcs = ["continue_canonicalization_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -82,6 +85,7 @@ py_test( py_test( name = "control_flow_test", srcs = 
["control_flow_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -92,6 +96,7 @@ py_test( py_test( name = "builtin_functions_test", srcs = ["builtin_functions_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -112,6 +117,7 @@ py_test( py_test( name = "logical_expressions_test", srcs = ["logical_expressions_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -122,6 +128,7 @@ py_test( py_test( name = "print_functions_test", srcs = ["print_functions_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -133,6 +140,7 @@ py_test( py_test( name = "side_effect_guards_test", srcs = ["side_effect_guards_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", diff --git a/tensorflow/contrib/py2tf/pyct/BUILD b/tensorflow/contrib/py2tf/pyct/BUILD index e0331dbc97c..88902dea84a 100644 --- a/tensorflow/contrib/py2tf/pyct/BUILD +++ b/tensorflow/contrib/py2tf/pyct/BUILD @@ -38,6 +38,7 @@ py_library( py_test( name = "anno_test", srcs = ["anno_test.py"], + srcs_version = "PY2AND3", deps = [ ":pyct", "//tensorflow/python:client_testlib", @@ -47,6 +48,7 @@ py_test( py_test( name = "compiler_test", srcs = ["compiler_test.py"], + srcs_version = "PY2AND3", deps = [ ":pyct", "//tensorflow/python:client_testlib", @@ -57,6 +59,7 @@ py_test( py_test( name = "parser_test", srcs = ["parser_test.py"], + srcs_version = "PY2AND3", deps = [ ":pyct", "//tensorflow/python:client_testlib", @@ -66,6 +69,7 @@ py_test( py_test( name = "pretty_printer_test", srcs = ["pretty_printer_test.py"], + srcs_version = "PY2AND3", deps = [ ":pyct", "//tensorflow/python:client_testlib", @@ -75,6 +79,7 @@ py_test( py_test( name = "templates_test", srcs = ["templates_test.py"], + srcs_version = "PY2AND3", deps = [ ":pyct", "//tensorflow/python:client_testlib", diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD b/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD index abaf9536781..32e2954fffc 100644 --- a/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD +++ b/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD @@ -32,6 +32,7 @@ py_library( py_test( name = "access_test", srcs = ["access_test.py"], + srcs_version = "PY2AND3", deps = [ ":static_analysis", "//tensorflow/contrib/py2tf/pyct", @@ -43,6 +44,7 @@ py_test( py_test( name = "live_values_test", srcs = ["live_values_test.py"], + srcs_version = "PY2AND3", deps = [ ":static_analysis", "//tensorflow/contrib/py2tf/pyct", @@ -53,6 +55,7 @@ py_test( py_test( name = "type_info_test", srcs = ["type_info_test.py"], + srcs_version = "PY2AND3", deps = [ ":static_analysis", "//tensorflow/contrib/py2tf/pyct", diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc index 995fd1253fb..2f13cf8bd76 100644 --- a/tensorflow/core/common_runtime/gpu/process_state.cc +++ b/tensorflow/core/common_runtime/gpu/process_state.cc @@ -230,8 +230,24 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) { // TODO(tucker): actually maintain separate CPUAllocators for // different numa_nodes. For now, just one. numa_node = 0; - mutex_lock lock(mu_); + { + // Here we optimize the most common use case where cuda_host_allocators_ + // and cuda_al_ have already been populated and since we're only reading + // these vectors, we can get by with a shared lock. 
In the slower case, + // we take a unique lock and populate these vectors. + tf_shared_lock lock(mu_); + + if (FLAGS_brain_gpu_record_mem_types && + static_cast(cuda_al_.size()) > 0) { + return cuda_al_[0]; + } + if (static_cast(cuda_host_allocators_.size()) > numa_node) { + return cuda_host_allocators_[0]; + } + } + + mutex_lock lock(mu_); // Find the first valid StreamExecutor to request CUDA host memory // through, since any will work. // diff --git a/tensorflow/core/grappler/clusters/cluster.cc b/tensorflow/core/grappler/clusters/cluster.cc index 01a618ed777..39bfca244ed 100644 --- a/tensorflow/core/grappler/clusters/cluster.cc +++ b/tensorflow/core/grappler/clusters/cluster.cc @@ -23,8 +23,7 @@ Cluster::Cluster(int timeout_s) : timeout_s_(timeout_s) { DisableDetailedStats(false); } -Cluster::~Cluster() { -} +Cluster::~Cluster() {} void Cluster::AllowSoftPlacement(bool soft_placement_state) { options_.config.set_allow_soft_placement(soft_placement_state); diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h index 9db6d462667..5116c8183cb 100644 --- a/tensorflow/core/grappler/costs/virtual_scheduler.h +++ b/tensorflow/core/grappler/costs/virtual_scheduler.h @@ -325,7 +325,7 @@ class VirtualScheduler { // Boolean field for whether the cost is accurate. std::map> op_costs_; - Costs graph_costs_; // Graph cost. + Costs graph_costs_; // Graph cost. std::map op_to_cost_; // Per-op cost. // Auxilliary data structures for constructing NodeState and DeviceState. diff --git a/tensorflow/core/grappler/optimizers/auto_parallel.h b/tensorflow/core/grappler/optimizers/auto_parallel.h index c5d2d47782f..8d1098d8775 100644 --- a/tensorflow/core/grappler/optimizers/auto_parallel.h +++ b/tensorflow/core/grappler/optimizers/auto_parallel.h @@ -16,8 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_ #define TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_ -#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" #include "tensorflow/core/framework/variable.pb.h" +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/adjust_contrast_op.cc b/tensorflow/core/kernels/adjust_contrast_op.cc index 37976f71837..72155fd0373 100644 --- a/tensorflow/core/kernels/adjust_contrast_op.cc +++ b/tensorflow/core/kernels/adjust_contrast_op.cc @@ -40,8 +40,8 @@ typedef Eigen::SyclDevice SYCLDevice; template class AdjustContrastOp : public OpKernel { public: - explicit AdjustContrastOp(OpKernelConstruction* context) : OpKernel(context) { - } + explicit AdjustContrastOp(OpKernelConstruction* context) + : OpKernel(context) {} void Compute(OpKernelContext* context) override { const Tensor& input = context->input(0); diff --git a/tensorflow/core/kernels/adjust_contrast_op_test.cc b/tensorflow/core/kernels/adjust_contrast_op_test.cc index 0fc03b5a236..7522b320400 100644 --- a/tensorflow/core/kernels/adjust_contrast_op_test.cc +++ b/tensorflow/core/kernels/adjust_contrast_op_test.cc @@ -29,8 +29,7 @@ limitations under the License. 
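Looking back at the process_state.cc hunk above: GetCUDAHostAllocator now has a read-mostly fast path that takes a shared lock, returns the cached allocator if it already exists, and only falls back to an exclusive lock to populate the vectors. A compact sketch of that locking shape in standard C++, using std::shared_mutex in place of TensorFlow's tf_shared_lock/mutex_lock wrappers, with class and member names invented for illustration:

#include <memory>
#include <mutex>
#include <shared_mutex>
#include <vector>

// Read-mostly cache: the common case only takes a shared lock; the first
// caller that finds the cache empty re-acquires exclusively and fills it.
class HostAllocatorCache {
 public:
  double* Get() {
    {
      std::shared_lock<std::shared_mutex> read_lock(mu_);
      if (!allocators_.empty()) return allocators_[0].get();
    }
    std::unique_lock<std::shared_mutex> write_lock(mu_);
    // Re-check: another thread may have populated the cache in between.
    if (allocators_.empty()) {
      allocators_.push_back(std::make_unique<double>(0.0));  // stand-in
    }
    return allocators_[0].get();
  }

 private:
  std::shared_mutex mu_;                             // guards allocators_
  std::vector<std::unique_ptr<double>> allocators_;
};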
namespace tensorflow { -class AdjustContrastOpTest : public OpsTestBase { -}; +class AdjustContrastOpTest : public OpsTestBase {}; TEST_F(AdjustContrastOpTest, Simple_1113) { TF_EXPECT_OK(NodeDefBuilder("adjust_contrast_op", "AdjustContrastv2") diff --git a/tensorflow/core/kernels/adjust_saturation_op.cc b/tensorflow/core/kernels/adjust_saturation_op.cc index 4643d4e6efd..f0c6ae499d4 100644 --- a/tensorflow/core/kernels/adjust_saturation_op.cc +++ b/tensorflow/core/kernels/adjust_saturation_op.cc @@ -192,8 +192,9 @@ class AdjustSaturationOp : public AdjustSaturationOpBase { const DeviceBase::CpuWorkerThreads& worker_threads = *context->device()->tensorflow_cpu_worker_threads(); Shard(worker_threads.num_threads, worker_threads.workers, channel_count, - kCostPerChannel, [channel_count, &input_data, &output_data, scale_h]( - int64 start_channel, int64 end_channel) { + kCostPerChannel, + [channel_count, &input_data, &output_data, scale_h]( + int64 start_channel, int64 end_channel) { const float* p = input_data.data() + start_channel * kChannelSize; float* q = output_data.data() + start_channel * kChannelSize; for (int i = start_channel; i < end_channel; i++) { diff --git a/tensorflow/core/kernels/aggregate_ops_cpu.h b/tensorflow/core/kernels/aggregate_ops_cpu.h index dfa3fe585e3..aa1cead928a 100644 --- a/tensorflow/core/kernels/aggregate_ops_cpu.h +++ b/tensorflow/core/kernels/aggregate_ops_cpu.h @@ -25,7 +25,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL namespace tensorflow { @@ -201,7 +201,7 @@ struct Add7Functor { typename TTypes::ConstFlat in6, typename TTypes::ConstFlat in7) { Add7EigenImpl::Compute(d, out, in1, in2, in3, in4, in5, in6, - in7); + in7); } }; @@ -214,7 +214,7 @@ struct Add8Functor { typename TTypes::ConstFlat in5, typename TTypes::ConstFlat in6, typename TTypes::ConstFlat in7, typename TTypes::ConstFlat in8) { Add8EigenImpl::Compute(d, out, in1, in2, in3, in4, in5, in6, - in7, in8); + in7, in8); } }; @@ -227,7 +227,7 @@ struct Add8pFunctor { typename TTypes::ConstFlat in5, typename TTypes::ConstFlat in6, typename TTypes::ConstFlat in7, typename TTypes::ConstFlat in8) { Add8pEigenImpl::Compute(d, out, in1, in2, in3, in4, in5, in6, - in7, in8); + in7, in8); } }; @@ -241,10 +241,10 @@ struct Add9Functor { typename TTypes::ConstFlat in7, typename TTypes::ConstFlat in8, typename TTypes::ConstFlat in9) { Add9EigenImpl::Compute(d, out, in1, in2, in3, in4, in5, in6, - in7, in8, in9); + in7, in8, in9); } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor diff --git a/tensorflow/core/kernels/attention_ops.cc b/tensorflow/core/kernels/attention_ops.cc index cc8f122cab3..ce2fce92e4e 100644 --- a/tensorflow/core/kernels/attention_ops.cc +++ b/tensorflow/core/kernels/attention_ops.cc @@ -52,8 +52,9 @@ class ExtractGlimpseOp : public OpKernel { const int64 batch_size = input_shape.dim_size(0); const Tensor& window_size = context->input(1); - OP_REQUIRES(context, (window_size.shape().dims() == 1) && - window_size.shape().dim_size(0) == 2, + OP_REQUIRES(context, + (window_size.shape().dims() == 1) && + window_size.shape().dim_size(0) == 2, errors::InvalidArgument( "input must be a vector of size 2 (height, width)", window_size.shape().DebugString())); diff --git a/tensorflow/core/kernels/avgpooling_op.h b/tensorflow/core/kernels/avgpooling_op.h index dea2683184a..f5e81dbc093 100644 --- a/tensorflow/core/kernels/avgpooling_op.h 
+++ b/tensorflow/core/kernels/avgpooling_op.h @@ -48,9 +48,8 @@ struct SpatialAvgPooling { typedef Eigen::GpuDevice GPUDevice; -// Launch a custom GPU kernels from Yanqing for the avgpooling backward operation -// that works NHWC data formats. -// Arguments: +// Launch a custom GPU kernels from Yanqing for the avgpooling backward +// operation that works NHWC data formats. Arguments: // top_diff: backprop to the output of the pooling layer // num: number of input batches // height: input height diff --git a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc index 2be330d1427..6537b42f1ed 100644 --- a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc +++ b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc @@ -71,8 +71,8 @@ __global__ void AvePoolBackwardNHWC(const int nthreads, hstart = max(hstart, 0); wstart = max(wstart, 0); int pool_size = (hend - hstart) * (wend - wstart); - gradient += - top_diff_slice[(ph * pooled_width + pw) * channels] / dtype(pool_size); + gradient += top_diff_slice[(ph * pooled_width + pw) * channels] / + dtype(pool_size); } } bottom_diff[index] = gradient; @@ -90,11 +90,11 @@ bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num, const GPUDevice& d) { int x_size = num * height * width * channels; CudaLaunchConfig config = GetCudaLaunchConfig(x_size, d); - AvePoolBackwardNHWC< - T><<>>( - config.virtual_thread_count, top_diff, num, height, width, channels, - pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w, - pad_t, pad_t, bottom_diff); + AvePoolBackwardNHWC + <<>>( + config.virtual_thread_count, top_diff, num, height, width, channels, + pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w, + pad_t, pad_t, bottom_diff); return d.ok(); } diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc index d0bbea9fe27..944564dfba6 100644 --- a/tensorflow/core/kernels/barrier_ops.cc +++ b/tensorflow/core/kernels/barrier_ops.cc @@ -111,13 +111,14 @@ class Barrier : public ResourceBase { mutex_lock lock(mu_); if (closed_) { OP_REQUIRES_ASYNC( - ctx, !cancel_pending_enqueues_ && - (num_inserted == 0 || !incomplete_.empty()), + ctx, + !cancel_pending_enqueues_ && + (num_inserted == 0 || !incomplete_.empty()), errors::Cancelled( "Barrier ", name_, " is closed. Pending enqueues cancelled: ", - cancel_pending_enqueues_, ". Number of new insertions: ", - num_inserted, ". Number of incomplete keys: ", - incomplete_.size(), "."), + cancel_pending_enqueues_, + ". Number of new insertions: ", num_inserted, + ". Number of incomplete keys: ", incomplete_.size(), "."), callback); } @@ -128,9 +129,10 @@ class Barrier : public ResourceBase { for (int i = 0; i < num_inserted; ++i) { OP_REQUIRES_OK_ASYNC( - ctx, InsertOneLocked(ctx, keys, values, element_shape, - component_index, i, &ready_tuples, - &new_elements), + ctx, + InsertOneLocked(ctx, keys, values, element_shape, + component_index, i, &ready_tuples, + &new_elements), callback); } @@ -317,8 +319,9 @@ class Barrier : public ResourceBase { return errors::Cancelled( "Barrier ", name_, " is closed, but attempted to insert a brand new key: ", - keys_vec(i), ". Pending enqueues cancelled: ", - cancel_pending_enqueues_, ". Insertion index: ", i, + keys_vec(i), + ". Pending enqueues cancelled: ", cancel_pending_enqueues_, + ". Insertion index: ", i, ". 
Number of incomplete keys: ", incomplete_.size(), "."); } } else { @@ -532,13 +535,14 @@ class InsertManyOp : public BarrierOpKernel { OP_REQUIRES_ASYNC( ctx, component_index_ < barrier->num_components(), errors::InvalidArgument("The component ID is out of range ", - component_index_, " > num_components", " (= ", - barrier->num_components(), ")"), + component_index_, " > num_components", + " (= ", barrier->num_components(), ")"), callback); OP_REQUIRES_OK_ASYNC( - ctx, ctx->MatchSignature({DT_STRING_REF, DT_STRING, - barrier->component_type(component_index_)}, - {}), + ctx, + ctx->MatchSignature({DT_STRING_REF, DT_STRING, + barrier->component_type(component_index_)}, + {}), callback); const Tensor* keys; diff --git a/tensorflow/core/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc index 5b4e1a809fa..c447db842d3 100644 --- a/tensorflow/core/kernels/batch_kernels.cc +++ b/tensorflow/core/kernels/batch_kernels.cc @@ -13,22 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ - #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_util.h" #include "tensorflow/core/framework/types.h" -#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h" #include "tensorflow/core/kernels/batching_util/periodic_function.h" +#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h" #include "tensorflow/core/kernels/concat_lib.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/kernels/split_lib.h" #include "tensorflow/core/lib/random/random.h" #include "tensorflow/core/platform/macros.h" - namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h index 93c39183198..43e716c542a 100644 --- a/tensorflow/core/kernels/batch_matmul_op_impl.h +++ b/tensorflow/core/kernels/batch_matmul_op_impl.h @@ -41,7 +41,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL namespace { @@ -429,14 +429,13 @@ template struct LaunchBatchMatMul { static void Launch(OpKernelContext* context, const Tensor& in_x, const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) { - - // Number of matrix multiplies i.e. size of the batch. - const int64 batch_size = in_x.dim_size(0); - ParallelMatMulKernelSYCL::Run(context, in_x, in_y, adj_x, adj_y, out, - 0, batch_size); + // Number of matrix multiplies i.e. size of the batch. 
+ const int64 batch_size = in_x.dim_size(0); + ParallelMatMulKernelSYCL::Run(context, in_x, in_y, adj_x, adj_y, + out, 0, batch_size); } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template class BatchMatMul : public OpKernel { @@ -462,10 +461,10 @@ class BatchMatMul : public OpKernel { TensorShape out_shape; for (int i = 0; i < ndims - 2; ++i) { OP_REQUIRES(ctx, in0.dim_size(i) == in1.dim_size(i), - errors::InvalidArgument("In[0].dim(", i, ") and In[1].dim(", - i, ") must be the same: ", - in0.shape().DebugString(), " vs ", - in1.shape().DebugString())); + errors::InvalidArgument( + "In[0].dim(", i, ") and In[1].dim(", i, + ") must be the same: ", in0.shape().DebugString(), " vs ", + in1.shape().DebugString())); out_shape.AddDim(in0.dim_size(i)); } auto n = (ndims == 2) ? 1 : out_shape.num_elements(); @@ -507,12 +506,12 @@ class BatchMatMul : public OpKernel { bool adj_y_; }; -#define REGISTER_BATCH_MATMUL_CPU(TYPE) \ +#define REGISTER_BATCH_MATMUL_CPU(TYPE) \ REGISTER_KERNEL_BUILDER( \ Name("BatchMatMul").Device(DEVICE_CPU).TypeConstraint("T"), \ BatchMatMul) -#define REGISTER_BATCH_MATMUL_GPU(TYPE) \ +#define REGISTER_BATCH_MATMUL_GPU(TYPE) \ REGISTER_KERNEL_BUILDER( \ Name("BatchMatMul").Device(DEVICE_GPU).TypeConstraint("T"), \ BatchMatMul) @@ -522,5 +521,5 @@ class BatchMatMul : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("BatchMatMul").Device(DEVICE_SYCL).TypeConstraint("T"), \ BatchMatMul) -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // end namespace tensorflow diff --git a/tensorflow/core/kernels/batch_matmul_op_real.cc b/tensorflow/core/kernels/batch_matmul_op_real.cc index 8d155ca62b2..7e1e2aa4ec1 100644 --- a/tensorflow/core/kernels/batch_matmul_op_real.cc +++ b/tensorflow/core/kernels/batch_matmul_op_real.cc @@ -35,5 +35,5 @@ TF_CALL_half(REGISTER_BATCH_MATMUL_GPU); #ifdef TENSORFLOW_USE_SYCL TF_CALL_float(REGISTER_BATCH_MATMUL_SYCL); TF_CALL_double(REGISTER_BATCH_MATMUL_SYCL); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/batch_matmul_op_test.cc b/tensorflow/core/kernels/batch_matmul_op_test.cc index 7923f34155b..c3932cd7b90 100644 --- a/tensorflow/core/kernels/batch_matmul_op_test.cc +++ b/tensorflow/core/kernels/batch_matmul_op_test.cc @@ -53,9 +53,10 @@ static Graph* BatchMatmul(int b, int m, int k, int n, bool adjoint_a, /* Uncomment to enable benchmarks for double & complex types: */ // BM_BatchMatmulDev(B, M, K, N, TA, TB, std::complex, DT_COMPLEX64, // gpu); -// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, cpu); \ -// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex, DT_COMPLEX128, cpu); \ -// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, gpu); \ +// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, cpu); \ +// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex, DT_COMPLEX128, cpu); +// \ +// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, gpu); \ // BM_BatchMatmulDev(M, K, N, TA, TB, std::complex, DT_COMPLEX128, gpu); // Typical fully connected layers diff --git a/tensorflow/core/kernels/batch_norm_op.cc b/tensorflow/core/kernels/batch_norm_op.cc index d3ed617f713..c34ea14bf60 100644 --- a/tensorflow/core/kernels/batch_norm_op.cc +++ b/tensorflow/core/kernels/batch_norm_op.cc @@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template class BatchNormOp 
: public OpKernel { diff --git a/tensorflow/core/kernels/batch_norm_op_test.cc b/tensorflow/core/kernels/batch_norm_op_test.cc index 5e3fcd2114a..45ddc853295 100644 --- a/tensorflow/core/kernels/batch_norm_op_test.cc +++ b/tensorflow/core/kernels/batch_norm_op_test.cc @@ -54,7 +54,7 @@ TEST_F(BatchNormOpTest, Simple) { Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2})); test::FillValues( &expected, {-17.86f, -22.00f, -15.87f, -20.59f, -13.87f, -19.18f, -21.86f, - -33.31f, -23.85f, -34.72f, -25.85f, -36.13f }); + -33.31f, -23.85f, -34.72f, -25.85f, -36.13f}); test::ExpectTensorNear(expected, *GetOutput(0), 0.01); } diff --git a/tensorflow/core/kernels/batchtospace_op.cc b/tensorflow/core/kernels/batchtospace_op.cc index c1c0d6d3292..b07c5fd718d 100644 --- a/tensorflow/core/kernels/batchtospace_op.cc +++ b/tensorflow/core/kernels/batchtospace_op.cc @@ -56,9 +56,10 @@ static void BatchToSpaceOpCompute(OpKernelContext* context, errors::InvalidArgument("input rank should be >= ", 1 + block_dims, " instead of ", orig_input_tensor.dims())); - OP_REQUIRES(context, TensorShapeUtils::IsMatrix(orig_crops.shape()) && - block_dims == orig_crops.dim_size(0) && - 2 == orig_crops.dim_size(1), + OP_REQUIRES(context, + TensorShapeUtils::IsMatrix(orig_crops.shape()) && + block_dims == orig_crops.dim_size(0) && + 2 == orig_crops.dim_size(1), errors::InvalidArgument("crops should have shape [", block_dims, ", 2] instead of ", orig_crops.shape().DebugString())); diff --git a/tensorflow/core/kernels/bcast_ops.cc b/tensorflow/core/kernels/bcast_ops.cc index 7fc4b1762d0..8e4f08e4730 100644 --- a/tensorflow/core/kernels/bcast_ops.cc +++ b/tensorflow/core/kernels/bcast_ops.cc @@ -13,11 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/util/bcast.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/bcast.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc index 2ca194a77f2..754b93b073a 100644 --- a/tensorflow/core/kernels/bias_op_gpu.cu.cc +++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc @@ -77,14 +77,14 @@ void BiasGPU::compute(const GPUDevice& d, const T* input, const T* bias, } CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); if (data_format == FORMAT_NHWC) { - BiasNHWCKernel< - T><<>>( - config.virtual_thread_count, input, bias, output, bias_size); + BiasNHWCKernel + <<>>( + config.virtual_thread_count, input, bias, output, bias_size); } else { - BiasNCHWKernel< - T><<>>( - config.virtual_thread_count, input, bias, output, bias_size, - image_size); + BiasNCHWKernel + <<>>( + config.virtual_thread_count, input, bias, output, bias_size, + image_size); } } @@ -206,10 +206,10 @@ void BiasGradGPU::compute(const GPUDevice& d, const T* output_backprop, // Check if we have enough shared memory. if (shared_memory_size <= max_shared_memory_size) { if (data_format == FORMAT_NHWC) { - BiasGradNHWC_SharedAtomics< - T><<>>(total_count, output_backprop, bias_backprop, - bias_size); + BiasGradNHWC_SharedAtomics + <<>>(total_count, output_backprop, bias_backprop, + bias_size); } else { // Round up the block count to multiple of bias_size. 
int group_size = (config.block_count + bias_size - 1) / bias_size; @@ -217,23 +217,24 @@ void BiasGradGPU::compute(const GPUDevice& d, const T* output_backprop, if (config.thread_per_block < kWarpSize) { config.thread_per_block = kWarpSize; } - BiasGradNCHW_SharedAtomics< - T><<>>( - output_backprop, bias_backprop, batch, bias_size, image_size, - group_size); + BiasGradNCHW_SharedAtomics + <<>>( + output_backprop, bias_backprop, batch, bias_size, image_size, + group_size); } } else { // Note that even if we don't have enough shared memory to fit the entire // output block, it is possible to process one group of elements at a time. // But for now, we simply fall back to the naive implementation. if (data_format == FORMAT_NHWC) { - BiasGradNHWC_Naive< - T><<>>( - total_count, output_backprop, bias_backprop, bias_size); + BiasGradNHWC_Naive + <<>>( + total_count, output_backprop, bias_backprop, bias_size); } else { - BiasGradNCHW_Naive< - T><<>>( - total_count, output_backprop, bias_backprop, bias_size, image_size); + BiasGradNCHW_Naive + <<>>( + total_count, output_backprop, bias_backprop, bias_size, + image_size); } } } diff --git a/tensorflow/core/kernels/bounds_check.h b/tensorflow/core/kernels/bounds_check.h index e35f42ad417..c8c60c55241 100644 --- a/tensorflow/core/kernels/bounds_check.h +++ b/tensorflow/core/kernels/bounds_check.h @@ -48,7 +48,7 @@ EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC const T SubtleMustCopy(const T &x) { auto *to_x = reinterpret_cast(&x); return *to_x; } -} // namespace tensorflow::internal +} // namespace internal } // namespace tensorflow #endif // TENSORFLOW_UTIL_BOUNDS_CHECK_H_ diff --git a/tensorflow/core/kernels/candidate_sampler_ops.cc b/tensorflow/core/kernels/candidate_sampler_ops.cc index e937c4f11ba..654d99301af 100644 --- a/tensorflow/core/kernels/candidate_sampler_ops.cc +++ b/tensorflow/core/kernels/candidate_sampler_ops.cc @@ -126,13 +126,13 @@ REGISTER_KERNEL_BUILDER(Name("UniformCandidateSampler").Device(DEVICE_CPU), REGISTER_KERNEL_BUILDER(Name("LogUniformCandidateSampler").Device(DEVICE_CPU), SimpleCandidateSamplerOp); -REGISTER_KERNEL_BUILDER(Name("LearnedUnigramCandidateSampler") - .Device(DEVICE_CPU), - SimpleCandidateSamplerOp); +REGISTER_KERNEL_BUILDER( + Name("LearnedUnigramCandidateSampler").Device(DEVICE_CPU), + SimpleCandidateSamplerOp); -REGISTER_KERNEL_BUILDER(Name("ThreadUnsafeUnigramCandidateSampler") - .Device(DEVICE_CPU), - SimpleCandidateSamplerOp); +REGISTER_KERNEL_BUILDER( + Name("ThreadUnsafeUnigramCandidateSampler").Device(DEVICE_CPU), + SimpleCandidateSamplerOp); class AllCandidateSamplerOp : public BaseCandidateSamplerOp { public: @@ -197,8 +197,9 @@ class ComputeAccidentalHitsOp : public OpKernel { void Compute(OpKernelContext* context) override { const Tensor& in_true_candidates = context->input(0); const TensorShape& in_true_candidates_shape = in_true_candidates.shape(); - OP_REQUIRES(context, TensorShapeUtils::IsMatrix(in_true_candidates_shape) && - in_true_candidates_shape.dim_size(1) == num_true_, + OP_REQUIRES(context, + TensorShapeUtils::IsMatrix(in_true_candidates_shape) && + in_true_candidates_shape.dim_size(1) == num_true_, errors::InvalidArgument( "true_candidates must be a batch_size * num_true matrix")); diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc index f16abb2b79f..626db9131ae 100644 --- a/tensorflow/core/kernels/cast_op.cc +++ b/tensorflow/core/kernels/cast_op.cc @@ -36,7 +36,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef 
TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #define CURRY_TYPES2(FN, arg0) \ FN(arg0, bool); \ @@ -223,11 +223,11 @@ class SyclCastOp : public CastOpBase { } }; -#define REGISTER_CAST_SYCL(srctype, dsttype) \ - REGISTER_KERNEL_BUILDER(Name("Cast") \ - .TypeConstraint("SrcT") \ - .TypeConstraint("DstT") \ - .Device(DEVICE_SYCL), \ +#define REGISTER_CAST_SYCL(srctype, dsttype) \ + REGISTER_KERNEL_BUILDER(Name("Cast") \ + .TypeConstraint("SrcT") \ + .TypeConstraint("DstT") \ + .Device(DEVICE_SYCL), \ SyclCastOp) CURRY_TYPES2(REGISTER_CAST_SYCL, bool); CURRY_TYPES2(REGISTER_CAST_SYCL, int32); @@ -237,7 +237,7 @@ CURRY_TYPES2(REGISTER_CAST_SYCL, double); #undef REGISTER_CAST_SYCL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #undef CURRY_TYPES2 @@ -250,6 +250,5 @@ REGISTER_KERNEL_BUILDER( REGISTER_KERNEL_BUILDER( Name("_HostCast").Device(DEVICE_SYCL).HostMemory("x").HostMemory("y"), CpuCastOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // end namespace tensorflow - diff --git a/tensorflow/core/kernels/cast_op.h b/tensorflow/core/kernels/cast_op.h index 8fedf2c271c..fd4e75d26f0 100644 --- a/tensorflow/core/kernels/cast_op.h +++ b/tensorflow/core/kernels/cast_op.h @@ -131,7 +131,8 @@ struct scalar_cast_op<::tensorflow::bfloat16, float> { p[0] = a.value; p[1] = 0; #else - static_assert(::tensorflow::port::kLittleEndian, "Not a little endian system!"); + static_assert(::tensorflow::port::kLittleEndian, + "Not a little endian system!"); p[0] = 0; p[1] = a.value; #endif diff --git a/tensorflow/core/kernels/cast_op_impl.h b/tensorflow/core/kernels/cast_op_impl.h index 470e9e08041..3ae9f2ab4d9 100644 --- a/tensorflow/core/kernels/cast_op_impl.h +++ b/tensorflow/core/kernels/cast_op_impl.h @@ -41,25 +41,25 @@ struct CastFunctor { o.device(d) = i.template cast(); } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor -#define CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \ - FN(arg0, arg1, bool); \ - FN(arg0, arg1, uint8); \ - FN(arg0, arg1, int8); \ - FN(arg0, arg1, uint16); \ - FN(arg0, arg1, int16); \ - FN(arg0, arg1, int32); \ - FN(arg0, arg1, int64); \ - FN(arg0, arg1, float); \ - FN(arg0, arg1, double); \ - FN(arg0, arg1, std::complex); \ +#define CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \ + FN(arg0, arg1, bool); \ + FN(arg0, arg1, uint8); \ + FN(arg0, arg1, int8); \ + FN(arg0, arg1, uint16); \ + FN(arg0, arg1, int16); \ + FN(arg0, arg1, int32); \ + FN(arg0, arg1, int64); \ + FN(arg0, arg1, float); \ + FN(arg0, arg1, double); \ + FN(arg0, arg1, std::complex); \ FN(arg0, arg1, std::complex) -#define CURRY_TYPES3(FN, arg0, arg1) \ - CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \ +#define CURRY_TYPES3(FN, arg0, arg1) \ + CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \ FN(arg0, arg1, Eigen::half); #define CAST_CASE(DEVICE, IN, OUT) \ diff --git a/tensorflow/core/kernels/cast_op_test.cc b/tensorflow/core/kernels/cast_op_test.cc index a106f287c18..057e209a719 100644 --- a/tensorflow/core/kernels/cast_op_test.cc +++ b/tensorflow/core/kernels/cast_op_test.cc @@ -107,10 +107,10 @@ static void BM_gpu_float_int64(int iters, int num) { testing::UseRealTime(); #if GOOGLE_CUDA test::Benchmark("gpu", Cast(num)).Run(iters); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL test::Benchmark("sycl", Cast(num)).Run(iters); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20); @@ -130,10 +130,10 @@ static void 
BM_gpu_bool_float(int iters, int num) { testing::UseRealTime(); #if GOOGLE_CUDA test::Benchmark("gpu", Cast(num)).Run(iters); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL test::Benchmark("sycl", Cast(num)).Run(iters); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20); @@ -180,7 +180,7 @@ static void BM_gpu_float_half(int iters, int num) { testing::UseRealTime(); #if GOOGLE_CUDA test::Benchmark("gpu", Cast(num)).Run(iters); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA } BENCHMARK(BM_gpu_float_half)->Arg(64 << 10)->Arg(32 << 20); @@ -191,7 +191,7 @@ static void BM_gpu_half_float(int iters, int num) { testing::UseRealTime(); #if GOOGLE_CUDA test::Benchmark("gpu", Cast(num)).Run(iters); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA } BENCHMARK(BM_gpu_half_float)->Arg(64 << 10)->Arg(32 << 20); diff --git a/tensorflow/core/kernels/colorspace_op.cc b/tensorflow/core/kernels/colorspace_op.cc index ba100b32e7d..9cc2e67bbe1 100644 --- a/tensorflow/core/kernels/colorspace_op.cc +++ b/tensorflow/core/kernels/colorspace_op.cc @@ -107,14 +107,14 @@ class HSVToRGBOp : public OpKernel { } }; -#define REGISTER_CPU(T) \ - REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - RGBToHSVOp); \ - template class RGBToHSVOp; \ - REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - HSVToRGBOp); \ +#define REGISTER_CPU(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("RGBToHSV").Device(DEVICE_CPU).TypeConstraint("T"), \ + RGBToHSVOp); \ + template class RGBToHSVOp; \ + REGISTER_KERNEL_BUILDER( \ + Name("HSVToRGB").Device(DEVICE_CPU).TypeConstraint("T"), \ + HSVToRGBOp); \ template class HSVToRGBOp; TF_CALL_float(REGISTER_CPU); TF_CALL_double(REGISTER_CPU); @@ -123,40 +123,39 @@ TF_CALL_double(REGISTER_CPU); // Forward declarations of the function specializations for GPU (to prevent // building the GPU versions here, they will be built compiling _gpu.cu.cc). 
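A brief aside on the CURRY_TYPES2/CURRY_TYPES3 macros reformatted in the cast kernel hunks above: they are type-currying X-macros, where one macro invokes a caller-supplied macro once per supported element type, so a single line fans out into many registrations or instantiations. A toy, self-contained version of the pattern, with all names made up for illustration:

#include <iostream>

// One macro enumerates the types; the FN argument decides what to emit for
// each of them, much as CURRY_TYPES2 does for cast kernel registration.
#define FOR_EACH_DEMO_TYPE(FN, tag) \
  FN(tag, bool);                    \
  FN(tag, int);                     \
  FN(tag, float);                   \
  FN(tag, double)

#define PRINT_SIZE(tag, T) \
  std::cout << (tag) << ": sizeof(" #T ") = " << sizeof(T) << '\n'

int main() {
  FOR_EACH_DEMO_TYPE(PRINT_SIZE, "cast-demo");
  return 0;
}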
namespace functor { -#define DECLARE_GPU(T) \ - template <> \ - void RGBToHSV::operator()(const GPUDevice& d, \ - TTypes::ConstTensor input_data, \ - TTypes::Tensor range, \ - TTypes::Tensor output_data); \ - extern template struct RGBToHSV; \ - template <> \ - void HSVToRGB::operator()(const GPUDevice& d, \ - TTypes::ConstTensor input_data, \ - TTypes::Tensor output_data); \ +#define DECLARE_GPU(T) \ + template <> \ + void RGBToHSV::operator()( \ + const GPUDevice& d, TTypes::ConstTensor input_data, \ + TTypes::Tensor range, TTypes::Tensor output_data); \ + extern template struct RGBToHSV; \ + template <> \ + void HSVToRGB::operator()( \ + const GPUDevice& d, TTypes::ConstTensor input_data, \ + TTypes::Tensor output_data); \ extern template struct HSVToRGB; TF_CALL_float(DECLARE_GPU); TF_CALL_double(DECLARE_GPU); } // namespace functor -#define REGISTER_GPU(T) \ - REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_GPU) \ - .TypeConstraint("T"), \ - RGBToHSVOp); \ - REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_GPU) \ - .TypeConstraint("T"), \ - HSVToRGBOp); +#define REGISTER_GPU(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("RGBToHSV").Device(DEVICE_GPU).TypeConstraint("T"), \ + RGBToHSVOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("HSVToRGB").Device(DEVICE_GPU).TypeConstraint("T"), \ + HSVToRGBOp); TF_CALL_float(REGISTER_GPU); TF_CALL_double(REGISTER_GPU); #endif #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL(T) \ - REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - RGBToHSVOp); \ - REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - HSVToRGBOp); +#define REGISTER_SYCL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("RGBToHSV").Device(DEVICE_SYCL).TypeConstraint("T"), \ + RGBToHSVOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("HSVToRGB").Device(DEVICE_SYCL).TypeConstraint("T"), \ + HSVToRGBOp); TF_CALL_float(REGISTER_SYCL); TF_CALL_double(REGISTER_SYCL); #endif diff --git a/tensorflow/core/kernels/colorspace_op.h b/tensorflow/core/kernels/colorspace_op.h index c5721ef6dd0..90bfce14194 100644 --- a/tensorflow/core/kernels/colorspace_op.h +++ b/tensorflow/core/kernels/colorspace_op.h @@ -54,10 +54,9 @@ struct RGBToHSV { // TODO(wicke): all these assignments are only necessary because a combined // expression is larger than kernel parameter space. A custom kernel is // probably in order. - H.device(d) = (R == V).select(norm * (G - B), - (G == V).select( - norm * (B - R) + T(2) / T(6), - norm * (R - G) + T(4) / T(6))); + H.device(d) = (R == V).select( + norm * (G - B), (G == V).select(norm * (B - R) + T(2) / T(6), + norm * (R - G) + T(4) / T(6))); H.device(d) = (range > T(0)).select(H, H.constant(T(0))); H.device(d) = (H < T(0)).select(H + T(1), H); } diff --git a/tensorflow/core/kernels/colorspace_op_gpu.cu.cc b/tensorflow/core/kernels/colorspace_op_gpu.cu.cc index e19d0b14d5d..61f9ba44c46 100644 --- a/tensorflow/core/kernels/colorspace_op_gpu.cu.cc +++ b/tensorflow/core/kernels/colorspace_op_gpu.cu.cc @@ -17,8 +17,8 @@ limitations under the License. 
#define EIGEN_USE_GPU -#include "tensorflow/core/kernels/colorspace_op.h" #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/colorspace_op.h" namespace tensorflow { @@ -29,6 +29,6 @@ typedef Eigen::GpuDevice GPUDevice; template class functor::HSVToRGB; TF_CALL_float(INSTANTIATE_GPU); TF_CALL_double(INSTANTIATE_GPU); -} +} // namespace tensorflow #endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/colorspace_op_test.cc b/tensorflow/core/kernels/colorspace_op_test.cc index 8c6fb732abf..bd82826770f 100644 --- a/tensorflow/core/kernels/colorspace_op_test.cc +++ b/tensorflow/core/kernels/colorspace_op_test.cc @@ -224,34 +224,34 @@ class HSVToRGBOpTest : public OpsTestBase { } }; -#define TEST_COLORSPACE(test, dt) \ - TEST_F(test, CheckBlack) { \ - MakeOp(dt); \ - CheckBlack(dt); \ - } \ - TEST_F(test, CheckGray) { \ - MakeOp(dt); \ - CheckGray(dt); \ - } \ - TEST_F(test, CheckWhite) { \ - MakeOp(dt); \ - CheckWhite(dt); \ - } \ - TEST_F(test, CheckRedMax) { \ - MakeOp(dt); \ - CheckRedMax(dt); \ - } \ - TEST_F(test, CheckGreenMax) { \ - MakeOp(dt); \ - CheckGreenMax(dt); \ - } \ - TEST_F(test, CheckBlueMax) { \ - MakeOp(dt); \ - CheckBlueMax(dt); \ - } \ - TEST_F(test, CheckNegativeDifference) { \ - MakeOp(dt); \ - CheckNegativeDifference(dt); \ +#define TEST_COLORSPACE(test, dt) \ + TEST_F(test, CheckBlack) { \ + MakeOp(dt); \ + CheckBlack(dt); \ + } \ + TEST_F(test, CheckGray) { \ + MakeOp(dt); \ + CheckGray(dt); \ + } \ + TEST_F(test, CheckWhite) { \ + MakeOp(dt); \ + CheckWhite(dt); \ + } \ + TEST_F(test, CheckRedMax) { \ + MakeOp(dt); \ + CheckRedMax(dt); \ + } \ + TEST_F(test, CheckGreenMax) { \ + MakeOp(dt); \ + CheckGreenMax(dt); \ + } \ + TEST_F(test, CheckBlueMax) { \ + MakeOp(dt); \ + CheckBlueMax(dt); \ + } \ + TEST_F(test, CheckNegativeDifference) { \ + MakeOp(dt); \ + CheckNegativeDifference(dt); \ } typedef RGBToHSVOpTest rgb_to_hsv_float; diff --git a/tensorflow/core/kernels/concat_lib.h b/tensorflow/core/kernels/concat_lib.h index 526f9420d72..16784c4770e 100644 --- a/tensorflow/core/kernels/concat_lib.h +++ b/tensorflow/core/kernels/concat_lib.h @@ -41,10 +41,11 @@ namespace tensorflow { // Assumes all inputs are nonempty template -void ConcatCPU(DeviceBase* d, - const std::vector< - std::unique_ptr::ConstMatrix>>& inputs, - typename TTypes::Matrix* output); +void ConcatCPU( + DeviceBase* d, + const std::vector::ConstMatrix>>& + inputs, + typename TTypes::Matrix* output); #if GOOGLE_CUDA template void ConcatGPU( @@ -57,11 +58,12 @@ void ConcatGPU( #ifdef TENSORFLOW_USE_SYCL template -void ConcatSYCL(const Eigen::SyclDevice& d, - const std::vector< - std::unique_ptr::ConstMatrix>>& inputs, - typename TTypes::Matrix* output); -#endif // TENSORFLOW_USE_SYCL +void ConcatSYCL( + const Eigen::SyclDevice& d, + const std::vector::ConstMatrix>>& + inputs, + typename TTypes::Matrix* output); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow #endif // TENSORFLOW_KERNELS_CONCAT_LIB_H_ diff --git a/tensorflow/core/kernels/concat_lib_cpu.cc b/tensorflow/core/kernels/concat_lib_cpu.cc index 43731114c0b..fc5a3e62885 100644 --- a/tensorflow/core/kernels/concat_lib_cpu.cc +++ b/tensorflow/core/kernels/concat_lib_cpu.cc @@ -48,10 +48,11 @@ struct MemCpyCopier { } // namespace template -void ConcatCPU(DeviceBase* d, - const std::vector< - std::unique_ptr::ConstMatrix>>& inputs, - typename TTypes::Matrix* output) { +void ConcatCPU( + DeviceBase* d, + const std::vector::ConstMatrix>>& + inputs, + typename TTypes::Matrix* output) { if 
(std::is_same::value) { // use a large cost here to force strings to be handled by separate threads ConcatCPUImpl(d, inputs, 100000, MemCpyCopier(), output); @@ -86,21 +87,22 @@ TF_CALL_variant(REGISTER) #ifdef TENSORFLOW_USE_SYCL template -void ConcatSYCL(const Eigen::SyclDevice& d, - const std::vector< - std::unique_ptr::ConstMatrix>>& inputs, - typename TTypes::Matrix* output) { +void ConcatSYCL( + const Eigen::SyclDevice& d, + const std::vector::ConstMatrix>>& + inputs, + typename TTypes::Matrix* output) { ConcatSYCLImpl(d, inputs, sizeof(T) /* cost_per_unit */, MemCpyCopier(), - output); + output); } -#define REGISTER_SYCL(T) \ - template void ConcatSYCL( \ - const Eigen::SyclDevice&, \ - const std::vector::ConstMatrix>>&, \ - typename TTypes::Matrix* output); +#define REGISTER_SYCL(T) \ + template void ConcatSYCL( \ + const Eigen::SyclDevice&, \ + const std::vector::ConstMatrix>>&, \ + typename TTypes::Matrix* output); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL) #undef REGISTER_SYCL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/concat_lib_cpu.h b/tensorflow/core/kernels/concat_lib_cpu.h index 6a933efde4b..720b5065377 100644 --- a/tensorflow/core/kernels/concat_lib_cpu.h +++ b/tensorflow/core/kernels/concat_lib_cpu.h @@ -15,9 +15,9 @@ limitations under the License. #define EIGEN_USE_THREADS -#include "tensorflow/core/kernels/concat_lib.h" #include #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/concat_lib.h" #include "tensorflow/core/util/work_sharder.h" namespace tensorflow { @@ -73,7 +73,7 @@ void ConcatCPUImpl( // Sharded mode. auto work = [&row_size, &sizes, &inputs, &output, &copier, &num_inputs]( - int64 start, int64 end) { + int64 start, int64 end) { int64 skipped_rows = start / row_size; T* out = output->data() + skipped_rows * row_size; T* out_start = output->data() + start; @@ -160,5 +160,5 @@ void ConcatSYCLImpl( } } } -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc index ae1b5da32ea..7011550f7e1 100644 --- a/tensorflow/core/kernels/concat_op.cc +++ b/tensorflow/core/kernels/concat_op.cc @@ -37,7 +37,7 @@ typedef Eigen::GpuDevice GPUDevice; #endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM }; @@ -71,8 +71,9 @@ class ConcatBaseOp : public OpKernel { const TensorShape& input_shape = values[0].shape(); int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim; - OP_REQUIRES(c, (0 <= axis && axis < input_dims) || - (allow_legacy_scalars() && concat_dim == 0), + OP_REQUIRES(c, + (0 <= axis && axis < input_dims) || + (allow_legacy_scalars() && concat_dim == 0), errors::InvalidArgument( "ConcatOp : Expected concatenating dimensions in the range " "[", @@ -97,8 +98,8 @@ class ConcatBaseOp : public OpKernel { c, in.dims() == input_dims || (input_is_scalar && in_is_scalar), errors::InvalidArgument( "ConcatOp : Ranks of all input tensors should match: shape[0] = ", - input_shape.DebugString(), " vs. shape[", i, "] = ", - in.shape().DebugString())); + input_shape.DebugString(), " vs. 
shape[", i, + "] = ", in.shape().DebugString())); for (int j = 0; j < input_dims; ++j) { if (j == axis) { continue; @@ -107,8 +108,8 @@ class ConcatBaseOp : public OpKernel { c, in.dim_size(j) == input_shape.dim_size(j), errors::InvalidArgument( "ConcatOp : Dimensions of inputs should match: shape[0] = ", - input_shape.DebugString(), " vs. shape[", i, "] = ", - in.shape().DebugString())); + input_shape.DebugString(), " vs. shape[", i, + "] = ", in.shape().DebugString())); } if (in.NumElements() > 0) { int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0; @@ -142,7 +143,7 @@ class ConcatBaseOp : public OpKernel { ConcatSYCL(c->eigen_sycl_device(), inputs_flat, &output_flat); return; } -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL ConcatCPU(c->device(), inputs_flat, &output_flat); } } @@ -252,7 +253,7 @@ REGISTER_KERNEL_BUILDER(Name("ConcatV2") ConcatV2Op); #undef REGISTER_SYCL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class ConcatOffsetOp : public OpKernel { public: @@ -347,5 +348,5 @@ REGISTER_KERNEL_BUILDER(Name("ConcatOffset") .HostMemory("shape") .HostMemory("offset"), ConcatOffsetOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/concat_op_test.cc b/tensorflow/core/kernels/concat_op_test.cc index c5bded9dafc..e3ba8ae9f69 100644 --- a/tensorflow/core/kernels/concat_op_test.cc +++ b/tensorflow/core/kernels/concat_op_test.cc @@ -157,7 +157,8 @@ BENCHMARK(BM_MemcpyAlternativeDim0)->Arg(1000)->Arg(100000)->Arg(1000000); BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000); typedef Eigen::TensorMap, - Eigen::Unaligned> EigenMap; + Eigen::Unaligned> + EigenMap; static void MemcpyManyAlternative1(int iters, int dim2) { testing::StopTiming(); diff --git a/tensorflow/core/kernels/conditional_accumulator_base.h b/tensorflow/core/kernels/conditional_accumulator_base.h index 794ac6fa6de..c7c7c983691 100644 --- a/tensorflow/core/kernels/conditional_accumulator_base.h +++ b/tensorflow/core/kernels/conditional_accumulator_base.h @@ -160,7 +160,7 @@ class ConditionalAccumulatorBase : public ResourceBase { * Modifications to convenience macros defined in core/framework/op_kernel.h. * The below macros return a boolean if the test fails, so that the calling * function can get an indication that a failure has occurred. 
-*/ + */ #define OP_REQUIRES_BOOLEAN(CTX, EXP, STATUS) \ do { \ if (!TF_PREDICT_TRUE(EXP)) { \ diff --git a/tensorflow/core/kernels/conditional_accumulator_op.cc b/tensorflow/core/kernels/conditional_accumulator_op.cc index fa37916eaba..e13bf8a4c63 100644 --- a/tensorflow/core/kernels/conditional_accumulator_op.cc +++ b/tensorflow/core/kernels/conditional_accumulator_op.cc @@ -99,9 +99,10 @@ class AccumulatorTakeGradientOp ConditionalAccumulatorBase* accumulator, DoneCallback callback) override { // Check signature - OP_REQUIRES_OK_ASYNC(ctx, ctx->MatchSignature({DT_STRING_REF, DT_INT32}, - {accumulator->dtype()}), - callback); + OP_REQUIRES_OK_ASYNC( + ctx, + ctx->MatchSignature({DT_STRING_REF, DT_INT32}, {accumulator->dtype()}), + callback); } private: @@ -111,5 +112,4 @@ class AccumulatorTakeGradientOp REGISTER_KERNEL_BUILDER(Name("AccumulatorTakeGradient").Device(DEVICE_CPU), AccumulatorTakeGradientOp); - } // namespace tensorflow diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc index 59f9f69315e..920cd87858a 100644 --- a/tensorflow/core/kernels/constant_op.cc +++ b/tensorflow/core/kernels/constant_op.cc @@ -146,7 +146,6 @@ typedef Eigen::GpuDevice GPUDevice; typedef Eigen::SyclDevice SYCLDevice; #endif // TENSORFLOW_USE_SYCL - template class FillOp : public OpKernel { public: diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc index 8fe82d118a7..7d5d54e5bec 100644 --- a/tensorflow/core/kernels/control_flow_ops.cc +++ b/tensorflow/core/kernels/control_flow_ops.cc @@ -113,47 +113,47 @@ REGISTER_GPU_HOST_REF_KERNEL(string); #undef REGISTER_GPU_HOST_REF_KERNEL #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_SWITCH(type) \ - REGISTER_KERNEL_BUILDER(Name("Switch") \ - .Device(DEVICE_SYCL) \ - .HostMemory("pred") \ - .TypeConstraint("T"),\ +#define REGISTER_SYCL_SWITCH(type) \ + REGISTER_KERNEL_BUILDER(Name("Switch") \ + .Device(DEVICE_SYCL) \ + .HostMemory("pred") \ + .TypeConstraint("T"), \ SwitchOp) TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_SWITCH); -#define REGISTER_SYCL_REF_SWITCH(type) \ - REGISTER_KERNEL_BUILDER(Name("RefSwitch") \ - .Device(DEVICE_SYCL) \ - .HostMemory("pred") \ - .TypeConstraint("T"), \ +#define REGISTER_SYCL_REF_SWITCH(type) \ + REGISTER_KERNEL_BUILDER(Name("RefSwitch") \ + .Device(DEVICE_SYCL) \ + .HostMemory("pred") \ + .TypeConstraint("T"), \ SwitchOp) TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_SWITCH); #undef REGISTER_SYCL_SWITCH #undef REGISTER_SYCL_REF_SWITCH -#define REGISTER_SYCL_HOST_KERNEL(type) \ - REGISTER_KERNEL_BUILDER(Name("Switch") \ - .Device(DEVICE_SYCL) \ - .HostMemory("data") \ - .HostMemory("pred") \ - .HostMemory("output_false")\ - .HostMemory("output_true") \ - .TypeConstraint("T"),\ +#define REGISTER_SYCL_HOST_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Switch") \ + .Device(DEVICE_SYCL) \ + .HostMemory("data") \ + .HostMemory("pred") \ + .HostMemory("output_false") \ + .HostMemory("output_true") \ + .TypeConstraint("T"), \ SwitchOp) REGISTER_SYCL_HOST_KERNEL(bool); REGISTER_SYCL_HOST_KERNEL(string); REGISTER_SYCL_HOST_KERNEL(int32); -#define REGISTER_SYCL_HOST_REF_KERNEL(type) \ - REGISTER_KERNEL_BUILDER(Name("RefSwitch") \ - .Device(DEVICE_SYCL) \ - .HostMemory("data") \ - .HostMemory("pred") \ - .HostMemory("output_false") \ - .HostMemory("output_true") \ - .TypeConstraint("T"), \ +#define REGISTER_SYCL_HOST_REF_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("RefSwitch") \ + .Device(DEVICE_SYCL) \ + .HostMemory("data") \ + 
.HostMemory("pred") \ + .HostMemory("output_false") \ + .HostMemory("output_true") \ + .TypeConstraint("T"), \ SwitchOp) REGISTER_SYCL_HOST_REF_KERNEL(int32); @@ -162,7 +162,7 @@ REGISTER_SYCL_HOST_REF_KERNEL(string); #undef REGISTER_SYCL_HOST_KERNEL #undef REGISTER_SYCL_HOST_REF_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class RefSelectOp : public OpKernel { public: @@ -282,7 +282,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL); #undef REGISTER_SYCL_KERNEL #undef REGISTER_SYCL_REF_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Special GPU kernels for int32 and string. // TODO(b/25387198): Also enable int32 in device memory. This kernel @@ -331,7 +331,7 @@ REGISTER_SYCL_HOST_KERNEL(string); REGISTER_SYCL_HOST_KERNEL(ResourceHandle); #undef REGISTER_SYCL_HOST_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL void EnterOp::Compute(OpKernelContext* context) { if (IsRefType(context->input_dtype(0))) { @@ -360,14 +360,14 @@ REGISTER_GPU_REF_KERNEL(bool); #undef REGISTER_GPU_REF_KERNEL #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ Name("Enter").Device(DEVICE_SYCL).TypeConstraint("T"), EnterOp) REGISTER_SYCL_KERNEL(bool); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); -#define REGISTER_SYCL_REF_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ +#define REGISTER_SYCL_REF_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ Name("RefEnter").Device(DEVICE_SYCL).TypeConstraint("T"), EnterOp) REGISTER_SYCL_REF_KERNEL(bool); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL); @@ -398,7 +398,7 @@ REGISTER_SYCL_HOST_KERNEL(ResourceHandle); #undef REGISTER_SYCL_HOST_KERNEL #undef REGISTER_SYCL_HOST_REF_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Special GPU kernels for int32 and string. // TODO(b/25387198): Also enable int32 in device memory. This kernel @@ -455,10 +455,10 @@ REGISTER_GPU_REF_KERNEL(bool); #undef REGISTER_GPU_REF_KERNEL #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("Exit").Device(DEVICE_SYCL).TypeConstraint("T"), ExitOp); \ - REGISTER_KERNEL_BUILDER( \ +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Exit").Device(DEVICE_SYCL).TypeConstraint("T"), ExitOp); \ + REGISTER_KERNEL_BUILDER( \ Name("RefExit").Device(DEVICE_SYCL).TypeConstraint("T"), ExitOp); REGISTER_SYCL_KERNEL(bool); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); @@ -483,7 +483,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); REGISTER_SYCL_HOST_KERNEL(int32); REGISTER_SYCL_HOST_KERNEL(string); #undef REGISTER_SYCL_HOST_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Special GPU kernels for int32 and string. // TODO(b/25387198): Also enable int32 in device memory. 
This kernel @@ -556,12 +556,12 @@ REGISTER_GPU_HOST_KERNEL(string); #undef REGISTER_GPU_HOST_KERNEL #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint("T"), \ - NextIterationOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("RefNextIteration").Device(DEVICE_SYCL).TypeConstraint("T"),\ +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint("T"), \ + NextIterationOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("RefNextIteration").Device(DEVICE_SYCL).TypeConstraint("T"), \ NextIterationOp) REGISTER_SYCL_KERNEL(bool); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); @@ -585,7 +585,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); REGISTER_SYCL_HOST_KERNEL(int32); REGISTER_SYCL_HOST_KERNEL(string); #undef REGISTER_SYCL_HOST_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // A LoopCond op has one input and one output. The input is a boolean // scalar representing the taken branches of the "pivot" Switch that @@ -619,7 +619,7 @@ REGISTER_KERNEL_BUILDER(Name("LoopCond") .HostMemory("input") .HostMemory("output"), LoopCondOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // ControlTrigger kernels REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_CPU), @@ -631,7 +631,7 @@ REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_GPU), #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_SYCL), ControlTriggerOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // When called, abort op will abort the current process. This can be used to // abort remote PSs when needed. diff --git a/tensorflow/core/kernels/control_flow_ops_test.cc b/tensorflow/core/kernels/control_flow_ops_test.cc index affa0e8ca6b..a2f7bd40692 100644 --- a/tensorflow/core/kernels/control_flow_ops_test.cc +++ b/tensorflow/core/kernels/control_flow_ops_test.cc @@ -91,6 +91,7 @@ class KilledBySignal { public: explicit KilledBySignal(int signum) : signum_(signum) {} bool operator()(int exit_status) const { return exit_status == signum_; } + private: const int signum_; }; diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc index 985586d6262..dbddaf3dc64 100644 --- a/tensorflow/core/kernels/conv_ops.cc +++ b/tensorflow/core/kernels/conv_ops.cc @@ -688,7 +688,7 @@ void LaunchConv2DOp::operator()( static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit( // default value is in bytes despite the name of the environment variable "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB - ); + ); int device_id = stream->parent()->device_ordinal(); DataType dtype = input.dtype(); diff --git a/tensorflow/core/kernels/conv_ops_fused.cc b/tensorflow/core/kernels/conv_ops_fused.cc index 291ebf22987..1b40ad81f41 100644 --- a/tensorflow/core/kernels/conv_ops_fused.cc +++ b/tensorflow/core/kernels/conv_ops_fused.cc @@ -679,8 +679,9 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel { const int dims = resized_shape.dims(); OP_REQUIRES( - context, TensorShapeUtils::IsMatrix(paddings.shape()) && - paddings.dim_size(1) == 2, + context, + TensorShapeUtils::IsMatrix(paddings.shape()) && + paddings.dim_size(1) == 2, errors::InvalidArgument("paddings must be a matrix with 2 columns: ", paddings.shape().DebugString())); const int fixed_dims = @@ -715,20 +716,22 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel { const int32 after = paddings_matrix(d, 1); 
// Pad after existing elements. OP_REQUIRES(context, before >= 0 && after >= 0, - errors::InvalidArgument("paddings must be non-negative: ", - before, " ", after)); + errors::InvalidArgument( + "paddings must be non-negative: ", before, " ", after)); if (offset_ == 0) { // SYMMETRIC mode. OP_REQUIRES( - context, before <= resized_shape.dim_size(d) && - after <= resized_shape.dim_size(d), + context, + before <= resized_shape.dim_size(d) && + after <= resized_shape.dim_size(d), errors::InvalidArgument("paddings must be no greater " "than the dimension size: ", before, ", ", after, " greater than ", resized_shape.dim_size(d))); } else if (offset_ == 1) { // REFLECT mode. OP_REQUIRES( - context, before < resized_shape.dim_size(d) && - after < resized_shape.dim_size(d), + context, + before < resized_shape.dim_size(d) && + after < resized_shape.dim_size(d), errors::InvalidArgument("paddings must be less than" " the dimension size: ", before, ", ", after, " not less than ", @@ -767,18 +770,19 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel { // We only check the first three dims, since the depth is accessed as an // int64 below. for (int i = 0; i < 3; i++) { - OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i), - std::numeric_limits::max()), - errors::InvalidArgument("filter too large")); + OP_REQUIRES( + context, + FastBoundsCheck(filter.dim_size(i), std::numeric_limits::max()), + errors::InvalidArgument("filter too large")); } // The last dimension for input is in_depth. It must be the same as the // filter's in_depth. const int64 in_depth = padded_shape.dim_size(3); - OP_REQUIRES( - context, in_depth == filter.dim_size(2), - errors::InvalidArgument("input and filter must have the same depth: ", - in_depth, " vs ", filter.dim_size(2))); + OP_REQUIRES(context, in_depth == filter.dim_size(2), + errors::InvalidArgument( + "input and filter must have the same depth: ", in_depth, + " vs ", filter.dim_size(2))); // The last dimension for filter is out_depth. const int out_depth = static_cast(filter.dim_size(3)); @@ -786,9 +790,10 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel { // The second dimension for input is rows/height. // The first dimension for filter is rows/height. const int64 padded_rows_raw = padded_shape.dim_size(1); - OP_REQUIRES(context, FastBoundsCheck(padded_rows_raw, - std::numeric_limits::max()), - errors::InvalidArgument("Input rows too large")); + OP_REQUIRES( + context, + FastBoundsCheck(padded_rows_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input rows too large")); const int padded_rows = static_cast(padded_rows_raw); const int filter_rows = static_cast(filter.dim_size(0)); const int resized_rows = static_cast(resized_shape.dim_size(1)); @@ -796,9 +801,10 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel { // The third dimension for input is columns/width. // The second dimension for filter is columns/width. 
const int64 padded_cols_raw = padded_shape.dim_size(2); - OP_REQUIRES(context, FastBoundsCheck(padded_cols_raw, - std::numeric_limits::max()), - errors::InvalidArgument("Input cols too large")); + OP_REQUIRES( + context, + FastBoundsCheck(padded_cols_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input cols too large")); const int padded_cols = static_cast(padded_cols_raw); const int filter_cols = static_cast(filter.dim_size(1)); const int resized_cols = static_cast(resized_shape.dim_size(2)); @@ -864,24 +870,26 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel { TF_DISALLOW_COPY_AND_ASSIGN(FusedResizeConv2DUsingGemmOp); }; -#define REGISTER_FUSED(T) \ - REGISTER_KERNEL_BUILDER( \ - Name("FusedResizeAndPadConv2D") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - FusedResizeConv2DUsingGemmOp< \ - T, FusedResizeAndPadConvFunctor, \ - BILINEAR>, \ +#define REGISTER_FUSED(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("FusedResizeAndPadConv2D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T"), \ + FusedResizeConv2DUsingGemmOp< \ + T, \ + FusedResizeAndPadConvFunctor, \ + BILINEAR>, \ true>); TF_CALL_float(REGISTER_FUSED); -#define REGISTER_PAD_ONLY_FUSED(T) \ - REGISTER_KERNEL_BUILDER( \ - Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint("T"), \ - FusedResizeConv2DUsingGemmOp< \ - T, FusedResizeAndPadConvFunctor, \ - NEAREST>, \ +#define REGISTER_PAD_ONLY_FUSED(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint("T"), \ + FusedResizeConv2DUsingGemmOp< \ + T, \ + FusedResizeAndPadConvFunctor, \ + NEAREST>, \ false>); TF_CALL_float(REGISTER_PAD_ONLY_FUSED); diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h index 57e196c67cf..f0085be3a53 100644 --- a/tensorflow/core/kernels/conv_ops_gpu.h +++ b/tensorflow/core/kernels/conv_ops_gpu.h @@ -27,7 +27,6 @@ limitations under the License. namespace tensorflow { - // Get the Cudnn workspace limit from the environment variable, which is in MB. // Return the workspace memory limit in bytes. If no value is set, return the // default value. diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc index af6013c9747..e58f5f61f35 100644 --- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc +++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc @@ -25,9 +25,9 @@ limitations under the License. 
#include "cuda/include/cuda.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/kernels/conv_2d.h" +#include "tensorflow/core/lib/math/math_util.h" #include "tensorflow/core/util/cuda_kernel_helper.h" #include "tensorflow/core/util/tensor_format.h" -#include "tensorflow/core/lib/math/math_util.h" namespace tensorflow { @@ -252,11 +252,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles( int x = threadIdx.x; Dimension<3> output_dims = { - input_dims[0], input_dims[2], input_dims[1], + input_dims[0], + input_dims[2], + input_dims[1], }; Dimension<3> input_dims_in_tiles = { - input_dims[0], (input_dims[1] + TileSizeI - 1) / TileSizeI, + input_dims[0], + (input_dims[1] + TileSizeI - 1) / TileSizeI, (input_dims[2] + TileSizeJ - 1) / TileSizeJ, }; @@ -264,7 +267,8 @@ __global__ void SwapDimension1And2InTensor3UsingTiles( FlatToTensorIndex(blockIdx.x, input_dims_in_tiles); Index<3> input_tile_origin = { - input_tile_index[0], input_tile_index[1] * TileSizeI, + input_tile_index[0], + input_tile_index[1] * TileSizeI, input_tile_index[2] * TileSizeJ, }; @@ -322,11 +326,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles( __syncthreads(); Index<3> output_tile_index = { - input_tile_index[0], input_tile_index[2], input_tile_index[1], + input_tile_index[0], + input_tile_index[2], + input_tile_index[1], }; Index<3> output_tile_origin = { - output_tile_index[0], output_tile_index[1] * TileSizeJ, + output_tile_index[0], + output_tile_index[1] * TileSizeJ, output_tile_index[2] * TileSizeI, }; @@ -799,7 +806,7 @@ struct TransposeElemType<16> { // A helper function to make RunSwapDimension1And2InTensor3 concise. This // helper function looks at the data type and input matrix sizes and decides // the thread numbers and tile sizes to use. 
-template +template void SwapDimension1And2InTensor3WithNarrowMatrices( const GPUDevice& d, const T* input, const Dimension<3>& input_dims, T* output, const int kMinDimensionToUseTiles) { @@ -902,19 +909,21 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input, constexpr int kNumThreads = 256; Dimension<3> input_dims_in_tiles = { - input_dims[0], MathUtil::CeilOfRatio(input_dims[1], kTileSize), + input_dims[0], + MathUtil::CeilOfRatio(input_dims[1], kTileSize), MathUtil::CeilOfRatio(input_dims[2], kTileSize), }; int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] * input_dims_in_tiles[2]; - SwapDimension1And2InTensor3UsingTiles + SwapDimension1And2InTensor3UsingTiles <<>>(input, input_dims, output); } else if (narrow_matrix) { - SwapDimension1And2InTensor3WithNarrowMatrices(d, input, input_dims, output, - kMinDimensionToUseTiles); + SwapDimension1And2InTensor3WithNarrowMatrices( + d, input, input_dims, output, kMinDimensionToUseTiles); } else { int total_element_count = input_dims[0] * input_dims[1] * input_dims[2]; CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d); diff --git a/tensorflow/core/kernels/conv_ops_using_gemm.cc b/tensorflow/core/kernels/conv_ops_using_gemm.cc index 20da77c36f6..af0a9fa82ee 100644 --- a/tensorflow/core/kernels/conv_ops_using_gemm.cc +++ b/tensorflow/core/kernels/conv_ops_using_gemm.cc @@ -468,18 +468,19 @@ class Conv2DUsingGemmOp : public BinaryOp { filter.shape().DebugString())); for (int i = 0; i < 3; i++) { - OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i), - std::numeric_limits::max()), - errors::InvalidArgument("filter too large")); + OP_REQUIRES( + context, + FastBoundsCheck(filter.dim_size(i), std::numeric_limits::max()), + errors::InvalidArgument("filter too large")); } // The last dimension for input is in_depth. It must be the same as the // filter's in_depth. const int64 in_depth = GetTensorDim(input, data_format_, 'C'); - OP_REQUIRES( - context, in_depth == filter.dim_size(2), - errors::InvalidArgument("input and filter must have the same depth: ", - in_depth, " vs ", filter.dim_size(2))); + OP_REQUIRES(context, in_depth == filter.dim_size(2), + errors::InvalidArgument( + "input and filter must have the same depth: ", in_depth, + " vs ", filter.dim_size(2))); // The last dimension for filter is out_depth. const int out_depth = static_cast(filter.dim_size(3)); @@ -487,18 +488,20 @@ class Conv2DUsingGemmOp : public BinaryOp { // The second dimension for input is rows/height. // The first dimension for filter is rows/height. const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H'); - OP_REQUIRES(context, FastBoundsCheck(input_rows_raw, - std::numeric_limits::max()), - errors::InvalidArgument("Input rows too large")); + OP_REQUIRES( + context, + FastBoundsCheck(input_rows_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input rows too large")); const int input_rows = static_cast(input_rows_raw); const int filter_rows = static_cast(filter.dim_size(0)); // The third dimension for input is columns/width. // The second dimension for filter is columns/width. 
const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W'); - OP_REQUIRES(context, FastBoundsCheck(input_cols_raw, - std::numeric_limits::max()), - errors::InvalidArgument("Input cols too large")); + OP_REQUIRES( + context, + FastBoundsCheck(input_cols_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input cols too large")); const int input_cols = static_cast(input_cols_raw); const int filter_cols = static_cast(filter.dim_size(1)); diff --git a/tensorflow/core/kernels/cross_op_gpu.cu.cc b/tensorflow/core/kernels/cross_op_gpu.cu.cc index 7ea0b3be0ca..4a37f6cfbbc 100644 --- a/tensorflow/core/kernels/cross_op_gpu.cu.cc +++ b/tensorflow/core/kernels/cross_op_gpu.cu.cc @@ -17,8 +17,8 @@ limitations under the License. #define EIGEN_USE_GPU -#include "tensorflow/core/kernels/cross_op.h" #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/cross_op.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/ctc_decoder_ops.cc b/tensorflow/core/kernels/ctc_decoder_ops.cc index 73ee3106048..96bdb6a241b 100644 --- a/tensorflow/core/kernels/ctc_decoder_ops.cc +++ b/tensorflow/core/kernels/ctc_decoder_ops.cc @@ -19,13 +19,13 @@ limitations under the License. #include -#include "tensorflow/core/util/ctc/ctc_beam_search.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/util/ctc/ctc_beam_search.h" #include "tensorflow/core/util/sparse/sparse_tensor.h" namespace tensorflow { @@ -80,16 +80,17 @@ class CTCDecodeHelper { if (!(batch_size == (*seq_len)->dim_size(0))) { return errors::FailedPrecondition( - "len(sequence_length) != batch_size. ", "len(sequence_length): ", - (*seq_len)->dim_size(0), " batch_size: ", batch_size); + "len(sequence_length) != batch_size. 
", + "len(sequence_length): ", (*seq_len)->dim_size(0), + " batch_size: ", batch_size); } auto seq_len_t = (*seq_len)->vec(); for (int b = 0; b < batch_size; ++b) { if (!(seq_len_t(b) <= max_time)) { - return errors::FailedPrecondition("sequence_length(", b, ") <= ", - max_time); + return errors::FailedPrecondition("sequence_length(", b, + ") <= ", max_time); } } diff --git a/tensorflow/core/kernels/ctc_loss_op.cc b/tensorflow/core/kernels/ctc_loss_op.cc index fb03adb7a53..b38d838bf1e 100644 --- a/tensorflow/core/kernels/ctc_loss_op.cc +++ b/tensorflow/core/kernels/ctc_loss_op.cc @@ -113,8 +113,8 @@ class CTCLossOp : public OpKernel { const int64 batch_indices = g.group()[0]; OP_REQUIRES(ctx, FastBoundsCheck(batch_indices, batch_size), errors::InvalidArgument("labels batch index must be between ", - 0, " and ", batch_size, " but saw: ", - batch_indices)); + 0, " and ", batch_size, + " but saw: ", batch_indices)); auto values = g.values(); std::vector* b_values = &labels_t[batch_indices]; diff --git a/tensorflow/core/kernels/cwise_op_abs.cc b/tensorflow/core/kernels/cwise_op_abs.cc index 5fd38d9dc25..1466f24202f 100644 --- a/tensorflow/core/kernels/cwise_op_abs.cc +++ b/tensorflow/core/kernels/cwise_op_abs.cc @@ -45,5 +45,5 @@ REGISTER_KERNEL_BUILDER(Name("Abs") .HostMemory("y") .TypeConstraint("T"), UnaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_acos.cc b/tensorflow/core/kernels/cwise_op_acos.cc index 12cc6c8bdd4..49191226074 100644 --- a/tensorflow/core/kernels/cwise_op_acos.cc +++ b/tensorflow/core/kernels/cwise_op_acos.cc @@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Acos", functor::acos, float, double); #if TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Acos", functor::acos, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_acosh.cc b/tensorflow/core/kernels/cwise_op_acosh.cc index 39c88140733..c2b355ab7f4 100644 --- a/tensorflow/core/kernels/cwise_op_acosh.cc +++ b/tensorflow/core/kernels/cwise_op_acosh.cc @@ -17,12 +17,12 @@ limitations under the License. 
#include "tensorflow/core/kernels/cwise_ops_gradients.h" namespace tensorflow { -REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double, - complex64, complex128); +REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double, complex64, + complex128); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Acosh", functor::acosh, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA REGISTER2(UnaryOp, GPU, "Acosh", functor::acosh, float, double); diff --git a/tensorflow/core/kernels/cwise_op_add_1.cc b/tensorflow/core/kernels/cwise_op_add_1.cc index 608a6dce3d2..bf32c8a54b3 100644 --- a/tensorflow/core/kernels/cwise_op_add_1.cc +++ b/tensorflow/core/kernels/cwise_op_add_1.cc @@ -44,7 +44,6 @@ REGISTER_KERNEL_BUILDER(Name("AddV2") BinaryOp>); #endif - #if TENSORFLOW_USE_SYCL #define REGISTER_KERNEL(type) \ REGISTER(BinaryOp, SYCL, "Add", functor::add, type); \ @@ -66,5 +65,5 @@ REGISTER_KERNEL_BUILDER(Name("AddV2") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_add_2.cc b/tensorflow/core/kernels/cwise_op_add_2.cc index ac21ca06c92..e8acbac2853 100644 --- a/tensorflow/core/kernels/cwise_op_add_2.cc +++ b/tensorflow/core/kernels/cwise_op_add_2.cc @@ -22,8 +22,8 @@ namespace tensorflow { // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__. #if !defined(__ANDROID_TYPES_SLIM__) -REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64, - uint8, complex128, string); +REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64, uint8, + complex128, string); // Notice: String is excluded to allow marking AddV2 is_commutative and // is_aggregate. REGISTER5(BinaryOp, CPU, "AddV2", functor::add, int8, int16, complex64, uint8, diff --git a/tensorflow/core/kernels/cwise_op_asin.cc b/tensorflow/core/kernels/cwise_op_asin.cc index c28e27d95ae..fe8dfea1173 100644 --- a/tensorflow/core/kernels/cwise_op_asin.cc +++ b/tensorflow/core/kernels/cwise_op_asin.cc @@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Asin", functor::asin, float, double); #if TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Asin", functor::asin, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_asinh.cc b/tensorflow/core/kernels/cwise_op_asinh.cc index 0aec6aac344..7cf0405f524 100644 --- a/tensorflow/core/kernels/cwise_op_asinh.cc +++ b/tensorflow/core/kernels/cwise_op_asinh.cc @@ -1,10 +1,10 @@ - /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 +http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -17,8 +17,8 @@ limitations under the License. 
#include "tensorflow/core/kernels/cwise_ops_gradients.h" namespace tensorflow { -REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double, - complex64, complex128); +REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double, complex64, + complex128); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Asinh", functor::asinh, float, double); diff --git a/tensorflow/core/kernels/cwise_op_atan.cc b/tensorflow/core/kernels/cwise_op_atan.cc index 7d73de48102..09f0448874f 100644 --- a/tensorflow/core/kernels/cwise_op_atan.cc +++ b/tensorflow/core/kernels/cwise_op_atan.cc @@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Atan", functor::atan, float, double); #if TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Atan", functor::atan, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_atanh.cc b/tensorflow/core/kernels/cwise_op_atanh.cc index 7b688db4c58..6170683fa64 100644 --- a/tensorflow/core/kernels/cwise_op_atanh.cc +++ b/tensorflow/core/kernels/cwise_op_atanh.cc @@ -17,8 +17,8 @@ limitations under the License. #include "tensorflow/core/kernels/cwise_ops_gradients.h" namespace tensorflow { -REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double, - complex64, complex128); +REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double, complex64, + complex128); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Atanh", functor::atanh, float, double); diff --git a/tensorflow/core/kernels/cwise_op_ceil.cc b/tensorflow/core/kernels/cwise_op_ceil.cc index 0111e9d5fd1..816eadc80eb 100644 --- a/tensorflow/core/kernels/cwise_op_ceil.cc +++ b/tensorflow/core/kernels/cwise_op_ceil.cc @@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "Ceil", functor::ceil, float, Eigen::half, double); #if TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Ceil", functor::ceil, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_cos.cc b/tensorflow/core/kernels/cwise_op_cos.cc index d4b3b0e3935..71ad0ff0dc2 100644 --- a/tensorflow/core/kernels/cwise_op_cos.cc +++ b/tensorflow/core/kernels/cwise_op_cos.cc @@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Cos", functor::cos, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Cos", functor::cos, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_cosh.cc b/tensorflow/core/kernels/cwise_op_cosh.cc index bca99a4f897..31b4bb3cadd 100644 --- a/tensorflow/core/kernels/cwise_op_cosh.cc +++ b/tensorflow/core/kernels/cwise_op_cosh.cc @@ -16,20 +16,18 @@ limitations under the License. 
#include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double, - complex64, complex128); +REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double, complex64, + complex128); #if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Cosh") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); +#define REGISTER_SYCL_KERNEL(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("Cosh").Device(DEVICE_SYCL).TypeConstraint("T"), \ + UnaryOp>); REGISTER_SYCL_KERNEL(float); REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA REGISTER2(UnaryOp, GPU, "Cosh", functor::cosh, float, double); diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc index d44c1bf473e..c71c756e446 100644 --- a/tensorflow/core/kernels/cwise_op_div.cc +++ b/tensorflow/core/kernels/cwise_op_div.cc @@ -54,5 +54,5 @@ REGISTER_KERNEL_BUILDER(Name("Div") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_exp.cc b/tensorflow/core/kernels/cwise_op_exp.cc index 66d7b7d22eb..8f4ac98016c 100644 --- a/tensorflow/core/kernels/cwise_op_exp.cc +++ b/tensorflow/core/kernels/cwise_op_exp.cc @@ -26,5 +26,5 @@ REGISTER5(UnaryOp, GPU, "Exp", functor::exp, float, Eigen::half, double, #if TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Exp", functor::exp, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_expm1.cc b/tensorflow/core/kernels/cwise_op_expm1.cc index 4f723080060..ce03ad5de62 100644 --- a/tensorflow/core/kernels/cwise_op_expm1.cc +++ b/tensorflow/core/kernels/cwise_op_expm1.cc @@ -23,5 +23,5 @@ REGISTER3(UnaryOp, GPU, "Expm1", functor::expm1, float, Eigen::half, double); #endif #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Expm1", functor::expm1, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_floor.cc b/tensorflow/core/kernels/cwise_op_floor.cc index 5a142b9ce9f..d554d41c412 100644 --- a/tensorflow/core/kernels/cwise_op_floor.cc +++ b/tensorflow/core/kernels/cwise_op_floor.cc @@ -23,5 +23,5 @@ REGISTER3(UnaryOp, GPU, "Floor", functor::floor, float, Eigen::half, double); #endif #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Floor", functor::floor, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_floor_div.cc b/tensorflow/core/kernels/cwise_op_floor_div.cc index fa81ef0872d..fecbf859897 100644 --- a/tensorflow/core/kernels/cwise_op_floor_div.cc +++ b/tensorflow/core/kernels/cwise_op_floor_div.cc @@ -49,5 +49,5 @@ REGISTER_KERNEL_BUILDER(Name("FloorDiv") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_floor_mod.cc b/tensorflow/core/kernels/cwise_op_floor_mod.cc index 55f8a30461f..29340b88506 100644 --- a/tensorflow/core/kernels/cwise_op_floor_mod.cc +++ b/tensorflow/core/kernels/cwise_op_floor_mod.cc @@ -40,5 +40,5 @@ REGISTER_KERNEL_BUILDER(Name("FloorMod") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // 
TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc index e7dff5d0ac5..77723b3169f 100644 --- a/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc +++ b/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc @@ -19,8 +19,8 @@ limitations under the License. namespace tensorflow { namespace functor { - DEFINE_UNARY1(conj, complex64); - DEFINE_UNARY1(conj, complex128); +DEFINE_UNARY1(conj, complex64); +DEFINE_UNARY1(conj, complex128); } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc index 3675398126f..26748ef0e72 100644 --- a/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc +++ b/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc @@ -20,7 +20,7 @@ limitations under the License. namespace tensorflow { namespace functor { DEFINE_BINARY10(equal_to, float, Eigen::half, double, uint8, int8, int16, int64, - complex64, complex128, bool); + complex64, complex128, bool); DEFINE_APPROXIMATE_EQUAL2(float, double); } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc index a54dbdfc247..627ecc8c802 100644 --- a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc +++ b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc @@ -15,8 +15,10 @@ limitations under the License. #if GOOGLE_CUDA -#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" +#define EIGEN_USE_GPU + #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" namespace tensorflow { namespace functor { @@ -38,19 +40,17 @@ struct SelectScalarFunctor { typename TTypes::ConstScalar cond, typename TTypes::ConstFlat then_flat, typename TTypes::ConstFlat else_flat) { - #if !defined(EIGEN_HAS_INDEX_LIST) - Eigen::array rank1{1}; + Eigen::array rank1{1}; #else - Eigen::IndexList> rank1; + Eigen::IndexList > rank1; #endif - const int size = then_flat.dimension(0); - Eigen::array broadcast_dims{size}; - - To32Bit(out).device(d) = cond.reshape(rank1) - .broadcast(broadcast_dims) - .select(then_flat, else_flat); + const int size = then_flat.dimension(0); + Eigen::array broadcast_dims{size}; + To32Bit(out).device(d) = cond.reshape(rank1) + .broadcast(broadcast_dims) + .select(then_flat, else_flat); } }; @@ -89,8 +89,8 @@ struct BatchSelectFunctor { } }; -#define SELECT_FUNCTOR(T) \ - template struct SelectFunctor; \ +#define SELECT_FUNCTOR(T) \ + template struct SelectFunctor; \ template struct SelectScalarFunctor; \ template struct BatchSelectFunctor; diff --git a/tensorflow/core/kernels/cwise_op_greater.cc b/tensorflow/core/kernels/cwise_op_greater.cc index ba89899fb32..a4ea4088369 100644 --- a/tensorflow/core/kernels/cwise_op_greater.cc +++ b/tensorflow/core/kernels/cwise_op_greater.cc @@ -43,5 +43,5 @@ REGISTER_KERNEL_BUILDER(Name("Greater") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_greater_equal.cc b/tensorflow/core/kernels/cwise_op_greater_equal.cc index 8f0c483aecd..3f34d6269ef 100644 --- a/tensorflow/core/kernels/cwise_op_greater_equal.cc +++ b/tensorflow/core/kernels/cwise_op_greater_equal.cc @@ -35,7 +35,8 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual") #endif #ifdef TENSORFLOW_USE_SYCL 
-REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float, double); +REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float, + double); REGISTER_KERNEL_BUILDER(Name("GreaterEqual") .Device(DEVICE_SYCL) @@ -44,5 +45,5 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_invert.cc b/tensorflow/core/kernels/cwise_op_invert.cc index df2c02e42e1..f5cafcc7809 100644 --- a/tensorflow/core/kernels/cwise_op_invert.cc +++ b/tensorflow/core/kernels/cwise_op_invert.cc @@ -21,7 +21,7 @@ REGISTER6(UnaryOp, CPU, "Invert", functor::invert, int8, int16, int32, int64, #ifdef TENSORFLOW_USE_SYCL REGISTER6(UnaryOp, SYCL, "Invert", functor::invert, int8, int16, int32, int64, - uint8, uint16); + uint8, uint16); #endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_isfinite.cc b/tensorflow/core/kernels/cwise_op_isfinite.cc index 53ec1c1c63f..ae1e590d242 100644 --- a/tensorflow/core/kernels/cwise_op_isfinite.cc +++ b/tensorflow/core/kernels/cwise_op_isfinite.cc @@ -26,5 +26,5 @@ REGISTER3(UnaryOp, GPU, "IsFinite", functor::isfinite, float, Eigen::half, #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "IsFinite", functor::isfinite, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_isinf.cc b/tensorflow/core/kernels/cwise_op_isinf.cc index 4b34744304f..f22ca21e1ca 100644 --- a/tensorflow/core/kernels/cwise_op_isinf.cc +++ b/tensorflow/core/kernels/cwise_op_isinf.cc @@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "IsInf", functor::isinf, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "IsInf", functor::isinf, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_isnan.cc b/tensorflow/core/kernels/cwise_op_isnan.cc index ad2dd3f722c..aa180c247e7 100644 --- a/tensorflow/core/kernels/cwise_op_isnan.cc +++ b/tensorflow/core/kernels/cwise_op_isnan.cc @@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "IsNan", functor::isnan, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "IsNan", functor::isnan, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_less.cc b/tensorflow/core/kernels/cwise_op_less.cc index 136c3666dfc..00cdecdbd18 100644 --- a/tensorflow/core/kernels/cwise_op_less.cc +++ b/tensorflow/core/kernels/cwise_op_less.cc @@ -42,5 +42,5 @@ REGISTER_KERNEL_BUILDER(Name("Less") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_less_equal.cc b/tensorflow/core/kernels/cwise_op_less_equal.cc index 97a2508d129..11806c5fc77 100644 --- a/tensorflow/core/kernels/cwise_op_less_equal.cc +++ b/tensorflow/core/kernels/cwise_op_less_equal.cc @@ -44,5 +44,5 @@ REGISTER_KERNEL_BUILDER(Name("LessEqual") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_log.cc b/tensorflow/core/kernels/cwise_op_log.cc index 7fdfdff0e38..98936e0f960 100644 --- 
a/tensorflow/core/kernels/cwise_op_log.cc +++ b/tensorflow/core/kernels/cwise_op_log.cc @@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Log", functor::log, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Log", functor::log, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_log1p.cc b/tensorflow/core/kernels/cwise_op_log1p.cc index 25ad7b24bb1..162ca9e07cd 100644 --- a/tensorflow/core/kernels/cwise_op_log1p.cc +++ b/tensorflow/core/kernels/cwise_op_log1p.cc @@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Log1p", functor::log1p, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Log1p", functor::log1p, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_maximum.cc b/tensorflow/core/kernels/cwise_op_maximum.cc index 87d54e380b4..8c54f22f108 100644 --- a/tensorflow/core/kernels/cwise_op_maximum.cc +++ b/tensorflow/core/kernels/cwise_op_maximum.cc @@ -43,5 +43,5 @@ REGISTER_KERNEL_BUILDER(Name("Maximum") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_minimum.cc b/tensorflow/core/kernels/cwise_op_minimum.cc index 442171193bf..dff83df828f 100644 --- a/tensorflow/core/kernels/cwise_op_minimum.cc +++ b/tensorflow/core/kernels/cwise_op_minimum.cc @@ -43,6 +43,6 @@ REGISTER_KERNEL_BUILDER(Name("Minimum") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_mul_1.cc b/tensorflow/core/kernels/cwise_op_mul_1.cc index 023eb07ca3f..0e8d2e37350 100644 --- a/tensorflow/core/kernels/cwise_op_mul_1.cc +++ b/tensorflow/core/kernels/cwise_op_mul_1.cc @@ -17,8 +17,8 @@ limitations under the License. namespace tensorflow { -REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, - uint8, int32); +REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, uint8, + int32); #if defined(__ANDROID_TYPES_SLIM__) // We only register the first type when we have multi-argument calls in the // case where we're trying to reduce executable size, but it turns out that the @@ -28,7 +28,7 @@ REGISTER(BinaryOp, CPU, "Mul", functor::mul, int32); #if GOOGLE_CUDA REGISTER4(BinaryOp, GPU, "Mul", functor::mul, float, Eigen::half, double, - uint8); + uint8); // A special GPU kernel for int32. // TODO(b/25387198): Also enable int32 in device memory. This kernel // registration requires all int32 inputs and outputs to be in host memory. @@ -50,5 +50,5 @@ REGISTER_KERNEL_BUILDER(Name("Mul") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_mul_2.cc b/tensorflow/core/kernels/cwise_op_mul_2.cc index 7be5857cc06..6aa8f883640 100644 --- a/tensorflow/core/kernels/cwise_op_mul_2.cc +++ b/tensorflow/core/kernels/cwise_op_mul_2.cc @@ -22,11 +22,11 @@ namespace tensorflow { // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__. 
#if !defined(__ANDROID_TYPES_SLIM__) -REGISTER6(BinaryOp, CPU, "Mul", functor::mul, - int8, uint16, int16, int64, complex64, complex128); +REGISTER6(BinaryOp, CPU, "Mul", functor::mul, int8, uint16, int16, int64, + complex64, complex128); #if GOOGLE_CUDA REGISTER6(BinaryOp, GPU, "Mul", functor::mul, int8, uint16, int16, int64, - complex64, complex128); + complex64, complex128); #endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_neg.cc b/tensorflow/core/kernels/cwise_op_neg.cc index 536891b548f..a136769b912 100644 --- a/tensorflow/core/kernels/cwise_op_neg.cc +++ b/tensorflow/core/kernels/cwise_op_neg.cc @@ -27,7 +27,7 @@ REGISTER_KERNEL_BUILDER(Name("Neg") .HostMemory("y") .TypeConstraint("T"), UnaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA REGISTER6(UnaryOp, GPU, "Neg", functor::neg, float, Eigen::half, double, int64, diff --git a/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc b/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc index 7bd81ee1271..02cd2987457 100644 --- a/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc +++ b/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc @@ -17,7 +17,7 @@ limitations under the License. namespace tensorflow { REGISTER6(BinaryOp, CPU, "NotEqual", functor::not_equal_to, float, Eigen::half, - double, uint8, int8, int16); + double, uint8, int8, int16); #if GOOGLE_CUDA REGISTER4(BinaryOp, GPU, "NotEqual", functor::not_equal_to, float, Eigen::half, double, uint8); diff --git a/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc b/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc index 7d4ecec59f1..05bdea66367 100644 --- a/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc +++ b/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc @@ -30,5 +30,5 @@ REGISTER6(BinaryOp, GPU, "NotEqual", functor::not_equal_to, int8, int16, int64, #endif // GOOGLE_CUDA -#endif // !defined(__ANDROID_TYPES_SLIM__) +#endif // !defined(__ANDROID_TYPES_SLIM__) } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_reciprocal.cc b/tensorflow/core/kernels/cwise_op_reciprocal.cc index 8c0e21f9cf3..aee25747b86 100644 --- a/tensorflow/core/kernels/cwise_op_reciprocal.cc +++ b/tensorflow/core/kernels/cwise_op_reciprocal.cc @@ -38,7 +38,7 @@ REGISTER4(UnaryOp, GPU, "Reciprocal", functor::inverse, float, Eigen::half, #endif #ifdef TENSORFLOW_USE_SYCL REGISTER(UnaryOp, SYCL, "Reciprocal", functor::inverse, float); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL REGISTER5(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float, Eigen::half, double, complex64, complex128); @@ -48,5 +48,5 @@ REGISTER3(SimpleBinaryOp, GPU, "ReciprocalGrad", functor::inverse_grad, float, #endif #ifdef TENSORFLOW_USE_SYCL REGISTER(SimpleBinaryOp, SYCL, "ReciprocalGrad", functor::inverse_grad, float); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc index 3dd9de8d897..e259daaba47 100644 --- a/tensorflow/core/kernels/cwise_op_select.cc +++ b/tensorflow/core/kernels/cwise_op_select.cc @@ -30,7 +30,7 @@ typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template class SelectOp : public OpKernel { @@ -185,7 +185,7 @@ REGISTER_SELECT_SYCL(double); REGISTER_SELECT_SYCL(int32); REGISTER_SELECT_SYCL(int64); #undef REGISTER_SELECT_SYCL -#endif // TENSORFLOW_USE_SYCL +#endif 
// TENSORFLOW_USE_SYCL namespace functor { @@ -201,13 +201,11 @@ struct SelectFunctorBase { }; template -struct SelectFunctor - : SelectFunctorBase {}; +struct SelectFunctor : SelectFunctorBase {}; #ifdef TENSORFLOW_USE_SYCL template -struct SelectFunctor - : SelectFunctorBase {}; -#endif // TENSORFLOW_USE_SYCL +struct SelectFunctor : SelectFunctorBase {}; +#endif // TENSORFLOW_USE_SYCL template struct SelectScalarFunctorBase { @@ -222,12 +220,12 @@ struct SelectScalarFunctorBase { // CPU Specializations of Select functors with scalar template struct SelectScalarFunctor - : SelectScalarFunctorBase {}; + : SelectScalarFunctorBase {}; #ifdef TENSORFLOW_USE_SYCL template struct SelectScalarFunctor - : SelectScalarFunctorBase {}; -#endif // TENSORFLOW_USE_SYCL + : SelectScalarFunctorBase {}; +#endif // TENSORFLOW_USE_SYCL template struct BatchSelectFunctorBase { @@ -240,8 +238,8 @@ struct BatchSelectFunctorBase { const Eigen::DenseIndex all_but_batch = then_flat_outer_dims.dimension(1); #if !defined(EIGEN_HAS_INDEX_LIST) - Eigen::array broadcast_dims{{ 1, all_but_batch }}; - Eigen::Tensor::Dimensions reshape_dims{{ batch, 1 }}; + Eigen::array broadcast_dims{{1, all_but_batch}}; + Eigen::Tensor::Dimensions reshape_dims{{batch, 1}}; #else Eigen::IndexList, Eigen::DenseIndex> broadcast_dims; broadcast_dims.set(1, all_but_batch); @@ -257,13 +255,13 @@ struct BatchSelectFunctorBase { }; template -struct BatchSelectFunctor - : BatchSelectFunctorBase {}; +struct BatchSelectFunctor : BatchSelectFunctorBase { +}; #ifdef TENSORFLOW_USE_SYCL template struct BatchSelectFunctor - : BatchSelectFunctorBase {}; -#endif // TENSORFLOW_USE_SYCL + : BatchSelectFunctorBase {}; +#endif // TENSORFLOW_USE_SYCL } // namespace functor diff --git a/tensorflow/core/kernels/cwise_op_sigmoid.cc b/tensorflow/core/kernels/cwise_op_sigmoid.cc index a76a088ac8f..c132fdb63f2 100644 --- a/tensorflow/core/kernels/cwise_op_sigmoid.cc +++ b/tensorflow/core/kernels/cwise_op_sigmoid.cc @@ -25,7 +25,7 @@ REGISTER3(UnaryOp, GPU, "Sigmoid", functor::sigmoid, float, Eigen::half, #endif #ifdef TENSORFLOW_USE_SYCL REGISTER(UnaryOp, SYCL, "Sigmoid", functor::sigmoid, float); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL REGISTER5(SimpleBinaryOp, CPU, "SigmoidGrad", functor::sigmoid_grad, float, Eigen::half, double, complex64, complex128); @@ -35,6 +35,6 @@ REGISTER3(SimpleBinaryOp, GPU, "SigmoidGrad", functor::sigmoid_grad, float, #endif #ifdef TENSORFLOW_USE_SYCL REGISTER(SimpleBinaryOp, SYCL, "SigmoidGrad", functor::sigmoid_grad, float); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_sign.cc b/tensorflow/core/kernels/cwise_op_sign.cc index a4084d5ad17..02915ff4ce4 100644 --- a/tensorflow/core/kernels/cwise_op_sign.cc +++ b/tensorflow/core/kernels/cwise_op_sign.cc @@ -41,6 +41,6 @@ REGISTER_KERNEL_BUILDER(Name("Sign") .HostMemory("y") .TypeConstraint("T"), UnaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_sin.cc b/tensorflow/core/kernels/cwise_op_sin.cc index b91ff1ac30b..16c60578640 100644 --- a/tensorflow/core/kernels/cwise_op_sin.cc +++ b/tensorflow/core/kernels/cwise_op_sin.cc @@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Sin", functor::sin, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Sin", functor::sin, float, double); -#endif // TENSORFLOW_USE_SYC +#endif // TENSORFLOW_USE_SYC } // namespace tensorflow 
diff --git a/tensorflow/core/kernels/cwise_op_sinh.cc b/tensorflow/core/kernels/cwise_op_sinh.cc index 055f0b12e14..26b7a940aa8 100644 --- a/tensorflow/core/kernels/cwise_op_sinh.cc +++ b/tensorflow/core/kernels/cwise_op_sinh.cc @@ -16,20 +16,18 @@ limitations under the License. #include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double, - complex64, complex128); +REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double, complex64, + complex128); #if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Sinh") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); +#define REGISTER_SYCL_KERNEL(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("Sinh").Device(DEVICE_SYCL).TypeConstraint("T"), \ + UnaryOp>); REGISTER_SYCL_KERNEL(float); REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYC +#endif // TENSORFLOW_USE_SYC #if GOOGLE_CUDA REGISTER2(UnaryOp, GPU, "Sinh", functor::sinh, float, double); diff --git a/tensorflow/core/kernels/cwise_op_sqrt.cc b/tensorflow/core/kernels/cwise_op_sqrt.cc index 00efbb00f15..497756133d0 100644 --- a/tensorflow/core/kernels/cwise_op_sqrt.cc +++ b/tensorflow/core/kernels/cwise_op_sqrt.cc @@ -25,7 +25,7 @@ REGISTER3(UnaryOp, GPU, "Sqrt", functor::sqrt, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Sqrt", functor::sqrt, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL REGISTER5(SimpleBinaryOp, CPU, "SqrtGrad", functor::sqrt_grad, float, Eigen::half, double, complex64, complex128); @@ -36,5 +36,5 @@ REGISTER3(SimpleBinaryOp, GPU, "SqrtGrad", functor::sqrt_grad, float, #ifdef TENSORFLOW_USE_SYCL REGISTER2(SimpleBinaryOp, SYCL, "SqrtGrad", functor::sqrt_grad, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_square.cc b/tensorflow/core/kernels/cwise_op_square.cc index 07a4b0b084d..7fc2f6bf08b 100644 --- a/tensorflow/core/kernels/cwise_op_square.cc +++ b/tensorflow/core/kernels/cwise_op_square.cc @@ -42,5 +42,5 @@ REGISTER_KERNEL_BUILDER(Name("Square") .HostMemory("y") .TypeConstraint("T"), UnaryOp>); -#endif // TENSORFLOW_USE_SYC +#endif // TENSORFLOW_USE_SYC } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_sub.cc b/tensorflow/core/kernels/cwise_op_sub.cc index 6adaecba04b..025041946ac 100644 --- a/tensorflow/core/kernels/cwise_op_sub.cc +++ b/tensorflow/core/kernels/cwise_op_sub.cc @@ -53,5 +53,5 @@ REGISTER_KERNEL_BUILDER(Name("Sub") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_tan.cc b/tensorflow/core/kernels/cwise_op_tan.cc index 7891b1183dd..c1a25767d31 100644 --- a/tensorflow/core/kernels/cwise_op_tan.cc +++ b/tensorflow/core/kernels/cwise_op_tan.cc @@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Tan", functor::tan, float, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Tan", functor::tan, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_tanh.cc b/tensorflow/core/kernels/cwise_op_tanh.cc index 8b3900892c3..c5005f5ea8a 100644 --- a/tensorflow/core/kernels/cwise_op_tanh.cc +++ b/tensorflow/core/kernels/cwise_op_tanh.cc @@ -26,7 +26,7 @@ REGISTER3(UnaryOp, GPU, "Tanh", 
functor::tanh, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Tanh", functor::tanh, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL REGISTER5(SimpleBinaryOp, CPU, "TanhGrad", functor::tanh_grad, float, Eigen::half, double, complex64, complex128); diff --git a/tensorflow/core/kernels/cwise_ops_common.cc b/tensorflow/core/kernels/cwise_ops_common.cc index e561e59cf5a..980edffceb3 100644 --- a/tensorflow/core/kernels/cwise_ops_common.cc +++ b/tensorflow/core/kernels/cwise_ops_common.cc @@ -57,9 +57,9 @@ BinaryOpShared::BinaryOpState::BinaryOpState(OpKernelContext* ctx) in1(ctx->input(1)), bcast(BCast::FromShape(in0.shape()), BCast::FromShape(in1.shape())) { if (!bcast.IsValid()) { - ctx->SetStatus(errors::InvalidArgument("Incompatible shapes: ", - in0.shape().DebugString(), " vs. ", - in1.shape().DebugString())); + ctx->SetStatus(errors::InvalidArgument( + "Incompatible shapes: ", in0.shape().DebugString(), " vs. ", + in1.shape().DebugString())); return; } const TensorShape output_shape = BCast::ToShape(bcast.output_shape()); diff --git a/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h b/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h index 43947707089..e81b840a509 100644 --- a/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h +++ b/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h @@ -50,16 +50,16 @@ struct SimpleBinaryFunctor { // Macros to explicitly instantiate kernels on GPU for multiple types // (T0, T1, etc.) for SimpleBiaryFunctor (e.g., functor::tanh_grad). -#define DEFINE_SIMPLE_BINARY1(F, T) \ +#define DEFINE_SIMPLE_BINARY1(F, T) \ template struct SimpleBinaryFunctor > -#define DEFINE_SIMPLE_BINARY2(F, T0, T1) \ - DEFINE_SIMPLE_BINARY1(F, T0); \ +#define DEFINE_SIMPLE_BINARY2(F, T0, T1) \ + DEFINE_SIMPLE_BINARY1(F, T0); \ DEFINE_SIMPLE_BINARY1(F, T1) -#define DEFINE_SIMPLE_BINARY3(F, T0, T1, T2) \ - DEFINE_SIMPLE_BINARY2(F, T0, T1); \ +#define DEFINE_SIMPLE_BINARY3(F, T0, T1, T2) \ + DEFINE_SIMPLE_BINARY2(F, T0, T1); \ DEFINE_SIMPLE_BINARY1(F, T2) -#define DEFINE_SIMPLE_BINARY4(F, T0, T1, T2, T3) \ - DEFINE_SIMPLE_BINARY2(F, T0, T1); \ +#define DEFINE_SIMPLE_BINARY4(F, T0, T1, T2, T3) \ + DEFINE_SIMPLE_BINARY2(F, T0, T1); \ DEFINE_SIMPLE_BINARY2(F, T2, T3) #define DEFINE_SIMPLE_BINARY5(F, T0, T1, T2, T3, T4) \ DEFINE_SIMPLE_BINARY2(F, T0, T1); \ diff --git a/tensorflow/core/kernels/cwise_ops_gradients.h b/tensorflow/core/kernels/cwise_ops_gradients.h index 77b330f5899..82cdae9a348 100644 --- a/tensorflow/core/kernels/cwise_ops_gradients.h +++ b/tensorflow/core/kernels/cwise_ops_gradients.h @@ -171,7 +171,6 @@ struct SimpleBinaryFunctor { } }; - #ifdef TENSORFLOW_USE_SYCL // Partial specialization of BinaryFunctor for SYCL devices typedef Eigen::SyclDevice SYCLDevice; @@ -184,7 +183,7 @@ struct SimpleBinaryFunctor { } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template struct tanh_grad : base> {}; diff --git a/tensorflow/core/kernels/cwise_ops_sycl_common.h b/tensorflow/core/kernels/cwise_ops_sycl_common.h index 3f6ff7303d6..3e107cee04c 100644 --- a/tensorflow/core/kernels/cwise_ops_sycl_common.h +++ b/tensorflow/core/kernels/cwise_ops_sycl_common.h @@ -51,7 +51,8 @@ struct BinaryFunctor { void operator()(const SYCLDevice& d, typename Functor::tout_type out, typename Functor::tin_type in0, typename Functor::tin_type in1, bool* error) { - To32Bit(out).device(d) = To32Bit(in0).binaryExpr(To32Bit(in1), typename Functor::func()); + To32Bit(out).device(d) = + 
To32Bit(in0).binaryExpr(To32Bit(in1), typename Functor::func()); } void Left(const SYCLDevice& d, typename Functor::tout_type out, @@ -61,7 +62,9 @@ struct BinaryFunctor { constexpr int NumDims = Functor::tin_type::NumDimensions; static_assert(NumDims == 1, "Unexpected size"); Eigen::Sizes<1> scalar_dim; - out.device(d) = scalar.reshape(scalar_dim).broadcast(in.dimensions()).binaryExpr(in, Binary()); + out.device(d) = scalar.reshape(scalar_dim) + .broadcast(in.dimensions()) + .binaryExpr(in, Binary()); } void Right(const SYCLDevice& d, typename Functor::tout_type out, @@ -71,7 +74,8 @@ struct BinaryFunctor { constexpr int NumDims = Functor::tin_type::NumDimensions; static_assert(NumDims == 1, "Unexpected size"); Eigen::Sizes<1> scalar_dim; - out.device(d) = in.binaryExpr(scalar.reshape(scalar_dim).broadcast(in.dimensions()), Binary()); + out.device(d) = in.binaryExpr( + scalar.reshape(scalar_dim).broadcast(in.dimensions()), Binary()); } void BCast(const SYCLDevice& d, diff --git a/tensorflow/core/kernels/cwise_ops_test.cc b/tensorflow/core/kernels/cwise_ops_test.cc index bca0f1004d5..39f497e7161 100644 --- a/tensorflow/core/kernels/cwise_ops_test.cc +++ b/tensorflow/core/kernels/cwise_ops_test.cc @@ -54,36 +54,36 @@ int ColsFromArg(int arg) { return (arg % kRows); } BM_UNARY(cpu, Floor, float, DT_FLOAT); #if GOOGLE_CUDA BM_UNARY(gpu, Floor, float, DT_FLOAT); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL BM_UNARY(sycl, Floor, float, DT_FLOAT); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL BM_UNARY(cpu, Floor, double, DT_DOUBLE); #if GOOGLE_CUDA BM_UNARY(gpu, Floor, double, DT_DOUBLE); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL BM_UNARY(sycl, Floor, double, DT_DOUBLE); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL BM_UNARY(cpu, Conj, std::complex, DT_COMPLEX64); #if GOOGLE_CUDA BM_UNARY(gpu, Conj, std::complex, DT_COMPLEX64); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA BM_UNARY(cpu, Conj, std::complex, DT_COMPLEX128); #if GOOGLE_CUDA BM_UNARY(gpu, Conj, std::complex, DT_COMPLEX128); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA BM_UNARY(cpu, Rint, double, DT_DOUBLE); #if GOOGLE_CUDA BM_UNARY(gpu, Rint, double, DT_DOUBLE); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA BM_UNARY(cpu, Rint, float, DT_FLOAT); #if GOOGLE_CUDA BM_UNARY(gpu, Rint, float, DT_FLOAT); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA // data func scalar. 
Graph* BinaryScalar(int num, const string& func) { @@ -113,18 +113,18 @@ Graph* BinaryScalar(int num, const string& func) { BM_BINARY_SCALAR(cpu, Less); #if GOOGLE_CUDA BM_BINARY_SCALAR(gpu, Less); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL BM_BINARY_SCALAR(sycl, Less); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL BM_BINARY_SCALAR(cpu, Add); #if GOOGLE_CUDA BM_BINARY_SCALAR(gpu, Add); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL BM_BINARY_SCALAR(sycl, Add); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #undef BM_BINARY_SCALAR template @@ -163,11 +163,11 @@ using Eigen::half; BM_BIAS_ADD_ALL(cpu, float, DT_FLOAT); #if GOOGLE_CUDA BM_BIAS_ADD_ALL(gpu, float, DT_FLOAT); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA BM_BIAS_ADD_ALL(cpu, half, DT_HALF); #if GOOGLE_CUDA BM_BIAS_ADD_ALL(gpu, half, DT_HALF); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #undef BM_BIAS_ADD_ALL #undef BM_BIAS_ADD @@ -217,15 +217,15 @@ using Eigen::half; #if GOOGLE_CUDA BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, float, DT_FLOAT); BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, half, DT_HALF); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, float, DT_FLOAT); #if GOOGLE_CUDA BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, float, DT_FLOAT); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, half, DT_HALF); #if GOOGLE_CUDA BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, half, DT_HALF); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #undef BM_BIAS_ADD_GRAD_ALL #undef BM_BIAS_ADD_GRAD @@ -265,10 +265,10 @@ Graph* BcastAdd(int rows, int cols, int dim) { BM_BCAST_ADD_ROW_ALL(cpu); #if GOOGLE_CUDA BM_BCAST_ADD_ROW_ALL(gpu); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL BM_BCAST_ADD_ROW_ALL(sycl); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #undef BM_BCAST_ADD_ROW_ALL #undef BM_BCAST_ADD_ROW @@ -291,10 +291,10 @@ BM_BCAST_ADD_ROW_ALL(sycl); BM_BCAST_ADD_COL_ALL(cpu); #if GOOGLE_CUDA BM_BCAST_ADD_COL_ALL(gpu); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL BM_BCAST_ADD_COL_ALL(sycl); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #undef BM_BCAST_ADD_COL_ALL #undef BM_BCAST_ADD_COL diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc index 56044a3d41a..ca22f10a850 100644 --- a/tensorflow/core/kernels/data/iterator_ops.cc +++ b/tensorflow/core/kernels/data/iterator_ops.cc @@ -430,13 +430,10 @@ class IteratorStateVariant { REGISTER_UNARY_VARIANT_DECODE_FUNCTION(IteratorStateVariant, kIteratorVariantTypeName); -// TODO(mrry): Can we simply use the template kernel here? 
class IteratorHandleOp : public OpKernel { public: explicit IteratorHandleOp(OpKernelConstruction* ctx) : OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) { - OP_REQUIRES_OK(ctx, ctx->allocate_persistent(DT_STRING, TensorShape({2}), - &handle_, nullptr)); OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_)); @@ -460,56 +457,51 @@ class IteratorHandleOp : public OpKernel { } void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) { - mutex_lock l(mu_); - FunctionLibraryRuntime* lib = context->function_library(); - std::unique_ptr device_mgr(nullptr); - std::unique_ptr flib_def(nullptr); - std::unique_ptr pflr(nullptr); - // If the iterator is shared then we construct a new FLR, and pass that in. - // NOTE(mrry,rohanj): In this case it is not possible to call remote - // functions from the iterator. We may add this functionality if there - // is sufficient demand, but it will require a significant refactoring. - if (!name_.empty()) { - lib = CreateFLR(context, &device_mgr, &flib_def, &pflr); - } + { + mutex_lock l(mu_); + if (resource_ == nullptr) { + FunctionLibraryRuntime* lib = context->function_library(); + std::unique_ptr device_mgr(nullptr); + std::unique_ptr flib_def(nullptr); + std::unique_ptr pflr(nullptr); + // If the iterator is shared then we construct a new FLR, and pass that + // in. NOTE(mrry,rohanj): In this case it is not possible to call remote + // functions from the iterator. We may add this functionality if there + // is sufficient demand, but it will require a significant refactoring. + if (!name_.empty()) { + lib = CreatePrivateFLR(context, &device_mgr, &flib_def, &pflr); + } - if (resource_ == nullptr) { - ResourceMgr* mgr = context->resource_manager(); - OP_REQUIRES_OK(context, cinfo_.Init(mgr, def())); + ResourceMgr* mgr = context->resource_manager(); + OP_REQUIRES_OK(context, cinfo_.Init(mgr, def())); - IteratorResource* resource; - OP_REQUIRES_OK( - context, - mgr->LookupOrCreate( - cinfo_.container(), cinfo_.name(), &resource, - [lib, &device_mgr, &flib_def, &pflr, this](IteratorResource** ret) - EXCLUSIVE_LOCKS_REQUIRED(mu_) { - *ret = new IteratorResource( - output_dtypes_, output_shapes_, graph_def_version_, - std::move(device_mgr), std::move(flib_def), - std::move(pflr), lib); - return Status::OK(); - })); + IteratorResource* resource; + OP_REQUIRES_OK( + context, + mgr->LookupOrCreate( + cinfo_.container(), cinfo_.name(), &resource, + [lib, &device_mgr, &flib_def, &pflr, + this](IteratorResource** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + *ret = new IteratorResource( + output_dtypes_, output_shapes_, graph_def_version_, + std::move(device_mgr), std::move(flib_def), + std::move(pflr), lib); + return Status::OK(); + })); - Status s = VerifyResource(resource); - if (TF_PREDICT_FALSE(!s.ok())) { - resource->Unref(); - context->SetStatus(s); - return; + Status s = VerifyResource(resource); + if (TF_PREDICT_FALSE(!s.ok())) { + resource->Unref(); + context->SetStatus(s); + return; + } + + resource_ = resource; } - - auto h = handle_.AccessTensor(context)->template flat(); - h(0) = cinfo_.container(); - h(1) = cinfo_.name(); - resource_ = resource; - } - if (context->expected_output_dtype(0) == DT_RESOURCE) { - OP_REQUIRES_OK(context, MakeResourceHandleToOutput( - context, 0, cinfo_.container(), cinfo_.name(), - MakeTypeIndex())); - } else { - context->set_output_ref(0, &mu_, handle_.AccessTensor(context)); 
} + OP_REQUIRES_OK(context, MakeResourceHandleToOutput( + context, 0, cinfo_.container(), cinfo_.name(), + MakeTypeIndex())); } private: @@ -526,7 +518,7 @@ class IteratorHandleOp : public OpKernel { return Status::OK(); } - FunctionLibraryRuntime* CreateFLR( + FunctionLibraryRuntime* CreatePrivateFLR( OpKernelContext* ctx, std::unique_ptr* device_mgr, std::unique_ptr* flib_def, std::unique_ptr* pflr) { @@ -546,9 +538,8 @@ class IteratorHandleOp : public OpKernel { } mutex mu_; - ContainerInfo cinfo_ GUARDED_BY(mu_); + ContainerInfo cinfo_; // Written once under mu_ then constant afterwards. IteratorResource* resource_ GUARDED_BY(mu_) = nullptr; - PersistentTensor handle_ GUARDED_BY(mu_); DataTypeVector output_dtypes_; std::vector output_shapes_; const int graph_def_version_; diff --git a/tensorflow/core/kernels/debug_ops.cc b/tensorflow/core/kernels/debug_ops.cc index 965a60c7e05..1b94ea05440 100644 --- a/tensorflow/core/kernels/debug_ops.cc +++ b/tensorflow/core/kernels/debug_ops.cc @@ -46,7 +46,7 @@ REGISTER_KERNEL_BUILDER(Name("CopyHost") .HostMemory("input") .HostMemory("output"), CopyOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Register debug identity (non-ref and ref) ops. REGISTER_KERNEL_BUILDER(Name("DebugIdentity").Device(DEVICE_CPU), @@ -66,7 +66,7 @@ REGISTER_KERNEL_BUILDER(Name("DebugIdentity") .HostMemory("input") .HostMemory("output"), DebugIdentityOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Register debug NaN-counter (non-ref and ref) ops. #define REGISTER_DEBUG_NAN_COUNT(type) \ @@ -98,7 +98,7 @@ REGISTER_GPU_DEBUG_NAN_COUNT(double); DebugNanCountOp); REGISTER_GPU_DEBUG_NAN_COUNT(float); REGISTER_GPU_DEBUG_NAN_COUNT(double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Register debug numeric summary ops. #define REGISTER_DEBUG_NUMERIC_SUMMARY_COUNT(type) \ diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h index 381add3fb3b..53a23b13060 100644 --- a/tensorflow/core/kernels/debug_ops.h +++ b/tensorflow/core/kernels/debug_ops.h @@ -21,7 +21,7 @@ limitations under the License. #endif #ifdef TENSORFLOW_USE_SYCL #include "tensorflow/core/common_runtime/sycl/sycl_util.h" -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #include "tensorflow/core/debug/debug_io_utils.h" #include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/framework/op_kernel.h" @@ -91,7 +91,7 @@ class CopyOp : public OpKernel { Device* device = static_cast(context->device()); // Determine if the input tensor is not on CPU (e.g., on GPU). 
const bool off_host_input = device->device_type() == DEVICE_SYCL && - !context->input_alloc_attr(0).on_host(); + !context->input_alloc_attr(0).on_host(); if (off_host_input) { SYCLmemcpy(context->eigen_sycl_device(), src_tensor, copied_tensor); diff --git a/tensorflow/core/kernels/decode_csv_op.cc b/tensorflow/core/kernels/decode_csv_op.cc index c4555db453b..0c42f632521 100644 --- a/tensorflow/core/kernels/decode_csv_op.cc +++ b/tensorflow/core/kernels/decode_csv_op.cc @@ -91,9 +91,9 @@ class DecodeCSVOp : public OpKernel { } else { int32 value; OP_REQUIRES(ctx, strings::safe_strto32(fields[f], &value), - errors::InvalidArgument("Field ", f, " in record ", i, - " is not a valid int32: ", - fields[f])); + errors::InvalidArgument( + "Field ", f, " in record ", i, + " is not a valid int32: ", fields[f])); output[f]->flat()(i) = value; } break; @@ -111,9 +111,9 @@ class DecodeCSVOp : public OpKernel { } else { int64 value; OP_REQUIRES(ctx, strings::safe_strto64(fields[f], &value), - errors::InvalidArgument("Field ", f, " in record ", i, - " is not a valid int64: ", - fields[f])); + errors::InvalidArgument( + "Field ", f, " in record ", i, + " is not a valid int64: ", fields[f])); output[f]->flat()(i) = value; } break; @@ -130,9 +130,9 @@ class DecodeCSVOp : public OpKernel { } else { float value; OP_REQUIRES(ctx, strings::safe_strtof(fields[f].c_str(), &value), - errors::InvalidArgument("Field ", f, " in record ", i, - " is not a valid float: ", - fields[f])); + errors::InvalidArgument( + "Field ", f, " in record ", i, + " is not a valid float: ", fields[f])); output[f]->flat()(i) = value; } break; @@ -150,9 +150,9 @@ class DecodeCSVOp : public OpKernel { } else { double value; OP_REQUIRES(ctx, strings::safe_strtod(fields[f].c_str(), &value), - errors::InvalidArgument("Field ", f, " in record ", i, - " is not a valid double: ", - fields[f])); + errors::InvalidArgument( + "Field ", f, " in record ", i, + " is not a valid double: ", fields[f])); output[f]->flat()(i) = value; } break; @@ -208,9 +208,10 @@ class DecodeCSVOp : public OpKernel { if (!quoted) { while (static_cast(current_idx) < input.size() && input[current_idx] != delim_) { - OP_REQUIRES(ctx, (!use_quote_delim_ || input[current_idx] != '"') && - input[current_idx] != '\n' && - input[current_idx] != '\r', + OP_REQUIRES(ctx, + (!use_quote_delim_ || input[current_idx] != '"') && + input[current_idx] != '\n' && + input[current_idx] != '\r', errors::InvalidArgument( "Unquoted fields cannot have quotes/CRLFs inside")); field += input[current_idx]; @@ -238,10 +239,11 @@ class DecodeCSVOp : public OpKernel { } OP_REQUIRES( - ctx, (static_cast(current_idx) < input.size() && - input[current_idx] == '"' && - (static_cast(current_idx) == input.size() - 1 || - input[current_idx + 1] == delim_)), + ctx, + (static_cast(current_idx) < input.size() && + input[current_idx] == '"' && + (static_cast(current_idx) == input.size() - 1 || + input[current_idx + 1] == delim_)), errors::InvalidArgument("Quoted field has to end with quote " "followed by delim or end")); diff --git a/tensorflow/core/kernels/decode_image_op.cc b/tensorflow/core/kernels/decode_image_op.cc index 44dcbf834ce..912d04c1536 100644 --- a/tensorflow/core/kernels/decode_image_op.cc +++ b/tensorflow/core/kernels/decode_image_op.cc @@ -87,10 +87,11 @@ class DecodeImageOp : public OpKernel { channels_ = 3; } else { OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_)); - OP_REQUIRES(context, channels_ == 0 || channels_ == 1 || channels_ == 3 || - channels_ == 4, - 
errors::InvalidArgument( - "channels must be 0, 1, 3, or 4, got ", channels_)); + OP_REQUIRES( + context, + channels_ == 0 || channels_ == 1 || channels_ == 3 || channels_ == 4, + errors::InvalidArgument("channels must be 0, 1, 3, or 4, got ", + channels_)); } flags_.components = channels_; @@ -114,8 +115,9 @@ class DecodeImageOp : public OpKernel { if (format_ == kJpgFormat) { OP_REQUIRES_OK(context, context->GetAttr("ratio", &flags_.ratio)); - OP_REQUIRES(context, flags_.ratio == 1 || flags_.ratio == 2 || - flags_.ratio == 4 || flags_.ratio == 8, + OP_REQUIRES(context, + flags_.ratio == 1 || flags_.ratio == 2 || flags_.ratio == 4 || + flags_.ratio == 8, errors::InvalidArgument("ratio must be 1, 2, 4, or 8, got ", flags_.ratio)); OP_REQUIRES_OK(context, context->GetAttr("fancy_upscaling", @@ -130,8 +132,9 @@ class DecodeImageOp : public OpKernel { string dct_method; OP_REQUIRES_OK(context, context->GetAttr("dct_method", &dct_method)); OP_REQUIRES( - context, (dct_method.empty() || dct_method == "INTEGER_FAST" || - dct_method == "INTEGER_ACCURATE"), + context, + (dct_method.empty() || dct_method == "INTEGER_FAST" || + dct_method == "INTEGER_ACCURATE"), errors::InvalidArgument("dct_method must be one of " "{'', 'INTEGER_FAST', 'INTEGER_ACCURATE'}")); if (dct_method == "INTEGER_FAST") { @@ -157,9 +160,9 @@ class DecodeImageOp : public OpKernel { errors::InvalidArgument("Expected image (JPEG, PNG, or GIF), got ", FileFormatString(magic, input))); OP_REQUIRES(context, input.size() <= std::numeric_limits::max(), - errors::InvalidArgument(FileFormatString(magic, input), - " contents are too large for int: ", - input.size())); + errors::InvalidArgument( + FileFormatString(magic, input), + " contents are too large for int: ", input.size())); OP_REQUIRES(context, magic == kPngFormat || channel_bits_ == 8, errors::InvalidArgument(FileFormatString(magic, input), " does not support uint16 output")); @@ -212,9 +215,10 @@ class DecodeImageOp : public OpKernel { input.data(), input.size(), flags, nullptr /* nwarn */, [=, &output](int width, int height, int channels) -> uint8* { Status status(context->allocate_output( - 0, format_ == kGifFormat - ? TensorShape({1, height, width, channels}) - : TensorShape({height, width, channels}), + 0, + format_ == kGifFormat + ? 
TensorShape({1, height, width, channels}) + : TensorShape({height, width, channels}), &output)); if (!status.ok()) { VLOG(1) << status; diff --git a/tensorflow/core/kernels/deep_conv2d.cc b/tensorflow/core/kernels/deep_conv2d.cc index 8e9b8a7e2e7..829155fb313 100644 --- a/tensorflow/core/kernels/deep_conv2d.cc +++ b/tensorflow/core/kernels/deep_conv2d.cc @@ -120,9 +120,9 @@ bool CanUseDeepConv2D(int stride_rows, int stride_cols, int filter_rows, VLOG(2) << "CanUseDeepConv2D" << " deep_conv_cost: " << deep_conv_cost - << " direct_conv_cost: " << direct_conv_cost - << " deep_direct_ratio: " << (static_cast(deep_conv_cost) / - static_cast(direct_conv_cost)) + << " direct_conv_cost: " << direct_conv_cost << " deep_direct_ratio: " + << (static_cast(deep_conv_cost) / + static_cast(direct_conv_cost)) << " use_deep_conv: " << (deep_conv_cost < direct_conv_cost); return deep_conv_cost < direct_conv_cost; } diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc index 6d44a92fa3c..6497c8f3719 100644 --- a/tensorflow/core/kernels/dense_update_ops.cc +++ b/tensorflow/core/kernels/dense_update_ops.cc @@ -89,7 +89,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #define REGISTER_KERNELS(type) \ REGISTER_KERNEL_BUILDER( \ @@ -113,14 +113,14 @@ TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS); #endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNELS(type) \ -REGISTER_KERNEL_BUILDER( \ - Name("Assign").Device(DEVICE_SYCL).TypeConstraint("T"), \ - AssignOpT); +#define REGISTER_SYCL_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Assign").Device(DEVICE_SYCL).TypeConstraint("T"), \ + AssignOpT); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS); #undef REGISTER_SYCL_KERNELS -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #define REGISTER_KERNELS(type) \ REGISTER_KERNEL_BUILDER( \ @@ -146,7 +146,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); #endif // end GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNELS(type) \ +#define REGISTER_SYCL_KERNELS(type) \ REGISTER_KERNEL_BUILDER( \ Name("AssignAdd").Device(DEVICE_SYCL).TypeConstraint("T"), \ DenseUpdateOp); \ @@ -156,5 +156,5 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS); #undef REGISTER_SYCL_KERNELS -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc index 9347978d515..91a9587174b 100644 --- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc +++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc @@ -400,7 +400,7 @@ struct LaunchDepthwiseConvBackpropInputOp { // Computes one shard of depthwise conv2d backprop input. auto shard = [&ctx, &args, &out_backprop, &filter_data, &in_backprop]( - int64 start, int64 limit) { + int64 start, int64 limit) { static const int64 kPacketSize = (sizeof(Packet) / sizeof(T)); const int64 input_image_size = @@ -750,7 +750,7 @@ struct LaunchDepthwiseConvBackpropFilterOp { // Computes one shard of depthwise conv2d backprop filter. 
auto shard = [&ctx, &args, &out_backprop, &input, &output_buffer_data]( - int64 start, int64 limit) { + int64 start, int64 limit) { static const int64 kPacketSize = (sizeof(Packet) / sizeof(T)); const int64 filter_spatial_size = args.filter_rows * args.filter_cols; const int64 padded_out_depth_size = diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc index a5fd07fbe17..c060b2e14d2 100644 --- a/tensorflow/core/kernels/depthwise_conv_op.cc +++ b/tensorflow/core/kernels/depthwise_conv_op.cc @@ -308,10 +308,10 @@ class DepthwiseConv2dNativeOp : public BinaryOp { // in_depth for input and filter must match. const int64 in_depth = GetTensorDim(input, data_format_, 'C'); - OP_REQUIRES( - context, in_depth == filter.dim_size(2), - errors::InvalidArgument("input and filter must have the same depth: ", - in_depth, " vs ", filter.dim_size(2))); + OP_REQUIRES(context, in_depth == filter.dim_size(2), + errors::InvalidArgument( + "input and filter must have the same depth: ", in_depth, + " vs ", filter.dim_size(2))); // The last dimension for filter is depth multiplier. const int32 depth_multiplier = filter.dim_size(3); @@ -430,9 +430,10 @@ TF_CALL_double(REGISTER_CPU_KERNEL); #endif #if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER( - Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint("T"), - DepthwiseConv2dNativeOp); +REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative") + .Device(DEVICE_GPU) + .TypeConstraint("T"), + DepthwiseConv2dNativeOp); REGISTER_KERNEL_BUILDER( Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint("T"), diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc index 5493e335328..126b64f73df 100644 --- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc +++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc @@ -17,12 +17,12 @@ limitations under the License. #define EIGEN_USE_GPU #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "external/cub_archive/cub/util_ptx.cuh" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/kernels/depthwise_conv_op.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/cuda_kernel_helper.h" #include "tensorflow/core/util/tensor_format.h" -#include "external/cub_archive/cub/util_ptx.cuh" #if !defined(_MSC_VER) #define UNROLL _Pragma("unroll") @@ -1021,7 +1021,7 @@ __global__ void __launch_bounds__(640, 2) // Device function to compute sub-warp sum reduction for a power-of-two group of // neighboring threads. -template +template __device__ __forceinline__ T WarpSumReduce(T val) { // support only power-of-two widths. assert(__popc(kWidth) == 1); diff --git a/tensorflow/core/kernels/diag_op.cc b/tensorflow/core/kernels/diag_op.cc index 86fa7dce36a..d228153d4c7 100644 --- a/tensorflow/core/kernels/diag_op.cc +++ b/tensorflow/core/kernels/diag_op.cc @@ -29,8 +29,8 @@ limitations under the License. 
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_types.h" -#include "tensorflow/core/platform/types.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/work_sharder.h" namespace tensorflow { @@ -47,8 +47,9 @@ class DiagOp : public OpKernel { void Compute(OpKernelContext* context) override { const Tensor& diagonal = context->input(0); const int num_dims = diagonal.dims(); - OP_REQUIRES(context, 0 != num_dims, errors::InvalidArgument( - "Input must be at least rank 1, got 0")); + OP_REQUIRES( + context, 0 != num_dims, + errors::InvalidArgument("Input must be at least rank 1, got 0")); TensorShape out_shape; for (int i = 0; i < num_dims; ++i) { out_shape.AddDim(diagonal.dim_size(i)); @@ -60,10 +61,9 @@ class DiagOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output_tensor)); functor::DiagFunctor diagFunc; - Status s = diagFunc(context, - diagonal.NumElements(), - diagonal.flat().data(), - output_tensor->flat().data()); + Status s = + diagFunc(context, diagonal.NumElements(), diagonal.flat().data(), + output_tensor->flat().data()); OP_REQUIRES_OK(context, s); } }; @@ -82,12 +82,12 @@ class DiagPartOp : public OpKernel { errors::InvalidArgument("The rank of the tensor should be \ even and positive, got shape ", tensor.shape().DebugString())); - for (int i = 0; i < out_dims; i++){ - OP_REQUIRES(context, tensor.dim_size(i) == tensor.dim_size(i + out_dims), - errors::InvalidArgument( - "Invalid shape ", tensor.shape().DebugString(), - ": dimensions ", i, " and ", i + out_dims, " do not match.") - ); + for (int i = 0; i < out_dims; i++) { + OP_REQUIRES( + context, tensor.dim_size(i) == tensor.dim_size(i + out_dims), + errors::InvalidArgument("Invalid shape ", + tensor.shape().DebugString(), ": dimensions ", + i, " and ", i + out_dims, " do not match.")); } TensorShape out_shape; @@ -96,13 +96,10 @@ class DiagPartOp : public OpKernel { } Tensor* output = nullptr; - OP_REQUIRES_OK(context, - context->allocate_output(0, out_shape, &output)); + OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); functor::DiagPartFunctor diagPartFunc; - Status s = diagPartFunc(context, - out_shape.num_elements(), - tensor.flat().data(), - output->flat().data()); + Status s = diagPartFunc(context, out_shape.num_elements(), + tensor.flat().data(), output->flat().data()); OP_REQUIRES_OK(context, s); } }; @@ -129,9 +126,8 @@ class DiagPartOp : public OpKernel { namespace functor { template struct DiagFunctor { - EIGEN_ALWAYS_INLINE Status - operator() (OpKernelContext* context, const int64 size, - const T* in, T* out) { + EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context, + const int64 size, const T* in, T* out) { // This subprocess is responsible for writing values in index range // [start*size, limit*size) auto subDiag = [in, out, size](int64 start, int64 limit) { @@ -143,17 +139,16 @@ struct DiagFunctor { // Here, 5 is a empirical factor of cost_per_unit. 
auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); - Shard(worker_threads.num_threads, worker_threads.workers, size, - 5 * size, subDiag); + Shard(worker_threads.num_threads, worker_threads.workers, size, 5 * size, + subDiag); return Status::OK(); } }; template struct DiagPartFunctor { - EIGEN_ALWAYS_INLINE Status - operator() (OpKernelContext* context, const int64 size, - const T* in, T* out) { + EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context, + const int64 size, const T* in, T* out) { // This subprocess is responsible for extracting values in index range // [start, limit) auto subDiagPart = [in, out, size](int64 start, int64 limit) { @@ -164,14 +159,13 @@ struct DiagPartFunctor { // Here, 5 is a empirical factor of cost_per_unit. auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); - Shard(worker_threads.num_threads, worker_threads.workers, size, - 5, subDiagPart); + Shard(worker_threads.num_threads, worker_threads.workers, size, 5, + subDiagPart); return Status::OK(); } }; } // namespace functor - // Register the CPU kernels. #define REGISTER_DIAGOP(T) \ REGISTER_KERNEL_BUILDER( \ @@ -250,6 +244,4 @@ TF_CALL_complex128(REGISTER_DIAGPARTOP_GPU); #endif // GOOGLE_CUDA - } // namespace tensorflow - diff --git a/tensorflow/core/kernels/diag_op.h b/tensorflow/core/kernels/diag_op.h index c6ca6a20474..baf16ddb4b9 100644 --- a/tensorflow/core/kernels/diag_op.h +++ b/tensorflow/core/kernels/diag_op.h @@ -26,14 +26,14 @@ namespace functor { template struct DiagFunctor { - Status operator() (OpKernelContext* context, const int64 size, - const T* in, T* out); + Status operator()(OpKernelContext* context, const int64 size, const T* in, + T* out); }; template struct DiagPartFunctor { - Status operator() (OpKernelContext* context, const int64 size, - const T* in, T* out); + Status operator()(OpKernelContext* context, const int64 size, const T* in, + T* out); }; } // namespace functor diff --git a/tensorflow/core/kernels/diag_op_gpu.cu.cc b/tensorflow/core/kernels/diag_op_gpu.cu.cc index d3c529d784e..910f3093b23 100644 --- a/tensorflow/core/kernels/diag_op_gpu.cu.cc +++ b/tensorflow/core/kernels/diag_op_gpu.cu.cc @@ -19,8 +19,8 @@ limitations under the License. #include #include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/util/cuda_kernel_helper.h" #include "tensorflow/core/kernels/diag_op.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" namespace tensorflow { namespace functor { @@ -28,10 +28,8 @@ namespace functor { typedef Eigen::GpuDevice GPUDevice; template -__global__ void DiagCudaKernel(const int num_threads, - const int64 size, - const T* in, - T* out) { +__global__ void DiagCudaKernel(const int num_threads, const int64 size, + const T* in, T* out) { CUDA_1D_KERNEL_LOOP(index, num_threads) { // Fill the diagonal elements or set to zero in other place. if (index % (1 + size) == 0) { @@ -44,9 +42,8 @@ __global__ void DiagCudaKernel(const int num_threads, template struct DiagFunctor { - EIGEN_ALWAYS_INLINE Status - operator() (OpKernelContext* context, const int64 size, - const T* in, T* out) { + EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context, + const int64 size, const T* in, T* out) { // Empty tensor couldn't launch the kernel. if (size == 0) { return Status::OK(); @@ -56,25 +53,22 @@ struct DiagFunctor { // so this may overflow for `size*size` in extreme cases, // here is checking the multiplication overflow for integer. 
if (size && (int(size * size) / size) != size) { - return errors::Internal( - "DiagOp got input size too large."); + return errors::Internal("DiagOp got input size too large."); } int virtual_thread_count = int(size * size); // Launch the GPU kernel. const GPUDevice& device = context->eigen_device(); - CudaLaunchConfig diag_config = GetCudaLaunchConfig( - virtual_thread_count, device); - DiagCudaKernel<<>>( - diag_config.virtual_thread_count, size, in, out); + CudaLaunchConfig diag_config = + GetCudaLaunchConfig(virtual_thread_count, device); + DiagCudaKernel<<>>(diag_config.virtual_thread_count, size, + in, out); auto err = cudaGetLastError(); if (err != cudaSuccess) { return errors::Internal( - "Could not launch DiagOp kernel: ", - cudaGetErrorString(err), "."); + "Could not launch DiagOp kernel: ", cudaGetErrorString(err), "."); } return Status::OK(); } @@ -87,12 +81,9 @@ template struct DiagFunctor; template struct DiagFunctor; template struct DiagFunctor; - template -__global__ void DiagPartCudaKernel(const int num_threads, - const int64 size, - const T* in, - T* out) { +__global__ void DiagPartCudaKernel(const int num_threads, const int64 size, + const T* in, T* out) { CUDA_1D_KERNEL_LOOP(index, num_threads) { out[index] = in[(1 + size) * index]; } @@ -100,9 +91,8 @@ __global__ void DiagPartCudaKernel(const int num_threads, template struct DiagPartFunctor { - EIGEN_ALWAYS_INLINE Status - operator() (OpKernelContext* context, const int64 size, - const T* in, T* out) { + EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context, + const int64 size, const T* in, T* out) { // Empty tensor couldn't launch the kernel. if (size == 0) { return Status::OK(); @@ -111,16 +101,14 @@ struct DiagPartFunctor { // Extract the diagonal elements. CudaLaunchConfig diag_config = GetCudaLaunchConfig(size, device); - DiagPartCudaKernel<<>>( - diag_config.virtual_thread_count, size, in, out); + DiagPartCudaKernel<<>>(diag_config.virtual_thread_count, + size, in, out); auto err = cudaGetLastError(); if (err != cudaSuccess) { return errors::Internal( - "Could not launch DiagPartOp kernel: ", - cudaGetErrorString(err), "."); + "Could not launch DiagPartOp kernel: ", cudaGetErrorString(err), "."); } return Status::OK(); } diff --git a/tensorflow/core/kernels/diag_op_test.cc b/tensorflow/core/kernels/diag_op_test.cc index 2d1417854cc..a708e53dd01 100644 --- a/tensorflow/core/kernels/diag_op_test.cc +++ b/tensorflow/core/kernels/diag_op_test.cc @@ -30,8 +30,8 @@ static Graph* Diag(int n, DataType type) { return g; } -#define BM_DiagDev(N, T, TFTYPE, DEVICE) \ - static void BM_Diag##_##N##_##TFTYPE##_##DEVICE(int iters) { \ +#define BM_DiagDev(N, T, TFTYPE, DEVICE) \ + static void BM_Diag##_##N##_##TFTYPE##_##DEVICE(int iters) { \ testing::UseRealTime(); \ testing::ItemsProcessed(static_cast(iters) * N * N); \ test::Benchmark(#DEVICE, Diag(N, TFTYPE)).Run(iters); \ @@ -51,4 +51,3 @@ BM_Diag(128); BM_Diag(512); } // end namespace tensorflow - diff --git a/tensorflow/core/kernels/dilation_ops.cc b/tensorflow/core/kernels/dilation_ops.cc index 6f5c0e91569..441a63465c8 100644 --- a/tensorflow/core/kernels/dilation_ops.cc +++ b/tensorflow/core/kernels/dilation_ops.cc @@ -91,10 +91,10 @@ void ParseSizes(OpKernelContext* context, const std::vector& strides, filter.shape().DebugString())); const int filter_rows = filter.dim_size(0); const int filter_cols = filter.dim_size(1); - OP_REQUIRES( - context, depth == filter.dim_size(2), - errors::InvalidArgument("input and filter must have the same depth: ", - depth, " vs 
", filter.dim_size(2))); + OP_REQUIRES(context, depth == filter.dim_size(2), + errors::InvalidArgument( + "input and filter must have the same depth: ", depth, " vs ", + filter.dim_size(2))); // Effective filter size, after introducing rate - 1 zeros between each // non-zero filter element. @@ -234,10 +234,11 @@ class DilationBackpropInputOp : public OpKernel { // [ batch, out_rows, out_cols, depth ] const int batch = input.dim_size(0); const int depth = input.dim_size(3); - OP_REQUIRES(context, batch == out_backprop.dim_size(0) && - out_rows == out_backprop.dim_size(1) && - out_cols == out_backprop.dim_size(2) && - depth == out_backprop.dim_size(3), + OP_REQUIRES(context, + batch == out_backprop.dim_size(0) && + out_rows == out_backprop.dim_size(1) && + out_cols == out_backprop.dim_size(2) && + depth == out_backprop.dim_size(3), errors::InvalidArgument("out_backprop has incompatible size.")); // The computed in_backprop has the same dimensions as the input: @@ -353,10 +354,11 @@ class DilationBackpropFilterOp : public OpKernel { // [ batch, out_rows, out_cols, depth ] const int batch = input.dim_size(0); const int depth = input.dim_size(3); - OP_REQUIRES(context, batch == out_backprop.dim_size(0) && - out_rows == out_backprop.dim_size(1) && - out_cols == out_backprop.dim_size(2) && - depth == out_backprop.dim_size(3), + OP_REQUIRES(context, + batch == out_backprop.dim_size(0) && + out_rows == out_backprop.dim_size(1) && + out_cols == out_backprop.dim_size(2) && + depth == out_backprop.dim_size(3), errors::InvalidArgument("out_backprop has incompatible size.")); // The computed filter_backprop has the same dimensions as the filter: diff --git a/tensorflow/core/kernels/dilation_ops_gpu.cu.cc b/tensorflow/core/kernels/dilation_ops_gpu.cu.cc index ac0775fbefe..c63806a7f68 100644 --- a/tensorflow/core/kernels/dilation_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/dilation_ops_gpu.cu.cc @@ -61,9 +61,8 @@ __global__ void DilationKernel(const int32 nthreads, const T* input_ptr, const int w_in = w_beg + w * rate_cols; if (w_in >= 0 && w_in < input_cols) { const T val = - input_ptr[d + - depth * - (w_in + input_cols * (h_in + input_rows * b))] + + input_ptr[d + depth * (w_in + + input_cols * (h_in + input_rows * b))] + filter_ptr[d + depth * (w + filter_cols * h)]; if (val > cur_val) { cur_val = val; @@ -106,9 +105,8 @@ __global__ void DilationBackpropInputKernel( const int w_in = w_beg + w * rate_cols; if (w_in >= 0 && w_in < input_cols) { const T val = - input_ptr[d + - depth * - (w_in + input_cols * (h_in + input_rows * b))] + + input_ptr[d + depth * (w_in + + input_cols * (h_in + input_rows * b))] + filter_ptr[d + depth * (w + filter_cols * h)]; if (val > cur_val) { cur_val = val; @@ -156,9 +154,8 @@ __global__ void DilationBackpropFilterKernel( const int w_in = w_beg + w * rate_cols; if (w_in >= 0 && w_in < input_cols) { const T val = - input_ptr[d + - depth * - (w_in + input_cols * (h_in + input_rows * b))] + + input_ptr[d + depth * (w_in + + input_cols * (h_in + input_rows * b))] + filter_ptr[d + depth * (w + filter_cols * h)]; if (val > cur_val) { cur_val = val; diff --git a/tensorflow/core/kernels/draw_bounding_box_op.cc b/tensorflow/core/kernels/draw_bounding_box_op.cc index a8818b7385d..b5d5b880bbb 100644 --- a/tensorflow/core/kernels/draw_bounding_box_op.cc +++ b/tensorflow/core/kernels/draw_bounding_box_op.cc @@ -29,8 +29,7 @@ template class DrawBoundingBoxesOp : public OpKernel { public: explicit DrawBoundingBoxesOp(OpKernelConstruction* context) - : OpKernel(context) { - } + : 
OpKernel(context) {} void Compute(OpKernelContext* context) override { const Tensor& images = context->input(0); @@ -94,35 +93,28 @@ class DrawBoundingBoxesOp : public OpKernel { int64 color_index = bb % color_table_length; const int64 min_box_row = static_cast(tboxes(b, bb, 0)) * (height - 1); - const int64 min_box_row_clamp = - std::max(min_box_row, 0); + const int64 min_box_row_clamp = std::max(min_box_row, 0); const int64 max_box_row = static_cast(tboxes(b, bb, 2)) * (height - 1); const int64 max_box_row_clamp = std::min(max_box_row, height - 1); const int64 min_box_col = static_cast(tboxes(b, bb, 1)) * (width - 1); - const int64 min_box_col_clamp = - std::max(min_box_col, 0); + const int64 min_box_col_clamp = std::max(min_box_col, 0); const int64 max_box_col = static_cast(tboxes(b, bb, 3)) * (width - 1); - const int64 max_box_col_clamp = - std::min(max_box_col, width - 1); + const int64 max_box_col_clamp = std::min(max_box_col, width - 1); if (min_box_row > max_box_row || min_box_col > max_box_col) { - LOG(WARNING) << "Bounding box (" << min_box_row - << "," << min_box_col - << "," << max_box_row - << "," << max_box_col + LOG(WARNING) << "Bounding box (" << min_box_row << "," << min_box_col + << "," << max_box_row << "," << max_box_col << ") is inverted and will not be drawn."; continue; } - if (min_box_row >= height || max_box_row < 0 || - min_box_col >= width || max_box_col < 0) { - LOG(WARNING) << "Bounding box (" << min_box_row - << "," << min_box_col - << "," << max_box_row - << "," << max_box_col + if (min_box_row >= height || max_box_row < 0 || min_box_col >= width || + max_box_col < 0) { + LOG(WARNING) << "Bounding box (" << min_box_row << "," << min_box_col + << "," << max_box_row << "," << max_box_col << ") is completely outside the image" << " and will not be drawn."; continue; diff --git a/tensorflow/core/kernels/dynamic_partition_op.cc b/tensorflow/core/kernels/dynamic_partition_op.cc index 861e16b2fd0..3c988db5e61 100644 --- a/tensorflow/core/kernels/dynamic_partition_op.cc +++ b/tensorflow/core/kernels/dynamic_partition_op.cc @@ -103,7 +103,8 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared { // Walk through data and copy the data to the appropriate output tensor const auto data_flat = data->flat(); std::vector, - Eigen::Aligned> > out_vec; + Eigen::Aligned> > + out_vec; out_vec.reserve(num_partitions_); for (int p = 0; p < num_partitions_; p++) { out_vec.push_back(outputs[p]->vec()); @@ -124,7 +125,8 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared { } else { // If data has extra dimensions, use Eigen slices std::vector, - Eigen::Aligned> > out_flat; + Eigen::Aligned> > + out_flat; out_flat.reserve(num_partitions_); for (int p = 0; p < num_partitions_; p++) { out_flat.push_back(outputs[p]->flat_outer_dims()); diff --git a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc index 9bb58b13f38..9dfeccff0e8 100644 --- a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc +++ b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc @@ -79,9 +79,9 @@ template void RangeInit(const GPUDevice& d, const T start, const T delta, const int32 size, typename TTypes::Flat out) { CudaLaunchConfig config = GetCudaLaunchConfig(size, d); - RangeInitKernel< - T><<>>( - start, delta, size, out.data()); + RangeInitKernel + <<>>( + start, delta, size, out.data()); } // Given *num_runs pairs (key, value), this function moves the value @@ -103,11 +103,10 @@ void CallGatherKernel(const GPUDevice& d, const 
T* params, const int32* indices, T* out, int64 gather_dim_size, int64 indices_size, int64 slice_size, int64 out_size) { CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d); - GatherOpKernel< - T, int32, - true><<>>( - params, indices, out, gather_dim_size, indices_size, slice_size, - out_size); + GatherOpKernel + <<>>( + params, indices, out, gather_dim_size, indices_size, slice_size, + out_size); } struct IdentityOp { @@ -231,10 +230,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel { OP_REQUIRES_ASYNC( c, TensorShapeUtils::StartsWith(data.shape(), partitions.shape()), - errors::InvalidArgument("data.shape must start with partitions.shape, ", - "got data.shape = ", data.shape().DebugString(), - ", partitions.shape = ", - partitions.shape().DebugString()), + errors::InvalidArgument( + "data.shape must start with partitions.shape, ", + "got data.shape = ", data.shape().DebugString(), + ", partitions.shape = ", partitions.shape().DebugString()), done); Tensor partition_count; @@ -245,8 +244,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel { AllocatorAttributes alloc_attr; alloc_attr.set_on_host(true); OP_REQUIRES_OK_ASYNC( - c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), - &partition_count, alloc_attr), + c, + c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), + &partition_count, alloc_attr), done); auto e_part_count = partition_count.flat(); for (int i = 0; i < num_partitions_; i++) e_part_count(i) = 0; @@ -259,8 +259,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel { // Prepare for counting. OP_REQUIRES_OK_ASYNC( - c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), - &partition_count), + c, + c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), + &partition_count), done); Tensor indices_out; // Count how many times each partition index occurs. @@ -280,8 +281,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel { alloc_attr.set_on_host(true); alloc_attr.set_gpu_compatible(true); OP_REQUIRES_OK_ASYNC( - c, c->allocate_temp(partition_count.dtype(), partition_count.shape(), - &cpu_tensor, alloc_attr), + c, + c->allocate_temp(partition_count.dtype(), partition_count.shape(), + &cpu_tensor, alloc_attr), done); perftools::gputools::DeviceMemoryBase wrapped( partition_count.flat().data(), num_partitions_ * sizeof(int32)); @@ -340,9 +342,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel { indices_in_ptr, indices_out_ptr, N, 0, sizeof(int32) * 8, cu_stream); // Allocate temporary storage. OP_REQUIRES_OK_ASYNC( - c, c->allocate_temp( - DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), - &cub_temp_storage), + c, + c->allocate_temp(DT_INT8, + TensorShape({static_cast(temp_storage_bytes)}), + &cub_temp_storage), done); // Radix-sort the partition information. cub::DeviceRadixSort::SortPairs( @@ -376,8 +379,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel { zero_functor(device, partition_count->flat()); // Allocate memory for aggregates_out. OP_REQUIRES_OK_ASYNC( - c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), - &aggregates_out), + c, + c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), + &aggregates_out), done); // Obtain the pointers to inner buffers. int32* keys_in_ptr = partitions_out.flat().data(); @@ -408,9 +412,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel { num_runs_ptr, reduction_op, N, cu_stream); // Allocate temporary storage. 
OP_REQUIRES_OK_ASYNC( - c, c->allocate_temp( - DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), - &cub_temp_storage), + c, + c->allocate_temp(DT_INT8, + TensorShape({static_cast(temp_storage_bytes)}), + &cub_temp_storage), done); // Run reduce-by-key. The effect is that we count how many times // each index appears in partitions. The distinct indices are stored diff --git a/tensorflow/core/kernels/eigen_activations.h b/tensorflow/core/kernels/eigen_activations.h index 99b4b2abe66..302033e47c5 100644 --- a/tensorflow/core/kernels/eigen_activations.h +++ b/tensorflow/core/kernels/eigen_activations.h @@ -21,13 +21,13 @@ limitations under the License. namespace Eigen { /** scalar_sigmoid_fast_derivative_op - * \ingroup CXX11_NeuralNetworks_Module - * \brief Template functor to compute the fast derivative of a sigmoid - * - * Input should be the backpropagated gradient. - * - * \sa class CwiseUnaryOp, Cwise::sigmoid_fast_derivative() - */ + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to compute the fast derivative of a sigmoid + * + * Input should be the backpropagated gradient. + * + * \sa class CwiseUnaryOp, Cwise::sigmoid_fast_derivative() + */ template struct scalar_sigmoid_fast_derivative_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_fast_derivative_op) @@ -55,13 +55,13 @@ struct functor_traits > { } // namespace internal /** scalar_tanh_fast_derivative_op - * \ingroup CXX11_NeuralNetworks_Module - * \brief Template functor to compute the fast derivative of a tanh - * - * Input should be the backpropagated gradient. - * - * \sa class CwiseUnaryOp, Cwise::tanh_fast_derivative() - */ + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to compute the fast derivative of a tanh + * + * Input should be the backpropagated gradient. + * + * \sa class CwiseUnaryOp, Cwise::tanh_fast_derivative() + */ template struct scalar_tanh_fast_derivative_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_fast_derivative_op) @@ -89,11 +89,11 @@ struct functor_traits > { } // namespace internal /** - * \ingroup CXX11_NeuralNetworks_Module - * \brief Template functor to clip the magnitude of the first scalar. - * - * \sa class CwiseBinaryOp, MatrixBase::Clip - */ + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to clip the magnitude of the first scalar. + * + * \sa class CwiseBinaryOp, MatrixBase::Clip + */ template struct scalar_clip_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_clip_op) diff --git a/tensorflow/core/kernels/eigen_activations_test.cc b/tensorflow/core/kernels/eigen_activations_test.cc index 907233103d8..34952f5abb8 100644 --- a/tensorflow/core/kernels/eigen_activations_test.cc +++ b/tensorflow/core/kernels/eigen_activations_test.cc @@ -23,7 +23,7 @@ namespace { void EigenApprox(float a, float b) { ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3); } -} +} // namespace TEST(EigenBackwardSpatialConvolutionsTest, SigmoidFastDerivative) { const ptrdiff_t depth = 3; diff --git a/tensorflow/core/kernels/eigen_attention.h b/tensorflow/core/kernels/eigen_attention.h index 3a94b8c9933..4d86f9deb99 100644 --- a/tensorflow/core/kernels/eigen_attention.h +++ b/tensorflow/core/kernels/eigen_attention.h @@ -21,35 +21,47 @@ limitations under the License. namespace Eigen { /** ExtractGlimpses - * \ingroup CXX11_NeuralNetworks_Module - * - * \brief Extract glimpses from an input tensor. - * - * The input parameter is expected to be a col-major tensor with a rank of 4 (depth, x, y, and batch). 
- * The width and height parameters specify the extension of the returned glimpses. - * The offsets parameter specifies the x, y locations of the center of the glimpses relative to the center of the input image. The vector is expected to contain one IndexPair for each image in the batch dimension. - * The normalized boolean indicates if incoming coordinates are normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each height and width dimension. - * The centered boolean indicates if incoming coordinates are centered relative to the image, in which case -1.0 and 1.0 correspond to minimum and maximum of each dimension while 0.0 corresponds to the center. - * - * The result can be assigned to a tensor of rank equal to that of the input. The result will be laid out in col-major order (depth, x, y, batch). - * The dimensions of the result will be equal to the dimensions of the input except for width and height which will be equal to the requested glimpse size. - */ + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Extract glimpses from an input tensor. + * + * The input parameter is expected to be a col-major tensor with a rank of 4 + * (depth, x, y, and batch). The width and height parameters specify the + * extension of the returned glimpses. The offsets parameter specifies the x, y + * locations of the center of the glimpses relative to the center of the input + * image. The vector is expected to contain one IndexPair for each image in the + * batch dimension. The normalized boolean indicates if incoming coordinates are + * normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each + * height and width dimension. The centered boolean indicates if incoming + * coordinates are centered relative to the image, in which case -1.0 and 1.0 + * correspond to minimum and maximum of each dimension while 0.0 corresponds to + * the center. + * + * The result can be assigned to a tensor of rank equal to that of the input. + * The result will be laid out in col-major order (depth, x, y, batch). The + * dimensions of the result will be equal to the dimensions of the input except + * for width and height which will be equal to the requested glimpse size. 
+ */ namespace { template struct GlimpseExtractionOp { GlimpseExtractionOp(const Index width, const Index height, const std::vector >& offsets, - const bool normalized, - const bool centered, - const bool uniform_noise) : - width_(width), height_(height), offsets_(offsets), - normalized_(normalized), centered_(centered), uniform_noise_(uniform_noise) { } + const bool normalized, const bool centered, + const bool uniform_noise) + : width_(width), + height_(height), + offsets_(offsets), + normalized_(normalized), + centered_(centered), + uniform_noise_(uniform_noise) {} template DSizes dimensions(const Input& input) const { typedef typename internal::traits::Index IndexType; typedef TensorRef::Scalar, 4, - internal::traits::Layout, IndexType> > Ref; + internal::traits::Layout, IndexType> > + Ref; Ref in(input); DSizes dims = in.dimensions(); @@ -62,12 +74,12 @@ struct GlimpseExtractionOp { } template - EIGEN_DEVICE_FUNC - void eval(const Input& input, Output& output, const Device& device) const - { + EIGEN_DEVICE_FUNC void eval(const Input& input, Output& output, + const Device& device) const { typedef typename internal::traits::Index IndexType; typedef TensorRef::Scalar, 4, - internal::traits::Layout, IndexType> > Ref; + internal::traits::Layout, IndexType> > + Ref; Ref in(input); const Index num_channels = in.dimension(0); const Index input_width = in.dimension(1); @@ -97,8 +109,8 @@ struct GlimpseExtractionOp { x -= width_ / 2.0f; y -= height_ / 2.0f; - const Index offset_x = (Index) x; - const Index offset_y = (Index) y; + const Index offset_x = (Index)x; + const Index offset_y = (Index)y; Index glimpse_width = width_; Index glimpse_height = height_; bool partial_overlap = false; @@ -135,7 +147,7 @@ struct GlimpseExtractionOp { if (uniform_noise_) { // Initialize the glimpse with uniform noise. 
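          // (mini/range computed just below are the per-image extrema that
          //  parameterize this uniform-noise fill.)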
typedef typename internal::remove_const< - typename internal::traits::Scalar>::type Scalar; + typename internal::traits::Scalar>::type Scalar; TensorFixedSize > mini; mini.device(device) = input.template chip<3>(i).minimum(); TensorFixedSize > range; @@ -215,21 +227,22 @@ struct GlimpseExtractionOp { const bool centered_; const bool uniform_noise_; }; -} - +} // namespace template -EIGEN_ALWAYS_INLINE -static const TensorCustomUnaryOp::Index>, const Input> +EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp< + const GlimpseExtractionOp::Index>, + const Input> ExtractGlimpses(const Input& input, const typename internal::traits::Index width, const typename internal::traits::Index height, const std::vector >& offsets, const bool normalized = true, const bool centered = true, - const bool uniform_noise = true) -{ - EIGEN_STATIC_ASSERT(internal::traits::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + const bool uniform_noise = true) { + EIGEN_STATIC_ASSERT(internal::traits::Layout == ColMajor, + YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 4, + YOU_MADE_A_PROGRAMMING_MISTAKE); typedef typename internal::traits::Index Index; const GlimpseExtractionOp op(width, height, offsets, normalized, @@ -237,6 +250,6 @@ ExtractGlimpses(const Input& input, return input.customOp(op); } -} // end namespace Eigen +} // end namespace Eigen #endif // TENSORFLOW_CORE_KERNELS_EIGEN_ATTENTION_H_ diff --git a/tensorflow/core/kernels/eigen_attention_test.cc b/tensorflow/core/kernels/eigen_attention_test.cc index 3a2eeb05959..08f61877182 100644 --- a/tensorflow/core/kernels/eigen_attention_test.cc +++ b/tensorflow/core/kernels/eigen_attention_test.cc @@ -23,7 +23,7 @@ namespace { void EigenApprox(float a, float b) { ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3); } -} +} // namespace TEST(EigenAttentionTest, Simple) { const ptrdiff_t depth = 3; diff --git a/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h b/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h index aec76978102..099696105b6 100644 --- a/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h +++ b/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h @@ -21,29 +21,29 @@ limitations under the License. namespace Eigen { /** SpatialConvolutionBackwardInput - * \ingroup CXX11_NeuralNetworks_Module - * - * \brief Computes the backprop for the input of a 2D convolution. - * - * The output_backward parameter is expected to be a tensor with a rank of 3 or + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the input of a 2D convolution. + * + * The output_backward parameter is expected to be a tensor with a rank of 3 or * more (channels, height, width, and optionally others) - * The kernel parameter is expected to be a 4D tensor (filters, channels, + * The kernel parameter is expected to be a 4D tensor (filters, channels, * kernel_height, kernel_width) - * The output_backward and the kernel must both be in col-major layout. The + * The output_backward and the kernel must both be in col-major layout. The * result will also be in col-major layout. - * - * If row_in_stride, col_in_stride > 1, then applies convolution with holes + * + * If row_in_stride, col_in_stride > 1, then applies convolution with holes * (aka atrous convolution), sampling every row_in_stride, col_in_stride input * pixels. 
- * - * The result can be assigned to a tensor of rank equal to the rank of the + * + * The result can be assigned to a tensor of rank equal to the rank of the * output_backward. The dimensions of the result will be filters, height, width * (and others if applicable). - * - * It is possible to swap the order of the width and height dimensions provided + * + * It is possible to swap the order of the width and height dimensions provided * that the same order is used in the input, the kernel, and the output. - * - */ + * + */ #ifdef EIGEN_HAS_INDEX_LIST typedef IndexList, type2index<0>, type2index<1>, type2index<1> > ReverseColMajor; @@ -293,29 +293,29 @@ SpatialConvolutionBackwardInput( } /** SpatialConvolutionBackwardKernel - * \ingroup CXX11_NeuralNetworks_Module - * - * \brief Computes the backprop for the filter of a 2D convolution. - * - * The output_backward parameter is expected to be a tensor with a rank of 3 or + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the filter of a 2D convolution. + * + * The output_backward parameter is expected to be a tensor with a rank of 3 or * more (channels, height, width, and optionally others) - * The kernel parameter is expected to be a 4D tensor (filters, channels, + * The kernel parameter is expected to be a 4D tensor (filters, channels, * kernel_height, kernel_width) - * The output_backward and the kernel must both be in col-major layout. The + * The output_backward and the kernel must both be in col-major layout. The * result will also be in col-major layout. - * - * If row_in_stride, col_stride > 1, then applies convolution with holes (aka + * + * If row_in_stride, col_stride > 1, then applies convolution with holes (aka * atrous convolution), sampling every row_in_stride, col_in_stride input * pixels. - * - * The result can be assigned to a tensor of rank equal to the rank of the + * + * The result can be assigned to a tensor of rank equal to the rank of the * output_backward. The dimensions of the result will be filters, height, width * (and others if applicable). - * - * It is possible to swap the order of the width and height dimensions provided + * + * It is possible to swap the order of the width and height dimensions provided * that the same order is used in the input, the kernel, and the output. 
- * - */ + * + */ template EIGEN_ALWAYS_INLINE static const typename internal::conditional< diff --git a/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc b/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc index 1758067829e..2229ec96594 100644 --- a/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc +++ b/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc @@ -25,7 +25,7 @@ void EigenApprox(float a, float b) { ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3); } static int ceil_div(int a, int b) { return (a + b - 1) / b; } -} +} // namespace TEST(EigenBackwardSpatialConvolutionsTest, test_simple_spatial_convolution_backward_input_valid) { diff --git a/tensorflow/core/kernels/eigen_pooling.h b/tensorflow/core/kernels/eigen_pooling.h index 972036833ff..896c9957616 100644 --- a/tensorflow/core/kernels/eigen_pooling.h +++ b/tensorflow/core/kernels/eigen_pooling.h @@ -309,10 +309,10 @@ struct AvgPoolMeanReducer { _mm512_castsi512_ps( \ _mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a, b, _CMP_EQ_UQ), -1)) -// The ternarylogic function immediate determines the values in the result -// In the case below, 0xd8 implies (false_mask) ? (b) : (a) -// For details, refer to the vpternlogd instruction table at -// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-2c-manual.pdf + // The ternarylogic function immediate determines the values in the result + // In the case below, 0xd8 implies (false_mask) ? (b) : (a) + // For details, refer to the vpternlogd instruction table at + // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-2c-manual.pdf #define psel(a, b, false_mask) \ _mm512_castsi512_ps(_mm512_ternarylogic_epi32( \ diff --git a/tensorflow/core/kernels/eigen_pooling_test.cc b/tensorflow/core/kernels/eigen_pooling_test.cc index 9383972b9ff..47b6665e680 100644 --- a/tensorflow/core/kernels/eigen_pooling_test.cc +++ b/tensorflow/core/kernels/eigen_pooling_test.cc @@ -23,7 +23,7 @@ namespace { void EigenApprox(float a, float b) { ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3); } -} +} // namespace TEST(EigenPoolingTest, Simple) { const int depth = 10; diff --git a/tensorflow/core/kernels/eigen_softmax.h b/tensorflow/core/kernels/eigen_softmax.h index a2930a726f9..12148c54b36 100644 --- a/tensorflow/core/kernels/eigen_softmax.h +++ b/tensorflow/core/kernels/eigen_softmax.h @@ -21,19 +21,21 @@ limitations under the License. namespace Eigen { /** SoftMax - * \ingroup CXX11_NeuralNetworks_Module - * - * \brief Applies a softmax - * - * The input parameter is expected to be a col-major tensor with a rank of 2 (depth and other). - * - * The result can be assigned to a tensor of rank and dimensions equal to that of the input. The result will be laid out in col-major order. - * -*/ + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a softmax + * + * The input parameter is expected to be a col-major tensor with a rank of 2 + * (depth and other). + * + * The result can be assigned to a tensor of rank and dimensions equal to that + * of the input. The result will be laid out in col-major order. 
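The SoftmaxOp body reflowed just below computes, per column, exp(beta * (x_i - max_j x_j)) normalized by the column sum. A minimal sketch of the Eigen::SoftMax entry point follows; the 1024x32 sizes are illustrative (the depth echoes eigen_softmax_test.cc), not part of this patch.

#include "tensorflow/core/kernels/eigen_softmax.h"

void SoftMaxSketch() {
  // Col-major rank-2 input: (depth, other) = (1024, 32).
  Eigen::Tensor<float, 2> logits(1024, 32);
  logits.setRandom();

  // Same rank and dimensions as the input; each column now sums to 1.
  Eigen::Tensor<float, 2> probs(1024, 32);
  probs = Eigen::SoftMax(logits, /*beta=*/1.0f);
}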
+ * + */ namespace { struct SoftmaxOp { - SoftmaxOp(const float beta) : beta_(beta) { } + SoftmaxOp(const float beta) : beta_(beta) {} template typename Input::Dimensions dimensions(const Input& input) const { @@ -41,8 +43,7 @@ struct SoftmaxOp { } template - void eval(const Input& input, Output& output, const Device& device) const - { + void eval(const Input& input, Output& output, const Device& device) const { #if !defined(EIGEN_HAS_INDEX_LIST) // nvcc doesn't support cxx11 Eigen::array::Index, 1> depth_dim; @@ -56,35 +57,43 @@ struct SoftmaxOp { #else // Take advantage of cxx11 to give the compiler information it can use to // optimize the code. - Eigen::IndexList> depth_dim; - Eigen::IndexList> bcast; + Eigen::IndexList > depth_dim; + Eigen::IndexList > bcast; bcast.set(0, dimensions(input)[0]); - Eigen::IndexList, typename internal::traits::Index> dims2d; + Eigen::IndexList, + typename internal::traits::Index> + dims2d; dims2d.set(1, dimensions(input)[1]); #endif - output.device(device) = ((input - input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) * beta_).exp(); - output.device(device) = output / (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); + output.device(device) = + ((input - + input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) * + beta_) + .exp(); + output.device(device) = + output / + (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); } private: const float beta_; }; -} - +} // namespace template -EIGEN_ALWAYS_INLINE -static const TensorCustomUnaryOp -SoftMax(const Input& input, const float beta) -{ - EIGEN_STATIC_ASSERT(internal::traits::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 2, YOU_MADE_A_PROGRAMMING_MISTAKE); +EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp +SoftMax(const Input& input, const float beta) { + EIGEN_STATIC_ASSERT(internal::traits::Layout == ColMajor, + YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 2, + YOU_MADE_A_PROGRAMMING_MISTAKE); const SoftmaxOp op(beta); return input.customOp(op); } -} // end namespace Eigen +} // end namespace Eigen #endif // TENSORFLOW_CORE_KERNELS_EIGEN_SOFTMAX_H_ diff --git a/tensorflow/core/kernels/eigen_softmax_test.cc b/tensorflow/core/kernels/eigen_softmax_test.cc index ba681d68ab0..7f985d71366 100644 --- a/tensorflow/core/kernels/eigen_softmax_test.cc +++ b/tensorflow/core/kernels/eigen_softmax_test.cc @@ -23,7 +23,7 @@ namespace { void EigenApprox(float a, float b) { ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3); } -} +} // namespace TEST(EigenSoftmaxTest, Simple) { const int depth = 1024; diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h index 2fe64cd72ac..1acbe3a6580 100644 --- a/tensorflow/core/kernels/eigen_spatial_convolutions.h +++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h @@ -877,29 +877,29 @@ struct gemm_pack_rhs< } // end namespace internal /** SpatialConvolution - * \ingroup CXX11_NeuralNetworks_Module - * - * \brief Applies a 2D convolution over a multichannel input image. - * - * The input parameter is expected to be a tensor with a rank of 3 or more + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a 2D convolution over a multichannel input image. 
+ * + * The input parameter is expected to be a tensor with a rank of 3 or more * (channels, height, width, and optionally others) - * The kernel parameter is expected to be a 4D tensor (filters, channels, + * The kernel parameter is expected to be a 4D tensor (filters, channels, * kernel_height, kernel_width) - * The input and the kernel must both be in col-major layout. The result will + * The input and the kernel must both be in col-major layout. The result will * also be in col-major layout. - * - * If col_in_stride, row_in_stride > 1, then applies convolution with holes + * + * If col_in_stride, row_in_stride > 1, then applies convolution with holes * (aka atrous convolution), sampling every col_in_stride, row_in_stride input * pixels. - * - * The result can be assigned to a tensor of rank equal to the rank of the + * + * The result can be assigned to a tensor of rank equal to the rank of the * input. The dimensions of the result will be filters, height, width (and * others if applicable). - * - * It is possible to swap the order of the width and height dimensions provided + * + * It is possible to swap the order of the width and height dimensions provided * that the same order is used in the input, the kernel, and the output. - * - */ + * + */ template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static const typename internal::conditional< @@ -993,7 +993,7 @@ EIGEN_DEVICE_FUNC default: // Initialize unused variables to avoid a compiler warning out_height = 0; - out_width = 0; + out_width = 0; eigen_assert(false && "unexpected padding"); } diff --git a/tensorflow/core/kernels/encode_jpeg_op.cc b/tensorflow/core/kernels/encode_jpeg_op.cc index 4fcae25aa6e..1a5b0f2b675 100644 --- a/tensorflow/core/kernels/encode_jpeg_op.cc +++ b/tensorflow/core/kernels/encode_jpeg_op.cc @@ -80,10 +80,11 @@ class EncodeJpegOp : public OpKernel { errors::InvalidArgument("image must be 3-dimensional", image.shape().DebugString())); - OP_REQUIRES(context, FastBoundsCheck(image.NumElements(), - std::numeric_limits::max()), - errors::InvalidArgument( - "Cannot encode images with >= max int32 elements")); + OP_REQUIRES( + context, + FastBoundsCheck(image.NumElements(), std::numeric_limits::max()), + errors::InvalidArgument( + "Cannot encode images with >= max int32 elements")); const int32 dim_size0 = static_cast(image.dim_size(0)); const int32 dim_size1 = static_cast(image.dim_size(1)); @@ -100,9 +101,10 @@ class EncodeJpegOp : public OpKernel { } else if (channels == 3) { adjusted_flags.format = jpeg::FORMAT_RGB; } else { - OP_REQUIRES(context, false, errors::InvalidArgument( - "image must have 1 or 3 channels, got ", - image.shape().DebugString())); + OP_REQUIRES( + context, false, + errors::InvalidArgument("image must have 1 or 3 channels, got ", + image.shape().DebugString())); } } else { if (flags_.format == jpeg::FORMAT_GRAYSCALE) { diff --git a/tensorflow/core/kernels/example_parsing_ops.cc b/tensorflow/core/kernels/example_parsing_ops.cc index 268a059275a..83cd0e9b47e 100644 --- a/tensorflow/core/kernels/example_parsing_ops.cc +++ b/tensorflow/core/kernels/example_parsing_ops.cc @@ -346,8 +346,9 @@ class SingleSequenceExampleParserOp : public OpKernel { feature_list_sparse_keys[di].scalar()(); } OP_REQUIRES( - ctx, TensorShapeUtils::IsVector( - feature_list_dense_missing_assumed_empty->shape()), + ctx, + TensorShapeUtils::IsVector( + feature_list_dense_missing_assumed_empty->shape()), errors::InvalidArgument( "Expected feature_list_dense_missing_assumed_empty ", "to be a vector, got shape: ", @@ -386,12 
+387,12 @@ class SingleSequenceExampleParserOp : public OpKernel { required[d] = (def_value.NumElements() == 0); // No default provided. if (def_value.NumElements() > 0) { - OP_REQUIRES( - ctx, def_value.shape() == attrs_.context_dense_shapes[d], - errors::InvalidArgument( - "def_value[", d, "].shape() == ", - def_value.shape().DebugString(), " != context_dense_shapes_[", - d, "] == ", attrs_.context_dense_shapes[d].DebugString())); + OP_REQUIRES(ctx, def_value.shape() == attrs_.context_dense_shapes[d], + errors::InvalidArgument( + "def_value[", d, + "].shape() == ", def_value.shape().DebugString(), + " != context_dense_shapes_[", d, + "] == ", attrs_.context_dense_shapes[d].DebugString())); OP_REQUIRES( ctx, def_value.dtype() == attrs_.context_dense_types[d], errors::InvalidArgument( @@ -576,12 +577,12 @@ class SingleSequenceExampleParserOp : public OpKernel { const Feature& f = fl.feature(t); bool types_match; OP_REQUIRES_OK(ctx, CheckTypesMatch(f, dtype, &types_match)); - OP_REQUIRES( - ctx, types_match, - errors::InvalidArgument( - "Name: ", name, ", Feature list: ", key, ", Index: ", t, - ". Data types don't match. ", "Expected type: ", - DataTypeString(dtype), " Feature is: ", ProtoDebugString(f))); + OP_REQUIRES(ctx, types_match, + errors::InvalidArgument( + "Name: ", name, ", Feature list: ", key, ", Index: ", t, + ". Data types don't match. ", + "Expected type: ", DataTypeString(dtype), + " Feature is: ", ProtoDebugString(f))); OP_REQUIRES_OK(ctx, FeatureDenseCopy(t, name, key, dtype, shape, f, feature_list_dense_values[d])); } diff --git a/tensorflow/core/kernels/fact_op.cc b/tensorflow/core/kernels/fact_op.cc index 4fbf76d2d0d..4a1aa433bc9 100644 --- a/tensorflow/core/kernels/fact_op.cc +++ b/tensorflow/core/kernels/fact_op.cc @@ -122,13 +122,9 @@ static string D(const char* s) { return ret; } -REGISTER_KERNEL_BUILDER(Name("Fact") - .Device(DEVICE_CPU) - .Label(D("Yoxmos").c_str()), - FactOpKernel2); -REGISTER_KERNEL_BUILDER(Name("Fact") - .Device(DEVICE_CPU) - .Label(D("yoxmos").c_str()), - FactOpKernel2); +REGISTER_KERNEL_BUILDER( + Name("Fact").Device(DEVICE_CPU).Label(D("Yoxmos").c_str()), FactOpKernel2); +REGISTER_KERNEL_BUILDER( + Name("Fact").Device(DEVICE_CPU).Label(D("yoxmos").c_str()), FactOpKernel2); } // namespace tensorflow diff --git a/tensorflow/core/kernels/fake_quant_ops_test.cc b/tensorflow/core/kernels/fake_quant_ops_test.cc index 5953db14768..af3a42135d1 100644 --- a/tensorflow/core/kernels/fake_quant_ops_test.cc +++ b/tensorflow/core/kernels/fake_quant_ops_test.cc @@ -378,9 +378,8 @@ TEST_F(QuantOpsTest, WithArgsGradient_RegularRange) { Tensor* output = GetOutput(0); auto input_flat = GetInput(0).flat(); Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3})); - FillValues(&expected, - {0.0f, input_flat(1), input_flat(2), - input_flat(3), input_flat(4), 0.0f}); + FillValues(&expected, {0.0f, input_flat(1), input_flat(2), + input_flat(3), input_flat(4), 0.0f}); ExpectClose(expected, *output); } @@ -2167,21 +2166,19 @@ TEST_F(QuantOpsTest, Tensor* output_bprop_wrt_input = GetOutput(0); Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3})); auto grad_flat = GetInput(0).flat(); - FillValues(&expected_bprop_wrt_input, - {0.0f, grad_flat(1), grad_flat(2), - grad_flat(3), grad_flat(4), 0.0f}); + FillValues( + &expected_bprop_wrt_input, + {0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f}); ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input); Tensor* output_bprop_wrt_min = GetOutput(1); Tensor 
expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3})); - FillValues(&expected_bprop_wrt_min, - {grad_flat(0), 0.0f, 0.0f}); + FillValues(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f}); ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min); Tensor* output_bprop_wrt_max = GetOutput(2); Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3})); - FillValues(&expected_bprop_wrt_max, - {0.0f, 0.0f, grad_flat(5)}); + FillValues(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)}); ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max); } @@ -2215,21 +2212,19 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedUp_4Bits_NarrowRange) { Tensor* output_bprop_wrt_input = GetOutput(0); Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3})); auto grad_flat = GetInput(0).flat(); - FillValues(&expected_bprop_wrt_input, - {0.0f, grad_flat(1), grad_flat(2), - grad_flat(3), grad_flat(4), 0.0f}); + FillValues( + &expected_bprop_wrt_input, + {0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f}); ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input); Tensor* output_bprop_wrt_min = GetOutput(1); Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3})); - FillValues(&expected_bprop_wrt_min, - {grad_flat(0), 0.0f, 0.0f}); + FillValues(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f}); ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min); Tensor* output_bprop_wrt_max = GetOutput(2); Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3})); - FillValues(&expected_bprop_wrt_max, - {0.0f, 0.0f, grad_flat(5)}); + FillValues(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)}); ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max); } @@ -2270,14 +2265,13 @@ TEST_F(QuantOpsTest, Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({1, 2, 3, 4})); auto grad_flat = GetInput(0).flat(); - FillValues( - &expected_bprop_wrt_input, - {0.0f, grad_flat(1), grad_flat(2), 0.0f, - 0.0f, grad_flat(5), grad_flat(6), 0.0f, - 0.0f, grad_flat(9), grad_flat(10), 0.0f, - 0.0f, grad_flat(13), grad_flat(14), 0.0f, - 0.0f, grad_flat(17), grad_flat(18), 0.0f, - 0.0f, grad_flat(21), grad_flat(22), 0.0f}); + FillValues(&expected_bprop_wrt_input, + {0.0f, grad_flat(1), grad_flat(2), 0.0f, + 0.0f, grad_flat(5), grad_flat(6), 0.0f, + 0.0f, grad_flat(9), grad_flat(10), 0.0f, + 0.0f, grad_flat(13), grad_flat(14), 0.0f, + 0.0f, grad_flat(17), grad_flat(18), 0.0f, + 0.0f, grad_flat(21), grad_flat(22), 0.0f}); ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input); Tensor* output_bprop_wrt_min = GetOutput(1); diff --git a/tensorflow/core/kernels/fifo_queue.cc b/tensorflow/core/kernels/fifo_queue.cc index 82ec8791198..479f7be4b50 100644 --- a/tensorflow/core/kernels/fifo_queue.cc +++ b/tensorflow/core/kernels/fifo_queue.cc @@ -255,97 +255,96 @@ void FIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx, // TODO(josh11b): This makes two copies of callback, avoid this if possible. dequeue_attempts_.emplace_back( num_elements, [callback]() { callback(Tuple()); }, ctx, cm, token, - [callback, allow_small_batch, this](Attempt* attempt) - EXCLUSIVE_LOCKS_REQUIRED(mu_) { - int64 queue_size = queues_[0].size(); + [callback, allow_small_batch, + this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + int64 queue_size = queues_[0].size(); - if (closed_ && queue_size < attempt->elements_requested) { - // If we don't have enough for a full dequeue, we have - // to reset the attempt tuple. 
- if (!attempt->tuple.empty()) { - // Restore already-dequeued elements to the front of the - // queue. - for (int64 i = attempt->tuple[0].dim_size(0) - - attempt->elements_requested - 1; - i >= 0; --i) { - for (int j = 0; j < num_components(); ++j) { - PersistentTensor element; - Status s = GetElementComponentFromBatch( - attempt->tuple, i, j, attempt->context, &element); - if (!s.ok()) { - attempt->context->SetStatus( - errors::DataLoss("Failed to restore element from " - "partially-dequeued batch " - "to FIFOQueue: ", - s.error_message())); - } - queues_[j].push_front(element); - } - } - } - if (allow_small_batch && !queues_[0].empty()) { - // Request all remaining elements in the queue. - queue_size = queues_[0].size(); - attempt->tuple.clear(); - attempt->elements_requested = queue_size; - } else { - if (allow_small_batch) { - // There may be some other attempts containing - // values. If so, we'll yield and wait for them - // to add elements to the queue. - if (!enqueue_attempts_.empty()) return kProgress; - } - if (attempt->context->status().ok()) { - attempt->context->SetStatus(errors::OutOfRange( - "FIFOQueue '", name_, "' is closed and has ", - "insufficient elements (requested ", - attempt->elements_requested, ", current size ", - queue_size, ")")); - } - return kComplete; - } - } - - RunResult result = kNoProgress; - for (; queue_size > 0; --queue_size) { - if (attempt->tuple.empty()) { - // Only allocate tuple when we have something to dequeue - // so we don't use excessive memory when there are many - // blocked dequeue attempts waiting. - attempt->tuple.reserve(num_components()); - for (int i = 0; i < num_components(); ++i) { - const TensorShape shape = - ManyOutShape(i, attempt->elements_requested); - Tensor element; + if (closed_ && queue_size < attempt->elements_requested) { + // If we don't have enough for a full dequeue, we have + // to reset the attempt tuple. + if (!attempt->tuple.empty()) { + // Restore already-dequeued elements to the front of the + // queue. + for (int64 i = attempt->tuple[0].dim_size(0) - + attempt->elements_requested - 1; + i >= 0; --i) { + for (int j = 0; j < num_components(); ++j) { + PersistentTensor element; + Status s = GetElementComponentFromBatch( + attempt->tuple, i, j, attempt->context, &element); + if (!s.ok()) { attempt->context->SetStatus( - attempt->context->allocate_temp(component_dtypes_[i], - shape, &element)); - if (!attempt->context->status().ok()) return kComplete; - attempt->tuple.emplace_back(element); + errors::DataLoss("Failed to restore element from " + "partially-dequeued batch " + "to FIFOQueue: ", + s.error_message())); } - } - result = kProgress; - Tuple tuple; - DequeueLocked(attempt->context, &tuple); - const int64 index = attempt->tuple[0].dim_size(0) - - attempt->elements_requested; - for (int i = 0; i < num_components(); ++i) { - attempt->context->SetStatus(batch_util::CopyElementToSlice( - std::move(tuple[i]), &attempt->tuple[i], index)); - if (!attempt->context->status().ok()) return kComplete; - } - tuple.clear(); - --attempt->elements_requested; - if (attempt->elements_requested == 0) { - tuple = attempt->tuple; - attempt->done_callback = [callback, tuple]() { - callback(tuple); - }; - return kComplete; + queues_[j].push_front(element); } } - return result; - }); + } + if (allow_small_batch && !queues_[0].empty()) { + // Request all remaining elements in the queue. 
+ queue_size = queues_[0].size(); + attempt->tuple.clear(); + attempt->elements_requested = queue_size; + } else { + if (allow_small_batch) { + // There may be some other attempts containing + // values. If so, we'll yield and wait for them + // to add elements to the queue. + if (!enqueue_attempts_.empty()) return kProgress; + } + if (attempt->context->status().ok()) { + attempt->context->SetStatus(errors::OutOfRange( + "FIFOQueue '", name_, "' is closed and has ", + "insufficient elements (requested ", + attempt->elements_requested, ", current size ", + queue_size, ")")); + } + return kComplete; + } + } + + RunResult result = kNoProgress; + for (; queue_size > 0; --queue_size) { + if (attempt->tuple.empty()) { + // Only allocate tuple when we have something to dequeue + // so we don't use excessive memory when there are many + // blocked dequeue attempts waiting. + attempt->tuple.reserve(num_components()); + for (int i = 0; i < num_components(); ++i) { + const TensorShape shape = + ManyOutShape(i, attempt->elements_requested); + Tensor element; + attempt->context->SetStatus(attempt->context->allocate_temp( + component_dtypes_[i], shape, &element)); + if (!attempt->context->status().ok()) return kComplete; + attempt->tuple.emplace_back(element); + } + } + result = kProgress; + Tuple tuple; + DequeueLocked(attempt->context, &tuple); + const int64 index = + attempt->tuple[0].dim_size(0) - attempt->elements_requested; + for (int i = 0; i < num_components(); ++i) { + attempt->context->SetStatus(batch_util::CopyElementToSlice( + std::move(tuple[i]), &attempt->tuple[i], index)); + if (!attempt->context->status().ok()) return kComplete; + } + tuple.clear(); + --attempt->elements_requested; + if (attempt->elements_requested == 0) { + tuple = attempt->tuple; + attempt->done_callback = [callback, tuple]() { + callback(tuple); + }; + return kComplete; + } + } + return result; + }); } } if (!already_cancelled) { diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc index bde39770dee..7090417dfdb 100644 --- a/tensorflow/core/kernels/fill_functor.cc +++ b/tensorflow/core/kernels/fill_functor.cc @@ -18,8 +18,8 @@ limitations under the License. 
#define EIGEN_USE_THREADS #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/variant_encode_decode.h" @@ -60,7 +60,7 @@ DEFINE_SETZERO_CPU(Variant); template void SetZeroFunctor::operator()( const Eigen::SyclDevice& d, typename TTypes::Flat out) { - To32Bit(out).device(d) = To32Bit(out).constant(T(0)); + To32Bit(out).device(d) = To32Bit(out).constant(T(0)); } #define DEFINE_SETZERO_SYCL(T) \ @@ -118,7 +118,8 @@ DEFINE_SETONE_SYCL(double); template struct FillFunctor { - void operator()(const Eigen::ThreadPoolDevice& d, typename TTypes::Flat out, + void operator()(const Eigen::ThreadPoolDevice& d, + typename TTypes::Flat out, typename TTypes::ConstScalar in) { out.device(d) = out.constant(in()); } @@ -150,8 +151,7 @@ struct FillFunctor { } }; -#define DEFINE_FILL_SYCL(T) \ - template struct FillFunctor; +#define DEFINE_FILL_SYCL(T) template struct FillFunctor; DEFINE_FILL_SYCL(float); DEFINE_FILL_SYCL(double); TF_CALL_INTEGRAL_TYPES(DEFINE_FILL_SYCL) diff --git a/tensorflow/core/kernels/fractional_avg_pool_op.cc b/tensorflow/core/kernels/fractional_avg_pool_op.cc index 47f4189c30f..135d0023458 100644 --- a/tensorflow/core/kernels/fractional_avg_pool_op.cc +++ b/tensorflow/core/kernels/fractional_avg_pool_op.cc @@ -232,8 +232,9 @@ class FractionalAvgPoolGradOp : public OpKernel { // Grab the inputs. const Tensor& orig_input_tensor_shape = context->input(0); - OP_REQUIRES(context, orig_input_tensor_shape.dims() == 1 && - orig_input_tensor_shape.NumElements() == 4, + OP_REQUIRES(context, + orig_input_tensor_shape.dims() == 1 && + orig_input_tensor_shape.NumElements() == 4, errors::InvalidArgument("original input tensor shape must be" "1-dimensional and 4 elements")); const Tensor& out_backprop = context->input(1); diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc index ef9e8484132..9d4bc35ba89 100644 --- a/tensorflow/core/kernels/function_ops.cc +++ b/tensorflow/core/kernels/function_ops.cc @@ -253,22 +253,21 @@ class SymbolicGradientOp : public AsyncOpKernel { args.push_back(ctx->input(i)); } std::vector* rets = new std::vector; - lib->Run( - opts, handle, args, rets, [ctx, done, rets](const Status& status) { - if (!status.ok()) { - ctx->SetStatus(status); - } else if (rets->size() != ctx->num_outputs()) { - ctx->SetStatus(errors::InvalidArgument( - "SymGrad expects to return ", ctx->num_outputs(), - " tensor(s), but get ", rets->size(), " tensor(s) instead.")); - } else { - for (size_t i = 0; i < rets->size(); ++i) { - ctx->set_output(i, (*rets)[i]); - } - } - delete rets; - done(); - }); + lib->Run(opts, handle, args, rets, [ctx, done, rets](const Status& status) { + if (!status.ok()) { + ctx->SetStatus(status); + } else if (rets->size() != ctx->num_outputs()) { + ctx->SetStatus(errors::InvalidArgument( + "SymGrad expects to return ", ctx->num_outputs(), + " tensor(s), but get ", rets->size(), " tensor(s) instead.")); + } else { + for (size_t i = 0; i < rets->size(); ++i) { + ctx->set_output(i, (*rets)[i]); + } + } + delete rets; + done(); + }); } private: diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc index a8484390b92..4a67b2b3a30 100644 --- a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc +++ 
b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc @@ -68,7 +68,8 @@ void InvVarianceToVariance::operator()(const Eigen::GpuDevice& d, template void SetNanFunctor::operator()(const Eigen::GpuDevice& d, typename TTypes::Flat out) { - To32Bit(out).device(d) = To32Bit(out).constant(Eigen::NumTraits::quiet_NaN()); + To32Bit(out).device(d) = + To32Bit(out).constant(Eigen::NumTraits::quiet_NaN()); } template class VarianceToInvVariance; diff --git a/tensorflow/core/kernels/gather_functor.cc b/tensorflow/core/kernels/gather_functor.cc index dde08b37eac..e6fefe643b7 100644 --- a/tensorflow/core/kernels/gather_functor.cc +++ b/tensorflow/core/kernels/gather_functor.cc @@ -25,12 +25,12 @@ typedef Eigen::GpuDevice GPUDevice; namespace functor { // Forward declarations of the functor specializations for GPU. -#define DECLARE_GPU_SPECS_INDEX(T, Index) \ - template <> \ - int64 GatherFunctor::operator()( \ +#define DECLARE_GPU_SPECS_INDEX(T, Index) \ + template <> \ + int64 GatherFunctor::operator()( \ OpKernelContext* ctx, typename TTypes::ConstTensor Tparams, \ - typename TTypes::ConstFlat Tindices, \ - typename TTypes::Tensor Tout); \ + typename TTypes::ConstFlat Tindices, \ + typename TTypes::Tensor Tout); \ extern template struct GatherFunctor; #define DECLARE_GPU_SPECS(T) \ diff --git a/tensorflow/core/kernels/gather_functor.h b/tensorflow/core/kernels/gather_functor.h index 1e429a037e8..16ccb03b850 100644 --- a/tensorflow/core/kernels/gather_functor.h +++ b/tensorflow/core/kernels/gather_functor.h @@ -18,12 +18,12 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/type_traits.h" #include "tensorflow/core/kernels/bounds_check.h" #include "tensorflow/core/platform/prefetch.h" #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/util/work_sharder.h" namespace tensorflow { @@ -52,21 +52,23 @@ SliceIndex HandleCopies(OpKernelContext* ctx, const size_t slice_bytes = slice_elems * sizeof(T); auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); mutex mu; - // Store the value of invalidate index for printing error information, it's a shared variable. + // Store the value of invalidate index for printing error information, it's a + // shared variable. SliceIndex result = -1; - auto work = [&] (int64 start, int64 end) { + auto work = [&](int64 start, int64 end) { SliceIndex batch_idx = static_cast(start / indices_size); SliceIndex indices_idx = static_cast(start % indices_size); SliceIndex batch_idx_end = static_cast(end / indices_size); SliceIndex indices_idx_end = static_cast(end % indices_size); while ((batch_idx < batch_idx_end) || - (batch_idx == batch_idx_end && indices_idx < indices_idx_end)) { + (batch_idx == batch_idx_end && indices_idx < indices_idx_end)) { SliceIndex i_next = indices_idx + 1; SliceIndex b_next = batch_idx + 1; if ((batch_idx == batch_idx_end && i_next < indices_idx_end) || - (i_next < indices_size)) { - port::prefetch(¶ms(batch_idx, indices(i_next), 0)); + (i_next < indices_size)) { + port::prefetch( + ¶ms(batch_idx, indices(i_next), 0)); port::prefetch(&out(batch_idx, i_next, 0)); b_next = batch_idx; } else if (b_next <= batch_idx_end) { @@ -85,11 +87,12 @@ SliceIndex HandleCopies(OpKernelContext* ctx, // ahead-of-time compilation binary size). 
if (is_simple_type::value) { // Avoid auto-promotion to Index from SliceIndex by casting. - memcpy(out_base + (batch_idx * indices_size + indices_idx) * slice_elems, - params_base + (batch_idx * static_cast(limit) + - static_cast(index)) * - slice_elems, - slice_bytes); + memcpy( + out_base + (batch_idx * indices_size + indices_idx) * slice_elems, + params_base + (batch_idx * static_cast(limit) + + static_cast(index)) * + slice_elems, + slice_bytes); } else { // For non-"simple" types (e.g. strings). out.template chip<1>(indices_idx) = params.template chip<1>(index); @@ -99,8 +102,8 @@ SliceIndex HandleCopies(OpKernelContext* ctx, } }; - Shard(worker_threads->num_threads, worker_threads->workers, batch_size*indices_size, - slice_elems * sizeof(T), work); + Shard(worker_threads->num_threads, worker_threads->workers, + batch_size * indices_size, slice_elems * sizeof(T), work); return result; } @@ -117,16 +120,16 @@ struct GatherFunctorCPU { bool use_large = (slice_size > std::numeric_limits::max() || params.size() > std::numeric_limits::max() || N > std::numeric_limits::max()); -#define CALL(elems) \ - do { \ - if (use_large) { \ - bad_i = HandleCopies(ctx, params, indices, \ - slice_size, out); \ - } else { \ - const int32 small_slice = static_cast(slice_size); \ - bad_i = HandleCopies(ctx, params, indices, \ - small_slice, out); \ - } \ +#define CALL(elems) \ + do { \ + if (use_large) { \ + bad_i = HandleCopies(ctx, params, indices, \ + slice_size, out); \ + } else { \ + const int32 small_slice = static_cast(slice_size); \ + bad_i = HandleCopies(ctx, params, indices, \ + small_slice, out); \ + } \ } while (0) if (slice_size == 10) @@ -143,7 +146,8 @@ struct GatherFunctorCPU { template struct GatherFunctor { - int64 operator()(OpKernelContext* ctx, typename TTypes::ConstTensor params, + int64 operator()(OpKernelContext* ctx, + typename TTypes::ConstTensor params, typename TTypes::ConstFlat indices, typename TTypes::Tensor out); }; diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc index 239d5d2e990..d6cbcf1d936 100644 --- a/tensorflow/core/kernels/gather_op.cc +++ b/tensorflow/core/kernels/gather_op.cc @@ -106,8 +106,7 @@ class GatherOp : public OpKernel { auto out_flat = out->shaped({outer_size, N, inner_size}); functor::GatherFunctor functor; - int64 bad_i = functor(c, params_flat, - indices_flat, out_flat); + int64 bad_i = functor(c, params_flat, indices_flat, out_flat); OP_REQUIRES( c, bad_i < 0, diff --git a/tensorflow/core/kernels/hinge-loss.h b/tensorflow/core/kernels/hinge-loss.h index 789a7ce7a3d..d303e9c877e 100644 --- a/tensorflow/core/kernels/hinge-loss.h +++ b/tensorflow/core/kernels/hinge-loss.h @@ -50,9 +50,8 @@ class HingeLossUpdater : public DualLossUpdater { // valid value for new dual = 0 // c. new optimal value > 1.0. Then new optimal value should be set to 1.0. const double candidate_optimal_dual = - current_dual + - (label - wx) / - (num_loss_partitions * example_weight * weighted_example_norm); + current_dual + (label - wx) / (num_loss_partitions * example_weight * + weighted_example_norm); if (label * candidate_optimal_dual < 0) { return 0.0; } diff --git a/tensorflow/core/kernels/histogram_op_gpu.cu.cc b/tensorflow/core/kernels/histogram_op_gpu.cu.cc index c2bb958be8b..a88e9b0ddcd 100644 --- a/tensorflow/core/kernels/histogram_op_gpu.cu.cc +++ b/tensorflow/core/kernels/histogram_op_gpu.cu.cc @@ -17,16 +17,16 @@ limitations under the License. 
#define EIGEN_USE_GPU -#include "tensorflow/core/kernels/histogram_op.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "external/cub_archive/cub/device/device_histogram.cuh" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/histogram_op.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/cuda_kernel_helper.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { @@ -104,8 +104,8 @@ struct HistogramFixedWidthFunctor { /* num_samples */ num_samples, /* stream */ stream); if (err != cudaSuccess) { - return errors::Internal("Could not launch HistogramRange: ", - cudaGetErrorString(err), "."); + return errors::Internal( + "Could not launch HistogramRange: ", cudaGetErrorString(err), "."); } return Status::OK(); diff --git a/tensorflow/core/kernels/image_resizer_state.h b/tensorflow/core/kernels/image_resizer_state.h index f088315ff53..faf997be05c 100644 --- a/tensorflow/core/kernels/image_resizer_state.h +++ b/tensorflow/core/kernels/image_resizer_state.h @@ -109,8 +109,9 @@ struct ImageResizerState { ValidateAndCalculateOutputSize(context, input); if (!context->status().ok()) return; OP_REQUIRES_OK(context, context->allocate_output( - 0, TensorShape({input.dim_size(0), out_height, - out_width, input.dim_size(3)}), + 0, + TensorShape({input.dim_size(0), out_height, + out_width, input.dim_size(3)}), &output)); } @@ -168,8 +169,9 @@ struct ImageResizerGradientState { CalculateResizeScale(original_width, resized_width, align_corners_); output = nullptr; OP_REQUIRES_OK(context, context->allocate_output( - 0, TensorShape({batch_size, original_height, - original_width, channels}), + 0, + TensorShape({batch_size, original_height, + original_width, channels}), &output)); } diff --git a/tensorflow/core/kernels/in_topk_op.cc b/tensorflow/core/kernels/in_topk_op.cc index e2861ae090c..c37055239c2 100644 --- a/tensorflow/core/kernels/in_topk_op.cc +++ b/tensorflow/core/kernels/in_topk_op.cc @@ -17,11 +17,11 @@ limitations under the License. 
#define EIGEN_USE_THREADS +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/kernels/bounds_check.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { @@ -98,36 +98,36 @@ class InTopK : public OpKernel { int k_; }; -REGISTER_KERNEL_BUILDER( - Name("InTopK").Device(DEVICE_CPU) - .HostMemory("predictions") - .HostMemory("targets") - .HostMemory("precision") - .TypeConstraint("T"), - InTopK); -REGISTER_KERNEL_BUILDER( - Name("InTopK").Device(DEVICE_CPU) - .HostMemory("predictions") - .HostMemory("targets") - .HostMemory("precision") - .TypeConstraint("T"), - InTopK); +REGISTER_KERNEL_BUILDER(Name("InTopK") + .Device(DEVICE_CPU) + .HostMemory("predictions") + .HostMemory("targets") + .HostMemory("precision") + .TypeConstraint("T"), + InTopK); +REGISTER_KERNEL_BUILDER(Name("InTopK") + .Device(DEVICE_CPU) + .HostMemory("predictions") + .HostMemory("targets") + .HostMemory("precision") + .TypeConstraint("T"), + InTopK); -REGISTER_KERNEL_BUILDER( - Name("InTopKV2").Device(DEVICE_CPU) - .HostMemory("predictions") - .HostMemory("targets") - .HostMemory("k") - .HostMemory("precision") - .TypeConstraint("T"), - InTopK); -REGISTER_KERNEL_BUILDER( - Name("InTopKV2").Device(DEVICE_CPU) - .HostMemory("predictions") - .HostMemory("targets") - .HostMemory("k") - .HostMemory("precision") - .TypeConstraint("T"), - InTopK); +REGISTER_KERNEL_BUILDER(Name("InTopKV2") + .Device(DEVICE_CPU) + .HostMemory("predictions") + .HostMemory("targets") + .HostMemory("k") + .HostMemory("precision") + .TypeConstraint("T"), + InTopK); +REGISTER_KERNEL_BUILDER(Name("InTopKV2") + .Device(DEVICE_CPU) + .HostMemory("predictions") + .HostMemory("targets") + .HostMemory("k") + .HostMemory("precision") + .TypeConstraint("T"), + InTopK); } // namespace tensorflow diff --git a/tensorflow/core/kernels/inplace_ops.cc b/tensorflow/core/kernels/inplace_ops.cc index 7728ba850c9..a71d047ed1a 100644 --- a/tensorflow/core/kernels/inplace_ops.cc +++ b/tensorflow/core/kernels/inplace_ops.cc @@ -27,13 +27,13 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SyclDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL namespace functor { template -Status DoParallelConcatUpdate(const Device& d, const Tensor& value, - int32 loc, Tensor* output) { +Status DoParallelConcatUpdate(const Device& d, const Tensor& value, int32 loc, + Tensor* output) { auto Tvalue = value.shaped({1, value.NumElements()}); auto Toutput = output->flat_outer_dims(); auto nrows = Toutput.dimension(0); @@ -74,7 +74,7 @@ Status DoParallelConcat(const SyclDevice& d, const Tensor& value, int32 loc, return errors::InvalidArgument("Unsupported data type: ", value.dtype()); } } -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // end namespace functor @@ -207,7 +207,7 @@ REGISTER_KERNEL_BUILDER(Name("_ParallelConcatUpdate") .HostMemory("output") .TypeConstraint("T"), ParallelConcatUpdate); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA diff --git a/tensorflow/core/kernels/l2loss_op.cc b/tensorflow/core/kernels/l2loss_op.cc index f8ed9351579..f561287f7a1 100644 --- a/tensorflow/core/kernels/l2loss_op.cc +++ b/tensorflow/core/kernels/l2loss_op.cc @@ -17,8 +17,8 @@ limitations under the License. 
#define EIGEN_USE_THREADS -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/kernels/l2loss_op.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" diff --git a/tensorflow/core/kernels/linalg_ops_common.cc b/tensorflow/core/kernels/linalg_ops_common.cc index 36907fb5716..b58bcf58348 100644 --- a/tensorflow/core/kernels/linalg_ops_common.cc +++ b/tensorflow/core/kernels/linalg_ops_common.cc @@ -108,7 +108,6 @@ void LinearAlgebraOp::Compute(OpKernelContext* context) { auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); Shard(worker_threads.num_threads, worker_threads.workers, batch_shape.num_elements(), GetCostPerUnit(input_matrix_shapes), shard); - } template diff --git a/tensorflow/core/kernels/lmdb_reader_op.cc b/tensorflow/core/kernels/lmdb_reader_op.cc index 31a427f2c90..1335a95dceb 100755 --- a/tensorflow/core/kernels/lmdb_reader_op.cc +++ b/tensorflow/core/kernels/lmdb_reader_op.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/framework/reader_op_kernel.h" #include "tensorflow/core/framework/reader_base.h" +#include "tensorflow/core/framework/reader_op_kernel.h" #include "tensorflow/core/lib/core/errors.h" #include @@ -77,15 +77,13 @@ class LMDBReader : public ReaderBase { *at_end = true; return Status::OK(); } - } - else { + } else { if (Seek(MDB_NEXT) == false) { *at_end = true; return Status::OK(); } } - *key = string(static_cast(mdb_key_.mv_data), - mdb_key_.mv_size); + *key = string(static_cast(mdb_key_.mv_data), mdb_key_.mv_size); *value = string(static_cast(mdb_value_.mv_data), mdb_value_.mv_size); *produced = true; @@ -123,13 +121,10 @@ class LMDBReaderOp : public ReaderOpKernel { explicit LMDBReaderOp(OpKernelConstruction* context) : ReaderOpKernel(context) { Env* env = context->env(); - SetReaderFactory([this, env]() { - return new LMDBReader(name(), env); - }); + SetReaderFactory([this, env]() { return new LMDBReader(name(), env); }); } }; -REGISTER_KERNEL_BUILDER(Name("LMDBReader").Device(DEVICE_CPU), - LMDBReaderOp); +REGISTER_KERNEL_BUILDER(Name("LMDBReader").Device(DEVICE_CPU), LMDBReaderOp); } // namespace tensorflow diff --git a/tensorflow/core/kernels/logistic-loss.h b/tensorflow/core/kernels/logistic-loss.h index 2765f42bbdc..6479e6f5dc3 100644 --- a/tensorflow/core/kernels/logistic-loss.h +++ b/tensorflow/core/kernels/logistic-loss.h @@ -122,10 +122,9 @@ class LogisticLossUpdater : public DualLossUpdater { num_loss_partitions * weighted_example_norm * example_weight * (0.5 * (1 + tanhx) / label - current_dual); - const double denominator = -2 * label - - num_loss_partitions * weighted_example_norm * - example_weight * (1 - tanhx * tanhx) * 0.5 / - label; + const double denominator = + -2 * label - num_loss_partitions * weighted_example_norm * + example_weight * (1 - tanhx * tanhx) * 0.5 / label; return x - numerator / denominator; } }; diff --git a/tensorflow/core/kernels/loss_test.cc b/tensorflow/core/kernels/loss_test.cc index 89f0677e1f5..460d65c5c27 100644 --- a/tensorflow/core/kernels/loss_test.cc +++ b/tensorflow/core/kernels/loss_test.cc @@ -32,14 +32,17 @@ namespace { TEST(LogisticLoss, ComputePrimalLoss) { LogisticLossUpdater loss_updater; - 
EXPECT_NEAR(0.693147, loss_updater.ComputePrimalLoss( - 0 /* wx */, 1 /* label */, 1 /* example weight */), + EXPECT_NEAR(0.693147, + loss_updater.ComputePrimalLoss(0 /* wx */, 1 /* label */, + 1 /* example weight */), 1e-3); - EXPECT_NEAR(0.0, loss_updater.ComputePrimalLoss(70 /* wx */, 1 /* label */, - 1 /* example weight */), + EXPECT_NEAR(0.0, + loss_updater.ComputePrimalLoss(70 /* wx */, 1 /* label */, + 1 /* example weight */), 1e-3); - EXPECT_NEAR(0.0, loss_updater.ComputePrimalLoss(-70 /* wx */, -1 /* label */, - 1 /* example weight */), + EXPECT_NEAR(0.0, + loss_updater.ComputePrimalLoss(-70 /* wx */, -1 /* label */, + 1 /* example weight */), 1e-3); } @@ -53,31 +56,35 @@ TEST(LogisticLoss, ComputeDualLoss) { loss_updater.ComputeDualLoss(1 /* current dual */, 1 /* label */, 1 /* example weight */), 1e-3); - EXPECT_NEAR(-0.693147, loss_updater.ComputeDualLoss(0.5 /* current dual */, - 1 /* label */, - 1 /* example weight */), - 1e-3); + EXPECT_NEAR( + -0.693147, + loss_updater.ComputeDualLoss(0.5 /* current dual */, 1 /* label */, + 1 /* example weight */), + 1e-3); } TEST(LogisticLoss, ComputeUpdatedDual) { LogisticLossUpdater loss_updater; - EXPECT_NEAR(0.479, loss_updater.ComputeUpdatedDual( - 1 /* num partitions */, 1.0 /* label */, - 1.0 /* example weight */, 0.5 /* current_dual */, - 0.3 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(0.479, + loss_updater.ComputeUpdatedDual( + 1 /* num partitions */, 1.0 /* label */, + 1.0 /* example weight */, 0.5 /* current_dual */, + 0.3 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); - EXPECT_NEAR(-0.031, loss_updater.ComputeUpdatedDual( - 2 /* num partitions */, -1.0 /* label */, - 1.0 /* example weight */, 0.1 /* current_dual */, - -0.8 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(-0.031, + loss_updater.ComputeUpdatedDual( + 2 /* num partitions */, -1.0 /* label */, + 1.0 /* example weight */, 0.1 /* current_dual */, + -0.8 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); } TEST(SquaredLoss, ComputePrimalLoss) { SquaredLossUpdater loss_updater; - EXPECT_NEAR(0.5, loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */, - 1.0 /* example weight */), + EXPECT_NEAR(0.5, + loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */, + 1.0 /* example weight */), 1e-3); EXPECT_NEAR(40.5, loss_updater.ComputePrimalLoss(10.0 /* wx */, 1.0 /* label */, @@ -95,43 +102,50 @@ TEST(SquaredLoss, ComputePrimalLoss) { TEST(SquaredLoss, ComputeDualLoss) { SquaredLossUpdater loss_updater; - EXPECT_NEAR(0.0, loss_updater.ComputeDualLoss(0.0 /* current dual */, - -1.0 /* label */, - 1.0 /* example weight */), - 1e-3); - EXPECT_NEAR(0.66, loss_updater.ComputeDualLoss(0.2 /* current dual */, - -1.0 /* label */, - 3.0 /* example weight */), - 1e-3); - EXPECT_NEAR(-0.375, loss_updater.ComputeDualLoss(1.5 /* current dual */, - 1.0 /* label */, - 1.0 /* example weight */), - 1e-3); - EXPECT_NEAR(-1.125, loss_updater.ComputeDualLoss(0.5 /* current dual */, - 1.0 /* label */, - 3.0 /* example weight */), - 1e-3); + EXPECT_NEAR( + 0.0, + loss_updater.ComputeDualLoss(0.0 /* current dual */, -1.0 /* label */, + 1.0 /* example weight */), + 1e-3); + EXPECT_NEAR( + 0.66, + loss_updater.ComputeDualLoss(0.2 /* current dual */, -1.0 /* label */, + 3.0 /* example weight */), + 1e-3); + EXPECT_NEAR( + -0.375, + loss_updater.ComputeDualLoss(1.5 /* current dual */, 1.0 /* label */, + 1.0 /* example weight */), + 1e-3); + EXPECT_NEAR( + -1.125, + loss_updater.ComputeDualLoss(0.5 /* current dual */, 1.0 /* label */, + 3.0 /* example weight 
*/), + 1e-3); } TEST(SquaredLoss, ComputeUpdatedDual) { SquaredLossUpdater loss_updater; - EXPECT_NEAR(0.336, loss_updater.ComputeUpdatedDual( - 1 /* num partitions */, 1.0 /* label */, - 1.0 /* example weight */, 0.3 /* current_dual */, - 0.3 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(0.336, + loss_updater.ComputeUpdatedDual( + 1 /* num partitions */, 1.0 /* label */, + 1.0 /* example weight */, 0.3 /* current_dual */, + 0.3 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); - EXPECT_NEAR(-0.427, loss_updater.ComputeUpdatedDual( - 5 /* num partitions */, -1.0 /* label */, - 1.0 /* example weight */, -0.4 /* current_dual */, - 0.8 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(-0.427, + loss_updater.ComputeUpdatedDual( + 5 /* num partitions */, -1.0 /* label */, + 1.0 /* example weight */, -0.4 /* current_dual */, + 0.8 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); } TEST(HingeLoss, ComputePrimalLoss) { HingeLossUpdater loss_updater; - EXPECT_NEAR(1.0, loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */, - 1.0 /* example weight */), + EXPECT_NEAR(1.0, + loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */, + 1.0 /* example weight */), 1e-3); EXPECT_NEAR(0.0, loss_updater.ComputePrimalLoss(10.0 /* wx */, 1.0 /* label */, @@ -149,10 +163,11 @@ TEST(HingeLoss, ComputePrimalLoss) { TEST(HingeLoss, ComputeDualLoss) { HingeLossUpdater loss_updater; - EXPECT_NEAR(0.0, loss_updater.ComputeDualLoss(0.0 /* current dual */, - -1.0 /* label */, - 1.0 /* example weight */), - 1e-3); + EXPECT_NEAR( + 0.0, + loss_updater.ComputeDualLoss(0.0 /* current dual */, -1.0 /* label */, + 1.0 /* example weight */), + 1e-3); EXPECT_NEAR( std::numeric_limits::max(), loss_updater.ComputeDualLoss(0.2 /* current dual */, -1.0 /* label */, @@ -163,10 +178,11 @@ TEST(HingeLoss, ComputeDualLoss) { loss_updater.ComputeDualLoss(1.5 /* current dual */, 1.0 /* label */, 1.0 /* example weight */), 1e-3); - EXPECT_NEAR(-1.5, loss_updater.ComputeDualLoss(0.5 /* current dual */, - 1.0 /* label */, - 3.0 /* example weight */), - 1e-3); + EXPECT_NEAR( + -1.5, + loss_updater.ComputeDualLoss(0.5 /* current dual */, 1.0 /* label */, + 3.0 /* example weight */), + 1e-3); } TEST(HingeLoss, ConvertLabel) { @@ -195,28 +211,31 @@ TEST(HingeLoss, ComputeUpdatedDual) { // weighted_example_norm=100.0, it turns out that the optimal value to update // the dual to is 0.507 which is within the permitted range and thus should be // the value returned. - EXPECT_NEAR(0.507, loss_updater.ComputeUpdatedDual( - 1 /* num partitions */, 1.0 /* label */, - 1.0 /* example weight */, 0.5 /* current_dual */, - 0.3 /* wx */, 100.0 /* weighted_example_norm */), + EXPECT_NEAR(0.507, + loss_updater.ComputeUpdatedDual( + 1 /* num partitions */, 1.0 /* label */, + 1.0 /* example weight */, 0.5 /* current_dual */, + 0.3 /* wx */, 100.0 /* weighted_example_norm */), 1e-3); // When label=-1.0, example_weight=1.0, current_dual=0.4, wx=0.6, // weighted_example_norm=10.0 and num_loss_partitions=10, it turns out that // the optimal value to update the dual to is 0.384 which is within the // permitted range and thus should be the value returned. 
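For readers cross-checking the reformatted expectation that follows, the test's -0.416 value drops straight out of the candidate-dual expression from the hinge-loss.h hunk earlier in this patch, using the arguments the test actually passes (current_dual = -0.4). The lines below are only a back-of-the-envelope check, not code from the test.

  // candidate = current_dual + (label - wx) /
  //             (num_loss_partitions * example_weight * weighted_example_norm)
  const double candidate = -0.4 + (-1.0 - 0.6) / (10 * 1.0 * 10.0);  // -0.416
  // -0.416 already lies inside the allowed [-1.0, 0.0] range for a negative
  // label, so ComputeUpdatedDual returns it unchanged.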
- EXPECT_NEAR(-0.416, loss_updater.ComputeUpdatedDual( - 10 /* num partitions */, -1.0 /* label */, - 1.0 /* example weight */, -0.4 /* current_dual */, - 0.6 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(-0.416, + loss_updater.ComputeUpdatedDual( + 10 /* num partitions */, -1.0 /* label */, + 1.0 /* example weight */, -0.4 /* current_dual */, + 0.6 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); // When label=1.0, example_weight=1.0, current_dual=-0.5, wx=0.3 and // weighted_example_norm=10.0, it turns out that the optimal value to update // the dual to is -0.43. However, this is outside the allowed [0.0, 1.0] range // and hence the closest permitted value (0.0) should be returned instead. - EXPECT_NEAR(0.0, loss_updater.ComputeUpdatedDual( - 1 /* num partitions */, 1.0 /* label */, - 1.0 /* example weight */, -0.5 /* current_dual */, - 0.3 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(0.0, + loss_updater.ComputeUpdatedDual( + 1 /* num partitions */, 1.0 /* label */, + 1.0 /* example weight */, -0.5 /* current_dual */, + 0.3 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); // When label=-1.0, example_weight=2.0, current_dual=-1.0, wx=0.3 and @@ -224,17 +243,19 @@ TEST(HingeLoss, ComputeUpdatedDual) { // the dual to is -1.065. However, this is outside the allowed [-1.0, 0.0] // range and hence the closest permitted value (-1.0) should be returned // instead. - EXPECT_NEAR(-1.0, loss_updater.ComputeUpdatedDual( - 1 /* num partitions */, -1.0 /* label */, - 2.0 /* example weight */, -1.0 /* current_dual */, - 0.3 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(-1.0, + loss_updater.ComputeUpdatedDual( + 1 /* num partitions */, -1.0 /* label */, + 2.0 /* example weight */, -1.0 /* current_dual */, + 0.3 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); } TEST(SmoothHingeLoss, ComputePrimalLoss) { SmoothHingeLossUpdater loss_updater; - EXPECT_NEAR(0.5, loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */, - 1.0 /* example weight */), + EXPECT_NEAR(0.5, + loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */, + 1.0 /* example weight */), 1e-3); EXPECT_NEAR(0.0, loss_updater.ComputePrimalLoss(10.0 /* wx */, 1.0 /* label */, @@ -252,10 +273,11 @@ TEST(SmoothHingeLoss, ComputePrimalLoss) { TEST(SmoothHingeLoss, ComputeDualLoss) { SmoothHingeLossUpdater loss_updater; - EXPECT_NEAR(0.0, loss_updater.ComputeDualLoss(0.0 /* current dual */, - -1.0 /* label */, - 1.0 /* example weight */), - 1e-3); + EXPECT_NEAR( + 0.0, + loss_updater.ComputeDualLoss(0.0 /* current dual */, -1.0 /* label */, + 1.0 /* example weight */), + 1e-3); EXPECT_NEAR( std::numeric_limits::max(), loss_updater.ComputeDualLoss(0.2 /* current dual */, -1.0 /* label */, @@ -266,24 +288,27 @@ TEST(SmoothHingeLoss, ComputeDualLoss) { loss_updater.ComputeDualLoss(1.5 /* current dual */, 1.0 /* label */, 1.0 /* example weight */), 1e-3); - EXPECT_NEAR(-1.125, loss_updater.ComputeDualLoss(0.5 /* current dual */, - 1.0 /* label */, - 3.0 /* example weight */), - 1e-3); + EXPECT_NEAR( + -1.125, + loss_updater.ComputeDualLoss(0.5 /* current dual */, 1.0 /* label */, + 3.0 /* example weight */), + 1e-3); } TEST(SmoothHingeLoss, ComputeUpdatedDual) { SmoothHingeLossUpdater loss_updater; - EXPECT_NEAR(0.336, loss_updater.ComputeUpdatedDual( - 1 /* num partitions */, 1.0 /* label */, - 1.0 /* example weight */, 0.3 /* current_dual */, - 0.3 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(0.336, + loss_updater.ComputeUpdatedDual( + 1 /* num partitions */, 1.0 /* label */, + 
1.0 /* example weight */, 0.3 /* current_dual */, + 0.3 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); - EXPECT_NEAR(-0.427, loss_updater.ComputeUpdatedDual( - 5 /* num partitions */, -1.0 /* label */, - 1.0 /* example weight */, -0.4 /* current_dual */, - 0.8 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(-0.427, + loss_updater.ComputeUpdatedDual( + 5 /* num partitions */, -1.0 /* label */, + 1.0 /* example weight */, -0.4 /* current_dual */, + 0.8 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); } diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc index c905ebc84a6..c3a59c95762 100644 --- a/tensorflow/core/kernels/lrn_op.cc +++ b/tensorflow/core/kernels/lrn_op.cc @@ -229,10 +229,11 @@ class LRNOp : public OpKernel { explicit LRNOp(OpKernelConstruction* context) : OpKernel(context) { int64 depth_radius64; OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64)); - OP_REQUIRES(context, FastBoundsCheck(depth_radius64, - std::numeric_limits::max()), - errors::InvalidArgument("depth_radius = ", depth_radius64, - " larger than int max")); + OP_REQUIRES( + context, + FastBoundsCheck(depth_radius64, std::numeric_limits::max()), + errors::InvalidArgument("depth_radius = ", depth_radius64, + " larger than int max")); depth_radius_ = static_cast(depth_radius64); float tmp; OP_REQUIRES_OK(context, context->GetAttr("bias", &tmp)); @@ -247,9 +248,10 @@ class LRNOp : public OpKernel { const Tensor& in = context->input(0); OP_REQUIRES(context, in.dims() == 4, errors::InvalidArgument("in must be 4-dimensional")); - OP_REQUIRES(context, FastBoundsCheck(in.NumElements(), - std::numeric_limits::max()), - errors::InvalidArgument("argument to LRN too large")); + OP_REQUIRES( + context, + FastBoundsCheck(in.NumElements(), std::numeric_limits::max()), + errors::InvalidArgument("argument to LRN too large")); // Cast to platform-specific int to avoid conversion warnings. 
const int batch = static_cast(in.dim_size(0)); const int rows = static_cast(in.dim_size(1)); @@ -448,10 +450,11 @@ class LRNGradOp : public OpKernel { explicit LRNGradOp(OpKernelConstruction* context) : OpKernel(context) { int64 depth_radius64; OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64)); - OP_REQUIRES(context, FastBoundsCheck(depth_radius64, - std::numeric_limits::max()), - errors::InvalidArgument("depth_radius = ", depth_radius64, - " larger than int max")); + OP_REQUIRES( + context, + FastBoundsCheck(depth_radius64, std::numeric_limits::max()), + errors::InvalidArgument("depth_radius = ", depth_radius64, + " larger than int max")); depth_radius_ = static_cast(depth_radius64); float tmp; OP_REQUIRES_OK(context, context->GetAttr("bias", &tmp)); diff --git a/tensorflow/core/kernels/matching_files_op.cc b/tensorflow/core/kernels/matching_files_op.cc index 5eb060f6641..cdff7bad5fe 100644 --- a/tensorflow/core/kernels/matching_files_op.cc +++ b/tensorflow/core/kernels/matching_files_op.cc @@ -45,15 +45,14 @@ class MatchingFilesOp : public OpKernel { int num_files = 0; std::vector> all_fnames(num_patterns); for (int i = 0; i < num_patterns; i++) { - OP_REQUIRES_OK( - context, - context->env()->GetMatchingPaths(patterns(i), &all_fnames[i])); + OP_REQUIRES_OK(context, context->env()->GetMatchingPaths(patterns(i), + &all_fnames[i])); num_files += all_fnames[i].size(); } Tensor* output_t = nullptr; - OP_REQUIRES_OK(context, - context->allocate_output( - "filenames", TensorShape({num_files}), &output_t)); + OP_REQUIRES_OK( + context, context->allocate_output("filenames", TensorShape({num_files}), + &output_t)); auto output = output_t->vec(); int index = 0; for (int i = 0; i < num_patterns; ++i) { diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc index cb68690f284..f499ce6519d 100644 --- a/tensorflow/core/kernels/matmul_op.cc +++ b/tensorflow/core/kernels/matmul_op.cc @@ -261,12 +261,12 @@ struct LaunchMatMul { std::vector* algorithms, bool use_autotune, Tensor* out) { using perftools::gputools::blas::AlgorithmConfig; using perftools::gputools::blas::ComputationType; - using perftools::gputools::blas::ProfileResult; - using perftools::gputools::blas::Transpose; using perftools::gputools::blas::kDefaultAlgorithm; using perftools::gputools::blas::kDefaultBlasGemm; using perftools::gputools::blas::kDefaultBlasGemv; using perftools::gputools::blas::kNoAlgorithm; + using perftools::gputools::blas::ProfileResult; + using perftools::gputools::blas::Transpose; Transpose trans[] = {Transpose::kNoTranspose, Transpose::kTranspose}; const uint64 m = a.dim_size(1 - dim_pair[0].first); const uint64 k = a.dim_size(dim_pair[0].first); diff --git a/tensorflow/core/kernels/matmul_op.h b/tensorflow/core/kernels/matmul_op.h index 6398da2fb95..628895ca86f 100644 --- a/tensorflow/core/kernels/matmul_op.h +++ b/tensorflow/core/kernels/matmul_op.h @@ -30,7 +30,8 @@ struct MatMulTypes { typedef Eigen::TensorMap, Eigen::Aligned> out_type; typedef Eigen::TensorMap, - Eigen::Aligned> in_type; + Eigen::Aligned> + in_type; }; template @@ -40,7 +39,8 @@ class MatrixExponentialOp : public LinearAlgebraOp { MatrixMaps* outputs) final { const ConstMatrixMap& input = inputs[0]; if (input.rows() == 0) return; - using Matrix = Eigen::Matrix; + using Matrix = + Eigen::Matrix; Matrix tmp = input; outputs->at(0) = tmp.exp(); } @@ -51,9 +51,9 @@ class MatrixExponentialOp : public LinearAlgebraOp { REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp), float); 
REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp), double); -REGISTER_LINALG_OP("MatrixExponential", - (MatrixExponentialOp), complex64); -REGISTER_LINALG_OP("MatrixExponential", - (MatrixExponentialOp), complex128); +REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp), + complex64); +REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp), + complex128); } // namespace tensorflow diff --git a/tensorflow/core/kernels/matrix_logarithm_op.cc b/tensorflow/core/kernels/matrix_logarithm_op.cc index cf0007b5b67..22ca094e243 100644 --- a/tensorflow/core/kernels/matrix_logarithm_op.cc +++ b/tensorflow/core/kernels/matrix_logarithm_op.cc @@ -26,7 +26,6 @@ limitations under the License. #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" - namespace tensorflow { template @@ -40,7 +39,8 @@ class MatrixLogarithmOp : public LinearAlgebraOp { MatrixMaps* outputs) final { const ConstMatrixMap& input = inputs[0]; if (input.rows() == 0) return; - using Matrix = Eigen::Matrix; + using Matrix = + Eigen::Matrix; Matrix tmp = input; outputs->at(0) = tmp.log(); } @@ -53,9 +53,9 @@ class MatrixLogarithmOp : public LinearAlgebraOp { // logarithm. If all eigenvalues are positive, then this returns the correct // logarithm, however checking for positive definiteness adds significant // overhead. Therefore at present we only register this Op for complex types. -REGISTER_LINALG_OP("MatrixLogarithm", - (MatrixLogarithmOp), complex64); -REGISTER_LINALG_OP("MatrixLogarithm", - (MatrixLogarithmOp), complex128); +REGISTER_LINALG_OP("MatrixLogarithm", (MatrixLogarithmOp), + complex64); +REGISTER_LINALG_OP("MatrixLogarithm", (MatrixLogarithmOp), + complex128); } // namespace tensorflow diff --git a/tensorflow/core/kernels/matrix_set_diag_op.cc b/tensorflow/core/kernels/matrix_set_diag_op.cc index 9dd665392bc..502d593474e 100644 --- a/tensorflow/core/kernels/matrix_set_diag_op.cc +++ b/tensorflow/core/kernels/matrix_set_diag_op.cc @@ -69,8 +69,8 @@ class MatrixSetDiagOp : public OpKernel { errors::InvalidArgument( "must have diagonal.shape == input.shape[:-2] + " "min(input.shape[-2:]), but received input shape: ", - input_shape.DebugString(), " and diagonal shape: ", - diag_shape.DebugString())); + input_shape.DebugString(), + " and diagonal shape: ", diag_shape.DebugString())); if (input.NumElements() == 0) { // This is a no-op. diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc index 2eefadad494..9be7408012b 100644 --- a/tensorflow/core/kernels/maxpooling_op.cc +++ b/tensorflow/core/kernels/maxpooling_op.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/core/kernels/maxpooling_op.h" #include +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -37,7 +38,6 @@ limitations under the License. #include "tensorflow/core/util/padding.h" #include "tensorflow/core/util/tensor_format.h" #include "tensorflow/core/util/use_cudnn.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #if GOOGLE_CUDA #include "tensorflow/core/kernels/maxpooling_op_gpu.h" @@ -89,7 +89,6 @@ static void SpatialMaxPoolWithArgMaxHelper( // max value. 
auto shard = [¶ms, &in_mat, &out_mat, &out_arg_max_mat, &input_backprop, &output_arg_max, &out_backprop](int64 start, int64 limit) { - const int32 depth = params.depth; const int32 in_rows = params.tensor_in_rows; const int32 in_cols = params.tensor_in_cols; @@ -180,7 +179,6 @@ static void SpatialMaxPoolWithArgMaxHelper( input_backprop_flat(input_backprop_index) += out_backprop_flat(index); } } - }; const int64 shard_cost = params.tensor_in_rows * params.tensor_in_cols * @@ -567,7 +565,7 @@ class MaxPoolingGradGradOp : public OpKernel { // tensor_out_as_matrix with the corresponding values in // top_diff_as_matrix. auto shard = [¶ms, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat]( - int64 start, int64 limit) { + int64 start, int64 limit) { const int32 depth = params.depth; const int32 in_rows = params.tensor_in_rows; const int32 in_cols = params.tensor_in_cols; diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc index f8daaca4c94..0c7a236b2ff 100644 --- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc +++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc @@ -450,10 +450,10 @@ bool MaxPoolBackwardWithArgmax::operator()( T* bottom_diff, const Eigen::GpuDevice& d) { const int kThreadsPerBlock = 1024; SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock, - kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff); + kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff); MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, d.stream()>>>( - output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff); + output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff); return d.ok(); } diff --git a/tensorflow/core/kernels/meta_support.cc b/tensorflow/core/kernels/meta_support.cc index 9fed01189fc..39e60c9fcef 100644 --- a/tensorflow/core/kernels/meta_support.cc +++ b/tensorflow/core/kernels/meta_support.cc @@ -98,9 +98,9 @@ typedef gemmlowp::meta::SimpleContext LocalContext; template void MultiThreadGemm(Context* context, const Params& params) { if (params.m <= 4) { - gemmlowp::meta::MultiThreadGemm< - Context, gemmlowp::meta::GemmExecutorPackLHSCacheFriendly<>, Params, - 1, 8, 8>(context, params); + gemmlowp::meta::MultiThreadGemm< + Context, gemmlowp::meta::GemmExecutorPackLHSCacheFriendly<>, Params, 1, + 8, 8>(context, params); } else { if (params.m >= params.n) { gemmlowp::meta::MultiThreadGemm< diff --git a/tensorflow/core/kernels/mfcc.cc b/tensorflow/core/kernels/mfcc.cc index 2793005aa26..8c755e0df87 100644 --- a/tensorflow/core/kernels/mfcc.cc +++ b/tensorflow/core/kernels/mfcc.cc @@ -27,21 +27,19 @@ const double kFilterbankFloor = 1e-12; const int kDefaultFilterbankChannelCount = 40; const int kDefaultDCTCoefficientCount = 13; -Mfcc::Mfcc() : initialized_(false), - lower_frequency_limit_(kDefaultLowerFrequencyLimit), - upper_frequency_limit_(kDefaultUpperFrequencyLimit), - filterbank_channel_count_(kDefaultFilterbankChannelCount), - dct_coefficient_count_(kDefaultDCTCoefficientCount) { } +Mfcc::Mfcc() + : initialized_(false), + lower_frequency_limit_(kDefaultLowerFrequencyLimit), + upper_frequency_limit_(kDefaultUpperFrequencyLimit), + filterbank_channel_count_(kDefaultFilterbankChannelCount), + dct_coefficient_count_(kDefaultDCTCoefficientCount) {} -bool Mfcc::Initialize(int input_length, - double input_sample_rate) { - bool initialized = mel_filterbank_.Initialize(input_length, - input_sample_rate, - filterbank_channel_count_, - 
lower_frequency_limit_, - upper_frequency_limit_); - initialized &= dct_.Initialize(filterbank_channel_count_, - dct_coefficient_count_); +bool Mfcc::Initialize(int input_length, double input_sample_rate) { + bool initialized = mel_filterbank_.Initialize( + input_length, input_sample_rate, filterbank_channel_count_, + lower_frequency_limit_, upper_frequency_limit_); + initialized &= + dct_.Initialize(filterbank_channel_count_, dct_coefficient_count_); initialized_ = initialized; return initialized; } diff --git a/tensorflow/core/kernels/mfcc.h b/tensorflow/core/kernels/mfcc.h index 8268f472034..8eee76f7f0c 100644 --- a/tensorflow/core/kernels/mfcc.h +++ b/tensorflow/core/kernels/mfcc.h @@ -20,18 +20,17 @@ limitations under the License. #include +#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/kernels/mfcc_dct.h" #include "tensorflow/core/kernels/mfcc_mel_filterbank.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/framework/op_kernel.h" namespace tensorflow { class Mfcc { public: Mfcc(); - bool Initialize(int input_length, - double input_sample_rate); + bool Initialize(int input_length, double input_sample_rate); // Input is a single squared-magnitude spectrogram frame. The input spectrum // is converted to linear magnitude and weighted into bands using a diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank.cc b/tensorflow/core/kernels/mfcc_mel_filterbank.cc index 630de8a5a33..3db3b51e8b6 100644 --- a/tensorflow/core/kernels/mfcc_mel_filterbank.cc +++ b/tensorflow/core/kernels/mfcc_mel_filterbank.cc @@ -38,13 +38,12 @@ namespace tensorflow { MfccMelFilterbank::MfccMelFilterbank() : initialized_(false) {} -bool MfccMelFilterbank::Initialize(int input_length, - double input_sample_rate, - int output_channel_count, - double lower_frequency_limit, - double upper_frequency_limit) { +bool MfccMelFilterbank::Initialize(int input_length, double input_sample_rate, + int output_channel_count, + double lower_frequency_limit, + double upper_frequency_limit) { num_channels_ = output_channel_count; - sample_rate_ = input_sample_rate; + sample_rate_ = input_sample_rate; input_length_ = input_length; if (num_channels_ < 1) { @@ -85,10 +84,9 @@ bool MfccMelFilterbank::Initialize(int input_length, } // Always exclude DC; emulate HTK. - const double hz_per_sbin = 0.5 * sample_rate_ / - static_cast(input_length_ - 1); - start_index_ = static_cast(1.5 + (lower_frequency_limit / - hz_per_sbin)); + const double hz_per_sbin = + 0.5 * sample_rate_ / static_cast(input_length_ - 1); + start_index_ = static_cast(1.5 + (lower_frequency_limit / hz_per_sbin)); end_index_ = static_cast(upper_frequency_limit / hz_per_sbin); // Maps the input spectrum bin indices to filter bank channels/indices. For @@ -121,12 +119,12 @@ bool MfccMelFilterbank::Initialize(int input_length, weights_[i] = 0.0; } else { if (channel >= 0) { - weights_[i] = (center_frequencies_[channel + 1] - - FreqToMel(i * hz_per_sbin)) / + weights_[i] = + (center_frequencies_[channel + 1] - FreqToMel(i * hz_per_sbin)) / (center_frequencies_[channel + 1] - center_frequencies_[channel]); } else { weights_[i] = (center_frequencies_[0] - FreqToMel(i * hz_per_sbin)) / - (center_frequencies_[0] - mel_low); + (center_frequencies_[0] - mel_low); } } } @@ -152,16 +150,16 @@ bool MfccMelFilterbank::Initialize(int input_length, } } if (!bad_channels.empty()) { - LOG(ERROR) << "Missing " << bad_channels.size() << " bands " << - " starting at " << bad_channels[0] << - " in mel-frequency design. 
" << - "Perhaps too many channels or " << - "not enough frequency resolution in spectrum. (" << - "input_length: " << input_length << - " input_sample_rate: " << input_sample_rate << - " output_channel_count: " << output_channel_count << - " lower_frequency_limit: " << lower_frequency_limit << - " upper_frequency_limit: " << upper_frequency_limit; + LOG(ERROR) << "Missing " << bad_channels.size() << " bands " + << " starting at " << bad_channels[0] + << " in mel-frequency design. " + << "Perhaps too many channels or " + << "not enough frequency resolution in spectrum. (" + << "input_length: " << input_length + << " input_sample_rate: " << input_sample_rate + << " output_channel_count: " << output_channel_count + << " lower_frequency_limit: " << lower_frequency_limit + << " upper_frequency_limit: " << upper_frequency_limit; } initialized_ = true; return true; @@ -171,7 +169,7 @@ bool MfccMelFilterbank::Initialize(int input_length, // square root, then summing FFT magnitudes under triangular integration windows // whose widths increase with frequency. void MfccMelFilterbank::Compute(const std::vector &input, - std::vector *output) const { + std::vector *output) const { if (!initialized_) { LOG(ERROR) << "Mel Filterbank not initialized."; return; diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank.h b/tensorflow/core/kernels/mfcc_mel_filterbank.h index 1bdc2dc93b8..37c3936e80d 100644 --- a/tensorflow/core/kernels/mfcc_mel_filterbank.h +++ b/tensorflow/core/kernels/mfcc_mel_filterbank.h @@ -27,10 +27,8 @@ class MfccMelFilterbank { public: MfccMelFilterbank(); bool Initialize(int input_length, // Number of unique FFT bins fftsize/2+1. - double input_sample_rate, - int output_channel_count, - double lower_frequency_limit, - double upper_frequency_limit); + double input_sample_rate, int output_channel_count, + double lower_frequency_limit, double upper_frequency_limit); // Takes a squared-magnitude spectrogram slice as input, computes a // triangular-mel-weighted linear-magnitude filterbank, and places the result @@ -56,7 +54,7 @@ class MfccMelFilterbank { // FFT bin i contributes to the upper side of mel channel band_mapper_[i] std::vector band_mapper_; int start_index_; // Lowest FFT bin used to calculate mel spectrum. - int end_index_; // Highest FFT bin used to calculate mel spectrum. + int end_index_; // Highest FFT bin used to calculate mel spectrum. 
TF_DISALLOW_COPY_AND_ASSIGN(MfccMelFilterbank); }; diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc b/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc index 602dfeb4e54..54f31e1699e 100644 --- a/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc +++ b/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc @@ -34,11 +34,9 @@ TEST(MfccMelFilterbankTest, AgreesWithPythonGoldenValues) { input.push_back(i + 1); } const int kChannelCount = 20; - filterbank.Initialize(input.size(), - 22050 /* sample rate */, - kChannelCount /* channels */, - 20.0 /* lower frequency limit */, - 4000.0 /* upper frequency limit */); + filterbank.Initialize( + input.size(), 22050 /* sample rate */, kChannelCount /* channels */, + 20.0 /* lower frequency limit */, 4000.0 /* upper frequency limit */); std::vector output; filterbank.Compute(input, &output); @@ -65,13 +63,10 @@ TEST(MfccMelFilterbankTest, IgnoresExistingContentOfOutputVector) { std::vector input; std::vector output; - filterbank.Initialize(kSampleCount, - 22050 /* sample rate */, - 20 /* channels */, - 20.0 /* lower frequency limit */, + filterbank.Initialize(kSampleCount, 22050 /* sample rate */, + 20 /* channels */, 20.0 /* lower frequency limit */, 4000.0 /* upper frequency limit */); - // First call with nonzero input value, and an empty output vector, // will resize the output and fill it with the correct, nonzero outputs. input.assign(kSampleCount, 1.0); diff --git a/tensorflow/core/kernels/mfcc_test.cc b/tensorflow/core/kernels/mfcc_test.cc index cb32df8811e..72c1d331d6e 100644 --- a/tensorflow/core/kernels/mfcc_test.cc +++ b/tensorflow/core/kernels/mfcc_test.cc @@ -36,11 +36,10 @@ TEST(MfccTest, AgreesWithPythonGoldenValues) { std::vector output; mfcc.Compute(input, &output); - std::vector expected = {29.13970072, -6.41568601, -0.61903012, - -0.96778652, -0.26819878, -0.40907028, - -0.15614748, -0.23203119, -0.10481487, - -0.1543029, -0.0769791, -0.10806114, - -0.06047613}; + std::vector expected = { + 29.13970072, -6.41568601, -0.61903012, -0.96778652, -0.26819878, + -0.40907028, -0.15614748, -0.23203119, -0.10481487, -0.1543029, + -0.0769791, -0.10806114, -0.06047613}; ASSERT_EQ(expected.size(), output.size()); for (int i = 0; i < output.size(); ++i) { diff --git a/tensorflow/core/kernels/mirror_pad_op.cc b/tensorflow/core/kernels/mirror_pad_op.cc index fbdeaf43ebb..26e1082989f 100644 --- a/tensorflow/core/kernels/mirror_pad_op.cc +++ b/tensorflow/core/kernels/mirror_pad_op.cc @@ -87,8 +87,8 @@ class MirrorPadOp : public OpKernel { const Tpaddings before = paddings(d, 0); // Pad before existing elements. const Tpaddings after = paddings(d, 1); // Pad after existing elements. OP_REQUIRES(context, before >= 0 && after >= 0, - errors::InvalidArgument("paddings must be non-negative: ", - before, " ", after)); + errors::InvalidArgument( + "paddings must be non-negative: ", before, " ", after)); if (offset_ == 0) { // SYMMETRIC mode. OP_REQUIRES(context, before <= in0.dim_size(d) && after <= in0.dim_size(d), @@ -296,8 +296,8 @@ class MirrorPadGradOp : public OpKernel { const Tpaddings before = paddings(d, 0); // Pad before existing elements. const Tpaddings after = paddings(d, 1); // Pad after existing elements. OP_REQUIRES(context, before >= 0 && after >= 0, - errors::InvalidArgument("Paddings must be non-negative: ", - before, ", ", after)); + errors::InvalidArgument( + "Paddings must be non-negative: ", before, ", ", after)); const int64 out_size = in0.dim_size(d) - (before + after); if (offset_ == 0) { // SYMMETRIC mode. 
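The MirrorPad hunks above only re-wrap the InvalidArgument messages, but the bounds they guard are easy to lose in the diff noise: SYMMETRIC mode (offset_ == 0) accepts a pad width up to the dimension size because the edge element is repeated, while REFLECT excludes the edge and accepts one less. The following stand-alone sketch of 1-D mirror padding is not taken from the kernel; MirrorPad1D and MirrorMode are illustrative names, and the offset arithmetic is only assumed to follow the kernel's offset_ convention (0 = SYMMETRIC, 1 = REFLECT). It shows where the two limits come from:

// Stand-alone illustration only: MirrorPad1D/MirrorMode are hypothetical
// names, not part of mirror_pad_op.cc; "mode" plays the role of offset_.
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <vector>

enum class MirrorMode { kSymmetric, kReflect };

std::vector<float> MirrorPad1D(const std::vector<float>& in, int64_t before,
                               int64_t after, MirrorMode mode) {
  const int64_t n = static_cast<int64_t>(in.size());
  const int64_t offset = (mode == MirrorMode::kReflect) ? 1 : 0;
  // SYMMETRIC repeats the edge, so a pad equal to the dimension size is
  // still valid; REFLECT skips the edge, so the largest legal pad is n - 1.
  if (before < 0 || after < 0 || before > n - offset || after > n - offset) {
    throw std::invalid_argument("paddings out of range for input size");
  }
  std::vector<float> out;
  out.reserve(static_cast<size_t>(n + before + after));
  for (int64_t i = -before; i < n + after; ++i) {
    int64_t src = i;
    if (src < 0) src = -src - 1 + offset;          // mirror across the left edge
    if (src >= n) src = 2 * n - src - 1 - offset;  // mirror across the right edge
    out.push_back(in[static_cast<size_t>(src)]);
  }
  return out;
}

int main() {
  const std::vector<float> x = {1, 2, 3};
  for (float v : MirrorPad1D(x, 2, 2, MirrorMode::kSymmetric)) std::cout << v << ' ';
  std::cout << '\n';  // 2 1 1 2 3 3 2
  for (float v : MirrorPad1D(x, 2, 2, MirrorMode::kReflect)) std::cout << v << ' ';
  std::cout << '\n';  // 3 2 1 2 3 2 1
  return 0;
}

Compiled as an ordinary C++ program, the two loops print "2 1 1 2 3 3 2" and "3 2 1 2 3 2 1", which is the same behavior as numpy's "symmetric" and "reflect" pad modes for that input.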
diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc index d751a70fc86..a7c569ee058 100644 --- a/tensorflow/core/kernels/mkl_avgpooling_op.cc +++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc @@ -26,14 +26,14 @@ #ifdef INTEL_MKL_DNN #include "mkldnn.hpp" -using mkldnn::memory; -using mkldnn::error; -using mkldnn::pooling_forward; -using mkldnn::pooling_backward; -using mkldnn::padding_kind; -using mkldnn::engine; -using mkldnn::prop_kind; using mkldnn::algorithm; +using mkldnn::engine; +using mkldnn::error; +using mkldnn::memory; +using mkldnn::padding_kind; +using mkldnn::pooling_backward; +using mkldnn::pooling_forward; +using mkldnn::prop_kind; #endif namespace tensorflow { @@ -358,10 +358,11 @@ class MklAvgPoolingGradOp : public OpKernel { if (!outbackprop_in_mkl_format) { // For avgpooling, tensor_in_shape should have 1 dimension, and 4 // elements. - OP_REQUIRES(context, tensor_in_shape.dims() == 1 && - tensor_in_shape.NumElements() == 4, - errors::InvalidArgument("original input shape must be " - "1-dimensional and 4 elements")); + OP_REQUIRES( + context, + tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4, + errors::InvalidArgument("original input shape must be " + "1-dimensional and 4 elements")); // For avgpooling, out_backprop should have 4 dimensions. OP_REQUIRES(context, out_backprop.dims() == 4, @@ -428,14 +429,13 @@ class MklAvgPoolingGradOp : public OpKernel { TensorFormat data_format_; }; // MklAvgPoolingGradOp - #else // INTEL_MKL_DNN is defined template class MklAvgPoolingOp : public MklPoolingForwardOpBase { public: explicit MklAvgPoolingOp(OpKernelConstruction* context) - : MklPoolingForwardOpBase(context) { + : MklPoolingForwardOpBase(context) { // Workspace is an MKLDNN construct that is only used in Max Pooling. // So set workspace_enabled_ to false. this->workspace_enabled_ = false; @@ -444,8 +444,8 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase { void Compute(OpKernelContext* context) override { try { auto cpu_engine = engine(engine::cpu, 0); - const Tensor& input_tensor = MklGetInput(context, - this->kInputTensorIndexInput); + const Tensor& input_tensor = + MklGetInput(context, this->kInputTensorIndexInput); MklDnnShape dnn_shape_input; GetMklShape(context, this->kInputTensorIndexInput, &dnn_shape_input); this->SanityCheckInput(context, input_tensor, dnn_shape_input); @@ -457,9 +457,8 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase { // initialize variables for the pooling op MklPoolParameters pool_params; // Get the input tensor and initialize the pooling parameters - this->ConfigureInput(context, dnn_shape_input, - input_tensor, &pool_params, - &dnn_data_input); + this->ConfigureInput(context, dnn_shape_input, input_tensor, &pool_params, + &dnn_data_input); OP_REQUIRES_OK(context, context->status()); // Declare output tensor @@ -470,56 +469,52 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase { // If input is in Mkl layout, then just get the memory format from it // directly, instead of using input data_format to AvgPool. 
if (dnn_shape_input.IsMklTensor()) { - dnn_data_output.SetUsrMem(output_dims_mkl_order, - static_cast(dnn_data_input.GetUsrMemDesc() - .data.format)); + dnn_data_output.SetUsrMem( + output_dims_mkl_order, + static_cast( + dnn_data_input.GetUsrMemDesc().data.format)); } else { - dnn_data_output.SetUsrMem(output_dims_mkl_order, - this->data_format_mkldnn_); + dnn_data_output.SetUsrMem(output_dims_mkl_order, + this->data_format_mkldnn_); } - // describe the memory layout + // describe the memory layout dnn_data_output.SetOpMemDesc(output_dims_mkl_order, memory::format::any); // 3. create a pooling primitive descriptor - auto pool_desc = pooling_forward::desc(prop_kind::forward, - algorithm::pooling_avg_exclude_padding, - dnn_data_input.GetUsrMemDesc(), - dnn_data_output.GetUsrMemDesc(), - memory::dims({ pool_params.row_stride, - pool_params.col_stride}), - memory::dims({ pool_params.window_rows, - pool_params.window_cols}), - memory::dims({ static_cast(pool_params.pad_top), - static_cast(pool_params.pad_left)}), - memory::dims({ static_cast(pool_params.pad_bottom), - static_cast(pool_params.pad_right)}), - TFPaddingToMklDnnPadding(this->padding_)); - auto pool_prim_desc = pooling_forward::primitive_desc(pool_desc, - cpu_engine); + auto pool_desc = pooling_forward::desc( + prop_kind::forward, algorithm::pooling_avg_exclude_padding, + dnn_data_input.GetUsrMemDesc(), dnn_data_output.GetUsrMemDesc(), + memory::dims({pool_params.row_stride, pool_params.col_stride}), + memory::dims({pool_params.window_rows, pool_params.window_cols}), + memory::dims({static_cast(pool_params.pad_top), + static_cast(pool_params.pad_left)}), + memory::dims({static_cast(pool_params.pad_bottom), + static_cast(pool_params.pad_right)}), + TFPaddingToMklDnnPadding(this->padding_)); + auto pool_prim_desc = + pooling_forward::primitive_desc(pool_desc, cpu_engine); this->AllocateOutputTensor(context, pool_prim_desc, output_dims_mkl_order, - this->data_format_mkldnn_, &output_tensor); + this->data_format_mkldnn_, &output_tensor); CHECK_NOTNULL(output_tensor); OP_REQUIRES_OK(context, context->status()); dnn_data_output.SetUsrMemDataHandle(output_tensor); - this->PrepareAndExecuteNet(pool_prim_desc, - &dnn_data_input, - &dnn_data_output); - } catch (mkldnn::error &e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + this->PrepareAndExecuteNet(pool_prim_desc, &dnn_data_input, + &dnn_data_output); + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } // Compute -}; // MklAvgPoolingOp +}; // MklAvgPoolingOp //----------------------------------------------------------------------------- @@ -527,27 +522,23 @@ template class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase { public: explicit MklAvgPoolingGradOp(OpKernelConstruction* context) - : MklPoolingBackwardOpBase(context) { - } + : MklPoolingBackwardOpBase(context) {} void Compute(OpKernelContext* context) override { try { auto cpu_engine = engine(engine::cpu, 0); MklDnnShape original_input_mkl_shape, input_gradient_mkl_shape; - const Tensor& tensor_in_shape = MklGetInput(context, - kInputTensorIndexInputShape); - const 
Tensor& input_gradient_tensor = MklGetInput(context, - kInputTensorIndexInputGradient); + const Tensor& tensor_in_shape = + MklGetInput(context, kInputTensorIndexInputShape); + const Tensor& input_gradient_tensor = + MklGetInput(context, kInputTensorIndexInputGradient); GetMklShape(context, kInputTensorIndexInputShape, - &original_input_mkl_shape); + &original_input_mkl_shape); GetMklShape(context, kInputTensorIndexInputGradient, - &input_gradient_mkl_shape); + &input_gradient_mkl_shape); - - SanityCheckInputs(context, tensor_in_shape, - input_gradient_tensor, - original_input_mkl_shape, - input_gradient_mkl_shape); + SanityCheckInputs(context, tensor_in_shape, input_gradient_tensor, + original_input_mkl_shape, input_gradient_mkl_shape); if (!context->status().ok()) return; // Used to allocate output_diff_src/diff_src @@ -562,90 +553,70 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase { MklPoolParameters pool_params; memory::dims output_dims_mkl_order, original_input_dims_nchw; // Configure the original input memory descriptor - memory::desc original_input_md = ConfigureOriginalInput(context, - tensor_in_shape, - original_input_mkl_shape, - &original_input_dims_nchw, - &pool_params, - &original_input_shape); + memory::desc original_input_md = ConfigureOriginalInput( + context, tensor_in_shape, original_input_mkl_shape, + &original_input_dims_nchw, &pool_params, &original_input_shape); // configure the original output memory descriptor // by definition, the shape of the original output is the same // as the shape of the gradient diff_dst memory::desc original_output_md = this->ConfigureOriginalOutput( - pool_params, input_gradient_mkl_shape, output_dims_mkl_order); + pool_params, input_gradient_mkl_shape, output_dims_mkl_order); memory::desc target_diff_dst_md = this->ConfigureInputGradient( - input_gradient_mkl_shape, - input_gradient_tensor, - &input_gradient_diff_dst, - original_output_md); + input_gradient_mkl_shape, input_gradient_tensor, + &input_gradient_diff_dst, original_output_md); // The shape of the output diff src needs to be the same shape as the // original input. But we will set its format to be same as the format of // input gradient. We won't use format of original input since it will // always be in Tensorflow layout (given that AvgPoolGrad gets shape of // the input rather than actual input). 
- output_diff_src.SetUsrMem(original_input_dims_nchw, - static_cast( - target_diff_dst_md.data.format)); + output_diff_src.SetUsrMem( + original_input_dims_nchw, + static_cast(target_diff_dst_md.data.format)); // Create the forward pooling primitive descriptor so we can reference it // in the backward pooling primitive descriptor - auto pool_fwd_desc = pooling_forward::desc(prop_kind::forward, - algorithm::pooling_avg_exclude_padding, - original_input_md, - original_output_md, - memory::dims({ pool_params.row_stride, - pool_params.col_stride}), - memory::dims({ pool_params.window_rows, - pool_params.window_cols}), - memory::dims({ static_cast(pool_params.pad_top), - static_cast(pool_params.pad_left)}), - memory::dims({ static_cast(pool_params.pad_bottom), - static_cast(pool_params.pad_right)}), - TFPaddingToMklDnnPadding(this->padding_)); - auto pool_fwd_prim_desc - = pooling_forward::primitive_desc(pool_fwd_desc, - cpu_engine); + auto pool_fwd_desc = pooling_forward::desc( + prop_kind::forward, algorithm::pooling_avg_exclude_padding, + original_input_md, original_output_md, + memory::dims({pool_params.row_stride, pool_params.col_stride}), + memory::dims({pool_params.window_rows, pool_params.window_cols}), + memory::dims({static_cast(pool_params.pad_top), + static_cast(pool_params.pad_left)}), + memory::dims({static_cast(pool_params.pad_bottom), + static_cast(pool_params.pad_right)}), + TFPaddingToMklDnnPadding(this->padding_)); + auto pool_fwd_prim_desc = + pooling_forward::primitive_desc(pool_fwd_desc, cpu_engine); auto pool_bkwd_desc = pooling_backward::desc( - algorithm::pooling_avg_exclude_padding, - output_diff_src.GetUsrMemDesc(), - target_diff_dst_md, - memory::dims({ pool_params.row_stride, - pool_params.col_stride}), - memory::dims({ pool_params.window_rows, - pool_params.window_cols}), - memory::dims({ static_cast(pool_params.pad_top), - static_cast(pool_params.pad_left)}), - memory::dims({ static_cast(pool_params.pad_bottom), - static_cast(pool_params.pad_right)}), - TFPaddingToMklDnnPadding(this->padding_)); - auto pool_bkwd_prim_desc - = pooling_backward::primitive_desc(pool_bkwd_desc, - cpu_engine, - pool_fwd_prim_desc); - this->AllocateOutputTensor(context, pool_bkwd_prim_desc, - original_input_dims_nchw, - this->data_format_mkldnn_, - &output_tensor_diff_src); + algorithm::pooling_avg_exclude_padding, + output_diff_src.GetUsrMemDesc(), target_diff_dst_md, + memory::dims({pool_params.row_stride, pool_params.col_stride}), + memory::dims({pool_params.window_rows, pool_params.window_cols}), + memory::dims({static_cast(pool_params.pad_top), + static_cast(pool_params.pad_left)}), + memory::dims({static_cast(pool_params.pad_bottom), + static_cast(pool_params.pad_right)}), + TFPaddingToMklDnnPadding(this->padding_)); + auto pool_bkwd_prim_desc = pooling_backward::primitive_desc( + pool_bkwd_desc, cpu_engine, pool_fwd_prim_desc); + this->AllocateOutputTensor( + context, pool_bkwd_prim_desc, original_input_dims_nchw, + this->data_format_mkldnn_, &output_tensor_diff_src); output_diff_src.SetUsrMemDataHandle(output_tensor_diff_src); - this->PrepareAndExecuteNet(pool_bkwd_prim_desc, - &input_gradient_diff_dst, - &output_diff_src, - memory::primitive_desc( - target_diff_dst_md, - cpu_engine)); - } catch (mkldnn::error &e) { + this->PrepareAndExecuteNet( + pool_bkwd_prim_desc, &input_gradient_diff_dst, &output_diff_src, + memory::primitive_desc(target_diff_dst_md, cpu_engine)); + } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + 
string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Compute received an exception:", - error_msg)); + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:", + error_msg)); } } // Compute @@ -655,12 +626,11 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase { const int kInputTensorIndexInputShape = 0; const int kInputTensorIndexInputGradient = 1; - memory::desc ConfigureOriginalInput(OpKernelContext* context, - const Tensor& tensor_original_input_shape, - const MklDnnShape& original_input_mkl_shape, - memory::dims* original_input_dims_mkl_order, - MklPoolParameters* pool_params, - TensorShape* input_tensor_shape) { + memory::desc ConfigureOriginalInput( + OpKernelContext* context, const Tensor& tensor_original_input_shape, + const MklDnnShape& original_input_mkl_shape, + memory::dims* original_input_dims_mkl_order, + MklPoolParameters* pool_params, TensorShape* input_tensor_shape) { CHECK_NOTNULL(original_input_dims_mkl_order); CHECK_NOTNULL(pool_params); CHECK_NOTNULL(input_tensor_shape); @@ -672,46 +642,42 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase { } return MklPoolingBackwardOpBase::ConfigureOriginalInput( - context, - tensor_original_input_shape, - original_input_mkl_shape, - original_input_dims_mkl_order, - pool_params, - *input_tensor_shape); -} + context, tensor_original_input_shape, original_input_mkl_shape, + original_input_dims_mkl_order, pool_params, *input_tensor_shape); + } void SanityCheckInputs(OpKernelContext* context, - const Tensor& tensor_in_shape, - const Tensor& input_gradient_tensor, - const MklDnnShape& original_input_mkl_shape, - const MklDnnShape& input_gradient_mkl_shape) { + const Tensor& tensor_in_shape, + const Tensor& input_gradient_tensor, + const MklDnnShape& original_input_mkl_shape, + const MklDnnShape& input_gradient_mkl_shape) { if (!original_input_mkl_shape.IsMklTensor()) { - OP_REQUIRES(context, tensor_in_shape.dims() == 1 && - tensor_in_shape.NumElements() == 4, + OP_REQUIRES( + context, + tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4, errors::InvalidArgument("original input shape must be " - "1-dimensional and 4 elements")); + "1-dimensional and 4 elements")); } else { - OP_REQUIRES(context, original_input_mkl_shape.GetDimension() == 1 && - original_input_mkl_shape.DimSize(0) == 4, - errors::InvalidArgument("original input shape must be " - "1-dimensional and 4 elements")); + OP_REQUIRES(context, + original_input_mkl_shape.GetDimension() == 1 && + original_input_mkl_shape.DimSize(0) == 4, + errors::InvalidArgument("original input shape must be " + "1-dimensional and 4 elements")); } if (!input_gradient_mkl_shape.IsMklTensor()) { // For avgpooling, input_gradient_diff_dst should have 4 dimensions. 
OP_REQUIRES(context, input_gradient_tensor.dims() == 4, - errors::InvalidArgument("Gradient shape must be " - "4-dimensional")); + errors::InvalidArgument("Gradient shape must be " + "4-dimensional")); } else { OP_REQUIRES(context, input_gradient_mkl_shape.GetDimension() == 4, - errors::InvalidArgument("Gradient shape must be " - "4-dimensional")); + errors::InvalidArgument("Gradient shape must be " + "4-dimensional")); } } }; // MklAvgPoolingGradOp - - #endif // INTEL_MKL_DNN REGISTER_KERNEL_BUILDER(Name("_MklAvgPool") @@ -728,4 +694,3 @@ REGISTER_KERNEL_BUILDER(Name("_MklAvgPoolGrad") } // namespace tensorflow #endif // INTEL_MKL - diff --git a/tensorflow/core/kernels/mkl_batch_matmul_op.cc b/tensorflow/core/kernels/mkl_batch_matmul_op.cc index 9fee94f9465..d9713075be6 100644 --- a/tensorflow/core/kernels/mkl_batch_matmul_op.cc +++ b/tensorflow/core/kernels/mkl_batch_matmul_op.cc @@ -40,7 +40,6 @@ limitations under the License. #include "tensorflow/core/kernels/fill_functor.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #define MKL_Complex8 tensorflow::complex64 #define MKL_Complex16 tensorflow::complex128 diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc index d109bb6bcfe..7da63604d2b 100644 --- a/tensorflow/core/kernels/mkl_concat_op.cc +++ b/tensorflow/core/kernels/mkl_concat_op.cc @@ -33,8 +33,8 @@ limitations under the License. #ifdef INTEL_MKL_DNN #include "mkldnn.hpp" -using mkldnn::stream; using mkldnn::concat; +using mkldnn::stream; #endif namespace tensorflow { @@ -45,7 +45,6 @@ typedef std::vector TensorShapeList; enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM }; - // TODO(intelft) Check if we can reuse existing EigenConcatOp using Mutable // reference inputs. // -------------------------------------------------------------------------- @@ -152,8 +151,8 @@ class EigenConcatBaseOp : public OpKernel { #else // MKL_DNN -void Compute(OpKernelContext* c, const std::vector& values, - const TensorShapeList& input_shapes) { + void Compute(OpKernelContext* c, const std::vector& values, + const TensorShapeList& input_shapes) { const Tensor* concat_dim_tensor; const char* axis_attribute_name = AxisArgName == NAME_IS_AXIS @@ -197,7 +196,8 @@ void Compute(OpKernelContext* c, const std::vector& values, const auto in = values[i]; const bool in_is_scalar = IsLegacyScalar(input_shapes[i]); OP_REQUIRES( - c, (input_shapes[i].dims() == input_dims) || + c, + (input_shapes[i].dims() == input_dims) || (input_is_scalar && in_is_scalar), errors::InvalidArgument( "ConcatOp : Ranks of all input tensors should match: shape[0] = ", @@ -208,8 +208,8 @@ void Compute(OpKernelContext* c, const std::vector& values, inputs_flat.emplace_back(new typename TTypes::ConstMatrix( in.shaped({inputs_flat_dim0, inputs_flat_dim1}))); } - output_concat_dim += input_shapes[i].dims() > 0 ? - input_shapes[i].dim_size(axis) : 1; + output_concat_dim += + input_shapes[i].dims() > 0 ? input_shapes[i].dim_size(axis) : 1; } TensorShape output_shape(input_shape); @@ -418,7 +418,6 @@ class MklConcatOp : public OpKernel { OP_REQUIRES_OK(context, context->status()); } - private: typedef struct { TensorFormat data_format; @@ -590,39 +589,45 @@ class MklConcatOp : public OpKernel { GetMklShapeList(context, "values", &input_shapes); const Tensor& concat_dim_tensor = (AxisArgName == NAME_IS_CONCAT_DIM) - ? MklGetInput(context, 0) : MklGetInput(context, N); + ? 
MklGetInput(context, 0) + : MklGetInput(context, N); // Sanity checks - OP_REQUIRES(context, IsLegacyScalar(concat_dim_tensor.shape()), - errors::InvalidArgument( - "Concat dim tensor should be a scalar integer, but got shape ", - concat_dim_tensor.shape().DebugString())); - int32 concat_dim = internal::SubtleMustCopy( - concat_dim_tensor.scalar()()); + OP_REQUIRES( + context, IsLegacyScalar(concat_dim_tensor.shape()), + errors::InvalidArgument( + "Concat dim tensor should be a scalar integer, but got shape ", + concat_dim_tensor.shape().DebugString())); + int32 concat_dim = + internal::SubtleMustCopy(concat_dim_tensor.scalar()()); // check that ranks of all tensors match // and that their shapes match except for concat_dim. int i = 0; bool invoke_eigen = false; bool are_all_mkl_inputs = true, are_all_tf_inputs = true; - const TensorShape expected_shape = input_shapes[0].IsMklTensor() ? - input_shapes[0].GetTfShape() : - input_tensors[0].shape(); + const TensorShape expected_shape = input_shapes[0].IsMklTensor() + ? input_shapes[0].GetTfShape() + : input_tensors[0].shape(); size_t expected_dims = expected_shape.dims(); if (concat_dim < 0) concat_dim = expected_dims + concat_dim; for (auto& s : input_shapes) { - if (s == expected_shape) {++i; continue;} + if (s == expected_shape) { + ++i; + continue; + } - TensorShape s_shape = s.IsMklTensor() ? s.GetTfShape() : - input_tensors[i].shape(); + TensorShape s_shape = + s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape(); size_t s_dims = s_shape.dims(); - OP_REQUIRES(context, s_dims == expected_dims, - errors::InvalidArgument( - "_MklConcatOp : Ranks of all input tensors should match:" - " input dimensions = ", - s_dims, " vs. expected rank = ", expected_dims)); + OP_REQUIRES( + context, s_dims == expected_dims, + errors::InvalidArgument( + "_MklConcatOp : Ranks of all input tensors should match:" + " input dimensions = ", + s_dims, " vs. expected rank = ", expected_dims)); for (int d = 0; d < expected_dims; ++d) { if (d == concat_dim) continue; @@ -630,10 +635,11 @@ class MklConcatOp : public OpKernel { size_t expected_size = expected_shape.dim_size(d); size_t s_size = s_shape.dim_size(d); OP_REQUIRES( - context, expected_size == s_size, - errors::InvalidArgument("_MklConcatOp : Dimensions of inputs " - "should match: shape[0][", d, "]= ", expected_size, - " vs. shape[", i, "][", d, "] = ", s_size)); + context, expected_size == s_size, + errors::InvalidArgument("_MklConcatOp : Dimensions of inputs " + "should match: shape[0][", + d, "]= ", expected_size, " vs. shape[", i, + "][", d, "] = ", s_size)); } if (s.IsMklTensor()) @@ -657,8 +663,8 @@ class MklConcatOp : public OpKernel { TensorShapeList tf_input_shapes; i = 0; for (auto& s : input_shapes) { - TensorShape s_shape = s.IsMklTensor() ? s.GetTfShape() : - input_tensors[i].shape(); + TensorShape s_shape = + s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape(); tf_input_shapes.push_back(s_shape); ++i; } @@ -678,21 +684,22 @@ class MklConcatOp : public OpKernel { std::vector srcs_pd; std::vector> srcs(N, MklDnnData(&cpu_engine)); int64 dst_concat_dim_size = 0; - for (int k =0; k < N; k++) { + for (int k = 0; k < N; k++) { bool is_mkl_tensor = input_shapes[k].IsMklTensor(); memory::dims src_dims; // Same comment as dst_dims for src_dims. - src_dims = (is_mkl_tensor) ? - TFShapeToMklDnnDims(input_shapes[k].GetTfShape()) : - TFShapeToMklDnnDims(input_tensors[k].shape()); + src_dims = (is_mkl_tensor) + ? 
TFShapeToMklDnnDims(input_shapes[k].GetTfShape()) + : TFShapeToMklDnnDims(input_tensors[k].shape()); dst_concat_dim_size += src_dims[concat_dim]; - auto src_md = is_mkl_tensor ? input_shapes[k].GetMklLayout() : - // It does not matter what data format we use here (NHWC or NCHW). - // We just need to ensure that output of Concat uses same data format - // as input. - memory::desc(src_dims, MklDnnType(), memory::format::nchw); + auto src_md = + is_mkl_tensor ? input_shapes[k].GetMklLayout() : + // It does not matter what data format we use here + // (NHWC or NCHW). We just need to ensure that output + // of Concat uses same data format as input. + memory::desc(src_dims, MklDnnType(), memory::format::nchw); srcs[k].SetUsrMem(src_md, &input_tensors[k]); auto src_mpd = srcs[k].GetUsrMemPrimDesc(); @@ -707,14 +714,15 @@ class MklConcatOp : public OpKernel { // Since we are passing a specific format for destination, // we need to have dst_dims in MklDnn order (NCHW). auto orig_tf_format = input_shapes[0].GetTfDataFormat(); - dst_dims_in_nchw = MklDnnDimsInNCHW(dst_dims, - MklDnnDataFormatToTFDataFormat(orig_tf_format)); + dst_dims_in_nchw = MklDnnDimsInNCHW( + dst_dims, MklDnnDataFormatToTFDataFormat(orig_tf_format)); // We will set the output in the same format as input to avoid layout // conversions. // Currently we are setting dst format same as input format. // See if we can make this choice in a better way. - dst_md = memory::desc(dst_dims_in_nchw, MklDnnType(), - (memory::format) input_shapes[0].GetMklLayout().data.format); + dst_md = memory::desc( + dst_dims_in_nchw, MklDnnType(), + (memory::format)input_shapes[0].GetMklLayout().data.format); } else { // Again, format does not matter here. We just need to make it same as // input format. @@ -722,7 +730,7 @@ class MklConcatOp : public OpKernel { } std::vector inputs; - for (int k=0; k < input_tensors.size(); k++) + for (int k = 0; k < input_tensors.size(); k++) inputs.push_back(srcs[k].GetOpMem()); // If all inputs are in MKL format, then meaning of concat_dim needs to @@ -732,8 +740,7 @@ class MklConcatOp : public OpKernel { // But ifinput tensors are in NHWC order, then semantics need to change. // E.g., if we are concatinating over Channel (dimension 3 for NHWC), // then since MklDnn order is NCHW, concat_dim needs to be 1. - if (are_all_mkl_inputs) - concat_dim = input_shapes[0].TfDimIdx(concat_dim); + if (are_all_mkl_inputs) concat_dim = input_shapes[0].TfDimIdx(concat_dim); auto concat_pd = concat::primitive_desc(dst_md, concat_dim, srcs_pd); @@ -752,24 +759,25 @@ class MklConcatOp : public OpKernel { dnn_shape_dst.SetMklTensor(false); tf_shape_dst = MklDnnDimsToTFShape(dst_dims); } - AllocateOutputSetMklShape(context, 0, &dst_tensor, - tf_shape_dst, dnn_shape_dst); + AllocateOutputSetMklShape(context, 0, &dst_tensor, tf_shape_dst, + dnn_shape_dst); CHECK_NOTNULL(dst_tensor); - dst_md = dnn_shape_dst.IsMklTensor() ? - dnn_shape_dst.GetMklLayout() : dst_md; + dst_md = + dnn_shape_dst.IsMklTensor() ? 
dnn_shape_dst.GetMklLayout() : dst_md; dst.SetUsrMem(dst_md, dst_tensor); auto concat_op = concat(concat_pd, inputs, dst.GetOpMem()); std::vector net; net.push_back(concat_op); stream(stream::kind::eager).submit(net).wait(); - } catch (mkldnn::error &e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); - OP_REQUIRES_OK(context, errors::Aborted( - "Operation received an exception:", error_msg)); + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } @@ -790,11 +798,9 @@ class MklConcatOp : public OpKernel { dnn_shape_output.SetDimensions(4); Tensor* output_tensor = nullptr; TensorShape tf_shape_output; - tf_shape_output.AddDim( - dnn_shape_output.GetSerializeBufferSize()); - context->allocate_output( - GetTensorMetaDataIndex(0, context->num_outputs()), - tf_shape_output, &output_tensor); + tf_shape_output.AddDim(dnn_shape_output.GetSerializeBufferSize()); + context->allocate_output(GetTensorMetaDataIndex(0, context->num_outputs()), + tf_shape_output, &output_tensor); dnn_shape_output.SerializeMklDnnShape( output_tensor->flat().data(), output_tensor->flat().size() * sizeof(uint8)); diff --git a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc index 0f1a218fe62..25c25737412 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc @@ -38,9 +38,9 @@ limitations under the License. #include "tensorflow/core/util/use_cudnn.h" #include "tensorflow/core/util/work_sharder.h" -#include "tensorflow/core/util/mkl_util.h" #include "mkl_dnn.h" #include "mkl_dnn_types.h" +#include "tensorflow/core/util/mkl_util.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc index 54d4916d494..ef3f8cfec11 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc @@ -38,17 +38,17 @@ limitations under the License. #include "tensorflow/core/util/use_cudnn.h" #include "tensorflow/core/util/work_sharder.h" -#include "tensorflow/core/util/mkl_util.h" #include "mkl_dnn.h" #include "mkl_dnn_types.h" +#include "tensorflow/core/util/mkl_util.h" #ifdef INTEL_MKL_DNN #include "mkldnn.hpp" -using mkldnn::stream; -using mkldnn::prop_kind; using mkldnn::convolution_backward_weights; using mkldnn::memory; +using mkldnn::prop_kind; +using mkldnn::stream; #endif namespace tensorflow { @@ -360,8 +360,8 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel { (mkl_convert_input) ? 
mkl_buf_convert_input : mkl_buf_input; const Tensor& out_backprop = MklGetInput(context, 2); - void* mkl_buf_out_backprop = const_cast(static_cast( - out_backprop.flat().data())); + void* mkl_buf_out_backprop = const_cast( + static_cast(out_backprop.flat().data())); CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_out_backprop, prim_conv_bwdfilter, @@ -371,10 +371,11 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel { !dnnLayoutCompare_F32(mkl_lt_internal_out_backprop, lt_out_backprop); if (mkl_convert_out_backprop) { CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_out_backprop, - lt_out_backprop, mkl_lt_internal_out_backprop), + lt_out_backprop, + mkl_lt_internal_out_backprop), E_SUCCESS); AllocTmpBuffer(context, mkl_tmp_out_backprop_buf_tensor, - lt_out_backprop, &mkl_buf_convert_out_backprop); + lt_out_backprop, &mkl_buf_convert_out_backprop); CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_out_backprop, mkl_buf_out_backprop, mkl_buf_convert_out_backprop), @@ -428,18 +429,18 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .Label(mkl_op_registry::kMklOpLabel), \ - MklConv2DCustomBackpropFilterOp); + MklConv2DCustomBackpropFilterOp); TF_CALL_float(REGISTER_MKL_FILTER_KERNELS); #undef REGISTER_MKL_FILTER_KERNELS #else template -class MklConv2DCustomBackpropFilterOp : - public MklConv2DBackpropCommonOp { +class MklConv2DCustomBackpropFilterOp + : public MklConv2DBackpropCommonOp { public: explicit MklConv2DCustomBackpropFilterOp(OpKernelConstruction* context) - : MklConv2DBackpropCommonOp(context) { } + : MklConv2DBackpropCommonOp(context) {} ~MklConv2DCustomBackpropFilterOp() {} private: @@ -447,7 +448,7 @@ class MklConv2DCustomBackpropFilterOp : const MklDnnShape& filter_mkl_shape, const MklDnnShape& obp_mkl_shape) { CHECK(!filter_mkl_shape.IsMklTensor()) - << "Conv2DBackpropFilter: filter should not be in MKL Layout"; + << "Conv2DBackpropFilter: filter should not be in MKL Layout"; } size_t GetInputTensorIndexWithSizes() { return 1; /* filter index */ } @@ -462,8 +463,10 @@ class MklConv2DCustomBackpropFilterOp : const Tensor& filter_tensor) { TensorShape filter_tf_shape; CHECK_EQ(TensorShapeUtils::IsVector(filter_tensor.shape()), true); - CHECK_EQ(TensorShapeUtils::MakeShape( - filter_tensor.vec(), &filter_tf_shape).ok(), true); + CHECK_EQ(TensorShapeUtils::MakeShape(filter_tensor.vec(), + &filter_tf_shape) + .ok(), + true); return filter_tf_shape; } @@ -485,16 +488,13 @@ class MklConv2DCustomBackpropFilterOp : return memory::format::hwio; } - void CreatePrimitive(OpKernelContext* context, - const engine& cpu_engine, + void CreatePrimitive(OpKernelContext* context, const engine& cpu_engine, const convolution_forward::primitive_desc& conv_fwd_pd, MklDnnData* input, MklDnnData* filter, MklDnnData* outbackprop, MklDnnData* output, - Tensor** output_tensor, - const memory::dims& strides, + Tensor** output_tensor, const memory::dims& strides, const memory::dims& padding_l, - const memory::dims& padding_r, - padding_kind padding, + const memory::dims& padding_r, padding_kind padding, const memory::dims& bwd_output_dims, memory::format bwd_output_format) { CHECK_NOTNULL(context); @@ -508,34 +508,35 @@ class MklConv2DCustomBackpropFilterOp : int depth = 0; if (biasEnabled) { // Data structure for bias_grad - bias_grad = new MklDnnData (&cpu_engine); + bias_grad = new MklDnnData(&cpu_engine); TensorShape obp_tf_shape = GetTfShape(context, 2); - depth = (MklConv2DBackpropCommonOp::GetTFDataFormat() - == 
FORMAT_NCHW) ? - obp_tf_shape.dim_size(1) : obp_tf_shape.dim_size(3); + depth = (MklConv2DBackpropCommonOp::GetTFDataFormat() == + FORMAT_NCHW) + ? obp_tf_shape.dim_size(1) + : obp_tf_shape.dim_size(3); memory::dims bias_grad_dims = {depth}; bias_grad->SetOpMemDesc(bias_grad_dims, memory::format::x); } // Create convolution backward weights primitive. - auto bwd_desc = (biasEnabled && (bias_grad != nullptr))? - convolution_backward_weights::desc(convolution_direct, - input->GetOpMemDesc(), output->GetOpMemDesc(), - bias_grad->GetOpMemDesc(), - outbackprop->GetOpMemDesc(), strides, padding_l, - padding_r, padding) : - convolution_backward_weights::desc(convolution_direct, - input->GetOpMemDesc(), output->GetOpMemDesc(), - outbackprop->GetOpMemDesc(), strides, padding_l, - padding_r, padding); + auto bwd_desc = + (biasEnabled && (bias_grad != nullptr)) + ? convolution_backward_weights::desc( + convolution_direct, input->GetOpMemDesc(), + output->GetOpMemDesc(), bias_grad->GetOpMemDesc(), + outbackprop->GetOpMemDesc(), strides, padding_l, padding_r, + padding) + : convolution_backward_weights::desc( + convolution_direct, input->GetOpMemDesc(), + output->GetOpMemDesc(), outbackprop->GetOpMemDesc(), strides, + padding_l, padding_r, padding); - auto bwd_pd = convolution_backward_weights::primitive_desc(bwd_desc, - cpu_engine, - conv_fwd_pd); + auto bwd_pd = convolution_backward_weights::primitive_desc( + bwd_desc, cpu_engine, conv_fwd_pd); // Allocate output tensor. - AllocateOutputTensor(context, bwd_pd, bwd_output_dims, - bwd_output_format, output_tensor); + AllocateOutputTensor(context, bwd_pd, bwd_output_dims, bwd_output_format, + output_tensor); CHECK_NOTNULL(*output_tensor); // Set buffer handle using allocated output tensor. @@ -548,8 +549,8 @@ class MklConv2DCustomBackpropFilterOp : AllocateBiasGradTensor(context, bias_grad_shape, &bias_grad_tensor); memory::dims bias_grad_dims = {depth}; // Since Bias is 1D, we use format::x from MKLDNN to represent it. - auto bias_grad_md = memory::desc({bias_grad_dims}, MklDnnType(), - memory::format::x); + auto bias_grad_md = + memory::desc({bias_grad_dims}, MklDnnType(), memory::format::x); bias_grad->SetUsrMem(bias_grad_md, bias_grad_tensor); bias_grad->SetUsrMemDataHandle(bias_grad_tensor); } @@ -562,28 +563,29 @@ class MklConv2DCustomBackpropFilterOp : } // Allocate output tensor. - void AllocateOutputTensor(OpKernelContext* context, - const convolution_backward_weights::primitive_desc& conv_pd, - const memory::dims& output_dims_mkl_order, - memory::format output_tf_format, Tensor** output_tensor) { - CHECK_NOTNULL(output_tensor); + void AllocateOutputTensor( + OpKernelContext* context, + const convolution_backward_weights::primitive_desc& conv_pd, + const memory::dims& output_dims_mkl_order, + memory::format output_tf_format, Tensor** output_tensor) { + CHECK_NOTNULL(output_tensor); - // For BackpropFilter, we convert the output tensor back in Tensorflow - // layout. Because typically, BackpropFilter is the last operator in the - // graph that emit filter gradient that is provided to ApplyGradient - // method to update the filter. But it may be possible to eliminate this - // by forwarding filter in MKL layout if we support ApplyGradient method - // for MKL layout propagation. - MklDnnShape output_mkl_shape; - output_mkl_shape.SetMklTensor(false); - // output_dims_mkl_order is in OIHW format. - // Allocate shape of TF tensor in HWIO format. 
- TensorShape output_tf_shape({output_dims_mkl_order[MklDnnDims::Dim_H], - output_dims_mkl_order[MklDnnDims::Dim_W], - output_dims_mkl_order[MklDnnDims::Dim_I], - output_dims_mkl_order[MklDnnDims::Dim_O]}); - AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape, - output_mkl_shape); + // For BackpropFilter, we convert the output tensor back in Tensorflow + // layout. Because typically, BackpropFilter is the last operator in the + // graph that emit filter gradient that is provided to ApplyGradient + // method to update the filter. But it may be possible to eliminate this + // by forwarding filter in MKL layout if we support ApplyGradient method + // for MKL layout propagation. + MklDnnShape output_mkl_shape; + output_mkl_shape.SetMklTensor(false); + // output_dims_mkl_order is in OIHW format. + // Allocate shape of TF tensor in HWIO format. + TensorShape output_tf_shape({output_dims_mkl_order[MklDnnDims::Dim_H], + output_dims_mkl_order[MklDnnDims::Dim_W], + output_dims_mkl_order[MklDnnDims::Dim_I], + output_dims_mkl_order[MklDnnDims::Dim_O]}); + AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape, + output_mkl_shape); } // Allocate tensor for bias grad @@ -600,9 +602,9 @@ class MklConv2DCustomBackpropFilterOp : // Prepare and execute net - checks for input and output reorders. void PrepareAndExecutePrimitive( - const convolution_backward_weights::primitive_desc& conv_pd, - MklDnnData* input, MklDnnData* obp, - MklDnnData* output, MklDnnData* bias_grad = nullptr) { + const convolution_backward_weights::primitive_desc& conv_pd, + MklDnnData* input, MklDnnData* obp, MklDnnData* output, + MklDnnData* bias_grad = nullptr) { // Create reorders between user layout and MKL layout if it is needed and // add it to the net before convolution. std::vector net; @@ -612,15 +614,15 @@ class MklConv2DCustomBackpropFilterOp : // For BackpropFilter, we convert the output tensor back in Tensorflow // layout. 
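The AllocateOutputTensor hunk just above builds the filter-gradient TensorShape in TensorFlow's HWIO order from dimensions that MKL-DNN keeps in OIHW order. A minimal sketch of that index shuffle, using a stand-in enum for MklDnnDims and plain std::vector instead of TensorShape (both are assumptions for illustration, not the real types):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Stand-in for MklDnnDims: MKL-DNN reports 4-D conv filters in OIHW order.
enum OihwDim { kDimO = 0, kDimI = 1, kDimH = 2, kDimW = 3 };

// Reorder OIHW dimensions into TensorFlow's HWIO filter layout, mirroring the
// TensorShape({Dim_H, Dim_W, Dim_I, Dim_O}) construction in the hunk above.
std::vector<int64_t> OihwToHwio(const std::vector<int64_t>& oihw) {
  return {oihw[kDimH], oihw[kDimW], oihw[kDimI], oihw[kDimO]};
}

int main() {
  // Example: 64 output channels, 3 input channels, 5x5 kernel.
  std::vector<int64_t> oihw = {64, 3, 5, 5};
  std::vector<int64_t> hwio = OihwToHwio(oihw);  // expect {5, 5, 3, 64}
  for (int64_t d : hwio) std::cout << d << " ";
  std::cout << "\n";
  return 0;
}
```

As the comment in the hunk explains, the gradient is handed back in TF layout because it typically feeds the ApplyGradient path rather than another MKL-layout consumer.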
bool output_reorder_required = output->PrepareReorderToUserMemIfReq( - conv_pd.diff_weights_primitive_desc()); + conv_pd.diff_weights_primitive_desc()); if (biasEnabled && (bias_grad != nullptr)) { - net.push_back(convolution_backward_weights(conv_pd, input->GetOpMem(), - obp->GetOpMem(), output->GetOpMem(), - bias_grad->GetOpMem())); + net.push_back(convolution_backward_weights( + conv_pd, input->GetOpMem(), obp->GetOpMem(), output->GetOpMem(), + bias_grad->GetOpMem())); } else { - net.push_back(convolution_backward_weights(conv_pd, input->GetOpMem(), - obp->GetOpMem(), output->GetOpMem())); + net.push_back(convolution_backward_weights( + conv_pd, input->GetOpMem(), obp->GetOpMem(), output->GetOpMem())); } if (output_reorder_required) { @@ -631,22 +633,24 @@ class MklConv2DCustomBackpropFilterOp : } }; -#define REGISTER_MKL_FILTER_KERNELS(T) \ - REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklOpLabel), \ - MklConv2DCustomBackpropFilterOp);\ - REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilterWithBias") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklOpLabel), \ - MklConv2DCustomBackpropFilterOp); \ - REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklOpLabel), \ - MklDummyOp); +#define REGISTER_MKL_FILTER_KERNELS(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklConv2DBackpropFilter") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklConv2DCustomBackpropFilterOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklConv2DBackpropFilterWithBias") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklConv2DCustomBackpropFilterOp); \ + REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklDummyOp); TF_CALL_float(REGISTER_MKL_FILTER_KERNELS); #undef REGISTER_MKL_FILTER_KERNELS diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc index ef6db58d31f..a6745489f48 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc @@ -23,6 +23,8 @@ limitations under the License. #define EIGEN_USE_THREADS #include #include +#include "mkl_dnn.h" +#include "mkl_dnn_types.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -41,15 +43,13 @@ limitations under the License. 
#include "tensorflow/core/util/tensor_format.h" #include "tensorflow/core/util/use_cudnn.h" #include "tensorflow/core/util/work_sharder.h" -#include "mkl_dnn.h" -#include "mkl_dnn_types.h" #ifdef INTEL_MKL_DNN #include "mkldnn.hpp" -using mkldnn::stream; -using mkldnn::prop_kind; using mkldnn::convolution_backward_data; +using mkldnn::prop_kind; +using mkldnn::stream; #endif namespace tensorflow { @@ -359,16 +359,15 @@ class MklConv2DCustomBackpropInputOp : public OpKernel { #else template -class MklConv2DCustomBackpropInputOp : - public MklConv2DBackpropCommonOp { +class MklConv2DCustomBackpropInputOp + : public MklConv2DBackpropCommonOp { public: explicit MklConv2DCustomBackpropInputOp(OpKernelConstruction* context) - : MklConv2DBackpropCommonOp(context) { } + : MklConv2DBackpropCommonOp(context) {} ~MklConv2DCustomBackpropInputOp() {} private: - const int kInputIndex_Filter = 1, - kInputIndex_InputSizes = 0, + const int kInputIndex_Filter = 1, kInputIndex_InputSizes = 0, kInputIndex_OutBackProp = 2; void ValidateMklShapes(const MklDnnShape& input_mkl_shape, const MklDnnShape& filter_mkl_shape, @@ -377,7 +376,7 @@ class MklConv2DCustomBackpropInputOp : // of the Tensor and never an actual tensor. So it will never be in MKL // layout. CHECK(!input_mkl_shape.IsMklTensor()) - << "Conv2DBackpropInput: input should not be in MKL Layout"; + << "Conv2DBackpropInput: input should not be in MKL Layout"; } size_t GetInputTensorIndexWithSizes() { return kInputIndex_InputSizes; } @@ -386,8 +385,10 @@ class MklConv2DCustomBackpropInputOp : const Tensor& input_tensor) { TensorShape input_tf_shape; CHECK_EQ(TensorShapeUtils::IsVector(input_tensor.shape()), true); - CHECK_EQ(TensorShapeUtils::MakeShape(input_tensor.vec(), - &input_tf_shape).ok(), true); + CHECK_EQ( + TensorShapeUtils::MakeShape(input_tensor.vec(), &input_tf_shape) + .ok(), + true); return input_tf_shape; } @@ -414,16 +415,13 @@ class MklConv2DCustomBackpropInputOp : return data_format; } - void CreatePrimitive(OpKernelContext* context, - const engine& cpu_engine, + void CreatePrimitive(OpKernelContext* context, const engine& cpu_engine, const convolution_forward::primitive_desc& conv_fwd_pd, MklDnnData* input, MklDnnData* filter, MklDnnData* outbackprop, MklDnnData* output, - Tensor** output_tensor, - const memory::dims& strides, + Tensor** output_tensor, const memory::dims& strides, const memory::dims& padding_l, - const memory::dims& padding_r, - padding_kind padding, + const memory::dims& padding_r, padding_kind padding, const memory::dims& bwd_output_dims, memory::format bwd_output_format) { CHECK_NOTNULL(context); @@ -434,19 +432,16 @@ class MklConv2DCustomBackpropInputOp : CHECK_NOTNULL(output_tensor); // Create convolution backward data primitive. - auto bwd_desc = convolution_backward_data::desc(convolution_direct, - output->GetOpMemDesc(), filter->GetOpMemDesc(), - outbackprop->GetOpMemDesc(), strides, padding_l, - padding_r, padding); - - auto bwd_pd = convolution_backward_data::primitive_desc(bwd_desc, - cpu_engine, - conv_fwd_pd); + auto bwd_desc = convolution_backward_data::desc( + convolution_direct, output->GetOpMemDesc(), filter->GetOpMemDesc(), + outbackprop->GetOpMemDesc(), strides, padding_l, padding_r, padding); + auto bwd_pd = convolution_backward_data::primitive_desc( + bwd_desc, cpu_engine, conv_fwd_pd); // Allocate output tensor in TensorFlow and MKL layout. 
- AllocateOutputTensor(context, bwd_pd, bwd_output_dims, - bwd_output_format, output_tensor); + AllocateOutputTensor(context, bwd_pd, bwd_output_dims, bwd_output_format, + output_tensor); CHECK_NOTNULL(*output_tensor); // Set buffer handle using allocated output tensor. output->SetUsrMemDataHandle(*output_tensor); @@ -455,44 +450,44 @@ class MklConv2DCustomBackpropInputOp : } // Allocate output tensor. - void AllocateOutputTensor(OpKernelContext* context, - const convolution_backward_data::primitive_desc& conv_pd, - const memory::dims& output_dims_mkl_order, - memory::format output_tf_format, Tensor** output_tensor) { - CHECK_NOTNULL(output_tensor); + void AllocateOutputTensor( + OpKernelContext* context, + const convolution_backward_data::primitive_desc& conv_pd, + const memory::dims& output_dims_mkl_order, + memory::format output_tf_format, Tensor** output_tensor) { + CHECK_NOTNULL(output_tensor); - // Output primitive descriptor for backward data is diff_src. - auto dst_pd = conv_pd.diff_src_primitive_desc(); + // Output primitive descriptor for backward data is diff_src. + auto dst_pd = conv_pd.diff_src_primitive_desc(); - // Allocate shape of Mkl tensor. - MklDnnShape output_mkl_shape; - output_mkl_shape.SetMklTensor(true); - output_mkl_shape.SetMklLayout(&dst_pd); - output_mkl_shape.SetElemType(MklDnnType()); - output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), - output_dims_mkl_order, output_tf_format); + // Allocate shape of Mkl tensor. + MklDnnShape output_mkl_shape; + output_mkl_shape.SetMklTensor(true); + output_mkl_shape.SetMklLayout(&dst_pd); + output_mkl_shape.SetElemType(MklDnnType()); + output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), + output_dims_mkl_order, output_tf_format); - // Allocate shape of TF tensor. - TensorShape output_tf_shape; - output_tf_shape.AddDim(dst_pd.get_size() / sizeof(T)); + // Allocate shape of TF tensor. + TensorShape output_tf_shape; + output_tf_shape.AddDim(dst_pd.get_size() / sizeof(T)); - AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape, - output_mkl_shape); + AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape, + output_mkl_shape); } // Prepare and execute net - checks for input and output reorders. void PrepareAndExecutePrimitive( - const convolution_backward_data::primitive_desc& conv_pd, - MklDnnData* filter, MklDnnData* obp, - MklDnnData* output) { + const convolution_backward_data::primitive_desc& conv_pd, + MklDnnData* filter, MklDnnData* obp, MklDnnData* output) { // Create reorders between user layout and MKL layout if it is needed and // add it to the net before convolution. std::vector net; filter->CheckReorderToOpMem(conv_pd.weights_primitive_desc(), &net); obp->CheckReorderToOpMem(conv_pd.diff_dst_primitive_desc(), &net); - net.push_back(convolution_backward_data(conv_pd, obp->GetOpMem(), - filter->GetOpMem(), output->GetOpMem())); + net.push_back(convolution_backward_data( + conv_pd, obp->GetOpMem(), filter->GetOpMem(), output->GetOpMem())); stream(stream::kind::eager).submit(net).wait(); } diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index 0e77b45993c..e44fba754b3 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -18,8 +18,8 @@ limitations under the License. #include #include -#include #include +#include #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -41,15 +41,14 @@ limitations under the License. 
#include "tensorflow/core/util/mkl_util.h" - #ifdef INTEL_MKL_DNN #include "mkldnn.hpp" -using mkldnn::stream; using mkldnn::prop_kind; +using mkldnn::stream; -using mkldnn::convolution_forward; using mkldnn::convolution_direct; +using mkldnn::convolution_forward; #else #include "mkl_dnn.h" #include "mkl_dnn_types.h" @@ -116,18 +115,19 @@ class MklConv2DOp : public OpKernel { filter.shape().DebugString())); for (int i = 0; i < 3; i++) { - OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i), - std::numeric_limits::max()), - errors::InvalidArgument("filter too large")); + OP_REQUIRES( + context, + FastBoundsCheck(filter.dim_size(i), std::numeric_limits::max()), + errors::InvalidArgument("filter too large")); } const int64 input_depth = input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'C') : GetTensorDim(input, data_format_, 'C'); - OP_REQUIRES( - context, input_depth == filter.dim_size(2), - errors::InvalidArgument("input and filter must have the same depth: ", - input_depth, " vs ", filter.dim_size(2))); + OP_REQUIRES(context, input_depth == filter.dim_size(2), + errors::InvalidArgument( + "input and filter must have the same depth: ", input_depth, + " vs ", filter.dim_size(2))); // The last dimension for filter is out_depth. const int out_depth = static_cast(filter.dim_size(3)); @@ -136,9 +136,10 @@ class MklConv2DOp : public OpKernel { const int64 input_rows_raw = input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'H') : GetTensorDim(input, data_format_, 'H'); - OP_REQUIRES(context, FastBoundsCheck(input_rows_raw, - std::numeric_limits::max()), - errors::InvalidArgument("Input rows too large")); + OP_REQUIRES( + context, + FastBoundsCheck(input_rows_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input rows too large")); const int input_rows = static_cast(input_rows_raw); const int filter_rows = static_cast(filter.dim_size(0)); @@ -147,9 +148,10 @@ class MklConv2DOp : public OpKernel { const int64 input_cols_raw = input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'W') : GetTensorDim(input, data_format_, 'W'); - OP_REQUIRES(context, FastBoundsCheck(input_cols_raw, - std::numeric_limits::max()), - errors::InvalidArgument("Input cols too large")); + OP_REQUIRES( + context, + FastBoundsCheck(input_cols_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input cols too large")); const int input_cols = static_cast(input_cols_raw); const int filter_cols = static_cast(filter.dim_size(1)); @@ -157,9 +159,10 @@ class MklConv2DOp : public OpKernel { const int64 input_batch_raw = input_in_mkl_format ? 
GetMklTensorDim(mkl_context.input_shape, 'N') : GetTensorDim(input, data_format_, 'N'); - OP_REQUIRES(context, FastBoundsCheck(input_batch_raw, - std::numeric_limits::max()), - errors::InvalidArgument("batch is too large")); + OP_REQUIRES( + context, + FastBoundsCheck(input_batch_raw, std::numeric_limits::max()), + errors::InvalidArgument("batch is too large")); const int batch = static_cast(input_batch_raw); // For now we take the stride from the second and third dimensions only (we @@ -313,8 +316,7 @@ class MklConv2DOp : public OpKernel { // Temp tensor used to allocate tmp buffers Tensor mkl_tmp_input_buf_tensor, mkl_tmp_filter_buf_tensor, mkl_tmp_bias_buf_tensor; - mkl_context.MklPrepareConvolutionInputs(context, - &mkl_tmp_input_buf_tensor, + mkl_context.MklPrepareConvolutionInputs(context, &mkl_tmp_input_buf_tensor, &mkl_tmp_filter_buf_tensor, &mkl_tmp_bias_buf_tensor); @@ -398,8 +400,9 @@ class MklConv2DOp : public OpKernel { mkl_convert_input = !dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input); if (mkl_convert_input) { - CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, - lt_input, mkl_lt_internal_input), E_SUCCESS); + CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, lt_input, + mkl_lt_internal_input), + E_SUCCESS); AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input, &mkl_buf_convert_input); CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input, @@ -517,8 +520,8 @@ class MklConv2DOp : public OpKernel { GetMklShape(context, kInputIndex_Src, &src_mkl_shape); GetMklShape(context, kInputIndex_Filter, &filter_mkl_shape); OP_REQUIRES(context, filter_mkl_shape.IsMklTensor() == false, - errors::InvalidArgument("Filter should not be in " - "Mkl Layout")); + errors::InvalidArgument("Filter should not be in " + "Mkl Layout")); MklDnnData src(&cpu_engine); MklDnnData filter(&cpu_engine); @@ -531,11 +534,10 @@ class MklConv2DOp : public OpKernel { MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_); auto src_tf_shape = GetTfShape(context, kInputIndex_Src); auto filter_tf_shape = GetTfShape(context, kInputIndex_Filter); - conv_utl.GetConvFwdSizesInMklOrder(src_tf_shape, filter_tf_shape, - &src_dims, &filter_dims, &strides, - &output_dims_tf_order, - &output_dims_mkl_order, &padding_l, - &padding_r); + conv_utl.GetConvFwdSizesInMklOrder( + src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides, + &output_dims_tf_order, &output_dims_mkl_order, &padding_l, + &padding_r); if (!context->status().ok()) return; // Check for corner case - if there is nothing to compute, return. @@ -543,21 +545,20 @@ class MklConv2DOp : public OpKernel { // Corner cases: output with 0 elements and 0 batch size. Tensor* output_tensor = nullptr; - if (output_tf_shape.num_elements() == 0 || - output_dims_tf_order[0] == 0) { + if (output_tf_shape.num_elements() == 0 || output_dims_tf_order[0] == 0) { // TODO(jbobba): Verify correctness here // Need semantics for Null MKL tensor MklDnnShape output_mkl_shape; output_mkl_shape.SetMklTensor(false); AllocateOutputSetMklShape(context, kOutputIndex_Dst, &output_tensor, - src_tf_shape, output_mkl_shape); + src_tf_shape, output_mkl_shape); // MklConv2D also outputs converted filter as 2nd output of Conv2D. 
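Several hunks above bound-check raw int64 dimension values (rows, cols, batch, filter dims) before narrowing them to int. A small self-contained version of that guard, using a hypothetical CheckedNarrowToInt helper rather than TensorFlow's FastBoundsCheck, and returning a bool instead of an InvalidArgument status:

```cpp
#include <cstdint>
#include <iostream>
#include <limits>

// Hypothetical stand-in for the bounds-check-then-static_cast<int> pattern:
// refuses values outside the int range instead of raising a Status error.
bool CheckedNarrowToInt(int64_t value, int* out) {
  if (value < 0 || value > std::numeric_limits<int>::max()) return false;
  *out = static_cast<int>(value);
  return true;
}

int main() {
  int rows = 0;
  // A plausible spatial dimension narrows fine.
  std::cout << CheckedNarrowToInt(224, &rows) << " rows=" << rows << "\n";
  // A value beyond int range is rejected instead of silently overflowing.
  std::cout << CheckedNarrowToInt(int64_t{1} << 40, &rows) << "\n";
  return 0;
}
```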
filter_mkl_shape.SetMklTensor(false); Tensor* output_filter_tensor = nullptr; AllocateOutputSetMklShape(context, kOutputIndex_Filter, - &output_filter_tensor, - filter_tf_shape, filter_mkl_shape); + &output_filter_tensor, filter_tf_shape, + filter_mkl_shape); return; } @@ -570,14 +571,15 @@ class MklConv2DOp : public OpKernel { // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's // layout (NHWC or NCHW depending on data format). auto src_md = src_mkl_shape.IsMklTensor() - ? src_mkl_shape.GetMklLayout() - : memory::desc(src_dims, MklDnnType(), tf_fmt); + ? src_mkl_shape.GetMklLayout() + : memory::desc(src_dims, MklDnnType(), tf_fmt); src.SetUsrMem(src_md, &src_tensor); // Although filter shape (filter_dims) required is in MKL-DNN order, // the layout is Tensorflow's layout (HWIO). auto filter_md = filter_mkl_shape.IsMklTensor() // Should NEVER be true - ? filter_mkl_shape.GetMklLayout() - : memory::desc(filter_dims, MklDnnType(), memory::format::hwio); + ? filter_mkl_shape.GetMklLayout() + : memory::desc(filter_dims, MklDnnType(), + memory::format::hwio); filter.SetUsrMem(filter_md, &filter_tensor); // Set output shape (output_dims) required in MKL-DNN order. @@ -601,34 +603,13 @@ class MklConv2DOp : public OpKernel { bias.SetOpMemDesc(bias_size, memory::format::any); // Create convolution primitive with Bias. - auto conv_desc = convolution_forward::desc(prop_kind::forward, - convolution_direct, src.GetOpMemDesc(), filter.GetOpMemDesc(), - bias.GetOpMemDesc(), output.GetOpMemDesc(), strides, - padding_l, padding_r, TFPaddingToMklDnnPadding(padding_)); + auto conv_desc = convolution_forward::desc( + prop_kind::forward, convolution_direct, src.GetOpMemDesc(), + filter.GetOpMemDesc(), bias.GetOpMemDesc(), output.GetOpMemDesc(), + strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_)); - auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc, - cpu_engine); - AllocateOutputTensor(context, conv_prim_desc, - output_dims_mkl_order, tf_fmt, &output_tensor); - // Set data handle for output. - output.SetUsrMemDataHandle(output_tensor); - - Tensor* filter_out_tensor = nullptr; - AllocateFilterOutputTensor(context, conv_prim_desc, - TFShapeToMklDnnDims(filter_tf_shape), - &filter_out_tensor); - - PrepareAndExecuteNet(conv_prim_desc, &src, &filter, - &bias, &output, filter_out_tensor); - } else { - // Create convolution primitive without Bias. - auto conv_desc = convolution_forward::desc(prop_kind::forward, - convolution_direct, src.GetOpMemDesc(), filter.GetOpMemDesc(), - output.GetOpMemDesc(), strides, padding_l, padding_r, - TFPaddingToMklDnnPadding(padding_)); - - auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc, - cpu_engine); + auto conv_prim_desc = + convolution_forward::primitive_desc(conv_desc, cpu_engine); AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order, tf_fmt, &output_tensor); // Set data handle for output. @@ -636,18 +617,39 @@ class MklConv2DOp : public OpKernel { Tensor* filter_out_tensor = nullptr; AllocateFilterOutputTensor(context, conv_prim_desc, - TFShapeToMklDnnDims(filter_tf_shape), - &filter_out_tensor); - PrepareAndExecuteNet(conv_prim_desc, &src, &filter, - nullptr, &output, filter_out_tensor); + TFShapeToMklDnnDims(filter_tf_shape), + &filter_out_tensor); + + PrepareAndExecuteNet(conv_prim_desc, &src, &filter, &bias, &output, + filter_out_tensor); + } else { + // Create convolution primitive without Bias. 
+ auto conv_desc = convolution_forward::desc( + prop_kind::forward, convolution_direct, src.GetOpMemDesc(), + filter.GetOpMemDesc(), output.GetOpMemDesc(), strides, padding_l, + padding_r, TFPaddingToMklDnnPadding(padding_)); + + auto conv_prim_desc = + convolution_forward::primitive_desc(conv_desc, cpu_engine); + AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order, + tf_fmt, &output_tensor); + // Set data handle for output. + output.SetUsrMemDataHandle(output_tensor); + + Tensor* filter_out_tensor = nullptr; + AllocateFilterOutputTensor(context, conv_prim_desc, + TFShapeToMklDnnDims(filter_tf_shape), + &filter_out_tensor); + PrepareAndExecuteNet(conv_prim_desc, &src, &filter, nullptr, &output, + filter_out_tensor); } - } catch (mkldnn::error &e) { + } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + std::string(e.message) + - ", in file " + std::string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", error_msg)); + ", message: " + std::string(e.message) + ", in file " + + std::string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } @@ -655,71 +657,67 @@ class MklConv2DOp : public OpKernel { std::vector strides_; Padding padding_; TensorFormat data_format_; - const int kInputIndex_Src = 0, - kInputIndex_Filter = 1, - kInputIndex_Bias = 2; + const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2; const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1; // Allocate output tensor. void AllocateOutputTensor( - OpKernelContext* context, - const convolution_forward::primitive_desc& conv_prim_desc, - const memory::dims& output_dims_mkl_order, - memory::format output_tf_format, Tensor** output_tensor) { - CHECK_NOTNULL(output_tensor); - auto dst_pd = conv_prim_desc.dst_primitive_desc(); + OpKernelContext* context, + const convolution_forward::primitive_desc& conv_prim_desc, + const memory::dims& output_dims_mkl_order, + memory::format output_tf_format, Tensor** output_tensor) { + CHECK_NOTNULL(output_tensor); + auto dst_pd = conv_prim_desc.dst_primitive_desc(); - // Allocate shape of Mkl tensor. - MklDnnShape output_mkl_shape; - output_mkl_shape.SetMklTensor(true); - output_mkl_shape.SetMklLayout(&dst_pd); - output_mkl_shape.SetElemType(MklDnnType()); - output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), - output_dims_mkl_order, output_tf_format); + // Allocate shape of Mkl tensor. + MklDnnShape output_mkl_shape; + output_mkl_shape.SetMklTensor(true); + output_mkl_shape.SetMklLayout(&dst_pd); + output_mkl_shape.SetElemType(MklDnnType()); + output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), + output_dims_mkl_order, output_tf_format); - // Allocate shape of TF tensor. - TensorShape output_tf_shape; - output_tf_shape.AddDim((dst_pd.get_size() / sizeof(T))); + // Allocate shape of TF tensor. + TensorShape output_tf_shape; + output_tf_shape.AddDim((dst_pd.get_size() / sizeof(T))); - AllocateOutputSetMklShape(context, kOutputIndex_Dst, output_tensor, - output_tf_shape, output_mkl_shape); + AllocateOutputSetMklShape(context, kOutputIndex_Dst, output_tensor, + output_tf_shape, output_mkl_shape); } // Allocate output tensor. 
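The catch blocks in these kernels convert an mkldnn::error into an aborted Status whose text records the numeric status, the library message, and the file and line. A standalone sketch of that message assembly; the MkldnnError struct here is a made-up stand-in for mkldnn::error, and the string is printed rather than wrapped in errors::Aborted:

```cpp
#include <iostream>
#include <string>

// Hypothetical stand-in for mkldnn::error, which carries a status code and
// a message in the real library.
struct MkldnnError {
  int status;
  std::string message;
};

// Build the same "Status: ..., message: ..., in file file:line" text that the
// kernels above feed into errors::Aborted().
std::string FormatMklErrorMessage(const MkldnnError& e, const char* file,
                                  int line) {
  return "Status: " + std::to_string(e.status) + ", message: " + e.message +
         ", in file " + std::string(file) + ":" + std::to_string(line);
}

int main() {
  MkldnnError e{5, "could not create a convolution primitive"};
  std::cout << FormatMklErrorMessage(e, __FILE__, __LINE__) << "\n";
  return 0;
}
```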
void AllocateFilterOutputTensor( - OpKernelContext* context, - const convolution_forward::primitive_desc& conv_prim_desc, - const memory::dims& filter_dims_tf_order, - Tensor** filter_tensor) { - CHECK_NOTNULL(filter_tensor); - auto filter_pd = conv_prim_desc.weights_primitive_desc(); + OpKernelContext* context, + const convolution_forward::primitive_desc& conv_prim_desc, + const memory::dims& filter_dims_tf_order, Tensor** filter_tensor) { + CHECK_NOTNULL(filter_tensor); + auto filter_pd = conv_prim_desc.weights_primitive_desc(); - // Allocate shape of Mkl tensor. - MklDnnShape filter_mkl_shape; - filter_mkl_shape.SetMklTensor(true); - filter_mkl_shape.SetMklLayout(&filter_pd); - filter_mkl_shape.SetElemType(MklDnnType()); + // Allocate shape of Mkl tensor. + MklDnnShape filter_mkl_shape; + filter_mkl_shape.SetMklTensor(true); + filter_mkl_shape.SetMklLayout(&filter_pd); + filter_mkl_shape.SetElemType(MklDnnType()); - // The format of the filter is actually OIhw8i8o, but TF doesn't support - // this format. Just use format::blocked for now because the layout - // is stored in the MKL data. - filter_mkl_shape.SetTfLayout(filter_dims_tf_order.size(), - filter_dims_tf_order, memory::format::blocked); + // The format of the filter is actually OIhw8i8o, but TF doesn't support + // this format. Just use format::blocked for now because the layout + // is stored in the MKL data. + filter_mkl_shape.SetTfLayout(filter_dims_tf_order.size(), + filter_dims_tf_order, memory::format::blocked); - // Allocate the data space for the filter to propagate as TF tensor. - TensorShape filter_tf_shape; - filter_tf_shape.AddDim((filter_pd.get_size() / sizeof(T))); + // Allocate the data space for the filter to propagate as TF tensor. + TensorShape filter_tf_shape; + filter_tf_shape.AddDim((filter_pd.get_size() / sizeof(T))); - AllocateOutputSetMklShape(context, kOutputIndex_Filter, filter_tensor, - filter_tf_shape, filter_mkl_shape); + AllocateOutputSetMklShape(context, kOutputIndex_Filter, filter_tensor, + filter_tf_shape, filter_mkl_shape); } // Prepare and execute net - checks for input and output reorders. void PrepareAndExecuteNet( - const convolution_forward::primitive_desc& conv_prim_desc, - MklDnnData* src, MklDnnData* filter, - MklDnnData* bias, MklDnnData* output, - Tensor* filter_out_tensor) { + const convolution_forward::primitive_desc& conv_prim_desc, + MklDnnData* src, MklDnnData* filter, MklDnnData* bias, + MklDnnData* output, Tensor* filter_out_tensor) { CHECK_NOTNULL(filter_out_tensor); // Create reorders between user layout and MKL layout if it is needed and @@ -731,18 +729,20 @@ class MklConv2DOp : public OpKernel { // rather than re-order to a temp buffer, reorder directly to the // filter output tensor filter->CheckReorderToOpMem(conv_prim_desc.weights_primitive_desc(), - filter->GetTensorBuffer(filter_out_tensor), &net); + filter->GetTensorBuffer(filter_out_tensor), + &net); // Create convolution primitive and add it to net. 
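AllocateFilterOutputTensor above exposes the converted filter to TensorFlow as a flat 1-D tensor of filter_pd.get_size() / sizeof(T) elements, since the blocked MKL layout itself travels in the MklDnnShape metadata rather than in the TF shape. A tiny sketch of that sizing arithmetic (the byte count in main is a hypothetical example value):

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>

// The MKL-side primitive descriptor reports its size in bytes; the TF tensor
// backing it is exposed as a flat 1-D buffer of that many elements of T.
template <typename T>
int64_t FlatElementCount(size_t byte_size) {
  return static_cast<int64_t>(byte_size / sizeof(T));
}

int main() {
  // Hypothetical example: a blocked float filter occupying 19200 bytes is
  // exposed to TensorFlow as a 1-D tensor of 4800 floats.
  std::cout << FlatElementCount<float>(19200) << "\n";
  return 0;
}
```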
if (bias) { CHECK_EQ(biasEnabled, true); net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(), - filter->GetOpMem(), bias->GetOpMem(), - output->GetOpMem())); + filter->GetOpMem(), bias->GetOpMem(), + output->GetOpMem())); } else { CHECK_EQ(biasEnabled, false); net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(), - filter->GetOpMem(), output->GetOpMem())); + filter->GetOpMem(), + output->GetOpMem())); } stream(stream::kind::eager).submit(net).wait(); diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h index c6456bd5c33..8b65eaea0d7 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.h +++ b/tensorflow/core/kernels/mkl_conv_ops.h @@ -16,9 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_ #define TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_ -#include #include #include +#include #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -27,8 +27,8 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_slice.h" #include "tensorflow/core/kernels/bounds_check.h" -#include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/kernels/conv_grad_ops.h" +#include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/strings/numbers.h" @@ -43,11 +43,11 @@ limitations under the License. #ifdef INTEL_MKL_DNN #include "mkldnn.hpp" -using mkldnn::stream; using mkldnn::prop_kind; +using mkldnn::stream; -using mkldnn::convolution_forward; using mkldnn::convolution_direct; +using mkldnn::convolution_forward; #endif namespace tensorflow { @@ -63,13 +63,13 @@ class MklDnnConvUtil { public: MklDnnConvUtil(OpKernelContext* context, const std::vector& strides, - Padding pad, TensorFormat fm) : context_(context), - strides_(strides), padding_(pad), data_format_(fm) {} + Padding pad, TensorFormat fm) + : context_(context), strides_(strides), padding_(pad), data_format_(fm) {} virtual ~MklDnnConvUtil() { context_ = nullptr; } // Calculate Convolution strides - virtual inline void GetStridesInMklOrder(memory::dims *strides) { + virtual inline void GetStridesInMklOrder(memory::dims* strides) { // For now we take the stride from the second and third dimensions only // (we do not support striding on the batch or depth dimension). CHECK_NOTNULL(strides); @@ -82,14 +82,14 @@ class MklDnnConvUtil { // requires input in NCHW format. Function does not return anything. // But errors arising from sanity checks are returned in context's // status. - virtual inline void - GetInputSizeInMklOrder(const TensorShape& input_shape, - memory::dims *input_dims) { - #define CHECK_BOUNDS(val, err_msg) do { \ - OP_REQUIRES(context_, FastBoundsCheck(val, \ - std::numeric_limits::max()), \ - errors::InvalidArgument(err_msg)); \ - }while(0) + virtual inline void GetInputSizeInMklOrder(const TensorShape& input_shape, + memory::dims* input_dims) { +#define CHECK_BOUNDS(val, err_msg) \ + do { \ + OP_REQUIRES(context_, \ + FastBoundsCheck(val, std::numeric_limits::max()), \ + errors::InvalidArgument(err_msg)); \ + } while (0) CHECK_NOTNULL(input_dims); @@ -112,7 +112,7 @@ class MklDnnConvUtil { CHECK_BOUNDS(input_batch_raw, "Input batch too large"); int input_batch = static_cast(input_batch_raw); - #undef CHECK_BOUNDS +#undef CHECK_BOUNDS // MKL-DNN always requires input in NCHW format. 
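GetInputSizeInMklOrder above reads each dimension out of whatever data_format the model uses (via GetTensorDim) and, as the closing comment says, always hands MKL-DNN the sizes in NCHW order. A minimal sketch of that extraction and reordering, with a toy GetDim in place of GetTensorDim and std::vector<int64_t> in place of memory::dims:

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Toy stand-in for GetTensorDim: pick dimension `c` ('N','C','H','W') out of
// a shape whose layout is described by `format` ("NHWC" or "NCHW").
int64_t GetDim(const std::vector<int64_t>& shape, const std::string& format,
               char c) {
  return shape[format.find(c)];
}

// Build the NCHW dimension order MKL-DNN expects from either TF data format.
std::vector<int64_t> ToMklNchw(const std::vector<int64_t>& shape,
                               const std::string& format) {
  return {GetDim(shape, format, 'N'), GetDim(shape, format, 'C'),
          GetDim(shape, format, 'H'), GetDim(shape, format, 'W')};
}

int main() {
  std::vector<int64_t> nhwc = {32, 224, 224, 3};  // batch, rows, cols, depth
  std::vector<int64_t> nchw = ToMklNchw(nhwc, "NHWC");  // {32, 3, 224, 224}
  for (int64_t d : nchw) std::cout << d << " ";
  std::cout << "\n";
  return 0;
}
```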
std::vector mkldnn_sizes(4, -1); @@ -138,10 +138,9 @@ class MklDnnConvUtil { // forward gets actual tensor as input). // // TODO(nhasabni): Add similar function for input and filter in MklShape. - virtual inline void - GetFilterSizeInMklOrder(const TensorShape& input_shape, - const TensorShape& filter_shape, - memory::dims *filter_dims) { + virtual inline void GetFilterSizeInMklOrder(const TensorShape& input_shape, + const TensorShape& filter_shape, + memory::dims* filter_dims) { CHECK_NOTNULL(filter_dims); OP_REQUIRES(context_, filter_shape.dims() == 4, @@ -149,17 +148,18 @@ class MklDnnConvUtil { filter_shape.DebugString())); for (int i = 0; i < 3; i++) { - OP_REQUIRES(context_, FastBoundsCheck(filter_shape.dim_size(i), - std::numeric_limits::max()), - errors::InvalidArgument("filter too large")); + OP_REQUIRES(context_, + FastBoundsCheck(filter_shape.dim_size(i), + std::numeric_limits::max()), + errors::InvalidArgument("filter too large")); } int input_depth = GetTensorDim(input_shape, data_format_, 'C'); - OP_REQUIRES( - context_, input_depth == filter_shape.dim_size(2), - errors::InvalidArgument("input and filter must have the same depth: ", - input_depth, " vs ", filter_shape.dim_size(2))); + OP_REQUIRES(context_, input_depth == filter_shape.dim_size(2), + errors::InvalidArgument( + "input and filter must have the same depth: ", input_depth, + " vs ", filter_shape.dim_size(2))); // TF filter is always in (rows, cols, in_depth, out_depth) order. int filter_rows = static_cast(filter_shape.dim_size(0)); @@ -182,25 +182,24 @@ class MklDnnConvUtil { // requires filter in OIHW format. Function does not return anything. // But errors arising from sanity checks are returned in context's // status. - virtual inline void - GetFilterSizeInMklOrder(size_t src_index, size_t filter_index, - memory::dims *filter_dims) { + virtual inline void GetFilterSizeInMklOrder(size_t src_index, + size_t filter_index, + memory::dims* filter_dims) { CHECK_NOTNULL(filter_dims); GetFilterSizeInMklOrder(GetTfShape(context_, src_index), - GetTfShape(context_, filter_index), - filter_dims); + GetTfShape(context_, filter_index), filter_dims); } // Calculate Bias size for 2D Convolution. Function does not return // anything, but sets error in context status. - virtual inline void - GetBiasSizeInMklOrder(size_t bias_index, memory::dims *bias_dims) { + virtual inline void GetBiasSizeInMklOrder(size_t bias_index, + memory::dims* bias_dims) { const Tensor& bias = MklGetInput(context_, bias_index); OP_REQUIRES(context_, bias.dims() == 1, errors::InvalidArgument("bias must be 1-dimensional: ", bias.shape().DebugString())); - *bias_dims = { static_cast(bias.dim_size(0)) }; + *bias_dims = {static_cast(bias.dim_size(0))}; } // Function to calculate output and padding size for 2D convolution. @@ -212,13 +211,11 @@ class MklDnnConvUtil { // status is returned via context status. // // TODO(nhasabni): Add similar function for input and filter in MklShape. 
- virtual inline void - GetOutputAndPadSizeInMklOrder(const TensorShape& input_shape, - const TensorShape& filter_shape, - const memory::dims& strides, - memory::dims *output_dims_tf_order, - memory::dims *output_dims_mkl_order, - memory::dims *pad_l, memory::dims *pad_r) { + virtual inline void GetOutputAndPadSizeInMklOrder( + const TensorShape& input_shape, const TensorShape& filter_shape, + const memory::dims& strides, memory::dims* output_dims_tf_order, + memory::dims* output_dims_mkl_order, memory::dims* pad_l, + memory::dims* pad_r) { CHECK_NOTNULL(output_dims_tf_order); CHECK_NOTNULL(output_dims_mkl_order); CHECK_NOTNULL(pad_l); @@ -244,16 +241,16 @@ class MklDnnConvUtil { int64 out_rows = 0, out_cols = 0; int64 pad_top = 0, pad_bottom = 0, pad_left, pad_right; - OP_REQUIRES_OK(context_, - GetWindowedOutputSizeVerbose(input_rows, filter_rows, stride_rows, - padding_, &out_rows, &pad_top, &pad_bottom)); - OP_REQUIRES_OK(context_, - GetWindowedOutputSizeVerbose(input_cols, filter_cols, stride_cols, - padding_, &out_cols, &pad_left, &pad_right)); + OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose( + input_rows, filter_rows, stride_rows, padding_, + &out_rows, &pad_top, &pad_bottom)); + OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose( + input_cols, filter_cols, stride_cols, padding_, + &out_cols, &pad_left, &pad_right)); // Tensorflow output is in data_format order. (NHWC or NCHW) - TensorShape out_shape = ShapeFromFormat(data_format_, out_batch, - out_rows, out_cols, out_depth); + TensorShape out_shape = + ShapeFromFormat(data_format_, out_batch, out_rows, out_cols, out_depth); *output_dims_tf_order = TFShapeToMklDnnDims(out_shape); // MKL-DNN always needs output in NCHW format. @@ -273,12 +270,10 @@ class MklDnnConvUtil { // See comment on GetConvOutputAndPadSizeInMklOrder for parameters. // // Function does not return anything, but sets error in context status. - inline void - GetOutputAndPadSizeInMklOrder(size_t src_index, size_t filter_index, - const memory::dims& strides, - memory::dims *output_dims_tf_order, - memory::dims *output_dims_mkl_order, - memory::dims *pad_l, memory::dims *pad_r) { + inline void GetOutputAndPadSizeInMklOrder( + size_t src_index, size_t filter_index, const memory::dims& strides, + memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order, + memory::dims* pad_l, memory::dims* pad_r) { CHECK_NOTNULL(output_dims_tf_order); CHECK_NOTNULL(output_dims_mkl_order); CHECK_NOTNULL(pad_l); @@ -289,11 +284,11 @@ class MklDnnConvUtil { OP_REQUIRES(context_, input_tf_shape.dims() == 4, errors::InvalidArgument("input must be 4-dimensional", - input_tf_shape.DebugString())); + input_tf_shape.DebugString())); - GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape, - strides, output_dims_tf_order, - output_dims_mkl_order, pad_l, pad_r); + GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape, strides, + output_dims_tf_order, output_dims_mkl_order, + pad_l, pad_r); } // Wrapper function to calculate input, filter, and output sizes of @@ -302,15 +297,12 @@ class MklDnnConvUtil { // also calculates strides and paddings for 2D Convolution. // // Function does not return anything, but sets error in context status. 
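GetOutputAndPadSizeInMklOrder above leaves the spatial arithmetic to GetWindowedOutputSizeVerbose. As a rough model (an assumption spelled out here, not text from this diff): with stride s, VALID padding gives out = ceil((in - filter + 1) / s) and no padding, while SAME gives out = ceil(in / s) and splits the required padding between the two sides, putting the extra element on the bottom/right. A small standalone sketch of that model:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

// Rough model of SAME/VALID output-size and padding arithmetic for a strided
// convolution window; not the actual TensorFlow helper.
struct WindowedOutput {
  int64_t size;
  int64_t pad_before;
  int64_t pad_after;
};

WindowedOutput ComputeWindowedOutput(int64_t in, int64_t filter,
                                     int64_t stride, bool same_padding) {
  WindowedOutput r{0, 0, 0};
  if (same_padding) {
    r.size = (in + stride - 1) / stride;  // ceil(in / stride)
    int64_t needed = std::max<int64_t>(0, (r.size - 1) * stride + filter - in);
    r.pad_before = needed / 2;            // smaller half on top/left
    r.pad_after = needed - r.pad_before;  // extra element on bottom/right
  } else {
    r.size = (in - filter + stride) / stride;  // ceil((in - filter + 1) / s)
  }
  return r;
}

int main() {
  // 224x224 input, 3x3 filter, stride 2.
  WindowedOutput same = ComputeWindowedOutput(224, 3, 2, true);
  WindowedOutput valid = ComputeWindowedOutput(224, 3, 2, false);
  std::cout << "SAME:  out=" << same.size << " pad=" << same.pad_before << "/"
            << same.pad_after << "\n";          // out=112 pad=0/1
  std::cout << "VALID: out=" << valid.size << "\n";  // out=111
  return 0;
}
```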
- inline void GetConvFwdSizesInMklOrder(const TensorShape& input_shape, - const TensorShape& filter_shape, - memory::dims *input_dims, - memory::dims *filter_dims, - memory::dims *strides, - memory::dims *output_dims_tf_order, - memory::dims *output_dims_mkl_order, - memory::dims *pad_l, - memory::dims *pad_r) { + inline void GetConvFwdSizesInMklOrder( + const TensorShape& input_shape, const TensorShape& filter_shape, + memory::dims* input_dims, memory::dims* filter_dims, + memory::dims* strides, memory::dims* output_dims_tf_order, + memory::dims* output_dims_mkl_order, memory::dims* pad_l, + memory::dims* pad_r) { CHECK_NOTNULL(input_dims); CHECK_NOTNULL(filter_dims); CHECK_NOTNULL(strides); @@ -325,8 +317,7 @@ class MklDnnConvUtil { if (!context_->status().ok()) return; GetStridesInMklOrder(strides); GetOutputAndPadSizeInMklOrder(input_shape, filter_shape, *strides, - output_dims_tf_order, - output_dims_mkl_order, + output_dims_tf_order, output_dims_mkl_order, pad_l, pad_r); if (!context_->status().ok()) return; } @@ -337,7 +328,7 @@ class MklDnnConvUtil { ///////////////////////////////////////////////////////////////////// template -class MklConv2DBackpropCommonOp : public OpKernel { +class MklConv2DBackpropCommonOp : public OpKernel { public: ~MklConv2DBackpropCommonOp() {} explicit MklConv2DBackpropCommonOp(OpKernelConstruction* context) @@ -397,12 +388,11 @@ class MklConv2DBackpropCommonOp : public OpKernel { outbprop_tf_shape.num_elements() == 0) { MklDnnShape output_mkl_shape; output_mkl_shape.SetMklTensor(false); - TensorShape output_tf_shape = GetOutputTfShape(input_tf_shape, - filter_tf_shape, - outbprop_tf_shape); + TensorShape output_tf_shape = GetOutputTfShape( + input_tf_shape, filter_tf_shape, outbprop_tf_shape); const int kOutputIdx = 0; AllocateOutputSetMklShape(context, kOutputIdx, &output_tensor, - output_tf_shape, output_mkl_shape); + output_tf_shape, output_mkl_shape); CHECK_NOTNULL(output_tensor); // if output tensor has more than 0 elements, we need to 0 them out. @@ -421,12 +411,10 @@ class MklConv2DBackpropCommonOp : public OpKernel { // Get forward convolution parameters. MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_); - conv_utl.GetConvFwdSizesInMklOrder(input_tf_shape, filter_tf_shape, - &fwd_input_dims, &fwd_filter_dims, - &strides, - &fwd_output_dims_tf_order, - &fwd_output_dims, - &padding_l, &padding_r); + conv_utl.GetConvFwdSizesInMklOrder( + input_tf_shape, filter_tf_shape, &fwd_input_dims, &fwd_filter_dims, + &strides, &fwd_output_dims_tf_order, &fwd_output_dims, &padding_l, + &padding_r); if (!context->status().ok()) return; // Create Convolution forward descriptor since Convolution backward @@ -437,20 +425,22 @@ class MklConv2DBackpropCommonOp : public OpKernel { // construct input TF layout. For TF layout, although input shape // required is in MKL-DNN order, the layout is Tensorflow's layout // (NHWC or NCHW depending on data format). - auto fwd_input_md = input_mkl_shape.IsMklTensor() ? - input_mkl_shape.GetMklLayout() : - memory::desc(fwd_input_dims, MklDnnType(), tf_fmt); + auto fwd_input_md = + input_mkl_shape.IsMklTensor() + ? input_mkl_shape.GetMklLayout() + : memory::desc(fwd_input_dims, MklDnnType(), tf_fmt); // If filter is in MKL layout, then simply grab filter layout; otherwise // construct filter in TF layout. For TF layout, filter is in HWIO format. - auto fwd_filter_md = filter_mkl_shape.IsMklTensor() ? 
- filter_mkl_shape.GetMklLayout() : - memory::desc(fwd_filter_dims, MklDnnType(), - memory::format::hwio); + auto fwd_filter_md = filter_mkl_shape.IsMklTensor() + ? filter_mkl_shape.GetMklLayout() + : memory::desc(fwd_filter_dims, MklDnnType(), + memory::format::hwio); // Tensorflow Output of Conv2D is in data_format order. auto fwd_out_md = memory::desc(fwd_output_dims, MklDnnType(), tf_fmt); - auto fwd_desc = convolution_forward::desc(prop_kind::forward, - convolution_direct, fwd_input_md, fwd_filter_md, fwd_out_md, - strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_)); + auto fwd_desc = convolution_forward::desc( + prop_kind::forward, convolution_direct, fwd_input_md, fwd_filter_md, + fwd_out_md, strides, padding_l, padding_r, + TFPaddingToMklDnnPadding(padding_)); auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine); // Create memory for user data. Describe how the inputs and outputs of @@ -495,17 +485,16 @@ class MklConv2DBackpropCommonOp : public OpKernel { // Operator-specific call to create and execute primitive. CreatePrimitive(context, cpu_engine, fwd_pd, &input, &filter, - &outbackprop, &output, &output_tensor, - strides, padding_l, padding_r, - TFPaddingToMklDnnPadding(padding_), + &outbackprop, &output, &output_tensor, strides, padding_l, + padding_r, TFPaddingToMklDnnPadding(padding_), bwd_output_dims, bwd_output_format); - } catch (mkldnn::error &e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, errors::Aborted("Operation received an exception:", - error_msg)); + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } @@ -523,11 +512,11 @@ class MklConv2DBackpropCommonOp : public OpKernel { /// Get TensorFlow shape of input tensor. virtual TensorShape MakeInputTfShape(OpKernelContext* context, - const Tensor& input_tensor) = 0; + const Tensor& input_tensor) = 0; /// Get TensorFlow shape of filter tensor. virtual TensorShape MakeFilterTfShape(OpKernelContext* context, - const Tensor& filter_tensor) = 0; + const Tensor& filter_tensor) = 0; /// Get the TensorFlow shape of output tensor. virtual TensorShape GetOutputTfShape(const TensorShape& input_shape, @@ -536,9 +525,9 @@ class MklConv2DBackpropCommonOp : public OpKernel { /// Get shape of output in MKL-DNN order. Computes shape of output from /// input shape (fwd_input_dims) and filter shape (fwd_filter_dims). - virtual - const memory::dims& GetOutputDims(const memory::dims& fwd_input_dims, - const memory::dims& fwd_filter_dims) = 0; + virtual const memory::dims& GetOutputDims( + const memory::dims& fwd_input_dims, + const memory::dims& fwd_filter_dims) = 0; /// Get data_format of output in MKL-DNN order. If output data format is /// same as input data format, then it simply returns value of data_format @@ -546,17 +535,18 @@ class MklConv2DBackpropCommonOp : public OpKernel { virtual memory::format GetOutputFormat(const memory::format data_format) = 0; /// Create and execute the primitive storing output in the output_tensor. 
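MklConv2DBackpropCommonOp keeps the shared Compute flow in the base class and defers the per-kernel pieces to pure virtual hooks (MakeInputTfShape, GetOutputDims, GetOutputFormat, CreatePrimitive), which the filter and input backprop kernels override. A minimal sketch of that template-method structure, with made-up class and method names and none of the MKL-DNN details:

```cpp
#include <iostream>
#include <vector>

// Skeleton of the "common Compute, per-kernel hooks" structure used above.
// All names here are illustrative, not the real kernel classes.
class BackpropCommon {
 public:
  virtual ~BackpropCommon() = default;

  // Shared driver: derive sizes, then let the subclass build and run its
  // specific primitive.
  void Compute(const std::vector<int>& input_sizes) {
    std::vector<int> output_dims = GetOutputDims(input_sizes);
    CreateAndRunPrimitive(output_dims);
  }

 protected:
  virtual std::vector<int> GetOutputDims(const std::vector<int>& input) = 0;
  virtual void CreateAndRunPrimitive(const std::vector<int>& output_dims) = 0;
};

class BackpropFilterLike : public BackpropCommon {
 protected:
  // For a filter gradient the output dims come from the filter shape; the
  // sketch just echoes the input to stay short.
  std::vector<int> GetOutputDims(const std::vector<int>& input) override {
    return input;
  }
  void CreateAndRunPrimitive(const std::vector<int>& output_dims) override {
    std::cout << "filter-gradient primitive over " << output_dims.size()
              << " dims\n";
  }
};

int main() {
  BackpropFilterLike op;
  op.Compute({64, 3, 5, 5});
  return 0;
}
```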
- virtual void CreatePrimitive(OpKernelContext* context, - const engine& cpu_engine, - const convolution_forward::primitive_desc& conv_fwd_pd, - MklDnnData* input, MklDnnData* filter, MklDnnData* outbackprop, - MklDnnData* output, Tensor** output_tensor, const memory::dims& strides, - const memory::dims& padding_l, const memory::dims& padding_r, - padding_kind padding, const memory::dims& bwd_output_dims, - memory::format bwd_output_format) = 0; + virtual void CreatePrimitive( + OpKernelContext* context, const engine& cpu_engine, + const convolution_forward::primitive_desc& conv_fwd_pd, + MklDnnData* input, MklDnnData* filter, MklDnnData* outbackprop, + MklDnnData* output, Tensor** output_tensor, + const memory::dims& strides, const memory::dims& padding_l, + const memory::dims& padding_r, padding_kind padding, + const memory::dims& bwd_output_dims, + memory::format bwd_output_format) = 0; // Get the data_format {NCHW, NHWC} - TensorFormat GetTFDataFormat () { return data_format_; } + TensorFormat GetTFDataFormat() { return data_format_; } private: std::vector strides_; @@ -575,12 +565,12 @@ class MklDummyOp : public OpKernel { public: ~MklDummyOp() {} - explicit MklDummyOp(OpKernelConstruction* context) : - OpKernel(context) {} + explicit MklDummyOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { - TF_CHECK_OK(errors::Unimplemented("This is a dummy op." - "It should not have been invoked.")); + TF_CHECK_OK( + errors::Unimplemented("This is a dummy op." + "It should not have been invoked.")); } }; diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc index 8340a91d059..0b6d838e098 100644 --- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc +++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc @@ -28,12 +28,12 @@ limitations under the License. #ifdef INTEL_MKL_DNN #include "mkldnn.hpp" -using mkldnn::stream; -using mkldnn::prop_kind; -using mkldnn::use_scale_shift; -using mkldnn::use_global_stats; -using mkldnn::batch_normalization_forward; using mkldnn::batch_normalization_backward; +using mkldnn::batch_normalization_forward; +using mkldnn::prop_kind; +using mkldnn::stream; +using mkldnn::use_global_stats; +using mkldnn::use_scale_shift; #endif // TODO(inteltf) Address comments from PR 8968. @@ -601,7 +601,7 @@ class MklFusedBatchNormGradOp : public OpKernel { mkl_res_batchnorm_bwd[dnnResourceSrc] = (mkl_convert_input) ? 
mkl_buf_converted_input : mkl_buf_input; - bool mkl_convert_out_backprop; + bool mkl_convert_out_backprop; dnnPrimitive_t mkl_prim_convert_out_backprop = nullptr; dnnLayout_t mkl_lt_internal_out_backprop = nullptr; void* mkl_buf_converted_out_backprop = nullptr; @@ -709,12 +709,11 @@ class MklFusedBatchNormOp : public OpKernel { const size_t kMeanIndex = 3; // index of est_mean tensor const size_t kVarianceIndex = 4; // index of est_variance tensor - const Tensor& src_tensor = MklGetInput(context, kSrcIndex); - const Tensor& scale_tensor = MklGetInput(context, kScaleIndex); - const Tensor& shift_tensor = MklGetInput(context, kShiftIndex); - const Tensor& est_mean_tensor = MklGetInput(context, kMeanIndex); - const Tensor& est_variance_tensor = MklGetInput(context, - kVarianceIndex); + const Tensor& src_tensor = MklGetInput(context, kSrcIndex); + const Tensor& scale_tensor = MklGetInput(context, kScaleIndex); + const Tensor& shift_tensor = MklGetInput(context, kShiftIndex); + const Tensor& est_mean_tensor = MklGetInput(context, kMeanIndex); + const Tensor& est_variance_tensor = MklGetInput(context, kVarianceIndex); TensorShape tf_shape_src; MklDnnShape dnn_shape_src; @@ -723,37 +722,34 @@ class MklFusedBatchNormOp : public OpKernel { if (dnn_shape_src.IsMklTensor()) { tf_shape_src = dnn_shape_src.GetTfShape(); OP_REQUIRES(context, dnn_shape_src.GetDimension() == 4, - errors::InvalidArgument( - "input must be 4-dimensional", - src_tensor.shape().DebugString())); + errors::InvalidArgument("input must be 4-dimensional", + src_tensor.shape().DebugString())); } else { tf_shape_src = src_tensor.shape(); OP_REQUIRES(context, src_tensor.dims() == 4, - errors::InvalidArgument( - "input must be 4-dimensional", - src_tensor.shape().DebugString())); + errors::InvalidArgument("input must be 4-dimensional", + src_tensor.shape().DebugString())); } OP_REQUIRES(context, scale_tensor.dims() == 1, - errors::InvalidArgument( - "scale must be 1-dimensional", - scale_tensor.shape().DebugString())); + errors::InvalidArgument("scale must be 1-dimensional", + scale_tensor.shape().DebugString())); OP_REQUIRES(context, shift_tensor.dims() == 1, errors::InvalidArgument("offset must be 1-dimensional", - shift_tensor.shape().DebugString())); - OP_REQUIRES(context, est_mean_tensor.dims() == 1, - errors::InvalidArgument( - "estimated_mean must be 1-dimensional", - est_mean_tensor.shape().DebugString())); - OP_REQUIRES(context, est_variance_tensor.dims() == 1, - errors::InvalidArgument( - "estimated_variance must be 1-dimensional", - est_variance_tensor.shape().DebugString())); + shift_tensor.shape().DebugString())); + OP_REQUIRES( + context, est_mean_tensor.dims() == 1, + errors::InvalidArgument("estimated_mean must be 1-dimensional", + est_mean_tensor.shape().DebugString())); + OP_REQUIRES( + context, est_variance_tensor.dims() == 1, + errors::InvalidArgument("estimated_variance must be 1-dimensional", + est_variance_tensor.shape().DebugString())); if (is_training_) { - OP_REQUIRES(context, est_mean_tensor.dim_size(0) == 0, - errors::InvalidArgument( - "estimated_mean must be empty for training", - est_mean_tensor.shape().DebugString())); + OP_REQUIRES( + context, est_mean_tensor.dim_size(0) == 0, + errors::InvalidArgument("estimated_mean must be empty for training", + est_mean_tensor.shape().DebugString())); OP_REQUIRES(context, est_variance_tensor.dim_size(0) == 0, errors::InvalidArgument( "estimated_variance must be empty for training", @@ -763,11 +759,9 @@ class MklFusedBatchNormOp : public OpKernel { // special case: 
input with 0 element and 0 batch size Tensor* dst_tensor = nullptr; if (tf_shape_src.num_elements() == 0) { - HandleEmptyInput(context, - tf_shape_src, - scale_tensor.shape(), - &dst_tensor); - return; + HandleEmptyInput(context, tf_shape_src, scale_tensor.shape(), + &dst_tensor); + return; } if (dnn_shape_src.IsMklTensor()) @@ -783,11 +777,8 @@ class MklFusedBatchNormOp : public OpKernel { Tensor* batch_variance_tensor = nullptr; Tensor* saved_mean_tensor = nullptr; Tensor* saved_variance_tensor = nullptr; - AllocateTFOutputs(context, - scale_tensor.shape(), - &batch_mean_tensor, - &batch_variance_tensor, - &saved_mean_tensor, + AllocateTFOutputs(context, scale_tensor.shape(), &batch_mean_tensor, + &batch_variance_tensor, &saved_mean_tensor, &saved_variance_tensor); if (is_training_) @@ -815,69 +806,63 @@ class MklFusedBatchNormOp : public OpKernel { src_dims = TFShapeToMklDnnDimsInNCHW(dnn_shape_src.GetTfShape(), tensor_format_); } else { - src_dims = TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), - tensor_format_); + src_dims = + TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), tensor_format_); } auto src_md = dnn_shape_src.IsMklTensor() - ? dnn_shape_src.GetMklLayout() - : memory::desc(src_dims, MklDnnType(), format_m); + ? dnn_shape_src.GetMklLayout() + : memory::desc(src_dims, MklDnnType(), format_m); src.SetUsrMem(src_md, &src_tensor); // set weights primitive // MKL-DNN packs scale & shift as "weights": // ...... - auto weights_desc = memory::desc({2, depth_}, - MklDnnType(), - memory::format::nc); + auto weights_desc = + memory::desc({2, depth_}, MklDnnType(), memory::format::nc); auto weights_pd = memory::primitive_desc(weights_desc, cpu_engine); auto weights_m = memory(weights_pd); - T* weights_data = reinterpret_cast( - weights_m.get_data_handle()); - T* scale_tf = reinterpret_cast( - const_cast(scale_tensor.flat().data())); - T* shift_tf = reinterpret_cast( - const_cast(shift_tensor.flat().data())); + T* weights_data = reinterpret_cast(weights_m.get_data_handle()); + T* scale_tf = + reinterpret_cast(const_cast(scale_tensor.flat().data())); + T* shift_tf = + reinterpret_cast(const_cast(shift_tensor.flat().data())); - for (int k=0; k < depth_; k++) { + for (int k = 0; k < depth_; k++) { weights_data[k] = scale_tf[k]; weights_data[k + depth_] = shift_tf[k]; } // set mean primitive - auto mean_desc = memory::desc({1, depth_}, - MklDnnType(), - memory::format::nc); + auto mean_desc = + memory::desc({1, depth_}, MklDnnType(), memory::format::nc); auto mean_pd = memory::primitive_desc(mean_desc, cpu_engine); - char* saved_mean_data_tf = reinterpret_cast - (saved_mean_tensor->flat().data()); - std::memcpy(saved_mean_data_tf, - reinterpret_cast(mean_values_), - depth_*sizeof(T)); - auto mean_m = memory(mean_pd, - reinterpret_cast(saved_mean_data_tf)); + char* saved_mean_data_tf = + reinterpret_cast(saved_mean_tensor->flat().data()); + std::memcpy(saved_mean_data_tf, reinterpret_cast(mean_values_), + depth_ * sizeof(T)); + auto mean_m = + memory(mean_pd, reinterpret_cast(saved_mean_data_tf)); // set variance primitive - auto variance_desc = memory::desc({1, depth_}, - MklDnnType(), - memory::format::nc); + auto variance_desc = + memory::desc({1, depth_}, MklDnnType(), memory::format::nc); auto variance_pd = memory::primitive_desc(variance_desc, cpu_engine); - char* saved_variance_data_tf = reinterpret_cast - (saved_variance_tensor->flat().data()); + char* saved_variance_data_tf = + reinterpret_cast(saved_variance_tensor->flat().data()); std::memcpy(saved_variance_data_tf, 
reinterpret_cast(variance_values_), - depth_*sizeof(T)); + depth_ * sizeof(T)); auto variance_m = memory(variance_pd, saved_variance_data_tf); - prop_kind pk = (is_training_) ? - prop_kind::forward_training : - prop_kind::forward_scoring; + prop_kind pk = (is_training_) ? prop_kind::forward_training + : prop_kind::forward_scoring; auto bnrm_fwd_desc = batch_normalization_forward::desc( - pk, src.GetUsrMemDesc(), epsilon_, - is_training_ ? use_scale_shift : - (use_scale_shift | use_global_stats)); + pk, src.GetUsrMemDesc(), epsilon_, + is_training_ ? use_scale_shift + : (use_scale_shift | use_global_stats)); auto bnrm_fwd_pd = batch_normalization_forward::primitive_desc( - bnrm_fwd_desc, cpu_engine); + bnrm_fwd_desc, cpu_engine); // allocate dst tensor MklDnnShape dnn_shape_dst; @@ -887,47 +872,39 @@ class MklFusedBatchNormOp : public OpKernel { auto dst_pd = bnrm_fwd_pd.dst_primitive_desc(); dnn_shape_dst.SetMklLayout(&dst_pd); dnn_shape_dst.SetElemType(MklDnnType()); - dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(), - src_dims, format_m); - tf_shape_dst.AddDim(dst_pd.get_size()/sizeof(T)); + dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(), src_dims, + format_m); + tf_shape_dst.AddDim(dst_pd.get_size() / sizeof(T)); } else { dnn_shape_dst.SetMklTensor(false); tf_shape_dst = src_tensor.shape(); } - AllocateOutputSetMklShape(context, kDstIndex, &dst_tensor, - tf_shape_dst, dnn_shape_dst); + AllocateOutputSetMklShape(context, kDstIndex, &dst_tensor, tf_shape_dst, + dnn_shape_dst); // Output of batchnorm has same shape as input. dst.SetUsrMem(src_md, dst_tensor); primitive bnrm_fwd_op; if (is_training_) { - bnrm_fwd_op = batch_normalization_forward( - bnrm_fwd_pd, - src.GetOpMem(), - weights_m, - dst.GetOpMem(), - mean_m, - variance_m); + bnrm_fwd_op = + batch_normalization_forward(bnrm_fwd_pd, src.GetOpMem(), weights_m, + dst.GetOpMem(), mean_m, variance_m); } else { bnrm_fwd_op = batch_normalization_forward( - bnrm_fwd_pd, - src.GetOpMem(), - mean_m, - variance_m, - (const primitive::at) weights_m, - dst.GetOpMem()); + bnrm_fwd_pd, src.GetOpMem(), mean_m, variance_m, + (const primitive::at)weights_m, dst.GetOpMem()); } std::vector net; net.push_back(bnrm_fwd_op); stream(stream::kind::eager).submit(net).wait(); // copy batch_mean data - T* batch_mean_data_tf = reinterpret_cast( - batch_mean_tensor->flat().data()); + T* batch_mean_data_tf = + reinterpret_cast(batch_mean_tensor->flat().data()); std::memcpy(reinterpret_cast(batch_mean_data_tf), reinterpret_cast(mean_m.get_data_handle()), - depth_*sizeof(T)); + depth_ * sizeof(T)); // copy batch_variance data with Bessel's correction // if training mode is on @@ -937,18 +914,17 @@ class MklFusedBatchNormOp : public OpKernel { size_t adjust_size = orig_size - 1; adjust_factor = (static_cast(orig_size)) / adjust_size; } - for (int k=0; k < depth_; k++) + for (int k = 0; k < depth_; k++) batch_variance_tensor->flat().data()[k] = - (reinterpret_cast(variance_m.get_data_handle()))[k] - * adjust_factor; - } catch (mkldnn::error &e) { + (reinterpret_cast(variance_m.get_data_handle()))[k] * + adjust_factor; + } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + 
errors::Aborted("Operation received an exception:", error_msg)); } } @@ -958,7 +934,7 @@ class MklFusedBatchNormOp : public OpKernel { bool is_training_; T* mean_values_; T* variance_values_; - size_t depth_; // batch normalization is done for per channel. + size_t depth_; // batch normalization is done for per channel. void ExtractParams(OpKernelContext* context) { const Tensor& input = MklGetInput(context, 0); @@ -966,23 +942,20 @@ class MklFusedBatchNormOp : public OpKernel { } void SetMeanVariance(const Tensor& mean, const Tensor& variance) { - mean_values_ = reinterpret_cast( - const_cast(mean.flat().data())); - variance_values_ = reinterpret_cast( - const_cast(variance.flat().data())); + mean_values_ = reinterpret_cast(const_cast(mean.flat().data())); + variance_values_ = + reinterpret_cast(const_cast(variance.flat().data())); } - void HandleEmptyInput(OpKernelContext* context, - TensorShape tf_shape_src, - TensorShape tf_shape_scale, - Tensor** dst_tensor) { + void HandleEmptyInput(OpKernelContext* context, TensorShape tf_shape_src, + TensorShape tf_shape_scale, Tensor** dst_tensor) { CHECK_NOTNULL(dst_tensor); const size_t kDstIndex = 0; MklDnnShape dnn_shape_dst; dnn_shape_dst.SetMklTensor(false); - AllocateOutputSetMklShape(context, kDstIndex, dst_tensor, - tf_shape_src, dnn_shape_dst); + AllocateOutputSetMklShape(context, kDstIndex, dst_tensor, tf_shape_src, + dnn_shape_dst); CHECK_NOTNULL(*dst_tensor); memset(const_cast((*dst_tensor)->tensor_data().data()), 0, (*dst_tensor)->tensor_data().size()); @@ -991,15 +964,12 @@ class MklFusedBatchNormOp : public OpKernel { Tensor* batch_variance_tensor = nullptr; Tensor* saved_mean_tensor = nullptr; Tensor* saved_variance_tensor = nullptr; - AllocateTFOutputs(context, tf_shape_scale, - &batch_mean_tensor, - &batch_variance_tensor, - &saved_mean_tensor, + AllocateTFOutputs(context, tf_shape_scale, &batch_mean_tensor, + &batch_variance_tensor, &saved_mean_tensor, &saved_variance_tensor); } - void AllocateTFOutputs(OpKernelContext* context, - TensorShape tf_shape_scale, + void AllocateTFOutputs(OpKernelContext* context, TensorShape tf_shape_scale, Tensor** batch_mean_tensor, Tensor** batch_variance_tensor, Tensor** saved_mean_tensor, @@ -1017,51 +987,43 @@ class MklFusedBatchNormOp : public OpKernel { // allocate batch mean output tensor MklDnnShape mkl_shape_batch_mean; mkl_shape_batch_mean.SetMklTensor(false); - AllocateOutputSetMklShape(context, - kBatchMeanIndex, - batch_mean_tensor, - tf_shape_scale, - mkl_shape_batch_mean); + AllocateOutputSetMklShape(context, kBatchMeanIndex, batch_mean_tensor, + tf_shape_scale, mkl_shape_batch_mean); CHECK_NOTNULL(*batch_mean_tensor); // set NAN mean value in case of empty input tensor - for (int k=0; k < tf_shape_scale.num_elements(); k++) + for (int k = 0; k < tf_shape_scale.num_elements(); k++) (*batch_mean_tensor)->flat().data()[k] = NAN; // allocate batch variance output tensor MklDnnShape mkl_shape_batch_variance; mkl_shape_batch_variance.SetMklTensor(false); - AllocateOutputSetMklShape(context, - kBatchVarianceIndex, - batch_variance_tensor, - tf_shape_scale, + AllocateOutputSetMklShape(context, kBatchVarianceIndex, + batch_variance_tensor, tf_shape_scale, mkl_shape_batch_variance); CHECK_NOTNULL(*batch_variance_tensor); // set NAN variance value in case of empty input tensor - for (int k=0; k < tf_shape_scale.num_elements(); k++) + for (int k = 0; k < tf_shape_scale.num_elements(); k++) (*batch_variance_tensor)->flat().data()[k] = NAN; // Mean and variance (without Bessel's correction) 
saved for backward // computation to serve as pre-computed mean and variance. MklDnnShape mkl_shape_saved_mean; mkl_shape_saved_mean.SetMklTensor(false); - AllocateOutputSetMklShape(context, kSavedMeanIndex, - saved_mean_tensor, - tf_shape_scale, - mkl_shape_saved_mean); + AllocateOutputSetMklShape(context, kSavedMeanIndex, saved_mean_tensor, + tf_shape_scale, mkl_shape_saved_mean); CHECK_NOTNULL(*saved_mean_tensor); // set NAN mean value in case of empty input tensor - for (int k=0; k < tf_shape_scale.num_elements(); k++) + for (int k = 0; k < tf_shape_scale.num_elements(); k++) (*saved_mean_tensor)->flat().data()[k] = NAN; MklDnnShape mkl_shape_saved_variance; mkl_shape_saved_variance.SetMklTensor(false); AllocateOutputSetMklShape(context, kSavedVarianceIndex, - saved_variance_tensor, - tf_shape_scale, + saved_variance_tensor, tf_shape_scale, mkl_shape_saved_variance); CHECK_NOTNULL(*saved_variance_tensor); // set NAN variance value in case of empty input tensor - for (int k=0; k < tf_shape_scale.num_elements(); k++) + for (int k = 0; k < tf_shape_scale.num_elements(); k++) (*saved_variance_tensor)->flat().data()[k] = NAN; } }; @@ -1093,8 +1055,8 @@ class MklFusedBatchNormGradOp : public OpKernel { const Tensor& src_tensor = MklGetInput(context, kSrcIndex); const Tensor& scale_tensor = MklGetInput(context, kScaleIndex); const Tensor& saved_mean_tensor = MklGetInput(context, kMeanIndex); - const Tensor& saved_variance_tensor = MklGetInput(context, - kVarianceIndex); + const Tensor& saved_variance_tensor = + MklGetInput(context, kVarianceIndex); MklDnnShape dnn_shape_src, dnn_shape_diff_dst; GetMklShape(context, kSrcIndex, &dnn_shape_src); @@ -1103,53 +1065,49 @@ class MklFusedBatchNormGradOp : public OpKernel { if (dnn_shape_diff_dst.IsMklTensor()) { tf_shape_diff_dst = dnn_shape_diff_dst.GetTfShape(); - OP_REQUIRES(context, dnn_shape_diff_dst.GetDimension() == 4, - errors::InvalidArgument( - "input must be 4-dimensional", - diff_dst_tensor.shape().DebugString())); + OP_REQUIRES( + context, dnn_shape_diff_dst.GetDimension() == 4, + errors::InvalidArgument("input must be 4-dimensional", + diff_dst_tensor.shape().DebugString())); } else { tf_shape_diff_dst = diff_dst_tensor.shape(); - OP_REQUIRES(context, diff_dst_tensor.dims() == 4, - errors::InvalidArgument( - "input must be 4-dimensional", - diff_dst_tensor.shape().DebugString())); + OP_REQUIRES( + context, diff_dst_tensor.dims() == 4, + errors::InvalidArgument("input must be 4-dimensional", + diff_dst_tensor.shape().DebugString())); } if (dnn_shape_src.IsMklTensor()) { tf_shape_src = dnn_shape_src.GetTfShape(); OP_REQUIRES(context, dnn_shape_src.GetDimension() == 4, - errors::InvalidArgument( - "input must be 4-dimensional", - src_tensor.shape().DebugString())); + errors::InvalidArgument("input must be 4-dimensional", + src_tensor.shape().DebugString())); } else { tf_shape_src = src_tensor.shape(); OP_REQUIRES(context, src_tensor.dims() == 4, - errors::InvalidArgument( - "input must be 4-dimensional", - src_tensor.shape().DebugString())); + errors::InvalidArgument("input must be 4-dimensional", + src_tensor.shape().DebugString())); } OP_REQUIRES(context, scale_tensor.dims() == 1, - errors::InvalidArgument( - "scale must be 1-dimensional", - scale_tensor.shape().DebugString())); - OP_REQUIRES(context, saved_mean_tensor.dims() == 1, - errors::InvalidArgument( - "saved mean must be 1-dimensional", - saved_mean_tensor.shape().DebugString())); + errors::InvalidArgument("scale must be 1-dimensional", + scale_tensor.shape().DebugString())); + 
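// Aside (illustrative only, not part of this kernel): the forward op above
// rescales MKL-DNN's population variance by N/(N-1) (Bessel's correction)
// before emitting it as batch_variance, where N is the per-channel sample
// count (presumably the input element count divided by the channel depth —
// that definition is not visible in this hunk). A minimal standalone sketch
// of that adjustment, with hypothetical names:
#include <cstddef>
#include <vector>

// Returns the unbiased (sample) variance per channel given the biased
// (population) variance produced by the batch-norm primitive.
std::vector<float> ApplyBesselCorrection(const std::vector<float>& pop_var,
                                         std::size_t num_elements,
                                         std::size_t depth) {
  std::vector<float> out(pop_var);
  const std::size_t n = depth > 0 ? num_elements / depth : 0;  // values per channel
  if (n > 1) {
    const float adjust = static_cast<float>(n) / static_cast<float>(n - 1);
    for (float& v : out) v *= adjust;  // variance * N/(N-1)
  }
  return out;
}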
OP_REQUIRES( + context, saved_mean_tensor.dims() == 1, + errors::InvalidArgument("saved mean must be 1-dimensional", + saved_mean_tensor.shape().DebugString())); - OP_REQUIRES(context, saved_variance_tensor.dims() == 1, - errors::InvalidArgument( - "saved variance must be 1-dimensional", - saved_variance_tensor.shape().DebugString())); + OP_REQUIRES( + context, saved_variance_tensor.dims() == 1, + errors::InvalidArgument("saved variance must be 1-dimensional", + saved_variance_tensor.shape().DebugString())); Tensor* diff_src_tensor = nullptr; if (tf_shape_src.num_elements() == 0 || tf_shape_diff_dst.num_elements() == 0) { - HandleEmptyInput(context, tf_shape_src, - scale_tensor.shape(), - &diff_src_tensor); - return; + HandleEmptyInput(context, tf_shape_src, scale_tensor.shape(), + &diff_src_tensor); + return; } if (dnn_shape_src.IsMklTensor()) @@ -1175,20 +1133,18 @@ class MklFusedBatchNormGradOp : public OpKernel { memory::dims src_dims, diff_dst_dims; if (dnn_shape_src.IsMklTensor()) - src_dims = TFShapeToMklDnnDimsInNCHW( - dnn_shape_src.GetTfShape(), tensor_format_); + src_dims = TFShapeToMklDnnDimsInNCHW(dnn_shape_src.GetTfShape(), + tensor_format_); else - src_dims = TFShapeToMklDnnDimsInNCHW( - src_tensor.shape(), tensor_format_); + src_dims = + TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), tensor_format_); if (dnn_shape_diff_dst.IsMklTensor()) diff_dst_dims = TFShapeToMklDnnDimsInNCHW( - dnn_shape_diff_dst.GetTfShape(), - tensor_format_); + dnn_shape_diff_dst.GetTfShape(), tensor_format_); else - diff_dst_dims = TFShapeToMklDnnDimsInNCHW( - diff_dst_tensor.shape(), - tensor_format_); + diff_dst_dims = + TFShapeToMklDnnDimsInNCHW(diff_dst_tensor.shape(), tensor_format_); // set src and diff_dst primitives memory::desc src_md({}, memory::data_undef, memory::format_undef); @@ -1202,7 +1158,7 @@ class MklFusedBatchNormGradOp : public OpKernel { src_md = diff_dst_md; } } else { - src_md = memory::desc(src_dims, MklDnnType(), format_m); + src_md = memory::desc(src_dims, MklDnnType(), format_m); diff_dst_md = src_md; } src.SetUsrMem(src_md, &src_tensor); @@ -1210,55 +1166,47 @@ class MklFusedBatchNormGradOp : public OpKernel { // weights -- DNN packs scales/shifts as weights in order of // scale, ..., scale, shift, ..., shift - auto weights_desc = memory::desc({2, depth_}, - MklDnnType(), - memory::format::nc); + auto weights_desc = + memory::desc({2, depth_}, MklDnnType(), memory::format::nc); auto weights_pd = memory::primitive_desc(weights_desc, cpu_engine); auto weights_m = memory(weights_pd); T* weights_data = reinterpret_cast(weights_m.get_data_handle()); - T* scale_tf = reinterpret_cast(const_cast - (scale_tensor.flat().data())); - for (int k=0; k < depth_; k++) { + T* scale_tf = + reinterpret_cast(const_cast(scale_tensor.flat().data())); + for (int k = 0; k < depth_; k++) { weights_data[k] = scale_tf[k]; weights_data[k + depth_] = 0; } // set mean primitive memory::dims mv_dims = GetMeanVarianceDims(); - mean.SetUsrMem(mv_dims, - memory::format::nc, - const_cast(static_cast - (saved_mean_tensor.flat().data()))); + mean.SetUsrMem(mv_dims, memory::format::nc, + const_cast(static_cast( + saved_mean_tensor.flat().data()))); mean.SetOpMemDesc(mv_dims, memory::format::nc); // set variance primitive - variance.SetUsrMem(mv_dims, memory::format::nc, - const_cast(static_cast - (saved_variance_tensor.flat().data()))); + variance.SetUsrMem(mv_dims, memory::format::nc, + const_cast(static_cast( + saved_variance_tensor.flat().data()))); variance.SetOpMemDesc(mv_dims, memory::format::nc); // 
set diff_weight primitive - auto diff_weights_desc = memory::desc( - {2, depth_}, - MklDnnType(), - memory::format::nc); - auto diff_weights_pd = memory::primitive_desc( - diff_weights_desc, - cpu_engine); + auto diff_weights_desc = + memory::desc({2, depth_}, MklDnnType(), memory::format::nc); + auto diff_weights_pd = + memory::primitive_desc(diff_weights_desc, cpu_engine); auto diff_weights_m = memory(diff_weights_pd); auto bnrm_fwd_desc = batch_normalization_forward::desc( - prop_kind::forward_training, - src.GetUsrMemDesc(), - epsilon_, - is_training_ ? use_scale_shift : - (use_scale_shift | use_global_stats)); + prop_kind::forward_training, src.GetUsrMemDesc(), epsilon_, + is_training_ ? use_scale_shift + : (use_scale_shift | use_global_stats)); auto bnrm_fwd_pd = batch_normalization_forward::primitive_desc( - bnrm_fwd_desc, - cpu_engine); + bnrm_fwd_desc, cpu_engine); // Indices of output tensors - const size_t kDiffSrcIndex = 0; // index of diff_src tensor + const size_t kDiffSrcIndex = 0; // index of diff_src tensor // allocate diff_src tensor MklDnnShape dnn_shape_diff_src; @@ -1268,14 +1216,11 @@ class MklFusedBatchNormGradOp : public OpKernel { auto diff_src_pd = bnrm_fwd_pd.dst_primitive_desc(); dnn_shape_diff_src.SetMklLayout(&diff_src_pd); dnn_shape_diff_src.SetElemType(MklDnnType()); - dnn_shape_diff_src.SetTfLayout( - dnn_shape_src.GetDimension(), - src_dims, - format_m); - dnn_shape_diff_src.SetTfDimOrder( - dnn_shape_src.GetDimension(), - tensor_format_); - tf_shape_diff_src.AddDim(diff_src_pd.get_size()/sizeof(T)); + dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(), src_dims, + format_m); + dnn_shape_diff_src.SetTfDimOrder(dnn_shape_src.GetDimension(), + tensor_format_); + tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T)); } else { dnn_shape_diff_src.SetMklTensor(false); tf_shape_diff_src = src_tensor.shape(); @@ -1287,33 +1232,22 @@ class MklFusedBatchNormGradOp : public OpKernel { prop_kind pk = prop_kind::backward; auto bnrm_bwd_desc = batch_normalization_backward::desc( - pk, - diff_src.GetUsrMemDesc(), - src.GetUsrMemDesc(), - epsilon_, - /* for inference, specify use_global_stats - 1. on fwd prop, use mean and variance - provided as inputs - 2. on bwd prop, mean and variance are - considered as constants. Thus, - reduce the amout of MKL computations - */ - is_training_ ? use_scale_shift : - (use_scale_shift | use_global_stats)); + pk, diff_src.GetUsrMemDesc(), src.GetUsrMemDesc(), epsilon_, + /* for inference, specify use_global_stats + 1. on fwd prop, use mean and variance + provided as inputs + 2. on bwd prop, mean and variance are + considered as constants. Thus, + reduce the amout of MKL computations + */ + is_training_ ? 
use_scale_shift + : (use_scale_shift | use_global_stats)); auto bnrm_bwd_pd = batch_normalization_backward::primitive_desc( - bnrm_bwd_desc, - cpu_engine, - bnrm_fwd_pd); + bnrm_bwd_desc, cpu_engine, bnrm_fwd_pd); auto bnrm_bwd_op = batch_normalization_backward( - bnrm_bwd_pd, - src.GetOpMem(), - mean.GetOpMem(), - variance.GetOpMem(), - diff_dst.GetOpMem(), - weights_m, - diff_src.GetOpMem(), - diff_weights_m); + bnrm_bwd_pd, src.GetOpMem(), mean.GetOpMem(), variance.GetOpMem(), + diff_dst.GetOpMem(), weights_m, diff_src.GetOpMem(), diff_weights_m); std::vector net; net.push_back(bnrm_bwd_op); @@ -1322,43 +1256,39 @@ class MklFusedBatchNormGradOp : public OpKernel { // allocate 4 output TF tensors Tensor* diff_scale_tensor = nullptr; Tensor* diff_shift_tensor = nullptr; - AllocateTFOutputs(context, scale_tensor.shape(), - &diff_scale_tensor, + AllocateTFOutputs(context, scale_tensor.shape(), &diff_scale_tensor, &diff_shift_tensor); // copy data: diff_scale and diff_shift - T* diff_weights_data_dnn = reinterpret_cast - (diff_weights_m.get_data_handle()); + T* diff_weights_data_dnn = + reinterpret_cast(diff_weights_m.get_data_handle()); for (int i = 0; i < depth_; i++) { - diff_scale_tensor->flat().data()[i] = - diff_weights_data_dnn[i]; + diff_scale_tensor->flat().data()[i] = diff_weights_data_dnn[i]; diff_shift_tensor->flat().data()[i] = - diff_weights_data_dnn[i + depth_]; + diff_weights_data_dnn[i + depth_]; } - } catch (mkldnn::error &e) { + } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } private: T epsilon_; TensorFormat tensor_format_; - int depth_; // batch normalization is done for per channel. + int depth_; // batch normalization is done for per channel. 
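// Aside (illustrative only): MKL-DNN packs scale and shift — and their
// gradients — into a single {2, depth} "weights" memory: entries [0, depth)
// hold scale (or diff_scale) and entries [depth, 2*depth) hold shift (or
// diff_shift), which is exactly how the loops above pack scale_tf and unpack
// diff_weights_data_dnn. A minimal sketch of the unpacking step, with
// hypothetical names:
#include <cstddef>

// Splits a packed diff_weights buffer of length 2*depth into the two
// per-channel gradient buffers expected by TensorFlow.
void UnpackDiffWeights(const float* diff_weights, std::size_t depth,
                       float* diff_scale, float* diff_shift) {
  for (std::size_t k = 0; k < depth; ++k) {
    diff_scale[k] = diff_weights[k];          // first half: d(loss)/d(scale)
    diff_shift[k] = diff_weights[k + depth];  // second half: d(loss)/d(shift)
  }
}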
bool is_training_; void ExtractParams(OpKernelContext* context) { - const Tensor& input = MklGetInput(context, 0); - depth_ = static_cast(GetTensorDim(input, tensor_format_, 'C')); + const Tensor& input = MklGetInput(context, 0); + depth_ = static_cast(GetTensorDim(input, tensor_format_, 'C')); } - void HandleEmptyInput(OpKernelContext* context, - TensorShape tf_shape_src, + void HandleEmptyInput(OpKernelContext* context, TensorShape tf_shape_src, TensorShape tf_shape_scale_shift, Tensor** diff_src_tensor) { const size_t kDiffSrcIndex = 0; @@ -1366,22 +1296,20 @@ class MklFusedBatchNormGradOp : public OpKernel { MklDnnShape dnn_shape_diff_src; dnn_shape_diff_src.SetMklTensor(false); AllocateOutputSetMklShape(context, kDiffSrcIndex, diff_src_tensor, - tf_shape_src, dnn_shape_diff_src); - for (size_t i=0; i < (*diff_src_tensor)->shape().num_elements(); i++) - (*diff_src_tensor)->flat().data()[i] = 0; + tf_shape_src, dnn_shape_diff_src); + for (size_t i = 0; i < (*diff_src_tensor)->shape().num_elements(); i++) + (*diff_src_tensor)->flat().data()[i] = 0; Tensor* diff_scale_tensor = nullptr; Tensor* diff_shift_tensor = nullptr; - AllocateTFOutputs(context, - tf_shape_scale_shift, - &diff_scale_tensor, + AllocateTFOutputs(context, tf_shape_scale_shift, &diff_scale_tensor, &diff_shift_tensor); } void AllocateTFOutputs(OpKernelContext* context, - TensorShape tf_shape_scale_shift, - Tensor** diff_scale_tensor, - Tensor** diff_shift_tensor) { + TensorShape tf_shape_scale_shift, + Tensor** diff_scale_tensor, + Tensor** diff_shift_tensor) { CHECK_NOTNULL(diff_scale_tensor); CHECK_NOTNULL(diff_shift_tensor); @@ -1396,31 +1324,29 @@ class MklFusedBatchNormGradOp : public OpKernel { AllocateOutputSetMklShape(context, kDiffScaleIndex, diff_scale_tensor, tf_shape_scale_shift, mkl_shape_diff_scale); CHECK_NOTNULL(*diff_scale_tensor); - for (size_t i=0; i < (*diff_scale_tensor)->shape().num_elements(); i++) - (*diff_scale_tensor)->flat().data()[i] = 0; + for (size_t i = 0; i < (*diff_scale_tensor)->shape().num_elements(); i++) + (*diff_scale_tensor)->flat().data()[i] = 0; MklDnnShape mkl_shape_diff_shift; mkl_shape_diff_shift.SetMklTensor(false); AllocateOutputSetMklShape(context, kDiffShiftIndex, diff_shift_tensor, tf_shape_scale_shift, mkl_shape_diff_shift); CHECK_NOTNULL(*diff_shift_tensor); - for (size_t i=0; i < (*diff_shift_tensor)->shape().num_elements(); i++) - (*diff_shift_tensor)->flat().data()[i] = 0; + for (size_t i = 0; i < (*diff_shift_tensor)->shape().num_elements(); i++) + (*diff_shift_tensor)->flat().data()[i] = 0; // Placeholders for estimated_mean and estimated_variance, which are // used for inference and thus not needed here for gradient computation. 
- Tensor* p1_tensor = nullptr, *p2_tensor = nullptr; + Tensor *p1_tensor = nullptr, *p2_tensor = nullptr; MklDnnShape mkl_shape_p; mkl_shape_p.SetMklTensor(false); - AllocateOutputSetMklShape(context, kP1Index, &p1_tensor, - TensorShape({}), mkl_shape_p); - AllocateOutputSetMklShape(context, kP2Index, &p2_tensor, - TensorShape({}), mkl_shape_p); + AllocateOutputSetMklShape(context, kP1Index, &p1_tensor, TensorShape({}), + mkl_shape_p); + AllocateOutputSetMklShape(context, kP2Index, &p2_tensor, TensorShape({}), + mkl_shape_p); } - memory::dims GetMeanVarianceDims() { - return memory::dims({1, depth_}); - } + memory::dims GetMeanVarianceDims() { return memory::dims({1, depth_}); } }; #endif diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc index 4b5f7b83100..73d41efce19 100644 --- a/tensorflow/core/kernels/mkl_input_conversion_op.cc +++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc @@ -271,8 +271,8 @@ class MklInputConversionOp : public OpKernel { MklDnnShape input_shape_1; GetMklShape(context, 1, &input_shape_1); - bool tf_shapes_are_same = context->input(0).shape() == - context->input(1).shape(); + bool tf_shapes_are_same = + context->input(0).shape() == context->input(1).shape(); VLOG(1) << "MklInputConversionOp: Input shapes are " << (tf_shapes_are_same ? "*same*" : "*different*") << ": " @@ -400,9 +400,9 @@ class MklInputConversionOp : public OpKernel { // Create reorder between tensorflow layout and Mkl layout. std::vector net; - CHECK_EQ(tf_input.CheckReorderToOpMem(memory::primitive_desc( - output_mkl_md, cpu_engine), - tensor_out, &net), + CHECK_EQ(tf_input.CheckReorderToOpMem( + memory::primitive_desc(output_mkl_md, cpu_engine), + tensor_out, &net), true); stream(stream::kind::eager).submit(net).wait(); diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc index 95e0404ba8a..a8b45004b79 100644 --- a/tensorflow/core/kernels/mkl_lrn_op.cc +++ b/tensorflow/core/kernels/mkl_lrn_op.cc @@ -22,6 +22,9 @@ limitations under the License. #define EIGEN_USE_THREADS #include +#include "mkl_dnn.h" +#include "mkl_dnn_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" @@ -30,9 +33,6 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/util/mkl_util.h" #include "tensorflow/core/util/tensor_format.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "mkl_dnn.h" -#include "mkl_dnn_types.h" #if !defined(IS_MOBILE_PLATFORM) #include "tensorflow/core/util/work_sharder.h" @@ -40,10 +40,10 @@ limitations under the License. 
#ifdef INTEL_MKL_DNN #include "mkldnn.hpp" -using mkldnn::lrn_forward; -using mkldnn::lrn_backward; -using mkldnn::prop_kind; using mkldnn::lrn_across_channels; +using mkldnn::lrn_backward; +using mkldnn::lrn_forward; +using mkldnn::prop_kind; using mkldnn::stream; #endif @@ -77,10 +77,11 @@ class MklLRNOp : public OpKernel { explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) { int64 depth_radius64; OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64)); - OP_REQUIRES(context, FastBoundsCheck(depth_radius64, - std::numeric_limits::max()), - errors::InvalidArgument("depth_radius = ", depth_radius64, - " larger than int max")); + OP_REQUIRES( + context, + FastBoundsCheck(depth_radius64, std::numeric_limits::max()), + errors::InvalidArgument("depth_radius = ", depth_radius64, + " larger than int max")); depth_radius_ = static_cast(depth_radius64); OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_)); @@ -103,9 +104,10 @@ class MklLRNOp : public OpKernel { : input.dims(); OP_REQUIRES(context, mkl_context.in_dims == 4, errors::InvalidArgument("input must be 4-dimensional")); - OP_REQUIRES(context, FastBoundsCheck(input.NumElements(), - std::numeric_limits::max()), - errors::InvalidArgument("argument to LRN too large")); + OP_REQUIRES( + context, + FastBoundsCheck(input.NumElements(), std::numeric_limits::max()), + errors::InvalidArgument("argument to LRN too large")); if (!input_in_mkl_format) { mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_, @@ -339,17 +341,17 @@ class MklLRNOp : public OpKernel { float beta_; }; - template class MklLRNGradOp : public OpKernel { public: explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) { int64 depth_radius64; OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64)); - OP_REQUIRES(context, FastBoundsCheck(depth_radius64, - std::numeric_limits::max()), - errors::InvalidArgument("depth_radius = ", depth_radius64, - " larger than int max")); + OP_REQUIRES( + context, + FastBoundsCheck(depth_radius64, std::numeric_limits::max()), + errors::InvalidArgument("depth_radius = ", depth_radius64, + " larger than int max")); depth_radius_ = static_cast(depth_radius64); OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_)); OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_)); @@ -740,10 +742,11 @@ class MklLRNOp : public OpKernel { explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) { int64 depth_radius64; OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64)); - OP_REQUIRES(context, FastBoundsCheck(depth_radius64, - std::numeric_limits::max()), - errors::InvalidArgument("depth_radius = ", depth_radius64, - " larger than int max")); + OP_REQUIRES( + context, + FastBoundsCheck(depth_radius64, std::numeric_limits::max()), + errors::InvalidArgument("depth_radius = ", depth_radius64, + " larger than int max")); depth_radius_ = static_cast(depth_radius64); OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_)); @@ -773,10 +776,10 @@ class MklLRNOp : public OpKernel { if (!src_dnn_shape.IsMklTensor()) { MklDefaultToEigen(context, src_tensor); return; - } else if (!src_dnn_shape.IsMklChannelDim( - src_dnn_shape.GetDimension() - 1) ) { + } else if (!src_dnn_shape.IsMklChannelDim(src_dnn_shape.GetDimension() - + 1)) { Tensor converted_tensor = - ConvertMklToTF(context, src_tensor, src_dnn_shape); + ConvertMklToTF(context, src_tensor, src_dnn_shape); MklDefaultToEigen(context, converted_tensor); return; } @@ -807,18 +810,16 @@ 
class MklLRNOp : public OpKernel { // Create LRN primitive descriptor. // Tensorflow's normalization semantics is across channels. // MKL-DNN also supports normalization within channel. - auto lrn_desc = lrn_forward::desc(prop_kind::forward, - lrn_across_channels, + auto lrn_desc = lrn_forward::desc(prop_kind::forward, lrn_across_channels, src_dnn_data.GetUsrMemDesc(), - kernel_size, - new_alpha, beta_, bias_); + kernel_size, new_alpha, beta_, bias_); auto lrn_prim_desc = lrn_forward::primitive_desc(lrn_desc, cpu_engine); // Allocate output_dnn_data tensor. Tensor* output_tensor = nullptr; memory::format input_format = src_dnn_shape.GetTfDataFormat(); - AllocateOutputTensor(context, lrn_prim_desc, input_dims, - input_format, &output_tensor); + AllocateOutputTensor(context, lrn_prim_desc, input_dims, input_format, + &output_tensor); OP_REQUIRES_OK(context, context->status()); CHECK_NOTNULL(output_tensor); dst_dnn_data.SetUsrMemDataHandle(output_tensor); @@ -827,25 +828,23 @@ class MklLRNOp : public OpKernel { AllocateWorkspaceTensor(context, lrn_prim_desc, &workspace_dnn_data); OP_REQUIRES_OK(context, context->status()); - PrepareAndExecuteNet(lrn_prim_desc, &src_dnn_data, - &dst_dnn_data, &workspace_dnn_data); - } catch (mkldnn::error &e) { + PrepareAndExecuteNet(lrn_prim_desc, &src_dnn_data, &dst_dnn_data, + &workspace_dnn_data); + } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } private: - void PrepareAndExecuteNet( - const lrn_forward::primitive_desc& lrn_fwd_desc, - MklDnnData* src_dnn_data, - MklDnnData* dst_dnn_data, - MklDnnData* wksp_dnn_data = nullptr) { + void PrepareAndExecuteNet(const lrn_forward::primitive_desc& lrn_fwd_desc, + MklDnnData* src_dnn_data, + MklDnnData* dst_dnn_data, + MklDnnData* wksp_dnn_data = nullptr) { std::vector net; // Check for input reorder @@ -853,23 +852,21 @@ class MklLRNOp : public OpKernel { // Create pooling primitive and add it to net if (wksp_dnn_data != nullptr) { - net.push_back(lrn_forward(lrn_fwd_desc, - src_dnn_data->GetOpMem(), - wksp_dnn_data->GetOpMem(), - dst_dnn_data->GetOpMem())); + net.push_back(lrn_forward(lrn_fwd_desc, src_dnn_data->GetOpMem(), + wksp_dnn_data->GetOpMem(), + dst_dnn_data->GetOpMem())); } else { - net.push_back(lrn_forward(lrn_fwd_desc, - src_dnn_data->GetOpMem(), - dst_dnn_data->GetOpMem())); + net.push_back(lrn_forward(lrn_fwd_desc, src_dnn_data->GetOpMem(), + dst_dnn_data->GetOpMem())); } stream(stream::kind::eager).submit(net).wait(); } - void AllocateOutputTensor(OpKernelContext* context, - const lrn_forward::primitive_desc& lrn_fwd_prim_desc, - const memory::dims output_dims_mkl_order, - const memory::format& output_tf_format, - Tensor** output_tensor) { + void AllocateOutputTensor( + OpKernelContext* context, + const lrn_forward::primitive_desc& lrn_fwd_prim_desc, + const memory::dims output_dims_mkl_order, + const memory::format& output_tf_format, Tensor** output_tensor) { CHECK_NOTNULL(output_tensor); memory::primitive_desc dst_pd = lrn_fwd_prim_desc.dst_primitive_desc(); @@ -880,111 +877,106 @@ class MklLRNOp : public OpKernel { 
output_mkl_shape.SetMklLayout(&dst_pd); output_mkl_shape.SetElemType(MklDnnType()); output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), - output_dims_mkl_order, - output_tf_format); + output_dims_mkl_order, output_tf_format); TensorShape output_tf_shape; // only allocate enough space for the elements we need. size_t num_bytes = dst_pd.get_size(); CHECK_EQ(num_bytes % sizeof(T), 0); output_tf_shape.AddDim(num_bytes / sizeof(T)); - AllocateOutputSetMklShape(context, kIdxOutput, - output_tensor, - output_tf_shape, output_mkl_shape); - } + AllocateOutputSetMklShape(context, kIdxOutput, output_tensor, + output_tf_shape, output_mkl_shape); + } - // Fallback implementation - Taken from lrn_op.cc - // TODO(inteltf) Check if we can use EigenLRNOp directly instead of making a - // copy. - void MklDefaultToEigen(OpKernelContext* context, - const Tensor& input) { - const int batch = static_cast(input.dim_size(0)); - const int rows = static_cast(input.dim_size(1)); - const int cols = static_cast(input.dim_size(2)); - const int depth = static_cast(input.dim_size(3)); - const int nodes = cols * rows; + // Fallback implementation - Taken from lrn_op.cc + // TODO(inteltf) Check if we can use EigenLRNOp directly instead of making a + // copy. + void MklDefaultToEigen(OpKernelContext* context, const Tensor& input) { + const int batch = static_cast(input.dim_size(0)); + const int rows = static_cast(input.dim_size(1)); + const int cols = static_cast(input.dim_size(2)); + const int depth = static_cast(input.dim_size(3)); + const int nodes = cols * rows; - auto in_shaped = input.shaped({nodes * batch, depth}); - // Multiplying the input with the band matrix has the effect of reducing - // the - // correct patch along the depth. - Eigen::Tensor multiplier(depth, depth); - GetBandMatrix(depth, depth_radius_, &multiplier); + auto in_shaped = input.shaped({nodes * batch, depth}); + // Multiplying the input with the band matrix has the effect of reducing + // the + // correct patch along the depth. 
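// Aside (illustrative only): the Eigen fallback that follows applies
// TensorFlow's LRN definition directly. Contracting the squared input with a
// band matrix of half-width depth_radius sums in^2 over the depth window, so
// each output element is in / (bias + alpha * sum_window(in^2))^beta; the
// beta == 1 and beta == 0.5 branches below are just cheaper special cases of
// the same formula. A scalar sketch of that computation, with hypothetical
// names (not the kernel's actual Eigen code):
#include <algorithm>
#include <cmath>
#include <cstdint>

// rows = batch * spatial positions, depth = channels; in/out are row-major
// [rows, depth] buffers.
void LrnReference(const float* in, float* out, int64_t rows, int64_t depth,
                  int64_t depth_radius, float bias, float alpha, float beta) {
  for (int64_t i = 0; i < rows; ++i) {
    for (int64_t j = 0; j < depth; ++j) {
      const int64_t d0 = std::max<int64_t>(0, j - depth_radius);
      const int64_t d1 = std::min<int64_t>(depth, j + depth_radius + 1);
      float sum_sq = 0.f;  // sum of squares over the depth window
      for (int64_t k = d0; k < d1; ++k) {
        sum_sq += in[i * depth + k] * in[i * depth + k];
      }
      out[i * depth + j] =
          in[i * depth + j] * std::pow(bias + alpha * sum_sq, -beta);
    }
  }
}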
+ Eigen::Tensor multiplier(depth, depth); + GetBandMatrix(depth, depth_radius_, &multiplier); - Tensor *output_dnn_data = nullptr; - MklDnnShape mkl_output_mkl_shape; - mkl_output_mkl_shape.SetMklTensor(false); - mkl_output_mkl_shape.SetDimensions(4); - AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data, - input.shape(), mkl_output_mkl_shape); - CHECK_NOTNULL(output_dnn_data); + Tensor* output_dnn_data = nullptr; + MklDnnShape mkl_output_mkl_shape; + mkl_output_mkl_shape.SetMklTensor(false); + mkl_output_mkl_shape.SetDimensions(4); + AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data, + input.shape(), mkl_output_mkl_shape); + CHECK_NOTNULL(output_dnn_data); - Tensor* workspace_tensor = nullptr; - MklDnnShape workspace_mkl_shape; - workspace_mkl_shape.SetMklTensor(false); - TensorShape workspace_tf_shape; - workspace_tf_shape.AddDim(0); - AllocateOutputSetMklShape(context, kIdxWorkspace, - &workspace_tensor, + Tensor* workspace_tensor = nullptr; + MklDnnShape workspace_mkl_shape; + workspace_mkl_shape.SetMklTensor(false); + TensorShape workspace_tf_shape; + workspace_tf_shape.AddDim(0); + AllocateOutputSetMklShape(context, kIdxWorkspace, &workspace_tensor, workspace_tf_shape, workspace_mkl_shape); - CHECK_NOTNULL(workspace_tensor); + CHECK_NOTNULL(workspace_tensor); - auto out_shaped = output_dnn_data->shaped({nodes * batch, depth}); - Eigen::array dims = {{DimPair(1, 0)}}; - auto tmp = in_shaped.square().contract(multiplier, dims) * alpha_ + bias_; - if (beta_ == T(1)) { - out_shaped.device(context->eigen_cpu_device()) = - in_shaped * tmp.inverse(); - } else if (beta_ == T(0.5)) { - out_shaped.device(context->eigen_cpu_device()) = - in_shaped * tmp.rsqrt(); - } else { - out_shaped.device(context->eigen_cpu_device()) = - in_shaped * (tmp.log() * -beta_).exp(); - } + auto out_shaped = output_dnn_data->shaped({nodes * batch, depth}); + Eigen::array dims = {{DimPair(1, 0)}}; + auto tmp = in_shaped.square().contract(multiplier, dims) * alpha_ + bias_; + if (beta_ == T(1)) { + out_shaped.device(context->eigen_cpu_device()) = + in_shaped * tmp.inverse(); + } else if (beta_ == T(0.5)) { + out_shaped.device(context->eigen_cpu_device()) = in_shaped * tmp.rsqrt(); + } else { + out_shaped.device(context->eigen_cpu_device()) = + in_shaped * (tmp.log() * -beta_).exp(); } + } - void AllocateWorkspaceTensor(OpKernelContext* context, - const lrn_forward::primitive_desc& lrn_fwd_prim_desc, - MklDnnData* dnn_data_wksp) { - CHECK_NOTNULL(dnn_data_wksp); - Tensor* workspace_tensor = nullptr; - memory::primitive_desc workspace_pd - = lrn_fwd_prim_desc.workspace_primitive_desc(); - size_t workspace_bytes = workspace_pd.get_size(); - MklDnnShape workspace_mkl_shape; - // the workspace tensor is a uint8 tensor that has - // exactly the number of bytes necessary - workspace_mkl_shape.SetMklTensor(false); - TensorShape workspace_tf_shape; - workspace_tf_shape.AddDim(workspace_bytes); - AllocateOutputSetMklShape(context, kIdxWorkspace, - &workspace_tensor, + void AllocateWorkspaceTensor( + OpKernelContext* context, + const lrn_forward::primitive_desc& lrn_fwd_prim_desc, + MklDnnData* dnn_data_wksp) { + CHECK_NOTNULL(dnn_data_wksp); + Tensor* workspace_tensor = nullptr; + memory::primitive_desc workspace_pd = + lrn_fwd_prim_desc.workspace_primitive_desc(); + size_t workspace_bytes = workspace_pd.get_size(); + MklDnnShape workspace_mkl_shape; + // the workspace tensor is a uint8 tensor that has + // exactly the number of bytes necessary + workspace_mkl_shape.SetMklTensor(false); + TensorShape 
workspace_tf_shape; + workspace_tf_shape.AddDim(workspace_bytes); + AllocateOutputSetMklShape(context, kIdxWorkspace, &workspace_tensor, workspace_tf_shape, workspace_mkl_shape); - CHECK_NOTNULL(workspace_tensor); - dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor); - } + CHECK_NOTNULL(workspace_tensor); + dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor); + } void SanityCheckInputs(OpKernelContext* context) { const Tensor& src_tensor = MklGetInput(context, kIdxInput); MklDnnShape src_dnn_shape; GetMklShape(context, kIdxInput, &src_dnn_shape); if (src_dnn_shape.IsMklTensor()) { - OP_REQUIRES(context, src_dnn_shape.GetDimension() == 4, - errors::InvalidArgument("input must be 4-dimensional")); - OP_REQUIRES(context, FastBoundsCheck(src_tensor.NumElements(), - std::numeric_limits::max()), - errors::InvalidArgument("argument to LRN too large")); + OP_REQUIRES(context, src_dnn_shape.GetDimension() == 4, + errors::InvalidArgument("input must be 4-dimensional")); + OP_REQUIRES(context, + FastBoundsCheck(src_tensor.NumElements(), + std::numeric_limits::max()), + errors::InvalidArgument("argument to LRN too large")); } else { - OP_REQUIRES(context, src_tensor.dims() == 4, - errors::InvalidArgument("input must be 4-dimensional")); - OP_REQUIRES(context, FastBoundsCheck(src_tensor.NumElements(), - std::numeric_limits::max()), - errors::InvalidArgument("argument to LRN too large")); + OP_REQUIRES(context, src_tensor.dims() == 4, + errors::InvalidArgument("input must be 4-dimensional")); + OP_REQUIRES(context, + FastBoundsCheck(src_tensor.NumElements(), + std::numeric_limits::max()), + errors::InvalidArgument("argument to LRN too large")); } } - const int kIdxInput = 0, - kIdxOutput = 0, - kIdxWorkspace = 1; + const int kIdxInput = 0, kIdxOutput = 0, kIdxWorkspace = 1; typedef typename Eigen::Tensor::DimensionPair DimPair; bool workspace_enabled_; @@ -994,17 +986,17 @@ class MklLRNOp : public OpKernel { float beta_; }; - template class MklLRNGradOp : public OpKernel { public: explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) { int64 depth_radius64; OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64)); - OP_REQUIRES(context, FastBoundsCheck(depth_radius64, - std::numeric_limits::max()), - errors::InvalidArgument("depth_radius = ", depth_radius64, - " larger than int max")); + OP_REQUIRES( + context, + FastBoundsCheck(depth_radius64, std::numeric_limits::max()), + errors::InvalidArgument("depth_radius = ", depth_radius64, + " larger than int max")); depth_radius_ = static_cast(depth_radius64); OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_)); OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_)); @@ -1025,7 +1017,7 @@ class MklLRNGradOp : public OpKernel { MklDnnData output_dnn_data(&cpu_engine); MklDnnShape input_grad_dnn_shape, orig_input_dnn_shape, - orig_output_dnn_shape; + orig_output_dnn_shape; GetMklShape(context, kIdxGradient, &input_grad_dnn_shape); GetMklShape(context, kIdxOrigInput, &orig_input_dnn_shape); GetMklShape(context, kIdxOrigOutput, &orig_output_dnn_shape); @@ -1037,16 +1029,16 @@ class MklLRNGradOp : public OpKernel { orig_input_dnn_shape.IsMklTensor() && orig_output_dnn_shape.IsMklTensor() && input_grad_dnn_shape.IsMklChannelDim( - input_grad_dnn_shape.GetDimension() - 1) && + input_grad_dnn_shape.GetDimension() - 1) && orig_input_dnn_shape.IsMklChannelDim( - orig_input_dnn_shape.GetDimension() - 1) && + orig_input_dnn_shape.GetDimension() - 1) && orig_output_dnn_shape.IsMklChannelDim( - 
orig_output_dnn_shape.GetDimension() - 1); + orig_output_dnn_shape.GetDimension() - 1); if (!can_use_mkldnn) { - // Fallback to eigen - MklDefaultToEigen(context); - return; + // Fallback to eigen + MklDefaultToEigen(context); + return; } // At this point, we have the all clear to use MklDnn constructs // Naming: diff_dst is input_gradient_tensor; src is orig_input_tensor. @@ -1059,13 +1051,11 @@ class MklLRNGradOp : public OpKernel { // NHWC format. memory::desc original_output_md = orig_output_dnn_shape.GetCurLayout(); memory::desc target_diff_dst_md = ConfigureInputGradient( - input_grad_tensor, - input_grad_dnn_shape, - &input_grad_dnn_data); + input_grad_tensor, input_grad_dnn_shape, &input_grad_dnn_data); memory::desc orig_input_md = orig_input_dnn_shape.GetCurLayout(); memory::dims orig_input_dims = - orig_input_dnn_shape.GetSizesAsMklDnnDims(); + orig_input_dnn_shape.GetSizesAsMklDnnDims(); orig_input_dnn_data.SetUsrMem(orig_input_md, &orig_input_tensor); orig_input_dnn_data.SetOpMemDesc(orig_input_dims, memory::format::nhwc); @@ -1079,27 +1069,21 @@ class MklLRNGradOp : public OpKernel { // Create LRN backward primitive descriptor. It requires LRN forward // primitive descriptor also. - auto lrn_fwd_desc = lrn_forward::desc(prop_kind::forward, - lrn_across_channels, - orig_input_md, - kernel_size, - new_alpha, beta_, bias_); - auto lrn_fwd_prim_desc = lrn_forward::primitive_desc(lrn_fwd_desc, - cpu_engine); - auto lrn_bwd_desc = lrn_backward::desc(lrn_across_channels, - original_output_md, - target_diff_dst_md, - kernel_size, - new_alpha, beta_, bias_); - auto lrn_bwd_prim_desc = lrn_backward::primitive_desc(lrn_bwd_desc, - cpu_engine, - lrn_fwd_prim_desc); + auto lrn_fwd_desc = lrn_forward::desc( + prop_kind::forward, lrn_across_channels, orig_input_md, kernel_size, + new_alpha, beta_, bias_); + auto lrn_fwd_prim_desc = + lrn_forward::primitive_desc(lrn_fwd_desc, cpu_engine); + auto lrn_bwd_desc = lrn_backward::desc( + lrn_across_channels, original_output_md, target_diff_dst_md, + kernel_size, new_alpha, beta_, bias_); + auto lrn_bwd_prim_desc = lrn_backward::primitive_desc( + lrn_bwd_desc, cpu_engine, lrn_fwd_prim_desc); Tensor* output_tensor = nullptr; - memory::format orig_input_format - = orig_input_dnn_shape.GetTfDataFormat(); - AllocateOutputTensor(context, lrn_bwd_prim_desc, - orig_input_dims, orig_input_format, &output_tensor); + memory::format orig_input_format = orig_input_dnn_shape.GetTfDataFormat(); + AllocateOutputTensor(context, lrn_bwd_prim_desc, orig_input_dims, + orig_input_format, &output_tensor); OP_REQUIRES_OK(context, context->status()); CHECK_NOTNULL(output_tensor); output_dnn_data.SetUsrMemDataHandle(output_tensor); @@ -1110,35 +1094,32 @@ class MklLRNGradOp : public OpKernel { const Tensor& workspace_tensor = MklGetInput(context, kIdxWorkspace); MklDnnData workspace_dnn_data(&cpu_engine); ConfigureWorkspace(workspace_tensor, - lrn_fwd_prim_desc.workspace_primitive_desc(), - &workspace_dnn_data); + lrn_fwd_prim_desc.workspace_primitive_desc(), + &workspace_dnn_data); - PrepareAndExecuteNet(lrn_bwd_prim_desc, - lrn_fwd_prim_desc, - &orig_input_dnn_data, - &input_grad_dnn_data, - &output_dnn_data, - memory::primitive_desc(target_diff_dst_md, cpu_engine), - &workspace_dnn_data); - } catch (mkldnn::error &e) { + PrepareAndExecuteNet( + lrn_bwd_prim_desc, lrn_fwd_prim_desc, &orig_input_dnn_data, + &input_grad_dnn_data, &output_dnn_data, + memory::primitive_desc(target_diff_dst_md, cpu_engine), + &workspace_dnn_data); + } catch (mkldnn::error& e) { string 
error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } - void AllocateOutputTensor(OpKernelContext* context, - const lrn_backward::primitive_desc& lrn_bkwd_prim_desc, - const memory::dims output_dims_mkl_order, - const memory::format& output_tf_format, - Tensor** output_tensor) { + void AllocateOutputTensor( + OpKernelContext* context, + const lrn_backward::primitive_desc& lrn_bkwd_prim_desc, + const memory::dims output_dims_mkl_order, + const memory::format& output_tf_format, Tensor** output_tensor) { CHECK_NOTNULL(output_tensor); - memory::primitive_desc dst_pd - = lrn_bkwd_prim_desc.diff_src_primitive_desc(); + memory::primitive_desc dst_pd = + lrn_bkwd_prim_desc.diff_src_primitive_desc(); MklDnnShape output_mkl_shape; // We assume that all outputs at this point are MKL Tensors @@ -1146,170 +1127,153 @@ class MklLRNGradOp : public OpKernel { output_mkl_shape.SetMklLayout(&dst_pd); output_mkl_shape.SetElemType(MklDnnType()); output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), - output_dims_mkl_order, - output_tf_format); + output_dims_mkl_order, output_tf_format); TensorShape output_tf_shape; size_t num_bytes = dst_pd.get_size(); CHECK_EQ(num_bytes % sizeof(T), 0); output_tf_shape.AddDim(num_bytes / sizeof(T)); - AllocateOutputSetMklShape(context, kIdxOutput, - output_tensor, - output_tf_shape, output_mkl_shape); + AllocateOutputSetMklShape(context, kIdxOutput, output_tensor, + output_tf_shape, output_mkl_shape); } memory::desc ConfigureInputGradient(const Tensor& input_grad_tensor, - const MklDnnShape& input_grad_dnn_shape, - MklDnnData *input_grad_dnn_data) { + const MklDnnShape& input_grad_dnn_shape, + MklDnnData* input_grad_dnn_data) { CHECK_NOTNULL(input_grad_dnn_data); // This shouldn't be necessary at this point, but just in case CHECK_EQ(input_grad_dnn_shape.IsMklTensor(), true); memory::desc input_grad_md = input_grad_dnn_shape.GetCurLayout(); - memory::dims orig_input_dims = - input_grad_dnn_shape.GetSizesAsMklDnnDims(); + memory::dims orig_input_dims = input_grad_dnn_shape.GetSizesAsMklDnnDims(); input_grad_dnn_data->SetUsrMem(input_grad_md, &input_grad_tensor); input_grad_dnn_data->SetOpMemDesc(orig_input_dims, memory::format::nhwc); return input_grad_md; } void PrepareAndExecuteNet( - const lrn_backward::primitive_desc& lrn_bkwd_desc, - const lrn_forward::primitive_desc& lrn_fwd_desc, - MklDnnData* src_dnn_data, - MklDnnData* input_gradient_diff_dst, - MklDnnData* output_diff_src, - const memory::primitive_desc& target_diff_dst_pd, - const MklDnnData* workspace_dnn_data = nullptr) { + const lrn_backward::primitive_desc& lrn_bkwd_desc, + const lrn_forward::primitive_desc& lrn_fwd_desc, + MklDnnData* src_dnn_data, MklDnnData* input_gradient_diff_dst, + MklDnnData* output_diff_src, + const memory::primitive_desc& target_diff_dst_pd, + const MklDnnData* workspace_dnn_data = nullptr) { std::vector net; // Check for input reordering on the diff dst input input_gradient_diff_dst->CheckReorderToOpMem( - lrn_bkwd_desc.diff_dst_primitive_desc(), &net); + lrn_bkwd_desc.diff_dst_primitive_desc(), &net); // Check for input reordering on the original input - 
src_dnn_data->CheckReorderToOpMem(lrn_fwd_desc.src_primitive_desc(), - &net); + src_dnn_data->CheckReorderToOpMem(lrn_fwd_desc.src_primitive_desc(), &net); // Create pooling primitive and add it to net if (nullptr == workspace_dnn_data) { - net.push_back(lrn_backward(lrn_bkwd_desc, - src_dnn_data->GetOpMem(), - input_gradient_diff_dst->GetOpMem(), - output_diff_src->GetOpMem())); + net.push_back(lrn_backward(lrn_bkwd_desc, src_dnn_data->GetOpMem(), + input_gradient_diff_dst->GetOpMem(), + output_diff_src->GetOpMem())); } else { - net.push_back(lrn_backward(lrn_bkwd_desc, - src_dnn_data->GetOpMem(), - input_gradient_diff_dst->GetOpMem(), - workspace_dnn_data->GetOpMem(), - output_diff_src->GetOpMem())); + net.push_back(lrn_backward(lrn_bkwd_desc, src_dnn_data->GetOpMem(), + input_gradient_diff_dst->GetOpMem(), + workspace_dnn_data->GetOpMem(), + output_diff_src->GetOpMem())); } stream(stream::kind::eager).submit(net).wait(); } void ConfigureWorkspace(const Tensor& workspace_tensor, - memory::primitive_desc workspace_pd, - MklDnnData *workspace_dnn_data) { + memory::primitive_desc workspace_pd, + MklDnnData* workspace_dnn_data) { CHECK_NOTNULL(workspace_dnn_data); workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor); } - // Fallback implementation - Taken from lrn_op.cc - // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a - // copy. - void MklDefaultToEigen(OpKernelContext* context) { - Tensor input_gradient_tensor; - Tensor orig_input_tensor; - Tensor orig_output_tensor; + // Fallback implementation - Taken from lrn_op.cc + // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a + // copy. + void MklDefaultToEigen(OpKernelContext* context) { + Tensor input_gradient_tensor; + Tensor orig_input_tensor; + Tensor orig_output_tensor; - MklDnnShape input_grad_dnn_shape, orig_input_dnn_shape, - orig_output_dnn_shape; - GetMklShape(context, kIdxGradient, &input_grad_dnn_shape); - GetMklShape(context, kIdxOrigInput, &orig_input_dnn_shape); - GetMklShape(context, kIdxOrigOutput, &orig_output_dnn_shape); + MklDnnShape input_grad_dnn_shape, orig_input_dnn_shape, + orig_output_dnn_shape; + GetMklShape(context, kIdxGradient, &input_grad_dnn_shape); + GetMklShape(context, kIdxOrigInput, &orig_input_dnn_shape); + GetMklShape(context, kIdxOrigOutput, &orig_output_dnn_shape); - if (input_grad_dnn_shape.IsMklTensor()) { - input_gradient_tensor = - ConvertMklToTF(context, - MklGetInput(context, kIdxGradient), - input_grad_dnn_shape); - } else { - input_gradient_tensor = MklGetInput(context, kIdxGradient); - } + if (input_grad_dnn_shape.IsMklTensor()) { + input_gradient_tensor = ConvertMklToTF( + context, MklGetInput(context, kIdxGradient), input_grad_dnn_shape); + } else { + input_gradient_tensor = MklGetInput(context, kIdxGradient); + } - if (orig_input_dnn_shape.IsMklTensor()) { - orig_input_tensor = - ConvertMklToTF(context, - MklGetInput(context, kIdxOrigInput), - orig_input_dnn_shape); - } else { - orig_input_tensor = MklGetInput(context, kIdxOrigInput); - } + if (orig_input_dnn_shape.IsMklTensor()) { + orig_input_tensor = ConvertMklToTF( + context, MklGetInput(context, kIdxOrigInput), orig_input_dnn_shape); + } else { + orig_input_tensor = MklGetInput(context, kIdxOrigInput); + } - if (orig_output_dnn_shape.IsMklTensor()) { - orig_output_tensor = - ConvertMklToTF(context, - MklGetInput(context, kIdxOrigOutput), - orig_output_dnn_shape); - } else { - orig_output_tensor = MklGetInput(context, kIdxOrigOutput); - } + if 
(orig_output_dnn_shape.IsMklTensor()) { + orig_output_tensor = ConvertMklToTF( + context, MklGetInput(context, kIdxOrigOutput), orig_output_dnn_shape); + } else { + orig_output_tensor = MklGetInput(context, kIdxOrigOutput); + } - const int64 batch = static_cast(input_gradient_tensor.dim_size(0)); - const int64 rows = static_cast(input_gradient_tensor.dim_size(1)); - const int64 cols = static_cast(input_gradient_tensor.dim_size(2)); - const int64 depth = static_cast(input_gradient_tensor.dim_size(3)); - const auto nodes = cols * rows; + const int64 batch = static_cast(input_gradient_tensor.dim_size(0)); + const int64 rows = static_cast(input_gradient_tensor.dim_size(1)); + const int64 cols = static_cast(input_gradient_tensor.dim_size(2)); + const int64 depth = static_cast(input_gradient_tensor.dim_size(3)); + const auto nodes = cols * rows; - auto grads_shaped = - input_gradient_tensor.shaped({nodes * batch, depth}); + auto grads_shaped = + input_gradient_tensor.shaped({nodes * batch, depth}); - auto in_shaped = orig_input_tensor.shaped({nodes * batch, depth}); - auto activations = - orig_output_tensor.shaped({nodes * batch, depth}); + auto in_shaped = orig_input_tensor.shaped({nodes * batch, depth}); + auto activations = orig_output_tensor.shaped({nodes * batch, depth}); - Tensor* output_dnn_data; - MklShape mkl_output_mkl_shape; - mkl_output_mkl_shape.SetMklTensor(false); - mkl_output_mkl_shape.SetDimensions(4); - AllocateOutputSetMklShape(context, kIdxOutput, - &output_dnn_data, - input_gradient_tensor.shape(), - mkl_output_mkl_shape); + Tensor* output_dnn_data; + MklShape mkl_output_mkl_shape; + mkl_output_mkl_shape.SetMklTensor(false); + mkl_output_mkl_shape.SetDimensions(4); + AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data, + input_gradient_tensor.shape(), + mkl_output_mkl_shape); - auto out_shaped = output_dnn_data->shaped({nodes * batch, depth}); - out_shaped.setZero(); - auto shard = [this, activations, in_shaped, grads_shaped, out_shaped, - depth](int64 begin, int64 end) { - for (int64 i = begin; i < end; ++i) { - for (int64 j = 0; j < depth; ++j) { - int64 depth_begin = std::max(0, j - depth_radius_); - int64 depth_end = std::min(depth, j + depth_radius_ + 1); + auto out_shaped = output_dnn_data->shaped({nodes * batch, depth}); + out_shaped.setZero(); + auto shard = [this, activations, in_shaped, grads_shaped, out_shaped, + depth](int64 begin, int64 end) { + for (int64 i = begin; i < end; ++i) { + for (int64 j = 0; j < depth; ++j) { + int64 depth_begin = std::max(0, j - depth_radius_); + int64 depth_end = std::min(depth, j + depth_radius_ + 1); - T norm(0); - for (int64 k = depth_begin; k < depth_end; ++k) { - norm += in_shaped(i, k) * in_shaped(i, k); - } - norm = alpha_ * norm + bias_; - DCHECK_GT(norm, T(1e-6)); - for (int64 k = depth_begin; k < depth_end; ++k) { - T dyi = T(-2) * alpha_ * beta_ * in_shaped(i, k) * - activations(i, j) / norm; - if (k == j) { - dyi += Eigen::numext::pow(norm, -beta_); - } - dyi *= grads_shaped(i, j); - const_cast::Tensor&>(out_shaped)(i, k) += - dyi; + T norm(0); + for (int64 k = depth_begin; k < depth_end; ++k) { + norm += in_shaped(i, k) * in_shaped(i, k); + } + norm = alpha_ * norm + bias_; + DCHECK_GT(norm, T(1e-6)); + for (int64 k = depth_begin; k < depth_end; ++k) { + T dyi = T(-2) * alpha_ * beta_ * in_shaped(i, k) * + activations(i, j) / norm; + if (k == j) { + dyi += Eigen::numext::pow(norm, -beta_); } + dyi *= grads_shaped(i, j); + const_cast::Tensor&>(out_shaped)(i, k) += dyi; } } - }; - auto worker_threads = - 
*(context->device()->tensorflow_cpu_worker_threads()); - Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch, - depth * depth, shard); - } + } + }; + auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch, + depth * depth, shard); + } void SanityCheckInputs(OpKernelContext* context) { const Tensor& input_gradient_tensor = MklGetInput(context, kIdxGradient); @@ -1317,59 +1281,59 @@ class MklLRNGradOp : public OpKernel { const Tensor& orig_output_tensor = MklGetInput(context, kIdxOrigOutput); const Tensor& workspace_tensor = MklGetInput(context, kIdxWorkspace); MklDnnShape in_grads_dnn_shape, in_image_dnn_shape, out_image_dnn_shape, - workspace_dnn_shape; + workspace_dnn_shape; GetMklShape(context, kIdxGradient, &in_grads_dnn_shape); GetMklShape(context, kIdxOrigInput, &in_image_dnn_shape); GetMklShape(context, kIdxOrigOutput, &out_image_dnn_shape); GetMklShape(context, kIdxWorkspace, &workspace_dnn_shape); if (in_grads_dnn_shape.IsMklTensor()) { OP_REQUIRES(context, in_grads_dnn_shape.GetDimension() == 4, - errors::InvalidArgument("Input gradient must be " - "4-dimensional")); + errors::InvalidArgument("Input gradient must be " + "4-dimensional")); } else { - OP_REQUIRES(context, input_gradient_tensor.dims() == 4, - errors::InvalidArgument("input gradient must be 4-dimensional")); + OP_REQUIRES( + context, input_gradient_tensor.dims() == 4, + errors::InvalidArgument("input gradient must be 4-dimensional")); } if (in_image_dnn_shape.IsMklTensor()) { OP_REQUIRES(context, in_image_dnn_shape.GetDimension() == 4, - errors::InvalidArgument("input images must be " - "4-dimensional")); + errors::InvalidArgument("input images must be " + "4-dimensional")); } else { OP_REQUIRES(context, orig_input_tensor.dims() == 4, errors::InvalidArgument("input images must be " - "4-dimensional")); + "4-dimensional")); } if (out_image_dnn_shape.IsMklTensor()) { OP_REQUIRES(context, out_image_dnn_shape.GetDimension() == 4, - errors::InvalidArgument("Output image must be " - "4-dimensional")); + errors::InvalidArgument("Output image must be " + "4-dimensional")); } else { - OP_REQUIRES(context, orig_output_tensor.dims() == 4, - errors::InvalidArgument("Output image must be 4-dimensional")); + OP_REQUIRES( + context, orig_output_tensor.dims() == 4, + errors::InvalidArgument("Output image must be 4-dimensional")); } if (workspace_enabled_) { if (workspace_dnn_shape.IsMklTensor()) { - OP_REQUIRES(context, workspace_dnn_shape.IsMklTensor() == false, - errors::InvalidArgument("Workspace should not be MKL Tensor.")); + OP_REQUIRES( + context, workspace_dnn_shape.IsMklTensor() == false, + errors::InvalidArgument("Workspace should not be MKL Tensor.")); } else { OP_REQUIRES(context, workspace_tensor.dims() == 1, - errors::InvalidArgument("Workspace must be 1-dimensional")); + errors::InvalidArgument("Workspace must be 1-dimensional")); } } } -// Input("input_grads: T") -// Input("input_image: T") -// Input("output_image: T") -// Input("workspace: uint8") - const int kIdxGradient = 0, - kIdxOrigInput = 1, - kIdxOrigOutput = 2, - kIdxWorkspace = 3, - kIdxOutput = 0; + // Input("input_grads: T") + // Input("input_image: T") + // Input("output_image: T") + // Input("workspace: uint8") + const int kIdxGradient = 0, kIdxOrigInput = 1, kIdxOrigOutput = 2, + kIdxWorkspace = 3, kIdxOutput = 0; typedef typename Eigen::Tensor::DimensionPair DimPair; bool workspace_enabled_; @@ -1393,7 +1357,6 @@ class MklLRNGradOp : 
public OpKernel { .Label(mkl_op_registry::kMklOpLabel), \ MklLRNGradOp); - TF_CALL_float(REGISTER_MKL_LRN_CPU); } // namespace tensorflow diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc index 82c5229bab0..0de27ccd60f 100644 --- a/tensorflow/core/kernels/mkl_maxpooling_op.cc +++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc @@ -25,14 +25,14 @@ limitations under the License. #ifdef INTEL_MKL_DNN #include #include "mkldnn.hpp" -using mkldnn::memory; -using mkldnn::error; -using mkldnn::pooling_forward; -using mkldnn::pooling_backward; -using mkldnn::padding_kind; -using mkldnn::engine; -using mkldnn::prop_kind; using mkldnn::algorithm; +using mkldnn::engine; +using mkldnn::error; +using mkldnn::memory; +using mkldnn::padding_kind; +using mkldnn::pooling_backward; +using mkldnn::pooling_forward; +using mkldnn::prop_kind; #endif namespace tensorflow { @@ -397,18 +397,19 @@ class MklMaxPoolingGradOp : public OpKernel { if (workspace_enabled == false) { if (convert_input != nullptr) { if (input_in_mkl_format == false) { - CHECK_EQ( - dnnConversionExecute_F32( - convert_input, const_cast(static_cast( - tensor_in.flat().data())), - input_buf), - E_SUCCESS); + CHECK_EQ(dnnConversionExecute_F32( + convert_input, + const_cast(static_cast( + tensor_in.flat().data())), + input_buf), + E_SUCCESS); CHECK_EQ(dnnDelete_F32(convert_input), E_SUCCESS); convert_input = nullptr; } else { input_shape.GetConvertedFlatData( - lt_input_prim, const_cast(static_cast( - tensor_in.flat().data())), + lt_input_prim, + const_cast( + static_cast(tensor_in.flat().data())), input_buf); } pooling_resfwd[dnnResourceSrc] = input_buf; @@ -453,8 +454,9 @@ class MklMaxPoolingGradOp : public OpKernel { CHECK_EQ(dnnDelete_F32(convert_outbackprop), E_SUCCESS); } else { output_backprop_shape.GetConvertedFlatData( - lt_outbackprop_prim, const_cast(static_cast( - out_backprop.flat().data())), + lt_outbackprop_prim, + const_cast( + static_cast(out_backprop.flat().data())), outbackprop_buf); } pooling_res[dnnResourceDiffDst] = outbackprop_buf; @@ -499,7 +501,7 @@ template class MklMaxPoolingOp : public MklPoolingForwardOpBase { public: explicit MklMaxPoolingOp(OpKernelConstruction* context) - : MklPoolingForwardOpBase(context) { + : MklPoolingForwardOpBase(context) { // In Max Pooling, MKLDNN does not allow passing workspace as NULL. // So we set workspace_enabled_ to true. 
this->workspace_enabled_ = true; @@ -508,8 +510,8 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { void Compute(OpKernelContext* context) override { try { auto cpu_engine = engine(engine::cpu, 0); - const Tensor& input_tensor = MklGetInput(context, - this->kInputTensorIndexInput); + const Tensor& input_tensor = + MklGetInput(context, this->kInputTensorIndexInput); MklDnnShape dnn_shape_input; GetMklShape(context, this->kInputTensorIndexInput, &dnn_shape_input); this->SanityCheckInput(context, input_tensor, dnn_shape_input); @@ -522,9 +524,8 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { // initialize variables for the pooling op MklPoolParameters pool_params; // Get the input tensor and initialize the pooling parameters - this->ConfigureInput(context, dnn_shape_input, - input_tensor, &pool_params, - &dnn_data_input); + this->ConfigureInput(context, dnn_shape_input, input_tensor, &pool_params, + &dnn_data_input); OP_REQUIRES_OK(context, context->status()); // Declare output tensor @@ -535,9 +536,10 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { // If input is in Mkl layout, then just get the memory format from it // directly, instead of using input data_format to MaxPool. if (dnn_shape_input.IsMklTensor()) { - dnn_data_output.SetUsrMem(output_dims_mkl_order, - static_cast( - dnn_data_input.GetUsrMemDesc().data.format)); + dnn_data_output.SetUsrMem( + output_dims_mkl_order, + static_cast( + dnn_data_input.GetUsrMemDesc().data.format)); } else { dnn_data_output.SetUsrMem(output_dims_mkl_order, this->data_format_mkldnn_); @@ -546,24 +548,21 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { // describe the memory layout; let mkl-dnn choose the best for the op dnn_data_output.SetOpMemDesc(output_dims_mkl_order, memory::format::any); - auto pool_desc = pooling_forward::desc(prop_kind::forward, - algorithm::pooling_max, - dnn_data_input.GetUsrMemDesc(), - dnn_data_output.GetUsrMemDesc(), - memory::dims({ pool_params.row_stride, - pool_params.col_stride}), - memory::dims({ pool_params.window_rows, - pool_params.window_cols}), - memory::dims({ static_cast(pool_params.pad_top), - static_cast(pool_params.pad_left)}), - memory::dims({ static_cast(pool_params.pad_bottom), - static_cast(pool_params.pad_right)}), - TFPaddingToMklDnnPadding(this->padding_)); - auto pool_fwd_desc = pooling_forward::primitive_desc(pool_desc, - cpu_engine); + auto pool_desc = pooling_forward::desc( + prop_kind::forward, algorithm::pooling_max, + dnn_data_input.GetUsrMemDesc(), dnn_data_output.GetUsrMemDesc(), + memory::dims({pool_params.row_stride, pool_params.col_stride}), + memory::dims({pool_params.window_rows, pool_params.window_cols}), + memory::dims({static_cast(pool_params.pad_top), + static_cast(pool_params.pad_left)}), + memory::dims({static_cast(pool_params.pad_bottom), + static_cast(pool_params.pad_right)}), + TFPaddingToMklDnnPadding(this->padding_)); + auto pool_fwd_desc = + pooling_forward::primitive_desc(pool_desc, cpu_engine); this->AllocateOutputTensor(context, pool_fwd_desc, output_dims_mkl_order, - this->data_format_mkldnn_, &output_tensor); + this->data_format_mkldnn_, &output_tensor); OP_REQUIRES_OK(context, context->status()); dnn_data_output.SetUsrMemDataHandle(output_tensor); @@ -571,39 +570,38 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { OP_REQUIRES_OK(context, context->status()); this->PrepareAndExecuteNet(pool_fwd_desc, &dnn_data_input, - &dnn_data_output, &dnn_data_wksp); - } catch (mkldnn::error &e) { - string error_msg = "Status: " + 
std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Compute received an exception:", - error_msg)); + &dnn_data_output, &dnn_data_wksp); + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:", + error_msg)); } } // Compute private: - const int kOutputTensorIndexWorkspace = 1; + const int kOutputTensorIndexWorkspace = 1; - void AllocateWorkspaceTensor(OpKernelContext* context, - const pooling_forward::primitive_desc& pool_fwd_prim_desc, - MklDnnData* dnn_data_wksp) { - CHECK_NOTNULL(dnn_data_wksp); - Tensor* workspace_tensor = nullptr; - memory::primitive_desc workspace_pd - = pool_fwd_prim_desc.workspace_primitive_desc(); - size_t workspace_bytes = workspace_pd.get_size(); - MklDnnShape workspace_mkl_shape; - workspace_mkl_shape.SetMklTensor(false); - TensorShape workspace_tf_shape; - workspace_tf_shape.AddDim(workspace_bytes); - AllocateOutputSetMklShape(context, kOutputTensorIndexWorkspace, - &workspace_tensor, - workspace_tf_shape, workspace_mkl_shape); - CHECK_NOTNULL(workspace_tensor); - dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor); - } + void AllocateWorkspaceTensor( + OpKernelContext* context, + const pooling_forward::primitive_desc& pool_fwd_prim_desc, + MklDnnData* dnn_data_wksp) { + CHECK_NOTNULL(dnn_data_wksp); + Tensor* workspace_tensor = nullptr; + memory::primitive_desc workspace_pd = + pool_fwd_prim_desc.workspace_primitive_desc(); + size_t workspace_bytes = workspace_pd.get_size(); + MklDnnShape workspace_mkl_shape; + workspace_mkl_shape.SetMklTensor(false); + TensorShape workspace_tf_shape; + workspace_tf_shape.AddDim(workspace_bytes); + AllocateOutputSetMklShape(context, kOutputTensorIndexWorkspace, + &workspace_tensor, workspace_tf_shape, + workspace_mkl_shape); + CHECK_NOTNULL(workspace_tensor); + dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor); + } }; // The operation to compute MaxPool gradients. 
@@ -616,218 +614,183 @@ template class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase { public: explicit MklMaxPoolingGradOp(OpKernelConstruction* context) - : MklPoolingBackwardOpBase(context) { - } + : MklPoolingBackwardOpBase(context) {} void Compute(OpKernelContext* context) override { try { - auto cpu_engine = engine(engine::cpu, 0); - const Tensor& orig_input_tensor = MklGetInput(context, - kInputTensorIndexOrigInput); - const Tensor& orig_output_tensor = MklGetInput(context, - kInputTensorIndexOrigOutput); - const Tensor& grad_tensor = MklGetInput(context, - kInputTensorIndexGradient); - const Tensor& workspace_tensor = MklGetInput(context, - kInputTensorIndexWorkspace); - MklDnnShape orig_input_mkl_shape, - orig_output_mkl_shape, - grad_mkl_shape, - workspace_mkl_shape; - GetMklShape(context, kInputTensorIndexOrigInput, - &orig_input_mkl_shape); - GetMklShape(context, kInputTensorIndexOrigOutput, - &orig_output_mkl_shape); - GetMklShape(context, kInputTensorIndexGradient, - &grad_mkl_shape); - GetMklShape(context, kInputTensorIndexWorkspace, - &workspace_mkl_shape); + auto cpu_engine = engine(engine::cpu, 0); + const Tensor& orig_input_tensor = + MklGetInput(context, kInputTensorIndexOrigInput); + const Tensor& orig_output_tensor = + MklGetInput(context, kInputTensorIndexOrigOutput); + const Tensor& grad_tensor = + MklGetInput(context, kInputTensorIndexGradient); + const Tensor& workspace_tensor = + MklGetInput(context, kInputTensorIndexWorkspace); + MklDnnShape orig_input_mkl_shape, orig_output_mkl_shape, grad_mkl_shape, + workspace_mkl_shape; + GetMklShape(context, kInputTensorIndexOrigInput, &orig_input_mkl_shape); + GetMklShape(context, kInputTensorIndexOrigOutput, &orig_output_mkl_shape); + GetMklShape(context, kInputTensorIndexGradient, &grad_mkl_shape); + GetMklShape(context, kInputTensorIndexWorkspace, &workspace_mkl_shape); - SanityCheckInputs(context, - orig_input_tensor, orig_output_tensor, - grad_tensor, workspace_tensor, - orig_input_mkl_shape, orig_output_mkl_shape, - grad_mkl_shape, workspace_mkl_shape); - if (!context->status().ok()) return; + SanityCheckInputs(context, orig_input_tensor, orig_output_tensor, + grad_tensor, workspace_tensor, orig_input_mkl_shape, + orig_output_mkl_shape, grad_mkl_shape, + workspace_mkl_shape); + if (!context->status().ok()) return; - MklDnnData grad_dnn_data(&cpu_engine); - MklDnnData workspace_dnn_data(&cpu_engine); - MklDnnData output_dnn_data(&cpu_engine); - Tensor* output_tensor = nullptr; - MklPoolParameters pool_params; - TensorShape orig_input_shape; - memory::dims output_dims_mkl_order, orig_input_dims_mkl_order; - memory::desc original_input_md = ConfigureOriginalInput(context, - orig_input_tensor, - orig_input_mkl_shape, - &orig_input_dims_mkl_order, - &pool_params, - &orig_input_shape); + MklDnnData grad_dnn_data(&cpu_engine); + MklDnnData workspace_dnn_data(&cpu_engine); + MklDnnData output_dnn_data(&cpu_engine); + Tensor* output_tensor = nullptr; + MklPoolParameters pool_params; + TensorShape orig_input_shape; + memory::dims output_dims_mkl_order, orig_input_dims_mkl_order; + memory::desc original_input_md = ConfigureOriginalInput( + context, orig_input_tensor, orig_input_mkl_shape, + &orig_input_dims_mkl_order, &pool_params, &orig_input_shape); - memory::desc original_output_md = this->ConfigureOriginalOutput( - pool_params, - orig_output_mkl_shape, - output_dims_mkl_order); + memory::desc original_output_md = this->ConfigureOriginalOutput( + pool_params, orig_output_mkl_shape, output_dims_mkl_order); - 
memory::desc target_diff_dst_md = this->ConfigureInputGradient( - grad_mkl_shape, - grad_tensor, - &grad_dnn_data, - original_output_md); + memory::desc target_diff_dst_md = this->ConfigureInputGradient( + grad_mkl_shape, grad_tensor, &grad_dnn_data, original_output_md); - output_dnn_data.SetUsrMem(original_input_md); + output_dnn_data.SetUsrMem(original_input_md); - // Create the forward pooling primitive descriptor so we can - // pass it as a hint to the backward pooling primitive descriptor - auto pool_fwd_desc = pooling_forward::desc(prop_kind::forward, - algorithm::pooling_max, - original_input_md, - original_output_md, - memory::dims({ pool_params.row_stride, - pool_params.col_stride}), - memory::dims({ pool_params.window_rows, - pool_params.window_cols}), - memory::dims({ static_cast(pool_params.pad_top), - static_cast(pool_params.pad_left)}), - memory::dims({ static_cast(pool_params.pad_bottom), - static_cast(pool_params.pad_right)}), - TFPaddingToMklDnnPadding(this->padding_)); - auto pool_fwd_prim_desc - = pooling_forward::primitive_desc(pool_fwd_desc, - cpu_engine); + // Create the forward pooling primitive descriptor so we can + // pass it as a hint to the backward pooling primitive descriptor + auto pool_fwd_desc = pooling_forward::desc( + prop_kind::forward, algorithm::pooling_max, original_input_md, + original_output_md, + memory::dims({pool_params.row_stride, pool_params.col_stride}), + memory::dims({pool_params.window_rows, pool_params.window_cols}), + memory::dims({static_cast(pool_params.pad_top), + static_cast(pool_params.pad_left)}), + memory::dims({static_cast(pool_params.pad_bottom), + static_cast(pool_params.pad_right)}), + TFPaddingToMklDnnPadding(this->padding_)); + auto pool_fwd_prim_desc = + pooling_forward::primitive_desc(pool_fwd_desc, cpu_engine); - auto pool_bkwd_desc = pooling_backward::desc( - algorithm::pooling_max, - output_dnn_data.GetUsrMemDesc(), - target_diff_dst_md, - memory::dims({ pool_params.row_stride, - pool_params.col_stride}), - memory::dims({ pool_params.window_rows, - pool_params.window_cols}), - memory::dims({ static_cast(pool_params.pad_top), - static_cast(pool_params.pad_left)}), - memory::dims({ static_cast(pool_params.pad_bottom), - static_cast(pool_params.pad_right)}), - TFPaddingToMklDnnPadding(this->padding_)); - auto pool_bkwd_prim_desc - = pooling_backward::primitive_desc(pool_bkwd_desc, - cpu_engine, - pool_fwd_prim_desc); + auto pool_bkwd_desc = pooling_backward::desc( + algorithm::pooling_max, output_dnn_data.GetUsrMemDesc(), + target_diff_dst_md, + memory::dims({pool_params.row_stride, pool_params.col_stride}), + memory::dims({pool_params.window_rows, pool_params.window_cols}), + memory::dims({static_cast(pool_params.pad_top), + static_cast(pool_params.pad_left)}), + memory::dims({static_cast(pool_params.pad_bottom), + static_cast(pool_params.pad_right)}), + TFPaddingToMklDnnPadding(this->padding_)); + auto pool_bkwd_prim_desc = pooling_backward::primitive_desc( + pool_bkwd_desc, cpu_engine, pool_fwd_prim_desc); - this->AllocateOutputTensor(context, pool_bkwd_prim_desc, - orig_input_dims_mkl_order, - this->data_format_mkldnn_, - &output_tensor); - output_dnn_data.SetUsrMemDataHandle(output_tensor); + this->AllocateOutputTensor(context, pool_bkwd_prim_desc, + orig_input_dims_mkl_order, + this->data_format_mkldnn_, &output_tensor); + output_dnn_data.SetUsrMemDataHandle(output_tensor); - ConfigureWorkspace(workspace_tensor, - pool_fwd_prim_desc.workspace_primitive_desc(), - &workspace_dnn_data); - 
this->PrepareAndExecuteNet(pool_bkwd_prim_desc, - &grad_dnn_data, - &output_dnn_data, - memory::primitive_desc( - target_diff_dst_md, - cpu_engine), - &workspace_dnn_data); - } catch (mkldnn::error &e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Compute received an exception:", - error_msg)); + ConfigureWorkspace(workspace_tensor, + pool_fwd_prim_desc.workspace_primitive_desc(), + &workspace_dnn_data); + this->PrepareAndExecuteNet( + pool_bkwd_prim_desc, &grad_dnn_data, &output_dnn_data, + memory::primitive_desc(target_diff_dst_md, cpu_engine), + &workspace_dnn_data); + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:", + error_msg)); } } // Compute private: - // .Input("orig_input: T") - // .Input("orig_output: T") - // .Input("grad: T") - // .Input("workspace: T") - const int kInputTensorIndexOrigInput = 0; - const int kInputTensorIndexOrigOutput = 1; - const int kInputTensorIndexGradient = 2; - const int kInputTensorIndexWorkspace = 3; - // Output("output: T") in Base Class + // .Input("orig_input: T") + // .Input("orig_output: T") + // .Input("grad: T") + // .Input("workspace: T") + const int kInputTensorIndexOrigInput = 0; + const int kInputTensorIndexOrigOutput = 1; + const int kInputTensorIndexGradient = 2; + const int kInputTensorIndexWorkspace = 3; + // Output("output: T") in Base Class - memory::desc ConfigureOriginalInput(OpKernelContext* context, - const Tensor& tensor_original_input, - const MklDnnShape& original_input_mkl_shape, - memory::dims* original_input_dims_mkl_order, - MklPoolParameters* pool_params, - TensorShape* input_tensor_shape) { - *input_tensor_shape = tensor_original_input.shape(); - return MklPoolingBackwardOpBase::ConfigureOriginalInput( - context, - tensor_original_input, - original_input_mkl_shape, - original_input_dims_mkl_order, - pool_params, - *input_tensor_shape); + memory::desc ConfigureOriginalInput( + OpKernelContext* context, const Tensor& tensor_original_input, + const MklDnnShape& original_input_mkl_shape, + memory::dims* original_input_dims_mkl_order, + MklPoolParameters* pool_params, TensorShape* input_tensor_shape) { + *input_tensor_shape = tensor_original_input.shape(); + return MklPoolingBackwardOpBase::ConfigureOriginalInput( + context, tensor_original_input, original_input_mkl_shape, + original_input_dims_mkl_order, pool_params, *input_tensor_shape); + } + + void ConfigureWorkspace(const Tensor& workspace_tensor, + memory::primitive_desc workspace_pd, + MklDnnData* workspace_dnn_data) { + CHECK_NOTNULL(workspace_dnn_data); + + workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor); + } + + void SanityCheckInputs(OpKernelContext* context, + const Tensor& orig_input_tensor, + const Tensor& orig_output_tensor, + const Tensor& grad_tensor, + const Tensor& workspace_tensor, + const MklDnnShape& orig_input_mkl_shape, + const MklDnnShape& orig_output_mkl_shape, + const MklDnnShape& grad_mkl_shape, + const MklDnnShape& workspace_mkl_shape) { + if (!orig_input_mkl_shape.IsMklTensor()) { + OP_REQUIRES(context, orig_input_tensor.dims() == 4, + errors::InvalidArgument("Original input shape must be " + "4-dimensional")); + } else { + OP_REQUIRES(context, 
orig_input_mkl_shape.GetDimension() == 4, + errors::InvalidArgument("Original input shape must be " + "4-dimensional")); } - - void ConfigureWorkspace(const Tensor& workspace_tensor, - memory::primitive_desc workspace_pd, - MklDnnData *workspace_dnn_data) { - CHECK_NOTNULL(workspace_dnn_data); - - workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor); + if (!orig_output_mkl_shape.IsMklTensor()) { + OP_REQUIRES(context, orig_output_tensor.dims() == 4, + errors::InvalidArgument("Original output must be " + "4-dimensional")); + } else { + OP_REQUIRES(context, orig_output_mkl_shape.GetDimension() == 4, + errors::InvalidArgument("Original output must be " + "4-dimensional")); } - - void SanityCheckInputs(OpKernelContext* context, - const Tensor& orig_input_tensor, - const Tensor& orig_output_tensor, - const Tensor& grad_tensor, - const Tensor& workspace_tensor, - const MklDnnShape& orig_input_mkl_shape, - const MklDnnShape& orig_output_mkl_shape, - const MklDnnShape& grad_mkl_shape, - const MklDnnShape& workspace_mkl_shape) { - if (!orig_input_mkl_shape.IsMklTensor()) { - OP_REQUIRES(context, orig_input_tensor.dims() == 4, - errors::InvalidArgument("Original input shape must be " - "4-dimensional")); - } else { - OP_REQUIRES(context, orig_input_mkl_shape.GetDimension() == 4, - errors::InvalidArgument("Original input shape must be " - "4-dimensional")); - } - if (!orig_output_mkl_shape.IsMklTensor()) { - OP_REQUIRES(context, orig_output_tensor.dims() == 4, - errors::InvalidArgument("Original output must be " - "4-dimensional")); - } else { - OP_REQUIRES(context, orig_output_mkl_shape.GetDimension() == 4, - errors::InvalidArgument("Original output must be " - "4-dimensional")); - } - if (!grad_mkl_shape.IsMklTensor()) { - OP_REQUIRES(context, grad_tensor.dims() == 4, - errors::InvalidArgument("Gradient must be 4-dimensional")); - } else { - OP_REQUIRES(context, grad_mkl_shape.GetDimension() == 4, - errors::InvalidArgument("Gradient must be " - "4-dimensional")); - } - if (this->workspace_enabled_) { - // The workspace should not be an MKL tensor - OP_REQUIRES(context, workspace_mkl_shape.IsMklTensor() == false, - errors::InvalidArgument("Workspace tensor should not" - " be an MKL Tensor.")); - // It should only have one dimension - OP_REQUIRES(context, workspace_tensor.dims() == 1, - errors::InvalidArgument("Workspace tensor must be " - "1-dimensional")); - } else { - OP_REQUIRES(context, this->workspace_enabled_, - errors::Unimplemented("MKL-DNN Max Pooling does not " + if (!grad_mkl_shape.IsMklTensor()) { + OP_REQUIRES(context, grad_tensor.dims() == 4, + errors::InvalidArgument("Gradient must be 4-dimensional")); + } else { + OP_REQUIRES(context, grad_mkl_shape.GetDimension() == 4, + errors::InvalidArgument("Gradient must be " + "4-dimensional")); + } + if (this->workspace_enabled_) { + // The workspace should not be an MKL tensor + OP_REQUIRES(context, workspace_mkl_shape.IsMklTensor() == false, + errors::InvalidArgument("Workspace tensor should not" + " be an MKL Tensor.")); + // It should only have one dimension + OP_REQUIRES(context, workspace_tensor.dims() == 1, + errors::InvalidArgument("Workspace tensor must be " + "1-dimensional")); + } else { + OP_REQUIRES( + context, this->workspace_enabled_, + errors::Unimplemented("MKL-DNN Max Pooling does not " "yet support the use case " "where MaxPoolGrad is called without first" " calling MaxPool.")); - } } + } }; // MklMaxPoolingGradOp #endif // INTEL_MKL_DNN diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc 
b/tensorflow/core/kernels/mkl_pooling_ops_common.cc index f7cadffd39c..ef8597b0577 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc @@ -15,9 +15,9 @@ limitations under the License. #ifdef INTEL_MKL -#include -#include #include "tensorflow/core/kernels/mkl_pooling_ops_common.h" +#include +#include #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/kernels/bounds_check.h" @@ -111,17 +111,17 @@ void MklPoolParameters::Init(OpKernelContext* context, // TF can work with int64, but mkldnn only supports int32 // Fail if the height or width are greater than MAX_INT - OP_REQUIRES(context, FastBoundsCheck(out_height, - std::numeric_limits::max()), + OP_REQUIRES(context, + FastBoundsCheck(out_height, std::numeric_limits::max()), errors::InvalidArgument("output height is too large")); - OP_REQUIRES(context, FastBoundsCheck(out_width, - std::numeric_limits::max()), + OP_REQUIRES(context, + FastBoundsCheck(out_width, std::numeric_limits::max()), errors::InvalidArgument("output width is too large")); #endif out_depth = depth; // output will have the same depth as the input - } else { // we are pooling in the depth dimension + } else { // we are pooling in the depth dimension // Our current version of depthwise max pooling does not support // any padding, and expects the depth_window to equal the depth // stride (no overlapping). diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h index b974b2c59af..880e45ab1e7 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.h +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h @@ -17,16 +17,16 @@ limitations under the License. #define TENSORFLOW_CORE_KERNELS_MKL_POOLING_OPS_COMMON_H_ #ifdef INTEL_MKL -#include #include +#include #include "tensorflow/core/util/mkl_util.h" #include "tensorflow/core/util/padding.h" #ifdef INTEL_MKL_DNN #include "mkldnn.hpp" using mkldnn::memory; -using mkldnn::pooling_forward; using mkldnn::pooling_backward; +using mkldnn::pooling_forward; using mkldnn::stream; #endif @@ -61,13 +61,25 @@ struct MklPoolParameters { TensorFormat data_format; MklPoolParameters() - : depth(0) - , tensor_in_cols(0), tensor_in_rows(0), tensor_in_batch(0) - , window_rows(0), window_cols(0), depth_window(0) - , row_stride(0), col_stride(0), depth_stride(0) - , out_height(0), out_width(0), out_depth(0) - , pad_left(0), pad_right(0), pad_top(0), pad_bottom(0), pad_depth(0) - , data_format(TensorFormat::FORMAT_NCHW) {} + : depth(0), + tensor_in_cols(0), + tensor_in_rows(0), + tensor_in_batch(0), + window_rows(0), + window_cols(0), + depth_window(0), + row_stride(0), + col_stride(0), + depth_stride(0), + out_height(0), + out_width(0), + out_depth(0), + pad_left(0), + pad_right(0), + pad_top(0), + pad_bottom(0), + pad_depth(0), + data_format(TensorFormat::FORMAT_NCHW) {} // Updates context->status if there is an invalid input. 
void Init(OpKernelContext* context, const std::vector& ksize, @@ -96,33 +108,31 @@ template class MklPoolingOpBase : public OpKernel { public: explicit MklPoolingOpBase(OpKernelConstruction* context) - : OpKernel(context) - , workspace_enabled_(false) { - string data_format; - OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); - OP_REQUIRES(context, - FormatFromString(data_format, &this->data_format_tf_), - errors::InvalidArgument("Invalid data format")); - this->data_format_mkldnn_ - = TFDataFormatToMklDnnDataFormat(this->data_format_tf_); - OP_REQUIRES_OK(context, context->GetAttr("ksize", &this->ksize_)); - OP_REQUIRES(context, this->ksize_.size() == 4, - errors::InvalidArgument("Sliding window ksize field must " - "specify 4 dimensions")); - OP_REQUIRES_OK(context, context->GetAttr("strides", &this->stride_)); - OP_REQUIRES(context, this->stride_.size() == 4, - errors::InvalidArgument("Sliding window strides field must " - "specify 4 dimensions")); - OP_REQUIRES_OK(context, context->GetAttr("padding", &this->padding_)); - OP_REQUIRES(context, this->ksize_[0] == 1 && this->stride_[0] == 1, - errors::Unimplemented("Pooling is not yet supported on the " - "batch dimension.")); + : OpKernel(context), workspace_enabled_(false) { + string data_format; + OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); + OP_REQUIRES(context, FormatFromString(data_format, &this->data_format_tf_), + errors::InvalidArgument("Invalid data format")); + this->data_format_mkldnn_ = + TFDataFormatToMklDnnDataFormat(this->data_format_tf_); + OP_REQUIRES_OK(context, context->GetAttr("ksize", &this->ksize_)); + OP_REQUIRES(context, this->ksize_.size() == 4, + errors::InvalidArgument("Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &this->stride_)); + OP_REQUIRES(context, this->stride_.size() == 4, + errors::InvalidArgument("Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &this->padding_)); + OP_REQUIRES(context, this->ksize_[0] == 1 && this->stride_[0] == 1, + errors::Unimplemented("Pooling is not yet supported on the " + "batch dimension.")); - // We may not get this attribute for this node if it does not go through - // graph rewrite pass. So we do not check for error while retrieving this - // attribute value. - context->GetAttr("workspace_enabled", &this->workspace_enabled_); - } + // We may not get this attribute for this node if it does not go through + // graph rewrite pass. So we do not check for error while retrieving this + // attribute value. + context->GetAttr("workspace_enabled", &this->workspace_enabled_); + } void Compute(OpKernelContext* context) override = 0; protected: @@ -132,24 +142,24 @@ class MklPoolingOpBase : public OpKernel { // output height and output width to have already been int32 // bounds-checked void GetOutputDims(const MklPoolParameters& mkl_pool_params, - memory::dims* output_dims_mkl_order) { + memory::dims* output_dims_mkl_order) { // MKL-DNN always needs output in NCHW format. 
- *output_dims_mkl_order = { mkl_pool_params.tensor_in_batch, + *output_dims_mkl_order = {mkl_pool_params.tensor_in_batch, mkl_pool_params.out_depth, static_cast(mkl_pool_params.out_height), static_cast(mkl_pool_params.out_width)}; } void InitMklPoolParameters(OpKernelContext* context, - MklPoolParameters* pool_params, - const MklDnnShape& original_input_mkl_shape, - const TensorShape& input_tensor_shape) { + MklPoolParameters* pool_params, + const MklDnnShape& original_input_mkl_shape, + const TensorShape& input_tensor_shape) { if (!original_input_mkl_shape.IsMklTensor()) { pool_params->Init(context, this->ksize_, this->stride_, this->padding_, - this->data_format_tf_, input_tensor_shape); + this->data_format_tf_, input_tensor_shape); } else { pool_params->Init(context, this->ksize_, this->stride_, this->padding_, - this->data_format_tf_, &original_input_mkl_shape); + this->data_format_tf_, &original_input_mkl_shape); } } @@ -159,13 +169,12 @@ class MklPoolingOpBase : public OpKernel { size_t GetNumTElements(const memory::primitive_desc& pd) { size_t num_bytes = pd.get_size(); size_t ret_val = num_bytes / sizeof(T); - if ( num_bytes % sizeof(T) != 0 ) { - ret_val++; + if (num_bytes % sizeof(T) != 0) { + ret_val++; } return ret_val; } - std::vector ksize_; std::vector stride_; Padding padding_; @@ -183,30 +192,29 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase { protected: void ConfigureInput(OpKernelContext* context, - const MklDnnShape& input_mkl_shape, - const Tensor& input_tensor, - MklPoolParameters* pool_params, - MklDnnData* dnn_data_input) { + const MklDnnShape& input_mkl_shape, + const Tensor& input_tensor, + MklPoolParameters* pool_params, + MklDnnData* dnn_data_input) { CHECK_NOTNULL(pool_params); CHECK_NOTNULL(dnn_data_input); TensorShape input_tensor_shape = input_tensor.shape(); - memory::desc input_md = input_mkl_shape.IsMklTensor() - ? input_mkl_shape.GetMklLayout() - : memory::desc( - TFShapeToMklDnnDimsInNCHW( - input_tensor_shape, this->data_format_tf_), - MklDnnType(), - this->data_format_mkldnn_); + memory::desc input_md = + input_mkl_shape.IsMklTensor() + ? input_mkl_shape.GetMklLayout() + : memory::desc(TFShapeToMklDnnDimsInNCHW(input_tensor_shape, + this->data_format_tf_), + MklDnnType(), this->data_format_mkldnn_); dnn_data_input->SetUsrMem(input_md, &input_tensor); - this->InitMklPoolParameters(context, pool_params, - input_mkl_shape, input_tensor_shape); + this->InitMklPoolParameters(context, pool_params, input_mkl_shape, + input_tensor_shape); } - void AllocateOutputTensor(OpKernelContext* context, - const pooling_forward::primitive_desc& pool_fwd_prim_desc, - const memory::dims output_dims_mkl_order, - const memory::format& output_tf_format, - Tensor** output_tensor) { + void AllocateOutputTensor( + OpKernelContext* context, + const pooling_forward::primitive_desc& pool_fwd_prim_desc, + const memory::dims output_dims_mkl_order, + const memory::format& output_tf_format, Tensor** output_tensor) { CHECK_NOTNULL(output_tensor); memory::primitive_desc dst_pd = pool_fwd_prim_desc.dst_primitive_desc(); @@ -215,50 +223,42 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase { output_mkl_shape.SetMklLayout(&dst_pd); output_mkl_shape.SetElemType(MklDnnType()); output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), - output_dims_mkl_order, - output_tf_format); + output_dims_mkl_order, output_tf_format); TensorShape output_tf_shape; // only allocate enough space for the elements we need. 
output_tf_shape.AddDim(this->GetNumTElements(dst_pd)); - AllocateOutputSetMklShape(context, kOutputTensorIndexOutput, - output_tensor, - output_tf_shape, output_mkl_shape); + AllocateOutputSetMklShape(context, kOutputTensorIndexOutput, output_tensor, + output_tf_shape, output_mkl_shape); CHECK_NOTNULL(*output_tensor); } void PrepareAndExecuteNet( - const pooling_forward::primitive_desc& pool_fwd_desc, - const MklDnnData* src, - MklDnnData* dst, - MklDnnData* wksp = nullptr) { + const pooling_forward::primitive_desc& pool_fwd_desc, + const MklDnnData* src, MklDnnData* dst, + MklDnnData* wksp = nullptr) { std::vector net; // Create pooling primitive and add it to net if (wksp != nullptr) { - net.push_back(pooling_forward(pool_fwd_desc, - src->GetOpMem(), - dst->GetOpMem(), - wksp->GetOpMem())); + net.push_back(pooling_forward(pool_fwd_desc, src->GetOpMem(), + dst->GetOpMem(), wksp->GetOpMem())); } else { - net.push_back(pooling_forward(pool_fwd_desc, - src->GetOpMem(), - dst->GetOpMem())); + net.push_back( + pooling_forward(pool_fwd_desc, src->GetOpMem(), dst->GetOpMem())); } stream(stream::kind::eager).submit(net).wait(); } - - void SanityCheckInput(OpKernelContext* context, - const Tensor& input_tensor, - const MklDnnShape& input_mkl_shape) { + void SanityCheckInput(OpKernelContext* context, const Tensor& input_tensor, + const MklDnnShape& input_mkl_shape) { if (!input_mkl_shape.IsMklTensor()) { OP_REQUIRES(context, input_tensor.dims() == 4, - errors::InvalidArgument("Input must be 4-dimensional")); + errors::InvalidArgument("Input must be 4-dimensional")); } else { - OP_REQUIRES(context, input_mkl_shape.GetDimension() == 4, - errors::InvalidArgument("Input shape must be " - "4-dimensional")); + OP_REQUIRES(context, input_mkl_shape.GetDimension() == 4, + errors::InvalidArgument("Input shape must be " + "4-dimensional")); } } // .Input("value: T") @@ -267,66 +267,58 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase { const int kOutputTensorIndexOutput = 0; }; // MklPoolingForwardBaseOp - template class MklPoolingBackwardOpBase : public MklPoolingOpBase { public: explicit MklPoolingBackwardOpBase(OpKernelConstruction* context) - : MklPoolingOpBase(context) { } + : MklPoolingOpBase(context) {} void Compute(OpKernelContext* context) override = 0; protected: const int kOutputTensorIndexOutput = 0; - void AllocateOutputTensor(OpKernelContext* context, - const pooling_backward::primitive_desc& pool_bkwd_prim_desc, - const memory::dims output_dims_mkl_order, - const memory::format& output_tf_format, - Tensor** output_tensor) { + void AllocateOutputTensor( + OpKernelContext* context, + const pooling_backward::primitive_desc& pool_bkwd_prim_desc, + const memory::dims output_dims_mkl_order, + const memory::format& output_tf_format, Tensor** output_tensor) { CHECK_NOTNULL(output_tensor); - memory::primitive_desc dst_pd - = pool_bkwd_prim_desc.diff_src_primitive_desc(); + memory::primitive_desc dst_pd = + pool_bkwd_prim_desc.diff_src_primitive_desc(); MklDnnShape output_mkl_shape; output_mkl_shape.SetMklTensor(true); output_mkl_shape.SetMklLayout(&dst_pd); output_mkl_shape.SetElemType(MklDnnType()); output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), - output_dims_mkl_order, - output_tf_format); + output_dims_mkl_order, output_tf_format); TensorShape output_tf_shape; output_tf_shape.AddDim(this->GetNumTElements(dst_pd)); - AllocateOutputSetMklShape(context, kOutputTensorIndexOutput, - output_tensor, - output_tf_shape, output_mkl_shape); + AllocateOutputSetMklShape(context, 
kOutputTensorIndexOutput, output_tensor, + output_tf_shape, output_mkl_shape); CHECK_NOTNULL(*output_tensor); } void PrepareAndExecuteNet( - const pooling_backward::primitive_desc& pool_bkwd_desc, - MklDnnData* input_gradient_diff_dst, - MklDnnData* output_diff_src, - const memory::primitive_desc& target_diff_dst_pd, - const MklDnnData* workspace = nullptr) { - + const pooling_backward::primitive_desc& pool_bkwd_desc, + MklDnnData* input_gradient_diff_dst, MklDnnData* output_diff_src, + const memory::primitive_desc& target_diff_dst_pd, + const MklDnnData* workspace = nullptr) { std::vector net; // If the input gradient isn't in the same format as the output // reorder it to the same format as the output - input_gradient_diff_dst->CheckReorderToOpMem( - target_diff_dst_pd, - &net); + input_gradient_diff_dst->CheckReorderToOpMem(target_diff_dst_pd, &net); // Create pooling primitive and add it to net if (nullptr == workspace) { net.push_back(pooling_backward(pool_bkwd_desc, - input_gradient_diff_dst->GetOpMem(), - output_diff_src->GetOpMem())); + input_gradient_diff_dst->GetOpMem(), + output_diff_src->GetOpMem())); } else { - net.push_back(pooling_backward(pool_bkwd_desc, - input_gradient_diff_dst->GetOpMem(), - workspace->GetOpMem(), - output_diff_src->GetOpMem())); + net.push_back( + pooling_backward(pool_bkwd_desc, input_gradient_diff_dst->GetOpMem(), + workspace->GetOpMem(), output_diff_src->GetOpMem())); } stream(stream::kind::eager).submit(net).wait(); } @@ -334,77 +326,73 @@ class MklPoolingBackwardOpBase : public MklPoolingOpBase { // Max Pooling and Avg Pooling have slightly different implementations // Takes the Tensor containing original input data and the original // mkl Dnn Shape and populates other data - memory::desc ConfigureOriginalInput(OpKernelContext* context, - const Tensor& tensor_original_input_shape, - const MklDnnShape& original_input_mkl_shape, - memory::dims* original_input_dims_nchw, - MklPoolParameters* pool_params, - const TensorShape& input_tensor_shape) { + memory::desc ConfigureOriginalInput( + OpKernelContext* context, const Tensor& tensor_original_input_shape, + const MklDnnShape& original_input_mkl_shape, + memory::dims* original_input_dims_nchw, MklPoolParameters* pool_params, + const TensorShape& input_tensor_shape) { CHECK_NOTNULL(original_input_dims_nchw); CHECK_NOTNULL(pool_params); - this->InitMklPoolParameters(context, pool_params, - original_input_mkl_shape, - input_tensor_shape); + this->InitMklPoolParameters(context, pool_params, original_input_mkl_shape, + input_tensor_shape); - *original_input_dims_nchw - = original_input_mkl_shape.IsMklTensor() - ? original_input_mkl_shape.GetSizesAsMklDnnDims() - : TFShapeToMklDnnDimsInNCHW(input_tensor_shape, - this->data_format_tf_); + *original_input_dims_nchw = + original_input_mkl_shape.IsMklTensor() + ? original_input_mkl_shape.GetSizesAsMklDnnDims() + : TFShapeToMklDnnDimsInNCHW(input_tensor_shape, + this->data_format_tf_); - return original_input_mkl_shape.IsMklTensor() - ? original_input_mkl_shape.GetMklLayout() - : memory::desc(*original_input_dims_nchw, - MklDnnType(), - this->data_format_mkldnn_); + return original_input_mkl_shape.IsMklTensor() + ? 
original_input_mkl_shape.GetMklLayout() + : memory::desc(*original_input_dims_nchw, MklDnnType(), + this->data_format_mkldnn_); } - memory::desc ConfigureOriginalOutput(const MklPoolParameters& pool_params, - const MklDnnShape& original_output_mkl_shape, - memory::dims output_dims_mkl_order) { + memory::desc ConfigureOriginalOutput( + const MklPoolParameters& pool_params, + const MklDnnShape& original_output_mkl_shape, + memory::dims output_dims_mkl_order) { this->GetOutputDims(pool_params, &output_dims_mkl_order); return original_output_mkl_shape.IsMklTensor() - ? original_output_mkl_shape.GetMklLayout() - : memory::desc(output_dims_mkl_order, - MklDnnType(), - this->data_format_mkldnn_); + ? original_output_mkl_shape.GetMklLayout() + : memory::desc(output_dims_mkl_order, MklDnnType(), + this->data_format_mkldnn_); } memory::desc ConfigureInputGradient( - const MklDnnShape& input_gradient_mkl_shape, - const Tensor& input_gradient_tensor, - MklDnnData* input_gradient_dnn_data, - const memory::desc& original_output_md) { + const MklDnnShape& input_gradient_mkl_shape, + const Tensor& input_gradient_tensor, + MklDnnData* input_gradient_dnn_data, + const memory::desc& original_output_md) { // Configure the gradient as is - memory::desc original_input_grad_md - = input_gradient_mkl_shape.IsMklTensor() - ? input_gradient_mkl_shape.GetMklLayout() - : memory::desc(TFShapeToMklDnnDimsInNCHW( - input_gradient_tensor.shape(), - this->data_format_tf_), - MklDnnType(), this->data_format_mkldnn_); + memory::desc original_input_grad_md = + input_gradient_mkl_shape.IsMklTensor() + ? input_gradient_mkl_shape.GetMklLayout() + : memory::desc( + TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(), + this->data_format_tf_), + MklDnnType(), this->data_format_mkldnn_); input_gradient_dnn_data->SetUsrMem(original_input_grad_md, - &input_gradient_tensor); + &input_gradient_tensor); // Check to see if input grad diff dst is in the right format // Create a new memory descriptor with the same shape as the // original, but the format of the other tensors. memory::format original_output_format = - static_cast(original_output_md.data.format); - bool grad_reorder_needed = input_gradient_dnn_data->IsReorderNeeded( - original_output_format); - memory::dims diff_dst_dims = input_gradient_mkl_shape.IsMklTensor() - ? input_gradient_mkl_shape.GetSizesAsMklDnnDims() - : TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(), - this->data_format_tf_); - memory::desc target_diff_dst_md = memory::desc(diff_dst_dims, - MklDnnType(), original_output_format); + static_cast(original_output_md.data.format); + bool grad_reorder_needed = + input_gradient_dnn_data->IsReorderNeeded(original_output_format); + memory::dims diff_dst_dims = + input_gradient_mkl_shape.IsMklTensor() + ? input_gradient_mkl_shape.GetSizesAsMklDnnDims() + : TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(), + this->data_format_tf_); + memory::desc target_diff_dst_md = + memory::desc(diff_dst_dims, MklDnnType(), original_output_format); - return grad_reorder_needed - ? target_diff_dst_md - : original_input_grad_md; + return grad_reorder_needed ? target_diff_dst_md : original_input_grad_md; } }; #endif // INTEL_MKL_DNN diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc index dc899d8c7ee..873aca30ca9 100644 --- a/tensorflow/core/kernels/mkl_relu_op.cc +++ b/tensorflow/core/kernels/mkl_relu_op.cc @@ -16,29 +16,29 @@ limitations under the License. // See docs in ../ops/nn_ops.cc. 
#ifdef INTEL_MKL +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/errors.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "tensorflow/core/platform/default/logging.h" -#include "tensorflow/core/util/mkl_util.h" #include "mkl_dnn.h" #include "mkl_dnn_types.h" +#include "tensorflow/core/platform/default/logging.h" +#include "tensorflow/core/util/mkl_util.h" #ifdef INTEL_MKL_DNN #include "mkldnn.hpp" -using mkldnn::stream; -using mkldnn::prop_kind; using mkldnn::algorithm; -using mkldnn::relu_forward; -using mkldnn::relu_backward; -using mkldnn::eltwise_relu; using mkldnn::eltwise_elu; +using mkldnn::eltwise_relu; using mkldnn::eltwise_tanh; +using mkldnn::prop_kind; +using mkldnn::relu_backward; +using mkldnn::relu_forward; +using mkldnn::stream; #endif namespace tensorflow { @@ -180,7 +180,6 @@ class MklReluOp : public OpKernel { } MklReluOpContext; }; - template class MklReluGradOp : public OpKernel { public: @@ -214,10 +213,11 @@ class MklReluGradOp : public OpKernel { if (!dnnLayoutCompare_F32(lt_input, lt_grad)) { AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_grad, &mkl_buffer_convert); - CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input, - lt_grad), E_SUCCESS); + CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input, lt_grad), + E_SUCCESS); CHECK_EQ(dnnConversionExecute_F32(cv_input_to_grad, buf_input, - mkl_buffer_convert), E_SUCCESS); + mkl_buffer_convert), + E_SUCCESS); relu_res[dnnResourceSrc] = mkl_buffer_convert; dnnDelete_F32(cv_input_to_grad); } else { @@ -325,7 +325,8 @@ void MklReluGradOp::Compute(OpKernelContext* context) { float negative_slope = 0.0; CHECK_EQ(dnnReLUCreateBackward_F32(&mkl_context.prim_relu_bwd, NULL, mkl_context.lt_grad, mkl_context.lt_grad, - negative_slope), E_SUCCESS); + negative_slope), + E_SUCCESS); Tensor mkl_tmp_input_buf_tensor; mkl_context.MklPrepareReluGradInputs(context, &mkl_tmp_input_buf_tensor); @@ -348,7 +349,8 @@ void MklReluGradOp::Compute(OpKernelContext* context) { } tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast( - mkl_context.output_shape.GetMklLayout())) / sizeof(T)); + mkl_context.output_shape.GetMklLayout())) / + sizeof(T)); AllocateOutputSetMklShape(context, 0, &output, tf_shape, mkl_context.output_shape); } else { @@ -361,13 +363,11 @@ void MklReluGradOp::Compute(OpKernelContext* context) { mkl_context.relu_res[dnnResourceDiffSrc] = static_cast(output->flat().data()); - CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd, - mkl_context.relu_res), - E_SUCCESS); + CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd, mkl_context.relu_res), + E_SUCCESS); mkl_context.MklCleanup(); } - #else // INTEL_MKL_DNN template @@ -375,8 +375,7 @@ class MklReluOpBase : public OpKernel { public: ~MklReluOpBase() {} - explicit MklReluOpBase(OpKernelConstruction* context) : OpKernel(context) { - } + explicit MklReluOpBase(OpKernelConstruction* context) : OpKernel(context) {} virtual void Compute_Scalar(OpKernelContext* context) = 0; @@ -413,12 +412,12 @@ class MklReluOpBase : public OpKernel { T alpha = 0, beta = 0; std::shared_ptr relu_fwd_pd; - auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training, + auto relu_fwd_desc = relu_forward::desc( + prop_kind::forward_training, // Operator memory descriptor is same as user memory descriptor. 
- alg_kind, src.GetUsrMemDesc(), - alpha, beta); - relu_fwd_pd.reset(new relu_forward::primitive_desc(relu_fwd_desc, - cpu_engine)); + alg_kind, src.GetUsrMemDesc(), alpha, beta); + relu_fwd_pd.reset( + new relu_forward::primitive_desc(relu_fwd_desc, cpu_engine)); // allocate dst tensor MklDnnShape dnn_shape_dst; @@ -431,7 +430,7 @@ class MklReluOpBase : public OpKernel { dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(), dnn_shape_src.GetSizesAsMklDnnDims(), dnn_shape_src.GetTfDataFormat()); - tf_shape_dst.AddDim(dst_pd.get_size()/sizeof(T)); + tf_shape_dst.AddDim(dst_pd.get_size() / sizeof(T)); } else { dnn_shape_dst.SetMklTensor(false); tf_shape_dst = src_tensor.shape(); @@ -445,34 +444,32 @@ class MklReluOpBase : public OpKernel { // execute net std::vector net; - auto relu_fwd = relu_forward(*relu_fwd_pd, src.GetOpMem(), - dst.GetOpMem()); + auto relu_fwd = + relu_forward(*relu_fwd_pd, src.GetOpMem(), dst.GetOpMem()); net.push_back(relu_fwd); stream(stream::kind::eager).submit(net).wait(); - } catch (mkldnn::error &e) { + } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } }; - template class MklReluGradOpBase : public OpKernel { public: ~MklReluGradOpBase() {} - explicit MklReluGradOpBase(OpKernelConstruction* context) : - OpKernel(context) {} + explicit MklReluGradOpBase(OpKernelConstruction* context) + : OpKernel(context) {} virtual void Compute_Scalar(OpKernelContext* context) = 0; - void Compute(OpKernelContext* context) { + void Compute(OpKernelContext* context) { try { auto cpu_engine = engine(engine::cpu, 0); MklDnnData src(&cpu_engine); @@ -483,9 +480,9 @@ class MklReluGradOpBase : public OpKernel { const size_t src_index = 1; // index of src input tensor const size_t diff_src_index = 0; // index of diff_src output tensor - const Tensor& src_tensor = MklGetInput(context, src_index); + const Tensor& src_tensor = MklGetInput(context, src_index); const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index); - Tensor* diff_src_tensor = nullptr; + Tensor* diff_src_tensor = nullptr; MklDnnShape dnn_shape_src, dnn_shape_diff_dst; GetMklShape(context, src_index, &dnn_shape_src); @@ -526,25 +523,25 @@ class MklReluGradOpBase : public OpKernel { src_md = dnn_shape_src.GetMklLayout(); memory::format src_mkl_data_format = dnn_shape_src.GetTfDataFormat(); - auto src_tf_data_format = MklDnnDataFormatToTFDataFormat( - src_mkl_data_format); + auto src_tf_data_format = + MklDnnDataFormatToTFDataFormat(src_mkl_data_format); auto diff_dst_dims = TFShapeToMklDnnDimsInNCHW(diff_dst_tensor.shape(), src_tf_data_format); - diff_dst_md = memory::desc(diff_dst_dims, MklDnnType(), - src_mkl_data_format); + diff_dst_md = + memory::desc(diff_dst_dims, MklDnnType(), src_mkl_data_format); } else if (!dnn_shape_src.IsMklTensor() && - dnn_shape_diff_dst.IsMklTensor()) { + dnn_shape_diff_dst.IsMklTensor()) { // Same comment as above. 
diff_dst_md = dnn_shape_diff_dst.GetMklLayout(); memory::format diff_dst_mkl_data_format = - dnn_shape_diff_dst.GetTfDataFormat(); - auto diff_dst_tf_data_format = MklDnnDataFormatToTFDataFormat( - diff_dst_mkl_data_format); + dnn_shape_diff_dst.GetTfDataFormat(); + auto diff_dst_tf_data_format = + MklDnnDataFormatToTFDataFormat(diff_dst_mkl_data_format); auto src_dims = TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), diff_dst_tf_data_format); - src_md = memory::desc(src_dims, MklDnnType(), - diff_dst_mkl_data_format); + src_md = + memory::desc(src_dims, MklDnnType(), diff_dst_mkl_data_format); } else { // If both the inputs are in MKL format, we use Mkl layout of the input // tensors. @@ -572,12 +569,12 @@ class MklReluGradOpBase : public OpKernel { std::shared_ptr relu_fwd_pd; auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training, alg_kind, src_md, alpha, beta); - relu_fwd_pd.reset(new relu_forward::primitive_desc(relu_fwd_desc, - cpu_engine)); - auto relu_bwd_desc = relu_backward::desc(alg_kind, common_md, common_md, - alpha, beta); - auto relu_bwd_pd = relu_backward::primitive_desc(relu_bwd_desc, - cpu_engine, *relu_fwd_pd); + relu_fwd_pd.reset( + new relu_forward::primitive_desc(relu_fwd_desc, cpu_engine)); + auto relu_bwd_desc = + relu_backward::desc(alg_kind, common_md, common_md, alpha, beta); + auto relu_bwd_pd = relu_backward::primitive_desc( + relu_bwd_desc, cpu_engine, *relu_fwd_pd); // allocate diff_src tensor MklDnnShape dnn_shape_diff_src; @@ -590,33 +587,32 @@ class MklReluGradOpBase : public OpKernel { dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(), dnn_shape_src.GetSizesAsMklDnnDims(), dnn_shape_src.GetTfDataFormat()); - tf_shape_diff_src.AddDim(diff_src_pd.get_size()/sizeof(T)); + tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T)); } else { dnn_shape_diff_src.SetMklTensor(false); tf_shape_diff_src = src_tensor.shape(); } AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor, - tf_shape_diff_src, dnn_shape_diff_src); + tf_shape_diff_src, dnn_shape_diff_src); // diff_src memory descriptor is same as memory descriptor for both // inputs. 
diff_src.SetUsrMem(common_md, diff_src_tensor); PrepareAndExecuteNet(relu_bwd_pd, &src, &diff_src, &diff_dst); - } catch (mkldnn::error &e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } void PrepareAndExecuteNet(const relu_backward::primitive_desc& relu_prim_desc, - MklDnnData* src, MklDnnData* diff_src, MklDnnData* - diff_dst) { + MklDnnData* src, MklDnnData* diff_src, + MklDnnData* diff_dst) { std::vector net; // Check if we need to reorder original input tensors into common_md layout @@ -632,14 +628,13 @@ class MklReluGradOpBase : public OpKernel { } }; - template class MklReluOp : public MklReluOpBase { public: ~MklReluOp() {} - explicit MklReluOp(OpKernelConstruction* context) : - MklReluOpBase(context) {} + explicit MklReluOp(OpKernelConstruction* context) + : MklReluOpBase(context) {} virtual void Compute_Scalar(OpKernelContext* context) { const size_t src_index = 0; // index of src input tensor @@ -649,15 +644,15 @@ class MklReluOp : public MklReluOpBase { GetMklShape(context, src_index, &dnn_shape_src); Tensor* dst_tensor = nullptr; - void* user_i = static_cast(const_cast( - src_tensor.flat().data())); + void* user_i = + static_cast(const_cast(src_tensor.flat().data())); MklDnnShape dnn_shape_dst; dnn_shape_dst.SetMklTensor(false); AllocateOutputSetMklShape(context, dst_index, &dst_tensor, src_tensor.shape(), dnn_shape_dst); void* out_o = static_cast(dst_tensor->flat().data()); (static_cast(out_o))[0] = - std::max((static_cast(user_i))[0], static_cast(0)); + std::max((static_cast(user_i))[0], static_cast(0)); return; } }; @@ -667,14 +662,14 @@ class MklReluGradOp : public MklReluGradOpBase { public: ~MklReluGradOp() {} - explicit MklReluGradOp(OpKernelConstruction* context) : - MklReluGradOpBase(context) {} + explicit MklReluGradOp(OpKernelConstruction* context) + : MklReluGradOpBase(context) {} virtual void Compute_Scalar(OpKernelContext* context) { const size_t diff_dst_index = 0; // index of diff_dst input tensor const size_t src_index = 1; // index of src input tensor const size_t diff_src_index = 0; // index of diff_src output tensor - const Tensor& src_tensor = MklGetInput(context, src_index); + const Tensor& src_tensor = MklGetInput(context, src_index); const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index); Tensor* diff_src_tensor = nullptr; @@ -687,11 +682,11 @@ class MklReluGradOp : public MklReluGradOpBase { diff_dst_tensor.shape(), dnn_shape_diff_src); void* out_o = static_cast(diff_src_tensor->flat().data()); void* user_i = - static_cast(const_cast(src_tensor.flat().data())); + static_cast(const_cast(src_tensor.flat().data())); void* user_g = - static_cast(const_cast(diff_dst_tensor.flat().data())); - (static_cast(out_o))[0] = (static_cast(user_g))[0] * - ((static_cast(user_i))[0] > 0); + static_cast(const_cast(diff_dst_tensor.flat().data())); + (static_cast(out_o))[0] = + (static_cast(user_g))[0] * ((static_cast(user_i))[0] > 0); return; } }; @@ -701,8 +696,8 @@ class MklEluOp : public MklReluOpBase { public: ~MklEluOp() {} - explicit 
MklEluOp(OpKernelConstruction* context) : - MklReluOpBase(context) {} + explicit MklEluOp(OpKernelConstruction* context) + : MklReluOpBase(context) {} virtual void Compute_Scalar(OpKernelContext* context) { const size_t src_index = 0; // index of src input tensor @@ -712,8 +707,8 @@ class MklEluOp : public MklReluOpBase { GetMklShape(context, src_index, &dnn_shape_src); Tensor* dst_tensor = nullptr; - void* user_i = static_cast(const_cast( - src_tensor.flat().data())); + void* user_i = + static_cast(const_cast(src_tensor.flat().data())); MklDnnShape dnn_shape_dst; dnn_shape_dst.SetMklTensor(false); AllocateOutputSetMklShape(context, dst_index, &dst_tensor, @@ -734,14 +729,14 @@ class MklEluGradOp : public MklReluGradOpBase { public: ~MklEluGradOp() {} - explicit MklEluGradOp(OpKernelConstruction* context) : - MklReluGradOpBase(context) {} + explicit MklEluGradOp(OpKernelConstruction* context) + : MklReluGradOpBase(context) {} virtual void Compute_Scalar(OpKernelContext* context) { const size_t diff_dst_index = 0; // index of diff_dst input tensor const size_t src_index = 1; // index of src input tensor const size_t diff_src_index = 0; // index of diff_src output tensor - const Tensor& src_tensor = MklGetInput(context, src_index); + const Tensor& src_tensor = MklGetInput(context, src_index); const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index); Tensor* diff_src_tensor = nullptr; @@ -754,9 +749,9 @@ class MklEluGradOp : public MklReluGradOpBase { diff_dst_tensor.shape(), dnn_shape_diff_src); void* out_o = static_cast(diff_src_tensor->flat().data()); void* user_i = - static_cast(const_cast(src_tensor.flat().data())); + static_cast(const_cast(src_tensor.flat().data())); void* user_g = - static_cast(const_cast(diff_dst_tensor.flat().data())); + static_cast(const_cast(diff_dst_tensor.flat().data())); // gradient of elu(x) = 1 if x > 0; elu(x) + 1 otherwise T feature = (static_cast(user_i))[0]; if (feature > 0) { @@ -773,8 +768,8 @@ class MklTanhOp : public MklReluOpBase { public: ~MklTanhOp() {} - explicit MklTanhOp(OpKernelConstruction* context) : - MklReluOpBase(context) {} + explicit MklTanhOp(OpKernelConstruction* context) + : MklReluOpBase(context) {} virtual void Compute_Scalar(OpKernelContext* context) { const size_t src_index = 0; // index of src input tensor @@ -784,8 +779,8 @@ class MklTanhOp : public MklReluOpBase { GetMklShape(context, src_index, &dnn_shape_src); Tensor* dst_tensor = nullptr; - void* user_i = static_cast(const_cast( - src_tensor.flat().data())); + void* user_i = + static_cast(const_cast(src_tensor.flat().data())); MklDnnShape dnn_shape_dst; dnn_shape_dst.SetMklTensor(false); AllocateOutputSetMklShape(context, dst_index, &dst_tensor, @@ -795,7 +790,7 @@ class MklTanhOp : public MklReluOpBase { T feature = (static_cast(user_i))[0]; T e1 = std::exp(feature); T e2 = std::exp(-feature); - (static_cast(out_o))[0] = (e1 - e2)/(e1 + e2); + (static_cast(out_o))[0] = (e1 - e2) / (e1 + e2); return; } }; @@ -805,14 +800,14 @@ class MklTanhGradOp : public MklReluGradOpBase { public: ~MklTanhGradOp() {} - explicit MklTanhGradOp(OpKernelConstruction* context) : - MklReluGradOpBase(context) {} + explicit MklTanhGradOp(OpKernelConstruction* context) + : MklReluGradOpBase(context) {} virtual void Compute_Scalar(OpKernelContext* context) { const size_t diff_dst_index = 0; // index of diff_dst input tensor const size_t src_index = 1; // index of src input tensor const size_t diff_src_index = 0; // index of diff_src output tensor - const Tensor& src_tensor = 
MklGetInput(context, src_index); + const Tensor& src_tensor = MklGetInput(context, src_index); const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index); Tensor* diff_src_tensor = nullptr; @@ -825,16 +820,16 @@ class MklTanhGradOp : public MklReluGradOpBase { diff_dst_tensor.shape(), dnn_shape_diff_src); void* out_o = static_cast(diff_src_tensor->flat().data()); void* user_i = - static_cast(const_cast(src_tensor.flat().data())); + static_cast(const_cast(src_tensor.flat().data())); // gradient of tanh(x) = 1 - tanh(x)^2 T feature = (static_cast(user_i))[0]; T e1 = std::exp(feature); T e2 = std::exp(-feature); - T tanh = (e1 - e2)/(e1 + e2); + T tanh = (e1 - e2) / (e1 + e2); void* user_g = - static_cast(const_cast(diff_dst_tensor.flat().data())); - (static_cast(out_o))[0] = (static_cast(user_g))[0] * - (1 - tanh * tanh); + static_cast(const_cast(diff_dst_tensor.flat().data())); + (static_cast(out_o))[0] = + (static_cast(user_g))[0] * (1 - tanh * tanh); } }; @@ -857,13 +852,13 @@ TF_CALL_float(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES); #ifdef INTEL_MKL_DNN // register dnn kernels for supported operations and supported types -#define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type) \ - REGISTER_KERNEL_BUILDER(Name("_MklElu") \ +#define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type) \ + REGISTER_KERNEL_BUILDER(Name("_MklElu") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .Label(mkl_op_registry::kMklOpLabel), \ - MklEluOp); \ - REGISTER_KERNEL_BUILDER(Name("_MklEluGrad") \ + MklEluOp); \ + REGISTER_KERNEL_BUILDER(Name("_MklEluGrad") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .Label(mkl_op_registry::kMklOpLabel), \ @@ -888,4 +883,3 @@ TF_CALL_float(REGISTER_TANH_MKL_SUPPORTED_KERNELS_TYPES); } // namespace tensorflow #endif // INTEL_MKL - diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc index b41e529357b..7d471e1e4cc 100644 --- a/tensorflow/core/kernels/mkl_reshape_op.cc +++ b/tensorflow/core/kernels/mkl_reshape_op.cc @@ -166,9 +166,9 @@ class MklReshapeOp : public OpKernel { MklDnnShape mkl_shape_input; GetMklShape(context, kInputSlotIdx, &mkl_shape_input); bool input_in_mkl_format = mkl_shape_input.IsMklTensor(); - const int64 nelems = input_in_mkl_format ? - mkl_shape_input.GetTfShape().num_elements() - : input_tensor.NumElements(); + const int64 nelems = input_in_mkl_format + ? mkl_shape_input.GetTfShape().num_elements() + : input_tensor.NumElements(); // Preliminary validation of sizes. OP_REQUIRES(context, IsLegacyVector(sizes.shape()), @@ -210,11 +210,11 @@ class MklReshapeOp : public OpKernel { product)); shape.set_dim(unknown_index, missing); } - OP_REQUIRES(context, shape.num_elements() == nelems, - errors::InvalidArgument("Input to reshape is a tensor with ", - nelems, - " values, but the requested shape has ", - shape.num_elements())); + OP_REQUIRES( + context, shape.num_elements() == nelems, + errors::InvalidArgument("Input to reshape is a tensor with ", nelems, + " values, but the requested shape has ", + shape.num_elements())); if (input_in_mkl_format) { TensorShape& shape_to = shape; @@ -237,38 +237,38 @@ class MklReshapeOp : public OpKernel { // need to update MklDnnShape object associated with the input // tensor to reflect the shape change expected by reshape. if (!SkipReorder(mkl_shape_input, shape_to)) { - // If dimensions that are being expanded or collapsed are not - // maintained contiguously by MKLDNN, then we use reorder. 
+ // If dimensions that are being expanded or collapsed are not + // maintained contiguously by MKLDNN, then we use reorder. - // Get Mkl layout of input tensor. - auto input_mkl_md = mkl_shape_input.GetMklLayout(); - // Set input Mkl layout as the user layout. - dnn_data_input.SetUsrMem(input_mkl_md, &input_tensor); - // Get expected Tensorflow layout of input tensor. - auto output_tf_md = mkl_shape_input.GetTfLayout(); - auto output_tf_pd = memory::primitive_desc(output_tf_md, - cpu_engine); + // Get Mkl layout of input tensor. + auto input_mkl_md = mkl_shape_input.GetMklLayout(); + // Set input Mkl layout as the user layout. + dnn_data_input.SetUsrMem(input_mkl_md, &input_tensor); + // Get expected Tensorflow layout of input tensor. + auto output_tf_md = mkl_shape_input.GetTfLayout(); + auto output_tf_pd = + memory::primitive_desc(output_tf_md, cpu_engine); - Tensor* output_tensor = nullptr; - MklShape mkl_shape_output; - mkl_shape_output.SetMklTensor(false); - // We allocate output tensor in the shape expected by Reshape. - AllocateOutputSetMklShape(context, kOutputSlotIdx, &output_tensor, - shape_to, mkl_shape_output); + Tensor* output_tensor = nullptr; + MklShape mkl_shape_output; + mkl_shape_output.SetMklTensor(false); + // We allocate output tensor in the shape expected by Reshape. + AllocateOutputSetMklShape(context, kOutputSlotIdx, &output_tensor, + shape_to, mkl_shape_output); - // Insert reorder between Mkl layout and TensorFlow layout if - // needed. If reorder is not needed but reshape is needed (since - // shape_from != shape_to), then we just copy input tensor to - // output tensor with target shape (we cannot forward Mkl layout - // in such case because shape has changed.) - std::vector net; - if (dnn_data_input.CheckReorderToOpMem(output_tf_pd, - output_tensor, &net)) { - stream(stream::kind::eager).submit(net).wait(); - } else { - output_tensor->CopyFrom(input_tensor, shape_to); - } - return; + // Insert reorder between Mkl layout and TensorFlow layout if + // needed. If reorder is not needed but reshape is needed (since + // shape_from != shape_to), then we just copy input tensor to + // output tensor with target shape (we cannot forward Mkl layout + // in such case because shape has changed.) + std::vector net; + if (dnn_data_input.CheckReorderToOpMem(output_tf_pd, output_tensor, + &net)) { + stream(stream::kind::eager).submit(net).wait(); + } else { + output_tensor->CopyFrom(input_tensor, shape_to); + } + return; } else { // If dimensions that are being expanded or collapsed are // maintained contiguously by MKLDNN, then we skip reorder, just @@ -276,10 +276,10 @@ class MklReshapeOp : public OpKernel { // Tensorflow tensor as it is to the output. auto output_dims = TFShapeToMklDnnDims(shape_to); auto output_strides = CalculateTFStrides(output_dims); - auto output_tf_md = MklDnnData::CreateBlockedMemDesc(output_dims, - output_strides); - auto output_tf_pd = memory::primitive_desc(output_tf_md, - cpu_engine); + auto output_tf_md = MklDnnData::CreateBlockedMemDesc( + output_dims, output_strides); + auto output_tf_pd = + memory::primitive_desc(output_tf_md, cpu_engine); // Set MklDnnShape MklDnnShape mkl_shape_output; @@ -291,18 +291,17 @@ class MklReshapeOp : public OpKernel { // We now simply forward input Mkl tensor to output and change its // output MklDnnShape object. 
- ForwardMklTensorInToOutWithMklShape(context, kInputSlotIdx, - kOutputSlotIdx, mkl_shape_output); + ForwardMklTensorInToOutWithMklShape( + context, kInputSlotIdx, kOutputSlotIdx, mkl_shape_output); return; } - } catch (mkldnn::error &e) { + } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } } else { diff --git a/tensorflow/core/kernels/mkl_tfconv_op.cc b/tensorflow/core/kernels/mkl_tfconv_op.cc new file mode 100644 index 00000000000..c35f857cfe6 --- /dev/null +++ b/tensorflow/core/kernels/mkl_tfconv_op.cc @@ -0,0 +1,124 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifdef INTEL_MKL + +#include +#include +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/util/tensor_format.h" + +#include "mkl_dnn.h" +#include "mkl_dnn_types.h" +#include "tensorflow/core/util/mkl_util.h" + +namespace tensorflow { +typedef Eigen::ThreadPoolDevice CPUDevice; + +/////////////////////////////////////////////////////////// +// Op kernel +/////////////////////////////////////////////////////////// + +template +class MklToTfOp : public OpKernel { + public: + explicit MklToTfOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str)); + OP_REQUIRES_OK(context, context->GetAttr("T", &op_data_type)); + has_avx512f_ = port::TestCPUFeature(port::CPUFeature::AVX512F); + } + + void Compute(OpKernelContext* context) override { + // Check that input tensor is in MKL format. + const Tensor& input_tensor = MklGetInput(context, 0); + MklShape input_shape; + GetMklShape(context, 0, &input_shape); + + // if input is already in Tf format, then just copy input tensor to output. + if (!input_shape.IsMklTensor()) { + context->set_output(0, input_tensor); + VLOG(1) << "MKLToTFConversion: No conversion needed, " + << "copying input to output"; + return; + } + + // Check that input data type is same as operator data type and that it is + // same as output data type. 
+ DataType input_data_type = input_type(0); + DataType output_data_type = output_type(0); + CHECK_EQ(op_data_type, input_data_type); + CHECK_EQ(op_data_type, output_data_type); + + TensorShape output_shape; + size_t ndims = input_shape.GetDimension(); + size_t* in_sizes = new size_t[ndims]; + for (size_t i = 0; i < ndims; i++) { + // Outermost to innermost dimension + output_shape.AddDim(input_shape.GetSizes()[input_shape.tf_dim_idx(i)]); + in_sizes[i] = input_shape.GetSizes()[i]; + } + + // Allocate output tensor. + Tensor* output_tensor = NULL; + OP_REQUIRES_OK(context, + context->allocate_output(0, output_shape, &output_tensor)); + + dnnLayout_t output_layout = + static_cast(input_shape.GetTfLayout()); + // Execute DNNConversion. + void* input_buffer = + static_cast(const_cast(input_tensor.flat().data())); + delete[] in_sizes; + void* output_buffer = + static_cast(const_cast(output_tensor->flat().data())); + input_shape.GetConvertedFlatData(output_layout, input_buffer, + output_buffer); + VLOG(1) << "MKLToTFConversion complete successfully."; + } + + private: + /// Data format of the operation + string data_format_str; + + /// Data type of the operation + DataType op_data_type; + + /// CPUIDInfo + bool has_avx512f_ = false; +}; + +/////////////////////////////////////////////////////////// +// Register kernel +/////////////////////////////////////////////////////////// + +#define REGISTER_CPU(T) \ + REGISTER_KERNEL_BUILDER(Name("_MklToTf") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklToTfOp); + +TF_CALL_float(REGISTER_CPU); +#undef REGISTER_CPU +} // namespace tensorflow +#endif /* INTEL_MKL */ diff --git a/tensorflow/core/kernels/non_max_suppression_op.cc b/tensorflow/core/kernels/non_max_suppression_op.cc index 64bdef0008f..5d28b87e6bb 100644 --- a/tensorflow/core/kernels/non_max_suppression_op.cc +++ b/tensorflow/core/kernels/non_max_suppression_op.cc @@ -92,13 +92,11 @@ static inline bool IOUGreaterThanThreshold( return iou > iou_threshold; } -void DoNonMaxSuppressionOp(OpKernelContext* context, - const Tensor& boxes, - const Tensor& scores, - const Tensor& max_output_size, +void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& boxes, + const Tensor& scores, const Tensor& max_output_size, const float iou_threshold) { OP_REQUIRES(context, iou_threshold >= 0 && iou_threshold <= 1, - errors::InvalidArgument("iou_threshold must be in [0, 1]")); + errors::InvalidArgument("iou_threshold must be in [0, 1]")); int num_boxes = 0; ParseAndCheckBoxSizes(context, boxes, scores, &num_boxes); @@ -106,10 +104,8 @@ void DoNonMaxSuppressionOp(OpKernelContext* context, return; } - const int output_size = - std::min(max_output_size.scalar()(), num_boxes); - typename TTypes::ConstTensor boxes_data = - boxes.tensor(); + const int output_size = std::min(max_output_size.scalar()(), num_boxes); + typename TTypes::ConstTensor boxes_data = boxes.tensor(); std::vector scores_data(num_boxes); std::copy_n(scores.flat().data(), num_boxes, scores_data.begin()); @@ -181,8 +177,7 @@ template class NonMaxSuppressionV2Op : public OpKernel { public: explicit NonMaxSuppressionV2Op(OpKernelConstruction* context) - : OpKernel(context) { - } + : OpKernel(context) {} void Compute(OpKernelContext* context) override { // boxes: [num_boxes, 4] @@ -197,10 +192,9 @@ class NonMaxSuppressionV2Op : public OpKernel { max_output_size.shape().DebugString())); // iou_threshold: scalar const Tensor& iou_threshold = context->input(3); - OP_REQUIRES( - context, 
TensorShapeUtils::IsScalar(iou_threshold.shape()), - errors::InvalidArgument("iou_threshold must be 0-D, got shape ", - iou_threshold.shape().DebugString())); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()), + errors::InvalidArgument("iou_threshold must be 0-D, got shape ", + iou_threshold.shape().DebugString())); const float iou_threshold_val = iou_threshold.scalar()(); diff --git a/tensorflow/core/kernels/non_max_suppression_op_test.cc b/tensorflow/core/kernels/non_max_suppression_op_test.cc index fdbcf05b89d..67d9217b950 100644 --- a/tensorflow/core/kernels/non_max_suppression_op_test.cc +++ b/tensorflow/core/kernels/non_max_suppression_op_test.cc @@ -43,9 +43,10 @@ class NonMaxSuppressionOpTest : public OpsTestBase { TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClusters) { MakeOp(.5); - AddInputFromArray(TensorShape({6, 4}), - {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, - 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); + AddInputFromArray( + TensorShape({6, 4}), + {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, + 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); AddInputFromArray(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f}); AddInputFromArray(TensorShape({}), {3}); TF_ASSERT_OK(RunOpKernel()); @@ -58,7 +59,7 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClusters) { TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClustersFlippedCoordinates) { MakeOp(.5); AddInputFromArray(TensorShape({6, 4}), - {1, 1, 0, 0, 0, 0.1f, 1, 1.1f, 0, .9f, 1, -0.1f, + {1, 1, 0, 0, 0, 0.1f, 1, 1.1f, 0, .9f, 1, -0.1f, 0, 10, 1, 11, 1, 10.1f, 0, 11.1f, 1, 101, 0, 100}); AddInputFromArray(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f}); AddInputFromArray(TensorShape({}), {3}); @@ -71,9 +72,10 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClustersFlippedCoordinates) { TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostTwoBoxesFromThreeClusters) { MakeOp(.5); - AddInputFromArray(TensorShape({6, 4}), - {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, - 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); + AddInputFromArray( + TensorShape({6, 4}), + {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, + 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); AddInputFromArray(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f}); AddInputFromArray(TensorShape({}), {2}); TF_ASSERT_OK(RunOpKernel()); @@ -85,9 +87,10 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostTwoBoxesFromThreeClusters) { TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostThirtyBoxesFromThreeClusters) { MakeOp(.5); - AddInputFromArray(TensorShape({6, 4}), - {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, - 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); + AddInputFromArray( + TensorShape({6, 4}), + {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, + 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); AddInputFromArray(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f}); AddInputFromArray(TensorShape({}), {30}); TF_ASSERT_OK(RunOpKernel()); @@ -134,9 +137,10 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromTenIdenticalBoxes) { TEST_F(NonMaxSuppressionOpTest, TestInconsistentBoxAndScoreShapes) { MakeOp(.5); - AddInputFromArray(TensorShape({6, 4}), - {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, - 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); + AddInputFromArray( + TensorShape({6, 4}), + {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, + 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); AddInputFromArray(TensorShape({5}), {.9f, .75f, .6f, .95f, .5f}); 
AddInputFromArray(TensorShape({}), {30}); Status s = RunOpKernel(); diff --git a/tensorflow/core/kernels/nth_element_op.cc b/tensorflow/core/kernels/nth_element_op.cc index da825e408c2..7f12eb953a3 100644 --- a/tensorflow/core/kernels/nth_element_op.cc +++ b/tensorflow/core/kernels/nth_element_op.cc @@ -16,15 +16,15 @@ limitations under the License. // See docs in ../ops/nn_ops.cc. #include "tensorflow/core/kernels/nth_element_op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/util/work_sharder.h" -#include #include #include +#include +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/work_sharder.h" namespace tensorflow { @@ -54,8 +54,9 @@ class NthElementOp : public OpKernel { errors::InvalidArgument("Input must be >= 1-D, got shape ", input_in.shape().DebugString())); // The last dimension of input tensor must be greater than N. - OP_REQUIRES(context, input_in.dim_size(num_dims-1) > n, - errors::InvalidArgument("Input must have at least n+1 columns")); + OP_REQUIRES( + context, input_in.dim_size(num_dims - 1) > n, + errors::InvalidArgument("Input must have at least n+1 columns")); // std::nth_element only support the nth-smallest selection. if (reverse_) { @@ -64,7 +65,7 @@ class NthElementOp : public OpKernel { // Assume input_shape is [d1,d2,...dk], and output_shape is [d1,d2...dk-1]. TensorShape out_shape; - for (int i = 0; i < num_dims-1; ++i) { + for (int i = 0; i < num_dims - 1; ++i) { out_shape.AddDim(input_in.dim_size(i)); } Tensor* output_tensor = nullptr; @@ -83,32 +84,28 @@ namespace functor { template struct NthElementFunctor { - void operator() (OpKernelContext* context, - const Tensor& input_tensor, - Tensor& output_tensor, - int n, - bool reverse) { + void operator()(OpKernelContext* context, const Tensor& input_tensor, + Tensor& output_tensor, int n, bool reverse) { const T* input = input_tensor.flat().data(); T* output = output_tensor.flat().data(); // Assume input_shape is [d1,d2,...dk], and output_shape is [d1,d2...dk-1], // then num_rows = d1*d2...dk-1, last_dim = dk. const int num_rows = output_tensor.NumElements(); - const int last_dim = input_tensor.dim_size(input_tensor.dims()-1); + const int last_dim = input_tensor.dim_size(input_tensor.dims() - 1); // Allocate each row to different shard. - auto SubNthElement = [&, input, output, last_dim, n](int start, - int limit) { + auto SubNthElement = [&, input, output, last_dim, n](int start, int limit) { // std::nth_element would rearrange the array, so we need a new buffer. std::vector buf(last_dim); for (int b = start; b < limit; ++b) { // Copy from one row of elements to buffer const T* input_start = input + b * last_dim; - const T* input_end = input + (b+1) * last_dim; + const T* input_end = input + (b + 1) * last_dim; std::copy(input_start, input_end, buf.begin()); - std::nth_element(buf.begin(), buf.begin()+n, buf.end()); + std::nth_element(buf.begin(), buf.begin() + n, buf.end()); // The element placed in the nth position is exactly the element that // would occur in this position if the range was fully sorted. 
output[b] = buf[n]; @@ -116,9 +113,9 @@ struct NthElementFunctor { }; auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); - // The average time complexity of partition-based nth_element (BFPRT) is O(n), - // althought the worst time complexity could be O(n^2). - // Here, 20 is a empirical factor of cost_per_unit. + // The average time complexity of partition-based nth_element (BFPRT) is + // O(n), althought the worst time complexity could be O(n^2). Here, 20 is a + // empirical factor of cost_per_unit. Shard(worker_threads.num_threads, worker_threads.workers, num_rows, 20 * last_dim, SubNthElement); } @@ -126,7 +123,6 @@ struct NthElementFunctor { } // namespace functor - #define REGISTER_NTHOP(T) \ REGISTER_KERNEL_BUILDER( \ Name("NthElement").Device(DEVICE_CPU).TypeConstraint("T"), \ @@ -136,4 +132,3 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_NTHOP); #undef REGISTER_NTHOP } // end namespace tensorflow - diff --git a/tensorflow/core/kernels/nth_element_op.h b/tensorflow/core/kernels/nth_element_op.h index 11a6c996b09..e7d25daecc7 100644 --- a/tensorflow/core/kernels/nth_element_op.h +++ b/tensorflow/core/kernels/nth_element_op.h @@ -26,10 +26,8 @@ namespace functor { template struct NthElementFunctor { - void operator() (OpKernelContext* context, - const Tensor& input_tensor, - Tensor& output_tensor, - int n); + void operator()(OpKernelContext* context, const Tensor& input_tensor, + Tensor& output_tensor, int n); }; } // namespace functor diff --git a/tensorflow/core/kernels/one_hot_op_gpu.cu.cc b/tensorflow/core/kernels/one_hot_op_gpu.cu.cc index 49fd4bdebad..647515ae38a 100644 --- a/tensorflow/core/kernels/one_hot_op_gpu.cu.cc +++ b/tensorflow/core/kernels/one_hot_op_gpu.cu.cc @@ -19,16 +19,16 @@ limitations under the License. 
#define EIGEN_USE_GPU -#include "tensorflow/core/kernels/one_hot_op.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/one_hot_op.h" namespace tensorflow { typedef Eigen::GpuDevice GPUDevice; -#define DEFINE_GPU_SPEC_INDEX(T, TI) \ - template class generator::OneGenerator; \ +#define DEFINE_GPU_SPEC_INDEX(T, TI) \ + template class generator::OneGenerator; \ template struct functor::OneHot; #define DEFINE_GPU_SPEC(T) \ diff --git a/tensorflow/core/kernels/ops_util_test.cc b/tensorflow/core/kernels/ops_util_test.cc index 9d53882deef..13427d71ff6 100644 --- a/tensorflow/core/kernels/ops_util_test.cc +++ b/tensorflow/core/kernels/ops_util_test.cc @@ -218,7 +218,8 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_1_2) { // in_size = 3, ksize = 3, stride = 2, pad_size = 0 TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_0) { bcast_struct bcast[] = { - {{0, 3, 3, 2, 0}, {0, 3}}, {{1, 3, 3, 2, 0}, {2, 1}}, + {{0, 3, 3, 2, 0}, {0, 3}}, + {{1, 3, 3, 2, 0}, {2, 1}}, }; for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { VerifyBcastValues(bcast[i]); @@ -228,7 +229,8 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_0) { // in_size = 3, ksize = 3, stride = 2, pad_size = 1 TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_1) { bcast_struct bcast[] = { - {{0, 3, 3, 2, 1}, {0, 2}}, {{1, 3, 3, 2, 1}, {1, 2}}, + {{0, 3, 3, 2, 1}, {0, 2}}, + {{1, 3, 3, 2, 1}, {1, 2}}, }; for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { VerifyBcastValues(bcast[i]); @@ -258,7 +260,8 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_0) { // in_size = 3, ksize = 3, stride = 3, pad_size = 1 TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_1) { bcast_struct bcast[] = { - {{0, 3, 3, 3, 1}, {0, 2}}, {{1, 3, 3, 3, 1}, {2, 1}}, + {{0, 3, 3, 3, 1}, {0, 2}}, + {{1, 3, 3, 3, 1}, {2, 1}}, }; for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { VerifyBcastValues(bcast[i]); @@ -348,8 +351,8 @@ TEST_F(OpsUtilTest, Misaligned1DSlice) { TEST_F(OpsUtilTest, Aligned2DSliceOfDim0) { #if EIGEN_MAX_ALIGN_BYTES == 0 - // When EIGEN_MAX_ALIGN_BYTES is 0 and the size of the first dimension is nonzero, - // a multidimensional tensor is always aligned. + // When EIGEN_MAX_ALIGN_BYTES is 0 and the size of the first dimension is + // nonzero, a multidimensional tensor is always aligned. 
Tensor t(DT_FLOAT, TensorShape({3, 4})); int64 start = 1; int64 end = 2; diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc index 2033fbf5dc3..e0ae5de0f45 100644 --- a/tensorflow/core/kernels/pack_op.cc +++ b/tensorflow/core/kernels/pack_op.cc @@ -36,7 +36,7 @@ typedef Eigen::GpuDevice GPUDevice; #endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // -------------------------------------------------------------------------- template @@ -123,7 +123,7 @@ class PackOp : public OpKernel { ConcatSYCL(c->eigen_sycl_device(), inputs_flat, &output_flat); return; } -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL ConcatCPU(c->device(), inputs_flat, &output_flat); } } diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc index b232ba16a76..0ab9ff9f650 100644 --- a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc +++ b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc @@ -95,9 +95,10 @@ struct TruncatedNormalFunctor { int64 sample = b * samples_per_batch; // On GPU, this check will just fill samples with NAN if it fails. - OP_REQUIRES(ctx, stddev > T(0) && minval < maxval && - (Eigen::numext::isfinite(minval) || - Eigen::numext::isfinite(maxval)), + OP_REQUIRES(ctx, + stddev > T(0) && minval < maxval && + (Eigen::numext::isfinite(minval) || + Eigen::numext::isfinite(maxval)), errors::InvalidArgument("Invalid parameters")); int numIterations = 0; @@ -118,8 +119,9 @@ struct TruncatedNormalFunctor { // Determine the method to use. const T sqrtFactor = Eigen::numext::sqrt((normMin * normMin) + T(4)); const T cutoff = - T(2) * Eigen::numext::exp( - T(0.5) + (normMin * (normMin - sqrtFactor)) / T(4)) / + T(2) * + Eigen::numext::exp(T(0.5) + + (normMin * (normMin - sqrtFactor)) / T(4)) / (normMin + sqrtFactor); const T diff = normMax - normMin; if (diff < cutoff) { @@ -309,30 +311,34 @@ class ParameterizedTruncatedNormalOp : public OpKernel { } else { // Parameters must be broadcastable to the shape [num_batches]. 
OP_REQUIRES( - ctx, TensorShapeUtils::IsScalar(means_tensor.shape()) || - means_tensor.dim_size(0) == 1 || - means_tensor.dim_size(0) == num_batches, + ctx, + TensorShapeUtils::IsScalar(means_tensor.shape()) || + means_tensor.dim_size(0) == 1 || + means_tensor.dim_size(0) == num_batches, errors::InvalidArgument( "Input means should have length 1 or shape[0], got shape: ", means_tensor.shape().DebugString())); OP_REQUIRES( - ctx, TensorShapeUtils::IsScalar(stddevs_tensor.shape()) || - stddevs_tensor.dim_size(0) == 1 || - stddevs_tensor.dim_size(0) == num_batches, + ctx, + TensorShapeUtils::IsScalar(stddevs_tensor.shape()) || + stddevs_tensor.dim_size(0) == 1 || + stddevs_tensor.dim_size(0) == num_batches, errors::InvalidArgument( "Input stddevs should have length 1 or shape[0], got shape: ", stddevs_tensor.shape().DebugString())); OP_REQUIRES( - ctx, TensorShapeUtils::IsScalar(minvals_tensor.shape()) || - minvals_tensor.dim_size(0) == 1 || - minvals_tensor.dim_size(0) == num_batches, + ctx, + TensorShapeUtils::IsScalar(minvals_tensor.shape()) || + minvals_tensor.dim_size(0) == 1 || + minvals_tensor.dim_size(0) == num_batches, errors::InvalidArgument( "Input minvals should have length 1 or shape[0], got shape: ", minvals_tensor.shape().DebugString())); OP_REQUIRES( - ctx, TensorShapeUtils::IsScalar(maxvals_tensor.shape()) || - maxvals_tensor.dim_size(0) == 1 || - maxvals_tensor.dim_size(0) == num_batches, + ctx, + TensorShapeUtils::IsScalar(maxvals_tensor.shape()) || + maxvals_tensor.dim_size(0) == 1 || + maxvals_tensor.dim_size(0) == num_batches, errors::InvalidArgument( "Input maxvals should have length 1 or shape[0], got shape: ", maxvals_tensor.shape().DebugString())); diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc index 933de65c15a..ddfeb1bb790 100644 --- a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc +++ b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc @@ -202,12 +202,13 @@ struct TruncatedNormalFunctor { typename TTypes::Flat output) { const auto config = GetCudaLaunchConfig(num_elements, d); - TruncatedNormalKernel< - T><<>>( - gen, output.data(), num_batches, samples_per_batch, num_elements, - means.data(), means.dimension(0) == 1, stddevs.data(), - stddevs.dimension(0) == 1, minvals.data(), minvals.dimension(0) == 1, - maxvals.data(), maxvals.dimension(0) == 1, kMaxIterations); + TruncatedNormalKernel + <<>>( + gen, output.data(), num_batches, samples_per_batch, num_elements, + means.data(), means.dimension(0) == 1, stddevs.data(), + stddevs.dimension(0) == 1, minvals.data(), + minvals.dimension(0) == 1, maxvals.data(), + maxvals.dimension(0) == 1, kMaxIterations); }; }; diff --git a/tensorflow/core/kernels/parse_tensor_op.cc b/tensorflow/core/kernels/parse_tensor_op.cc index 6b599612ad7..dd41744f023 100644 --- a/tensorflow/core/kernels/parse_tensor_op.cc +++ b/tensorflow/core/kernels/parse_tensor_op.cc @@ -22,7 +22,6 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/framework/register_types.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/pooling_ops_3d.cc b/tensorflow/core/kernels/pooling_ops_3d.cc index a406317213f..01bcfede1e8 100644 --- a/tensorflow/core/kernels/pooling_ops_3d.cc +++ b/tensorflow/core/kernels/pooling_ops_3d.cc @@ -258,7 +258,7 @@ struct LaunchMaxPooling3dGradOp { Eigen::array bcast = {1, csize, rsize, psize, 1}; #else Eigen::IndexList, int, int, int, - Eigen::type2index<1> > + Eigen::type2index<1>> bcast; bcast.set(1, csize); bcast.set(2, rsize); @@ -431,7 +431,7 @@ struct LaunchAvgPooling3dGradOp { Eigen::array bcast = {1, csize, rsize, psize, 1}; #else Eigen::IndexList, int, int, int, - Eigen::type2index<1> > + Eigen::type2index<1>> bcast; bcast.set(1, csize); bcast.set(2, rsize); @@ -833,7 +833,7 @@ TF_CALL_float(REGISTER_GPU_KERNELS) TF_CALL_half(REGISTER_GPU_KERNELS) #ifdef TENSORFLOW_USE_SYCL #define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T) -TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS) + TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS) #undef REGISTER_SYCL_KERNELS #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/pooling_ops_3d_sycl.h b/tensorflow/core/kernels/pooling_ops_3d_sycl.h index c1bc5af4986..b4bead2456d 100644 --- a/tensorflow/core/kernels/pooling_ops_3d_sycl.h +++ b/tensorflow/core/kernels/pooling_ops_3d_sycl.h @@ -281,12 +281,11 @@ class MaxPool3DGradSYCL { const T* input_data_n = input_data + n * p_.in_planes_ * p_.in_cols_ * p_.in_rows_ * p_.depth_; - const T* output_data_n = - output_data + - n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_; - const T* input_backprop_n = - input_backprop + - n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_; + const T* output_data_n = output_data + n * p_.out_planes_ * p_.out_cols_ * + p_.out_rows_ * p_.depth_; + const T* input_backprop_n = input_backprop + n * p_.out_planes_ * + p_.out_cols_ * + p_.out_rows_ * p_.depth_; for (int poolp = poolpstart; poolp < poolpend; ++poolp) { int pstart = poolp * p_.stride_planes_ - p_.pad_planes_; const int pend = std::min(pstart + p_.window_planes_, p_.in_planes_); @@ -678,9 +677,9 @@ class AvgPool3DGradSYCL { n /= p_.in_planes_; T gradient = T(0); - const T* input_backprop_n = - input_backprop + - n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_; + const T* input_backprop_n = input_backprop + n * p_.out_planes_ * + p_.out_cols_ * + p_.out_rows_ * p_.depth_; for (int poolp = poolpstart; poolp < poolpend; ++poolp) { int pstart = poolp * p_.stride_planes_ - p_.pad_planes_; const int pend = std::min(pstart + p_.window_planes_, p_.in_planes_); diff --git a/tensorflow/core/kernels/pooling_ops_common.h b/tensorflow/core/kernels/pooling_ops_common.h index e3131b804f2..fc7cb437b8f 100644 --- a/tensorflow/core/kernels/pooling_ops_common.h +++ b/tensorflow/core/kernels/pooling_ops_common.h @@ -195,7 +195,6 @@ class MaxPoolingOp : public OpKernel { // and updates the corresponding column(s) in output_as_matrix with the // max value. auto shard = [¶ms, &in_mat, &out_mat](int64 start, int64 limit) { - const int32 in_rows = params.tensor_in_rows; const int32 in_cols = params.tensor_in_cols; const int32 pad_rows = params.pad_rows; @@ -443,7 +442,6 @@ class MaxPoolingV2Op : public OpKernel { // and updates the corresponding column(s) in output_as_matrix with the // max value. 
auto shard = [¶ms, &in_mat, &out_mat](int64 start, int64 limit) { - const int32 in_rows = params.tensor_in_rows; const int32 in_cols = params.tensor_in_cols; const int32 pad_rows = params.pad_rows; diff --git a/tensorflow/core/kernels/quantization_utils_test.cc b/tensorflow/core/kernels/quantization_utils_test.cc index d148c9f78d6..176720c22cc 100644 --- a/tensorflow/core/kernels/quantization_utils_test.cc +++ b/tensorflow/core/kernels/quantization_utils_test.cc @@ -385,8 +385,12 @@ void TestQuantizedToFloatInPlaceUsingEigen( // These are the float values we're going to test the conversions on. typedef std::pair FPair; for (FPair min_and_max : std::vector{ - FPair(-255.0f, 255.0f), FPair(-1.0f, 1.0f), FPair(-1.0f, 255.0f), - FPair(0.0f, 1e6), FPair(0.0f, 1.0f), FPair(-31.0f, 13.0f), + FPair(-255.0f, 255.0f), + FPair(-1.0f, 1.0f), + FPair(-1.0f, 255.0f), + FPair(0.0f, 1e6), + FPair(0.0f, 1.0f), + FPair(-31.0f, 13.0f), FPair(-5.89505e+08, 5.89505e+08), }) { const float f_min = min_and_max.first; diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op.h b/tensorflow/core/kernels/quantize_and_dequantize_op.h index 1363c7e325b..3b09ea2527d 100644 --- a/tensorflow/core/kernels/quantize_and_dequantize_op.h +++ b/tensorflow/core/kernels/quantize_and_dequantize_op.h @@ -71,7 +71,8 @@ struct QuantizeAndDequantizeOneScaleImpl { out.device(d) = ((input.cwiseMin(max_range).cwiseMax(min_range) - min_range) * scale + - T(0.5)).floor() * + T(0.5)) + .floor() * inverse_scale + min_range; } else { diff --git a/tensorflow/core/kernels/quantize_op_test.cc b/tensorflow/core/kernels/quantize_op_test.cc index d2cc55a94dd..57982bdf76e 100644 --- a/tensorflow/core/kernels/quantize_op_test.cc +++ b/tensorflow/core/kernels/quantize_op_test.cc @@ -250,7 +250,8 @@ TEST_F(QuantizedOpTest, QuantizeV2_32Bit) { Tensor expected(allocator(), DT_QINT32, TensorShape({element_count})); test::FillValues(&expected, { - std::numeric_limits::min(), 0, + std::numeric_limits::min(), + 0, static_cast(1.0f * (1 << 23)), static_cast(1.25f * (1 << 23)), static_cast(1.75f * (1 << 23)), diff --git a/tensorflow/core/kernels/quantized_batch_norm_op.cc b/tensorflow/core/kernels/quantized_batch_norm_op.cc index 18d83b41494..b03da7ad17f 100644 --- a/tensorflow/core/kernels/quantized_batch_norm_op.cc +++ b/tensorflow/core/kernels/quantized_batch_norm_op.cc @@ -16,11 +16,11 @@ limitations under the License. #define EIGEN_USE_THREADS #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "tensorflow/core/kernels/quantization_utils.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/kernels/quantization_utils.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/quantized_concat_op.cc b/tensorflow/core/kernels/quantized_concat_op.cc index d67f1ab3ec2..b03ac8e87da 100644 --- a/tensorflow/core/kernels/quantized_concat_op.cc +++ b/tensorflow/core/kernels/quantized_concat_op.cc @@ -135,8 +135,8 @@ class QuantizedConcatOp : public OpKernel { context, in.dims() == input_dims || (input_is_scalar && in_is_scalar), errors::InvalidArgument( "ConcatOp : Ranks of all input tensors should match: shape[0] = ", - input_shape.DebugString(), " vs. shape[", i, "] = ", - in.shape().DebugString())); + input_shape.DebugString(), " vs. 
shape[", i, + "] = ", in.shape().DebugString())); for (int j = 0; j < input_dims; ++j) { if (j == concat_dim) { continue; @@ -145,8 +145,8 @@ class QuantizedConcatOp : public OpKernel { context, in.dim_size(j) == input_shape.dim_size(j), errors::InvalidArgument( "ConcatOp : Dimensions of inputs should match: shape[0] = ", - input_shape.DebugString(), " vs. shape[", i, "] = ", - in.shape().DebugString())); + input_shape.DebugString(), " vs. shape[", i, + "] = ", in.shape().DebugString())); } if (in.NumElements() > 0) { int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0; diff --git a/tensorflow/core/kernels/quantized_conv_ops.cc b/tensorflow/core/kernels/quantized_conv_ops.cc index 1921b83d12c..5b3570edff5 100644 --- a/tensorflow/core/kernels/quantized_conv_ops.cc +++ b/tensorflow/core/kernels/quantized_conv_ops.cc @@ -278,10 +278,9 @@ class Im2ColConvFunctor { *resource = new Im2ColBufferResource(); return Status::OK(); }; - OP_REQUIRES_OK( - context, - context->resource_manager()->LookupOrCreate( - "Conv2d", "im2col_buffer", &im2col_buffer_resource, creator)); + OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate( + "Conv2d", "im2col_buffer", + &im2col_buffer_resource, creator)); // This means that multiple ops can't be run simultaneously on different // threads, because we have a single shared resource. The platforms this is // aimed at have intra-op parallelism as their focus though, so it shouldn't diff --git a/tensorflow/core/kernels/quantized_instance_norm.cc b/tensorflow/core/kernels/quantized_instance_norm.cc index c29f534f31b..d62094cc9fa 100644 --- a/tensorflow/core/kernels/quantized_instance_norm.cc +++ b/tensorflow/core/kernels/quantized_instance_norm.cc @@ -278,10 +278,10 @@ class QuantizedInstanceNorm : public OpKernel { float input_max = context->input(2).flat()(0); float input_scale = (input_max - input_min) / 255.0f; - OP_REQUIRES( - context, input_min < input_max, - errors::InvalidArgument("input_min must be less than input_max : ", - input_min, " >= ", input_max)); + OP_REQUIRES(context, input_min < input_max, + errors::InvalidArgument( + "input_min must be less than input_max : ", input_min, + " >= ", input_max)); auto input_tensor = input.tensor(); auto N = input_tensor.dimension(0); diff --git a/tensorflow/core/kernels/quantized_matmul_op.cc b/tensorflow/core/kernels/quantized_matmul_op.cc index afb30d5f627..da8c46dc516 100644 --- a/tensorflow/core/kernels/quantized_matmul_op.cc +++ b/tensorflow/core/kernels/quantized_matmul_op.cc @@ -104,9 +104,9 @@ class QuantizedMatMulOp : public OpKernel { OP_REQUIRES(context, a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second), - errors::InvalidArgument("Matrix size-compatible: In[0]: ", - a.shape().DebugString(), ", In[1]: ", - b.shape().DebugString())); + errors::InvalidArgument( + "Matrix size-compatible: In[0]: ", a.shape().DebugString(), + ", In[1]: ", b.shape().DebugString())); OP_REQUIRES(context, ((shift_c >= 0) && (shift_c <= 31)), errors::InvalidArgument("shift_c must be between 0 and 31, " diff --git a/tensorflow/core/kernels/quantized_matmul_op_test.cc b/tensorflow/core/kernels/quantized_matmul_op_test.cc index 535b5115c34..c9f05dbc10b 100644 --- a/tensorflow/core/kernels/quantized_matmul_op_test.cc +++ b/tensorflow/core/kernels/quantized_matmul_op_test.cc @@ -206,17 +206,32 @@ TEST_F(QuantizedMatMulTest, Small_WithParams) { // We have set the transpose_a flag to true, so the matrix is transposed, and // for filling the values the in-memory storage order is effectively // column 
major, rather than the default row-major. - AddInputFromArray(TensorShape({a_rows, a_cols}), - { - 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - }); + AddInputFromArray(TensorShape({a_rows, a_cols}), { + 11, + 10, + 9, + 8, + 7, + 6, + 5, + 4, + 3, + 2, + 1, + 0, + }); // The B matrix is: // | 1 | 4| // | 2 | 5| // | 3 | 6| AddInputFromArray(TensorShape({b_rows, b_cols}), { - 1, 4, 2, 5, 3, 6, + 1, + 4, + 2, + 5, + 3, + 6, }); AddInputFromArray(TensorShape({1}), {-12.0f}); AddInputFromArray(TensorShape({1}), {243.0f}); @@ -238,10 +253,16 @@ TEST_F(QuantizedMatMulTest, Small_WithParams) { // | -50 | -113 | // | -56 | -128 | Tensor expected(allocator(), DT_QINT32, TensorShape({a_cols, b_cols})); - test::FillValues(&expected, - { - -38, -83, -44, -98, -50, -113, -56, -128, - }); + test::FillValues(&expected, { + -38, + -83, + -44, + -98, + -50, + -113, + -56, + -128, + }); test::ExpectTensorEqual(expected, *GetOutput(0)); } diff --git a/tensorflow/core/kernels/quantized_mul_op.cc b/tensorflow/core/kernels/quantized_mul_op.cc index eaa5e667f7d..3c7536e0373 100644 --- a/tensorflow/core/kernels/quantized_mul_op.cc +++ b/tensorflow/core/kernels/quantized_mul_op.cc @@ -298,9 +298,8 @@ class QuantizedMulOp : public OpKernel { return; } Tensor* z; - OP_REQUIRES_OK( - context, - context->allocate_output(0, BCast::ToShape(bcast.output_shape()), &z)); + OP_REQUIRES_OK(context, context->allocate_output( + 0, BCast::ToShape(bcast.output_shape()), &z)); // Make sure that we have valid quantization ranges for the input buffers. // If the difference between the min and max is negative or zero, it makes diff --git a/tensorflow/core/kernels/quantized_mul_op_test.cc b/tensorflow/core/kernels/quantized_mul_op_test.cc index b0550c8260c..a4e407c7a94 100644 --- a/tensorflow/core/kernels/quantized_mul_op_test.cc +++ b/tensorflow/core/kernels/quantized_mul_op_test.cc @@ -188,11 +188,12 @@ void TestManualScalar() { 10.0f, {1}, {10.0f}, -100.0f, 100.0f, {10}, {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f}, 3.0f); - TestMul({1}, {10.0f}, -100.0f, 100.0f, {10}, - {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f}, 0.0f, - 10.0f, {10}, {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, - 90.0f, 100.0f}, - 3.0f); + TestMul( + {1}, {10.0f}, -100.0f, 100.0f, {10}, + {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f}, 0.0f, + 10.0f, {10}, + {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f}, + 3.0f); } void TestScalar() { diff --git a/tensorflow/core/kernels/queue_base.cc b/tensorflow/core/kernels/queue_base.cc index 330d161c32b..de495c19cba 100644 --- a/tensorflow/core/kernels/queue_base.cc +++ b/tensorflow/core/kernels/queue_base.cc @@ -39,8 +39,8 @@ Status HandleSliceToElement(const Tensor& parent, Tensor* element, return errors::Internal( "HandleSliceToElement Cannot copy slice: number of elements does not " "match. 
Shapes are: [element]: ", - element->shape().DebugString(), ", [parent slice]: ", - chip_shape.DebugString()); + element->shape().DebugString(), + ", [parent slice]: ", chip_shape.DebugString()); } auto parent_as_matrix = parent.flat_outer_dims(); element->flat() = parent_as_matrix.chip(index, 0); diff --git a/tensorflow/core/kernels/queue_ops.cc b/tensorflow/core/kernels/queue_ops.cc index 17831b74370..46a02854d73 100644 --- a/tensorflow/core/kernels/queue_ops.cc +++ b/tensorflow/core/kernels/queue_ops.cc @@ -428,13 +428,14 @@ REGISTER_KERNEL_BUILDER(Name("QueueSizeV2").Device(DEVICE_CPU), QueueSizeOp); class QueueIsClosedOp : public QueueOpKernel { public: explicit QueueIsClosedOp(OpKernelConstruction* context) - : QueueOpKernel(context) {} + : QueueOpKernel(context) {} protected: void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue, DoneCallback callback) override { Tensor* Tqueue_is_closed = nullptr; - OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &Tqueue_is_closed)); + OP_REQUIRES_OK(ctx, + ctx->allocate_output(0, TensorShape({}), &Tqueue_is_closed)); Tqueue_is_closed->flat().setConstant(queue->is_closed()); callback(); } @@ -443,8 +444,10 @@ class QueueIsClosedOp : public QueueOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(QueueIsClosedOp); }; -REGISTER_KERNEL_BUILDER(Name("QueueIsClosed").Device(DEVICE_CPU), QueueIsClosedOp); -REGISTER_KERNEL_BUILDER(Name("QueueIsClosedV2").Device(DEVICE_CPU), QueueIsClosedOp); +REGISTER_KERNEL_BUILDER(Name("QueueIsClosed").Device(DEVICE_CPU), + QueueIsClosedOp); +REGISTER_KERNEL_BUILDER(Name("QueueIsClosedV2").Device(DEVICE_CPU), + QueueIsClosedOp); class FakeQueueOp : public OpKernel { public: diff --git a/tensorflow/core/kernels/random_crop_op.cc b/tensorflow/core/kernels/random_crop_op.cc index ba94d6be5ca..554909760aa 100644 --- a/tensorflow/core/kernels/random_crop_op.cc +++ b/tensorflow/core/kernels/random_crop_op.cc @@ -68,10 +68,10 @@ class RandomCropOp : public OpKernel { // Edge case. The target dimensions are larger then the image, so // zero-pad the image. This guarantees that the image will *always* // be [target_height, target_width] in size. 
- OP_REQUIRES( - context, width >= target_width, - errors::FailedPrecondition("width must be >= target_width: width = ", - width, ", target_width = ", target_width)); + OP_REQUIRES(context, width >= target_width, + errors::FailedPrecondition( + "width must be >= target_width: width = ", width, + ", target_width = ", target_width)); OP_REQUIRES(context, height >= target_height, errors::FailedPrecondition( "height must be >= target_height: height = ", height, diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc index 55a8b9c9b67..78ff7948fbf 100644 --- a/tensorflow/core/kernels/random_op.cc +++ b/tensorflow/core/kernels/random_op.cc @@ -50,7 +50,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL namespace functor { using random::PhiloxRandom; @@ -271,9 +271,10 @@ class RandomGammaOp : public OpKernel { const Tensor& shape_t = ctx->input(0); const Tensor& alpha_t = ctx->input(1); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(shape_t.shape()) && - (shape_t.dtype() == DataType::DT_INT32 || - shape_t.dtype() == DataType::DT_INT64), + OP_REQUIRES(ctx, + TensorShapeUtils::IsVector(shape_t.shape()) && + (shape_t.dtype() == DataType::DT_INT32 || + shape_t.dtype() == DataType::DT_INT64), errors::InvalidArgument( "shape must be a vector of {int32,int64}, got shape: ", shape_t.DebugString())); @@ -325,7 +326,7 @@ class RandomGammaOp : public OpKernel { // avoid a couple flops which can be done on a per-alpha basis. auto DoWork = [num_samples, num_alphas, &rng, samples_flat, alpha_flat]( - int start_output, int limit_output) { + int start_output, int limit_output) { using Eigen::numext::exp; using Eigen::numext::log; using Eigen::numext::pow; @@ -448,40 +449,40 @@ class RandomGammaOp : public OpKernel { } // namespace -#define REGISTER(TYPE) \ - template struct functor::FillPhiloxRandom< \ - CPUDevice, random::UniformDistribution >; \ - template struct functor::FillPhiloxRandom< \ - CPUDevice, random::NormalDistribution >; \ - template struct functor::FillPhiloxRandom< \ - CPUDevice, \ - random::TruncatedNormalDistribution< \ - random::SingleSampleAdapter, TYPE> >; \ - REGISTER_KERNEL_BUILDER( \ - Name("RandomUniform") \ - .Device(DEVICE_CPU) \ - .HostMemory("shape") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp >); \ - REGISTER_KERNEL_BUILDER( \ - Name("RandomStandardNormal") \ - .Device(DEVICE_CPU) \ - .HostMemory("shape") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp >); \ - REGISTER_KERNEL_BUILDER( \ - Name("TruncatedNormal") \ - .Device(DEVICE_CPU) \ - .HostMemory("shape") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp< \ - CPUDevice, \ - random::TruncatedNormalDistribution< \ - random::SingleSampleAdapter, TYPE> >); \ - REGISTER_KERNEL_BUILDER( \ - Name("RandomGamma").Device(DEVICE_CPU).TypeConstraint("T"), \ +#define REGISTER(TYPE) \ + template struct functor::FillPhiloxRandom< \ + CPUDevice, random::UniformDistribution>; \ + template struct functor::FillPhiloxRandom< \ + CPUDevice, random::NormalDistribution>; \ + template struct functor::FillPhiloxRandom< \ + CPUDevice, \ + random::TruncatedNormalDistribution< \ + random::SingleSampleAdapter, TYPE>>; \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomUniform") \ + .Device(DEVICE_CPU) \ + .HostMemory("shape") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp>); \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomStandardNormal") \ + .Device(DEVICE_CPU) \ + 
.HostMemory("shape") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp>); \ + REGISTER_KERNEL_BUILDER( \ + Name("TruncatedNormal") \ + .Device(DEVICE_CPU) \ + .HostMemory("shape") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp< \ + CPUDevice, \ + random::TruncatedNormalDistribution< \ + random::SingleSampleAdapter, TYPE>>); \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomGamma").Device(DEVICE_CPU).TypeConstraint("T"), \ RandomGammaOp) #define REGISTER_INT(IntType) \ @@ -504,33 +505,33 @@ TF_CALL_int64(REGISTER_INT); #if GOOGLE_CUDA -#define REGISTER(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("RandomUniform") \ - .Device(DEVICE_GPU) \ - .HostMemory("shape") \ - .TypeConstraint("T") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp >); \ - REGISTER_KERNEL_BUILDER( \ - Name("RandomStandardNormal") \ - .Device(DEVICE_GPU) \ - .HostMemory("shape") \ - .TypeConstraint("T") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp >); \ - REGISTER_KERNEL_BUILDER( \ - Name("TruncatedNormal") \ - .Device(DEVICE_GPU) \ - .HostMemory("shape") \ - .TypeConstraint("T") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp< \ - GPUDevice, \ - random::TruncatedNormalDistribution< \ - random::SingleSampleAdapter, TYPE> >); +#define REGISTER(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomUniform") \ + .Device(DEVICE_GPU) \ + .HostMemory("shape") \ + .TypeConstraint("T") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp>); \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomStandardNormal") \ + .Device(DEVICE_GPU) \ + .HostMemory("shape") \ + .TypeConstraint("T") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp>); \ + REGISTER_KERNEL_BUILDER( \ + Name("TruncatedNormal") \ + .Device(DEVICE_GPU) \ + .HostMemory("shape") \ + .TypeConstraint("T") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp< \ + GPUDevice, \ + random::TruncatedNormalDistribution< \ + random::SingleSampleAdapter, TYPE>>); #define REGISTER_INT(IntType) \ REGISTER_KERNEL_BUILDER(Name("RandomUniformInt") \ @@ -565,13 +566,12 @@ struct FillPhiloxRandomKernel; template struct FillPhiloxRandomKernel { typedef typename Distribution::ResultElementType T; - using write_accessor = sycl::accessor; + using write_accessor = sycl::accessor; - FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen, Distribution& dist) - : data_(data), - gen_(gen), - dist_(dist) { - } + FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen, + Distribution& dist) + : data_(data), gen_(gen), dist_(dist) {} void operator()(sycl::nd_item<1> item) { const size_t kGroupSize = Distribution::kResultElementCount; @@ -597,7 +597,7 @@ struct FillPhiloxRandomKernel { const typename Distribution::ResultType samples = dist_(&gen_); for (size_t i = 0; i < kGroupSize; ++i) { if (offset >= size) { - return; + return; } data[offset] = samples[i]; ++offset; @@ -610,17 +610,15 @@ struct FillPhiloxRandomKernel { Distribution dist_; }; - template struct FillPhiloxRandomKernel { typedef typename Distribution::ResultElementType T; - using write_accessor = sycl::accessor; + using write_accessor = sycl::accessor; - FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen, Distribution& dist) - : data_(data), - gen_(gen), - dist_(dist) { - } + FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen, + Distribution& dist) + : data_(data), gen_(gen), dist_(dist) {} void operator()(sycl::nd_item<1> item) { using random::PhiloxRandom; @@ -628,9 +626,9 @@ struct FillPhiloxRandomKernel { const size_t kReservedSamplesPerOutput = 256; const size_t kGroupSize = 
Distribution::kResultElementCount; - const size_t kGeneratorSkipPerOutputGroup = kGroupSize * - kReservedSamplesPerOutput / - PhiloxRandom::kResultElementCount; + const size_t kGeneratorSkipPerOutputGroup = + kGroupSize * kReservedSamplesPerOutput / + PhiloxRandom::kResultElementCount; const size_t item_id = item.get_global(0); const size_t total_item_count = item.get_global_range(); @@ -674,10 +672,9 @@ class FillRandomKernel; // It splits the work into several tasks and run them in parallel template void FillPhiloxRandom::operator()( - OpKernelContext* context, const SYCLDevice& device, random::PhiloxRandom gen, - typename Distribution::ResultElementType* data, int64 size, - Distribution dist) { - + OpKernelContext* context, const SYCLDevice& device, + random::PhiloxRandom gen, typename Distribution::ResultElementType* data, + int64 size, Distribution dist) { const size_t group_size = device.maxSyclThreadsPerBlock(); const size_t group_count = (size + group_size - 1) / group_size; @@ -686,50 +683,52 @@ void FillPhiloxRandom::operator()( device.sycl_queue().submit([&](sycl::handler& cgh) { auto access = buffer.template get_access(cgh); - FillPhiloxRandomKernel task(access, gen, dist); + FillPhiloxRandomKernel + task(access, gen, dist); cgh.parallel_for>( - sycl::nd_range<1>(sycl::range<1>(group_count * group_size), sycl::range<1>(group_size)), - task - ); + sycl::nd_range<1>(sycl::range<1>(group_count * group_size), + sycl::range<1>(group_size)), + task); }); } -} +} // namespace functor -#define REGISTER(TYPE) \ - template struct functor::FillPhiloxRandom< \ - SYCLDevice, random::UniformDistribution >; \ - REGISTER_KERNEL_BUILDER( \ - Name("RandomUniform") \ - .Device(DEVICE_SYCL) \ - .HostMemory("shape") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp >); \ - REGISTER_KERNEL_BUILDER( \ - Name("RandomStandardNormal") \ - .Device(DEVICE_SYCL) \ - .HostMemory("shape") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp >); \ - REGISTER_KERNEL_BUILDER( \ - Name("TruncatedNormal") \ - .Device(DEVICE_SYCL) \ - .HostMemory("shape") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp< \ - SYCLDevice, \ - random::TruncatedNormalDistribution< \ - random::SingleSampleAdapter, TYPE> >); +#define REGISTER(TYPE) \ + template struct functor::FillPhiloxRandom< \ + SYCLDevice, random::UniformDistribution>; \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomUniform") \ + .Device(DEVICE_SYCL) \ + .HostMemory("shape") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp>); \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomStandardNormal") \ + .Device(DEVICE_SYCL) \ + .HostMemory("shape") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp>); \ + REGISTER_KERNEL_BUILDER( \ + Name("TruncatedNormal") \ + .Device(DEVICE_SYCL) \ + .HostMemory("shape") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp< \ + SYCLDevice, \ + random::TruncatedNormalDistribution< \ + random::SingleSampleAdapter, TYPE>>); -#define REGISTER_INT(IntType) \ - REGISTER_KERNEL_BUILDER(Name("RandomUniformInt") \ - .Device(DEVICE_SYCL) \ - .HostMemory("shape") \ - .HostMemory("minval") \ - .HostMemory("maxval") \ - .TypeConstraint("Tout"), \ +#define REGISTER_INT(IntType) \ + REGISTER_KERNEL_BUILDER(Name("RandomUniformInt") \ + .Device(DEVICE_SYCL) \ + .HostMemory("shape") \ + .HostMemory("minval") \ + .HostMemory("maxval") \ + .TypeConstraint("Tout"), \ RandomUniformIntOp); TF_CALL_float(REGISTER); @@ -740,6 +739,6 @@ TF_CALL_int64(REGISTER_INT); #undef REGISTER #undef REGISTER_INT -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // end namespace 
tensorflow diff --git a/tensorflow/core/kernels/random_op_gpu.cu.cc b/tensorflow/core/kernels/random_op_gpu.cu.cc index 7afa6974c6a..3393b39faf4 100644 --- a/tensorflow/core/kernels/random_op_gpu.cu.cc +++ b/tensorflow/core/kernels/random_op_gpu.cu.cc @@ -222,9 +222,8 @@ void FillPhiloxRandom::operator()( (d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) / block_size; - FillPhiloxRandomKernelLaunch< - Distribution><<>>(gen, data, size, - dist); + FillPhiloxRandomKernelLaunch + <<>>(gen, data, size, dist); }; // Explicit instantiation of the GPU distributions functors diff --git a/tensorflow/core/kernels/random_poisson_op.cc b/tensorflow/core/kernels/random_poisson_op.cc index bf1d83ec751..64fb4a5c228 100644 --- a/tensorflow/core/kernels/random_poisson_op.cc +++ b/tensorflow/core/kernels/random_poisson_op.cc @@ -103,7 +103,7 @@ struct PoissonFunctor { typedef random::UniformDistribution Uniform; auto DoWork = [num_samples, num_rate, &rng, samples_flat, rate_flat]( - int start_output, int limit_output) { + int start_output, int limit_output) { // Capturing "rng" by value would only make a copy for the _shared_ // lambda. Since we want to let each worker have its own copy, we pass // "rng" by reference and explicitly do a copy assignment. diff --git a/tensorflow/core/kernels/random_shuffle_queue_op.cc b/tensorflow/core/kernels/random_shuffle_queue_op.cc index e9695cfde30..87fc9433316 100644 --- a/tensorflow/core/kernels/random_shuffle_queue_op.cc +++ b/tensorflow/core/kernels/random_shuffle_queue_op.cc @@ -334,96 +334,95 @@ void RandomShuffleQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx, // TODO(josh11b): This makes two copies of callback, avoid this if possible. dequeue_attempts_.emplace_back( num_elements, [callback]() { callback(Tuple()); }, ctx, cm, token, - [callback, allow_small_batch, this](Attempt* attempt) - EXCLUSIVE_LOCKS_REQUIRED(mu_) { - int32 queue_size = queues_[0].size(); - if (closed_ && queue_size < attempt->elements_requested) { - // If we don't have enough for a full dequeue, we have - // to reset the attempt tuple. - if (!attempt->tuple.empty()) { - // Restore already-dequeued elements to the queue. - for (int64 i = attempt->tuple[0].dim_size(0) - - attempt->elements_requested - 1; - i >= 0; --i) { - for (int j = 0; j < num_components(); ++j) { - PersistentTensor element; - Status s = GetElementComponentFromBatch( - attempt->tuple, i, j, attempt->context, &element); - if (!s.ok()) { - attempt->context->SetStatus( - errors::DataLoss("Failed to restore element from " - "partially-dequeued batch " - "to RandomShuffleQueue: ", - s.error_message())); - } - queues_[j].push_back(element); - } - } - } - if (allow_small_batch && !queues_[0].empty()) { - // Request all remaining elements in the queue. - queue_size = queues_[0].size(); - attempt->tuple.clear(); - attempt->elements_requested = queue_size; - } else { - if (allow_small_batch) { - // There may be some other attempts containing - // values. If so, we'll yield and wait for them - // to add elements to the queue. 
- if (!enqueue_attempts_.empty()) return kProgress; - } - if (attempt->context->status().ok()) { - attempt->context->SetStatus(errors::OutOfRange( - "RandomShuffleQueue '", name_, "' is closed and has ", - "insufficient elements (requested ", - attempt->elements_requested, ", current size ", - queue_size, ")")); - } - return kComplete; - } - } - - RunResult result = kNoProgress; - if (!closed_) queue_size -= min_after_dequeue_; - for (; queue_size > 0; --queue_size) { - if (attempt->tuple.empty()) { - // Only allocate tuple when we have something to dequeue - // so we don't use excessive memory when there are many - // blocked dequeue attempts waiting. - attempt->tuple.reserve(num_components()); - for (int i = 0; i < num_components(); ++i) { - const TensorShape shape = - ManyOutShape(i, attempt->elements_requested); - Tensor element; + [callback, allow_small_batch, + this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + int32 queue_size = queues_[0].size(); + if (closed_ && queue_size < attempt->elements_requested) { + // If we don't have enough for a full dequeue, we have + // to reset the attempt tuple. + if (!attempt->tuple.empty()) { + // Restore already-dequeued elements to the queue. + for (int64 i = attempt->tuple[0].dim_size(0) - + attempt->elements_requested - 1; + i >= 0; --i) { + for (int j = 0; j < num_components(); ++j) { + PersistentTensor element; + Status s = GetElementComponentFromBatch( + attempt->tuple, i, j, attempt->context, &element); + if (!s.ok()) { attempt->context->SetStatus( - attempt->context->allocate_temp(component_dtypes_[i], - shape, &element)); - if (!attempt->context->status().ok()) return kComplete; - attempt->tuple.emplace_back(element); + errors::DataLoss("Failed to restore element from " + "partially-dequeued batch " + "to RandomShuffleQueue: ", + s.error_message())); } - } - result = kProgress; - Tuple tuple; - DequeueLocked(attempt->context, &tuple); - const int index = attempt->tuple[0].dim_size(0) - - attempt->elements_requested; - for (int i = 0; i < num_components(); ++i) { - attempt->context->SetStatus(batch_util::CopyElementToSlice( - std::move(tuple[i]), &attempt->tuple[i], index)); - if (!attempt->context->status().ok()) return kComplete; - } - tuple.clear(); - --attempt->elements_requested; - if (attempt->elements_requested == 0) { - tuple = attempt->tuple; - attempt->done_callback = [callback, tuple]() { - callback(tuple); - }; - return kComplete; + queues_[j].push_back(element); } } - return result; - }); + } + if (allow_small_batch && !queues_[0].empty()) { + // Request all remaining elements in the queue. + queue_size = queues_[0].size(); + attempt->tuple.clear(); + attempt->elements_requested = queue_size; + } else { + if (allow_small_batch) { + // There may be some other attempts containing + // values. If so, we'll yield and wait for them + // to add elements to the queue. + if (!enqueue_attempts_.empty()) return kProgress; + } + if (attempt->context->status().ok()) { + attempt->context->SetStatus(errors::OutOfRange( + "RandomShuffleQueue '", name_, "' is closed and has ", + "insufficient elements (requested ", + attempt->elements_requested, ", current size ", + queue_size, ")")); + } + return kComplete; + } + } + + RunResult result = kNoProgress; + if (!closed_) queue_size -= min_after_dequeue_; + for (; queue_size > 0; --queue_size) { + if (attempt->tuple.empty()) { + // Only allocate tuple when we have something to dequeue + // so we don't use excessive memory when there are many + // blocked dequeue attempts waiting. 
+ attempt->tuple.reserve(num_components()); + for (int i = 0; i < num_components(); ++i) { + const TensorShape shape = + ManyOutShape(i, attempt->elements_requested); + Tensor element; + attempt->context->SetStatus(attempt->context->allocate_temp( + component_dtypes_[i], shape, &element)); + if (!attempt->context->status().ok()) return kComplete; + attempt->tuple.emplace_back(element); + } + } + result = kProgress; + Tuple tuple; + DequeueLocked(attempt->context, &tuple); + const int index = + attempt->tuple[0].dim_size(0) - attempt->elements_requested; + for (int i = 0; i < num_components(); ++i) { + attempt->context->SetStatus(batch_util::CopyElementToSlice( + std::move(tuple[i]), &attempt->tuple[i], index)); + if (!attempt->context->status().ok()) return kComplete; + } + tuple.clear(); + --attempt->elements_requested; + if (attempt->elements_requested == 0) { + tuple = attempt->tuple; + attempt->done_callback = [callback, tuple]() { + callback(tuple); + }; + return kComplete; + } + } + return result; + }); } } if (!already_cancelled) { diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h index 36ca7f834f7..15ae4c1fc53 100644 --- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h +++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h @@ -312,8 +312,7 @@ __global__ void ColumnReduceKernel( int col = blockIdx.x * 32 + threadIdx.x; value_type sum = initVal; - if (row < num_rows && col < num_cols) - sum = in[row * num_cols + col]; + if (row < num_rows && col < num_cols) sum = in[row * num_cols + col]; // 1D array necessary due to bug in CUDA 9 compiler. // TODO(nluehr) revert to 2D array when compiler is ready. @@ -366,8 +365,7 @@ __global__ void CleanupSegments( const int tid = threadIdx.x + blockIdx.x * blockDim.x; value_type val = initVal; - if (tid < segment_size * num_cols) - val = partial_sums[tid]; + if (tid < segment_size * num_cols) val = partial_sums[tid]; typedef cub::WarpReduce WarpReduce; diff --git a/tensorflow/core/kernels/relu_op.cc b/tensorflow/core/kernels/relu_op.cc index afad288cc00..d52358737fd 100644 --- a/tensorflow/core/kernels/relu_op.cc +++ b/tensorflow/core/kernels/relu_op.cc @@ -31,7 +31,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #define REGISTER_RELU_KERNELS(type) \ REGISTER_KERNEL_BUILDER( \ @@ -113,8 +113,7 @@ namespace functor { \ template <> \ void Selu::operator()( \ - const GPUDevice& d, \ - typename TTypes::ConstTensor features, \ + const GPUDevice& d, typename TTypes::ConstTensor features, \ typename TTypes::Tensor activations); \ extern template struct Selu; \ \ @@ -125,8 +124,6 @@ namespace functor { typename TTypes::Tensor backprops); \ extern template struct SeluGrad; - - TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); } // namespace functor @@ -157,8 +154,6 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); Name("SeluGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ SeluGradOp) - - TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); #undef REGISTER_GPU_KERNELS @@ -192,10 +187,8 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); Name("SeluGrad").Device(DEVICE_SYCL).TypeConstraint("T"), \ SeluGradOp) - - TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS); #undef REGISTER_SYCL_KERNELS -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/relu_op_functor.h 
b/tensorflow/core/kernels/relu_op_functor.h index 24b789c5437..3bc5ba8a50d 100644 --- a/tensorflow/core/kernels/relu_op_functor.h +++ b/tensorflow/core/kernels/relu_op_functor.h @@ -85,10 +85,9 @@ struct Relu6Grad { // make sure not to propagate the associated gradient // value. This allows "features" to be either the input or the output of // the relu6. - backprops.device(d) = - gradients * - ((features > static_cast(0)) * (features < static_cast(6))) - .template cast(); + backprops.device(d) = gradients * ((features > static_cast(0)) * + (features < static_cast(6))) + .template cast(); } }; @@ -161,8 +160,8 @@ struct SeluGrad { const auto scale = static_cast(1.0507009873554804934193349852946); const auto scale_alpha = static_cast(1.7580993408473768599402175208123); backprops.device(d) = - (activations < static_cast(0)).select( - gradients * (activations + scale_alpha), gradients * scale); + (activations < static_cast(0)) + .select(gradients * (activations + scale_alpha), gradients * scale); } }; diff --git a/tensorflow/core/kernels/resize_bicubic_op.cc b/tensorflow/core/kernels/resize_bicubic_op.cc index 1a9cf4c6406..86e61bbcefc 100644 --- a/tensorflow/core/kernels/resize_bicubic_op.cc +++ b/tensorflow/core/kernels/resize_bicubic_op.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" @@ -28,7 +29,6 @@ limitations under the License. #include "tensorflow/core/kernels/image_resizer_state.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { namespace { diff --git a/tensorflow/core/kernels/resize_bicubic_op_test.cc b/tensorflow/core/kernels/resize_bicubic_op_test.cc index 9e10fec4232..25a37d5e1af 100644 --- a/tensorflow/core/kernels/resize_bicubic_op_test.cc +++ b/tensorflow/core/kernels/resize_bicubic_op_test.cc @@ -286,13 +286,14 @@ BM_ResizeBicubicDev(32, 128, 3); BM_ResizeBicubicDev(32, 512, 3); BM_ResizeBicubicDev(32, 1024, 3); -#define BM_ResizeBicubicExpand(BATCH, SIZE, CHANNELS) \ - static void BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS(int iters) { \ - testing::ItemsProcessed(static_cast(iters) * BATCH * SIZE * SIZE * \ - CHANNELS * 8 * 8); \ - test::Benchmark("cpu", ResizeBicubic(BATCH, SIZE, CHANNELS, 8, 8)) \ - .Run(iters); \ - } \ +#define BM_ResizeBicubicExpand(BATCH, SIZE, CHANNELS) \ + static void BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS( \ + int iters) { \ + testing::ItemsProcessed(static_cast(iters) * BATCH * SIZE * SIZE * \ + CHANNELS * 8 * 8); \ + test::Benchmark("cpu", ResizeBicubic(BATCH, SIZE, CHANNELS, 8, 8)) \ + .Run(iters); \ + } \ BENCHMARK(BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS); BM_ResizeBicubicExpand(12, 48, 1); diff --git a/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc b/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc index a7da7a0777d..f82c3fcd9ff 100644 --- a/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc +++ b/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc @@ -164,11 +164,11 @@ struct ResizeBilinear { if (total_count == 0) return; CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); - ResizeBilinearKernel< - T><<>>( - config.virtual_thread_count, images.data(), height_scale, width_scale, - batch, in_height, in_width, channels, 
out_height, out_width, - output.data()); + ResizeBilinearKernel + <<>>( + config.virtual_thread_count, images.data(), height_scale, + width_scale, batch, in_height, in_width, channels, out_height, + out_width, output.data()); } }; @@ -200,11 +200,11 @@ struct ResizeBilinearGrad { // Accumulate. total_count = batch * resized_height * resized_width * channels; config = GetCudaLaunchConfig(total_count, d); - ResizeBilinearGradKernel< - T><<>>( - config.virtual_thread_count, input_grad.data(), height_scale, - width_scale, batch, original_height, original_width, channels, - resized_height, resized_width, output_grad.data()); + ResizeBilinearGradKernel + <<>>( + config.virtual_thread_count, input_grad.data(), height_scale, + width_scale, batch, original_height, original_width, channels, + resized_height, resized_width, output_grad.data()); } }; diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc index 8f82784d936..bb96c42f10c 100644 --- a/tensorflow/core/kernels/reverse_op.cc +++ b/tensorflow/core/kernels/reverse_op.cc @@ -269,10 +269,10 @@ class ReverseV2Op : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, input.shape(), &output)); -// TODO(cwhipkey): we can do dimension folding to reduce, e.g., a reverse of -// a single dimension to the dims=3 or dims=2 case, regardless of the number -// of dimensions in the tensor. This would let some ops use faster -// lower-dimension code (and use optimized versions). + // TODO(cwhipkey): we can do dimension folding to reduce, e.g., a reverse + // of a single dimension to the dims=3 or dims=2 case, regardless of the + // number of dimensions in the tensor. This would let some ops use faster + // lower-dimension code (and use optimized versions). #define HANDLE_REVERSE(NDIMS) \ case NDIMS: \ diff --git a/tensorflow/core/kernels/reverse_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_op_gpu.cu.cc index b05a7c55504..3ee49db669f 100644 --- a/tensorflow/core/kernels/reverse_op_gpu.cu.cc +++ b/tensorflow/core/kernels/reverse_op_gpu.cu.cc @@ -28,14 +28,14 @@ typedef Eigen::GpuDevice GPUDevice; #define DEFINE_REVERSE(T, DIM) \ template struct functor::Reverse; #define DEFINE_REVERSE_ALL_DIMS(T) \ - DEFINE_REVERSE(T, 0) \ - DEFINE_REVERSE(T, 1) \ - DEFINE_REVERSE(T, 2) \ - DEFINE_REVERSE(T, 3) \ - DEFINE_REVERSE(T, 4) \ - DEFINE_REVERSE(T, 5) \ - DEFINE_REVERSE(T, 6) \ - DEFINE_REVERSE(T, 7) \ + DEFINE_REVERSE(T, 0) \ + DEFINE_REVERSE(T, 1) \ + DEFINE_REVERSE(T, 2) \ + DEFINE_REVERSE(T, 3) \ + DEFINE_REVERSE(T, 4) \ + DEFINE_REVERSE(T, 5) \ + DEFINE_REVERSE(T, 6) \ + DEFINE_REVERSE(T, 7) \ DEFINE_REVERSE(T, 8) TF_CALL_uint8(DEFINE_REVERSE_ALL_DIMS); diff --git a/tensorflow/core/kernels/reverse_sequence_op.cc b/tensorflow/core/kernels/reverse_sequence_op.cc index d1980d4b652..15a707a9c66 100644 --- a/tensorflow/core/kernels/reverse_sequence_op.cc +++ b/tensorflow/core/kernels/reverse_sequence_op.cc @@ -51,8 +51,7 @@ void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) { // Copy seq_len info down for validity checks context->eigen_device().memcpyDeviceToHost( - seq_lens_vec.data(), seq_lens_t.data(), - sizeof(Tlen) * seq_lens_t.size()); + seq_lens_vec.data(), seq_lens_t.data(), sizeof(Tlen) * seq_lens_t.size()); OP_REQUIRES(context, batch_dim != seq_dim, errors::InvalidArgument("batch_dim == seq_dim == ", seq_dim)); @@ -76,8 +75,7 @@ void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) { } } -void CheckErrorsGPU(OpKernelContext* context, int batch_dim, - int seq_dim) { 
+void CheckErrorsGPU(OpKernelContext* context, int batch_dim, int seq_dim) { const Tensor& input = context->input(0); const Tensor& seq_lens = context->input(1); @@ -98,13 +96,13 @@ void CheckErrorsGPU(OpKernelContext* context, int batch_dim, template <> void CheckErrors(OpKernelContext* context, int batch_dim, - int seq_dim) { + int seq_dim) { CheckErrorsGPU(context, batch_dim, seq_dim); } template <> void CheckErrors(OpKernelContext* context, int batch_dim, - int seq_dim) { + int seq_dim) { CheckErrorsGPU(context, batch_dim, seq_dim); } @@ -164,14 +162,15 @@ class ReverseSequenceOp : public OpKernel { TF_DISALLOW_COPY_AND_ASSIGN(ReverseSequenceOp); }; -#define REGISTER_REVERSE_SEQUENCE(type, len_type) \ - REGISTER_KERNEL_BUILDER( \ - Name("ReverseSequence").Device(DEVICE_CPU).TypeConstraint("T"). \ - TypeConstraint("Tlen"), \ - ReverseSequenceOp); +#define REGISTER_REVERSE_SEQUENCE(type, len_type) \ + REGISTER_KERNEL_BUILDER(Name("ReverseSequence") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tlen"), \ + ReverseSequenceOp); -#define REGISTER_REVERSE_SEQUENCE_LEN(type) \ - REGISTER_REVERSE_SEQUENCE(type, int32); \ +#define REGISTER_REVERSE_SEQUENCE_LEN(type) \ + REGISTER_REVERSE_SEQUENCE(type, int32); \ REGISTER_REVERSE_SEQUENCE(type, int64); TF_CALL_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE_LEN); @@ -181,23 +180,23 @@ TF_CALL_bool(REGISTER_REVERSE_SEQUENCE_LEN); // Forward declarations of the functor specializations for GPU. namespace functor { -#define DECLARE_GPU_SPEC(T, Tlen, Dims) \ - template <> \ - void ReverseSequence::Compute( \ - const GPUDevice& d, typename TTypes::ConstTensor input, \ - int32 batch_dim, int32 seq_dim, \ - typename TTypes::ConstVec seq_lens, \ - typename TTypes::Tensor output); \ +#define DECLARE_GPU_SPEC(T, Tlen, Dims) \ + template <> \ + void ReverseSequence::Compute( \ + const GPUDevice& d, typename TTypes::ConstTensor input, \ + int32 batch_dim, int32 seq_dim, \ + typename TTypes::ConstVec seq_lens, \ + typename TTypes::Tensor output); \ extern template struct ReverseSequence; -#define DECLARE_GPU_SPEC_LEN(T, Dims) \ - DECLARE_GPU_SPEC(T, int32, Dims); \ +#define DECLARE_GPU_SPEC_LEN(T, Dims) \ + DECLARE_GPU_SPEC(T, int32, Dims); \ DECLARE_GPU_SPEC(T, int64, Dims); -#define DECLARE_GPU_SPECS(T) \ - DECLARE_GPU_SPEC_LEN(T, 2); \ - DECLARE_GPU_SPEC_LEN(T, 3); \ - DECLARE_GPU_SPEC_LEN(T, 4); \ +#define DECLARE_GPU_SPECS(T) \ + DECLARE_GPU_SPEC_LEN(T, 2); \ + DECLARE_GPU_SPEC_LEN(T, 3); \ + DECLARE_GPU_SPEC_LEN(T, 4); \ DECLARE_GPU_SPEC_LEN(T, 5); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); @@ -206,14 +205,15 @@ TF_CALL_bool(DECLARE_GPU_SPECS); } // namespace functor // Registration of the GPU implementations. -#define REGISTER_REVERSE_SEQUENCE_GPU(type, len_type) \ - REGISTER_KERNEL_BUILDER( \ - Name("ReverseSequence").Device(DEVICE_GPU).TypeConstraint("T"). 
\ - TypeConstraint("Tlen"), \ - ReverseSequenceOp); +#define REGISTER_REVERSE_SEQUENCE_GPU(type, len_type) \ + REGISTER_KERNEL_BUILDER(Name("ReverseSequence") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tlen"), \ + ReverseSequenceOp); -#define REGISTER_REVERSE_SEQUENCE_GPU_LEN(type) \ - REGISTER_REVERSE_SEQUENCE_GPU(type, int32); \ +#define REGISTER_REVERSE_SEQUENCE_GPU_LEN(type) \ + REGISTER_REVERSE_SEQUENCE_GPU(type, int32); \ REGISTER_REVERSE_SEQUENCE_GPU(type, int64); TF_CALL_GPU_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE_GPU_LEN); diff --git a/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc index cb49f14525a..4a2136a2cd3 100644 --- a/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc +++ b/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc @@ -28,14 +28,14 @@ typedef Eigen::GpuDevice GPUDevice; template class generator::ReverseGenerator; \ template struct functor::ReverseSequence; -#define DEFINE_GPU_SPEC_LEN(T, dims) \ - DEFINE_GPU_SPEC(T, int32, dims); \ +#define DEFINE_GPU_SPEC_LEN(T, dims) \ + DEFINE_GPU_SPEC(T, int32, dims); \ DEFINE_GPU_SPEC(T, int64, dims); -#define DEFINE_GPU_SPECS(T) \ - DEFINE_GPU_SPEC_LEN(T, 2); \ - DEFINE_GPU_SPEC_LEN(T, 3); \ - DEFINE_GPU_SPEC_LEN(T, 4); \ +#define DEFINE_GPU_SPECS(T) \ + DEFINE_GPU_SPEC_LEN(T, 2); \ + DEFINE_GPU_SPEC_LEN(T, 3); \ + DEFINE_GPU_SPEC_LEN(T, 4); \ DEFINE_GPU_SPEC_LEN(T, 5); TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); diff --git a/tensorflow/core/kernels/save_restore_tensor.cc b/tensorflow/core/kernels/save_restore_tensor.cc index df60eda7597..990bd2bff94 100644 --- a/tensorflow/core/kernels/save_restore_tensor.cc +++ b/tensorflow/core/kernels/save_restore_tensor.cc @@ -106,11 +106,11 @@ void SaveTensors( OP_REQUIRES_OK(context, checkpoint::ParseShapeAndSlice( shape_spec, &shape, &slice, &slice_shape)); OP_REQUIRES(context, slice_shape.IsSameSize(input.shape()), - errors::InvalidArgument("Slice in shape_and_slice " - "specification does not match the " - "shape of the tensor to save: ", - shape_spec, ", tensor: ", - input.shape().DebugString())); + errors::InvalidArgument( + "Slice in shape_and_slice " + "specification does not match the " + "shape of the tensor to save: ", + shape_spec, ", tensor: ", input.shape().DebugString())); } #define WRITER_ADD(T) \ diff --git a/tensorflow/core/kernels/scatter_functor.h b/tensorflow/core/kernels/scatter_functor.h index c6e35fe329e..079f15e1013 100644 --- a/tensorflow/core/kernels/scatter_functor.h +++ b/tensorflow/core/kernels/scatter_functor.h @@ -29,7 +29,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL namespace scatter_op { @@ -117,7 +117,7 @@ struct AssignSYCL { p.device(d) = p / u; } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace internal } // namespace scatter_op @@ -156,7 +156,7 @@ struct ScatterFunctorBase { #ifdef TENSORFLOW_USE_SYCL template -struct ScatterFunctorBase { +struct ScatterFunctorBase { Index operator()(OpKernelContext* c, const SYCLDevice& d, typename TTypes::Matrix params, typename TTypes::ConstMatrix updates, @@ -171,13 +171,13 @@ struct ScatterFunctorBase { const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i)); if (!FastBoundsCheck(index, limit)) return i; // Copy last Ndim-1 dimensions of updates[i] to params[index] - scatter_op::internal::AssignSYCL::Run(d, 
params.template chip<0>(index), - updates.template chip<0>(i)); + scatter_op::internal::AssignSYCL::Run( + d, params.template chip<0>(index), updates.template chip<0>(i)); } return -1; } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template struct ScatterFunctorBase { @@ -217,7 +217,7 @@ struct ScatterFunctorBase { template struct ScatterFunctor - : ScatterFunctorBase{}; + : ScatterFunctorBase {}; #ifdef TENSORFLOW_USE_SYCL template @@ -239,7 +239,7 @@ struct ScatterFunctorSYCL { return -1; } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/scatter_functor_gpu.cu.h b/tensorflow/core/kernels/scatter_functor_gpu.cu.h index e116077d3cf..be18658543e 100644 --- a/tensorflow/core/kernels/scatter_functor_gpu.cu.h +++ b/tensorflow/core/kernels/scatter_functor_gpu.cu.h @@ -30,9 +30,10 @@ namespace tensorflow { typedef Eigen::GpuDevice GPUDevice; template -__global__ void ScatterOpCustomKernel( - T* params, const T* updates, const Index* indices, - Index first_dim_size, Index updates_size, Index indices_size) { +__global__ void ScatterOpCustomKernel(T* params, const T* updates, + const Index* indices, + Index first_dim_size, Index updates_size, + Index indices_size) { Index update_block = updates_size / indices_size; CUDA_1D_KERNEL_LOOP(i, updates_size) { int indices_i = i / update_block; @@ -85,8 +86,8 @@ struct ScatterFunctor { CudaLaunchConfig config = GetCudaLaunchConfig(updates_size, d); ScatterOpCustomKernel <<>>( - params.data(), updates.data(), indices.data(), - first_dim_size, updates_size, indices_size); + params.data(), updates.data(), indices.data(), first_dim_size, + updates_size, indices_size); return -1; } }; diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h index c6c9d4e6588..e82660dcc1d 100644 --- a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h +++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h @@ -40,7 +40,7 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class OpKernelContext; @@ -251,7 +251,7 @@ REGISTER_SCATTER_ND_MATH_SYCL(int32); #undef REGISTER_SCATTER_ND_INDEX_SYCL #undef REGISTER_SCATTER_ND_FULL_SYCL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor diff --git a/tensorflow/core/kernels/scatter_op.cc b/tensorflow/core/kernels/scatter_op.cc index 8607c7f95af..282165349f3 100644 --- a/tensorflow/core/kernels/scatter_op.cc +++ b/tensorflow/core/kernels/scatter_op.cc @@ -25,7 +25,7 @@ limitations under the License. 
#ifdef TENSORFLOW_USE_SYCL #include "tensorflow/core/common_runtime/sycl/sycl_util.h" -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL namespace tensorflow { @@ -33,7 +33,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Check whether updates.shape = indices.shape + params.shape[1:] static bool ValidShapes(const Tensor& params, const Tensor& updates, @@ -102,11 +102,12 @@ class ScatterUpdateOp : public OpKernel { // Check that we have enough index space const int64 N_big = indices.NumElements(); - OP_REQUIRES(c, N_big <= std::numeric_limits::max(), - errors::InvalidArgument( - "indices has too many elements for ", - DataTypeString(DataTypeToEnum::v()), " indexing: ", - N_big, " > ", std::numeric_limits::max())); + OP_REQUIRES( + c, N_big <= std::numeric_limits::max(), + errors::InvalidArgument("indices has too many elements for ", + DataTypeString(DataTypeToEnum::v()), + " indexing: ", N_big, " > ", + std::numeric_limits::max())); const Index N = static_cast(indices.NumElements()); OP_REQUIRES( c, params.dim_size(0) <= std::numeric_limits::max(), @@ -137,7 +138,7 @@ class ScatterUpdateOp : public OpKernel { #ifdef TENSORFLOW_USE_SYCL template -class ScatterUpdateOp : public OpKernel { +class ScatterUpdateOp : public OpKernel { public: explicit ScatterUpdateOp(OpKernelConstruction* c) : OpKernel(c) { OP_REQUIRES_OK(c, c->GetAttr("use_locking", &use_exclusive_lock_)); @@ -165,11 +166,12 @@ class ScatterUpdateOp : public OpKernel { // Check that we have enough index space const int64 N_big = indices.NumElements(); - OP_REQUIRES(c, N_big <= std::numeric_limits::max(), - errors::InvalidArgument( - "indices has too many elements for ", - DataTypeString(DataTypeToEnum::v()), " indexing: ", - N_big, " > ", std::numeric_limits::max())); + OP_REQUIRES( + c, N_big <= std::numeric_limits::max(), + errors::InvalidArgument("indices has too many elements for ", + DataTypeString(DataTypeToEnum::v()), + " indexing: ", N_big, " > ", + std::numeric_limits::max())); const Index N = static_cast(indices.NumElements()); OP_REQUIRES( c, params.dim_size(0) <= std::numeric_limits::max(), @@ -206,7 +208,7 @@ class ScatterUpdateOp : public OpKernel { } } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #define REGISTER_SCATTER_KERNEL_INDEX(type, index_type, dev, name, op) \ REGISTER_KERNEL_BUILDER(Name(name) \ diff --git a/tensorflow/core/kernels/sdca_internal.cc b/tensorflow/core/kernels/sdca_internal.cc index 863c123b43f..066a4b80a2b 100644 --- a/tensorflow/core/kernels/sdca_internal.cc +++ b/tensorflow/core/kernels/sdca_internal.cc @@ -37,9 +37,8 @@ void FeatureWeightsDenseStorage::UpdateDenseDeltaWeights( const size_t num_weight_vectors = normalized_bounded_dual_delta.size(); if (num_weight_vectors == 1) { deltas_.device(device) = - deltas_ + - dense_vector.RowAsMatrix() * - deltas_.constant(normalized_bounded_dual_delta[0]); + deltas_ + dense_vector.RowAsMatrix() * + deltas_.constant(normalized_bounded_dual_delta[0]); } else { // Transform the dual vector into a column matrix. const Eigen::TensorMap> @@ -61,9 +60,8 @@ void FeatureWeightsSparseStorage::UpdateSparseDeltaWeights( const Example::SparseFeatures& sparse_features, const std::vector& normalized_bounded_dual_delta) { for (int64 k = 0; k < sparse_features.indices->size(); ++k) { - const double feature_value = sparse_features.values == nullptr - ? 
1.0 - : (*sparse_features.values)(k); + const double feature_value = + sparse_features.values == nullptr ? 1.0 : (*sparse_features.values)(k); auto it = indices_to_id_.find((*sparse_features.indices)(k)); for (size_t l = 0; l < normalized_bounded_dual_delta.size(); ++l) { deltas_(l, it->second) += @@ -122,23 +120,24 @@ Status ModelWeights::Initialize(OpKernelContext* const context) { } // Reads in the weights, and allocates and initializes the delta weights. - const auto initialize_weights = [&]( - const OpInputList& weight_inputs, OpOutputList* const weight_outputs, - std::vector* const feature_weights) { - for (int i = 0; i < weight_inputs.size(); ++i) { - Tensor* delta_t; - TF_RETURN_IF_ERROR( - weight_outputs->allocate(i, weight_inputs[i].shape(), &delta_t)); - // Convert the input vector to a row matrix in internal representation. - auto deltas = delta_t->shaped({1, delta_t->NumElements()}); - deltas.setZero(); - feature_weights->emplace_back( - FeatureWeightsDenseStorage{weight_inputs[i].shaped( - {1, weight_inputs[i].NumElements()}), - deltas}); - } - return Status::OK(); - }; + const auto initialize_weights = + [&](const OpInputList& weight_inputs, OpOutputList* const weight_outputs, + std::vector* const feature_weights) { + for (int i = 0; i < weight_inputs.size(); ++i) { + Tensor* delta_t; + TF_RETURN_IF_ERROR( + weight_outputs->allocate(i, weight_inputs[i].shape(), &delta_t)); + // Convert the input vector to a row matrix in internal + // representation. + auto deltas = delta_t->shaped({1, delta_t->NumElements()}); + deltas.setZero(); + feature_weights->emplace_back(FeatureWeightsDenseStorage{ + weight_inputs[i].shaped( + {1, weight_inputs[i].NumElements()}), + deltas}); + } + return Status::OK(); + }; return initialize_weights(dense_weights_inputs, &dense_weights_outputs, &dense_weights_); diff --git a/tensorflow/core/kernels/sdca_internal.h b/tensorflow/core/kernels/sdca_internal.h index 9f072700754..45915693ac6 100644 --- a/tensorflow/core/kernels/sdca_internal.h +++ b/tensorflow/core/kernels/sdca_internal.h @@ -149,7 +149,8 @@ class Example { // 1.0f. struct SparseFeatures { std::unique_ptr::UnalignedConstVec> indices; - std::unique_ptr::UnalignedConstVec> values; // nullptr encodes optional. + std::unique_ptr::UnalignedConstVec> + values; // nullptr encodes optional. }; // A dense vector which is a row-slice of the underlying matrix. 
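Aside on the sdca_internal.h hunk just above: a null `values` vector in `SparseFeatures` is the convention for indicator features whose value is implicitly 1.0, which is the same branch taken in `UpdateSparseDeltaWeights` in sdca_internal.cc. A minimal standalone sketch of that convention follows; the struct and function names here are illustrative only and are not part of the TensorFlow sources or this diff.

#include <cstddef>
#include <memory>
#include <vector>

// Illustrative sketch: mirrors the "nullptr encodes optional" convention from
// sdca_internal.h, where an absent values vector means every listed sparse
// feature carries an implicit value of 1.0 (binary/indicator features).
struct SparseFeaturesSketch {
  std::vector<long long> indices;               // ids of the features present
  std::unique_ptr<std::vector<double>> values;  // nullptr => all values are 1.0
};

// Returns the value of the k-th listed feature, defaulting to 1.0 when no
// explicit values were provided (same check as UpdateSparseDeltaWeights).
double FeatureValue(const SparseFeaturesSketch& sf, std::size_t k) {
  return sf.values == nullptr ? 1.0 : (*sf.values)[k];
}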
diff --git a/tensorflow/core/kernels/sdca_ops.cc b/tensorflow/core/kernels/sdca_ops.cc index 0f5c2424b38..dbe0177dda3 100644 --- a/tensorflow/core/kernels/sdca_ops.cc +++ b/tensorflow/core/kernels/sdca_ops.cc @@ -57,11 +57,11 @@ namespace tensorflow { namespace { -using sdca::Regularizations; using sdca::Example; using sdca::Examples; using sdca::ExampleStatistics; using sdca::ModelWeights; +using sdca::Regularizations; struct ComputeOptions { explicit ComputeOptions(OpKernelConstruction* const context) { @@ -76,8 +76,9 @@ struct ComputeOptions { } else if (loss_type == "smooth_hinge_loss") { loss_updater.reset(new SmoothHingeLossUpdater); } else { - OP_REQUIRES(context, false, errors::InvalidArgument( - "Unsupported loss type: ", loss_type)); + OP_REQUIRES( + context, false, + errors::InvalidArgument("Unsupported loss type: ", loss_type)); } OP_REQUIRES_OK(context, context->GetAttr("adaptative", &adaptative)); OP_REQUIRES_OK( @@ -90,9 +91,10 @@ struct ComputeOptions { context, num_sparse_features + num_dense_features > 0, errors::InvalidArgument("Requires at least one feature to train.")); - OP_REQUIRES(context, static_cast(num_sparse_features) + - static_cast(num_dense_features) <= - std::numeric_limits::max(), + OP_REQUIRES(context, + static_cast(num_sparse_features) + + static_cast(num_dense_features) <= + std::numeric_limits::max(), errors::InvalidArgument( strings::Printf("Too many feature groups: %lld > %d", static_cast(num_sparse_features) + diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc index 3ef1cd1e062..27b8081eb88 100644 --- a/tensorflow/core/kernels/segment_reduction_ops.cc +++ b/tensorflow/core/kernels/segment_reduction_ops.cc @@ -115,7 +115,7 @@ class SegmentReductionOp : public OpKernel { Eigen::DSizes dims_to_reduce; dims_to_reduce[0] = 0; #else - Eigen::IndexList> dims_to_reduce; + Eigen::IndexList > dims_to_reduce; #endif Index start = 0, end = 1; @@ -359,7 +359,8 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_SORTED_KERNELS_ALL); namespace functor { // UnsortedSegmentSumFunctor implementation for CPUDevice. -// todo: Remove duplicate code in UnsortedSegmentSumFunctor and UnsortedSegmentMaxFunctor. +// todo: Remove duplicate code in UnsortedSegmentSumFunctor and +// UnsortedSegmentMaxFunctor. 
template struct UnsortedSegmentSumFunctor : UnsortedSegmentBaseFunctor { @@ -461,9 +462,10 @@ class UnsortedSegmentBaseOp : public OpKernel { auto data_ptr = data.template flat().data(); reduction_functor_(context, context->template eigen_device(), - output_rows, segment_ids.shape(), segment_flat, - data.NumElements(), data_ptr, output_flat); + output_rows, segment_ids.shape(), segment_flat, + data.NumElements(), data_ptr, output_flat); } + private: functor::UnsortedSegmentBaseFunctor& reduction_functor_; }; @@ -472,22 +474,20 @@ template class UnsortedSegmentSumOp : public UnsortedSegmentBaseOp { public: explicit UnsortedSegmentSumOp(OpKernelConstruction* context) - : UnsortedSegmentBaseOp( - context, - sum_functor_) {} + : UnsortedSegmentBaseOp(context, sum_functor_) {} + private: - functor::UnsortedSegmentSumFunctor sum_functor_; + functor::UnsortedSegmentSumFunctor sum_functor_; }; template class UnsortedSegmentMaxOp : public UnsortedSegmentBaseOp { public: explicit UnsortedSegmentMaxOp(OpKernelConstruction* context) - : UnsortedSegmentBaseOp( - context, - max_functor_) {} + : UnsortedSegmentBaseOp(context, max_functor_) {} + private: - functor::UnsortedSegmentMaxFunctor max_functor_; + functor::UnsortedSegmentMaxFunctor max_functor_; }; #define REGISTER_REAL_CPU_UNSORTED_KERNELS(type, index_type) \ @@ -663,9 +663,9 @@ class SparseSegmentReductionOpBase : public OpKernel { Reduce(input_flat, indices_vec, start, end - start, out); OP_REQUIRES(context, bad_offset < 0, errors::InvalidArgument( - "Bad: indices[", start + bad_offset, "] == ", - indices_vec(start + bad_offset), " out of range [0, ", - input_flat.dimension(0), ")")); + "Bad: indices[", start + bad_offset, + "] == ", indices_vec(start + bad_offset), + " out of range [0, ", input_flat.dimension(0), ")")); start = end; ++end; diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h index bcdd42c80c1..5c9cfe09065 100644 --- a/tensorflow/core/kernels/segment_reduction_ops.h +++ b/tensorflow/core/kernels/segment_reduction_ops.h @@ -51,13 +51,14 @@ struct SegmentSumFunctor { // BaseFunctor for definition of UnsorteSegmentReductionOp // for usage without templates. template -struct UnsortedSegmentBaseFunctor{ - virtual ~UnsortedSegmentBaseFunctor(){} +struct UnsortedSegmentBaseFunctor { + virtual ~UnsortedSegmentBaseFunctor() {} virtual void operator()(OpKernelContext* ctx, const Device& d, - const Index output_rows, const TensorShape& segment_ids_shape, - typename TTypes::ConstFlat segment_ids, - const Index data_size, const T* data, - typename TTypes::Tensor output){}; + const Index output_rows, + const TensorShape& segment_ids_shape, + typename TTypes::ConstFlat segment_ids, + const Index data_size, const T* data, + typename TTypes::Tensor output){}; }; // Functor for UnsortedSegmentSumOp. @@ -70,7 +71,8 @@ struct UnsortedSegmentBaseFunctor{ // data: input data tensor. 
// output: output reshaped to {output_rows, output.size/output_rows} template -struct UnsortedSegmentSumFunctor: public UnsortedSegmentBaseFunctor { +struct UnsortedSegmentSumFunctor + : public UnsortedSegmentBaseFunctor { void operator()(OpKernelContext* ctx, const Device& d, const Index output_rows, const TensorShape& segment_ids_shape, typename TTypes::ConstFlat segment_ids, @@ -88,7 +90,8 @@ struct UnsortedSegmentSumFunctor: public UnsortedSegmentBaseFunctor -struct UnsortedSegmentMaxFunctor: public UnsortedSegmentBaseFunctor { +struct UnsortedSegmentMaxFunctor + : public UnsortedSegmentBaseFunctor { void operator()(OpKernelContext* ctx, const Device& d, const Index output_rows, const TensorShape& segment_ids_shape, typename TTypes::ConstFlat segment_ids, diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc index 159fada621b..39d520698e1 100644 --- a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc @@ -194,7 +194,8 @@ void SegmentSumFunctor::operator()( // UnsortedSegmentSumFunctor implementation for GPUDevice. template -struct UnsortedSegmentSumFunctor: UnsortedSegmentBaseFunctor { +struct UnsortedSegmentSumFunctor + : UnsortedSegmentBaseFunctor { void operator()(OpKernelContext* ctx, const GPUDevice& d, const Index output_rows, const TensorShape& segment_ids_shape, typename TTypes::ConstFlat segment_ids, @@ -221,11 +222,10 @@ struct UnsortedSegmentSumFunctor: UnsortedSegmentBaseFuncto const Index input_inner_dim_size = input_total_size / input_outer_dim_size; config = GetCudaLaunchConfig(input_total_size, d); - UnsortedSegmentSumCustomKernel< - T, - Index><<>>( - input_outer_dim_size, input_inner_dim_size, output_rows, - segment_ids.data(), data, output.data()); + UnsortedSegmentSumCustomKernel + <<>>( + input_outer_dim_size, input_inner_dim_size, output_rows, + segment_ids.data(), data, output.data()); } }; diff --git a/tensorflow/core/kernels/self_adjoint_eig_op.cc b/tensorflow/core/kernels/self_adjoint_eig_op.cc index 97657807268..bcd88773902 100644 --- a/tensorflow/core/kernels/self_adjoint_eig_op.cc +++ b/tensorflow/core/kernels/self_adjoint_eig_op.cc @@ -25,7 +25,6 @@ limitations under the License. 
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" - namespace tensorflow { template diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc index 206fd40fa68..688e61fcadc 100644 --- a/tensorflow/core/kernels/sendrecv_ops.cc +++ b/tensorflow/core/kernels/sendrecv_ops.cc @@ -114,7 +114,7 @@ REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_GPU), SendOp); REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_SYCL), SendOp); REGISTER_KERNEL_BUILDER( Name("_HostSend").Device(DEVICE_SYCL).HostMemory("tensor"), SendOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("_HostSend").Device(DEVICE_CPU), SendOp); REGISTER_KERNEL_BUILDER( @@ -198,7 +198,7 @@ REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_GPU), RecvOp); #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_SYCL), RecvOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("_HostRecv").Device(DEVICE_CPU), RecvOp); REGISTER_KERNEL_BUILDER( @@ -207,6 +207,6 @@ REGISTER_KERNEL_BUILDER( #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER( Name("_HostRecv").Device(DEVICE_SYCL).HostMemory("tensor"), RecvOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // end namespace tensorflow diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc index e2e3758d87e..9db0bd4d98b 100644 --- a/tensorflow/core/kernels/sequence_ops.cc +++ b/tensorflow/core/kernels/sequence_ops.cc @@ -53,13 +53,13 @@ class RangeOp : public OpKernel { if (delta > 0) { OP_REQUIRES( context, start <= limit, - errors::InvalidArgument("Requires start <= limit when delta > 0: ", - start, "/", limit)); + errors::InvalidArgument( + "Requires start <= limit when delta > 0: ", start, "/", limit)); } else { OP_REQUIRES( context, start >= limit, - errors::InvalidArgument("Requires start >= limit when delta < 0: ", - start, "/", limit)); + errors::InvalidArgument( + "Requires start >= limit when delta < 0: ", start, "/", limit)); } int64 size = (std::is_integral::value ? ((std::abs(limit - start) + std::abs(delta) - 1) / diff --git a/tensorflow/core/kernels/session_ops.cc b/tensorflow/core/kernels/session_ops.cc index 185c5b248fc..f2dd2812b53 100644 --- a/tensorflow/core/kernels/session_ops.cc +++ b/tensorflow/core/kernels/session_ops.cc @@ -144,7 +144,7 @@ REGISTER_GPU_KERNEL(bool); TF_CALL_NUMBER_TYPES(REGISTER_SYCL_KERNEL); REGISTER_SYCL_KERNEL(bool); #undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class DeleteSessionTensorOp : public OpKernel { public: diff --git a/tensorflow/core/kernels/shape_ops.h b/tensorflow/core/kernels/shape_ops.h index 8d9d0ea8461..55be308901b 100644 --- a/tensorflow/core/kernels/shape_ops.h +++ b/tensorflow/core/kernels/shape_ops.h @@ -235,10 +235,10 @@ class SqueezeOp : public OpKernel { if (!wrapped_squeeze_dims.empty()) { if (wrapped_squeeze_dims.count(i) > 0) { OP_REQUIRES(ctx, existing_dim == 1, - errors::InvalidArgument("Tried to explicitly squeeze " - "dimension ", - i, " but dimension was not 1: ", - existing_dim)); + errors::InvalidArgument( + "Tried to explicitly squeeze " + "dimension ", + i, " but dimension was not 1: ", existing_dim)); } else { // This dimension is not being squeezed. 
new_shape.push_back(existing_dim); diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc index 82595de7794..79369fd4a9c 100644 --- a/tensorflow/core/kernels/slice_op.cc +++ b/tensorflow/core/kernels/slice_op.cc @@ -58,7 +58,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Shared code that is not dependent on the type of T. We do this to reduce // code size by not duplicating all this for all T (float, double, int32, etc.) @@ -72,10 +72,11 @@ static void SharedValidation(OpKernelContext* context, const Tensor& size_tensor = context->input(2); OP_REQUIRES( - context, context->op_kernel().IsLegacyVector(begin_tensor.shape()) && - context->op_kernel().IsLegacyVector(size_tensor.shape()) && - begin_tensor.NumElements() == input.dims() && - size_tensor.NumElements() == input.dims(), + context, + context->op_kernel().IsLegacyVector(begin_tensor.shape()) && + context->op_kernel().IsLegacyVector(size_tensor.shape()) && + begin_tensor.NumElements() == input.dims() && + size_tensor.NumElements() == input.dims(), errors::InvalidArgument( "Expected begin and size arguments to be 1-D tensors of size ", input.dims(), ", but got shapes ", begin_tensor.shape().DebugString(), @@ -125,8 +126,7 @@ static void SharedSliceCommonCases(OpKernelContext* context, TensorShape* output_shape, gtl::InlinedVector* begin, gtl::InlinedVector* size, - Tensor** result, - bool* done) { + Tensor** result, bool* done) { bool is_identity = true; bool slice_dim0 = true; *done = false; @@ -142,8 +142,8 @@ static void SharedSliceCommonCases(OpKernelContext* context, return; } - if (slice_dim0 && IsDim0SliceAligned(input.shape(), (*begin)[0], - (*size)[0])) { + if (slice_dim0 && + IsDim0SliceAligned(input.shape(), (*begin)[0], (*size)[0])) { VLOG(1) << "Slice dim 0: " << input.shape().DebugString(); CHECK_GE(input.dims(), 1); // Otherwise, is_identity should be true. context->set_output(0, input.Slice((*begin)[0], (*begin)[0] + (*size)[0])); @@ -154,7 +154,6 @@ static void SharedSliceCommonCases(OpKernelContext* context, OP_REQUIRES_OK(context, context->allocate_output(0, *output_shape, result)); } - template class SliceOp : public OpKernel { public: @@ -206,8 +205,9 @@ class SliceOp : public OpKernel { #undef HANDLE_DIM - OP_REQUIRES(context, false, errors::Unimplemented( - "SliceOp : Unhandled input dimensions")); + OP_REQUIRES( + context, false, + errors::Unimplemented("SliceOp : Unhandled input dimensions")); } } @@ -280,8 +280,9 @@ class MklSliceOp : public OpKernel { #undef HANDLE_DIM - OP_REQUIRES(context, false, errors::Unimplemented( - "SliceOp : Unhandled input dimensions")); + OP_REQUIRES( + context, false, + errors::Unimplemented("SliceOp : Unhandled input dimensions")); } } @@ -292,9 +293,9 @@ class MklSliceOp : public OpKernel { // as the sizes of all the dimensions of the input except slice_dim, then // returns True. Otherwise, returns False. 
bool DoesSliceShapeDifferInOnly1DHelper(const TensorShape& input_shape, - const gtl::ArraySlice& begin, - const gtl::ArraySlice& size, - int slice_dim) { + const gtl::ArraySlice& begin, + const gtl::ArraySlice& size, + int slice_dim) { for (int dim = 0; dim < 4; dim++) { if (dim != slice_dim && (begin[dim] != 0 || size[dim] != input_shape.dim_size(dim))) { @@ -316,9 +317,9 @@ class MklSliceOp : public OpKernel { // Returns True if Slicing over a single dimension, and sets slice_dim // to the number of the dimension that satisfies criteria. bool DoesSliceShapeDifferInOnly1D(const TensorShape& input_shape, - const gtl::ArraySlice& begin, - const gtl::ArraySlice& size, - int* slice_dim) { + const gtl::ArraySlice& begin, + const gtl::ArraySlice& size, + int* slice_dim) { for (int dim = 0; dim < 4; dim++) { if (DoesSliceShapeDifferInOnly1DHelper(input_shape, begin, size, dim)) { *slice_dim = dim; @@ -329,8 +330,7 @@ class MklSliceOp : public OpKernel { } template - void HandleCase(OpKernelContext* context, - const gtl::ArraySlice& begin, + void HandleCase(OpKernelContext* context, const gtl::ArraySlice& begin, const gtl::ArraySlice& size, Tensor* result) { int slice_dim = -1; TensorShape in_shape = context->input(0).shape(); @@ -340,67 +340,63 @@ class MklSliceOp : public OpKernel { // format over channel dimension. if (NDIM == 4 && DoesSliceShapeDifferInOnly1D(in_shape, begin, size, &slice_dim)) { - size_t in_strides[4] = { (size_t) in_shape.dim_size(1) * - in_shape.dim_size(2) * - in_shape.dim_size(3), - (size_t) in_shape.dim_size(2) * - in_shape.dim_size(3), - (size_t) in_shape.dim_size(3), - (size_t) 1 - }; + size_t in_strides[4] = { + (size_t)in_shape.dim_size(1) * in_shape.dim_size(2) * + in_shape.dim_size(3), + (size_t)in_shape.dim_size(2) * in_shape.dim_size(3), + (size_t)in_shape.dim_size(3), (size_t)1}; - size_t out_strides[4] = { (size_t) size[1] * size[2] * size[3], - (size_t) size[2] * size[3], - (size_t) size[3], - (size_t) 1 }; + size_t out_strides[4] = {(size_t)size[1] * size[2] * size[3], + (size_t)size[2] * size[3], (size_t)size[3], + (size_t)1}; - T *in_buf = const_cast(const_cast( - context->input(0).flat().data())); - T *op_buf = result->flat().data(); + T* in_buf = const_cast( + const_cast(context->input(0).flat().data())); + T* op_buf = result->flat().data(); - if (slice_dim == 1) { - /* data format = NCHW */ + if (slice_dim == 1) { + /* data format = NCHW */ - #pragma omp parallel for - for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) { - T *ip = in_buf + (d0 * in_strides[0]); - T *op = op_buf + ((d0 - begin[0]) * out_strides[0]); - #pragma omp parallel for - for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) { - T *ip1 = ip + (d1 * in_strides[1]); - T *op1 = op + ((d1 - begin[1]) * out_strides[1]); - // For NCHW, H and W will be contiguous. So we can copy - // both with one memcpy. - memcpy(static_cast(op1), static_cast(ip1), - sizeof(T) * in_strides[1]); - } +#pragma omp parallel for + for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) { + T* ip = in_buf + (d0 * in_strides[0]); + T* op = op_buf + ((d0 - begin[0]) * out_strides[0]); +#pragma omp parallel for + for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) { + T* ip1 = ip + (d1 * in_strides[1]); + T* op1 = op + ((d1 - begin[1]) * out_strides[1]); + // For NCHW, H and W will be contiguous. So we can copy + // both with one memcpy. 
+ memcpy(static_cast(op1), static_cast(ip1), + sizeof(T) * in_strides[1]); } - return; - } else if (slice_dim == 3) { - /* data_format = NHWC */ - - #pragma omp parallel for - for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) { - T *ip = in_buf + (d0 * in_strides[0]); - T *op = op_buf + ((d0 - begin[0]) * out_strides[0]); - #pragma omp parallel for - for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) { - T *ip1 = ip + (d1 * in_strides[1]); - T *op1 = op + ((d1 - begin[1]) * out_strides[1]); - #pragma omp parallel for - for (size_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) { - T *ip2 = ip1 + (d2 * in_strides[2]); - T *ip3 = ip2 + begin[3]; - T *op2 = op1 + ((d2 - begin[2]) * out_strides[2]); - T *op3 = op2; - memcpy(static_cast(op3), static_cast(ip3), - sizeof(T) * size[3]); - } - } - } - return; } - // slice_dim is not 1 or 3, then we fallback to Eigen implementation. + return; + } else if (slice_dim == 3) { + /* data_format = NHWC */ + +#pragma omp parallel for + for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) { + T* ip = in_buf + (d0 * in_strides[0]); + T* op = op_buf + ((d0 - begin[0]) * out_strides[0]); +#pragma omp parallel for + for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) { + T* ip1 = ip + (d1 * in_strides[1]); + T* op1 = op + ((d1 - begin[1]) * out_strides[1]); +#pragma omp parallel for + for (size_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) { + T* ip2 = ip1 + (d2 * in_strides[2]); + T* ip3 = ip2 + begin[3]; + T* op2 = op1 + ((d2 - begin[2]) * out_strides[2]); + T* op3 = op2; + memcpy(static_cast(op3), static_cast(ip3), + sizeof(T) * size[3]); + } + } + } + return; + } + // slice_dim is not 1 or 3, then we fallback to Eigen implementation. } Eigen::DSizes indices; @@ -535,13 +531,13 @@ REGISTER_KERNEL_BUILDER(Name("Slice") #ifdef TENSORFLOW_USE_SYCL // Forward declarations of the functor specializations for SYCL. namespace functor { -#define DECLARE_SYCL_SPEC(T, NDIM) \ - template <> \ - void Slice::operator()( \ - const SYCLDevice& d, typename TTypes::Tensor output,\ - typename TTypes::ConstTensor input, \ - const Eigen::DSizes& indices, \ - const Eigen::DSizes& sizes); \ +#define DECLARE_SYCL_SPEC(T, NDIM) \ + template <> \ + void Slice::operator()( \ + const SYCLDevice& d, typename TTypes::Tensor output, \ + typename TTypes::ConstTensor input, \ + const Eigen::DSizes& indices, \ + const Eigen::DSizes& sizes); \ extern template struct Slice; #define DECLARE_FOR_N(T) \ diff --git a/tensorflow/core/kernels/slice_op.h b/tensorflow/core/kernels/slice_op.h index 0362a021336..db7eded745e 100644 --- a/tensorflow/core/kernels/slice_op.h +++ b/tensorflow/core/kernels/slice_op.h @@ -24,7 +24,6 @@ limitations under the License. 
namespace tensorflow { namespace functor { - template struct Slice { void operator()(const Device& d, typename TTypes::Tensor output, diff --git a/tensorflow/core/kernels/slice_op_cpu_impl.h b/tensorflow/core/kernels/slice_op_cpu_impl.h index 47f1d5342a9..64b6948190a 100644 --- a/tensorflow/core/kernels/slice_op_cpu_impl.h +++ b/tensorflow/core/kernels/slice_op_cpu_impl.h @@ -43,7 +43,7 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_SYCL_KERNELS); DEFINE_SYCL_KERNELS(int32); #undef DEFINE_SYCL_KERNELS -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/softmax_op.cc b/tensorflow/core/kernels/softmax_op.cc index 590f01c4691..e1712ac239d 100644 --- a/tensorflow/core/kernels/softmax_op.cc +++ b/tensorflow/core/kernels/softmax_op.cc @@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Partial specialization for a CPUDevice, that uses the Eigen implementation // from SoftmaxEigenImpl. @@ -48,7 +48,7 @@ struct SoftmaxFunctor : SoftmaxFunctorBase {}; #ifdef TENSORFLOW_USE_SYCL template struct SoftmaxFunctor : SoftmaxFunctorBase {}; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor template @@ -100,5 +100,5 @@ REGISTER_KERNEL_BUILDER( REGISTER_KERNEL_BUILDER( Name("Softmax").Device(DEVICE_SYCL).TypeConstraint("T"), SoftmaxOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/spacetobatch_benchmark_test.cc b/tensorflow/core/kernels/spacetobatch_benchmark_test.cc index c25ce2d8bb5..92ddf8edbfb 100644 --- a/tensorflow/core/kernels/spacetobatch_benchmark_test.cc +++ b/tensorflow/core/kernels/spacetobatch_benchmark_test.cc @@ -70,7 +70,7 @@ static Graph* ConstructSpaceToBatchGraph( } \ BENCHMARK( \ BM_##OP##_##DEVICE##_##DTYPE##_##B##_##H##_##W##_##D##_bs##BS##_pad##P00##_##P01##_##P10##_##P11); -#define BM_SpaceToBatch(OP, ...) \ +#define BM_SpaceToBatch(OP, ...) 
\ BM_Expand(BM_SpaceToBatchDev(OP, cpu, DT_FLOAT, __VA_ARGS__)); \ BM_Expand(BM_SpaceToBatchDev(OP, gpu, DT_FLOAT, __VA_ARGS__)); \ BM_Expand(BM_SpaceToBatchDev(OP, cpu, DT_HALF, __VA_ARGS__)); \ diff --git a/tensorflow/core/kernels/spacetobatch_functor.cc b/tensorflow/core/kernels/spacetobatch_functor.cc index 23d8a5f9ed4..4c374b8d994 100644 --- a/tensorflow/core/kernels/spacetobatch_functor.cc +++ b/tensorflow/core/kernels/spacetobatch_functor.cc @@ -154,7 +154,7 @@ struct SpaceToBatchFunctor { #define INSTANTIATE(NUM_BLOCK_DIMS, T) \ template struct SpaceToBatchFunctor; \ template struct SpaceToBatchFunctor; \ -/**/ + /**/ #define INSTANTIATE_FOR_T(T) \ TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS(INSTANTIATE, T) diff --git a/tensorflow/core/kernels/spacetobatch_functor.h b/tensorflow/core/kernels/spacetobatch_functor.h index 06813650c08..f46a84da1e9 100644 --- a/tensorflow/core/kernels/spacetobatch_functor.h +++ b/tensorflow/core/kernels/spacetobatch_functor.h @@ -44,7 +44,7 @@ constexpr int kMaxSpaceToBatchBlockDims = 4; MACRO(2 /**/, ##__VA_ARGS__) \ MACRO(3 /**/, ##__VA_ARGS__) \ MACRO(4 /**/, ##__VA_ARGS__) \ -/**/ + /**/ namespace internal { namespace spacetobatch { diff --git a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc index db8d419c38f..5687141c9ea 100644 --- a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc @@ -141,10 +141,10 @@ struct SpaceToBatchFunctor { } CudaLaunchConfig config = GetCudaLaunchConfig(static_cast(total_count), d); - S2B<<>>( - config.virtual_thread_count, const_cast(space_tensor.data()), args, - const_cast(batch_tensor.data())); + S2B + <<>>( + config.virtual_thread_count, const_cast(space_tensor.data()), + args, const_cast(batch_tensor.data())); return Status::OK(); } }; @@ -153,7 +153,7 @@ struct SpaceToBatchFunctor { #define INSTANTIATE(NUM_BLOCK_DIMS, T) \ template struct SpaceToBatchFunctor; \ template struct SpaceToBatchFunctor; \ -/**/ + /**/ #define INSTANTIATE_FOR_T(T) \ TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS(INSTANTIATE, T) diff --git a/tensorflow/core/kernels/spacetobatch_op.cc b/tensorflow/core/kernels/spacetobatch_op.cc index 95c1f5e7e8c..fdc08ec8e3b 100644 --- a/tensorflow/core/kernels/spacetobatch_op.cc +++ b/tensorflow/core/kernels/spacetobatch_op.cc @@ -58,9 +58,10 @@ void SpaceToBatchOpCompute(OpKernelContext* context, errors::InvalidArgument("input rank should be >= ", 1 + block_dims, " instead of ", orig_input_tensor.dims())); - OP_REQUIRES(context, TensorShapeUtils::IsMatrix(orig_paddings.shape()) && - block_dims == orig_paddings.dim_size(0) && - 2 == orig_paddings.dim_size(1), + OP_REQUIRES(context, + TensorShapeUtils::IsMatrix(orig_paddings.shape()) && + block_dims == orig_paddings.dim_size(0) && + 2 == orig_paddings.dim_size(1), errors::InvalidArgument("paddings should have shape [", block_dims, ", 2] instead of ", orig_paddings.shape().DebugString())); diff --git a/tensorflow/core/kernels/sparse_add_grad_op.cc b/tensorflow/core/kernels/sparse_add_grad_op.cc index d8ed0c6f0c2..8597f3a8f73 100644 --- a/tensorflow/core/kernels/sparse_add_grad_op.cc +++ b/tensorflow/core/kernels/sparse_add_grad_op.cc @@ -35,9 +35,10 @@ class SparseAddGradOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->input("b_indices", &b_indices)); OP_REQUIRES_OK(ctx, ctx->input("sum_indices", &sum_indices)); - OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_indices->shape()) && - TensorShapeUtils::IsMatrix(b_indices->shape()) && - 
TensorShapeUtils::IsMatrix(sum_indices->shape()), + OP_REQUIRES(ctx, + TensorShapeUtils::IsMatrix(a_indices->shape()) && + TensorShapeUtils::IsMatrix(b_indices->shape()) && + TensorShapeUtils::IsMatrix(sum_indices->shape()), errors::InvalidArgument( "Input indices should be matrices but received shapes: ", a_indices->shape().DebugString(), " and ", @@ -49,8 +50,9 @@ class SparseAddGradOp : public OpKernel { "Input backprop_val_grad should be a vector but received shape: ", backprop_val_grad->shape().DebugString())); OP_REQUIRES( - ctx, a_indices->dim_size(1) == b_indices->dim_size(1) && - b_indices->dim_size(1) == sum_indices->dim_size(1), + ctx, + a_indices->dim_size(1) == b_indices->dim_size(1) && + b_indices->dim_size(1) == sum_indices->dim_size(1), errors::InvalidArgument("The densified operands should have the same " "ndims; for A, B, sum got: ", a_indices->dim_size(1), b_indices->dim_size(1), diff --git a/tensorflow/core/kernels/sparse_add_op.cc b/tensorflow/core/kernels/sparse_add_op.cc index bd91dfdce64..d16317af671 100644 --- a/tensorflow/core/kernels/sparse_add_op.cc +++ b/tensorflow/core/kernels/sparse_add_op.cc @@ -34,8 +34,9 @@ class SparseAddOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->input("a_indices", &a_indices)); OP_REQUIRES_OK(ctx, ctx->input("b_indices", &b_indices)); - OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_indices->shape()) && - TensorShapeUtils::IsMatrix(b_indices->shape()), + OP_REQUIRES(ctx, + TensorShapeUtils::IsMatrix(a_indices->shape()) && + TensorShapeUtils::IsMatrix(b_indices->shape()), errors::InvalidArgument( "Input indices should be matrices but received shapes: ", a_indices->shape().DebugString(), " and ", @@ -46,8 +47,9 @@ class SparseAddOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->input("a_values", &a_values_t)); OP_REQUIRES_OK(ctx, ctx->input("b_values", &b_values_t)); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_values_t->shape()) && - TensorShapeUtils::IsVector(b_values_t->shape()), + OP_REQUIRES(ctx, + TensorShapeUtils::IsVector(a_values_t->shape()) && + TensorShapeUtils::IsVector(b_values_t->shape()), errors::InvalidArgument( "Input values should be vectors but received shapes: ", a_values_t->shape().DebugString(), " and ", @@ -62,8 +64,9 @@ class SparseAddOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->input("a_shape", &a_shape)); OP_REQUIRES_OK(ctx, ctx->input("b_shape", &b_shape)); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_shape->shape()) && - TensorShapeUtils::IsVector(b_shape->shape()), + OP_REQUIRES(ctx, + TensorShapeUtils::IsVector(a_shape->shape()) && + TensorShapeUtils::IsVector(b_shape->shape()), errors::InvalidArgument( "Input shapes should be a vector but received shapes ", a_shape->shape().DebugString(), " and ", diff --git a/tensorflow/core/kernels/sparse_add_op_test.cc b/tensorflow/core/kernels/sparse_add_op_test.cc index 4cad02bbee8..1f08e6c5ce2 100644 --- a/tensorflow/core/kernels/sparse_add_op_test.cc +++ b/tensorflow/core/kernels/sparse_add_op_test.cc @@ -61,9 +61,9 @@ TEST_F(SparseAddOpTest, TwoD_AddSparseTensorWithSelf) { // [3 4] const auto indices_shape = TensorShape({4, 2}); - std::initializer_list in{ 0, 1, 1, 0, 2, 0, 2, 1 }; + std::initializer_list in{0, 1, 1, 0, 2, 0, 2, 1}; const gtl::ArraySlice indices(in); - std::initializer_list sh{ 3, 2 }; + std::initializer_list sh{3, 2}; const gtl::ArraySlice shape(sh); #define ADD_TENSOR_INPUT() \ diff --git a/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc b/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc index 
c122616cf15..80bc1f19344 100644 --- a/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc +++ b/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc @@ -103,8 +103,9 @@ class SparseAccumulatorTakeGradientOp DoneCallback callback) override { // Check signature OP_REQUIRES_OK_ASYNC( - ctx, ctx->MatchSignature({DT_STRING_REF, DT_INT32}, - {DT_INT64, accumulator->dtype(), DT_INT64}), + ctx, + ctx->MatchSignature({DT_STRING_REF, DT_INT32}, + {DT_INT64, accumulator->dtype(), DT_INT64}), callback); } diff --git a/tensorflow/core/kernels/sparse_cross_op.cc b/tensorflow/core/kernels/sparse_cross_op.cc index 07d935d55fe..7cd4532ad63 100644 --- a/tensorflow/core/kernels/sparse_cross_op.cc +++ b/tensorflow/core/kernels/sparse_cross_op.cc @@ -288,8 +288,7 @@ struct CrossTraits { template class SparseCrossOp : public OpKernel { public: - explicit SparseCrossOp(OpKernelConstruction* context) - : OpKernel(context) { + explicit SparseCrossOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("num_buckets", &num_buckets_)); // Read signed_hash_key_ as int64 since uint64 attributes are not // supported by REGISTER_OP. @@ -316,8 +315,8 @@ class SparseCrossOp : public OpKernel { GenerateColumnsFromInput(indices_list_in, values_list_in, shapes_list_in, dense_list_in); - typename CrossTraits::Crosser - crosser(columns, num_buckets_, hash_key_); + typename CrossTraits::Crosser crosser( + columns, num_buckets_, hash_key_); Tensor* indices_out; Tensor* values_out; Tensor* shape_out; @@ -326,8 +325,8 @@ class SparseCrossOp : public OpKernel { CreateOutputTensors(columns, batch_size, context, &indices_out, &values_out, &shape_out, &output_start_indices); - typename CrossTraits::Updater - updater(output_start_indices, indices_out, values_out); + typename CrossTraits::Updater updater( + output_start_indices, indices_out, values_out); auto do_work = [this, &columns, crosser, updater](int64 begin, int64 end) { for (int b = begin; b < end; b++) { ProductIterator product_iterator(columns, b); @@ -381,8 +380,9 @@ class SparseCrossOp : public OpKernel { "Input values should be a std::vector but received shape ", values_list_in[i].shape().DebugString(), " at position ", i)); OP_REQUIRES( - context, indices_list_in[i].shape().dim_size(0) == - values_list_in[i].shape().dim_size(0), + context, + indices_list_in[i].shape().dim_size(0) == + values_list_in[i].shape().dim_size(0), errors::InvalidArgument( "Expected size of values to be ", indices_list_in[i].shape().dim_size(0), " got ", diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc index cc0f86ce05e..ac48202ada2 100644 --- a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc +++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc @@ -70,8 +70,9 @@ class SparseDenseBinaryOpShared : public OpKernel { errors::InvalidArgument( "Input sp_indices should be a matrix but received shape: ", indices_t->shape().DebugString())); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(values_t->shape()) && - TensorShapeUtils::IsVector(shape_t->shape()), + OP_REQUIRES(ctx, + TensorShapeUtils::IsVector(values_t->shape()) && + TensorShapeUtils::IsVector(shape_t->shape()), errors::InvalidArgument( "Inputs sp_values and sp_shape should be vectors " "but received shapes: ", @@ -150,8 +151,9 @@ class SparseDenseBinaryOpShared : public OpKernel { CASE(4); CASE(5); default: - OP_REQUIRES(ctx, false, errors::InvalidArgument( - "Only tensors with ranks between 1 
and 5 " + OP_REQUIRES( + ctx, false, + errors::InvalidArgument("Only tensors with ranks between 1 and 5 " "are currently supported. Tensor rank: ", ndims)); #undef CASE diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc index eaf1884243e..fe198af7e6c 100644 --- a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc +++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc @@ -96,9 +96,9 @@ TEST_F(SparseDenseCDivTest, SameShape) { // [2 ] cdiv [dense: same shape, all 1's] // [3 4] const auto indices_shape = TensorShape({4, 2}); - std::initializer_list in{ 0, 1, 1, 0, 2, 0, 2, 1 }; + std::initializer_list in{0, 1, 1, 0, 2, 0, 2, 1}; const gtl::ArraySlice indices(in); - std::initializer_list sh{ 3, 2 }; + std::initializer_list sh{3, 2}; const gtl::ArraySlice shape(sh); // Tensor dense(DT_FLOAT, TensorShape({3, 1})); @@ -125,9 +125,9 @@ TEST_F(SparseDenseCDivTest, BroadcastDenseSameDims) { // [2 ] cdiv [dense: shape [3,1], all 1's] // [3 4] const auto indices_shape = TensorShape({4, 2}); - std::initializer_list in{ 0, 1, 1, 0, 2, 0, 2, 1 }; + std::initializer_list in{0, 1, 1, 0, 2, 0, 2, 1}; const gtl::ArraySlice indices(in); - std::initializer_list sh{ 3, 2 }; + std::initializer_list sh{3, 2}; const gtl::ArraySlice shape(sh); Tensor dense(DT_FLOAT, TensorShape({3, 1})); @@ -152,9 +152,9 @@ TEST_F(SparseDenseCDivTest, BroadcastDenseFewerDims) { // [2 ] cdiv [dense: shape [2]] // [3 4] const auto indices_shape = TensorShape({4, 2}); - std::initializer_list in{ 0, 1, 1, 0, 2, 0, 2, 1 }; + std::initializer_list in{0, 1, 1, 0, 2, 0, 2, 1}; const gtl::ArraySlice indices(in); - std::initializer_list sh{ 3, 2 }; + std::initializer_list sh{3, 2}; const gtl::ArraySlice shape(sh); Tensor dense(DT_FLOAT, TensorShape({2})); @@ -184,9 +184,9 @@ TEST_F(SparseDenseCMulTest, BroadcastDense) { // [1 ?] where ? remains implicitly zero. // [1.5 0] const auto indices_shape = TensorShape({4, 2}); - std::initializer_list in{ 0, 1, 1, 0, 2, 0, 2, 1 }; + std::initializer_list in{0, 1, 1, 0, 2, 0, 2, 1}; const gtl::ArraySlice indices(in); - std::initializer_list sh{ 3, 2 }; + std::initializer_list sh{3, 2}; const gtl::ArraySlice shape(sh); Tensor dense(DT_FLOAT, TensorShape({2})); diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc index 8ab23b64d3d..a1f9667b783 100644 --- a/tensorflow/core/kernels/sparse_matmul_op.cc +++ b/tensorflow/core/kernels/sparse_matmul_op.cc @@ -159,8 +159,8 @@ struct SparseSlice { template template -void SparseSlice::Initialize(const typename SparseSlice::ConstMatrixMap& mat, - int col_offset) { +void SparseSlice::Initialize( + const typename SparseSlice::ConstMatrixMap& mat, int col_offset) { const int mat_rows = Transpose ? mat.dimension(1) : mat.dimension(0); const int mat_cols = Transpose ? mat.dimension(0) : mat.dimension(1); DCHECK_LE(num_rows, mat_rows); @@ -278,9 +278,9 @@ ALWAYS_INLINE float ConvertBfloat16ToFloat(const bfloat16* src) { float out = 0; auto tmp = reinterpret_cast(&out); #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - tmp[0] = *src; + tmp[0] = *src; #else - tmp[1] = *src; + tmp[1] = *src; #endif return out; } @@ -970,9 +970,9 @@ class SparseMatMulOp : public OpKernel { const int k2 = transpose_b_ ? 
b.dim_size(1) : b.dim_size(0); OP_REQUIRES(ctx, k == k2, - errors::InvalidArgument("Matrix size incompatible: a: ", - a.shape().DebugString(), ", b: ", - b.shape().DebugString())); + errors::InvalidArgument( + "Matrix size incompatible: a: ", a.shape().DebugString(), + ", b: ", b.shape().DebugString())); Tensor* output = nullptr; OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({m, n}), &output)); @@ -1224,8 +1224,9 @@ ALWAYS_INLINE void CopyAndMayBeInterleave(void* dst, const void* src, template inline BlockingCounter* SparseMatMul::ShuffleMatrix( - const typename SparseMatMul::ConstMatrixMapR& mat, int slice_row_start, - int slice_num_rows, int slice_col_start, int slice_num_cols, const int N, + const typename SparseMatMul::ConstMatrixMapR& mat, + int slice_row_start, int slice_num_rows, int slice_col_start, + int slice_num_cols, const int N, const DeviceBase::CpuWorkerThreads* thread_pool, MatrixR* buffer) { DCHECK_EQ(N % 2, 0); DCHECK_LE(kNumOperands * sizeof(float) / sizeof(TR), N); @@ -1306,8 +1307,9 @@ inline std::unique_ptr SparseMatMul::CreateDenseSlices( template inline void SparseMatMul::ComputeBlockSizes( const typename SparseMatMul::ConstMatrixMapL& left, - const typename SparseMatMul::ConstMatrixMapR& right, bool transpose_left, - int num_threads, int* KR, int* NR, int* KL, int* JB, int* IB) { + const typename SparseMatMul::ConstMatrixMapR& right, + bool transpose_left, int num_threads, int* KR, int* NR, int* KL, int* JB, + int* IB) { // Heuristics for calculating block sizes // Assume two hyperthreads per core. const int est_num_cores = std::max(1, (num_threads + 1) / 2); diff --git a/tensorflow/core/kernels/sparse_matmul_op.h b/tensorflow/core/kernels/sparse_matmul_op.h index cca52558ae2..14ef2ed7044 100644 --- a/tensorflow/core/kernels/sparse_matmul_op.h +++ b/tensorflow/core/kernels/sparse_matmul_op.h @@ -159,25 +159,25 @@ EIGEN_STRONG_INLINE Packet4f pload2bf16(const float* from) { // Return a packet with the first value of the input Packet replicated template <> EIGEN_STRONG_INLINE Packet4f pbroadcast_first(const Packet4f& a) { - return vec_splat (a, 0); + return vec_splat(a, 0); } // Return a packet with the second value of the input Packet replicated template <> EIGEN_STRONG_INLINE Packet4f pbroadcast_second(const Packet4f& a) { - return vec_splat (a, 1); + return vec_splat(a, 1); } // Return a packet with the third value of the input Packet replicated template <> EIGEN_STRONG_INLINE Packet4f pbroadcast_third(const Packet4f& a) { - return vec_splat (a, 2); + return vec_splat(a, 2); } // Return a packet with the fourth value of the input Packet replicated template <> EIGEN_STRONG_INLINE Packet4f pbroadcast_fourth(const Packet4f& a) { - return vec_splat (a, 3); + return vec_splat(a, 3); } #endif diff --git a/tensorflow/core/kernels/sparse_matmul_op_test.cc b/tensorflow/core/kernels/sparse_matmul_op_test.cc index f815ca9e344..ebc6d8fa4ec 100644 --- a/tensorflow/core/kernels/sparse_matmul_op_test.cc +++ b/tensorflow/core/kernels/sparse_matmul_op_test.cc @@ -284,11 +284,11 @@ class SparseMatmulOpTest : public ::testing::Test { uint16_t* data3_bfloat16_p = reinterpret_cast(data3_bfloat16) + i; #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - data3_p[1] = 0; - data3_bfloat16_p[0] = data3_p[0]; + data3_p[1] = 0; + data3_bfloat16_p[0] = data3_p[0]; #else - data3_p[0] = 0; - data3_bfloat16_p[0] = data3_p[1]; + data3_p[0] = 0; + data3_bfloat16_p[0] = data3_p[1]; #endif } } diff --git a/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc 
b/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc index 110376be425..96246c7a712 100644 --- a/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc +++ b/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc @@ -51,9 +51,9 @@ TEST_F(SparseReduceSumOpTest, SimpleReduce) { // [3 4] const auto indices_shape = TensorShape({4, 2}); - std::initializer_list in{ 0, 1, 1, 0, 2, 0, 2, 1 }; + std::initializer_list in{0, 1, 1, 0, 2, 0, 2, 1}; const gtl::ArraySlice indices(in); - std::initializer_list sh{ 3, 2 }; + std::initializer_list sh{3, 2}; const gtl::ArraySlice shape(sh); AddInputFromArray(indices_shape, indices); @@ -93,9 +93,9 @@ TEST_F(SparseReduceSumSparseOpTest, SimpleReduce) { // [3 4] const auto indices_shape = TensorShape({4, 2}); - std::initializer_list in{ 0, 1, 1, 0, 2, 0, 2, 1 }; + std::initializer_list in{0, 1, 1, 0, 2, 0, 2, 1}; const gtl::ArraySlice indices(in); - std::initializer_list sh{ 3, 2 }; + std::initializer_list sh{3, 2}; const gtl::ArraySlice shape(sh); AddInputFromArray(indices_shape, indices); diff --git a/tensorflow/core/kernels/sparse_softmax_op.cc b/tensorflow/core/kernels/sparse_softmax_op.cc index 327a94b8a12..444a5f657a9 100644 --- a/tensorflow/core/kernels/sparse_softmax_op.cc +++ b/tensorflow/core/kernels/sparse_softmax_op.cc @@ -50,8 +50,9 @@ class SparseSoftmaxOp : public OpKernel { errors::InvalidArgument( "Input sp_indices should be a matrix but received shape: ", indices_t->shape().DebugString())); - OP_REQUIRES(context, TensorShapeUtils::IsVector(values_t->shape()) && - TensorShapeUtils::IsVector(shape_t->shape()), + OP_REQUIRES(context, + TensorShapeUtils::IsVector(values_t->shape()) && + TensorShapeUtils::IsVector(shape_t->shape()), errors::InvalidArgument( "Inputs sp_values and sp_shape should be vectors " "but received shapes: ", diff --git a/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc b/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc index b027adba6b3..09cb2a6a71c 100644 --- a/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc +++ b/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc @@ -132,14 +132,16 @@ class SparseSparseBinaryOpShared : public OpKernel { // Validations. 
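The validation hunks above and below all enforce the same COO layout that these sparse kernels expect: indices is an [nnz, rank] matrix, values is an [nnz] vector, and the shape input is a [rank] vector. A minimal standalone sketch of those invariants, using plain std::vector rather than Tensor/OP_REQUIRES (the names here are illustrative, not TensorFlow API):

#include <cstdint>
#include <stdexcept>
#include <vector>

// Illustrative COO container: indices[i] addresses values[i] inside a dense
// tensor whose extents are given by dense_shape.
struct SparseCoo {
  std::vector<std::vector<int64_t>> indices;  // nnz rows, each of length rank
  std::vector<float> values;                  // nnz entries
  std::vector<int64_t> dense_shape;           // rank extents
};

// The first two checks mirror what the OP_REQUIRES calls around this hunk
// verify (matching nnz counts and matching rank); the bounds check is an
// extra sanity check for the sketch.
void Validate(const SparseCoo& sp) {
  const size_t rank = sp.dense_shape.size();
  if (sp.indices.size() != sp.values.size())
    throw std::invalid_argument("indices and values disagree on nnz");
  for (const auto& idx : sp.indices) {
    if (idx.size() != rank)
      throw std::invalid_argument("index rank does not match dense_shape");
    for (size_t d = 0; d < rank; ++d)
      if (idx[d] < 0 || idx[d] >= sp.dense_shape[d])
        throw std::invalid_argument("index out of bounds");
  }
}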
OP_REQUIRES( - ctx, TensorShapeUtils::IsMatrix(a_indices_t->shape()) && - TensorShapeUtils::IsMatrix(b_indices_t->shape()), + ctx, + TensorShapeUtils::IsMatrix(a_indices_t->shape()) && + TensorShapeUtils::IsMatrix(b_indices_t->shape()), errors::InvalidArgument("Inputs a_indices and b_indices should be " "matrices but received shapes: ", a_indices_t->shape().DebugString(), ", ", b_indices_t->shape().DebugString())); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_values_t->shape()) && - TensorShapeUtils::IsVector(b_values_t->shape()), + OP_REQUIRES(ctx, + TensorShapeUtils::IsVector(a_values_t->shape()) && + TensorShapeUtils::IsVector(b_values_t->shape()), errors::InvalidArgument( "Inputs a_values and b_values should be vectors " "but received shapes: ", @@ -157,8 +159,9 @@ class SparseSparseBinaryOpShared : public OpKernel { " non-empty input values, got ", a_values.size(), " and ", b_values.size())); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_shape_t->shape()) && - TensorShapeUtils::IsVector(b_shape_t->shape()), + OP_REQUIRES(ctx, + TensorShapeUtils::IsVector(a_shape_t->shape()) && + TensorShapeUtils::IsVector(b_shape_t->shape()), errors::InvalidArgument( "Input shapes should be a vector but received shapes ", a_shape_t->shape().DebugString(), " and ", diff --git a/tensorflow/core/kernels/sparse_split_op.cc b/tensorflow/core/kernels/sparse_split_op.cc index 6171b532aa2..67dcf05a6ce 100644 --- a/tensorflow/core/kernels/sparse_split_op.cc +++ b/tensorflow/core/kernels/sparse_split_op.cc @@ -48,18 +48,20 @@ class SparseSplitOp : public OpKernel { "Input shape should be a vector but received shape ", input_shape.shape().DebugString())); - OP_REQUIRES(context, input_shape.dim_size(0) && - split_dim < input_shape.vec().size(), - errors::InvalidArgument( - "Input split_dim should be between 0 and rank (", - input_shape.vec().size(), "), got ", split_dim)); + OP_REQUIRES( + context, + input_shape.dim_size(0) && split_dim < input_shape.vec().size(), + errors::InvalidArgument( + "Input split_dim should be between 0 and rank (", + input_shape.vec().size(), "), got ", split_dim)); - OP_REQUIRES(context, num_split_ >= 1 && - num_split_ <= input_shape.vec()(split_dim), - errors::InvalidArgument("Input num_split should be between 1 " - "and the splitting dimension size (", - input_shape.vec()(split_dim), - "), got ", num_split_)); + OP_REQUIRES( + context, + num_split_ >= 1 && num_split_ <= input_shape.vec()(split_dim), + errors::InvalidArgument("Input num_split should be between 1 " + "and the splitting dimension size (", + input_shape.vec()(split_dim), "), got ", + num_split_)); sparse::SparseTensor sparse_tensor(input_indices, input_values, TensorShape(input_shape.vec())); diff --git a/tensorflow/core/kernels/sparse_to_dense_op.cc b/tensorflow/core/kernels/sparse_to_dense_op.cc index 6a6cc3d8138..ba3da21a433 100644 --- a/tensorflow/core/kernels/sparse_to_dense_op.cc +++ b/tensorflow/core/kernels/sparse_to_dense_op.cc @@ -73,8 +73,9 @@ class SparseToDense : public OpKernel { // sparse_values const Tensor& sparse_values = c->input(2); const int64 num_values = sparse_values.NumElements(); - OP_REQUIRES(c, sparse_values.dims() == 0 || - (sparse_values.dims() == 1 && num_values == num_elems), + OP_REQUIRES(c, + sparse_values.dims() == 0 || + (sparse_values.dims() == 1 && num_values == num_elems), errors::InvalidArgument("sparse_values has incorrect shape ", sparse_values.shape().DebugString(), ", should be [] or [", num_elems, "]")); diff --git a/tensorflow/core/kernels/sparse_to_dense_op_test.cc 
b/tensorflow/core/kernels/sparse_to_dense_op_test.cc index f0d19da8046..d8b0f930824 100644 --- a/tensorflow/core/kernels/sparse_to_dense_op_test.cc +++ b/tensorflow/core/kernels/sparse_to_dense_op_test.cc @@ -38,7 +38,6 @@ namespace { class SparseToDenseTest : public OpsTestBase { protected: - void MakeOp(int dim, DataType index_type, DataType value_type) { TF_ASSERT_OK(NodeDefBuilder("sparsetodense", "SparseToDense") .Input(FakeInput(index_type)) diff --git a/tensorflow/core/kernels/sparse_xent_op.cc b/tensorflow/core/kernels/sparse_xent_op.cc index c35ba42db29..f84ffd53238 100644 --- a/tensorflow/core/kernels/sparse_xent_op.cc +++ b/tensorflow/core/kernels/sparse_xent_op.cc @@ -39,10 +39,10 @@ Status CheckInvalidLabelIndex(const Tensor& labels, int64 max_index) { if (*min_max_dim_value.first < 0 || *min_max_dim_value.second >= max_index) { bad_index = (*min_max_dim_value.first < 0) ? *min_max_dim_value.first : *min_max_dim_value.second; - return errors::InvalidArgument("Received a label value of ", bad_index, - " which is outside the valid range of [0, ", - max_index, "). Label values: ", - labels.SummarizeValue(labels.NumElements())); + return errors::InvalidArgument( + "Received a label value of ", bad_index, + " which is outside the valid range of [0, ", max_index, + "). Label values: ", labels.SummarizeValue(labels.NumElements())); } return Status::OK(); } diff --git a/tensorflow/core/kernels/sparse_xent_op_test.cc b/tensorflow/core/kernels/sparse_xent_op_test.cc index b8ea0d2d7e2..afb0bf76267 100644 --- a/tensorflow/core/kernels/sparse_xent_op_test.cc +++ b/tensorflow/core/kernels/sparse_xent_op_test.cc @@ -41,10 +41,10 @@ static Graph* SparseXent(int batch_size, int num_classes) { return g; } -#define BM_SparseXentDev(BATCH, CLASS, DEVICE) \ - static void BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE(int iters) { \ +#define BM_SparseXentDev(BATCH, CLASS, DEVICE) \ + static void BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE(int iters) { \ testing::ItemsProcessed(static_cast(iters) * BATCH * CLASS); \ - test::Benchmark(#DEVICE, SparseXent(BATCH, CLASS)).Run(iters); \ + test::Benchmark(#DEVICE, SparseXent(BATCH, CLASS)).Run(iters); \ } \ BENCHMARK(BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE); diff --git a/tensorflow/core/kernels/split_lib.h b/tensorflow/core/kernels/split_lib.h index ff92ffeeb38..a08949e626c 100644 --- a/tensorflow/core/kernels/split_lib.h +++ b/tensorflow/core/kernels/split_lib.h @@ -57,7 +57,7 @@ struct Split { const Eigen::DSizes& slice_indices, const Eigen::DSizes& slice_sizes); }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/split_lib_cpu.cc b/tensorflow/core/kernels/split_lib_cpu.cc index 25026208d1e..771c633b156 100644 --- a/tensorflow/core/kernels/split_lib_cpu.cc +++ b/tensorflow/core/kernels/split_lib_cpu.cc @@ -49,13 +49,13 @@ void Split::operator()( typename TTypes::ConstTensor input, const Eigen::DSizes& slice_indices, const Eigen::DSizes& slice_sizes) { - output.device(d) = input.slice(slice_indices, slice_sizes); + output.device(d) = input.slice(slice_indices, slice_sizes); } #define DEFINE_SYCL_KERNELS(T) template struct Split; TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DEFINE_SYCL_KERNELS); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc index 78badde27e5..85f529326db 100644 --- 
a/tensorflow/core/kernels/split_op.cc +++ b/tensorflow/core/kernels/split_op.cc @@ -39,7 +39,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template class SplitOpBase : public OpKernel { @@ -142,8 +142,9 @@ class SplitOpCPU : public SplitOpBase { // Android also uses int32 indexing, so check here also. OP_REQUIRES( - context, FastBoundsCheck(input.NumElements(), - std::numeric_limits::max()), + context, + FastBoundsCheck(input.NumElements(), + std::numeric_limits::max()), errors::InvalidArgument("Split requires input size < ", std::numeric_limits::max())); @@ -245,10 +246,11 @@ class SplitOpGPU : public SplitOpBase { const int32 split_dim = split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig; const int32 num_split = Base::num_outputs(); - OP_REQUIRES(context, FastBoundsCheck(input.NumElements(), - std::numeric_limits::max()), - errors::InvalidArgument("Split on GPU requires input size " - "< max int32")); + OP_REQUIRES( + context, + FastBoundsCheck(input.NumElements(), std::numeric_limits::max()), + errors::InvalidArgument("Split on GPU requires input size " + "< max int32")); int32 prefix_dim_size; int32 split_dim_size; int32 suffix_dim_size; @@ -304,8 +306,9 @@ class SplitOpSYCL : public SplitOpBase { // Android also uses int32 indexing, so check here also. OP_REQUIRES( - context, FastBoundsCheck(input.NumElements(), - std::numeric_limits::max()), + context, + FastBoundsCheck(input.NumElements(), + std::numeric_limits::max()), errors::InvalidArgument("Split requires input size < ", std::numeric_limits::max())); @@ -342,14 +345,14 @@ class SplitOpSYCL : public SplitOpBase { {prefix_dim_size, split_dim_output_size, suffix_dim_size}); functor::Split()(context->eigen_device(), - result_shaped, input_reshaped, - slice_indices, slice_sizes); + result_shaped, input_reshaped, + slice_indices, slice_sizes); } indices[1] += split_dim_output_size; } } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #define REGISTER_SPLIT(type) \ REGISTER_KERNEL_BUILDER(Name("Split") \ @@ -381,11 +384,11 @@ REGISTER_GPU(bfloat16); #endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL(type) \ - REGISTER_KERNEL_BUILDER(Name("Split") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T") \ - .HostMemory("split_dim"), \ +#define REGISTER_SYCL(type) \ + REGISTER_KERNEL_BUILDER(Name("Split") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .HostMemory("split_dim"), \ SplitOpSYCL) TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL); diff --git a/tensorflow/core/kernels/split_v_op.cc b/tensorflow/core/kernels/split_v_op.cc index f1078ac349c..7ff5df47d70 100644 --- a/tensorflow/core/kernels/split_v_op.cc +++ b/tensorflow/core/kernels/split_v_op.cc @@ -197,8 +197,9 @@ class SplitVOpCPU : public SplitVOpBase { // Android also uses int32 indexing, so check here also. OP_REQUIRES( - context, FastBoundsCheck(input.NumElements(), - std::numeric_limits::max()), + context, + FastBoundsCheck(input.NumElements(), + std::numeric_limits::max()), errors::InvalidArgument("Split requires input size < ", std::numeric_limits::max())); @@ -305,10 +306,11 @@ class SplitVOpGPU : public SplitVOpBase { const int32 split_dim_orig = context->input(2).flat()(0); const int32 split_dim = split_dim_orig < 0 ? 
split_dim_orig + input.dims() : split_dim_orig; - OP_REQUIRES(context, FastBoundsCheck(input.NumElements(), - std::numeric_limits::max()), - errors::InvalidArgument("Split on GPU requires input size " - "< max int32")); + OP_REQUIRES( + context, + FastBoundsCheck(input.NumElements(), std::numeric_limits::max()), + errors::InvalidArgument("Split on GPU requires input size " + "< max int32")); int32 prefix_dim_size; int32 split_dim_size; diff --git a/tensorflow/core/kernels/stack_ops.cc b/tensorflow/core/kernels/stack_ops.cc index affe81a5556..65296f61fd1 100644 --- a/tensorflow/core/kernels/stack_ops.cc +++ b/tensorflow/core/kernels/stack_ops.cc @@ -42,7 +42,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class Stack : public ResourceBase { public: @@ -242,7 +242,7 @@ REGISTER_KERNEL_BUILDER(Name("StackV2") .HostMemory("max_size") .HostMemory("handle"), StackOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template class StackPushOp : public AsyncOpKernel { @@ -274,11 +274,11 @@ class StackPushOp : public AsyncOpKernel { static constexpr int kCopyThreshold = 2048; static constexpr double kOccupancy = 0.7; if (swap_memory_ && !alloc_attrs.on_host() && - ( std::is_same::value + (std::is_same::value #ifdef TENSORFLOW_USE_SYCL - || std::is_same::value -#endif // TENSORFLOW_USE_SYCL - ) && + || std::is_same::value +#endif // TENSORFLOW_USE_SYCL + ) && tensor.TotalBytes() > kCopyThreshold && stack->IsUsefulToSwap(tensor)) { DeviceContext* device_ctxt = ctx->op_device_context(); auto device = static_cast(ctx->device()); @@ -391,7 +391,7 @@ REGISTER_SYCL_HOST_KERNEL(int32); REGISTER_SYCL_HOST_KERNEL(bool); #undef REGISTER_SYCL_KERNEL #undef REGISTER_SYCL_HOST_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class StackPopOp : public AsyncOpKernel { public: @@ -498,7 +498,7 @@ REGISTER_SYCL_HOST_KERNEL(bool); #undef REGISTER_SYCL_KERNEL #undef REGISTER_SYCL_HOST_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class StackCloseOp : public OpKernel { public: @@ -526,6 +526,6 @@ REGISTER_KERNEL_BUILDER( REGISTER_KERNEL_BUILDER( Name("StackCloseV2").Device(DEVICE_SYCL).HostMemory("handle"), StackCloseOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc index 0fae46dea61..03fc4467a1d 100644 --- a/tensorflow/core/kernels/stage_op.cc +++ b/tensorflow/core/kernels/stage_op.cc @@ -70,12 +70,11 @@ class Buffer : public ResourceBase { return bytes + current_bytes_ > memory_limit_; } - std::size_t GetTupleBytes(const Tuple & tuple) - { + std::size_t GetTupleBytes(const Tuple& tuple) { return std::accumulate(tuple.begin(), tuple.end(), 0, - [](const std::size_t & lhs, const Tensor & rhs) { - return lhs + rhs.TotalBytes(); - }); + [](const std::size_t& lhs, const Tensor& rhs) { + return lhs + rhs.TotalBytes(); + }); } public: @@ -90,19 +89,22 @@ class Buffer : public ResourceBase { std::size_t tuple_bytes = GetTupleBytes(*tuple); // Sanity check so that we don't block for ever below - if(memory_limit_ > 0 && tuple_bytes > memory_limit_) { - return Status(errors::ResourceExhausted("Attempted to insert " - "tensors with combined size of '", tuple_bytes, "' bytes into " - "Staging Area with a memory limit of '", memory_limit_, "'.")); + if (memory_limit_ > 0 && tuple_bytes > 
memory_limit_) { + return Status( + errors::ResourceExhausted("Attempted to insert " + "tensors with combined size of '", + tuple_bytes, + "' bytes into " + "Staging Area with a memory limit of '", + memory_limit_, "'.")); } - // If buffer capacity is bounded wait until elements have been removed - if(IsBounded()) { + if (IsBounded()) { full_cond_var_.wait(lock, [tuple_bytes, this]() { // If there's a memory limit, check if there's space for insertion - bool memory_limit_valid = memory_limit_ > 0 ? - !WouldExceedMemoryLimit(tuple_bytes) : true; + bool memory_limit_valid = + memory_limit_ > 0 ? !WouldExceedMemoryLimit(tuple_bytes) : true; // If we're configured for capacity check if there's space for insertion bool capacity_valid = capacity_ > 0 ? !IsCapacityFull() : true; @@ -186,8 +188,7 @@ Status GetBuffer(OpKernelContext* ctx, const NodeDef& ndef, Buffer** buf) { ContainerInfo cinfo; // Lambda for creating the Staging Area - auto create_fn = [&ndef](Buffer** ret) -> Status - { + auto create_fn = [&ndef](Buffer** ret) -> Status { int64 capacity; int64 memory_limit; TF_RETURN_IF_ERROR(GetNodeAttr(ndef, "capacity", &capacity)); @@ -196,7 +197,6 @@ Status GetBuffer(OpKernelContext* ctx, const NodeDef& ndef, Buffer** buf) { return Status::OK(); }; - TF_RETURN_IF_ERROR(cinfo.Init(rm, ndef, true /* use name() */)); TF_RETURN_IF_ERROR(rm->LookupOrCreate(cinfo.container(), cinfo.name(), buf, create_fn)); @@ -228,7 +228,7 @@ REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_GPU), StageOp); #endif #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_SYCL), StageOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class UnstageOp : public OpKernel { public: @@ -244,7 +244,8 @@ class UnstageOp : public OpKernel { buf->Get(&tuple); - OP_REQUIRES(ctx, tuple.size() == (size_t)ctx->num_outputs(), + OP_REQUIRES( + ctx, tuple.size() == (size_t)ctx->num_outputs(), errors::InvalidArgument("Mismatch stage/unstage: ", tuple.size(), " vs. ", ctx->num_outputs())); @@ -260,7 +261,7 @@ REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_GPU), UnstageOp); #endif #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_SYCL), UnstageOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class StagePeekOp : public OpKernel { public: @@ -278,7 +279,8 @@ class StagePeekOp : public OpKernel { OP_REQUIRES_OK(ctx, buf->Peek(index, &tuple)); - OP_REQUIRES(ctx, tuple.size() == (size_t)ctx->num_outputs(), + OP_REQUIRES( + ctx, tuple.size() == (size_t)ctx->num_outputs(), errors::InvalidArgument("Mismatch stage/unstage: ", tuple.size(), " vs. ", ctx->num_outputs())); @@ -288,17 +290,15 @@ class StagePeekOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("StagePeek").Device(DEVICE_CPU), - StagePeekOp); +REGISTER_KERNEL_BUILDER(Name("StagePeek").Device(DEVICE_CPU), StagePeekOp); #if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER(Name("StagePeek").HostMemory("index"). 
- Device(DEVICE_GPU), StagePeekOp); +REGISTER_KERNEL_BUILDER( + Name("StagePeek").HostMemory("index").Device(DEVICE_GPU), StagePeekOp); #endif #ifdef TENSORFLOW_USE_SYCL -REGISTER_KERNEL_BUILDER(Name("StagePeek").HostMemory("index") - .Device(DEVICE_SYCL), StagePeekOp); -#endif // TENSORFLOW_USE_SYCL - +REGISTER_KERNEL_BUILDER( + Name("StagePeek").HostMemory("index").Device(DEVICE_SYCL), StagePeekOp); +#endif // TENSORFLOW_USE_SYCL class StageSizeOp : public OpKernel { public: @@ -312,9 +312,8 @@ class StageSizeOp : public OpKernel { core::ScopedUnref scope(buf); // Allocate size output tensor - Tensor * size = nullptr; - OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), - &size)); + Tensor* size = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &size)); // Set it to the actual size size->scalar().setConstant(buf->Size()); @@ -323,13 +322,13 @@ class StageSizeOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("StageSize").Device(DEVICE_CPU), StageSizeOp); #if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size") - .Device(DEVICE_GPU), StageSizeOp); +REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size").Device(DEVICE_GPU), + StageSizeOp); #endif #ifdef TENSORFLOW_USE_SYCL -REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size") - .Device(DEVICE_SYCL), StageSizeOp); -#endif // TENSORFLOW_USE_SYCL +REGISTER_KERNEL_BUILDER( + Name("StageSize").HostMemory("size").Device(DEVICE_SYCL), StageSizeOp); +#endif // TENSORFLOW_USE_SYCL class StageClearOp : public OpKernel { public: @@ -352,7 +351,6 @@ REGISTER_KERNEL_BUILDER(Name("StageClear").Device(DEVICE_GPU), StageClearOp); #endif #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("StageClear").Device(DEVICE_SYCL), StageClearOp); -#endif // TENSORFLOW_USE_SYCL - +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc index 7c213e14d21..8f7f91c9df7 100644 --- a/tensorflow/core/kernels/strided_slice_op.cc +++ b/tensorflow/core/kernels/strided_slice_op.cc @@ -541,5 +541,5 @@ REGISTER_KERNEL_BUILDER(Name("ResourceStridedSliceAssign") .HostMemory("strides"), StridedSliceAssignOp) #undef REGISTER_SYCL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h index a84ba38ef41..ac1259a9ac4 100644 --- a/tensorflow/core/kernels/strided_slice_op_impl.h +++ b/tensorflow/core/kernels/strided_slice_op_impl.h @@ -302,7 +302,7 @@ DECLARE_FOR_N_SYCL(int32); DECLARE_FOR_N_SYCL(int64); #undef DECLARE_FOR_N_SYCL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #undef INSTANTIATE #undef DECLARE_FOR_N_CPU diff --git a/tensorflow/core/kernels/string_join_op.cc b/tensorflow/core/kernels/string_join_op.cc index 721702bec68..28cca9f4484 100644 --- a/tensorflow/core/kernels/string_join_op.cc +++ b/tensorflow/core/kernels/string_join_op.cc @@ -50,9 +50,9 @@ class StringJoinOp : public OpKernel { } else { OP_REQUIRES( context, input_shape == input.shape(), - errors::InvalidArgument("Input shapes do not match: ", - input_shape.DebugString(), " vs. ", - input.shape().DebugString())); + errors::InvalidArgument( + "Input shapes do not match: ", input_shape.DebugString(), + " vs. 
", input.shape().DebugString())); } } } diff --git a/tensorflow/core/kernels/substr_op.cc b/tensorflow/core/kernels/substr_op.cc index 743f1131504..e29f67297f9 100644 --- a/tensorflow/core/kernels/substr_op.cc +++ b/tensorflow/core/kernels/substr_op.cc @@ -95,9 +95,9 @@ class SubstrOp : public OpKernel { // Create BCast helper with shape of input and pos/len BCast bcast(BCast::FromShape(input_shape), BCast::FromShape(pos_shape)); OP_REQUIRES(context, bcast.IsValid(), - errors::InvalidArgument("Incompatible shapes: ", - input_shape.DebugString(), " vs. ", - pos_shape.DebugString())); + errors::InvalidArgument( + "Incompatible shapes: ", input_shape.DebugString(), + " vs. ", pos_shape.DebugString())); TensorShape output_shape = BCast::ToShape(bcast.result_shape()); int ndims = output_shape.dims(); Tensor* output_tensor = nullptr; diff --git a/tensorflow/core/kernels/summary_image_op.cc b/tensorflow/core/kernels/summary_image_op.cc index 233b824bcc3..29b21ee7353 100644 --- a/tensorflow/core/kernels/summary_image_op.cc +++ b/tensorflow/core/kernels/summary_image_op.cc @@ -54,18 +54,20 @@ class SummaryImageOp : public OpKernel { const Tensor& tensor = c->input(1); OP_REQUIRES(c, IsLegacyScalar(tags.shape()), errors::InvalidArgument("Tags must be a scalar")); - OP_REQUIRES(c, tensor.dims() == 4 && - (tensor.dim_size(3) == 1 || tensor.dim_size(3) == 3 || - tensor.dim_size(3) == 4), + OP_REQUIRES(c, + tensor.dims() == 4 && + (tensor.dim_size(3) == 1 || tensor.dim_size(3) == 3 || + tensor.dim_size(3) == 4), errors::InvalidArgument( "Tensor must be 4-D with last dim 1, 3, or 4, not ", tensor.shape().DebugString())); const string& base_tag = tags.scalar()(); - OP_REQUIRES(c, tensor.dim_size(0) < (1LL << 31) && - tensor.dim_size(1) < (1LL << 31) && - tensor.dim_size(2) < (1LL << 31) && - (tensor.dim_size(1) * tensor.dim_size(2)) < (1LL << 29), + OP_REQUIRES(c, + tensor.dim_size(0) < (1LL << 31) && + tensor.dim_size(1) < (1LL << 31) && + tensor.dim_size(2) < (1LL << 31) && + (tensor.dim_size(1) * tensor.dim_size(2)) < (1LL << 29), errors::InvalidArgument("Tensor too large for summary ", tensor.shape().DebugString())); diff --git a/tensorflow/core/kernels/summary_op.cc b/tensorflow/core/kernels/summary_op.cc index b818724ec2e..1f4e3418f48 100644 --- a/tensorflow/core/kernels/summary_op.cc +++ b/tensorflow/core/kernels/summary_op.cc @@ -41,11 +41,12 @@ class SummaryScalarOp : public OpKernel { const Tensor& values = c->input(1); OP_REQUIRES( - c, tags.IsSameSize(values) || - (IsLegacyScalar(tags.shape()) && IsLegacyScalar(values.shape())), - errors::InvalidArgument("tags and values not the same shape: ", - tags.shape().DebugString(), " != ", - values.shape().DebugString(), SingleTag(tags))); + c, + tags.IsSameSize(values) || + (IsLegacyScalar(tags.shape()) && IsLegacyScalar(values.shape())), + errors::InvalidArgument( + "tags and values not the same shape: ", tags.shape().DebugString(), + " != ", values.shape().DebugString(), SingleTag(tags))); auto Ttags = tags.flat(); auto Tvalues = values.flat(); Summary s; diff --git a/tensorflow/core/kernels/tile_functor_cpu.cc b/tensorflow/core/kernels/tile_functor_cpu.cc index b2fd669541d..f8144867014 100644 --- a/tensorflow/core/kernels/tile_functor_cpu.cc +++ b/tensorflow/core/kernels/tile_functor_cpu.cc @@ -15,10 +15,10 @@ limitations under the License. 
#define EIGEN_USE_THREADS -#include "tensorflow/core/kernels/tile_functor.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/tile_functor.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/tile_ops_cpu_impl.h b/tensorflow/core/kernels/tile_ops_cpu_impl.h index 054b31ef9e0..df6a666cd44 100644 --- a/tensorflow/core/kernels/tile_ops_cpu_impl.h +++ b/tensorflow/core/kernels/tile_ops_cpu_impl.h @@ -63,7 +63,7 @@ TF_CALL_int64(DEFINE_TYPE); #undef DEFINE_DIM #undef DEFINE_TYPE -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // end namespace functor } // end namespace tensorflow diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc index 38e77ab60fb..07befa27bc5 100644 --- a/tensorflow/core/kernels/training_ops.cc +++ b/tensorflow/core/kernels/training_ops.cc @@ -3279,7 +3279,6 @@ REGISTER_KERNELS(double, int64); #undef REGISTER_KERNELS - template class ApplyAddSignOp : public OpKernel { public: @@ -3362,17 +3361,15 @@ TF_CALL_double(REGISTER_CPU_KERNELS); #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. namespace functor { -#define DECLARE_GPU_SPEC(T) \ - template <> \ - void ApplyAddSign::operator()( \ - const GPUDevice& d, \ - typename TTypes::Flat var, \ - typename TTypes::Flat m, \ - typename TTypes::ConstScalar lr, \ - typename TTypes::ConstScalar alpha, \ - typename TTypes::ConstScalar sign_decay, \ - typename TTypes::ConstScalar beta, \ - typename TTypes::ConstFlat grad); \ +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void ApplyAddSign::operator()( \ + const GPUDevice& d, typename TTypes::Flat var, \ + typename TTypes::Flat m, typename TTypes::ConstScalar lr, \ + typename TTypes::ConstScalar alpha, \ + typename TTypes::ConstScalar sign_decay, \ + typename TTypes::ConstScalar beta, \ + typename TTypes::ConstFlat grad); \ extern template struct ApplyAddSign; DECLARE_GPU_SPEC(Eigen::half); DECLARE_GPU_SPEC(float); @@ -3387,7 +3384,6 @@ REGISTER_KERNELS(GPU, double); #undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS - template class ApplyPowerSignOp : public OpKernel { public: @@ -3470,17 +3466,15 @@ TF_CALL_double(REGISTER_CPU_KERNELS); #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. namespace functor { -#define DECLARE_GPU_SPEC(T) \ - template <> \ - void ApplyPowerSign::operator()( \ - const GPUDevice& d, \ - typename TTypes::Flat var, \ - typename TTypes::Flat m, \ - typename TTypes::ConstScalar lr, \ - typename TTypes::ConstScalar logbase, \ - typename TTypes::ConstScalar sign_decay, \ - typename TTypes::ConstScalar beta, \ - typename TTypes::ConstFlat grad); \ +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void ApplyPowerSign::operator()( \ + const GPUDevice& d, typename TTypes::Flat var, \ + typename TTypes::Flat m, typename TTypes::ConstScalar lr, \ + typename TTypes::ConstScalar logbase, \ + typename TTypes::ConstScalar sign_decay, \ + typename TTypes::ConstScalar beta, \ + typename TTypes::ConstFlat grad); \ extern template struct ApplyPowerSign; DECLARE_GPU_SPEC(Eigen::half); DECLARE_GPU_SPEC(float); diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc index d443a6b3c1d..0376a3b2c60 100644 --- a/tensorflow/core/kernels/training_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc @@ -17,8 +17,8 @@ limitations under the License. 
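The ApplyAddSign and ApplyPowerSign functors whose GPU declarations are reformatted above (and whose device implementations follow in training_ops_gpu.cu.cc) apply sign-based update rules; the comments in the next hunk spell out the CPU formulas. A scalar sketch of one step under that formulation, with m as the running gradient moment:

#include <cmath>

namespace {
float Sign(float x) { return (x > 0.0f) - (x < 0.0f); }
}  // namespace

// AddSign step:   var -= lr * (alpha + sign_decay * sign(g) * sign(m)) * g
// PowerSign step: var -= lr * exp(logbase * sign_decay * sign(g) * sign(m)) * g
// In both, the moment is first updated as m <- beta * m + (1 - beta) * g.
void AddSignStep(float* var, float* m, float grad, float lr, float alpha,
                 float sign_decay, float beta) {
  *m = beta * *m + (1.0f - beta) * grad;
  *var -= lr * (alpha + sign_decay * Sign(grad) * Sign(*m)) * grad;
}

void PowerSignStep(float* var, float* m, float grad, float lr, float logbase,
                   float sign_decay, float beta) {
  *m = beta * *m + (1.0f - beta) * grad;
  *var -= lr * std::exp(logbase * sign_decay * Sign(grad) * Sign(*m)) * grad;
}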
#define EIGEN_USE_GPU -#include "tensorflow/core/kernels/training_ops.h" #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/training_ops.h" namespace tensorflow { @@ -115,13 +115,11 @@ struct ApplyAdam { Eigen::Sizes<1> single; const auto one = static_cast(1.0); m.device(d) = - m + - (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) * - (grad - m); + m + (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) * + (grad - m); v.device(d) = - v + - (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) * - (grad.square() - v); + v + (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) * + (grad.square() - v); if (use_nesterov) { var.device(d) -= @@ -157,9 +155,9 @@ struct ApplyRMSProp { bcast[0] = grad.dimension(0); Eigen::Sizes<1> single; const auto one = static_cast(1.0); - ms.device(d) = ms + - (rho.constant(one) - rho).reshape(single).broadcast(bcast) * - (grad.square() - ms); + ms.device(d) = + ms + (rho.constant(one) - rho).reshape(single).broadcast(bcast) * + (grad.square() - ms); mom.device(d) = mom * momentum.reshape(single).broadcast(bcast) + lr.reshape(single).broadcast(bcast) * grad / @@ -212,7 +210,7 @@ struct ApplyAddSign { auto beta_bcast = beta.reshape(single).broadcast(bcast); auto one_minus_beta = (beta.constant(one) - beta).reshape(single).broadcast(bcast); - m.device(d) = m * beta_bcast + grad * one_minus_beta; + m.device(d) = m * beta_bcast + grad * one_minus_beta; // The following is the GPU equivalent of the CPU version: // var.device(d) -= lr() * (alpha() + sign_decay() * sign_gm) * grad; @@ -244,7 +242,7 @@ struct ApplyPowerSign { auto beta_bcast = beta.reshape(single).broadcast(bcast); auto one_minus_beta = (beta.constant(one) - beta).reshape(single).broadcast(bcast); - m.device(d) = m * beta_bcast + grad * one_minus_beta; + m.device(d) = m * beta_bcast + grad * one_minus_beta; // The following is the GPU equivalent of the CPU version: // auto grad_scale = (logbase() * sign_decay() * sign_gm).exp(); @@ -253,7 +251,7 @@ struct ApplyPowerSign { auto lr_bcast = lr.reshape(single).broadcast(bcast); auto logbase_bcast = logbase.reshape(single).broadcast(bcast); auto sign_decay_bcast = sign_decay.reshape(single).broadcast(bcast); - auto grad_scale = (logbase_bcast * sign_decay_bcast * sign_gm).exp(); + auto grad_scale = (logbase_bcast * sign_decay_bcast * sign_gm).exp(); var.device(d) -= lr_bcast * grad_scale * grad; } }; diff --git a/tensorflow/core/kernels/training_ops_test.cc b/tensorflow/core/kernels/training_ops_test.cc index ffa7f87c9ef..2dcc4a500e6 100644 --- a/tensorflow/core/kernels/training_ops_test.cc +++ b/tensorflow/core/kernels/training_ops_test.cc @@ -176,8 +176,9 @@ static void Adam(int32 n, Graph** init_g, Graph** train_g) { auto beta2 = Scalar(g, 0.99); auto epsilon = Scalar(g, 1e-8); auto grad = Random(g, n); - test::graph::Multi(g, "ApplyAdam", {var, m, v, beta1_power, beta2_power, lr, - beta1, beta2, epsilon, grad}); + test::graph::Multi( + g, "ApplyAdam", + {var, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad}); *train_g = g; } } diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc index 2e0d18b634a..7177ad78884 100644 --- a/tensorflow/core/kernels/transpose_op.cc +++ b/tensorflow/core/kernels/transpose_op.cc @@ -176,9 +176,10 @@ void TransposeOp::Compute(OpKernelContext* ctx) { } } for (int i = 0; i < dims; ++i) { - OP_REQUIRES(ctx, bits[i], errors::InvalidArgument( - i, " is missing from {", - 
str_util::Join(permutation, ","), "}.")); + OP_REQUIRES( + ctx, bits[i], + errors::InvalidArgument(i, " is missing from {", + str_util::Join(permutation, ","), "}.")); } // 0-D, 1-D, and identity transposes do nothing. diff --git a/tensorflow/core/kernels/typed_queue.h b/tensorflow/core/kernels/typed_queue.h index 0d608d9b879..43dcb4cef74 100644 --- a/tensorflow/core/kernels/typed_queue.h +++ b/tensorflow/core/kernels/typed_queue.h @@ -58,9 +58,9 @@ Status TypedQueue::Initialize() { if (!component_shapes_.empty() && component_dtypes_.size() != component_shapes_.size()) { return errors::InvalidArgument( - "Different number of component types. ", "Types: ", - DataTypeSliceString(component_dtypes_), ", Shapes: ", - ShapeListString(component_shapes_)); + "Different number of component types. ", + "Types: ", DataTypeSliceString(component_dtypes_), + ", Shapes: ", ShapeListString(component_shapes_)); } mutex_lock lock(mu_); diff --git a/tensorflow/core/kernels/unpack_op.cc b/tensorflow/core/kernels/unpack_op.cc index 397bdd56708..764b6a252ad 100644 --- a/tensorflow/core/kernels/unpack_op.cc +++ b/tensorflow/core/kernels/unpack_op.cc @@ -34,7 +34,7 @@ typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template class UnpackOp : public OpKernel { @@ -65,8 +65,9 @@ class UnpackOp : public OpKernel { output_shape.RemoveDim(axis); const int64 output_size = output_shape.num_elements(); OP_REQUIRES( - context, FastBoundsCheck(output_size, - std::numeric_limits::max()), + context, + FastBoundsCheck(output_size, + std::numeric_limits::max()), errors::InvalidArgument("output size must fit in Eigen DenseIndex")); // This optimization is currently not applicable for SYCL devices diff --git a/tensorflow/core/kernels/word2vec_kernels.cc b/tensorflow/core/kernels/word2vec_kernels.cc index 2d05d72bff1..3477445197a 100644 --- a/tensorflow/core/kernels/word2vec_kernels.cc +++ b/tensorflow/core/kernels/word2vec_kernels.cc @@ -188,9 +188,9 @@ class SkipgramOp : public OpKernel { ++corpus_size_; } if (corpus_size_ < window_size_ * 10) { - return errors::InvalidArgument("The text file ", filename, - " contains too little data: ", - corpus_size_, " words"); + return errors::InvalidArgument( + "The text file ", filename, + " contains too little data: ", corpus_size_, " words"); } typedef std::pair WordFreq; std::vector ordered; diff --git a/tensorflow/core/kernels/xent_op.cc b/tensorflow/core/kernels/xent_op.cc index 0f8d027caad..a6a71fdfaf1 100644 --- a/tensorflow/core/kernels/xent_op.cc +++ b/tensorflow/core/kernels/xent_op.cc @@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template class SoftmaxXentWithLogitsOp : public OpKernel { @@ -44,8 +44,8 @@ class SoftmaxXentWithLogitsOp : public OpKernel { OP_REQUIRES(context, logits_in.IsSameSize(labels_in), errors::InvalidArgument( "logits and labels must be same size: logits_size=", - logits_in.shape().DebugString(), " labels_size=", - labels_in.shape().DebugString())); + logits_in.shape().DebugString(), + " labels_size=", labels_in.shape().DebugString())); OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()), errors::InvalidArgument("logits must be 2-dimensional")); // As we already tested that both inputs have the same shape no need to @@ -72,7 +72,7 @@ class 
SoftmaxXentWithLogitsOp : public OpKernel { functor(context->eigen_device(), logits_in.matrix(), labels_in.matrix(), scratch.matrix(), loss_out->vec(), back_out->matrix()); - } + } } }; @@ -87,7 +87,7 @@ struct XentFunctorBase { typename TTypes::Vec loss, typename TTypes::Matrix backprop) { XentEigenImpl::Compute(d, logits, labels, scratch, loss, - backprop); + backprop); } }; @@ -97,7 +97,7 @@ struct XentFunctor : XentFunctorBase {}; #ifdef TENSORFLOW_USE_SYCL template struct XentFunctor : XentFunctorBase {}; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor #define REGISTER_CPU(T) \ @@ -129,6 +129,6 @@ REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits") .Device(DEVICE_SYCL) .TypeConstraint("T"), SoftmaxXentWithLogitsOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/xsmm_conv2d_test.cc b/tensorflow/core/kernels/xsmm_conv2d_test.cc index e2947012467..481f3b7ba46 100644 --- a/tensorflow/core/kernels/xsmm_conv2d_test.cc +++ b/tensorflow/core/kernels/xsmm_conv2d_test.cc @@ -13,18 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/kernels/conv_ops.h" -#include "tensorflow/core/platform/test.h" -#include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/graph/node_builder.h" -#include "tensorflow/core/kernels/ops_testutil.h" #include "include/libxsmm.h" #include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/kernels/conv_ops.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/platform/test.h" namespace tensorflow { namespace { - typedef struct { int nImg; int nIfm; @@ -49,45 +48,41 @@ typedef struct { int stride_w; } naive_conv_t; - -LIBXSMM_INLINE void naive_copy_NCHW_to_NHWC(const float* nchw, Tensor &nhwc, int N, int H, int W, int C) -{ - LIBXSMM_VLA_DECL(4, const float, input, nchw, C, H, W); +LIBXSMM_INLINE void naive_copy_NCHW_to_NHWC(const float* nchw, Tensor& nhwc, + int N, int H, int W, int C) { + LIBXSMM_VLA_DECL(4, const float, input, nchw, C, H, W); int n, h, w, c; - auto output = nhwc.flat(); - for ( n = 0; n < N; n++ ) { - for ( h = 0; h < H; h++ ) { - for ( w = 0; w < W; w++ ) { - for ( c = 0; c < C; c++ ) { - output(n*H*W*C + h*W*C +w*C + c) = - LIBXSMM_VLA_ACCESS(4, input, n, c, h, w, C, H, W); + auto output = nhwc.flat(); + for (n = 0; n < N; n++) { + for (h = 0; h < H; h++) { + for (w = 0; w < W; w++) { + for (c = 0; c < C; c++) { + output(n * H * W * C + h * W * C + w * C + c) = + LIBXSMM_VLA_ACCESS(4, input, n, c, h, w, C, H, W); } } } } } - -LIBXSMM_INLINE void naive_copy_KCRS_to_RSCK(const float* kcrs, Tensor &rsck, int R, int S, int C, int K) -{ - LIBXSMM_VLA_DECL(4, const float, input, kcrs, C, R, S); +LIBXSMM_INLINE void naive_copy_KCRS_to_RSCK(const float* kcrs, Tensor& rsck, + int R, int S, int C, int K) { + LIBXSMM_VLA_DECL(4, const float, input, kcrs, C, R, S); int r, s, c, k; - auto output = rsck.flat(); + auto output = rsck.flat(); - for ( r = 0; r < R; r++ ) { - for ( s = 0; s < S; s++ ) { - for ( c = 0; c < C; c++ ) { - for ( k = 0; k < K; k++ ) { - output(r*S*C*K + s*C*K + c*K + k) = - LIBXSMM_VLA_ACCESS(4, input, k, c, r, s, C, R, S); + for (r = 0; r < R; r++) { + for (s = 0; s < S; s++) { + for (c = 0; c < C; c++) { + for (k 
= 0; k < K; k++) { + output(r * S * C * K + s * C * K + c * K + k) = + LIBXSMM_VLA_ACCESS(4, input, k, c, r, s, C, R, S); } } } } } - - LIBXSMM_INLINE void zero_buf(float* buf, long size) { int i; for (i = 0; i < size; ++i) { @@ -95,52 +90,53 @@ LIBXSMM_INLINE void zero_buf(float* buf, long size) { } } -LIBXSMM_INLINE void copy_buf(Tensor &dst,float *src,long size) { - long i; - auto output = dst.flat(); - for (i = 0; i < size; ++i) - output(i) = src[i]; +LIBXSMM_INLINE void copy_buf(Tensor& dst, float* src, long size) { + long i; + auto output = dst.flat(); + for (i = 0; i < size; ++i) output(i) = src[i]; } -LIBXSMM_INLINE void init_buf(float* buf, long size, int initPos, int initOne) -{ +LIBXSMM_INLINE void init_buf(float* buf, long size, int initPos, int initOne) { int i; zero_buf(buf, size); for (i = 0; i < size; ++i) { - buf[i] = (float)((initOne != 0) ? 1.0 : ((initPos != 0) ? drand48() : (0.05 - drand48()/10.0))); + buf[i] = + (float)((initOne != 0) + ? 1.0 + : ((initPos != 0) ? drand48() : (0.05 - drand48() / 10.0))); } } - - -LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float* output, const float* filter) -{ - int nImg = param->nImg; - int nIfm = param->nIfm; - int nOfm = param->nOfm; - int ifhp = param->ifhp; - int ifwp = param->ifwp; - int ofhp = param->ofhp; - int ofwp = param->ofwp; - int ifh = param->ifh; - int ifw = param->ifw; - int ofh = param->ofh; - int ofw = param->ofw; - int pad_h = param->pad_h; - int pad_w = param->pad_w; - int pad_h_in = param->pad_h_in; - int pad_w_in = param->pad_w_in; +LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, + float* output, const float* filter) { + int nImg = param->nImg; + int nIfm = param->nIfm; + int nOfm = param->nOfm; + int ifhp = param->ifhp; + int ifwp = param->ifwp; + int ofhp = param->ofhp; + int ofwp = param->ofwp; + int ifh = param->ifh; + int ifw = param->ifw; + int ofh = param->ofh; + int ofw = param->ofw; + int pad_h = param->pad_h; + int pad_w = param->pad_w; + int pad_h_in = param->pad_h_in; + int pad_w_in = param->pad_w_in; int pad_h_out = param->pad_h_out; int pad_w_out = param->pad_w_out; - int kh = param->kh; - int kw = param->kw; - int stride_h = param->stride_h; - int stride_w = param->stride_w; + int kh = param->kh; + int kw = param->kw; + int stride_h = param->stride_h; + int stride_w = param->stride_w; /* loop counters */ int img, ofm, ifm, oj, oi, ij, ii, kj, ki; - LIBXSMM_VLA_DECL(4, float, output_t, output + (pad_w_out * ofwp + pad_h_out), nOfm, ofhp, ofwp); - LIBXSMM_VLA_DECL(4, const float, input_t, input + (pad_w_in * ifwp + pad_h_in), nIfm, ifhp, ifwp); + LIBXSMM_VLA_DECL(4, float, output_t, output + (pad_w_out * ofwp + pad_h_out), + nOfm, ofhp, ofwp); + LIBXSMM_VLA_DECL(4, const float, input_t, + input + (pad_w_in * ifwp + pad_h_in), nIfm, ifhp, ifwp); LIBXSMM_VLA_DECL(4, const float, filter_t, filter, nIfm, kh, kw); for (img = 0; img < nImg; ++img) { @@ -151,12 +147,15 @@ LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float for (oi = 0; oi < ofw; ++oi) { ii = oi * stride_w - pad_w; for (kj = 0; kj < kh; ++kj) { - if(ij+kj < 0 || ij+kj >= ifh) continue; + if (ij + kj < 0 || ij + kj >= ifh) continue; for (ki = 0; ki < kw; ++ki) { - if(ii+ki < 0 || ii+ki >= ifw) continue; - LIBXSMM_VLA_ACCESS( 4, output_t, img, ofm, oj, oi, nOfm, ofhp, ofwp) += - LIBXSMM_VLA_ACCESS(4, input_t, img, ifm, ij + kj, ii + ki, nIfm, ifhp, ifwp) - * LIBXSMM_VLA_ACCESS(4, filter_t, ofm, ifm, kj, ki, nIfm, kh, kw); + if (ii + ki < 0 || ii + ki >= 
ifw) continue; + LIBXSMM_VLA_ACCESS(4, output_t, img, ofm, oj, oi, nOfm, ofhp, + ofwp) += + LIBXSMM_VLA_ACCESS(4, input_t, img, ifm, ij + kj, ii + ki, + nIfm, ifhp, ifwp) * + LIBXSMM_VLA_ACCESS(4, filter_t, ofm, ifm, kj, ki, nIfm, kh, + kw); } } } @@ -168,134 +167,118 @@ LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float void RunXsmmVsGeneric() {} - class XsmmConv2DTest : public OpsTestBase { protected: void MakeOp(int stride) { - TF_CHECK_OK(NodeDefBuilder("xsmm", "Conv2D") - .Input(FakeInput(DT_FLOAT)) - .Input(FakeInput(DT_FLOAT)) - .Attr("strides", {1, stride,stride, 1}) - .Attr("padding", "VALID" ) - .Finalize(node_def())); - + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Attr("strides", {1, stride, stride, 1}) + .Attr("padding", "VALID") + .Finalize(node_def())); TF_ASSERT_OK(InitOp()); } }; TEST_F(XsmmConv2DTest, Basic) { - MakeOp(1); + MakeOp(1); - // setup scoped allocator, which uses cpu_allocator() for this scope - const libxsmm_tf_allocator tf_allocator; + // setup scoped allocator, which uses cpu_allocator() for this scope + const libxsmm_tf_allocator tf_allocator; - int ifw = 14; /* input width, "W" */ - int ifh = 14; /* input height, "H" */ - int nImg = 32; /* mini-batch size, "N" */ - int nIfm = 64; /* number of input feature maps, "C" */ - int nOfm = 64; /* number of output feature maps, "K" */ - int kh = 3; /* filter height, "R" */ - int kw = 3; /* filter width, "S" */ - int pad = 0; /* padding in output */ - int stride = 1; /* stride when accessing inputs */ + int ifw = 14; /* input width, "W" */ + int ifh = 14; /* input height, "H" */ + int nImg = 32; /* mini-batch size, "N" */ + int nIfm = 64; /* number of input feature maps, "C" */ + int nOfm = 64; /* number of output feature maps, "K" */ + int kh = 3; /* filter height, "R" */ + int kw = 3; /* filter width, "S" */ + int pad = 0; /* padding in output */ + int stride = 1; /* stride when accessing inputs */ + int stride_w = stride; + int stride_h = stride; + int pad_h = pad; + int pad_w = pad; - int stride_w = stride; - int stride_h = stride; - int pad_h = pad; - int pad_w = pad; + int pad_h_in = pad_h; + int pad_w_in = pad_w; - int pad_h_in = pad_h; - int pad_w_in = pad_w; - - int pad_h_out = 0; - int pad_w_out = 0; + int pad_h_out = 0; + int pad_w_out = 0; /* deriving some values for naive code */ - int ofh = (ifh + 2 * pad_h - kh) / stride_h + 1; - int ofw = (ifw + 2 * pad_w - kw) / stride_w + 1; - int ifhp = ifh + 2 * pad_h_in; - int ifwp = ifw + 2 * pad_w_in; - int ofhp = ofh + 2 * pad_h_out; - int ofwp = ofw + 2 * pad_w_out; + int ofh = (ifh + 2 * pad_h - kh) / stride_h + 1; + int ofw = (ifw + 2 * pad_w - kw) / stride_w + 1; + int ifhp = ifh + 2 * pad_h_in; + int ifwp = ifw + 2 * pad_w_in; + int ofhp = ofh + 2 * pad_h_out; + int ofwp = ofw + 2 * pad_w_out; + // Initialization of Filter and Image - //Initialization of Filter and Image + /* allocate data */ + float* naive_input = (float*)libxsmm_aligned_scratch( + nImg * nIfm * ifhp * ifwp * sizeof(float), 2097152); + float* naive_output = (float*)libxsmm_aligned_scratch( + nImg * nOfm * ofhp * ofwp * sizeof(float), 2097152); + float* naive_filter = (float*)libxsmm_aligned_scratch( + nOfm * nIfm * kh * kw * sizeof(float), 2097152); + /* initialize data */ + init_buf(naive_input, nImg * nIfm * ifhp * ifwp, 0, 0); + zero_buf(naive_output, nImg * nOfm * ofhp * ofwp); + init_buf(naive_filter, nOfm * nIfm * kh * kw, 0, 0); - /* allocate data */ - float *naive_input = (float*)libxsmm_aligned_scratch( 
nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152); - float *naive_output = (float*)libxsmm_aligned_scratch( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152); - float *naive_filter = (float*)libxsmm_aligned_scratch( nOfm*nIfm*kh*kw* sizeof(float), 2097152); - /* initialize data */ - init_buf(naive_input, nImg*nIfm*ifhp*ifwp, 0, 0); - zero_buf(naive_output, nImg*nOfm*ofhp*ofwp); - init_buf(naive_filter, nOfm*nIfm*kh*kw, 0, 0); + Tensor image(DT_FLOAT, {nImg, ifhp, ifwp, nIfm}); + Tensor filter(DT_FLOAT, {kh, kw, nIfm, nOfm}); - Tensor image(DT_FLOAT, - {nImg, ifhp, ifwp, nIfm}); + naive_copy_NCHW_to_NHWC(naive_input, image, nImg, ifhp, ifwp, nIfm); + naive_copy_KCRS_to_RSCK(naive_filter, filter, kh, kw, nIfm, nOfm); + // Run naive convolution - Tensor filter(DT_FLOAT, {kh,kw,nIfm,nOfm}); + naive_conv_t naive_param; + naive_param.nImg = nImg; + naive_param.nIfm = nIfm; + naive_param.nOfm = nOfm; + naive_param.ifhp = ifhp; + naive_param.ifwp = ifwp; + naive_param.ofhp = ofhp; + naive_param.ofwp = ofwp; + naive_param.ifh = ifh; + naive_param.ifw = ifw; + naive_param.ofh = ofh; + naive_param.ofw = ofw; + naive_param.pad_h = pad_h; + naive_param.pad_w = pad_w; + naive_param.pad_h_in = pad_h_in; + naive_param.pad_w_in = pad_w_in; + naive_param.pad_h_out = pad_h_out; + naive_param.pad_w_out = pad_w_out; + naive_param.kh = kh; + naive_param.kw = kw; + naive_param.stride_h = stride_h; + naive_param.stride_w = stride_w; - naive_copy_NCHW_to_NHWC(naive_input, image, nImg, ifhp, ifwp, nIfm); - naive_copy_KCRS_to_RSCK(naive_filter, filter, kh, kw, nIfm, nOfm); + naive_conv_fp(&naive_param, naive_input, naive_output, naive_filter); + AddInputFromArray(image.shape(), image.flat()); + AddInputFromArray(filter.shape(), filter.flat()); - //Run naive convolution - - naive_conv_t naive_param; - - naive_param.nImg = nImg; - naive_param.nIfm = nIfm; - naive_param.nOfm = nOfm; - naive_param.ifhp = ifhp; - naive_param.ifwp = ifwp; - naive_param.ofhp = ofhp; - naive_param.ofwp = ofwp; - naive_param.ifh = ifh; - naive_param.ifw = ifw; - naive_param.ofh = ofh; - naive_param.ofw = ofw; - naive_param.pad_h = pad_h; - naive_param.pad_w = pad_w; - naive_param.pad_h_in = pad_h_in; - naive_param.pad_w_in = pad_w_in; - naive_param.pad_h_out = pad_h_out; - naive_param.pad_w_out = pad_w_out; - naive_param.kh = kh; - naive_param.kw = kw; - naive_param.stride_h = stride_h; - naive_param.stride_w = stride_w; - - - naive_conv_fp(&naive_param, naive_input, naive_output, naive_filter); - - - - AddInputFromArray(image.shape(), image.flat()); - AddInputFromArray(filter.shape(), filter.flat()); - - - - //Run Op (TF) - TF_ASSERT_OK(RunOpKernel()); - - // Check the output. - Tensor expected(DT_FLOAT, {nImg,ofhp,ofwp, nOfm}); - naive_copy_NCHW_to_NHWC(naive_output, expected, nImg, ofhp, ofwp, nOfm); - - - test::ExpectTensorNear(expected, *GetOutput(0), 1e-5); - libxsmm_free(naive_input); - libxsmm_free(naive_output); - libxsmm_free(naive_filter); - + // Run Op (TF) + TF_ASSERT_OK(RunOpKernel()); + // Check the output. 
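The test above derives its expected output extents with the usual convolution size formula before calling the naive reference and the XSMM kernel. A small sketch of that derivation with the test's own numbers (14x14 input, 3x3 filter, no padding, stride 1), matching the ofh = (ifh + 2 * pad_h - kh) / stride_h + 1 lines in the hunk:

#include <cassert>

// Output spatial extent of an explicitly padded convolution:
// out = (in + 2 * pad - kernel) / stride + 1 (integer division).
int ConvOutSize(int in, int kernel, int pad, int stride) {
  return (in + 2 * pad - kernel) / stride + 1;
}

int main() {
  // XsmmConv2DTest.Basic: 14x14 input, 3x3 filter, pad 0, stride 1
  // gives a 12x12 output per feature map.
  assert(ConvOutSize(14, 3, 0, 1) == 12);
  return 0;
}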
+ Tensor expected(DT_FLOAT, {nImg, ofhp, ofwp, nOfm}); + naive_copy_NCHW_to_NHWC(naive_output, expected, nImg, ofhp, ofwp, nOfm); + test::ExpectTensorNear(expected, *GetOutput(0), 1e-5); + libxsmm_free(naive_input); + libxsmm_free(naive_output); + libxsmm_free(naive_filter); } /* @@ -325,7 +308,8 @@ TEST(XsmmConv2DTest, Basic) { desc.threads = num_threads; desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT; desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC; - desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;//LIBXSMM_DNN_TENSOR_FORMAT_RSCK; + desc.filter_format = +LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;//LIBXSMM_DNN_TENSOR_FORMAT_RSCK; desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE; desc.options = LIBXSMM_DNN_CONV_OPTION_NONE; desc.datatype = LIBXSMM_DNN_DATATYPE_F32; diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc index 279a5876f96..fb9e8ad50c6 100644 --- a/tensorflow/core/ops/array_ops.cc +++ b/tensorflow/core/ops/array_ops.cc @@ -977,8 +977,8 @@ REGISTER_OP("GatherNd") if (c->Value(r_dim) > c->Rank(params)) { return errors::InvalidArgument( "indices.shape[-1] must be <= params.rank, but saw indices shape: ", - c->DebugString(indices), " and params shape: ", - c->DebugString(params)); + c->DebugString(indices), + " and params shape: ", c->DebugString(params)); } // Remove r_dim from indices to get output. @@ -1252,12 +1252,12 @@ REGISTER_OP("ReverseSequence") // Validate batch_dim and seq_dim against input. const int32 input_rank = c->Rank(input); if (batch_dim >= input_rank) { - return errors::InvalidArgument("batch_dim must be < input rank: ", - batch_dim, " vs. ", input_rank); + return errors::InvalidArgument( + "batch_dim must be < input rank: ", batch_dim, " vs. ", input_rank); } if (seq_dim >= input_rank) { - return errors::InvalidArgument("seq_dim must be < input rank: ", - seq_dim, " vs. ", input_rank); + return errors::InvalidArgument( + "seq_dim must be < input rank: ", seq_dim, " vs. 
", input_rank); } DimensionHandle batch_dim_dim = c->Dim(input, batch_dim); @@ -2638,8 +2638,9 @@ Status ScatterNdShape(InferenceContext* c) { Status s = c->Merge(prefix_indices, prefix_updates, &unused); if (!s.ok()) { return errors::InvalidArgument( - "The outer ", outer_dims, " dimensions of indices.shape=", - c->DebugString(indices_shape), " must match the outer ", outer_dims, + "The outer ", outer_dims, + " dimensions of indices.shape=", c->DebugString(indices_shape), + " must match the outer ", outer_dims, " dimensions of updates.shape=", c->DebugString(updates_shape), ": ", s.error_message()); } diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc index a182fd1c475..86d64635f4c 100644 --- a/tensorflow/core/ops/array_ops_test.cc +++ b/tensorflow/core/ops/array_ops_test.cc @@ -142,8 +142,13 @@ TEST(ArrayOpsTest, Const_ShapeFn) { TEST(ArrayOpsTest, UnchangedShapes_ShapeFn) { for (const char* op_name : { - "CheckNumerics", "Identity", "RefIdentity", "QuantizeAndDequantize", - "StopGradient", "ZerosLike", "OnesLike", + "CheckNumerics", + "Identity", + "RefIdentity", + "QuantizeAndDequantize", + "StopGradient", + "ZerosLike", + "OnesLike", }) { ShapeInferenceTestOp op(op_name); INFER_OK(op, "?", "in0"); diff --git a/tensorflow/core/ops/candidate_sampling_ops_test.cc b/tensorflow/core/ops/candidate_sampling_ops_test.cc index c79b4439148..f3673716040 100644 --- a/tensorflow/core/ops/candidate_sampling_ops_test.cc +++ b/tensorflow/core/ops/candidate_sampling_ops_test.cc @@ -23,9 +23,12 @@ namespace tensorflow { TEST(CandidateSamplerOpsTest, CandidateSampler_ShapeFn) { for (const char* op_name : { - "AllCandidateSampler", "FixedUnigramCandidateSampler", - "LearnedUnigramCandidateSampler", "LogUniformCandidateSampler", - "ThreadUnsafeUnigramCandidateSampler", "UniformCandidateSampler", + "AllCandidateSampler", + "FixedUnigramCandidateSampler", + "LearnedUnigramCandidateSampler", + "LogUniformCandidateSampler", + "ThreadUnsafeUnigramCandidateSampler", + "UniformCandidateSampler", }) { ShapeInferenceTestOp op(op_name); TF_ASSERT_OK(NodeDefBuilder("test", op.name) diff --git a/tensorflow/core/ops/functional_grad.cc b/tensorflow/core/ops/functional_grad.cc index 6df3536795c..eeccb72da65 100644 --- a/tensorflow/core/ops/functional_grad.cc +++ b/tensorflow/core/ops/functional_grad.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/core/framework/function.h" #include +#include "tensorflow/core/framework/function.h" #include "tensorflow/core/lib/core/errors.h" namespace tensorflow { diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index dd484c3ee75..872ebe98c1f 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -1172,12 +1172,12 @@ Status RangeSize(const Tensor* start_t, const Tensor* limit_t, T limit = limit_t->scalar()(); T delta = delta_t->scalar()(); if (start > limit && delta > 0) { - return errors::InvalidArgument("Requires start <= limit when delta > 0: ", - start, "/", limit); + return errors::InvalidArgument( + "Requires start <= limit when delta > 0: ", start, "/", limit); } if (start < limit && delta < 0) { - return errors::InvalidArgument("Requires start >= limit when delta < 0: ", - start, "/", limit); + return errors::InvalidArgument( + "Requires start >= limit when delta < 0: ", start, "/", limit); } if (delta == 0) { return errors::InvalidArgument("Requires delta != 0"); diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index 3f72b415699..62661fe4bd5 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -1155,9 +1155,9 @@ Status TopKShapeFn(InferenceContext* c) { DimensionHandle last_dim = c->Dim(input, -1); if (c->ValueKnown(last_dim) && c->ValueKnown(k_dim) && c->Value(last_dim) < c->Value(k_dim)) { - return errors::InvalidArgument("input must have last dimension >= k = ", - c->Value(k_dim), " but is ", - c->Value(last_dim)); + return errors::InvalidArgument( + "input must have last dimension >= k = ", c->Value(k_dim), " but is ", + c->Value(last_dim)); } // Replace last_dim with k_dim. @@ -1211,9 +1211,9 @@ REGISTER_OP("NthElement") DimensionHandle last_dim = c->Dim(input, -1); if (c->ValueKnown(last_dim) && c->ValueKnown(n_dim) && c->Value(last_dim) <= c->Value(n_dim)) { - return errors::InvalidArgument("Input must have last dimension > n = ", - c->Value(n_dim), " but is ", - c->Value(last_dim)); + return errors::InvalidArgument( + "Input must have last dimension > n = ", c->Value(n_dim), + " but is ", c->Value(last_dim)); } // Reduce last_dim for output tensor diff --git a/tensorflow/core/ops/sdca_ops.cc b/tensorflow/core/ops/sdca_ops.cc index e67d95fa8cb..4025070adb2 100644 --- a/tensorflow/core/ops/sdca_ops.cc +++ b/tensorflow/core/ops/sdca_ops.cc @@ -19,8 +19,8 @@ limitations under the License. namespace tensorflow { -using shape_inference::ShapeHandle; using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; // -------------------------------------------------------------------------- static Status ApplySdcaOptimizerShapeFn(InferenceContext* c) { diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc index 8beb28de0a2..e4c5bcfb540 100644 --- a/tensorflow/core/ops/string_ops.cc +++ b/tensorflow/core/ops/string_ops.cc @@ -137,9 +137,9 @@ REGISTER_OP("Substr") DimensionHandle pos_dim = c->Dim(pos_shape, i); DimensionHandle len_dim = c->Dim(len_shape, i); if (c->Value(pos_dim) != c->Value(len_dim)) { - return errors::InvalidArgument("pos and len shapes must match: ", - c->DebugString(pos_shape), " vs. ", - c->DebugString(len_shape)); + return errors::InvalidArgument( + "pos and len shapes must match: ", c->DebugString(pos_shape), + " vs. 
", c->DebugString(len_shape)); } } // c->input(0) is the ShapeHandle to input strings diff --git a/tensorflow/core/ops/training_ops_test.cc b/tensorflow/core/ops/training_ops_test.cc index de4e3cd9e70..0f309c1f4e9 100644 --- a/tensorflow/core/ops/training_ops_test.cc +++ b/tensorflow/core/ops/training_ops_test.cc @@ -24,7 +24,7 @@ static void TestGradAndIndicesErrorHandling(const ShapeInferenceTestOp& op, string shape_spec_middle, const string& shape_spec_end = "") { auto shape_spec = [&shape_spec_middle, shape_spec_end]( - const char* var_spec, const char* grad_indices_spec) { + const char* var_spec, const char* grad_indices_spec) { return strings::StrCat(var_spec, ";", shape_spec_middle, ";", grad_indices_spec, shape_spec_end); }; diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 01b3e92d2d9..a323d5bc393 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -2668,6 +2668,7 @@ cuda_py_test( ":nn_ops_gen", "//third_party/py/numpy", ], + shard_count = 4, tags = ["no_windows"], ) diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index b107670275c..e3a52141a05 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -2103,6 +2103,10 @@ class Operation(object): logging.warning("Operation._control_inputs is private, use " "Operation.control_inputs instead. " "Operation._control_inputs will eventually be removed.") + # Copy value because it may be self._control_inputs_val (in particular if + # this is called from self._control_inputs += ...), and we don't want to + # clear value below. + value = copy.copy(value) self._remove_all_control_inputs() self._add_control_inputs(value) diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py index be72c194072..bb3f6970e4f 100644 --- a/tensorflow/python/kernel_tests/softmax_op_test.py +++ b/tensorflow/python/kernel_tests/softmax_op_test.py @@ -25,11 +25,13 @@ import numpy as np from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import nn_ops from tensorflow.python.platform import test +@test_util.with_c_api class SoftmaxTest(test.TestCase): def _npSoftmax(self, features, dim=-1, log=False): @@ -174,8 +176,11 @@ class SoftmaxTest(test.TestCase): def testDimTooLarge(self): with self.test_session(): + # Use placeholder to make sure we get runtime error instead of shape + # inference error. + dim = array_ops.placeholder_with_default(100, shape=[]) with self.assertRaises(errors_impl.InvalidArgumentError): - nn_ops.softmax([1., 2., 3., 4.], dim=100).eval() + nn_ops.softmax([1., 2., 3., 4.], dim=dim).eval() def testLargeDims(self): # Make sure that we properly handle large inputs. See diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py index e5b93a54f79..7bf62d45b8e 100644 --- a/tensorflow/python/layers/core.py +++ b/tensorflow/python/layers/core.py @@ -49,9 +49,6 @@ class Dense(base.Layer): and `bias` is a bias vector created by the layer (only if `use_bias` is `True`). - Note: if the input to the layer has a rank greater than 2, then it is - flattened prior to the initial matrix multiply by `kernel`. - Arguments: units: Integer or Long, dimensionality of the output space. activation: Activation function (callable). 
Set it to None to maintain a @@ -199,9 +196,6 @@ def dense( and `bias` is a bias vector created by the layer (only if `use_bias` is `True`). - Note: if the `inputs` tensor has a rank greater than 2, then it is - flattened prior to the initial matrix multiply by `kernel`. - Arguments: inputs: Tensor input. units: Integer or Long, dimensionality of the output space. @@ -230,7 +224,8 @@ def dense( by the same name. Returns: - Output tensor. + Output tensor the same shape as `inputs` except the last dimension is of + size `units`. Raises: ValueError: if eager execution is enabled. diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index b8e8207bb24..9a8ac93de9d 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -1841,12 +1841,11 @@ def reduce_logsumexp(input_tensor, reduce_sum( gen_math_ops.exp(input_tensor - my_max), axis, - keepdims=True, - reduction_indices=reduction_indices)) + my_max + keepdims=keepdims, + reduction_indices=reduction_indices)) if not keepdims: - if isinstance(axis, int): - axis = [axis] - result = array_ops.squeeze(result, axis) + my_max = array_ops.reshape(my_max, array_ops.shape(result)) + result += my_max return _may_reduce_to_scalar(keepdims, axis, reduction_indices, result) diff --git a/tensorflow/stream_executor/executor_cache.cc b/tensorflow/stream_executor/executor_cache.cc index a23d6a70ba2..d1a8aae1674 100644 --- a/tensorflow/stream_executor/executor_cache.cc +++ b/tensorflow/stream_executor/executor_cache.cc @@ -23,6 +23,14 @@ namespace gputools { port::StatusOr ExecutorCache::GetOrCreate( const StreamExecutorConfig& config, const std::function& factory) { + // In the fast path case, the cache already has an entry and we can just + // return after Get() which only takes a shared lock and not a unique lock. + // If we need to create, we take a unique lock on cache_. + auto fast_result = Get(config); + if (fast_result.ok()) { + return fast_result; + } + Entry* entry = nullptr; { mutex_lock lock{mutex_}; @@ -59,12 +67,17 @@ port::StatusOr ExecutorCache::Get( const StreamExecutorConfig& config) { Entry* entry = nullptr; { - mutex_lock lock{mutex_}; - entry = &cache_[config.ordinal]; - // Release the map lock; the address of 'entry' is stable because - // std::map guarantees reference stability. 
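// For context: GetOrCreate() above now tries Get() first, which only takes a shared
// (reader) lock, and falls back to the exclusive mutex only when the executor must be
// created; Get() below likewise switches from mutex_lock to tf_shared_lock. A
// simplified sketch of that read-mostly fast-path pattern, using std::shared_mutex in
// place of the real tf_shared_lock/mutex_lock and ignoring StatusOr error handling
// (illustrative only, not the actual ExecutorCache):
#include <functional>
#include <map>
#include <mutex>
#include <shared_mutex>

template <typename K, typename V>
class ReadMostlyCache {
 public:
  V* GetOrCreate(const K& key, const std::function<V()>& factory) {
    {
      std::shared_lock<std::shared_mutex> lock(mu_);  // fast path: shared lock only
      auto it = cache_.find(key);
      if (it != cache_.end()) return &it->second;
    }
    std::unique_lock<std::shared_mutex> lock(mu_);  // slow path: exclusive lock
    auto it = cache_.find(key);  // re-check: another thread may have inserted
    if (it == cache_.end()) it = cache_.emplace(key, factory()).first;
    return &it->second;  // std::map never invalidates references on insert
  }

 private:
  std::shared_mutex mu_;
  std::map<K, V> cache_;
};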
+ tf_shared_lock lock{mutex_}; + auto it = cache_.find(config.ordinal); + if (it != cache_.end()) { + entry = &it->second; + } else { + return port::Status(port::error::NOT_FOUND, + port::Printf("No executors registered for ordinal %d", + config.ordinal)); + } } - mutex_lock lock{entry->configurations_mutex}; + tf_shared_lock lock{entry->configurations_mutex}; if (entry->configurations.empty()) { return port::Status( port::error::NOT_FOUND, diff --git a/tensorflow/stream_executor/multi_platform_manager.cc b/tensorflow/stream_executor/multi_platform_manager.cc index cc32a6beaa5..f23224ae772 100644 --- a/tensorflow/stream_executor/multi_platform_manager.cc +++ b/tensorflow/stream_executor/multi_platform_manager.cc @@ -45,7 +45,7 @@ namespace gputools { /* static */ port::StatusOr MultiPlatformManager::PlatformWithName( const string& target) { - mutex_lock lock(GetPlatformsMutex()); + tf_shared_lock lock(GetPlatformsMutex()); auto it = GetPlatformMap()->find(port::Lowercase(target)); if (it == GetPlatformMap()->end()) { @@ -59,7 +59,7 @@ namespace gputools { /* static */ port::StatusOr MultiPlatformManager::PlatformWithId( const Platform::Id& id) { - mutex_lock lock(GetPlatformsMutex()); + tf_shared_lock lock(GetPlatformsMutex()); auto it = GetPlatformByIdMap()->find(id); if (it == GetPlatformByIdMap()->end()) { return port::Status( diff --git a/tensorflow/tools/graph_transforms/sparsify_gather.cc b/tensorflow/tools/graph_transforms/sparsify_gather.cc index 96324d0deab..593c654f9fb 100644 --- a/tensorflow/tools/graph_transforms/sparsify_gather.cc +++ b/tensorflow/tools/graph_transforms/sparsify_gather.cc @@ -15,6 +15,7 @@ limitations under the License. #include #include +#include #include "tensorflow/c/checkpoint_reader.h" #include "tensorflow/core/framework/tensor.h" @@ -28,9 +29,10 @@ limitations under the License. #include "tensorflow/tools/graph_transforms/transform_utils.h" namespace tensorflow { -using strings::StrCat; using str_util::Join; using str_util::Split; +using str_util::StringReplace; +using strings::StrCat; namespace graph_transforms { @@ -89,7 +91,7 @@ Status ObtainTensorSlice(const GraphDef& input_graph_def, string* shape_slice_string) { string restore_node_name; for (const auto& node : input_graph_def.node()) { - std::vector node_name_parts = str_util::Split(node.name(), "/"); + std::vector node_name_parts = Split(node.name(), "/"); if (node_name_parts.size() == 2 && StringPiece(node_name_parts[0]).starts_with("save") && StringPiece(node_name_parts[1]).starts_with("Assign") && @@ -119,13 +121,13 @@ Status ObtainTensorSlice(const GraphDef& input_graph_def, } string GetMonolithicTensorKey(const string& tensor_slice_name) { - std::vector names = str_util::Split(tensor_slice_name, "/"); + std::vector names = Split(tensor_slice_name, "/"); CHECK_GE(names.size(), 2); CHECK(StringPiece(names[names.size() - 1]).starts_with("part_")); // Remove the "part_x" suffix names.pop_back(); - return str_util::Join(names, "/"); + return Join(names, "/"); } Status ReadTensorFromCheckpoint( @@ -193,6 +195,15 @@ Status SparsifyGatherInternal( GraphDef current_graph_def = input_graph_def; bool any_match_found = false; + // Populate references. 
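// The loop added below walks every node input once and builds a use count per
// producer, stripping the "^" control-dependency prefix (via StringReplace) so that
// "^foo" and "foo" are counted against the same node. A rough standalone sketch of
// that bookkeeping, with illustrative types standing in for the NodeDef protos:
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

std::unordered_map<std::string, int> CountNodeReferences(
    const std::vector<std::pair<std::string, std::vector<std::string>>>& nodes) {
  // Each element is (node name, list of its inputs).
  std::unordered_map<std::string, int> refs;
  for (const auto& node : nodes) {
    for (std::string input : node.second) {
      if (!input.empty() && input[0] == '^') input.erase(0, 1);  // control dependency
      ++refs[input];
    }
  }
  return refs;
}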
+ std::unordered_map refs; + for (const auto& node : current_graph_def.node()) { + for (const auto& input : node.input()) { + auto parsed_input = StringReplace(input, "^", "", true); + refs[parsed_input] += 1; + } + } + // The subgraphs may have overlapping components, therefore GraphMatcher // doesn't return all subgraphs in one round -- this has to be multi-round // update. @@ -200,15 +211,15 @@ Status SparsifyGatherInternal( any_match_found = false; GraphDef replaced_graph_def = current_graph_def; std::vector init_table_node_names; - std::vector removed_variable_names; + std::vector removed_node_names; TF_RETURN_IF_ERROR(ReplaceMatchingOpTypes( current_graph_def, pattern, [&ckpt_reader, &any_match_found, &init_table_node_names, - &shapes_and_slices, &removed_variable_names]( - const NodeMatch& match, const std::set& input_nodes, - const std::set& output_nodes, - std::vector* new_nodes) { + &shapes_and_slices, &removed_node_names, + &refs](const NodeMatch& match, const std::set& input_nodes, + const std::set& output_nodes, + std::vector* new_nodes) { any_match_found = true; // The captured subgraph should be of the following pattern: @@ -291,8 +302,12 @@ Status SparsifyGatherInternal( weights_node.name(), ckpt_reader, (*shapes_and_slices)[weights_node.name()], &weight)); // Add both both weight and identity node names. - removed_variable_names.push_back(weights_node.name()); - removed_variable_names.push_back(match.inputs[0].node.name()); + removed_node_names.push_back(weights_node.name()); + removed_node_names.push_back(match.inputs[0].node.name()); + for (auto input_node : match.inputs[0].node.input()) { + auto parsed_input = StringReplace(input_node, "^", "", true); + refs[parsed_input]--; + } } Tensor indices_tensor; Tensor values_tensor; @@ -362,15 +377,23 @@ Status SparsifyGatherInternal( // Connect nodes AddNodeInput(hashtable_node.name(), &init_table_node); + refs[hashtable_node.name()]++; AddNodeInput(indices_node.name(), &init_table_node); + refs[indices_node.name()]++; AddNodeInput(values_node.name(), &init_table_node); + refs[values_node.name()]++; AddNodeInput(hashtable_node.name(), &lookup_node); + refs[hashtable_node.name()]++; AddNodeInput(gather_node.input(1), &lookup_node); + refs[gather_node.input(1)]++; AddNodeInput(default_value_node.name(), &lookup_node); + refs[default_value_node.name()]++; AddNodeInput(lookup_node.name(), &expand_dims_node); + refs[lookup_node.name()]++; AddNodeInput(dim_idx_node.name(), &expand_dims_node); + refs[dim_idx_node.name()]++; // Copy 'ids' input of original 'Gather' new_nodes->push_back(match.inputs[1].node); @@ -404,22 +427,44 @@ Status SparsifyGatherInternal( for (const string& name : init_table_node_names) { // Add control dependence from init_table_node to group_deps_node AddNodeInput(StrCat("^", name), init_op); + refs[name]++; } - // Remove all dependencies associated with removed variables. - while (!removed_variable_names.empty()) { - auto name = removed_variable_names.back(); - removed_variable_names.pop_back(); + // Erase inputs and outputs as they are not considered for deletion. + for (const auto& output : context.output_names) { + refs.erase(output); + } + + for (const auto& input : context.input_names) { + refs.erase(input); + } + + // Add nodes with a reference count of 0 for deletion. 
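// The deletion pass that follows is a worklist sweep: every node whose reference
// count hit zero is removed, the counts of its inputs are decremented, and any input
// that drops to zero joins the worklist in turn (RestoreV2 nodes are deliberately
// skipped, per the comment in the loop). A compact sketch of just that cascading
// logic, with illustrative names and without the protobuf SwapElements/RemoveLast
// bookkeeping of the real code:
#include <string>
#include <unordered_map>
#include <vector>

void SweepDeadNodes(
    std::unordered_map<std::string, int>* refs,
    const std::unordered_map<std::string, std::vector<std::string>>& inputs_of,
    std::vector<std::string>* worklist,  // seeded with the zero-count node names
    std::vector<std::string>* removed) {
  while (!worklist->empty()) {
    const std::string name = worklist->back();
    worklist->pop_back();
    removed->push_back(name);
    auto it = inputs_of.find(name);
    if (it == inputs_of.end()) continue;
    for (const std::string& input : it->second) {
      if (--(*refs)[input] == 0) worklist->push_back(input);  // cascade
    }
  }
}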
+ for (auto entry : refs) { + if (entry.second == 0) { + removed_node_names.push_back(entry.first); + } + } + + while (!removed_node_names.empty()) { + auto name = removed_node_names.back(); + removed_node_names.pop_back(); + int i = 0; while (i < replaced_graph_def.node_size()) { - if (!replaced_graph_def.node(i).input_size()) { - if (replaced_graph_def.node(i).name() == name) { - replaced_graph_def.mutable_node()->SwapElements( - i, replaced_graph_def.node_size() - 1); - replaced_graph_def.mutable_node()->RemoveLast(); - continue; + // Revisit this to see if we can safely remove RestoreV2 nodes. + if ((replaced_graph_def.node(i).name() == name) && + (replaced_graph_def.node(i).op() != "RestoreV2")) { + for (const auto& input : replaced_graph_def.node(i).input()) { + auto parsed_input = StringReplace(input, "^", "", true); + refs[parsed_input] -= 1; + if (refs[parsed_input] == 0) { + removed_node_names.push_back(parsed_input); + } } - i++; + replaced_graph_def.mutable_node()->SwapElements( + i, replaced_graph_def.node_size() - 1); + replaced_graph_def.mutable_node()->RemoveLast(); continue; } int j = 0; @@ -433,18 +478,16 @@ Status SparsifyGatherInternal( } j++; } - if ((replaced_graph_def.node(i).input_size() == 0) || - (replaced_graph_def.node(i).op() == "Assign" && - replaced_graph_def.node(i).input_size() == 1)) { - removed_variable_names.push_back(replaced_graph_def.node(i).name()); - if (replaced_graph_def.node(i).input_size() == 1) { - removed_variable_names.push_back( - replaced_graph_def.node(i).input(0)); + if (!replaced_graph_def.node(i).input_size()) { + if ((refs.find(replaced_graph_def.node(i).name()) != refs.end()) && + (refs[replaced_graph_def.node(i).name()] == 0)) { + removed_node_names.push_back(replaced_graph_def.node(i).name()); } - replaced_graph_def.mutable_node()->SwapElements( - i, replaced_graph_def.node_size() - 1); - replaced_graph_def.mutable_node()->RemoveLast(); - continue; + } + + if (replaced_graph_def.node(i).op() == "Assign" && + replaced_graph_def.node(i).input_size() == 1) { + removed_node_names.push_back(replaced_graph_def.node(i).name()); } i++; } diff --git a/tensorflow/tools/graph_transforms/sparsify_gather_test.cc b/tensorflow/tools/graph_transforms/sparsify_gather_test.cc index 000568a0cc9..6627df1331a 100644 --- a/tensorflow/tools/graph_transforms/sparsify_gather_test.cc +++ b/tensorflow/tools/graph_transforms/sparsify_gather_test.cc @@ -80,6 +80,8 @@ class SparsifyGatherTest : public ::testing::Test { // Build the graph. NodeDef* input_node = CreateNode("ids", "Const", {}, &graph_def); NodeDef* w_node; + NodeDef* zeros_const; + NodeDef* zeros_shape; NodeDef* zeros_node; NodeDef* assign_node; @@ -92,8 +94,12 @@ class SparsifyGatherTest : public ::testing::Test { } else { w_node = CreateNode("w/part_1", "VariableV2", {}, &graph_def); - zeros_node = - CreateNode("w/part_1/Initializer/zeros", "Const", {}, &graph_def); + zeros_shape = CreateNode("w/part_1/Initializer/zeros/shape_as_tensor", + "Const", {}, &graph_def); + zeros_const = CreateNode("w/part_1/Initializer/zeros/Const", "Const", {}, + &graph_def); + zeros_node = CreateNode("w/part_1/Initializer/zeros", "Fill", + {zeros_shape, zeros_const}, &graph_def); assign_node = CreateNode("w/part_1/Assign", "Assign", {w_node, zeros_node}, &graph_def); @@ -151,6 +157,9 @@ class SparsifyGatherTest : public ::testing::Test { MapNamesToNodes(result, &node_lookup); // Check nodes. 
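// Why the two extra EXPECT_EQ(0, ...) checks below hold: the initializer is now a
// Fill node fed by two Consts, and once the Fill is deleted by the transform the
// reference counts of both feeding Consts drop to zero, so the cascading sweep removes
// them as well. A tiny usage example of that cascade, reusing the SweepDeadNodes
// sketch shown earlier (a hypothetical helper, not part of this test):
#include <cassert>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  const std::string kZeros = "w/part_1/Initializer/zeros";
  const std::string kShape = "w/part_1/Initializer/zeros/shape_as_tensor";
  const std::string kConst = "w/part_1/Initializer/zeros/Const";
  std::unordered_map<std::string, std::vector<std::string>> inputs_of = {
      {kZeros, {kShape, kConst}}};  // zeros is Fill(shape_as_tensor, Const)
  std::unordered_map<std::string, int> refs = {{kShape, 1}, {kConst, 1}, {kZeros, 0}};
  std::vector<std::string> worklist = {kZeros};  // the Fill's consumers are gone
  std::vector<std::string> removed;
  SweepDeadNodes(&refs, inputs_of, &worklist, &removed);
  assert(removed.size() == 3);  // zeros, shape_as_tensor and Const are all swept
  return 0;
}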
+ EXPECT_EQ(0, + node_lookup.count("w/part_1/Initializer/zeros/shape_as_tensor")); + EXPECT_EQ(0, node_lookup.count("w/part_1/Initializer/zeros/Const")); EXPECT_EQ(0, node_lookup.count("w/part_1/Initializer/zeros")); EXPECT_EQ(0, node_lookup.count("w/part_1/Assign")); @@ -247,7 +256,11 @@ class SparsifyGatherTest : public ::testing::Test { // Two partitions NodeDef* w_node1; NodeDef* w_node2; + NodeDef* zeros_const1; + NodeDef* zeros_shape1; NodeDef* zeros_node1; + NodeDef* zeros_const2; + NodeDef* zeros_shape2; NodeDef* zeros_node2; NodeDef* assign_node1; NodeDef* assign_node2; @@ -261,8 +274,13 @@ class SparsifyGatherTest : public ::testing::Test { SetNodeTensorAttr("value", weights, w_node2); } else { w_node1 = CreateNode("w1/part_1", "VariableV2", {}, &graph_def); - zeros_node1 = - CreateNode("w1/part_1/Initializer/zeros", "Const", {}, &graph_def); + + zeros_shape1 = CreateNode("w1/part_1/Initializer/zeros/shape_as_tensor", + "Const", {}, &graph_def); + zeros_const1 = CreateNode("w1/part_1/Initializer/zeros/Const", "Const", + {}, &graph_def); + zeros_node1 = CreateNode("w1/part_1/Initializer/zeros", "Fill", + {zeros_shape1, zeros_const1}, &graph_def); assign_node1 = CreateNode("w1/part_1/Assign", "Assign", {w_node1, zeros_node1}, &graph_def); @@ -285,8 +303,12 @@ class SparsifyGatherTest : public ::testing::Test { CreateNode("save/Assign", "Assign", {w_node1, restore_node1}, &graph_def); w_node2 = CreateNode("w2/part_1", "VariableV2", {}, &graph_def); - zeros_node2 = - CreateNode("w2/part_1/Initializer/zeros", "Const", {}, &graph_def); + zeros_shape2 = CreateNode("w2/part_1/Initializer/zeros/shape_as_tensor", + "Const", {}, &graph_def); + zeros_const2 = CreateNode("w2/part_1/Initializer/zeros/Const", "Const", + {}, &graph_def); + zeros_node2 = CreateNode("w2/part_1/Initializer/zeros", "Fill", + {zeros_shape2, zeros_const2}, &graph_def); assign_node2 = CreateNode("w2/part_1/Assign", "Assign", {w_node2, zeros_node2}, &graph_def); @@ -350,8 +372,14 @@ class SparsifyGatherTest : public ::testing::Test { MapNamesToNodes(result, &node_lookup); // Check nodes. + EXPECT_EQ(0, + node_lookup.count("w1/part_1/Initializer/zeros/shape_as_tensor")); + EXPECT_EQ(0, node_lookup.count("w1/part_1/Initializer/zeros/Const")); EXPECT_EQ(0, node_lookup.count("w1/part_1/Initializer/zeros")); EXPECT_EQ(0, node_lookup.count("w1/part_1/Assign")); + EXPECT_EQ(0, + node_lookup.count("w2/part_1/Initializer/zeros/shape_as_tensor")); + EXPECT_EQ(0, node_lookup.count("w2/part_1/Initializer/zeros/Const")); EXPECT_EQ(0, node_lookup.count("w2/part_1/Initializer/zeros")); EXPECT_EQ(0, node_lookup.count("w2/part_1/Assign")); EXPECT_EQ(1, node_lookup.count("ids"));