Branch 183429339 (#16469)

* Change `reduce_logsumexp` to internally use `reshape` rather than `squeeze`
since the latter requires the `axis` arg to be a Python `list`.

PiperOrigin-RevId: 183396533

* Kernel utils to support broadcast add and mul.

PiperOrigin-RevId: 183397494

* Updating sparsify_gather.

PiperOrigin-RevId: 183402917

* [tf.data] Move slow-path-related code into the slow path in IteratorHandleOp::Compute().

This slightly reduces the amount of work performed when an iterator is accessed (after the first access), and potentially reduces contention if concurrent steps are accessing the same iterator.

PiperOrigin-RevId: 183406221
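As a rough illustration of this restructuring (a generic sketch, not the actual IteratorHandleOp code; all names below are invented), the idea is to keep the per-access path trivial and confine creation and validation work to a branch that only runs on first access:

#include <memory>
#include <mutex>
#include <string>

// Illustrative stand-ins for the op's context and resource types.
struct IteratorResource {
  std::string handle = "iterator_handle:0";
};

class HandleOpSketch {
 public:
  // Fast path: after the first call, Compute() only reads the cached resource.
  // Slow path: creation and validation run once, inside the branch.
  const std::string& Compute() {
    std::call_once(init_once_, [this] {
      resource_ = std::make_unique<IteratorResource>();  // expensive setup
    });
    return resource_->handle;
  }

 private:
  std::once_flag init_once_;
  std::unique_ptr<IteratorResource> resource_;
};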

* Cleanup: Ran clang-format on all *.{cc,h} under grappler.

PiperOrigin-RevId: 183406440

* Increase shard count of //third_party/tensorflow/python:nn_batchnorm_test to avoid timeouts

When run under asan, the test runs for about 5 minutes, and sometimes
longer, causing frequent timeouts.

This change increases the shard count of the test to 4, which brings the run time
of the longest running shard under asan to about 2 minutes.

PiperOrigin-RevId: 183414888

* Add available choices to toco flags and fix minor formatting issues.

PiperOrigin-RevId: 183415713

* Performance improvements to some GPU code to use shared locks instead of unique locks for some hotspot cases.

PiperOrigin-RevId: 183418559
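The pattern behind this change (visible later in this commit in the ProcessState::GetCUDAHostAllocator hunk, which takes tf_shared_lock on the read path and mutex_lock on the populate path) is the usual reader/writer split. A minimal portable sketch, with invented names and std::shared_mutex standing in for TensorFlow's mutex types:

#include <shared_mutex>
#include <string>
#include <unordered_map>

// Reads vastly outnumber writes, so the common case takes a shared (reader)
// lock; only a miss takes the exclusive (writer) lock and populates the map.
class SharedLockCacheSketch {
 public:
  int GetOrCreate(const std::string& key) {
    {
      std::shared_lock<std::shared_mutex> lock(mu_);  // fast path, concurrent
      auto it = ids_.find(key);
      if (it != ids_.end()) return it->second;
    }
    std::unique_lock<std::shared_mutex> lock(mu_);  // slow path, exclusive
    auto it = ids_.find(key);                       // re-check after upgrading
    if (it == ids_.end()) it = ids_.emplace(key, next_id_++).first;
    return it->second;
  }

 private:
  std::shared_mutex mu_;
  std::unordered_map<std::string, int> ids_;
  int next_id_ = 0;
};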

* [XLA] Improve error message for bad slices.

PiperOrigin-RevId: 183420038

* Fix py3 build rules for all py tests under py2tf.

PiperOrigin-RevId: 183422144

* Fix bug with Operation._control_inputs setter.

PiperOrigin-RevId: 183422192

* Make softmax_op_test.py work with C API enabled.

PiperOrigin-RevId: 183422829

* Cleanup: Ran clang-format on all *.{cc,h} files in tensorflow/core/kernels.

PiperOrigin-RevId: 183423961

* Fix the dense layer documentation to describe how rank > 2 inputs are handled.

PiperOrigin-RevId: 183425868

* Cleanup: Ran clang-format on all *.{cc,h} in tensorflow/core/ops.

PiperOrigin-RevId: 183429339
Rasmus Munk Larsen, 2018-01-26 13:32:16 -08:00 (committed by GitHub)
commit 982549ea34, parent f84623507b
325 changed files with 4706 additions and 4373 deletions

@@ -37,6 +37,9 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
+using tensorflow::str_util::Join;
+using tensorflow::strings::Printf;
 namespace xla {
 namespace {
@@ -934,7 +937,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       "inferring shape for <%s>(%s, %s) with broadcast_dimensions={%s}",
       BinaryOperation_Name(operation).c_str(),
       ShapeUtil::HumanString(lhs).c_str(), ShapeUtil::HumanString(rhs).c_str(),
-      tensorflow::str_util::Join(broadcast_dimensions, ", ").c_str());
+      Join(broadcast_dimensions, ", ").c_str());
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs));
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs));
@@ -1097,7 +1100,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "Map operation requires all operands to have the same shape; got: "
         "%s",
-        tensorflow::str_util::Join(pieces, ", ").c_str());
+        Join(pieces, ", ").c_str());
   }
   // Check that dimensions.size == arg_shape.dimensions_size() (we currently
@@ -1114,7 +1117,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     if (dimensions[i] != i) {
       return InvalidArgument(
           "Map requires monotonically increasing dimension numbers, found: %s ",
-          tensorflow::str_util::Join(dimensions, ", ").c_str());
+          Join(dimensions, ", ").c_str());
     }
   }
@@ -1914,21 +1917,28 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     const Shape& arg, tensorflow::gtl::ArraySlice<int64> starts,
     tensorflow::gtl::ArraySlice<int64> limits,
     tensorflow::gtl::ArraySlice<int64> strides) {
+  auto error = [&](const string& message) {
+    return InvalidArgument(
+        "%s in slice operation; argument shape: %s; starts: {%s}; limits: "
+        "{%s}; strides: {%s}",
+        message.c_str(), ShapeUtil::HumanString(arg).c_str(),
+        Join(starts, ",").c_str(), Join(limits, ",").c_str(),
+        Join(strides, ",").c_str());
+  };
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(arg, "operand of slice"));
   VLOG(2) << tensorflow::strings::Printf(
       "slicing shape %s starts={%s} limits={%s}",
-      ShapeUtil::HumanString(arg).c_str(),
-      tensorflow::str_util::Join(starts, ", ").c_str(),
-      tensorflow::str_util::Join(limits, ", ").c_str());
+      ShapeUtil::HumanString(arg).c_str(), Join(starts, ", ").c_str(),
+      Join(limits, ", ").c_str());
   if (starts.size() != limits.size()) {
-    return InvalidArgument("slice start and limit sizes differ: %zu vs %zu",
-                           starts.size(), limits.size());
+    return error(Printf("slice start and limit sizes differ: %zu vs %zu",
+                        starts.size(), limits.size()));
   }
   if (starts.size() != strides.size()) {
-    return InvalidArgument("slice start and strides sizes differ: %zu vs %zu",
-                           starts.size(), strides.size());
+    return error(Printf("slice start and strides sizes differ: %zu vs %zu",
+                        starts.size(), strides.size()));
   }
   if (starts.size() != ShapeUtil::Rank(arg)) {
@@ -1947,20 +1957,20 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
           start_index);
     }
     if (limit_index > arg.dimensions(dimension)) {
-      return InvalidArgument(
-          "limit index (%lld) must be less than or equal to dimension "
-          "size (%lld)",
-          limit_index, arg.dimensions(dimension));
+      return error(
+          Printf("limit index (%lld) must be less than or equal to dimension "
+                 "size (%lld)",
+                 limit_index, arg.dimensions(dimension)));
     }
     VLOG(2) << tensorflow::strings::Printf("starts[%lld] = %lld", dimension,
                                            start_index);
     VLOG(2) << tensorflow::strings::Printf("limits[%lld] = %lld", dimension,
                                            limit_index);
     if (start_index > limit_index) {
-      return InvalidArgument(
-          "limit index (%lld) must be greater or equal to "
-          "start index (%lld) in slice with positive stride",
-          limit_index, start_index);
+      return error(
+          Printf("limit index (%lld) must be greater or equal to "
                 "start index (%lld) in slice with positive stride",
                 limit_index, start_index));
     }
     if (stride <= 0) {
       return InvalidArgument("stride (%lld) must be positive", stride);
@@ -1983,7 +1993,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       "slicing shape %s at dynamic start_indices %s with slice_sizes={%s}",
       ShapeUtil::HumanString(operand_shape).c_str(),
       ShapeUtil::HumanString(start_indices_shape).c_str(),
-      tensorflow::str_util::Join(slice_sizes, ", ").c_str());
+      Join(slice_sizes, ", ").c_str());
   if (ShapeUtil::Rank(start_indices_shape) != 1) {
     return InvalidArgument(
@@ -2280,8 +2290,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "Reshape dimensions [%s] are not a permutation of the operand "
         "dimensions (operand shape is %s).",
-        tensorflow::str_util::Join(dimensions, ",").c_str(),
-        ShapeUtil::HumanString(operand).c_str());
+        Join(dimensions, ",").c_str(), ShapeUtil::HumanString(operand).c_str());
   }
   return inferred_shape;
@@ -2373,8 +2382,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   // The applied function's arity equals the number of arguments.
   if (arg_shapes.size() != to_apply.parameters_size()) {
     string computation_signature = ShapeUtil::HumanString(to_apply);
-    string argument_shapes = tensorflow::str_util::Join(
-        arg_shapes, ", ", [](string* out, const Shape* shape) {
+    string argument_shapes =
+        Join(arg_shapes, ", ", [](string* out, const Shape* shape) {
           tensorflow::strings::StrAppend(out, ShapeUtil::HumanString(*shape));
         });
     return InvalidArgument(

@@ -1512,5 +1512,20 @@ TEST_F(ShapeInferenceTest, Conditional) {
                        "must have the same shape"));
 }
+TEST_F(ShapeInferenceTest, BadSlice) {
+  auto arg = ShapeUtil::MakeShape(F32, {4});
+  StatusOr<Shape> statusor =
+      ShapeInference::InferSliceShape(arg, {0}, {5}, {1});
+  ASSERT_FALSE(statusor.ok());
+  LOG(INFO) << statusor.status();
+  EXPECT_THAT(statusor.status().error_message(),
+              HasSubstr("less than or equal to dimension size"))
+      << statusor.status();
+  EXPECT_THAT(statusor.status().error_message(), HasSubstr("argument shape"))
+      << statusor.status();
+}
 }  // namespace
 }  // namespace xla

@@ -71,6 +71,32 @@ cc_library(
     ],
 )
+cc_library(
+    name = "kernel_util",
+    srcs = [
+        "kernel_util.cc",
+    ],
+    hdrs = [
+        "kernel_util.h",
+    ],
+    deps = [
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite:context",
+        "//tensorflow/contrib/lite/kernels/internal:round",
+    ],
+)
+
+tf_cc_test(
+    name = "kernel_util_test",
+    size = "small",
+    srcs = ["kernel_util_test.cc"],
+    deps = [
+        ":kernel_util",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_library(
     name = "builtin_ops",
     srcs = [
@@ -87,7 +113,6 @@ cc_library(
         "fully_connected.cc",
         "gather.cc",
         "hashtable_lookup.cc",
-        "kernel_util.cc",
         "l2norm.cc",
         "local_response_norm.cc",
         "lsh_projection.cc",
@@ -111,7 +136,6 @@ cc_library(
         "unidirectional_sequence_rnn.cc",
     ],
     hdrs = [
-        "kernel_util.h",
         "padding.h",
         "register.h",
     ],
@@ -125,6 +149,7 @@ cc_library(
     }),
     deps = [
         ":activation_functor",
+        ":kernel_util",
         ":op_macros",
         "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite:framework",

@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/kernels/kernel_util.h"
 #include <algorithm>
 #include <cmath>
+#include <memory>
 #include "tensorflow/contrib/lite/kernels/internal/round.h"
 namespace tflite {
@@ -84,4 +87,27 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
   }
 }
+bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2) {
+  return TfLiteIntArrayEqual(input1->dims, input2->dims);
+}
+
+TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
+                                        TfLiteTensor* input1,
+                                        TfLiteTensor* input2,
+                                        TfLiteIntArray** output_shape) {
+  int64_t dims1 = NumDimensions(input1);
+  int64_t dims2 = NumDimensions(input2);
+  int64_t out_dims = std::max(dims1, dims2);
+  std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> shape(
+      TfLiteIntArrayCreate(out_dims), TfLiteIntArrayFree);
+  for (int i = 0; i < out_dims; ++i) {
+    int64_t d1 = i >= dims1 ? 1 : SizeOfDimension(input1, dims1 - i - 1);
+    int64_t d2 = i >= dims2 ? 1 : SizeOfDimension(input2, dims2 - i - 1);
+    TF_LITE_ENSURE(context, d1 == d2 || d1 == 1 || d2 == 1);
+    shape->data[out_dims - i - 1] = std::max(d1, d2);
+  }
+  *output_shape = shape.release();
+  return kTfLiteOk;
+}
+
 }  // namespace tflite
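A sketch of how a broadcasting binary kernel's Prepare() might use the new helpers; the op and function below are hypothetical and not part of this commit, and only HaveSameShapes and CalculateShapeForBroadcast come from the change above:

// Hypothetical Prepare() for an element-wise op with broadcasting support.
TfLiteStatus PrepareBroadcastableOp(TfLiteContext* context, TfLiteNode* node) {
  TfLiteTensor* input1 = GetInput(context, node, 0);
  TfLiteTensor* input2 = GetInput(context, node, 1);
  TfLiteTensor* output = GetOutput(context, node, 0);

  TfLiteIntArray* output_shape = nullptr;
  if (HaveSameShapes(input1, input2)) {
    output_shape = TfLiteIntArrayCopy(input1->dims);  // no broadcasting needed
  } else {
    // Reports an error on the context and fails if the shapes cannot broadcast.
    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
                                   context, input1, input2, &output_shape));
  }
  // ResizeTensor takes ownership of output_shape.
  return context->ResizeTensor(context, output, output_shape);
}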

@@ -35,6 +35,14 @@ inline TfLiteTensor* GetOutput(TfLiteContext* context, TfLiteNode* node,
 inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; }
 inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; }
+inline int64_t NumElements(const TfLiteTensor* t) {
+  int64_t count = 1;
+  for (int i = 0; i < NumDimensions(t); ++i) {
+    count *= SizeOfDimension(t, i);
+  }
+  return count;
+}
+
 inline TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context,
                                             const TfLiteNode* node, int index) {
   const bool use_tensor = node->inputs->data[index] != kOptionalTensor;
@@ -76,6 +84,15 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
                                    float* activation_min,
                                    float* activation_max);
+// Return true if the given tensors have the same shape.
+bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2);
+
+// Calculate the output_shape that is necessary for element-wise operations
+// with broadcasting involving the two input tensors.
+TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
+                                        TfLiteTensor* input1,
+                                        TfLiteTensor* input2,
+                                        TfLiteIntArray** output_shape);
+
 }  // namespace tflite
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_

@@ -0,0 +1,150 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/lite/kernels/kernel_util.h"
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/contrib/lite/testing/util.h"
namespace tflite {
namespace {
void ReportError(TfLiteContext* context, const char* format, ...) {}
class KernelUtilTest : public ::testing::Test {
public:
KernelUtilTest() {
context_.ReportError = ReportError;
tensor1_.dims = nullptr;
tensor2_.dims = nullptr;
}
~KernelUtilTest() {
TfLiteTensorFree(&tensor1_);
TfLiteTensorFree(&tensor2_);
}
void SetShape(TfLiteTensor* tensor, std::initializer_list<int> dims) {
TfLiteTensorFree(tensor);
tensor->dims = TfLiteIntArrayCreate(dims.size());
int i = 0;
for (int d : dims) {
tensor->dims->data[i] = d;
++i;
}
}
std::vector<int> GetShape(TfLiteIntArray* dims) {
std::vector<int> result;
for (int i = 0; i < dims->size; ++i) {
result.push_back(dims->data[i]);
}
return result;
}
protected:
TfLiteContext context_;
TfLiteTensor tensor1_;
TfLiteTensor tensor2_;
};
TEST_F(KernelUtilTest, SameShapeEmpty) {
EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor1_, {1, 2, 3});
EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor2_, {1, 2});
EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor2_, {1, 2, 3, 4});
EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor2_, {1, 2, 3});
EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor2_, {});
EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor1_, {});
EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
}
TEST_F(KernelUtilTest, BroadcastShapeIncompatibleDim) {
TfLiteIntArray* output = nullptr;
SetShape(&tensor1_, {1, 2});
SetShape(&tensor2_, {1, 3});
EXPECT_NE(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
EXPECT_EQ(output, nullptr);
}
TEST_F(KernelUtilTest, BroadcastShapeOnes) {
TfLiteIntArray* output = nullptr;
SetShape(&tensor1_, {1, 1});
SetShape(&tensor2_, {1, 3});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
TfLiteIntArrayFree(output);
SetShape(&tensor1_, {1, 2});
SetShape(&tensor2_, {1, 1});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
TfLiteIntArrayFree(output);
}
TEST_F(KernelUtilTest, BroadcastShapeScalars) {
TfLiteIntArray* output = nullptr;
SetShape(&tensor1_, {1, 2});
SetShape(&tensor2_, {});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2));
TfLiteIntArrayFree(output);
SetShape(&tensor1_, {});
SetShape(&tensor2_, {2});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
EXPECT_THAT(GetShape(output), ::testing::ElementsAre(2));
TfLiteIntArrayFree(output);
}
TEST_F(KernelUtilTest, BroadcastShapeDifferentSizes) {
TfLiteIntArray* output = nullptr;
SetShape(&tensor1_, {1, 2});
SetShape(&tensor2_, {3, 1, 1});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
EXPECT_THAT(GetShape(output), ::testing::ElementsAre(3, 1, 2));
TfLiteIntArrayFree(output);
SetShape(&tensor1_, {1, 2, 3, 4});
SetShape(&tensor2_, {1, 3, 1});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2, 3, 4));
TfLiteIntArrayFree(output);
}
} // namespace
} // namespace tflite
int main(int argc, char** argv) {
::tflite::LogToStderr();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

@@ -44,9 +44,11 @@ bool ParseTocoFlagsFromCommandLineFlags(
            "For Protobuf formats, the binary format will be used."),
       Flag("input_format", parsed_flags.input_format.bind(),
            parsed_flags.input_format.default_value(),
-           "Input file format. One of: tensorflow_graphdef, "),
+           "Input file format. One of: TENSORFLOW_GRAPHDEF, TFLITE."),
       Flag("output_format", parsed_flags.output_format.bind(),
-           parsed_flags.output_format.default_value(), "Output file format."),
+           parsed_flags.output_format.default_value(),
+           "Output file format. "
+           "One of TENSORFLOW_GRAPHDEF, TFLITE, GRAPHVIZ_DOT."),
       Flag("default_ranges_min", parsed_flags.default_ranges_min.bind(),
            parsed_flags.default_ranges_min.default_value(),
            "If defined, will be used as the default value for the min bound "
@@ -58,11 +60,13 @@ bool ParseTocoFlagsFromCommandLineFlags(
       Flag("inference_type", parsed_flags.inference_type.bind(),
            parsed_flags.inference_type.default_value(),
            "Target data type of arrays in the output file (for input_arrays, "
-           "this may be overridden by inference_input_type)."),
+           "this may be overridden by inference_input_type). "
+           "One of FLOAT, QUANTIZED_UINT8."),
       Flag("inference_input_type", parsed_flags.inference_input_type.bind(),
            parsed_flags.inference_input_type.default_value(),
-           "Target data type of input arrays. If not specified, inference_type "
-           "is used."),
+           "Target data type of input arrays. "
+           "If not specified, inference_type is used. "
+           "One of FLOAT, QUANTIZED_UINT8."),
       Flag("input_type", parsed_flags.input_type.bind(),
            parsed_flags.input_type.default_value(),
            "Deprecated ambiguous flag that set both --input_data_types and "
@@ -76,35 +80,31 @@ bool ParseTocoFlagsFromCommandLineFlags(
       Flag("drop_fake_quant", parsed_flags.drop_fake_quant.bind(),
            parsed_flags.drop_fake_quant.default_value(),
-           "Ignore and discard FakeQuant nodes. For instance, that can be used "
-           "to "
+           "Ignore and discard FakeQuant nodes. For instance, to "
            "generate plain float code without fake-quantization from a "
-           "quantized "
-           "graph."),
+           "quantized graph."),
       Flag(
           "reorder_across_fake_quant",
           parsed_flags.reorder_across_fake_quant.bind(),
          parsed_flags.reorder_across_fake_quant.default_value(),
          "Normally, FakeQuant nodes must be strict boundaries for graph "
          "transformations, in order to ensure that quantized inference has "
-          "the "
-          "exact same arithmetic behavior as quantized training --- which is "
-          "the "
-          "whole point of quantized training and of FakeQuant nodes in the "
-          "first "
-          "place. However, that entails subtle requirements on where exactly "
+          "the exact same arithmetic behavior as quantized training --- which "
+          "is the whole point of quantized training and of FakeQuant nodes in "
+          "the first place. "
+          "However, that entails subtle requirements on where exactly "
          "FakeQuant nodes must be placed in the graph. Some quantized graphs "
          "have FakeQuant nodes at unexpected locations, that prevent graph "
          "transformations that are necessary in order to generate inference "
          "code for these graphs. Such graphs should be fixed, but as a "
          "temporary work-around, setting this reorder_across_fake_quant flag "
-          "allows toco to perform necessary graph transformaitons on them, "
+          "allows TOCO to perform necessary graph transformaitons on them, "
          "at the cost of no longer faithfully matching inference and training "
          "arithmetic."),
       Flag("allow_custom_ops", parsed_flags.allow_custom_ops.bind(),
            parsed_flags.allow_custom_ops.default_value(),
-           "If true, allow TOCO to create TF Lite Custom operators for all the"
-           "unsupported Tensorflow ops."),
+           "If true, allow TOCO to create TF Lite Custom operators for all the "
+           "unsupported TensorFlow ops."),
       Flag(
           "drop_control_dependency",
           parsed_flags.drop_control_dependency.bind(),

@@ -57,6 +57,7 @@ py_library(
 py_test(
     name = "api_test",
     srcs = ["api_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":py2tf_internal",
         "//tensorflow/python:client_testlib",
@@ -66,6 +67,7 @@ py_test(
 py_test(
     name = "conversion_test",
     srcs = ["conversion_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":py2tf_internal",
         "//tensorflow/python:client_testlib",
@@ -76,6 +78,7 @@ py_test(
 py_test(
     name = "naming_test",
     srcs = ["naming_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":py2tf_internal",
         "//tensorflow/python:client_testlib",

@@ -52,6 +52,7 @@ py_library(
 py_test(
     name = "break_canonicalization_test",
     srcs = ["break_canonicalization_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -62,6 +63,7 @@ py_test(
 py_test(
     name = "call_trees_test",
     srcs = ["call_trees_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -72,6 +74,7 @@ py_test(
 py_test(
     name = "continue_canonicalization_test",
     srcs = ["continue_canonicalization_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -82,6 +85,7 @@ py_test(
 py_test(
     name = "control_flow_test",
     srcs = ["control_flow_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -92,6 +96,7 @@ py_test(
 py_test(
     name = "builtin_functions_test",
     srcs = ["builtin_functions_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -112,6 +117,7 @@ py_test(
 py_test(
     name = "logical_expressions_test",
     srcs = ["logical_expressions_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -122,6 +128,7 @@ py_test(
 py_test(
     name = "print_functions_test",
     srcs = ["print_functions_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -133,6 +140,7 @@ py_test(
 py_test(
     name = "side_effect_guards_test",
     srcs = ["side_effect_guards_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",

@@ -38,6 +38,7 @@ py_library(
 py_test(
     name = "anno_test",
     srcs = ["anno_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -47,6 +48,7 @@ py_test(
 py_test(
     name = "compiler_test",
     srcs = ["compiler_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -57,6 +59,7 @@ py_test(
 py_test(
     name = "parser_test",
     srcs = ["parser_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -66,6 +69,7 @@ py_test(
 py_test(
     name = "pretty_printer_test",
     srcs = ["pretty_printer_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -75,6 +79,7 @@ py_test(
 py_test(
     name = "templates_test",
     srcs = ["templates_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",

@@ -32,6 +32,7 @@ py_library(
 py_test(
     name = "access_test",
     srcs = ["access_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":static_analysis",
         "//tensorflow/contrib/py2tf/pyct",
@@ -43,6 +44,7 @@ py_test(
 py_test(
     name = "live_values_test",
     srcs = ["live_values_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":static_analysis",
         "//tensorflow/contrib/py2tf/pyct",
@@ -53,6 +55,7 @@ py_test(
 py_test(
     name = "type_info_test",
     srcs = ["type_info_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":static_analysis",
         "//tensorflow/contrib/py2tf/pyct",

@@ -230,8 +230,24 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
   // TODO(tucker): actually maintain separate CPUAllocators for
   // different numa_nodes. For now, just one.
   numa_node = 0;
-  mutex_lock lock(mu_);
+  {
+    // Here we optimize the most common use case where cuda_host_allocators_
+    // and cuda_al_ have already been populated and since we're only reading
+    // these vectors, we can get by with a shared lock. In the slower case,
+    // we take a unique lock and populate these vectors.
+    tf_shared_lock lock(mu_);
+
+    if (FLAGS_brain_gpu_record_mem_types &&
+        static_cast<int>(cuda_al_.size()) > 0) {
+      return cuda_al_[0];
+    }
+    if (static_cast<int>(cuda_host_allocators_.size()) > numa_node) {
+      return cuda_host_allocators_[0];
+    }
+  }
+
+  mutex_lock lock(mu_);
+
   // Find the first valid StreamExecutor to request CUDA host memory
   // through, since any will work.
   //

@@ -23,8 +23,7 @@ Cluster::Cluster(int timeout_s) : timeout_s_(timeout_s) {
   DisableDetailedStats(false);
 }
-Cluster::~Cluster() {
-}
+Cluster::~Cluster() {}
 void Cluster::AllowSoftPlacement(bool soft_placement_state) {
   options_.config.set_allow_soft_placement(soft_placement_state);

@@ -325,7 +325,7 @@ class VirtualScheduler {
   // Boolean field for whether the cost is accurate.
   std::map<string, std::pair<int, bool>> op_costs_;
-  Costs graph_costs_;  // Graph cost.
+  Costs graph_costs_;                   // Graph cost.
   std::map<string, Costs> op_to_cost_;  // Per-op cost.
   // Auxilliary data structures for constructing NodeState and DeviceState.

@@ -16,8 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
 #define TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
-#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/framework/variable.pb.h"
+#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/lib/core/status.h"
 namespace tensorflow {

@@ -40,8 +40,8 @@ typedef Eigen::SyclDevice SYCLDevice;
 template <typename Device, typename T>
 class AdjustContrastOp : public OpKernel {
  public:
-  explicit AdjustContrastOp(OpKernelConstruction* context) : OpKernel(context) {
-  }
+  explicit AdjustContrastOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
   void Compute(OpKernelContext* context) override {
     const Tensor& input = context->input(0);

@@ -29,8 +29,7 @@ limitations under the License.
 namespace tensorflow {
-class AdjustContrastOpTest : public OpsTestBase {
-};
+class AdjustContrastOpTest : public OpsTestBase {};
 TEST_F(AdjustContrastOpTest, Simple_1113) {
   TF_EXPECT_OK(NodeDefBuilder("adjust_contrast_op", "AdjustContrastv2")

@@ -192,8 +192,9 @@ class AdjustSaturationOp<CPUDevice> : public AdjustSaturationOpBase {
     const DeviceBase::CpuWorkerThreads& worker_threads =
         *context->device()->tensorflow_cpu_worker_threads();
     Shard(worker_threads.num_threads, worker_threads.workers, channel_count,
-          kCostPerChannel, [channel_count, &input_data, &output_data, scale_h](
-                               int64 start_channel, int64 end_channel) {
+          kCostPerChannel,
+          [channel_count, &input_data, &output_data, scale_h](
+              int64 start_channel, int64 end_channel) {
             const float* p = input_data.data() + start_channel * kChannelSize;
             float* q = output_data.data() + start_channel * kChannelSize;
             for (int i = start_channel; i < end_channel; i++) {

@@ -25,7 +25,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 namespace tensorflow {
@@ -201,7 +201,7 @@ struct Add7Functor<SYCLDevice, T> {
                   typename TTypes<T>::ConstFlat in6,
                   typename TTypes<T>::ConstFlat in7) {
     Add7EigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-        in7);
+                                          in7);
   }
 };
@@ -214,7 +214,7 @@ struct Add8Functor<SYCLDevice, T> {
       typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
       typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
     Add8EigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-        in7, in8);
+                                          in7, in8);
   }
 };
@@ -227,7 +227,7 @@ struct Add8pFunctor<SYCLDevice, T> {
      typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
      typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
    Add8pEigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-        in7, in8);
+                                           in7, in8);
   }
 };
@@ -241,10 +241,10 @@ struct Add9Functor<SYCLDevice, T> {
      typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8,
      typename TTypes<T>::ConstFlat in9) {
    Add9EigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-        in7, in8, in9);
+                                          in7, in8, in9);
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 } // namespace functor

@@ -52,8 +52,9 @@ class ExtractGlimpseOp : public OpKernel {
     const int64 batch_size = input_shape.dim_size(0);
     const Tensor& window_size = context->input(1);
-    OP_REQUIRES(context, (window_size.shape().dims() == 1) &&
-                             window_size.shape().dim_size(0) == 2,
+    OP_REQUIRES(context,
+                (window_size.shape().dims() == 1) &&
+                    window_size.shape().dim_size(0) == 2,
                 errors::InvalidArgument(
                     "input must be a vector of size 2 (height, width)",
                     window_size.shape().DebugString()));

@@ -48,9 +48,8 @@ struct SpatialAvgPooling {
 typedef Eigen::GpuDevice GPUDevice;
-// Launch a custom GPU kernels from Yanqing for the avgpooling backward operation
-// that works NHWC data formats.
-// Arguments:
+// Launch a custom GPU kernels from Yanqing for the avgpooling backward
+// operation that works NHWC data formats. Arguments:
 //    top_diff: backprop to the output of the pooling layer
 //    num: number of input batches
 //    height: input height

@@ -71,8 +71,8 @@ __global__ void AvePoolBackwardNHWC(const int nthreads,
         hstart = max(hstart, 0);
         wstart = max(wstart, 0);
         int pool_size = (hend - hstart) * (wend - wstart);
-        gradient +=
-            top_diff_slice[(ph * pooled_width + pw) * channels] / dtype(pool_size);
+        gradient += top_diff_slice[(ph * pooled_width + pw) * channels] /
+                    dtype(pool_size);
       }
     }
     bottom_diff[index] = gradient;
@@ -90,11 +90,11 @@ bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num,
                             const GPUDevice& d) {
   int x_size = num * height * width * channels;
   CudaLaunchConfig config = GetCudaLaunchConfig(x_size, d);
-  AvePoolBackwardNHWC<
-      T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-      config.virtual_thread_count, top_diff, num, height, width, channels,
-      pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w,
-      pad_t, pad_t, bottom_diff);
+  AvePoolBackwardNHWC<T>
+      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+          config.virtual_thread_count, top_diff, num, height, width, channels,
+          pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w,
+          pad_t, pad_t, bottom_diff);
   return d.ok();
 }

@@ -111,13 +111,14 @@ class Barrier : public ResourceBase {
     mutex_lock lock(mu_);
     if (closed_) {
       OP_REQUIRES_ASYNC(
-          ctx, !cancel_pending_enqueues_ &&
-                   (num_inserted == 0 || !incomplete_.empty()),
+          ctx,
+          !cancel_pending_enqueues_ &&
+              (num_inserted == 0 || !incomplete_.empty()),
           errors::Cancelled(
               "Barrier ", name_, " is closed. Pending enqueues cancelled: ",
-              cancel_pending_enqueues_, ". Number of new insertions: ",
-              num_inserted, ". Number of incomplete keys: ",
-              incomplete_.size(), "."),
+              cancel_pending_enqueues_,
+              ". Number of new insertions: ", num_inserted,
+              ". Number of incomplete keys: ", incomplete_.size(), "."),
           callback);
     }
@@ -128,9 +129,10 @@ class Barrier : public ResourceBase {
     for (int i = 0; i < num_inserted; ++i) {
       OP_REQUIRES_OK_ASYNC(
-          ctx, InsertOneLocked<T>(ctx, keys, values, element_shape,
-                                  component_index, i, &ready_tuples,
-                                  &new_elements),
+          ctx,
+          InsertOneLocked<T>(ctx, keys, values, element_shape,
+                             component_index, i, &ready_tuples,
+                             &new_elements),
          callback);
     }
@@ -317,8 +319,9 @@ class Barrier : public ResourceBase {
        return errors::Cancelled(
            "Barrier ", name_,
            " is closed, but attempted to insert a brand new key: ",
-            keys_vec(i), ". Pending enqueues cancelled: ",
-            cancel_pending_enqueues_, ". Insertion index: ", i,
+            keys_vec(i),
+            ". Pending enqueues cancelled: ", cancel_pending_enqueues_,
+            ". Insertion index: ", i,
            ". Number of incomplete keys: ", incomplete_.size(), ".");
      }
    } else {
@@ -532,13 +535,14 @@ class InsertManyOp : public BarrierOpKernel {
    OP_REQUIRES_ASYNC(
        ctx, component_index_ < barrier->num_components(),
        errors::InvalidArgument("The component ID is out of range ",
-                                component_index_, " > num_components", " (= ",
-                                barrier->num_components(), ")"),
+                                component_index_, " > num_components",
+                                " (= ", barrier->num_components(), ")"),
        callback);
    OP_REQUIRES_OK_ASYNC(
-        ctx, ctx->MatchSignature({DT_STRING_REF, DT_STRING,
-                                  barrier->component_type(component_index_)},
-                                 {}),
+        ctx,
+        ctx->MatchSignature({DT_STRING_REF, DT_STRING,
+                             barrier->component_type(component_index_)},
+                            {}),
        callback);
    const Tensor* keys;

@@ -13,22 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_util.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
 #include "tensorflow/core/kernels/batching_util/periodic_function.h"
+#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/split_lib.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/macros.h"
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;

@@ -41,7 +41,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 namespace {
@@ -429,14 +429,13 @@ template <typename Scalar>
 struct LaunchBatchMatMul<SYCLDevice, Scalar> {
   static void Launch(OpKernelContext* context, const Tensor& in_x,
                      const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
-
-    // Number of matrix multiplies i.e. size of the batch.
-    const int64 batch_size = in_x.dim_size(0);
-    ParallelMatMulKernelSYCL<Scalar>::Run(context, in_x, in_y, adj_x, adj_y, out,
-                                          0, batch_size);
+    // Number of matrix multiplies i.e. size of the batch.
+    const int64 batch_size = in_x.dim_size(0);
+    ParallelMatMulKernelSYCL<Scalar>::Run(context, in_x, in_y, adj_x, adj_y,
+                                          out, 0, batch_size);
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 template <typename Device, typename Scalar>
 class BatchMatMul : public OpKernel {
@@ -462,10 +461,10 @@ class BatchMatMul : public OpKernel {
     TensorShape out_shape;
     for (int i = 0; i < ndims - 2; ++i) {
       OP_REQUIRES(ctx, in0.dim_size(i) == in1.dim_size(i),
-                  errors::InvalidArgument("In[0].dim(", i, ") and In[1].dim(",
-                                          i, ") must be the same: ",
-                                          in0.shape().DebugString(), " vs ",
-                                          in1.shape().DebugString()));
+                  errors::InvalidArgument(
+                      "In[0].dim(", i, ") and In[1].dim(", i,
+                      ") must be the same: ", in0.shape().DebugString(), " vs ",
+                      in1.shape().DebugString()));
       out_shape.AddDim(in0.dim_size(i));
     }
     auto n = (ndims == 2) ? 1 : out_shape.num_elements();
@@ -507,12 +506,12 @@ class BatchMatMul : public OpKernel {
   bool adj_y_;
 };
 #define REGISTER_BATCH_MATMUL_CPU(TYPE) \
   REGISTER_KERNEL_BUILDER( \
       Name("BatchMatMul").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
       BatchMatMul<CPUDevice, TYPE>)
 #define REGISTER_BATCH_MATMUL_GPU(TYPE) \
   REGISTER_KERNEL_BUILDER( \
       Name("BatchMatMul").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
       BatchMatMul<GPUDevice, TYPE>)
@@ -522,5 +521,5 @@ class BatchMatMul : public OpKernel {
   REGISTER_KERNEL_BUILDER( \
       Name("BatchMatMul").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
       BatchMatMul<SYCLDevice, TYPE>)
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 } // end namespace tensorflow

@@ -35,5 +35,5 @@ TF_CALL_half(REGISTER_BATCH_MATMUL_GPU);
 #ifdef TENSORFLOW_USE_SYCL
 TF_CALL_float(REGISTER_BATCH_MATMUL_SYCL);
 TF_CALL_double(REGISTER_BATCH_MATMUL_SYCL);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 } // namespace tensorflow

@@ -53,9 +53,10 @@ static Graph* BatchMatmul(int b, int m, int k, int n, bool adjoint_a,
 /* Uncomment to enable benchmarks for double & complex types: */
 // BM_BatchMatmulDev(B, M, K, N, TA, TB, std::complex<float>, DT_COMPLEX64,
 // gpu);
 // BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, cpu); \
-// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, cpu); \
+// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, cpu);
+// \
 // BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, gpu); \
 // BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, gpu);
 // Typical fully connected layers

@@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 template <typename Device, typename T>
 class BatchNormOp : public OpKernel {

@@ -54,7 +54,7 @@ TEST_F(BatchNormOpTest, Simple) {
   Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
   test::FillValues<float>(
       &expected, {-17.86f, -22.00f, -15.87f, -20.59f, -13.87f, -19.18f, -21.86f,
-                  -33.31f, -23.85f, -34.72f, -25.85f, -36.13f });
+                  -33.31f, -23.85f, -34.72f, -25.85f, -36.13f});
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.01);
 }

@@ -56,9 +56,10 @@ static void BatchToSpaceOpCompute(OpKernelContext* context,
               errors::InvalidArgument("input rank should be >= ", 1 + block_dims,
                                       " instead of ", orig_input_tensor.dims()));
-  OP_REQUIRES(context, TensorShapeUtils::IsMatrix(orig_crops.shape()) &&
-                           block_dims == orig_crops.dim_size(0) &&
-                           2 == orig_crops.dim_size(1),
+  OP_REQUIRES(context,
+              TensorShapeUtils::IsMatrix(orig_crops.shape()) &&
+                  block_dims == orig_crops.dim_size(0) &&
+                  2 == orig_crops.dim_size(1),
               errors::InvalidArgument("crops should have shape [", block_dims,
                                       ", 2] instead of ",
                                       orig_crops.shape().DebugString()));

@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/util/bcast.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/bcast.h"
 namespace tensorflow {

@@ -77,14 +77,14 @@ void BiasGPU<T>::compute(const GPUDevice& d, const T* input, const T* bias,
   }
   CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
   if (data_format == FORMAT_NHWC) {
-    BiasNHWCKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, input, bias, output, bias_size);
+    BiasNHWCKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, input, bias, output, bias_size);
   } else {
-    BiasNCHWKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, input, bias, output, bias_size,
-        image_size);
+    BiasNCHWKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, input, bias, output, bias_size,
+            image_size);
   }
 }
@@ -206,10 +206,10 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
   // Check if we have enough shared memory.
   if (shared_memory_size <= max_shared_memory_size) {
     if (data_format == FORMAT_NHWC) {
-      BiasGradNHWC_SharedAtomics<
-          T><<<config.block_count, config.thread_per_block, shared_memory_size,
-               d.stream()>>>(total_count, output_backprop, bias_backprop,
-                             bias_size);
+      BiasGradNHWC_SharedAtomics<T>
+          <<<config.block_count, config.thread_per_block, shared_memory_size,
+             d.stream()>>>(total_count, output_backprop, bias_backprop,
+                           bias_size);
     } else {
       // Round up the block count to multiple of bias_size.
       int group_size = (config.block_count + bias_size - 1) / bias_size;
@@ -217,23 +217,24 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
       if (config.thread_per_block < kWarpSize) {
         config.thread_per_block = kWarpSize;
       }
-      BiasGradNCHW_SharedAtomics<
-          T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          output_backprop, bias_backprop, batch, bias_size, image_size,
-          group_size);
+      BiasGradNCHW_SharedAtomics<T>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              output_backprop, bias_backprop, batch, bias_size, image_size,
+              group_size);
     }
   } else {
     // Note that even if we don't have enough shared memory to fit the entire
     // output block, it is possible to process one group of elements at a time.
     // But for now, we simply fall back to the naive implementation.
     if (data_format == FORMAT_NHWC) {
-      BiasGradNHWC_Naive<
-          T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          total_count, output_backprop, bias_backprop, bias_size);
+      BiasGradNHWC_Naive<T>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              total_count, output_backprop, bias_backprop, bias_size);
     } else {
-      BiasGradNCHW_Naive<
-          T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          total_count, output_backprop, bias_backprop, bias_size, image_size);
+      BiasGradNCHW_Naive<T>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              total_count, output_backprop, bias_backprop, bias_size,
+              image_size);
     }
   }
 }

@@ -48,7 +48,7 @@ EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC const T SubtleMustCopy(const T &x) {
   auto *to_x = reinterpret_cast<const volatile T *>(&x);
   return *to_x;
 }
-} // namespace tensorflow::internal
+} // namespace internal
 } // namespace tensorflow
 #endif // TENSORFLOW_UTIL_BOUNDS_CHECK_H_

@@ -126,13 +126,13 @@ REGISTER_KERNEL_BUILDER(Name("UniformCandidateSampler").Device(DEVICE_CPU),
 REGISTER_KERNEL_BUILDER(Name("LogUniformCandidateSampler").Device(DEVICE_CPU),
                         SimpleCandidateSamplerOp<LogUniformSampler>);
-REGISTER_KERNEL_BUILDER(Name("LearnedUnigramCandidateSampler")
-                            .Device(DEVICE_CPU),
-                        SimpleCandidateSamplerOp<UnigramSampler>);
+REGISTER_KERNEL_BUILDER(
+    Name("LearnedUnigramCandidateSampler").Device(DEVICE_CPU),
+    SimpleCandidateSamplerOp<UnigramSampler>);
-REGISTER_KERNEL_BUILDER(Name("ThreadUnsafeUnigramCandidateSampler")
-                            .Device(DEVICE_CPU),
-                        SimpleCandidateSamplerOp<ThreadUnsafeUnigramSampler>);
+REGISTER_KERNEL_BUILDER(
+    Name("ThreadUnsafeUnigramCandidateSampler").Device(DEVICE_CPU),
+    SimpleCandidateSamplerOp<ThreadUnsafeUnigramSampler>);
 class AllCandidateSamplerOp : public BaseCandidateSamplerOp {
  public:
@@ -197,8 +197,9 @@ class ComputeAccidentalHitsOp : public OpKernel {
   void Compute(OpKernelContext* context) override {
     const Tensor& in_true_candidates = context->input(0);
     const TensorShape& in_true_candidates_shape = in_true_candidates.shape();
-    OP_REQUIRES(context, TensorShapeUtils::IsMatrix(in_true_candidates_shape) &&
-                             in_true_candidates_shape.dim_size(1) == num_true_,
+    OP_REQUIRES(context,
+                TensorShapeUtils::IsMatrix(in_true_candidates_shape) &&
+                    in_true_candidates_shape.dim_size(1) == num_true_,
                 errors::InvalidArgument(
                     "true_candidates must be a batch_size * num_true matrix"));


@ -36,7 +36,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice; typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice; typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
#define CURRY_TYPES2(FN, arg0) \ #define CURRY_TYPES2(FN, arg0) \
FN(arg0, bool); \ FN(arg0, bool); \
@ -223,11 +223,11 @@ class SyclCastOp : public CastOpBase {
} }
}; };
#define REGISTER_CAST_SYCL(srctype, dsttype) \ #define REGISTER_CAST_SYCL(srctype, dsttype) \
REGISTER_KERNEL_BUILDER(Name("Cast") \ REGISTER_KERNEL_BUILDER(Name("Cast") \
.TypeConstraint<srctype>("SrcT") \ .TypeConstraint<srctype>("SrcT") \
.TypeConstraint<dsttype>("DstT") \ .TypeConstraint<dsttype>("DstT") \
.Device(DEVICE_SYCL), \ .Device(DEVICE_SYCL), \
SyclCastOp) SyclCastOp)
CURRY_TYPES2(REGISTER_CAST_SYCL, bool); CURRY_TYPES2(REGISTER_CAST_SYCL, bool);
CURRY_TYPES2(REGISTER_CAST_SYCL, int32); CURRY_TYPES2(REGISTER_CAST_SYCL, int32);
@ -237,7 +237,7 @@ CURRY_TYPES2(REGISTER_CAST_SYCL, double);
#undef REGISTER_CAST_SYCL #undef REGISTER_CAST_SYCL
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
#undef CURRY_TYPES2 #undef CURRY_TYPES2
@ -250,6 +250,5 @@ REGISTER_KERNEL_BUILDER(
REGISTER_KERNEL_BUILDER( REGISTER_KERNEL_BUILDER(
Name("_HostCast").Device(DEVICE_SYCL).HostMemory("x").HostMemory("y"), Name("_HostCast").Device(DEVICE_SYCL).HostMemory("x").HostMemory("y"),
CpuCastOp); CpuCastOp);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // end namespace tensorflow } // end namespace tensorflow


@ -131,7 +131,8 @@ struct scalar_cast_op<::tensorflow::bfloat16, float> {
p[0] = a.value; p[0] = a.value;
p[1] = 0; p[1] = 0;
#else #else
static_assert(::tensorflow::port::kLittleEndian, "Not a little endian system!"); static_assert(::tensorflow::port::kLittleEndian,
"Not a little endian system!");
p[0] = 0; p[0] = 0;
p[1] = a.value; p[1] = a.value;
#endif #endif
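Aside, not part of the patch: the hunk above only re-wraps a static_assert inside scalar_cast_op<bfloat16, float>, and the intent of that cast is easy to lose in diff form. bfloat16 stores the top 16 bits of an IEEE-754 binary32 value, so widening it to float simply places those bits in the high half of a 32-bit word. Below is a minimal standalone sketch of the same conversion using an explicit shift instead of per-endianness uint16 writes; the function name is illustrative only.

#include <cstdint>
#include <cstring>

// Widen a bfloat16 bit pattern to float: the 16 stored bits become the high
// half of the binary32 representation and the low 16 bits are zero-filled.
float BFloat16BitsToFloat(uint16_t bits) {
  uint32_t widened = static_cast<uint32_t>(bits) << 16;
  float result;
  std::memcpy(&result, &widened, sizeof(result));  // bit-exact reinterpretation
  return result;
}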


@ -41,25 +41,25 @@ struct CastFunctor<Eigen::SyclDevice, O, I> {
o.device(d) = i.template cast<O>(); o.device(d) = i.template cast<O>();
} }
}; };
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace functor } // namespace functor
#define CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \ #define CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \
FN(arg0, arg1, bool); \ FN(arg0, arg1, bool); \
FN(arg0, arg1, uint8); \ FN(arg0, arg1, uint8); \
FN(arg0, arg1, int8); \ FN(arg0, arg1, int8); \
FN(arg0, arg1, uint16); \ FN(arg0, arg1, uint16); \
FN(arg0, arg1, int16); \ FN(arg0, arg1, int16); \
FN(arg0, arg1, int32); \ FN(arg0, arg1, int32); \
FN(arg0, arg1, int64); \ FN(arg0, arg1, int64); \
FN(arg0, arg1, float); \ FN(arg0, arg1, float); \
FN(arg0, arg1, double); \ FN(arg0, arg1, double); \
FN(arg0, arg1, std::complex<float>); \ FN(arg0, arg1, std::complex<float>); \
FN(arg0, arg1, std::complex<double>) FN(arg0, arg1, std::complex<double>)
#define CURRY_TYPES3(FN, arg0, arg1) \ #define CURRY_TYPES3(FN, arg0, arg1) \
CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \ CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \
FN(arg0, arg1, Eigen::half); FN(arg0, arg1, Eigen::half);
#define CAST_CASE(DEVICE, IN, OUT) \ #define CAST_CASE(DEVICE, IN, OUT) \


@ -107,10 +107,10 @@ static void BM_gpu_float_int64(int iters, int num) {
testing::UseRealTime(); testing::UseRealTime();
#if GOOGLE_CUDA #if GOOGLE_CUDA
test::Benchmark("gpu", Cast<float, int64>(num)).Run(iters); test::Benchmark("gpu", Cast<float, int64>(num)).Run(iters);
#endif // GOOGLE_CUDA #endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
test::Benchmark("sycl", Cast<float, int64>(num)).Run(iters); test::Benchmark("sycl", Cast<float, int64>(num)).Run(iters);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} }
BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20); BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20);
@ -130,10 +130,10 @@ static void BM_gpu_bool_float(int iters, int num) {
testing::UseRealTime(); testing::UseRealTime();
#if GOOGLE_CUDA #if GOOGLE_CUDA
test::Benchmark("gpu", Cast<bool, float>(num)).Run(iters); test::Benchmark("gpu", Cast<bool, float>(num)).Run(iters);
#endif // GOOGLE_CUDA #endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
test::Benchmark("sycl", Cast<bool, float>(num)).Run(iters); test::Benchmark("sycl", Cast<bool, float>(num)).Run(iters);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} }
BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20); BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20);
@ -180,7 +180,7 @@ static void BM_gpu_float_half(int iters, int num) {
testing::UseRealTime(); testing::UseRealTime();
#if GOOGLE_CUDA #if GOOGLE_CUDA
test::Benchmark("gpu", Cast<float, Eigen::half>(num)).Run(iters); test::Benchmark("gpu", Cast<float, Eigen::half>(num)).Run(iters);
#endif // GOOGLE_CUDA #endif // GOOGLE_CUDA
} }
BENCHMARK(BM_gpu_float_half)->Arg(64 << 10)->Arg(32 << 20); BENCHMARK(BM_gpu_float_half)->Arg(64 << 10)->Arg(32 << 20);
@ -191,7 +191,7 @@ static void BM_gpu_half_float(int iters, int num) {
testing::UseRealTime(); testing::UseRealTime();
#if GOOGLE_CUDA #if GOOGLE_CUDA
test::Benchmark("gpu", Cast<Eigen::half, float>(num)).Run(iters); test::Benchmark("gpu", Cast<Eigen::half, float>(num)).Run(iters);
#endif // GOOGLE_CUDA #endif // GOOGLE_CUDA
} }
BENCHMARK(BM_gpu_half_float)->Arg(64 << 10)->Arg(32 << 20); BENCHMARK(BM_gpu_half_float)->Arg(64 << 10)->Arg(32 << 20);


@ -107,14 +107,14 @@ class HSVToRGBOp : public OpKernel {
} }
}; };
#define REGISTER_CPU(T) \ #define REGISTER_CPU(T) \
REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_CPU) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("RGBToHSV").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
RGBToHSVOp<CPUDevice, T>); \ RGBToHSVOp<CPUDevice, T>); \
template class RGBToHSVOp<CPUDevice, T>; \ template class RGBToHSVOp<CPUDevice, T>; \
REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_CPU) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("HSVToRGB").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
HSVToRGBOp<CPUDevice, T>); \ HSVToRGBOp<CPUDevice, T>); \
template class HSVToRGBOp<CPUDevice, T>; template class HSVToRGBOp<CPUDevice, T>;
TF_CALL_float(REGISTER_CPU); TF_CALL_float(REGISTER_CPU);
TF_CALL_double(REGISTER_CPU); TF_CALL_double(REGISTER_CPU);
@ -123,40 +123,39 @@ TF_CALL_double(REGISTER_CPU);
// Forward declarations of the function specializations for GPU (to prevent // Forward declarations of the function specializations for GPU (to prevent
// building the GPU versions here, they will be built compiling _gpu.cu.cc). // building the GPU versions here, they will be built compiling _gpu.cu.cc).
namespace functor { namespace functor {
#define DECLARE_GPU(T) \ #define DECLARE_GPU(T) \
template <> \ template <> \
void RGBToHSV<GPUDevice, T>::operator()(const GPUDevice& d, \ void RGBToHSV<GPUDevice, T>::operator()( \
TTypes<T, 2>::ConstTensor input_data, \ const GPUDevice& d, TTypes<T, 2>::ConstTensor input_data, \
TTypes<T, 1>::Tensor range, \ TTypes<T, 1>::Tensor range, TTypes<T, 2>::Tensor output_data); \
TTypes<T, 2>::Tensor output_data); \ extern template struct RGBToHSV<GPUDevice, T>; \
extern template struct RGBToHSV<GPUDevice, T>; \ template <> \
template <> \ void HSVToRGB<GPUDevice, T>::operator()( \
void HSVToRGB<GPUDevice, T>::operator()(const GPUDevice& d, \ const GPUDevice& d, TTypes<T, 2>::ConstTensor input_data, \
TTypes<T, 2>::ConstTensor input_data, \ TTypes<T, 2>::Tensor output_data); \
TTypes<T, 2>::Tensor output_data); \
extern template struct HSVToRGB<GPUDevice, T>; extern template struct HSVToRGB<GPUDevice, T>;
TF_CALL_float(DECLARE_GPU); TF_CALL_float(DECLARE_GPU);
TF_CALL_double(DECLARE_GPU); TF_CALL_double(DECLARE_GPU);
} // namespace functor } // namespace functor
#define REGISTER_GPU(T) \ #define REGISTER_GPU(T) \
REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_GPU) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("RGBToHSV").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
RGBToHSVOp<GPUDevice, T>); \ RGBToHSVOp<GPUDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_GPU) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("HSVToRGB").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
HSVToRGBOp<GPUDevice, T>); HSVToRGBOp<GPUDevice, T>);
TF_CALL_float(REGISTER_GPU); TF_CALL_float(REGISTER_GPU);
TF_CALL_double(REGISTER_GPU); TF_CALL_double(REGISTER_GPU);
#endif #endif
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL(T) \ #define REGISTER_SYCL(T) \
REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_SYCL) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("RGBToHSV").Device(DEVICE_SYCL).TypeConstraint<T>("T"), \
RGBToHSVOp<SYCLDevice, T>); \ RGBToHSVOp<SYCLDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_SYCL) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("HSVToRGB").Device(DEVICE_SYCL).TypeConstraint<T>("T"), \
HSVToRGBOp<SYCLDevice, T>); HSVToRGBOp<SYCLDevice, T>);
TF_CALL_float(REGISTER_SYCL); TF_CALL_float(REGISTER_SYCL);
TF_CALL_double(REGISTER_SYCL); TF_CALL_double(REGISTER_SYCL);
#endif #endif


@ -54,10 +54,9 @@ struct RGBToHSV {
// TODO(wicke): all these assignments are only necessary because a combined // TODO(wicke): all these assignments are only necessary because a combined
// expression is larger than kernel parameter space. A custom kernel is // expression is larger than kernel parameter space. A custom kernel is
// probably in order. // probably in order.
H.device(d) = (R == V).select(norm * (G - B), H.device(d) = (R == V).select(
(G == V).select( norm * (G - B), (G == V).select(norm * (B - R) + T(2) / T(6),
norm * (B - R) + T(2) / T(6), norm * (R - G) + T(4) / T(6)));
norm * (R - G) + T(4) / T(6)));
H.device(d) = (range > T(0)).select(H, H.constant(T(0))); H.device(d) = (range > T(0)).select(H, H.constant(T(0)));
H.device(d) = (H < T(0)).select(H + T(1), H); H.device(d) = (H < T(0)).select(H + T(1), H);
} }
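Aside, not from this commit: the re-wrapped Eigen select chain above is dense, so here is a scalar sketch of the per-pixel hue it evaluates, assuming (as in the surrounding kernel) that V is the channel maximum, range is V minus the channel minimum, and norm equals 1 / (6 * range). The helper name is illustrative only.

#include <algorithm>

// Hue in [0, 1) for one RGB pixel, mirroring the branch structure of the
// (R == V) / (G == V) select chain above.
float HueFromRGB(float r, float g, float b) {
  const float v = std::max({r, g, b});
  const float range = v - std::min({r, g, b});
  if (range <= 0.0f) return 0.0f;        // grey pixel: hue is defined as 0
  const float norm = 1.0f / (6.0f * range);
  float h;
  if (r == v) {
    h = norm * (g - b);                  // red is the maximum channel
  } else if (g == v) {
    h = norm * (b - r) + 2.0f / 6.0f;    // green is the maximum channel
  } else {
    h = norm * (r - g) + 4.0f / 6.0f;    // blue is the maximum channel
  }
  return h < 0.0f ? h + 1.0f : h;        // wrap negative hues into [0, 1)
}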


@ -17,8 +17,8 @@ limitations under the License.
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "tensorflow/core/kernels/colorspace_op.h"
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/colorspace_op.h"
namespace tensorflow { namespace tensorflow {
@ -29,6 +29,6 @@ typedef Eigen::GpuDevice GPUDevice;
template class functor::HSVToRGB<GPUDevice, T>; template class functor::HSVToRGB<GPUDevice, T>;
TF_CALL_float(INSTANTIATE_GPU); TF_CALL_float(INSTANTIATE_GPU);
TF_CALL_double(INSTANTIATE_GPU); TF_CALL_double(INSTANTIATE_GPU);
} } // namespace tensorflow
#endif // GOOGLE_CUDA #endif // GOOGLE_CUDA


@ -224,34 +224,34 @@ class HSVToRGBOpTest : public OpsTestBase {
} }
}; };
#define TEST_COLORSPACE(test, dt) \ #define TEST_COLORSPACE(test, dt) \
TEST_F(test, CheckBlack) { \ TEST_F(test, CheckBlack) { \
MakeOp(dt); \ MakeOp(dt); \
CheckBlack(dt); \ CheckBlack(dt); \
} \ } \
TEST_F(test, CheckGray) { \ TEST_F(test, CheckGray) { \
MakeOp(dt); \ MakeOp(dt); \
CheckGray(dt); \ CheckGray(dt); \
} \ } \
TEST_F(test, CheckWhite) { \ TEST_F(test, CheckWhite) { \
MakeOp(dt); \ MakeOp(dt); \
CheckWhite(dt); \ CheckWhite(dt); \
} \ } \
TEST_F(test, CheckRedMax) { \ TEST_F(test, CheckRedMax) { \
MakeOp(dt); \ MakeOp(dt); \
CheckRedMax(dt); \ CheckRedMax(dt); \
} \ } \
TEST_F(test, CheckGreenMax) { \ TEST_F(test, CheckGreenMax) { \
MakeOp(dt); \ MakeOp(dt); \
CheckGreenMax(dt); \ CheckGreenMax(dt); \
} \ } \
TEST_F(test, CheckBlueMax) { \ TEST_F(test, CheckBlueMax) { \
MakeOp(dt); \ MakeOp(dt); \
CheckBlueMax(dt); \ CheckBlueMax(dt); \
} \ } \
TEST_F(test, CheckNegativeDifference) { \ TEST_F(test, CheckNegativeDifference) { \
MakeOp(dt); \ MakeOp(dt); \
CheckNegativeDifference(dt); \ CheckNegativeDifference(dt); \
} }
typedef RGBToHSVOpTest<float> rgb_to_hsv_float; typedef RGBToHSVOpTest<float> rgb_to_hsv_float;


@ -41,10 +41,11 @@ namespace tensorflow {
// Assumes all inputs are nonempty // Assumes all inputs are nonempty
template <typename T> template <typename T>
void ConcatCPU(DeviceBase* d, void ConcatCPU(
const std::vector< DeviceBase* d,
std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs, const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
typename TTypes<T, 2>::Matrix* output); inputs,
typename TTypes<T, 2>::Matrix* output);
#if GOOGLE_CUDA #if GOOGLE_CUDA
template <typename T> template <typename T>
void ConcatGPU( void ConcatGPU(
@ -57,11 +58,12 @@ void ConcatGPU(
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
template <typename T> template <typename T>
void ConcatSYCL(const Eigen::SyclDevice& d, void ConcatSYCL(
const std::vector< const Eigen::SyclDevice& d,
std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs, const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
typename TTypes<T, 2>::Matrix* output); inputs,
#endif // TENSORFLOW_USE_SYCL typename TTypes<T, 2>::Matrix* output);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow
#endif // TENSORFLOW_KERNELS_CONCAT_LIB_H_ #endif // TENSORFLOW_KERNELS_CONCAT_LIB_H_


@ -48,10 +48,11 @@ struct MemCpyCopier<ResourceHandle> {
} // namespace } // namespace
template <typename T> template <typename T>
void ConcatCPU(DeviceBase* d, void ConcatCPU(
const std::vector< DeviceBase* d,
std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs, const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
typename TTypes<T, 2>::Matrix* output) { inputs,
typename TTypes<T, 2>::Matrix* output) {
if (std::is_same<T, string>::value) { if (std::is_same<T, string>::value) {
// use a large cost here to force strings to be handled by separate threads // use a large cost here to force strings to be handled by separate threads
ConcatCPUImpl<T>(d, inputs, 100000, MemCpyCopier<T>(), output); ConcatCPUImpl<T>(d, inputs, 100000, MemCpyCopier<T>(), output);
@ -86,21 +87,22 @@ TF_CALL_variant(REGISTER)
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
template <typename T> template <typename T>
void ConcatSYCL(const Eigen::SyclDevice& d, void ConcatSYCL(
const std::vector< const Eigen::SyclDevice& d,
std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs, const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
typename TTypes<T, 2>::Matrix* output) { inputs,
typename TTypes<T, 2>::Matrix* output) {
ConcatSYCLImpl<T>(d, inputs, sizeof(T) /* cost_per_unit */, MemCpyCopier<T>(), ConcatSYCLImpl<T>(d, inputs, sizeof(T) /* cost_per_unit */, MemCpyCopier<T>(),
output); output);
} }
#define REGISTER_SYCL(T) \ #define REGISTER_SYCL(T) \
template void ConcatSYCL<T>( \ template void ConcatSYCL<T>( \
const Eigen::SyclDevice&, \ const Eigen::SyclDevice&, \
const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&, \ const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&, \
typename TTypes<T, 2>::Matrix* output); typename TTypes<T, 2>::Matrix* output);
TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL) TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL)
#undef REGISTER_SYCL #undef REGISTER_SYCL
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow


@ -15,9 +15,9 @@ limitations under the License.
#define EIGEN_USE_THREADS #define EIGEN_USE_THREADS
#include "tensorflow/core/kernels/concat_lib.h"
#include <vector> #include <vector>
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/concat_lib.h"
#include "tensorflow/core/util/work_sharder.h" #include "tensorflow/core/util/work_sharder.h"
namespace tensorflow { namespace tensorflow {
@ -73,7 +73,7 @@ void ConcatCPUImpl(
// Sharded mode. // Sharded mode.
auto work = [&row_size, &sizes, &inputs, &output, &copier, &num_inputs]( auto work = [&row_size, &sizes, &inputs, &output, &copier, &num_inputs](
int64 start, int64 end) { int64 start, int64 end) {
int64 skipped_rows = start / row_size; int64 skipped_rows = start / row_size;
T* out = output->data() + skipped_rows * row_size; T* out = output->data() + skipped_rows * row_size;
T* out_start = output->data() + start; T* out_start = output->data() + start;
@ -160,5 +160,5 @@ void ConcatSYCLImpl(
} }
} }
} }
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow


@ -37,7 +37,7 @@ typedef Eigen::GpuDevice GPUDevice;
#endif // GOOGLE_CUDA #endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice; typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM }; enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM };
@ -71,8 +71,9 @@ class ConcatBaseOp : public OpKernel {
const TensorShape& input_shape = values[0].shape(); const TensorShape& input_shape = values[0].shape();
int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim; int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim;
OP_REQUIRES(c, (0 <= axis && axis < input_dims) || OP_REQUIRES(c,
(allow_legacy_scalars() && concat_dim == 0), (0 <= axis && axis < input_dims) ||
(allow_legacy_scalars() && concat_dim == 0),
errors::InvalidArgument( errors::InvalidArgument(
"ConcatOp : Expected concatenating dimensions in the range " "ConcatOp : Expected concatenating dimensions in the range "
"[", "[",
@ -97,8 +98,8 @@ class ConcatBaseOp : public OpKernel {
c, in.dims() == input_dims || (input_is_scalar && in_is_scalar), c, in.dims() == input_dims || (input_is_scalar && in_is_scalar),
errors::InvalidArgument( errors::InvalidArgument(
"ConcatOp : Ranks of all input tensors should match: shape[0] = ", "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
input_shape.DebugString(), " vs. shape[", i, "] = ", input_shape.DebugString(), " vs. shape[", i,
in.shape().DebugString())); "] = ", in.shape().DebugString()));
for (int j = 0; j < input_dims; ++j) { for (int j = 0; j < input_dims; ++j) {
if (j == axis) { if (j == axis) {
continue; continue;
@ -107,8 +108,8 @@ class ConcatBaseOp : public OpKernel {
c, in.dim_size(j) == input_shape.dim_size(j), c, in.dim_size(j) == input_shape.dim_size(j),
errors::InvalidArgument( errors::InvalidArgument(
"ConcatOp : Dimensions of inputs should match: shape[0] = ", "ConcatOp : Dimensions of inputs should match: shape[0] = ",
input_shape.DebugString(), " vs. shape[", i, "] = ", input_shape.DebugString(), " vs. shape[", i,
in.shape().DebugString())); "] = ", in.shape().DebugString()));
} }
if (in.NumElements() > 0) { if (in.NumElements() > 0) {
int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0; int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0;
@ -142,7 +143,7 @@ class ConcatBaseOp : public OpKernel {
ConcatSYCL<T>(c->eigen_sycl_device(), inputs_flat, &output_flat); ConcatSYCL<T>(c->eigen_sycl_device(), inputs_flat, &output_flat);
return; return;
} }
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
ConcatCPU<T>(c->device(), inputs_flat, &output_flat); ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
} }
} }
@ -252,7 +253,7 @@ REGISTER_KERNEL_BUILDER(Name("ConcatV2")
ConcatV2Op<CPUDevice, int32>); ConcatV2Op<CPUDevice, int32>);
#undef REGISTER_SYCL #undef REGISTER_SYCL
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
class ConcatOffsetOp : public OpKernel { class ConcatOffsetOp : public OpKernel {
public: public:
@ -347,5 +348,5 @@ REGISTER_KERNEL_BUILDER(Name("ConcatOffset")
.HostMemory("shape") .HostMemory("shape")
.HostMemory("offset"), .HostMemory("offset"),
ConcatOffsetOp); ConcatOffsetOp);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow


@ -157,7 +157,8 @@ BENCHMARK(BM_MemcpyAlternativeDim0)->Arg(1000)->Arg(100000)->Arg(1000000);
BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000); BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000);
typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>, typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>,
Eigen::Unaligned> EigenMap; Eigen::Unaligned>
EigenMap;
static void MemcpyManyAlternative1(int iters, int dim2) { static void MemcpyManyAlternative1(int iters, int dim2) {
testing::StopTiming(); testing::StopTiming();


@ -160,7 +160,7 @@ class ConditionalAccumulatorBase : public ResourceBase {
* Modifications to convenience macros defined in core/framework/op_kernel.h. * Modifications to convenience macros defined in core/framework/op_kernel.h.
* The below macros return a boolean if the test fails, so that the calling * The below macros return a boolean if the test fails, so that the calling
* function can get an indication that a failure has occurred. * function can get an indication that a failure has occurred.
*/ */
#define OP_REQUIRES_BOOLEAN(CTX, EXP, STATUS) \ #define OP_REQUIRES_BOOLEAN(CTX, EXP, STATUS) \
do { \ do { \
if (!TF_PREDICT_TRUE(EXP)) { \ if (!TF_PREDICT_TRUE(EXP)) { \


@ -99,9 +99,10 @@ class AccumulatorTakeGradientOp
ConditionalAccumulatorBase* accumulator, ConditionalAccumulatorBase* accumulator,
DoneCallback callback) override { DoneCallback callback) override {
// Check signature // Check signature
OP_REQUIRES_OK_ASYNC(ctx, ctx->MatchSignature({DT_STRING_REF, DT_INT32}, OP_REQUIRES_OK_ASYNC(
{accumulator->dtype()}), ctx,
callback); ctx->MatchSignature({DT_STRING_REF, DT_INT32}, {accumulator->dtype()}),
callback);
} }
private: private:
@ -111,5 +112,4 @@ class AccumulatorTakeGradientOp
REGISTER_KERNEL_BUILDER(Name("AccumulatorTakeGradient").Device(DEVICE_CPU), REGISTER_KERNEL_BUILDER(Name("AccumulatorTakeGradient").Device(DEVICE_CPU),
AccumulatorTakeGradientOp); AccumulatorTakeGradientOp);
} // namespace tensorflow } // namespace tensorflow


@ -146,7 +146,6 @@ typedef Eigen::GpuDevice GPUDevice;
typedef Eigen::SyclDevice SYCLDevice; typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
template <typename Device, typename T, typename Index> template <typename Device, typename T, typename Index>
class FillOp : public OpKernel { class FillOp : public OpKernel {
public: public:


@ -113,47 +113,47 @@ REGISTER_GPU_HOST_REF_KERNEL(string);
#undef REGISTER_GPU_HOST_REF_KERNEL #undef REGISTER_GPU_HOST_REF_KERNEL
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_SWITCH(type) \ #define REGISTER_SYCL_SWITCH(type) \
REGISTER_KERNEL_BUILDER(Name("Switch") \ REGISTER_KERNEL_BUILDER(Name("Switch") \
.Device(DEVICE_SYCL) \ .Device(DEVICE_SYCL) \
.HostMemory("pred") \ .HostMemory("pred") \
.TypeConstraint<type>("T"),\ .TypeConstraint<type>("T"), \
SwitchOp) SwitchOp)
TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_SWITCH); TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_SWITCH);
#define REGISTER_SYCL_REF_SWITCH(type) \ #define REGISTER_SYCL_REF_SWITCH(type) \
REGISTER_KERNEL_BUILDER(Name("RefSwitch") \ REGISTER_KERNEL_BUILDER(Name("RefSwitch") \
.Device(DEVICE_SYCL) \ .Device(DEVICE_SYCL) \
.HostMemory("pred") \ .HostMemory("pred") \
.TypeConstraint<type>("T"), \ .TypeConstraint<type>("T"), \
SwitchOp) SwitchOp)
TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_SWITCH); TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_SWITCH);
#undef REGISTER_SYCL_SWITCH #undef REGISTER_SYCL_SWITCH
#undef REGISTER_SYCL_REF_SWITCH #undef REGISTER_SYCL_REF_SWITCH
#define REGISTER_SYCL_HOST_KERNEL(type) \ #define REGISTER_SYCL_HOST_KERNEL(type) \
REGISTER_KERNEL_BUILDER(Name("Switch") \ REGISTER_KERNEL_BUILDER(Name("Switch") \
.Device(DEVICE_SYCL) \ .Device(DEVICE_SYCL) \
.HostMemory("data") \ .HostMemory("data") \
.HostMemory("pred") \ .HostMemory("pred") \
.HostMemory("output_false")\ .HostMemory("output_false") \
.HostMemory("output_true") \ .HostMemory("output_true") \
.TypeConstraint<type>("T"),\ .TypeConstraint<type>("T"), \
SwitchOp) SwitchOp)
REGISTER_SYCL_HOST_KERNEL(bool); REGISTER_SYCL_HOST_KERNEL(bool);
REGISTER_SYCL_HOST_KERNEL(string); REGISTER_SYCL_HOST_KERNEL(string);
REGISTER_SYCL_HOST_KERNEL(int32); REGISTER_SYCL_HOST_KERNEL(int32);
#define REGISTER_SYCL_HOST_REF_KERNEL(type) \ #define REGISTER_SYCL_HOST_REF_KERNEL(type) \
REGISTER_KERNEL_BUILDER(Name("RefSwitch") \ REGISTER_KERNEL_BUILDER(Name("RefSwitch") \
.Device(DEVICE_SYCL) \ .Device(DEVICE_SYCL) \
.HostMemory("data") \ .HostMemory("data") \
.HostMemory("pred") \ .HostMemory("pred") \
.HostMemory("output_false") \ .HostMemory("output_false") \
.HostMemory("output_true") \ .HostMemory("output_true") \
.TypeConstraint<type>("T"), \ .TypeConstraint<type>("T"), \
SwitchOp) SwitchOp)
REGISTER_SYCL_HOST_REF_KERNEL(int32); REGISTER_SYCL_HOST_REF_KERNEL(int32);
@ -162,7 +162,7 @@ REGISTER_SYCL_HOST_REF_KERNEL(string);
#undef REGISTER_SYCL_HOST_KERNEL #undef REGISTER_SYCL_HOST_KERNEL
#undef REGISTER_SYCL_HOST_REF_KERNEL #undef REGISTER_SYCL_HOST_REF_KERNEL
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
class RefSelectOp : public OpKernel { class RefSelectOp : public OpKernel {
public: public:
@ -282,7 +282,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
#undef REGISTER_SYCL_KERNEL #undef REGISTER_SYCL_KERNEL
#undef REGISTER_SYCL_REF_KERNEL #undef REGISTER_SYCL_REF_KERNEL
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
// Special GPU kernels for int32 and string. // Special GPU kernels for int32 and string.
// TODO(b/25387198): Also enable int32 in device memory. This kernel // TODO(b/25387198): Also enable int32 in device memory. This kernel
@ -331,7 +331,7 @@ REGISTER_SYCL_HOST_KERNEL(string);
REGISTER_SYCL_HOST_KERNEL(ResourceHandle); REGISTER_SYCL_HOST_KERNEL(ResourceHandle);
#undef REGISTER_SYCL_HOST_KERNEL #undef REGISTER_SYCL_HOST_KERNEL
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
void EnterOp::Compute(OpKernelContext* context) { void EnterOp::Compute(OpKernelContext* context) {
if (IsRefType(context->input_dtype(0))) { if (IsRefType(context->input_dtype(0))) {
@ -360,14 +360,14 @@ REGISTER_GPU_REF_KERNEL(bool);
#undef REGISTER_GPU_REF_KERNEL #undef REGISTER_GPU_REF_KERNEL
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(type) \ #define REGISTER_SYCL_KERNEL(type) \
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
Name("Enter").Device(DEVICE_SYCL).TypeConstraint<type>("T"), EnterOp) Name("Enter").Device(DEVICE_SYCL).TypeConstraint<type>("T"), EnterOp)
REGISTER_SYCL_KERNEL(bool); REGISTER_SYCL_KERNEL(bool);
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
#define REGISTER_SYCL_REF_KERNEL(type) \ #define REGISTER_SYCL_REF_KERNEL(type) \
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
Name("RefEnter").Device(DEVICE_SYCL).TypeConstraint<type>("T"), EnterOp) Name("RefEnter").Device(DEVICE_SYCL).TypeConstraint<type>("T"), EnterOp)
REGISTER_SYCL_REF_KERNEL(bool); REGISTER_SYCL_REF_KERNEL(bool);
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
@ -398,7 +398,7 @@ REGISTER_SYCL_HOST_KERNEL(ResourceHandle);
#undef REGISTER_SYCL_HOST_KERNEL #undef REGISTER_SYCL_HOST_KERNEL
#undef REGISTER_SYCL_HOST_REF_KERNEL #undef REGISTER_SYCL_HOST_REF_KERNEL
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
// Special GPU kernels for int32 and string. // Special GPU kernels for int32 and string.
// TODO(b/25387198): Also enable int32 in device memory. This kernel // TODO(b/25387198): Also enable int32 in device memory. This kernel
@ -455,10 +455,10 @@ REGISTER_GPU_REF_KERNEL(bool);
#undef REGISTER_GPU_REF_KERNEL #undef REGISTER_GPU_REF_KERNEL
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(type) \ #define REGISTER_SYCL_KERNEL(type) \
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
Name("Exit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp); \ Name("Exit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp); \
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
Name("RefExit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp); Name("RefExit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp);
REGISTER_SYCL_KERNEL(bool); REGISTER_SYCL_KERNEL(bool);
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
@ -483,7 +483,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
REGISTER_SYCL_HOST_KERNEL(int32); REGISTER_SYCL_HOST_KERNEL(int32);
REGISTER_SYCL_HOST_KERNEL(string); REGISTER_SYCL_HOST_KERNEL(string);
#undef REGISTER_SYCL_HOST_KERNEL #undef REGISTER_SYCL_HOST_KERNEL
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
// Special GPU kernels for int32 and string. // Special GPU kernels for int32 and string.
// TODO(b/25387198): Also enable int32 in device memory. This kernel // TODO(b/25387198): Also enable int32 in device memory. This kernel
@ -556,12 +556,12 @@ REGISTER_GPU_HOST_KERNEL(string);
#undef REGISTER_GPU_HOST_KERNEL #undef REGISTER_GPU_HOST_KERNEL
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(type) \ #define REGISTER_SYCL_KERNEL(type) \
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \ Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
NextIterationOp); \ NextIterationOp); \
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
Name("RefNextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\ Name("RefNextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
NextIterationOp) NextIterationOp)
REGISTER_SYCL_KERNEL(bool); REGISTER_SYCL_KERNEL(bool);
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
@ -585,7 +585,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
REGISTER_SYCL_HOST_KERNEL(int32); REGISTER_SYCL_HOST_KERNEL(int32);
REGISTER_SYCL_HOST_KERNEL(string); REGISTER_SYCL_HOST_KERNEL(string);
#undef REGISTER_SYCL_HOST_KERNEL #undef REGISTER_SYCL_HOST_KERNEL
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
// A LoopCond op has one input and one output. The input is a boolean // A LoopCond op has one input and one output. The input is a boolean
// scalar representing the taken branches of the "pivot" Switch that // scalar representing the taken branches of the "pivot" Switch that
@ -619,7 +619,7 @@ REGISTER_KERNEL_BUILDER(Name("LoopCond")
.HostMemory("input") .HostMemory("input")
.HostMemory("output"), .HostMemory("output"),
LoopCondOp); LoopCondOp);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
// ControlTrigger kernels // ControlTrigger kernels
REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_CPU), REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_CPU),
@ -631,7 +631,7 @@ REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_GPU),
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_SYCL), REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_SYCL),
ControlTriggerOp); ControlTriggerOp);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
// When called, abort op will abort the current process. This can be used to // When called, abort op will abort the current process. This can be used to
// abort remote PSs when needed. // abort remote PSs when needed.


@ -91,6 +91,7 @@ class KilledBySignal {
public: public:
explicit KilledBySignal(int signum) : signum_(signum) {} explicit KilledBySignal(int signum) : signum_(signum) {}
bool operator()(int exit_status) const { return exit_status == signum_; } bool operator()(int exit_status) const { return exit_status == signum_; }
private: private:
const int signum_; const int signum_;
}; };


@ -688,7 +688,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit( static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit(
// default value is in bytes despite the name of the environment variable // default value is in bytes despite the name of the environment variable
"TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB
); );
int device_id = stream->parent()->device_ordinal(); int device_id = stream->parent()->device_ordinal();
DataType dtype = input.dtype(); DataType dtype = input.dtype();


@ -679,8 +679,9 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
const int dims = resized_shape.dims(); const int dims = resized_shape.dims();
OP_REQUIRES( OP_REQUIRES(
context, TensorShapeUtils::IsMatrix(paddings.shape()) && context,
paddings.dim_size(1) == 2, TensorShapeUtils::IsMatrix(paddings.shape()) &&
paddings.dim_size(1) == 2,
errors::InvalidArgument("paddings must be a matrix with 2 columns: ", errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
paddings.shape().DebugString())); paddings.shape().DebugString()));
const int fixed_dims = const int fixed_dims =
@ -715,20 +716,22 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
const int32 after = const int32 after =
paddings_matrix(d, 1); // Pad after existing elements. paddings_matrix(d, 1); // Pad after existing elements.
OP_REQUIRES(context, before >= 0 && after >= 0, OP_REQUIRES(context, before >= 0 && after >= 0,
errors::InvalidArgument("paddings must be non-negative: ", errors::InvalidArgument(
before, " ", after)); "paddings must be non-negative: ", before, " ", after));
if (offset_ == 0) { // SYMMETRIC mode. if (offset_ == 0) { // SYMMETRIC mode.
OP_REQUIRES( OP_REQUIRES(
context, before <= resized_shape.dim_size(d) && context,
after <= resized_shape.dim_size(d), before <= resized_shape.dim_size(d) &&
after <= resized_shape.dim_size(d),
errors::InvalidArgument("paddings must be no greater " errors::InvalidArgument("paddings must be no greater "
"than the dimension size: ", "than the dimension size: ",
before, ", ", after, " greater than ", before, ", ", after, " greater than ",
resized_shape.dim_size(d))); resized_shape.dim_size(d)));
} else if (offset_ == 1) { // REFLECT mode. } else if (offset_ == 1) { // REFLECT mode.
OP_REQUIRES( OP_REQUIRES(
context, before < resized_shape.dim_size(d) && context,
after < resized_shape.dim_size(d), before < resized_shape.dim_size(d) &&
after < resized_shape.dim_size(d),
errors::InvalidArgument("paddings must be less than" errors::InvalidArgument("paddings must be less than"
" the dimension size: ", " the dimension size: ",
before, ", ", after, " not less than ", before, ", ", after, " not less than ",
@ -767,18 +770,19 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
// We only check the first three dims, since the depth is accessed as an // We only check the first three dims, since the depth is accessed as an
// int64 below. // int64 below.
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i), OP_REQUIRES(
std::numeric_limits<int>::max()), context,
errors::InvalidArgument("filter too large")); FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
errors::InvalidArgument("filter too large"));
} }
// The last dimension for input is in_depth. It must be the same as the // The last dimension for input is in_depth. It must be the same as the
// filter's in_depth. // filter's in_depth.
const int64 in_depth = padded_shape.dim_size(3); const int64 in_depth = padded_shape.dim_size(3);
OP_REQUIRES( OP_REQUIRES(context, in_depth == filter.dim_size(2),
context, in_depth == filter.dim_size(2), errors::InvalidArgument(
errors::InvalidArgument("input and filter must have the same depth: ", "input and filter must have the same depth: ", in_depth,
in_depth, " vs ", filter.dim_size(2))); " vs ", filter.dim_size(2)));
// The last dimension for filter is out_depth. // The last dimension for filter is out_depth.
const int out_depth = static_cast<int>(filter.dim_size(3)); const int out_depth = static_cast<int>(filter.dim_size(3));
@ -786,9 +790,10 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
// The second dimension for input is rows/height. // The second dimension for input is rows/height.
// The first dimension for filter is rows/height. // The first dimension for filter is rows/height.
const int64 padded_rows_raw = padded_shape.dim_size(1); const int64 padded_rows_raw = padded_shape.dim_size(1);
OP_REQUIRES(context, FastBoundsCheck(padded_rows_raw, OP_REQUIRES(
std::numeric_limits<int>::max()), context,
errors::InvalidArgument("Input rows too large")); FastBoundsCheck(padded_rows_raw, std::numeric_limits<int>::max()),
errors::InvalidArgument("Input rows too large"));
const int padded_rows = static_cast<int>(padded_rows_raw); const int padded_rows = static_cast<int>(padded_rows_raw);
const int filter_rows = static_cast<int>(filter.dim_size(0)); const int filter_rows = static_cast<int>(filter.dim_size(0));
const int resized_rows = static_cast<int>(resized_shape.dim_size(1)); const int resized_rows = static_cast<int>(resized_shape.dim_size(1));
@ -796,9 +801,10 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
// The third dimension for input is columns/width. // The third dimension for input is columns/width.
// The second dimension for filter is columns/width. // The second dimension for filter is columns/width.
const int64 padded_cols_raw = padded_shape.dim_size(2); const int64 padded_cols_raw = padded_shape.dim_size(2);
OP_REQUIRES(context, FastBoundsCheck(padded_cols_raw, OP_REQUIRES(
std::numeric_limits<int>::max()), context,
errors::InvalidArgument("Input cols too large")); FastBoundsCheck(padded_cols_raw, std::numeric_limits<int>::max()),
errors::InvalidArgument("Input cols too large"));
const int padded_cols = static_cast<int>(padded_cols_raw); const int padded_cols = static_cast<int>(padded_cols_raw);
const int filter_cols = static_cast<int>(filter.dim_size(1)); const int filter_cols = static_cast<int>(filter.dim_size(1));
const int resized_cols = static_cast<int>(resized_shape.dim_size(2)); const int resized_cols = static_cast<int>(resized_shape.dim_size(2));
@ -864,24 +870,26 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
TF_DISALLOW_COPY_AND_ASSIGN(FusedResizeConv2DUsingGemmOp); TF_DISALLOW_COPY_AND_ASSIGN(FusedResizeConv2DUsingGemmOp);
}; };
#define REGISTER_FUSED(T) \ #define REGISTER_FUSED(T) \
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
Name("FusedResizeAndPadConv2D") \ Name("FusedResizeAndPadConv2D") \
.Device(DEVICE_CPU) \ .Device(DEVICE_CPU) \
.TypeConstraint<T>("T"), \ .TypeConstraint<T>("T"), \
FusedResizeConv2DUsingGemmOp< \ FusedResizeConv2DUsingGemmOp< \
T, FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \ T, \
BILINEAR>, \ FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
BILINEAR>, \
true>); true>);
TF_CALL_float(REGISTER_FUSED); TF_CALL_float(REGISTER_FUSED);
#define REGISTER_PAD_ONLY_FUSED(T) \ #define REGISTER_PAD_ONLY_FUSED(T) \
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
FusedResizeConv2DUsingGemmOp< \ FusedResizeConv2DUsingGemmOp< \
T, FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \ T, \
NEAREST>, \ FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
NEAREST>, \
false>); false>);
TF_CALL_float(REGISTER_PAD_ONLY_FUSED); TF_CALL_float(REGISTER_PAD_ONLY_FUSED);


@ -27,7 +27,6 @@ limitations under the License.
namespace tensorflow { namespace tensorflow {
// Get the Cudnn workspace limit from the environment variable, which is in MB. // Get the Cudnn workspace limit from the environment variable, which is in MB.
// Return the workspace memory limit in bytes. If no value is set, return the // Return the workspace memory limit in bytes. If no value is set, return the
// default value. // default value.
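Aside, not the TensorFlow implementation: the comment above describes a lookup whose unit handling is easy to misread, since the environment variable is given in megabytes while the default and the return value are in bytes. A minimal sketch of that contract, with illustrative names:

#include <cstdint>
#include <cstdlib>

// Read a limit expressed in MB from the environment and return it in bytes;
// if the variable is unset, fall back to default_bytes (already in bytes).
int64_t WorkspaceLimitBytes(const char* env_var_name, int64_t default_bytes) {
  const char* value = std::getenv(env_var_name);
  if (value == nullptr) return default_bytes;
  const int64_t megabytes = std::strtoll(value, nullptr, 10);
  return megabytes * (1LL << 20);  // MB -> bytes
}

// e.g. WorkspaceLimitBytes("TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 /* 4GB */);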


@ -25,9 +25,9 @@ limitations under the License.
#include "cuda/include/cuda.h" #include "cuda/include/cuda.h"
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/conv_2d.h" #include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/lib/math/math_util.h"
#include "tensorflow/core/util/cuda_kernel_helper.h" #include "tensorflow/core/util/cuda_kernel_helper.h"
#include "tensorflow/core/util/tensor_format.h" #include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/lib/math/math_util.h"
namespace tensorflow { namespace tensorflow {
@ -252,11 +252,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
int x = threadIdx.x; int x = threadIdx.x;
Dimension<3> output_dims = { Dimension<3> output_dims = {
input_dims[0], input_dims[2], input_dims[1], input_dims[0],
input_dims[2],
input_dims[1],
}; };
Dimension<3> input_dims_in_tiles = { Dimension<3> input_dims_in_tiles = {
input_dims[0], (input_dims[1] + TileSizeI - 1) / TileSizeI, input_dims[0],
(input_dims[1] + TileSizeI - 1) / TileSizeI,
(input_dims[2] + TileSizeJ - 1) / TileSizeJ, (input_dims[2] + TileSizeJ - 1) / TileSizeJ,
}; };
@ -264,7 +267,8 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
FlatToTensorIndex(blockIdx.x, input_dims_in_tiles); FlatToTensorIndex(blockIdx.x, input_dims_in_tiles);
Index<3> input_tile_origin = { Index<3> input_tile_origin = {
input_tile_index[0], input_tile_index[1] * TileSizeI, input_tile_index[0],
input_tile_index[1] * TileSizeI,
input_tile_index[2] * TileSizeJ, input_tile_index[2] * TileSizeJ,
}; };
@ -322,11 +326,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
__syncthreads(); __syncthreads();
Index<3> output_tile_index = { Index<3> output_tile_index = {
input_tile_index[0], input_tile_index[2], input_tile_index[1], input_tile_index[0],
input_tile_index[2],
input_tile_index[1],
}; };
Index<3> output_tile_origin = { Index<3> output_tile_origin = {
output_tile_index[0], output_tile_index[1] * TileSizeJ, output_tile_index[0],
output_tile_index[1] * TileSizeJ,
output_tile_index[2] * TileSizeI, output_tile_index[2] * TileSizeI,
}; };
@ -799,7 +806,7 @@ struct TransposeElemType<16> {
// A helper function to make RunSwapDimension1And2InTensor3 concise. This // A helper function to make RunSwapDimension1And2InTensor3 concise. This
// helper function looks at the data type and input matrix sizes and decides // helper function looks at the data type and input matrix sizes and decides
// the thread numbers and tile sizes to use. // the thread numbers and tile sizes to use.
template <typename T, bool conjugate = false > template <typename T, bool conjugate = false>
void SwapDimension1And2InTensor3WithNarrowMatrices( void SwapDimension1And2InTensor3WithNarrowMatrices(
const GPUDevice& d, const T* input, const Dimension<3>& input_dims, const GPUDevice& d, const T* input, const Dimension<3>& input_dims,
T* output, const int kMinDimensionToUseTiles) { T* output, const int kMinDimensionToUseTiles) {
@ -902,19 +909,21 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input,
constexpr int kNumThreads = 256; constexpr int kNumThreads = 256;
Dimension<3> input_dims_in_tiles = { Dimension<3> input_dims_in_tiles = {
input_dims[0], MathUtil::CeilOfRatio<int>(input_dims[1], kTileSize), input_dims[0],
MathUtil::CeilOfRatio<int>(input_dims[1], kTileSize),
MathUtil::CeilOfRatio<int>(input_dims[2], kTileSize), MathUtil::CeilOfRatio<int>(input_dims[2], kTileSize),
}; };
int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] * int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] *
input_dims_in_tiles[2]; input_dims_in_tiles[2];
SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize, kTileSize, conjugate> SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize, kTileSize,
conjugate>
<<<total_tiles_count, kNumThreads, 0, d.stream()>>>(input, input_dims, <<<total_tiles_count, kNumThreads, 0, d.stream()>>>(input, input_dims,
output); output);
} else if (narrow_matrix) { } else if (narrow_matrix) {
SwapDimension1And2InTensor3WithNarrowMatrices<T, conjugate>(d, input, input_dims, output, SwapDimension1And2InTensor3WithNarrowMatrices<T, conjugate>(
kMinDimensionToUseTiles); d, input, input_dims, output, kMinDimensionToUseTiles);
} else { } else {
int total_element_count = input_dims[0] * input_dims[1] * input_dims[2]; int total_element_count = input_dims[0] * input_dims[1] * input_dims[2];
CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d); CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d);
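Aside, not from this commit: the reformatted initializer above sizes the CUDA grid by counting kTileSize x kTileSize tiles with a ceiling division. A small self-contained sketch of that arithmetic, with illustrative names:

// Ceiling division for positive integers, the same quantity
// MathUtil::CeilOfRatio<int> produces above.
int CeilOfRatioInt(int a, int b) { return (a + b - 1) / b; }

// Number of tile_size x tile_size tiles needed to cover each of `planes`
// matrices of shape rows x cols; this becomes the CUDA grid size.
int TotalTileCount(int planes, int rows, int cols, int tile_size) {
  return planes * CeilOfRatioInt(rows, tile_size) * CeilOfRatioInt(cols, tile_size);
}

// e.g. TotalTileCount(2, 70, 33, /*tile_size=*/32) == 2 * 3 * 2 == 12 blocks.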


@ -468,18 +468,19 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
filter.shape().DebugString())); filter.shape().DebugString()));
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i), OP_REQUIRES(
std::numeric_limits<int>::max()), context,
errors::InvalidArgument("filter too large")); FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
errors::InvalidArgument("filter too large"));
} }
// The last dimension for input is in_depth. It must be the same as the // The last dimension for input is in_depth. It must be the same as the
// filter's in_depth. // filter's in_depth.
const int64 in_depth = GetTensorDim(input, data_format_, 'C'); const int64 in_depth = GetTensorDim(input, data_format_, 'C');
OP_REQUIRES( OP_REQUIRES(context, in_depth == filter.dim_size(2),
context, in_depth == filter.dim_size(2), errors::InvalidArgument(
errors::InvalidArgument("input and filter must have the same depth: ", "input and filter must have the same depth: ", in_depth,
in_depth, " vs ", filter.dim_size(2))); " vs ", filter.dim_size(2)));
// The last dimension for filter is out_depth. // The last dimension for filter is out_depth.
const int out_depth = static_cast<int>(filter.dim_size(3)); const int out_depth = static_cast<int>(filter.dim_size(3));
@ -487,18 +488,20 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
// The second dimension for input is rows/height. // The second dimension for input is rows/height.
// The first dimension for filter is rows/height. // The first dimension for filter is rows/height.
const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H'); const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H');
OP_REQUIRES(context, FastBoundsCheck(input_rows_raw, OP_REQUIRES(
std::numeric_limits<int>::max()), context,
errors::InvalidArgument("Input rows too large")); FastBoundsCheck(input_rows_raw, std::numeric_limits<int>::max()),
errors::InvalidArgument("Input rows too large"));
const int input_rows = static_cast<int>(input_rows_raw); const int input_rows = static_cast<int>(input_rows_raw);
const int filter_rows = static_cast<int>(filter.dim_size(0)); const int filter_rows = static_cast<int>(filter.dim_size(0));
// The third dimension for input is columns/width. // The third dimension for input is columns/width.
// The second dimension for filter is columns/width. // The second dimension for filter is columns/width.
const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W'); const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W');
OP_REQUIRES(context, FastBoundsCheck(input_cols_raw, OP_REQUIRES(
std::numeric_limits<int>::max()), context,
errors::InvalidArgument("Input cols too large")); FastBoundsCheck(input_cols_raw, std::numeric_limits<int>::max()),
errors::InvalidArgument("Input cols too large"));
const int input_cols = static_cast<int>(input_cols_raw); const int input_cols = static_cast<int>(input_cols_raw);
const int filter_cols = static_cast<int>(filter.dim_size(1)); const int filter_cols = static_cast<int>(filter.dim_size(1));


@ -17,8 +17,8 @@ limitations under the License.
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "tensorflow/core/kernels/cross_op.h"
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/cross_op.h"
namespace tensorflow { namespace tensorflow {


@ -19,13 +19,13 @@ limitations under the License.
#include <limits> #include <limits>
#include "tensorflow/core/util/ctc/ctc_beam_search.h"
#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/util/ctc/ctc_beam_search.h"
#include "tensorflow/core/util/sparse/sparse_tensor.h" #include "tensorflow/core/util/sparse/sparse_tensor.h"
namespace tensorflow { namespace tensorflow {
@ -80,16 +80,17 @@ class CTCDecodeHelper {
if (!(batch_size == (*seq_len)->dim_size(0))) { if (!(batch_size == (*seq_len)->dim_size(0))) {
return errors::FailedPrecondition( return errors::FailedPrecondition(
"len(sequence_length) != batch_size. ", "len(sequence_length): ", "len(sequence_length) != batch_size. ",
(*seq_len)->dim_size(0), " batch_size: ", batch_size); "len(sequence_length): ", (*seq_len)->dim_size(0),
" batch_size: ", batch_size);
} }
auto seq_len_t = (*seq_len)->vec<int32>(); auto seq_len_t = (*seq_len)->vec<int32>();
for (int b = 0; b < batch_size; ++b) { for (int b = 0; b < batch_size; ++b) {
if (!(seq_len_t(b) <= max_time)) { if (!(seq_len_t(b) <= max_time)) {
return errors::FailedPrecondition("sequence_length(", b, ") <= ", return errors::FailedPrecondition("sequence_length(", b,
max_time); ") <= ", max_time);
} }
} }


@ -113,8 +113,8 @@ class CTCLossOp : public OpKernel {
const int64 batch_indices = g.group()[0]; const int64 batch_indices = g.group()[0];
OP_REQUIRES(ctx, FastBoundsCheck(batch_indices, batch_size), OP_REQUIRES(ctx, FastBoundsCheck(batch_indices, batch_size),
errors::InvalidArgument("labels batch index must be between ", errors::InvalidArgument("labels batch index must be between ",
0, " and ", batch_size, " but saw: ", 0, " and ", batch_size,
batch_indices)); " but saw: ", batch_indices));
auto values = g.values<int32>(); auto values = g.values<int32>();
std::vector<int>* b_values = &labels_t[batch_indices]; std::vector<int>* b_values = &labels_t[batch_indices];


@ -45,5 +45,5 @@ REGISTER_KERNEL_BUILDER(Name("Abs")
.HostMemory("y") .HostMemory("y")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
UnaryOp<CPUDevice, functor::abs<int32>>); UnaryOp<CPUDevice, functor::abs<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow


@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Acos", functor::acos, float, double);
#if TENSORFLOW_USE_SYCL #if TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Acos", functor::acos, float, double); REGISTER2(UnaryOp, SYCL, "Acos", functor::acos, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow


@ -17,12 +17,12 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_gradients.h" #include "tensorflow/core/kernels/cwise_ops_gradients.h"
namespace tensorflow { namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double, REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double, complex64,
complex64, complex128); complex128);
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Acosh", functor::acosh, float, double); REGISTER2(UnaryOp, SYCL, "Acosh", functor::acosh, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA #if GOOGLE_CUDA
REGISTER2(UnaryOp, GPU, "Acosh", functor::acosh, float, double); REGISTER2(UnaryOp, GPU, "Acosh", functor::acosh, float, double);


@ -44,7 +44,6 @@ REGISTER_KERNEL_BUILDER(Name("AddV2")
BinaryOp<CPUDevice, functor::add<int32>>); BinaryOp<CPUDevice, functor::add<int32>>);
#endif #endif
#if TENSORFLOW_USE_SYCL #if TENSORFLOW_USE_SYCL
#define REGISTER_KERNEL(type) \ #define REGISTER_KERNEL(type) \
REGISTER(BinaryOp, SYCL, "Add", functor::add, type); \ REGISTER(BinaryOp, SYCL, "Add", functor::add, type); \
@ -66,5 +65,5 @@ REGISTER_KERNEL_BUILDER(Name("AddV2")
.HostMemory("z") .HostMemory("z")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::add<int32>>); BinaryOp<CPUDevice, functor::add<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow


@ -22,8 +22,8 @@ namespace tensorflow {
// sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__. // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
#if !defined(__ANDROID_TYPES_SLIM__) #if !defined(__ANDROID_TYPES_SLIM__)
REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64, REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64, uint8,
uint8, complex128, string); complex128, string);
// Notice: String is excluded to allow marking AddV2 is_commutative and // Notice: String is excluded to allow marking AddV2 is_commutative and
// is_aggregate. // is_aggregate.
REGISTER5(BinaryOp, CPU, "AddV2", functor::add, int8, int16, complex64, uint8, REGISTER5(BinaryOp, CPU, "AddV2", functor::add, int8, int16, complex64, uint8,


@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Asin", functor::asin, float, double);
#if TENSORFLOW_USE_SYCL #if TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Asin", functor::asin, float, double); REGISTER2(UnaryOp, SYCL, "Asin", functor::asin, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow


@@ -1,10 +1,10 @@
 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -17,8 +17,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_gradients.h"
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double, complex64,
+          complex128);
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Asinh", functor::asinh, float, double);

View File

@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Atan", functor::atan, float, double);
#if TENSORFLOW_USE_SYCL #if TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Atan", functor::atan, float, double); REGISTER2(UnaryOp, SYCL, "Atan", functor::atan, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@@ -17,8 +17,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_gradients.h"
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double, complex64,
+          complex128);
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Atanh", functor::atanh, float, double);

View File

@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "Ceil", functor::ceil, float, Eigen::half, double);
#if TENSORFLOW_USE_SYCL #if TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Ceil", functor::ceil, float, double); REGISTER2(UnaryOp, SYCL, "Ceil", functor::ceil, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Cos", functor::cos, float, Eigen::half, double);
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Cos", functor::cos, float, double); REGISTER2(UnaryOp, SYCL, "Cos", functor::cos, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@@ -16,20 +16,18 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double, complex64,
+          complex128);
 #if TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(TYPE) \
   REGISTER_KERNEL_BUILDER( \
-      Name("Cosh") \
-          .Device(DEVICE_SYCL) \
-          .TypeConstraint<TYPE>("T"), \
-      UnaryOp<SYCLDevice, functor::cosh<TYPE>>);
+      Name("Cosh").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
+      UnaryOp<SYCLDevice, functor::cosh<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
 REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
 #endif // TENSORFLOW_USE_SYCL
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Cosh", functor::cosh, float, double);

View File

@ -54,5 +54,5 @@ REGISTER_KERNEL_BUILDER(Name("Div")
.HostMemory("z") .HostMemory("z")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::safe_div<int32>>); BinaryOp<CPUDevice, functor::safe_div<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -26,5 +26,5 @@ REGISTER5(UnaryOp, GPU, "Exp", functor::exp, float, Eigen::half, double,
#if TENSORFLOW_USE_SYCL #if TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Exp", functor::exp, float, double); REGISTER2(UnaryOp, SYCL, "Exp", functor::exp, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -23,5 +23,5 @@ REGISTER3(UnaryOp, GPU, "Expm1", functor::expm1, float, Eigen::half, double);
#endif #endif
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Expm1", functor::expm1, float, double); REGISTER2(UnaryOp, SYCL, "Expm1", functor::expm1, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -23,5 +23,5 @@ REGISTER3(UnaryOp, GPU, "Floor", functor::floor, float, Eigen::half, double);
#endif #endif
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Floor", functor::floor, float, double); REGISTER2(UnaryOp, SYCL, "Floor", functor::floor, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -49,5 +49,5 @@ REGISTER_KERNEL_BUILDER(Name("FloorDiv")
.HostMemory("z") .HostMemory("z")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::safe_floor_div<int32>>); BinaryOp<CPUDevice, functor::safe_floor_div<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -40,5 +40,5 @@ REGISTER_KERNEL_BUILDER(Name("FloorMod")
.HostMemory("z") .HostMemory("z")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::safe_floor_mod<int32>>); BinaryOp<CPUDevice, functor::safe_floor_mod<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@@ -19,8 +19,8 @@ limitations under the License.
 namespace tensorflow {
 namespace functor {
 DEFINE_UNARY1(conj, complex64);
 DEFINE_UNARY1(conj, complex128);
 } // namespace functor
 } // namespace tensorflow

View File

@@ -20,7 +20,7 @@ limitations under the License.
 namespace tensorflow {
 namespace functor {
 DEFINE_BINARY10(equal_to, float, Eigen::half, double, uint8, int8, int16, int64,
                 complex64, complex128, bool);
 DEFINE_APPROXIMATE_EQUAL2(float, double);
 } // namespace functor
 } // namespace tensorflow

View File

@@ -15,8 +15,10 @@ limitations under the License.
 #if GOOGLE_CUDA
-#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+#define EIGEN_USE_GPU
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
 namespace tensorflow {
 namespace functor {
@@ -38,19 +40,17 @@ struct SelectScalarFunctor<GPUDevice, T> {
                   typename TTypes<bool>::ConstScalar cond,
                   typename TTypes<T>::ConstFlat then_flat,
                   typename TTypes<T>::ConstFlat else_flat) {
 #if !defined(EIGEN_HAS_INDEX_LIST)
     Eigen::array<int, 1> rank1{1};
 #else
-    Eigen::IndexList<Eigen::type2index<1>> rank1;
+    Eigen::IndexList<Eigen::type2index<1> > rank1;
 #endif
     const int size = then_flat.dimension(0);
     Eigen::array<int, 1> broadcast_dims{size};
     To32Bit(out).device(d) = cond.reshape(rank1)
                                  .broadcast(broadcast_dims)
                                  .select(then_flat, else_flat);
   }
 };
@@ -89,8 +89,8 @@ struct BatchSelectFunctor<GPUDevice, T> {
   }
 };
 #define SELECT_FUNCTOR(T) \
   template struct SelectFunctor<GPUDevice, T>; \
   template struct SelectScalarFunctor<GPUDevice, T>; \
   template struct BatchSelectFunctor<GPUDevice, T>;
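In the SelectScalarFunctor hunk above, a scalar condition is reshaped to rank 1, broadcast to the length of the flattened value tensors, and then used in an elementwise select. The plain-C++ sketch below is illustrative only; it stands in for the Eigen expression cond.reshape(rank1).broadcast(broadcast_dims).select(then_flat, else_flat) and shows the semantics that expression computes.

#include <cassert>
#include <iostream>
#include <vector>

// Elementwise equivalent of broadcasting a scalar condition over the two
// flattened value tensors and selecting between them.
std::vector<float> SelectScalar(bool cond,
                                const std::vector<float>& then_vals,
                                const std::vector<float>& else_vals) {
  assert(then_vals.size() == else_vals.size());
  std::vector<float> out(then_vals.size());
  for (size_t i = 0; i < out.size(); ++i) {
    out[i] = cond ? then_vals[i] : else_vals[i];
  }
  return out;
}

int main() {
  std::vector<float> t = {1, 2, 3}, e = {10, 20, 30};
  for (float v : SelectScalar(false, t, e)) std::cout << v << " ";  // 10 20 30
  std::cout << "\n";
  return 0;
}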

View File

@ -43,5 +43,5 @@ REGISTER_KERNEL_BUILDER(Name("Greater")
.HostMemory("z") .HostMemory("z")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::greater<int32>>); BinaryOp<CPUDevice, functor::greater<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -35,7 +35,8 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
#endif #endif
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float, double); REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float,
double);
REGISTER_KERNEL_BUILDER(Name("GreaterEqual") REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
.Device(DEVICE_SYCL) .Device(DEVICE_SYCL)
@ -44,5 +45,5 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
.HostMemory("z") .HostMemory("z")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::greater_equal<int32>>); BinaryOp<CPUDevice, functor::greater_equal<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -21,7 +21,7 @@ REGISTER6(UnaryOp, CPU, "Invert", functor::invert, int8, int16, int32, int64,
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER6(UnaryOp, SYCL, "Invert", functor::invert, int8, int16, int32, int64, REGISTER6(UnaryOp, SYCL, "Invert", functor::invert, int8, int16, int32, int64,
uint8, uint16); uint8, uint16);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA #if GOOGLE_CUDA

View File

@ -26,5 +26,5 @@ REGISTER3(UnaryOp, GPU, "IsFinite", functor::isfinite, float, Eigen::half,
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "IsFinite", functor::isfinite, float, double); REGISTER2(UnaryOp, SYCL, "IsFinite", functor::isfinite, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "IsInf", functor::isinf, float, Eigen::half, double);
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "IsInf", functor::isinf, float, double); REGISTER2(UnaryOp, SYCL, "IsInf", functor::isinf, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "IsNan", functor::isnan, float, Eigen::half, double);
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "IsNan", functor::isnan, float, double); REGISTER2(UnaryOp, SYCL, "IsNan", functor::isnan, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -42,5 +42,5 @@ REGISTER_KERNEL_BUILDER(Name("Less")
.HostMemory("z") .HostMemory("z")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::less<int32>>); BinaryOp<CPUDevice, functor::less<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -44,5 +44,5 @@ REGISTER_KERNEL_BUILDER(Name("LessEqual")
.HostMemory("z") .HostMemory("z")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::less_equal<int32>>); BinaryOp<CPUDevice, functor::less_equal<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Log", functor::log, float, Eigen::half, double);
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Log", functor::log, float, double); REGISTER2(UnaryOp, SYCL, "Log", functor::log, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Log1p", functor::log1p, float, Eigen::half, double);
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Log1p", functor::log1p, float, double); REGISTER2(UnaryOp, SYCL, "Log1p", functor::log1p, float, double);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -43,5 +43,5 @@ REGISTER_KERNEL_BUILDER(Name("Maximum")
.HostMemory("z") .HostMemory("z")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::maximum<int32>>); BinaryOp<CPUDevice, functor::maximum<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@ -43,6 +43,6 @@ REGISTER_KERNEL_BUILDER(Name("Minimum")
.HostMemory("z") .HostMemory("z")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
BinaryOp<CPUDevice, functor::minimum<int32>>); BinaryOp<CPUDevice, functor::minimum<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@@ -17,8 +17,8 @@ limitations under the License.
 namespace tensorflow {
-REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double,
-          uint8, int32);
+REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, uint8,
+          int32);
 #if defined(__ANDROID_TYPES_SLIM__)
 // We only register the first type when we have multi-argument calls in the
 // case where we're trying to reduce executable size, but it turns out that the
@@ -28,7 +28,7 @@ REGISTER(BinaryOp, CPU, "Mul", functor::mul, int32);
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "Mul", functor::mul, float, Eigen::half, double,
           uint8);
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
 // registration requires all int32 inputs and outputs to be in host memory.
@@ -50,5 +50,5 @@ REGISTER_KERNEL_BUILDER(Name("Mul")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::mul<int32>>);
 #endif // TENSORFLOW_USE_SYCL
 } // namespace tensorflow
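The __ANDROID_TYPES_SLIM__ comment in the hunk above notes that slim builds register "Mul" for int32 only, trading dtype coverage for a smaller executable. As a rough, standalone illustration of that trade-off (the SLIM_BUILD flag and Mul function are hypothetical, not the TensorFlow macros), explicit template instantiation under a preprocessor switch controls how many per-type kernels end up in the binary:

#include <iostream>

template <typename T>
T Mul(T a, T b) {
  return a * b;
}

// Hypothetical flag standing in for __ANDROID_TYPES_SLIM__: slim builds keep
// only the int kernel, full builds instantiate the whole dtype list.
#if defined(SLIM_BUILD)
template int Mul<int>(int, int);
#else
template int Mul<int>(int, int);
template float Mul<float>(float, float);
template double Mul<double>(double, double);
#endif

int main() {
  std::cout << Mul(6, 7) << "\n";  // prints 42
  return 0;
}

Building with -DSLIM_BUILD emits code for one instantiation instead of three; the real mechanism works the same way, just with kernel registrations rather than bare instantiations.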

View File

@@ -22,11 +22,11 @@ namespace tensorflow {
 // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
 #if !defined(__ANDROID_TYPES_SLIM__)
-REGISTER6(BinaryOp, CPU, "Mul", functor::mul,
-          int8, uint16, int16, int64, complex64, complex128);
+REGISTER6(BinaryOp, CPU, "Mul", functor::mul, int8, uint16, int16, int64,
+          complex64, complex128);
 #if GOOGLE_CUDA
 REGISTER6(BinaryOp, GPU, "Mul", functor::mul, int8, uint16, int16, int64,
           complex64, complex128);
 #endif // GOOGLE_CUDA

View File

@ -27,7 +27,7 @@ REGISTER_KERNEL_BUILDER(Name("Neg")
.HostMemory("y") .HostMemory("y")
.TypeConstraint<int32>("T"), .TypeConstraint<int32>("T"),
UnaryOp<CPUDevice, functor::neg<int32>>); UnaryOp<CPUDevice, functor::neg<int32>>);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA #if GOOGLE_CUDA
REGISTER6(UnaryOp, GPU, "Neg", functor::neg, float, Eigen::half, double, int64, REGISTER6(UnaryOp, GPU, "Neg", functor::neg, float, Eigen::half, double, int64,

View File

@@ -17,7 +17,7 @@ limitations under the License.
 namespace tensorflow {
 REGISTER6(BinaryOp, CPU, "NotEqual", functor::not_equal_to, float, Eigen::half,
           double, uint8, int8, int16);
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "NotEqual", functor::not_equal_to, float, Eigen::half,
           double, uint8);

View File

@ -30,5 +30,5 @@ REGISTER6(BinaryOp, GPU, "NotEqual", functor::not_equal_to, int8, int16, int64,
#endif // GOOGLE_CUDA #endif // GOOGLE_CUDA
#endif // !defined(__ANDROID_TYPES_SLIM__) #endif // !defined(__ANDROID_TYPES_SLIM__)
} // namespace tensorflow } // namespace tensorflow

View File

@ -38,7 +38,7 @@ REGISTER4(UnaryOp, GPU, "Reciprocal", functor::inverse, float, Eigen::half,
#endif #endif
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER(UnaryOp, SYCL, "Reciprocal", functor::inverse, float); REGISTER(UnaryOp, SYCL, "Reciprocal", functor::inverse, float);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
REGISTER5(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float, REGISTER5(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float,
Eigen::half, double, complex64, complex128); Eigen::half, double, complex64, complex128);
@ -48,5 +48,5 @@ REGISTER3(SimpleBinaryOp, GPU, "ReciprocalGrad", functor::inverse_grad, float,
#endif #endif
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER(SimpleBinaryOp, SYCL, "ReciprocalGrad", functor::inverse_grad, float); REGISTER(SimpleBinaryOp, SYCL, "ReciprocalGrad", functor::inverse_grad, float);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

View File

@@ -30,7 +30,7 @@ typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
 #endif // TENSORFLOW_USE_SYCL
 template <typename Device, typename T>
 class SelectOp : public OpKernel {
@@ -185,7 +185,7 @@ REGISTER_SELECT_SYCL(double);
 REGISTER_SELECT_SYCL(int32);
 REGISTER_SELECT_SYCL(int64);
 #undef REGISTER_SELECT_SYCL
 #endif // TENSORFLOW_USE_SYCL
 namespace functor {
@@ -201,13 +201,11 @@ struct SelectFunctorBase {
 };
 template <typename T>
-struct SelectFunctor<CPUDevice, T>
-    : SelectFunctorBase<CPUDevice, T> {};
+struct SelectFunctor<CPUDevice, T> : SelectFunctorBase<CPUDevice, T> {};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
-struct SelectFunctor<SYCLDevice, T>
-    : SelectFunctorBase<SYCLDevice, T> {};
+struct SelectFunctor<SYCLDevice, T> : SelectFunctorBase<SYCLDevice, T> {};
 #endif // TENSORFLOW_USE_SYCL
 template <typename Device, typename T>
 struct SelectScalarFunctorBase {
@@ -222,12 +220,12 @@ struct SelectScalarFunctorBase {
 // CPU Specializations of Select functors with scalar
 template <typename T>
 struct SelectScalarFunctor<CPUDevice, T>
     : SelectScalarFunctorBase<CPUDevice, T> {};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct SelectScalarFunctor<SYCLDevice, T>
     : SelectScalarFunctorBase<SYCLDevice, T> {};
 #endif // TENSORFLOW_USE_SYCL
@@ -240,8 +238,8 @@ struct BatchSelectFunctorBase {
   const Eigen::DenseIndex all_but_batch = then_flat_outer_dims.dimension(1);
 #if !defined(EIGEN_HAS_INDEX_LIST)
-  Eigen::array<Eigen::DenseIndex, 2> broadcast_dims{{ 1, all_but_batch }};
-  Eigen::Tensor<Eigen::DenseIndex, 2>::Dimensions reshape_dims{{ batch, 1 }};
+  Eigen::array<Eigen::DenseIndex, 2> broadcast_dims{{1, all_but_batch}};
+  Eigen::Tensor<Eigen::DenseIndex, 2>::Dimensions reshape_dims{{batch, 1}};
 #else
   Eigen::IndexList<Eigen::type2index<1>, Eigen::DenseIndex> broadcast_dims;
   broadcast_dims.set(1, all_but_batch);
@@ -257,13 +255,13 @@ struct BatchSelectFunctorBase {
 };
 template <typename T>
-struct BatchSelectFunctor<CPUDevice, T>
-    : BatchSelectFunctorBase<CPUDevice, T> {};
+struct BatchSelectFunctor<CPUDevice, T> : BatchSelectFunctorBase<CPUDevice, T> {
+};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct BatchSelectFunctor<SYCLDevice, T>
     : BatchSelectFunctorBase<SYCLDevice, T> {};
 #endif // TENSORFLOW_USE_SYCL
 } // namespace functor
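In the BatchSelectFunctorBase hunk above, reshape_dims is {batch, 1} and broadcast_dims is {1, all_but_batch}, so the per-row condition is stretched across every element of its row before the select. The following self-contained sketch (plain loops instead of the Eigen reshape/broadcast expression, function and parameter names chosen here for illustration) shows that row-wise behaviour:

#include <cassert>
#include <iostream>
#include <vector>

// cond has one entry per batch row; then_vals/else_vals are flattened to
// [batch, all_but_batch]. Row i of the output comes from then_vals if
// cond[i] is true, otherwise from else_vals - the effect of reshaping cond
// to {batch, 1} and broadcasting it to {batch, all_but_batch}.
std::vector<float> BatchSelect(const std::vector<bool>& cond,
                               const std::vector<float>& then_vals,
                               const std::vector<float>& else_vals,
                               size_t all_but_batch) {
  const size_t batch = cond.size();
  assert(then_vals.size() == batch * all_but_batch);
  assert(else_vals.size() == batch * all_but_batch);
  std::vector<float> out(batch * all_but_batch);
  for (size_t i = 0; i < batch; ++i) {
    for (size_t j = 0; j < all_but_batch; ++j) {
      const size_t k = i * all_but_batch + j;
      out[k] = cond[i] ? then_vals[k] : else_vals[k];
    }
  }
  return out;
}

int main() {
  // batch = 2, all_but_batch = 3
  std::vector<float> t = {1, 2, 3, 4, 5, 6}, e = {9, 9, 9, 8, 8, 8};
  for (float v : BatchSelect({true, false}, t, e, 3)) std::cout << v << " ";
  std::cout << "\n";  // prints 1 2 3 8 8 8
  return 0;
}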

Some files were not shown because too many files have changed in this diff.