Branch 183429339 (#16469)

* Change `reduce_logsumexp` to internally use `reshape` rather than `squeeze`
since the latter requires the `axis` arg to be a Python `list`.

PiperOrigin-RevId: 183396533

* Kernel utils to support broadcast add and mul.

PiperOrigin-RevId: 183397494

* Updating sparsify_gather.

PiperOrigin-RevId: 183402917

* [tf.data] Move slow-path-related code into the slow path in IteratorHandleOp::Compute().

This slightly reduces the amount of work performed when an iterator is accessed (after the first access), and potentially reduces contention if concurrent steps are accessing the same iterator.

PiperOrigin-RevId: 183406221

* Cleanup: Ran clang-format on all *.{cc,h} files under grappler.

PiperOrigin-RevId: 183406440

* Increase shard count of //third_party/tensorflow/python:nn_batchnorm_test to avoid timeouts

When run under asan, the test runs for about 5 minutes, and sometimes
longer, causing frequent timeouts.

This change increases the shard count of the test to 4, which brings the run time
of the longest-running shard under asan to about 2 minutes.

PiperOrigin-RevId: 183414888

* Add available choices to toco flags and fix minor formatting issues.

PiperOrigin-RevId: 183415713

* Improve performance of some GPU code by using shared locks instead of unique locks in some hotspot cases.

PiperOrigin-RevId: 183418559

* [XLA] Improve error message for bad slices.

PiperOrigin-RevId: 183420038

* Fix py3 build rules for all py tests under py2tf.

PiperOrigin-RevId: 183422144

* Fix bug with Operation._control_inputs setter.

PiperOrigin-RevId: 183422192

* Make softmax_op_test.py work with C API enabled.

PiperOrigin-RevId: 183422829

* Cleanup: Ran clang-format on all *.{cc,h} files in tensorflow/core/kernels.

PiperOrigin-RevId: 183423961

* Fix the dense layer documentation to describe how rank > 2 inputs are handled.

PiperOrigin-RevId: 183425868

* Cleanup: Ran clang-format on all *.{cc,h} files in tensorflow/core/ops.

PiperOrigin-RevId: 183429339
Rasmus Munk Larsen 2018-01-26 13:32:16 -08:00 committed by GitHub
parent f84623507b
commit 982549ea34
325 changed files with 4706 additions and 4373 deletions


@ -37,6 +37,9 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/protobuf.h"
using tensorflow::str_util::Join;
using tensorflow::strings::Printf;
namespace xla { namespace xla {
namespace { namespace {
@ -934,7 +937,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
"inferring shape for <%s>(%s, %s) with broadcast_dimensions={%s}", "inferring shape for <%s>(%s, %s) with broadcast_dimensions={%s}",
BinaryOperation_Name(operation).c_str(), BinaryOperation_Name(operation).c_str(),
ShapeUtil::HumanString(lhs).c_str(), ShapeUtil::HumanString(rhs).c_str(), ShapeUtil::HumanString(lhs).c_str(), ShapeUtil::HumanString(rhs).c_str(),
tensorflow::str_util::Join(broadcast_dimensions, ", ").c_str()); Join(broadcast_dimensions, ", ").c_str());
TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs)); TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs));
TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs)); TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs));
@ -1097,7 +1100,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
return InvalidArgument( return InvalidArgument(
"Map operation requires all operands to have the same shape; got: " "Map operation requires all operands to have the same shape; got: "
"%s", "%s",
tensorflow::str_util::Join(pieces, ", ").c_str()); Join(pieces, ", ").c_str());
} }
// Check that dimensions.size == arg_shape.dimensions_size() (we currently // Check that dimensions.size == arg_shape.dimensions_size() (we currently
@ -1114,7 +1117,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
if (dimensions[i] != i) { if (dimensions[i] != i) {
return InvalidArgument( return InvalidArgument(
"Map requires monotonically increasing dimension numbers, found: %s ", "Map requires monotonically increasing dimension numbers, found: %s ",
tensorflow::str_util::Join(dimensions, ", ").c_str()); Join(dimensions, ", ").c_str());
} }
} }
@ -1914,21 +1917,28 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
const Shape& arg, tensorflow::gtl::ArraySlice<int64> starts, const Shape& arg, tensorflow::gtl::ArraySlice<int64> starts,
tensorflow::gtl::ArraySlice<int64> limits, tensorflow::gtl::ArraySlice<int64> limits,
tensorflow::gtl::ArraySlice<int64> strides) { tensorflow::gtl::ArraySlice<int64> strides) {
auto error = [&](const string& message) {
return InvalidArgument(
"%s in slice operation; argument shape: %s; starts: {%s}; limits: "
"{%s}; strides: {%s}",
message.c_str(), ShapeUtil::HumanString(arg).c_str(),
Join(starts, ",").c_str(), Join(limits, ",").c_str(),
Join(strides, ",").c_str());
};
TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(arg, "operand of slice")); TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(arg, "operand of slice"));
VLOG(2) << tensorflow::strings::Printf( VLOG(2) << tensorflow::strings::Printf(
"slicing shape %s starts={%s} limits={%s}", "slicing shape %s starts={%s} limits={%s}",
ShapeUtil::HumanString(arg).c_str(), ShapeUtil::HumanString(arg).c_str(), Join(starts, ", ").c_str(),
tensorflow::str_util::Join(starts, ", ").c_str(), Join(limits, ", ").c_str());
tensorflow::str_util::Join(limits, ", ").c_str());
if (starts.size() != limits.size()) { if (starts.size() != limits.size()) {
return InvalidArgument("slice start and limit sizes differ: %zu vs %zu", return error(Printf("slice start and limit sizes differ: %zu vs %zu",
starts.size(), limits.size()); starts.size(), limits.size()));
} }
if (starts.size() != strides.size()) { if (starts.size() != strides.size()) {
return InvalidArgument("slice start and strides sizes differ: %zu vs %zu", return error(Printf("slice start and strides sizes differ: %zu vs %zu",
starts.size(), strides.size()); starts.size(), strides.size()));
} }
if (starts.size() != ShapeUtil::Rank(arg)) { if (starts.size() != ShapeUtil::Rank(arg)) {
@ -1947,20 +1957,20 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
start_index); start_index);
} }
if (limit_index > arg.dimensions(dimension)) { if (limit_index > arg.dimensions(dimension)) {
return InvalidArgument( return error(
"limit index (%lld) must be less than or equal to dimension " Printf("limit index (%lld) must be less than or equal to dimension "
"size (%lld)", "size (%lld)",
limit_index, arg.dimensions(dimension)); limit_index, arg.dimensions(dimension)));
} }
VLOG(2) << tensorflow::strings::Printf("starts[%lld] = %lld", dimension, VLOG(2) << tensorflow::strings::Printf("starts[%lld] = %lld", dimension,
start_index); start_index);
VLOG(2) << tensorflow::strings::Printf("limits[%lld] = %lld", dimension, VLOG(2) << tensorflow::strings::Printf("limits[%lld] = %lld", dimension,
limit_index); limit_index);
if (start_index > limit_index) { if (start_index > limit_index) {
return InvalidArgument( return error(
"limit index (%lld) must be greater or equal to " Printf("limit index (%lld) must be greater or equal to "
"start index (%lld) in slice with positive stride", "start index (%lld) in slice with positive stride",
limit_index, start_index); limit_index, start_index));
} }
if (stride <= 0) { if (stride <= 0) {
return InvalidArgument("stride (%lld) must be positive", stride); return InvalidArgument("stride (%lld) must be positive", stride);
@ -1983,7 +1993,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
"slicing shape %s at dynamic start_indices %s with slice_sizes={%s}", "slicing shape %s at dynamic start_indices %s with slice_sizes={%s}",
ShapeUtil::HumanString(operand_shape).c_str(), ShapeUtil::HumanString(operand_shape).c_str(),
ShapeUtil::HumanString(start_indices_shape).c_str(), ShapeUtil::HumanString(start_indices_shape).c_str(),
tensorflow::str_util::Join(slice_sizes, ", ").c_str()); Join(slice_sizes, ", ").c_str());
if (ShapeUtil::Rank(start_indices_shape) != 1) { if (ShapeUtil::Rank(start_indices_shape) != 1) {
return InvalidArgument( return InvalidArgument(
@ -2280,8 +2290,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
return InvalidArgument( return InvalidArgument(
"Reshape dimensions [%s] are not a permutation of the operand " "Reshape dimensions [%s] are not a permutation of the operand "
"dimensions (operand shape is %s).", "dimensions (operand shape is %s).",
tensorflow::str_util::Join(dimensions, ",").c_str(), Join(dimensions, ",").c_str(), ShapeUtil::HumanString(operand).c_str());
ShapeUtil::HumanString(operand).c_str());
} }
return inferred_shape; return inferred_shape;
@ -2373,8 +2382,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
// The applied function's arity equals the number of arguments. // The applied function's arity equals the number of arguments.
if (arg_shapes.size() != to_apply.parameters_size()) { if (arg_shapes.size() != to_apply.parameters_size()) {
string computation_signature = ShapeUtil::HumanString(to_apply); string computation_signature = ShapeUtil::HumanString(to_apply);
string argument_shapes = tensorflow::str_util::Join( string argument_shapes =
arg_shapes, ", ", [](string* out, const Shape* shape) { Join(arg_shapes, ", ", [](string* out, const Shape* shape) {
tensorflow::strings::StrAppend(out, ShapeUtil::HumanString(*shape)); tensorflow::strings::StrAppend(out, ShapeUtil::HumanString(*shape));
}); });
return InvalidArgument( return InvalidArgument(
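The `error` lambda added to `InferSliceShape` above is the heart of the "Improve error message for bad slices" change: every validation failure is routed through one helper that appends the argument shape, starts, limits, and strides, which is what the new BadSlice test below checks for. A rough standalone sketch of the same pattern, using exceptions instead of XLA's Status type (hypothetical names, not XLA code):

```cpp
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

// Validate a slice request and report the full slice context on any failure.
void ValidateSlice(const std::vector<int>& starts,
                   const std::vector<int>& limits,
                   const std::vector<int>& dims) {
  auto join = [](const std::vector<int>& v) {
    std::ostringstream os;
    for (size_t i = 0; i < v.size(); ++i) os << (i ? "," : "") << v[i];
    return os.str();
  };
  // Build the shared context once; every failure path reports it.
  auto error = [&](const std::string& message) {
    return std::runtime_error(message + " in slice operation; starts: {" +
                              join(starts) + "}; limits: {" + join(limits) +
                              "}");
  };
  if (starts.size() != limits.size())
    throw error("start and limit sizes differ");
  for (size_t d = 0; d < starts.size() && d < dims.size(); ++d) {
    if (limits[d] > dims[d])
      throw error("limit index must be less than or equal to dimension size");
  }
}
```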


@ -1512,5 +1512,20 @@ TEST_F(ShapeInferenceTest, Conditional) {
"must have the same shape")); "must have the same shape"));
} }
TEST_F(ShapeInferenceTest, BadSlice) {
auto arg = ShapeUtil::MakeShape(F32, {4});
StatusOr<Shape> statusor =
ShapeInference::InferSliceShape(arg, {0}, {5}, {1});
ASSERT_FALSE(statusor.ok());
LOG(INFO) << statusor.status();
EXPECT_THAT(statusor.status().error_message(),
HasSubstr("less than or equal to dimension size"))
<< statusor.status();
EXPECT_THAT(statusor.status().error_message(), HasSubstr("argument shape"))
<< statusor.status();
}
} // namespace } // namespace
} // namespace xla } // namespace xla


@ -71,6 +71,32 @@ cc_library(
], ],
) )
cc_library(
name = "kernel_util",
srcs = [
"kernel_util.cc",
],
hdrs = [
"kernel_util.h",
],
deps = [
"//tensorflow/contrib/lite:builtin_op_data",
"//tensorflow/contrib/lite:context",
"//tensorflow/contrib/lite/kernels/internal:round",
],
)
tf_cc_test(
name = "kernel_util_test",
size = "small",
srcs = ["kernel_util_test.cc"],
deps = [
":kernel_util",
"//tensorflow/contrib/lite/testing:util",
"@com_google_googletest//:gtest",
],
)
cc_library( cc_library(
name = "builtin_ops", name = "builtin_ops",
srcs = [ srcs = [
@ -87,7 +113,6 @@ cc_library(
"fully_connected.cc", "fully_connected.cc",
"gather.cc", "gather.cc",
"hashtable_lookup.cc", "hashtable_lookup.cc",
"kernel_util.cc",
"l2norm.cc", "l2norm.cc",
"local_response_norm.cc", "local_response_norm.cc",
"lsh_projection.cc", "lsh_projection.cc",
@ -111,7 +136,6 @@ cc_library(
"unidirectional_sequence_rnn.cc", "unidirectional_sequence_rnn.cc",
], ],
hdrs = [ hdrs = [
"kernel_util.h",
"padding.h", "padding.h",
"register.h", "register.h",
], ],
@ -125,6 +149,7 @@ cc_library(
}), }),
deps = [ deps = [
":activation_functor", ":activation_functor",
":kernel_util",
":op_macros", ":op_macros",
"//tensorflow/contrib/lite:builtin_op_data", "//tensorflow/contrib/lite:builtin_op_data",
"//tensorflow/contrib/lite:framework", "//tensorflow/contrib/lite:framework",


@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
==============================================================================*/ ==============================================================================*/
#include "tensorflow/contrib/lite/kernels/kernel_util.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h"
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
#include <memory>
#include "tensorflow/contrib/lite/kernels/internal/round.h" #include "tensorflow/contrib/lite/kernels/internal/round.h"
namespace tflite { namespace tflite {
@ -84,4 +87,27 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
} }
} }
bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2) {
return TfLiteIntArrayEqual(input1->dims, input2->dims);
}
TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
TfLiteTensor* input1,
TfLiteTensor* input2,
TfLiteIntArray** output_shape) {
int64_t dims1 = NumDimensions(input1);
int64_t dims2 = NumDimensions(input2);
int64_t out_dims = std::max(dims1, dims2);
std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> shape(
TfLiteIntArrayCreate(out_dims), TfLiteIntArrayFree);
for (int i = 0; i < out_dims; ++i) {
int64_t d1 = i >= dims1 ? 1 : SizeOfDimension(input1, dims1 - i - 1);
int64_t d2 = i >= dims2 ? 1 : SizeOfDimension(input2, dims2 - i - 1);
TF_LITE_ENSURE(context, d1 == d2 || d1 == 1 || d2 == 1);
shape->data[out_dims - i - 1] = std::max(d1, d2);
}
*output_shape = shape.release();
return kTfLiteOk;
}
} // namespace tflite } // namespace tflite
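`CalculateShapeForBroadcast` implements NumPy-style broadcasting: the two shapes are aligned from their trailing dimensions, each aligned pair must be equal or contain a 1, and the output dimension is the larger of the two. A standalone sketch of that rule on plain `std::vector<int>` shapes (illustrative only, not part of the TFLite API):

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Compute the broadcast shape of two shapes, e.g. {1, 2} and {3, 1, 1} -> {3, 1, 2}.
std::vector<int> BroadcastShape(const std::vector<int>& a,
                                const std::vector<int>& b) {
  const int out_dims = static_cast<int>(std::max(a.size(), b.size()));
  std::vector<int> out(out_dims);
  for (int i = 0; i < out_dims; ++i) {
    // Missing leading dimensions are treated as 1.
    const int d1 = i < static_cast<int>(a.size()) ? a[a.size() - 1 - i] : 1;
    const int d2 = i < static_cast<int>(b.size()) ? b[b.size() - 1 - i] : 1;
    assert(d1 == d2 || d1 == 1 || d2 == 1);  // otherwise the shapes are incompatible
    out[out_dims - 1 - i] = std::max(d1, d2);
  }
  return out;
}
```

The real implementation additionally wraps the freshly created TfLiteIntArray in a std::unique_ptr with TfLiteIntArrayFree as its deleter, so the array is released if TF_LITE_ENSURE rejects an incompatible dimension partway through; the {1, 2} vs {3, 1, 1} example above matches the BroadcastShapeDifferentSizes test later in this commit.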


@ -35,6 +35,14 @@ inline TfLiteTensor* GetOutput(TfLiteContext* context, TfLiteNode* node,
inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; } inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; }
inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; } inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; }
inline int64_t NumElements(const TfLiteTensor* t) {
int64_t count = 1;
for (int i = 0; i < NumDimensions(t); ++i) {
count *= SizeOfDimension(t, i);
}
return count;
}
inline TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context, inline TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context,
const TfLiteNode* node, int index) { const TfLiteNode* node, int index) {
const bool use_tensor = node->inputs->data[index] != kOptionalTensor; const bool use_tensor = node->inputs->data[index] != kOptionalTensor;
@ -76,6 +84,15 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
float* activation_min, float* activation_min,
float* activation_max); float* activation_max);
// Return true if the given tensors have the same shape.
bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2);
// Calculate the output_shape that is necessary for element-wise operations
// with broadcasting involving the two input tensors.
TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
TfLiteTensor* input1,
TfLiteTensor* input2,
TfLiteIntArray** output_shape);
} // namespace tflite } // namespace tflite
#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_ #endif // TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_


@ -0,0 +1,150 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/lite/kernels/kernel_util.h"
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/contrib/lite/testing/util.h"
namespace tflite {
namespace {
void ReportError(TfLiteContext* context, const char* format, ...) {}
class KernelUtilTest : public ::testing::Test {
public:
KernelUtilTest() {
context_.ReportError = ReportError;
tensor1_.dims = nullptr;
tensor2_.dims = nullptr;
}
~KernelUtilTest() {
TfLiteTensorFree(&tensor1_);
TfLiteTensorFree(&tensor2_);
}
void SetShape(TfLiteTensor* tensor, std::initializer_list<int> dims) {
TfLiteTensorFree(tensor);
tensor->dims = TfLiteIntArrayCreate(dims.size());
int i = 0;
for (int d : dims) {
tensor->dims->data[i] = d;
++i;
}
}
std::vector<int> GetShape(TfLiteIntArray* dims) {
std::vector<int> result;
for (int i = 0; i < dims->size; ++i) {
result.push_back(dims->data[i]);
}
return result;
}
protected:
TfLiteContext context_;
TfLiteTensor tensor1_;
TfLiteTensor tensor2_;
};
TEST_F(KernelUtilTest, SameShapeEmpty) {
EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor1_, {1, 2, 3});
EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor2_, {1, 2});
EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor2_, {1, 2, 3, 4});
EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor2_, {1, 2, 3});
EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor2_, {});
EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
SetShape(&tensor1_, {});
EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
}
TEST_F(KernelUtilTest, BroadcastShapeIncompatibleDim) {
TfLiteIntArray* output = nullptr;
SetShape(&tensor1_, {1, 2});
SetShape(&tensor2_, {1, 3});
EXPECT_NE(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
EXPECT_EQ(output, nullptr);
}
TEST_F(KernelUtilTest, BroadcastShapeOnes) {
TfLiteIntArray* output = nullptr;
SetShape(&tensor1_, {1, 1});
SetShape(&tensor2_, {1, 3});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
TfLiteIntArrayFree(output);
SetShape(&tensor1_, {1, 2});
SetShape(&tensor2_, {1, 1});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
TfLiteIntArrayFree(output);
}
TEST_F(KernelUtilTest, BroadcastShapeScalars) {
TfLiteIntArray* output = nullptr;
SetShape(&tensor1_, {1, 2});
SetShape(&tensor2_, {});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2));
TfLiteIntArrayFree(output);
SetShape(&tensor1_, {});
SetShape(&tensor2_, {2});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
EXPECT_THAT(GetShape(output), ::testing::ElementsAre(2));
TfLiteIntArrayFree(output);
}
TEST_F(KernelUtilTest, BroadcastShapeDifferentSizes) {
TfLiteIntArray* output = nullptr;
SetShape(&tensor1_, {1, 2});
SetShape(&tensor2_, {3, 1, 1});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
EXPECT_THAT(GetShape(output), ::testing::ElementsAre(3, 1, 2));
TfLiteIntArrayFree(output);
SetShape(&tensor1_, {1, 2, 3, 4});
SetShape(&tensor2_, {1, 3, 1});
EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
&tensor2_, &output));
EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2, 3, 4));
TfLiteIntArrayFree(output);
}
} // namespace
} // namespace tflite
int main(int argc, char** argv) {
::tflite::LogToStderr();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}


@ -44,9 +44,11 @@ bool ParseTocoFlagsFromCommandLineFlags(
"For Protobuf formats, the binary format will be used."), "For Protobuf formats, the binary format will be used."),
Flag("input_format", parsed_flags.input_format.bind(), Flag("input_format", parsed_flags.input_format.bind(),
parsed_flags.input_format.default_value(), parsed_flags.input_format.default_value(),
"Input file format. One of: tensorflow_graphdef, "), "Input file format. One of: TENSORFLOW_GRAPHDEF, TFLITE."),
Flag("output_format", parsed_flags.output_format.bind(), Flag("output_format", parsed_flags.output_format.bind(),
parsed_flags.output_format.default_value(), "Output file format."), parsed_flags.output_format.default_value(),
"Output file format. "
"One of TENSORFLOW_GRAPHDEF, TFLITE, GRAPHVIZ_DOT."),
Flag("default_ranges_min", parsed_flags.default_ranges_min.bind(), Flag("default_ranges_min", parsed_flags.default_ranges_min.bind(),
parsed_flags.default_ranges_min.default_value(), parsed_flags.default_ranges_min.default_value(),
"If defined, will be used as the default value for the min bound " "If defined, will be used as the default value for the min bound "
@ -58,11 +60,13 @@ bool ParseTocoFlagsFromCommandLineFlags(
Flag("inference_type", parsed_flags.inference_type.bind(), Flag("inference_type", parsed_flags.inference_type.bind(),
parsed_flags.inference_type.default_value(), parsed_flags.inference_type.default_value(),
"Target data type of arrays in the output file (for input_arrays, " "Target data type of arrays in the output file (for input_arrays, "
"this may be overridden by inference_input_type)."), "this may be overridden by inference_input_type). "
"One of FLOAT, QUANTIZED_UINT8."),
Flag("inference_input_type", parsed_flags.inference_input_type.bind(), Flag("inference_input_type", parsed_flags.inference_input_type.bind(),
parsed_flags.inference_input_type.default_value(), parsed_flags.inference_input_type.default_value(),
"Target data type of input arrays. If not specified, inference_type " "Target data type of input arrays. "
"is used."), "If not specified, inference_type is used. "
"One of FLOAT, QUANTIZED_UINT8."),
Flag("input_type", parsed_flags.input_type.bind(), Flag("input_type", parsed_flags.input_type.bind(),
parsed_flags.input_type.default_value(), parsed_flags.input_type.default_value(),
"Deprecated ambiguous flag that set both --input_data_types and " "Deprecated ambiguous flag that set both --input_data_types and "
@ -76,35 +80,31 @@ bool ParseTocoFlagsFromCommandLineFlags(
Flag("drop_fake_quant", parsed_flags.drop_fake_quant.bind(), Flag("drop_fake_quant", parsed_flags.drop_fake_quant.bind(),
parsed_flags.drop_fake_quant.default_value(), parsed_flags.drop_fake_quant.default_value(),
"Ignore and discard FakeQuant nodes. For instance, that can be used " "Ignore and discard FakeQuant nodes. For instance, to "
"to "
"generate plain float code without fake-quantization from a " "generate plain float code without fake-quantization from a "
"quantized " "quantized graph."),
"graph."),
Flag( Flag(
"reorder_across_fake_quant", "reorder_across_fake_quant",
parsed_flags.reorder_across_fake_quant.bind(), parsed_flags.reorder_across_fake_quant.bind(),
parsed_flags.reorder_across_fake_quant.default_value(), parsed_flags.reorder_across_fake_quant.default_value(),
"Normally, FakeQuant nodes must be strict boundaries for graph " "Normally, FakeQuant nodes must be strict boundaries for graph "
"transformations, in order to ensure that quantized inference has " "transformations, in order to ensure that quantized inference has "
"the " "the exact same arithmetic behavior as quantized training --- which "
"exact same arithmetic behavior as quantized training --- which is " "is the whole point of quantized training and of FakeQuant nodes in "
"the " "the first place. "
"whole point of quantized training and of FakeQuant nodes in the " "However, that entails subtle requirements on where exactly "
"first "
"place. However, that entails subtle requirements on where exactly "
"FakeQuant nodes must be placed in the graph. Some quantized graphs " "FakeQuant nodes must be placed in the graph. Some quantized graphs "
"have FakeQuant nodes at unexpected locations, that prevent graph " "have FakeQuant nodes at unexpected locations, that prevent graph "
"transformations that are necessary in order to generate inference " "transformations that are necessary in order to generate inference "
"code for these graphs. Such graphs should be fixed, but as a " "code for these graphs. Such graphs should be fixed, but as a "
"temporary work-around, setting this reorder_across_fake_quant flag " "temporary work-around, setting this reorder_across_fake_quant flag "
"allows toco to perform necessary graph transformaitons on them, " "allows TOCO to perform necessary graph transformaitons on them, "
"at the cost of no longer faithfully matching inference and training " "at the cost of no longer faithfully matching inference and training "
"arithmetic."), "arithmetic."),
Flag("allow_custom_ops", parsed_flags.allow_custom_ops.bind(), Flag("allow_custom_ops", parsed_flags.allow_custom_ops.bind(),
parsed_flags.allow_custom_ops.default_value(), parsed_flags.allow_custom_ops.default_value(),
"If true, allow TOCO to create TF Lite Custom operators for all the " "If true, allow TOCO to create TF Lite Custom operators for all the "
"unsupported Tensorflow ops."), "unsupported TensorFlow ops."),
Flag( Flag(
"drop_control_dependency", "drop_control_dependency",
parsed_flags.drop_control_dependency.bind(), parsed_flags.drop_control_dependency.bind(),


@ -57,6 +57,7 @@ py_library(
py_test( py_test(
name = "api_test", name = "api_test",
srcs = ["api_test.py"], srcs = ["api_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":py2tf_internal", ":py2tf_internal",
"//tensorflow/python:client_testlib", "//tensorflow/python:client_testlib",
@ -66,6 +67,7 @@ py_test(
py_test( py_test(
name = "conversion_test", name = "conversion_test",
srcs = ["conversion_test.py"], srcs = ["conversion_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":py2tf_internal", ":py2tf_internal",
"//tensorflow/python:client_testlib", "//tensorflow/python:client_testlib",
@ -76,6 +78,7 @@ py_test(
py_test( py_test(
name = "naming_test", name = "naming_test",
srcs = ["naming_test.py"], srcs = ["naming_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":py2tf_internal", ":py2tf_internal",
"//tensorflow/python:client_testlib", "//tensorflow/python:client_testlib",


@ -52,6 +52,7 @@ py_library(
py_test( py_test(
name = "break_canonicalization_test", name = "break_canonicalization_test",
srcs = ["break_canonicalization_test.py"], srcs = ["break_canonicalization_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":test_lib", ":test_lib",
"//tensorflow/contrib/py2tf/pyct", "//tensorflow/contrib/py2tf/pyct",
@ -62,6 +63,7 @@ py_test(
py_test( py_test(
name = "call_trees_test", name = "call_trees_test",
srcs = ["call_trees_test.py"], srcs = ["call_trees_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":test_lib", ":test_lib",
"//tensorflow/contrib/py2tf/pyct", "//tensorflow/contrib/py2tf/pyct",
@ -72,6 +74,7 @@ py_test(
py_test( py_test(
name = "continue_canonicalization_test", name = "continue_canonicalization_test",
srcs = ["continue_canonicalization_test.py"], srcs = ["continue_canonicalization_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":test_lib", ":test_lib",
"//tensorflow/contrib/py2tf/pyct", "//tensorflow/contrib/py2tf/pyct",
@ -82,6 +85,7 @@ py_test(
py_test( py_test(
name = "control_flow_test", name = "control_flow_test",
srcs = ["control_flow_test.py"], srcs = ["control_flow_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":test_lib", ":test_lib",
"//tensorflow/contrib/py2tf/pyct", "//tensorflow/contrib/py2tf/pyct",
@ -92,6 +96,7 @@ py_test(
py_test( py_test(
name = "builtin_functions_test", name = "builtin_functions_test",
srcs = ["builtin_functions_test.py"], srcs = ["builtin_functions_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":test_lib", ":test_lib",
"//tensorflow/contrib/py2tf/pyct", "//tensorflow/contrib/py2tf/pyct",
@ -112,6 +117,7 @@ py_test(
py_test( py_test(
name = "logical_expressions_test", name = "logical_expressions_test",
srcs = ["logical_expressions_test.py"], srcs = ["logical_expressions_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":test_lib", ":test_lib",
"//tensorflow/contrib/py2tf/pyct", "//tensorflow/contrib/py2tf/pyct",
@ -122,6 +128,7 @@ py_test(
py_test( py_test(
name = "print_functions_test", name = "print_functions_test",
srcs = ["print_functions_test.py"], srcs = ["print_functions_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":test_lib", ":test_lib",
"//tensorflow/contrib/py2tf/pyct", "//tensorflow/contrib/py2tf/pyct",
@ -133,6 +140,7 @@ py_test(
py_test( py_test(
name = "side_effect_guards_test", name = "side_effect_guards_test",
srcs = ["side_effect_guards_test.py"], srcs = ["side_effect_guards_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":test_lib", ":test_lib",
"//tensorflow/contrib/py2tf/pyct", "//tensorflow/contrib/py2tf/pyct",


@ -38,6 +38,7 @@ py_library(
py_test( py_test(
name = "anno_test", name = "anno_test",
srcs = ["anno_test.py"], srcs = ["anno_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":pyct", ":pyct",
"//tensorflow/python:client_testlib", "//tensorflow/python:client_testlib",
@ -47,6 +48,7 @@ py_test(
py_test( py_test(
name = "compiler_test", name = "compiler_test",
srcs = ["compiler_test.py"], srcs = ["compiler_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":pyct", ":pyct",
"//tensorflow/python:client_testlib", "//tensorflow/python:client_testlib",
@ -57,6 +59,7 @@ py_test(
py_test( py_test(
name = "parser_test", name = "parser_test",
srcs = ["parser_test.py"], srcs = ["parser_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":pyct", ":pyct",
"//tensorflow/python:client_testlib", "//tensorflow/python:client_testlib",
@ -66,6 +69,7 @@ py_test(
py_test( py_test(
name = "pretty_printer_test", name = "pretty_printer_test",
srcs = ["pretty_printer_test.py"], srcs = ["pretty_printer_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":pyct", ":pyct",
"//tensorflow/python:client_testlib", "//tensorflow/python:client_testlib",
@ -75,6 +79,7 @@ py_test(
py_test( py_test(
name = "templates_test", name = "templates_test",
srcs = ["templates_test.py"], srcs = ["templates_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":pyct", ":pyct",
"//tensorflow/python:client_testlib", "//tensorflow/python:client_testlib",


@ -32,6 +32,7 @@ py_library(
py_test( py_test(
name = "access_test", name = "access_test",
srcs = ["access_test.py"], srcs = ["access_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":static_analysis", ":static_analysis",
"//tensorflow/contrib/py2tf/pyct", "//tensorflow/contrib/py2tf/pyct",
@ -43,6 +44,7 @@ py_test(
py_test( py_test(
name = "live_values_test", name = "live_values_test",
srcs = ["live_values_test.py"], srcs = ["live_values_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":static_analysis", ":static_analysis",
"//tensorflow/contrib/py2tf/pyct", "//tensorflow/contrib/py2tf/pyct",
@ -53,6 +55,7 @@ py_test(
py_test( py_test(
name = "type_info_test", name = "type_info_test",
srcs = ["type_info_test.py"], srcs = ["type_info_test.py"],
srcs_version = "PY2AND3",
deps = [ deps = [
":static_analysis", ":static_analysis",
"//tensorflow/contrib/py2tf/pyct", "//tensorflow/contrib/py2tf/pyct",


@ -230,8 +230,24 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
// TODO(tucker): actually maintain separate CPUAllocators for // TODO(tucker): actually maintain separate CPUAllocators for
// different numa_nodes. For now, just one. // different numa_nodes. For now, just one.
numa_node = 0; numa_node = 0;
mutex_lock lock(mu_);
{
// Here we optimize the most common use case where cuda_host_allocators_
// and cuda_al_ have already been populated and since we're only reading
// these vectors, we can get by with a shared lock. In the slower case,
// we take a unique lock and populate these vectors.
tf_shared_lock lock(mu_);
if (FLAGS_brain_gpu_record_mem_types &&
static_cast<int>(cuda_al_.size()) > 0) {
return cuda_al_[0];
}
if (static_cast<int>(cuda_host_allocators_.size()) > numa_node) {
return cuda_host_allocators_[0];
}
}
mutex_lock lock(mu_);
// Find the first valid StreamExecutor to request CUDA host memory // Find the first valid StreamExecutor to request CUDA host memory
// through, since any will work. // through, since any will work.
// //
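This hunk is the hotspot the "shared locks instead of unique locks" commit refers to: once `cuda_host_allocators_` has been populated, `GetCUDAHostAllocator` only reads the cached vectors, so a shared (reader) lock suffices, and only the first call falls through to the exclusive lock. A minimal standalone sketch of the same double-checked fast-path/slow-path pattern, using C++17 `std::shared_mutex` in place of TensorFlow's `tf_shared_lock`/`mutex_lock` wrappers (illustrative only):

```cpp
#include <mutex>
#include <shared_mutex>
#include <unordered_map>

class AllocatorCache {
 public:
  int* GetOrCreate(int numa_node) {
    {
      // Fast path: most calls find an existing entry and only need a shared lock.
      std::shared_lock<std::shared_mutex> lock(mu_);
      auto it = allocators_.find(numa_node);
      if (it != allocators_.end()) return it->second;
    }
    // Slow path: take the exclusive lock, re-check, then populate the cache.
    std::unique_lock<std::shared_mutex> lock(mu_);
    int*& slot = allocators_[numa_node];
    if (slot == nullptr) slot = new int(numa_node);  // stand-in for a real allocator
    return slot;
  }

 private:
  std::shared_mutex mu_;
  std::unordered_map<int, int*> allocators_;
};
```

The re-check under the exclusive lock matters: two threads can both miss during the shared-lock pass, and only one of them should construct the entry.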


@ -23,8 +23,7 @@ Cluster::Cluster(int timeout_s) : timeout_s_(timeout_s) {
DisableDetailedStats(false); DisableDetailedStats(false);
} }
Cluster::~Cluster() { Cluster::~Cluster() {}
}
void Cluster::AllowSoftPlacement(bool soft_placement_state) { void Cluster::AllowSoftPlacement(bool soft_placement_state) {
options_.config.set_allow_soft_placement(soft_placement_state); options_.config.set_allow_soft_placement(soft_placement_state);


@ -16,8 +16,8 @@ limitations under the License.
#ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_ #ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
#define TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_ #define TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
#include "tensorflow/core/framework/variable.pb.h" #include "tensorflow/core/framework/variable.pb.h"
#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status.h"
namespace tensorflow { namespace tensorflow {


@ -40,8 +40,8 @@ typedef Eigen::SyclDevice SYCLDevice;
template <typename Device, typename T> template <typename Device, typename T>
class AdjustContrastOp : public OpKernel { class AdjustContrastOp : public OpKernel {
public: public:
explicit AdjustContrastOp(OpKernelConstruction* context) : OpKernel(context) { explicit AdjustContrastOp(OpKernelConstruction* context)
} : OpKernel(context) {}
void Compute(OpKernelContext* context) override { void Compute(OpKernelContext* context) override {
const Tensor& input = context->input(0); const Tensor& input = context->input(0);


@ -29,8 +29,7 @@ limitations under the License.
namespace tensorflow { namespace tensorflow {
class AdjustContrastOpTest : public OpsTestBase { class AdjustContrastOpTest : public OpsTestBase {};
};
TEST_F(AdjustContrastOpTest, Simple_1113) { TEST_F(AdjustContrastOpTest, Simple_1113) {
TF_EXPECT_OK(NodeDefBuilder("adjust_contrast_op", "AdjustContrastv2") TF_EXPECT_OK(NodeDefBuilder("adjust_contrast_op", "AdjustContrastv2")


@ -192,7 +192,8 @@ class AdjustSaturationOp<CPUDevice> : public AdjustSaturationOpBase {
const DeviceBase::CpuWorkerThreads& worker_threads = const DeviceBase::CpuWorkerThreads& worker_threads =
*context->device()->tensorflow_cpu_worker_threads(); *context->device()->tensorflow_cpu_worker_threads();
Shard(worker_threads.num_threads, worker_threads.workers, channel_count, Shard(worker_threads.num_threads, worker_threads.workers, channel_count,
kCostPerChannel, [channel_count, &input_data, &output_data, scale_h]( kCostPerChannel,
[channel_count, &input_data, &output_data, scale_h](
int64 start_channel, int64 end_channel) { int64 start_channel, int64 end_channel) {
const float* p = input_data.data() + start_channel * kChannelSize; const float* p = input_data.data() + start_channel * kChannelSize;
float* q = output_data.data() + start_channel * kChannelSize; float* q = output_data.data() + start_channel * kChannelSize;


@ -52,7 +52,8 @@ class ExtractGlimpseOp : public OpKernel {
const int64 batch_size = input_shape.dim_size(0); const int64 batch_size = input_shape.dim_size(0);
const Tensor& window_size = context->input(1); const Tensor& window_size = context->input(1);
OP_REQUIRES(context, (window_size.shape().dims() == 1) && OP_REQUIRES(context,
(window_size.shape().dims() == 1) &&
window_size.shape().dim_size(0) == 2, window_size.shape().dim_size(0) == 2,
errors::InvalidArgument( errors::InvalidArgument(
"input must be a vector of size 2 (height, width)", "input must be a vector of size 2 (height, width)",


@ -48,9 +48,8 @@ struct SpatialAvgPooling {
typedef Eigen::GpuDevice GPUDevice; typedef Eigen::GpuDevice GPUDevice;
// Launch a custom GPU kernels from Yanqing for the avgpooling backward operation // Launch a custom GPU kernels from Yanqing for the avgpooling backward
// that works NHWC data formats. // operation that works NHWC data formats. Arguments:
// Arguments:
// top_diff: backprop to the output of the pooling layer // top_diff: backprop to the output of the pooling layer
// num: number of input batches // num: number of input batches
// height: input height // height: input height


@ -71,8 +71,8 @@ __global__ void AvePoolBackwardNHWC(const int nthreads,
hstart = max(hstart, 0); hstart = max(hstart, 0);
wstart = max(wstart, 0); wstart = max(wstart, 0);
int pool_size = (hend - hstart) * (wend - wstart); int pool_size = (hend - hstart) * (wend - wstart);
gradient += gradient += top_diff_slice[(ph * pooled_width + pw) * channels] /
top_diff_slice[(ph * pooled_width + pw) * channels] / dtype(pool_size); dtype(pool_size);
} }
} }
bottom_diff[index] = gradient; bottom_diff[index] = gradient;
@ -90,8 +90,8 @@ bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num,
const GPUDevice& d) { const GPUDevice& d) {
int x_size = num * height * width * channels; int x_size = num * height * width * channels;
CudaLaunchConfig config = GetCudaLaunchConfig(x_size, d); CudaLaunchConfig config = GetCudaLaunchConfig(x_size, d);
AvePoolBackwardNHWC< AvePoolBackwardNHWC<T>
T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>( <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
config.virtual_thread_count, top_diff, num, height, width, channels, config.virtual_thread_count, top_diff, num, height, width, channels,
pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w, pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w,
pad_t, pad_t, bottom_diff); pad_t, pad_t, bottom_diff);


@ -111,13 +111,14 @@ class Barrier : public ResourceBase {
mutex_lock lock(mu_); mutex_lock lock(mu_);
if (closed_) { if (closed_) {
OP_REQUIRES_ASYNC( OP_REQUIRES_ASYNC(
ctx, !cancel_pending_enqueues_ && ctx,
!cancel_pending_enqueues_ &&
(num_inserted == 0 || !incomplete_.empty()), (num_inserted == 0 || !incomplete_.empty()),
errors::Cancelled( errors::Cancelled(
"Barrier ", name_, " is closed. Pending enqueues cancelled: ", "Barrier ", name_, " is closed. Pending enqueues cancelled: ",
cancel_pending_enqueues_, ". Number of new insertions: ", cancel_pending_enqueues_,
num_inserted, ". Number of incomplete keys: ", ". Number of new insertions: ", num_inserted,
incomplete_.size(), "."), ". Number of incomplete keys: ", incomplete_.size(), "."),
callback); callback);
} }
@ -128,7 +129,8 @@ class Barrier : public ResourceBase {
for (int i = 0; i < num_inserted; ++i) { for (int i = 0; i < num_inserted; ++i) {
OP_REQUIRES_OK_ASYNC( OP_REQUIRES_OK_ASYNC(
ctx, InsertOneLocked<T>(ctx, keys, values, element_shape, ctx,
InsertOneLocked<T>(ctx, keys, values, element_shape,
component_index, i, &ready_tuples, component_index, i, &ready_tuples,
&new_elements), &new_elements),
callback); callback);
@ -317,8 +319,9 @@ class Barrier : public ResourceBase {
return errors::Cancelled( return errors::Cancelled(
"Barrier ", name_, "Barrier ", name_,
" is closed, but attempted to insert a brand new key: ", " is closed, but attempted to insert a brand new key: ",
keys_vec(i), ". Pending enqueues cancelled: ", keys_vec(i),
cancel_pending_enqueues_, ". Insertion index: ", i, ". Pending enqueues cancelled: ", cancel_pending_enqueues_,
". Insertion index: ", i,
". Number of incomplete keys: ", incomplete_.size(), "."); ". Number of incomplete keys: ", incomplete_.size(), ".");
} }
} else { } else {
@ -532,11 +535,12 @@ class InsertManyOp : public BarrierOpKernel {
OP_REQUIRES_ASYNC( OP_REQUIRES_ASYNC(
ctx, component_index_ < barrier->num_components(), ctx, component_index_ < barrier->num_components(),
errors::InvalidArgument("The component ID is out of range ", errors::InvalidArgument("The component ID is out of range ",
component_index_, " > num_components", " (= ", component_index_, " > num_components",
barrier->num_components(), ")"), " (= ", barrier->num_components(), ")"),
callback); callback);
OP_REQUIRES_OK_ASYNC( OP_REQUIRES_OK_ASYNC(
ctx, ctx->MatchSignature({DT_STRING_REF, DT_STRING, ctx,
ctx->MatchSignature({DT_STRING_REF, DT_STRING,
barrier->component_type(component_index_)}, barrier->component_type(component_index_)},
{}), {}),
callback); callback);


@ -13,22 +13,20 @@ See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
==============================================================================*/ ==============================================================================*/
#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/framework/resource_mgr.h"
#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_util.h" #include "tensorflow/core/framework/tensor_util.h"
#include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
#include "tensorflow/core/kernels/batching_util/periodic_function.h" #include "tensorflow/core/kernels/batching_util/periodic_function.h"
#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
#include "tensorflow/core/kernels/concat_lib.h" #include "tensorflow/core/kernels/concat_lib.h"
#include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/kernels/split_lib.h" #include "tensorflow/core/kernels/split_lib.h"
#include "tensorflow/core/lib/random/random.h" #include "tensorflow/core/lib/random/random.h"
#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/macros.h"
namespace tensorflow { namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::ThreadPoolDevice CPUDevice;


@ -429,11 +429,10 @@ template <typename Scalar>
struct LaunchBatchMatMul<SYCLDevice, Scalar> { struct LaunchBatchMatMul<SYCLDevice, Scalar> {
static void Launch(OpKernelContext* context, const Tensor& in_x, static void Launch(OpKernelContext* context, const Tensor& in_x,
const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) { const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
// Number of matrix multiplies i.e. size of the batch. // Number of matrix multiplies i.e. size of the batch.
const int64 batch_size = in_x.dim_size(0); const int64 batch_size = in_x.dim_size(0);
ParallelMatMulKernelSYCL<Scalar>::Run(context, in_x, in_y, adj_x, adj_y, out, ParallelMatMulKernelSYCL<Scalar>::Run(context, in_x, in_y, adj_x, adj_y,
0, batch_size); out, 0, batch_size);
} }
}; };
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
@ -462,9 +461,9 @@ class BatchMatMul : public OpKernel {
TensorShape out_shape; TensorShape out_shape;
for (int i = 0; i < ndims - 2; ++i) { for (int i = 0; i < ndims - 2; ++i) {
OP_REQUIRES(ctx, in0.dim_size(i) == in1.dim_size(i), OP_REQUIRES(ctx, in0.dim_size(i) == in1.dim_size(i),
errors::InvalidArgument("In[0].dim(", i, ") and In[1].dim(", errors::InvalidArgument(
i, ") must be the same: ", "In[0].dim(", i, ") and In[1].dim(", i,
in0.shape().DebugString(), " vs ", ") must be the same: ", in0.shape().DebugString(), " vs ",
in1.shape().DebugString())); in1.shape().DebugString()));
out_shape.AddDim(in0.dim_size(i)); out_shape.AddDim(in0.dim_size(i));
} }


@ -54,7 +54,8 @@ static Graph* BatchMatmul(int b, int m, int k, int n, bool adjoint_a,
// BM_BatchMatmulDev(B, M, K, N, TA, TB, std::complex<float>, DT_COMPLEX64, // BM_BatchMatmulDev(B, M, K, N, TA, TB, std::complex<float>, DT_COMPLEX64,
// gpu); // gpu);
// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, cpu); \ // BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, cpu); \
// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, cpu); \ // BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, cpu);
// \
// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, gpu); \ // BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, gpu); \
// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, gpu); // BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, gpu);


@ -56,7 +56,8 @@ static void BatchToSpaceOpCompute(OpKernelContext* context,
errors::InvalidArgument("input rank should be >= ", 1 + block_dims, errors::InvalidArgument("input rank should be >= ", 1 + block_dims,
" instead of ", orig_input_tensor.dims())); " instead of ", orig_input_tensor.dims()));
OP_REQUIRES(context, TensorShapeUtils::IsMatrix(orig_crops.shape()) && OP_REQUIRES(context,
TensorShapeUtils::IsMatrix(orig_crops.shape()) &&
block_dims == orig_crops.dim_size(0) && block_dims == orig_crops.dim_size(0) &&
2 == orig_crops.dim_size(1), 2 == orig_crops.dim_size(1),
errors::InvalidArgument("crops should have shape [", block_dims, errors::InvalidArgument("crops should have shape [", block_dims,


@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
==============================================================================*/ ==============================================================================*/
#include "tensorflow/core/util/bcast.h"
#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h" #include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/bcast.h"
namespace tensorflow { namespace tensorflow {


@ -77,12 +77,12 @@ void BiasGPU<T>::compute(const GPUDevice& d, const T* input, const T* bias,
} }
CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
if (data_format == FORMAT_NHWC) { if (data_format == FORMAT_NHWC) {
BiasNHWCKernel< BiasNHWCKernel<T>
T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>( <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
config.virtual_thread_count, input, bias, output, bias_size); config.virtual_thread_count, input, bias, output, bias_size);
} else { } else {
BiasNCHWKernel< BiasNCHWKernel<T>
T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>( <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
config.virtual_thread_count, input, bias, output, bias_size, config.virtual_thread_count, input, bias, output, bias_size,
image_size); image_size);
} }
@ -206,8 +206,8 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
// Check if we have enough shared memory. // Check if we have enough shared memory.
if (shared_memory_size <= max_shared_memory_size) { if (shared_memory_size <= max_shared_memory_size) {
if (data_format == FORMAT_NHWC) { if (data_format == FORMAT_NHWC) {
BiasGradNHWC_SharedAtomics< BiasGradNHWC_SharedAtomics<T>
T><<<config.block_count, config.thread_per_block, shared_memory_size, <<<config.block_count, config.thread_per_block, shared_memory_size,
d.stream()>>>(total_count, output_backprop, bias_backprop, d.stream()>>>(total_count, output_backprop, bias_backprop,
bias_size); bias_size);
} else { } else {
@ -217,8 +217,8 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
if (config.thread_per_block < kWarpSize) { if (config.thread_per_block < kWarpSize) {
config.thread_per_block = kWarpSize; config.thread_per_block = kWarpSize;
} }
BiasGradNCHW_SharedAtomics< BiasGradNCHW_SharedAtomics<T>
T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>( <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
output_backprop, bias_backprop, batch, bias_size, image_size, output_backprop, bias_backprop, batch, bias_size, image_size,
group_size); group_size);
} }
@ -227,13 +227,14 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
// output block, it is possible to process one group of elements at a time. // output block, it is possible to process one group of elements at a time.
// But for now, we simply fall back to the naive implementation. // But for now, we simply fall back to the naive implementation.
if (data_format == FORMAT_NHWC) { if (data_format == FORMAT_NHWC) {
BiasGradNHWC_Naive< BiasGradNHWC_Naive<T>
T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>( <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
total_count, output_backprop, bias_backprop, bias_size); total_count, output_backprop, bias_backprop, bias_size);
} else { } else {
BiasGradNCHW_Naive< BiasGradNCHW_Naive<T>
T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>( <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
total_count, output_backprop, bias_backprop, bias_size, image_size); total_count, output_backprop, bias_backprop, bias_size,
image_size);
} }
} }
} }


@ -48,7 +48,7 @@ EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC const T SubtleMustCopy(const T &x) {
auto *to_x = reinterpret_cast<const volatile T *>(&x); auto *to_x = reinterpret_cast<const volatile T *>(&x);
return *to_x; return *to_x;
} }
} // namespace tensorflow::internal } // namespace internal
} // namespace tensorflow } // namespace tensorflow
#endif // TENSORFLOW_UTIL_BOUNDS_CHECK_H_ #endif // TENSORFLOW_UTIL_BOUNDS_CHECK_H_


@ -126,12 +126,12 @@ REGISTER_KERNEL_BUILDER(Name("UniformCandidateSampler").Device(DEVICE_CPU),
REGISTER_KERNEL_BUILDER(Name("LogUniformCandidateSampler").Device(DEVICE_CPU), REGISTER_KERNEL_BUILDER(Name("LogUniformCandidateSampler").Device(DEVICE_CPU),
SimpleCandidateSamplerOp<LogUniformSampler>); SimpleCandidateSamplerOp<LogUniformSampler>);
REGISTER_KERNEL_BUILDER(Name("LearnedUnigramCandidateSampler") REGISTER_KERNEL_BUILDER(
.Device(DEVICE_CPU), Name("LearnedUnigramCandidateSampler").Device(DEVICE_CPU),
SimpleCandidateSamplerOp<UnigramSampler>); SimpleCandidateSamplerOp<UnigramSampler>);
REGISTER_KERNEL_BUILDER(Name("ThreadUnsafeUnigramCandidateSampler") REGISTER_KERNEL_BUILDER(
.Device(DEVICE_CPU), Name("ThreadUnsafeUnigramCandidateSampler").Device(DEVICE_CPU),
SimpleCandidateSamplerOp<ThreadUnsafeUnigramSampler>); SimpleCandidateSamplerOp<ThreadUnsafeUnigramSampler>);
class AllCandidateSamplerOp : public BaseCandidateSamplerOp { class AllCandidateSamplerOp : public BaseCandidateSamplerOp {
@ -197,7 +197,8 @@ class ComputeAccidentalHitsOp : public OpKernel {
void Compute(OpKernelContext* context) override { void Compute(OpKernelContext* context) override {
const Tensor& in_true_candidates = context->input(0); const Tensor& in_true_candidates = context->input(0);
const TensorShape& in_true_candidates_shape = in_true_candidates.shape(); const TensorShape& in_true_candidates_shape = in_true_candidates.shape();
OP_REQUIRES(context, TensorShapeUtils::IsMatrix(in_true_candidates_shape) && OP_REQUIRES(context,
TensorShapeUtils::IsMatrix(in_true_candidates_shape) &&
in_true_candidates_shape.dim_size(1) == num_true_, in_true_candidates_shape.dim_size(1) == num_true_,
errors::InvalidArgument( errors::InvalidArgument(
"true_candidates must be a batch_size * num_true matrix")); "true_candidates must be a batch_size * num_true matrix"));


@ -252,4 +252,3 @@ REGISTER_KERNEL_BUILDER(
CpuCastOp); CpuCastOp);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // end namespace tensorflow } // end namespace tensorflow


@ -131,7 +131,8 @@ struct scalar_cast_op<::tensorflow::bfloat16, float> {
p[0] = a.value; p[0] = a.value;
p[1] = 0; p[1] = 0;
#else #else
static_assert(::tensorflow::port::kLittleEndian, "Not a little endian system!"); static_assert(::tensorflow::port::kLittleEndian,
"Not a little endian system!");
p[0] = 0; p[0] = 0;
p[1] = a.value; p[1] = a.value;
#endif #endif


@ -108,12 +108,12 @@ class HSVToRGBOp : public OpKernel {
}; };
#define REGISTER_CPU(T) \ #define REGISTER_CPU(T) \
REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_CPU) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("RGBToHSV").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
RGBToHSVOp<CPUDevice, T>); \ RGBToHSVOp<CPUDevice, T>); \
template class RGBToHSVOp<CPUDevice, T>; \ template class RGBToHSVOp<CPUDevice, T>; \
REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_CPU) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("HSVToRGB").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
HSVToRGBOp<CPUDevice, T>); \ HSVToRGBOp<CPUDevice, T>); \
template class HSVToRGBOp<CPUDevice, T>; template class HSVToRGBOp<CPUDevice, T>;
TF_CALL_float(REGISTER_CPU); TF_CALL_float(REGISTER_CPU);
@ -125,25 +125,24 @@ TF_CALL_double(REGISTER_CPU);
namespace functor { namespace functor {
#define DECLARE_GPU(T) \ #define DECLARE_GPU(T) \
template <> \ template <> \
void RGBToHSV<GPUDevice, T>::operator()(const GPUDevice& d, \ void RGBToHSV<GPUDevice, T>::operator()( \
TTypes<T, 2>::ConstTensor input_data, \ const GPUDevice& d, TTypes<T, 2>::ConstTensor input_data, \
TTypes<T, 1>::Tensor range, \ TTypes<T, 1>::Tensor range, TTypes<T, 2>::Tensor output_data); \
TTypes<T, 2>::Tensor output_data); \
extern template struct RGBToHSV<GPUDevice, T>; \ extern template struct RGBToHSV<GPUDevice, T>; \
template <> \ template <> \
void HSVToRGB<GPUDevice, T>::operator()(const GPUDevice& d, \ void HSVToRGB<GPUDevice, T>::operator()( \
TTypes<T, 2>::ConstTensor input_data, \ const GPUDevice& d, TTypes<T, 2>::ConstTensor input_data, \
TTypes<T, 2>::Tensor output_data); \ TTypes<T, 2>::Tensor output_data); \
extern template struct HSVToRGB<GPUDevice, T>; extern template struct HSVToRGB<GPUDevice, T>;
TF_CALL_float(DECLARE_GPU); TF_CALL_float(DECLARE_GPU);
TF_CALL_double(DECLARE_GPU); TF_CALL_double(DECLARE_GPU);
} // namespace functor } // namespace functor
#define REGISTER_GPU(T) \ #define REGISTER_GPU(T) \
REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_GPU) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("RGBToHSV").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
RGBToHSVOp<GPUDevice, T>); \ RGBToHSVOp<GPUDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_GPU) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("HSVToRGB").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
HSVToRGBOp<GPUDevice, T>); HSVToRGBOp<GPUDevice, T>);
TF_CALL_float(REGISTER_GPU); TF_CALL_float(REGISTER_GPU);
TF_CALL_double(REGISTER_GPU); TF_CALL_double(REGISTER_GPU);
@ -151,11 +150,11 @@ TF_CALL_double(REGISTER_GPU);
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL(T) \ #define REGISTER_SYCL(T) \
REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_SYCL) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("RGBToHSV").Device(DEVICE_SYCL).TypeConstraint<T>("T"), \
RGBToHSVOp<SYCLDevice, T>); \ RGBToHSVOp<SYCLDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_SYCL) \ REGISTER_KERNEL_BUILDER( \
.TypeConstraint<T>("T"), \ Name("HSVToRGB").Device(DEVICE_SYCL).TypeConstraint<T>("T"), \
HSVToRGBOp<SYCLDevice, T>); HSVToRGBOp<SYCLDevice, T>);
TF_CALL_float(REGISTER_SYCL); TF_CALL_float(REGISTER_SYCL);
TF_CALL_double(REGISTER_SYCL); TF_CALL_double(REGISTER_SYCL);


@ -54,9 +54,8 @@ struct RGBToHSV {
// TODO(wicke): all these assignments are only necessary because a combined // TODO(wicke): all these assignments are only necessary because a combined
// expression is larger than kernel parameter space. A custom kernel is // expression is larger than kernel parameter space. A custom kernel is
// probably in order. // probably in order.
H.device(d) = (R == V).select(norm * (G - B), H.device(d) = (R == V).select(
(G == V).select( norm * (G - B), (G == V).select(norm * (B - R) + T(2) / T(6),
norm * (B - R) + T(2) / T(6),
norm * (R - G) + T(4) / T(6))); norm * (R - G) + T(4) / T(6)));
H.device(d) = (range > T(0)).select(H, H.constant(T(0))); H.device(d) = (range > T(0)).select(H, H.constant(T(0)));
H.device(d) = (H < T(0)).select(H + T(1), H); H.device(d) = (H < T(0)).select(H + T(1), H);


@ -17,8 +17,8 @@ limitations under the License.
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "tensorflow/core/kernels/colorspace_op.h"
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/colorspace_op.h"
namespace tensorflow { namespace tensorflow {
@ -29,6 +29,6 @@ typedef Eigen::GpuDevice GPUDevice;
template class functor::HSVToRGB<GPUDevice, T>; template class functor::HSVToRGB<GPUDevice, T>;
TF_CALL_float(INSTANTIATE_GPU); TF_CALL_float(INSTANTIATE_GPU);
TF_CALL_double(INSTANTIATE_GPU); TF_CALL_double(INSTANTIATE_GPU);
} } // namespace tensorflow
#endif // GOOGLE_CUDA #endif // GOOGLE_CUDA

@@ -41,9 +41,10 @@ namespace tensorflow {
// Assumes all inputs are nonempty // Assumes all inputs are nonempty
template <typename T> template <typename T>
void ConcatCPU(DeviceBase* d, void ConcatCPU(
const std::vector< DeviceBase* d,
std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs, const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
inputs,
typename TTypes<T, 2>::Matrix* output); typename TTypes<T, 2>::Matrix* output);
#if GOOGLE_CUDA #if GOOGLE_CUDA
template <typename T> template <typename T>
@@ -57,9 +58,10 @@ void ConcatGPU(
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
template <typename T> template <typename T>
void ConcatSYCL(const Eigen::SyclDevice& d, void ConcatSYCL(
const std::vector< const Eigen::SyclDevice& d,
std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs, const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
inputs,
typename TTypes<T, 2>::Matrix* output); typename TTypes<T, 2>::Matrix* output);
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow } // namespace tensorflow

@@ -48,9 +48,10 @@ struct MemCpyCopier<ResourceHandle> {
} // namespace } // namespace
template <typename T> template <typename T>
void ConcatCPU(DeviceBase* d, void ConcatCPU(
const std::vector< DeviceBase* d,
std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs, const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
inputs,
typename TTypes<T, 2>::Matrix* output) { typename TTypes<T, 2>::Matrix* output) {
if (std::is_same<T, string>::value) { if (std::is_same<T, string>::value) {
// use a large cost here to force strings to be handled by separate threads // use a large cost here to force strings to be handled by separate threads
@@ -86,9 +87,10 @@ TF_CALL_variant(REGISTER)
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
template <typename T> template <typename T>
void ConcatSYCL(const Eigen::SyclDevice& d, void ConcatSYCL(
const std::vector< const Eigen::SyclDevice& d,
std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs, const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
inputs,
typename TTypes<T, 2>::Matrix* output) { typename TTypes<T, 2>::Matrix* output) {
ConcatSYCLImpl<T>(d, inputs, sizeof(T) /* cost_per_unit */, MemCpyCopier<T>(), ConcatSYCLImpl<T>(d, inputs, sizeof(T) /* cost_per_unit */, MemCpyCopier<T>(),
output); output);

@@ -15,9 +15,9 @@ limitations under the License.
#define EIGEN_USE_THREADS #define EIGEN_USE_THREADS
#include "tensorflow/core/kernels/concat_lib.h"
#include <vector> #include <vector>
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/concat_lib.h"
#include "tensorflow/core/util/work_sharder.h" #include "tensorflow/core/util/work_sharder.h"
namespace tensorflow { namespace tensorflow {

@@ -71,7 +71,8 @@ class ConcatBaseOp : public OpKernel {
const TensorShape& input_shape = values[0].shape(); const TensorShape& input_shape = values[0].shape();
int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim; int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim;
OP_REQUIRES(c, (0 <= axis && axis < input_dims) || OP_REQUIRES(c,
(0 <= axis && axis < input_dims) ||
(allow_legacy_scalars() && concat_dim == 0), (allow_legacy_scalars() && concat_dim == 0),
errors::InvalidArgument( errors::InvalidArgument(
"ConcatOp : Expected concatenating dimensions in the range " "ConcatOp : Expected concatenating dimensions in the range "
@@ -97,8 +98,8 @@ class ConcatBaseOp : public OpKernel {
c, in.dims() == input_dims || (input_is_scalar && in_is_scalar), c, in.dims() == input_dims || (input_is_scalar && in_is_scalar),
errors::InvalidArgument( errors::InvalidArgument(
"ConcatOp : Ranks of all input tensors should match: shape[0] = ", "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
input_shape.DebugString(), " vs. shape[", i, "] = ", input_shape.DebugString(), " vs. shape[", i,
in.shape().DebugString())); "] = ", in.shape().DebugString()));
for (int j = 0; j < input_dims; ++j) { for (int j = 0; j < input_dims; ++j) {
if (j == axis) { if (j == axis) {
continue; continue;
@@ -107,8 +108,8 @@ class ConcatBaseOp : public OpKernel {
c, in.dim_size(j) == input_shape.dim_size(j), c, in.dim_size(j) == input_shape.dim_size(j),
errors::InvalidArgument( errors::InvalidArgument(
"ConcatOp : Dimensions of inputs should match: shape[0] = ", "ConcatOp : Dimensions of inputs should match: shape[0] = ",
input_shape.DebugString(), " vs. shape[", i, "] = ", input_shape.DebugString(), " vs. shape[", i,
in.shape().DebugString())); "] = ", in.shape().DebugString()));
} }
if (in.NumElements() > 0) { if (in.NumElements() > 0) {
int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0; int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0;

@@ -157,7 +157,8 @@ BENCHMARK(BM_MemcpyAlternativeDim0)->Arg(1000)->Arg(100000)->Arg(1000000);
BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000); BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000);
typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>, typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>,
Eigen::Unaligned> EigenMap; Eigen::Unaligned>
EigenMap;
static void MemcpyManyAlternative1(int iters, int dim2) { static void MemcpyManyAlternative1(int iters, int dim2) {
testing::StopTiming(); testing::StopTiming();

@@ -99,8 +99,9 @@ class AccumulatorTakeGradientOp
ConditionalAccumulatorBase* accumulator, ConditionalAccumulatorBase* accumulator,
DoneCallback callback) override { DoneCallback callback) override {
// Check signature // Check signature
OP_REQUIRES_OK_ASYNC(ctx, ctx->MatchSignature({DT_STRING_REF, DT_INT32}, OP_REQUIRES_OK_ASYNC(
{accumulator->dtype()}), ctx,
ctx->MatchSignature({DT_STRING_REF, DT_INT32}, {accumulator->dtype()}),
callback); callback);
} }
@@ -111,5 +112,4 @@ class AccumulatorTakeGradientOp
REGISTER_KERNEL_BUILDER(Name("AccumulatorTakeGradient").Device(DEVICE_CPU), REGISTER_KERNEL_BUILDER(Name("AccumulatorTakeGradient").Device(DEVICE_CPU),
AccumulatorTakeGradientOp); AccumulatorTakeGradientOp);
} // namespace tensorflow } // namespace tensorflow

@@ -146,7 +146,6 @@ typedef Eigen::GpuDevice GPUDevice;
typedef Eigen::SyclDevice SYCLDevice; typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
template <typename Device, typename T, typename Index> template <typename Device, typename T, typename Index>
class FillOp : public OpKernel { class FillOp : public OpKernel {
public: public:

@@ -91,6 +91,7 @@ class KilledBySignal {
public: public:
explicit KilledBySignal(int signum) : signum_(signum) {} explicit KilledBySignal(int signum) : signum_(signum) {}
bool operator()(int exit_status) const { return exit_status == signum_; } bool operator()(int exit_status) const { return exit_status == signum_; }
private: private:
const int signum_; const int signum_;
}; };

@@ -679,7 +679,8 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
const int dims = resized_shape.dims(); const int dims = resized_shape.dims();
OP_REQUIRES( OP_REQUIRES(
context, TensorShapeUtils::IsMatrix(paddings.shape()) && context,
TensorShapeUtils::IsMatrix(paddings.shape()) &&
paddings.dim_size(1) == 2, paddings.dim_size(1) == 2,
errors::InvalidArgument("paddings must be a matrix with 2 columns: ", errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
paddings.shape().DebugString())); paddings.shape().DebugString()));
@@ -715,11 +716,12 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
const int32 after = const int32 after =
paddings_matrix(d, 1); // Pad after existing elements. paddings_matrix(d, 1); // Pad after existing elements.
OP_REQUIRES(context, before >= 0 && after >= 0, OP_REQUIRES(context, before >= 0 && after >= 0,
errors::InvalidArgument("paddings must be non-negative: ", errors::InvalidArgument(
before, " ", after)); "paddings must be non-negative: ", before, " ", after));
if (offset_ == 0) { // SYMMETRIC mode. if (offset_ == 0) { // SYMMETRIC mode.
OP_REQUIRES( OP_REQUIRES(
context, before <= resized_shape.dim_size(d) && context,
before <= resized_shape.dim_size(d) &&
after <= resized_shape.dim_size(d), after <= resized_shape.dim_size(d),
errors::InvalidArgument("paddings must be no greater " errors::InvalidArgument("paddings must be no greater "
"than the dimension size: ", "than the dimension size: ",
@@ -727,7 +729,8 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
resized_shape.dim_size(d))); resized_shape.dim_size(d)));
} else if (offset_ == 1) { // REFLECT mode. } else if (offset_ == 1) { // REFLECT mode.
OP_REQUIRES( OP_REQUIRES(
context, before < resized_shape.dim_size(d) && context,
before < resized_shape.dim_size(d) &&
after < resized_shape.dim_size(d), after < resized_shape.dim_size(d),
errors::InvalidArgument("paddings must be less than" errors::InvalidArgument("paddings must be less than"
" the dimension size: ", " the dimension size: ",
@@ -767,18 +770,19 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
// We only check the first three dims, since the depth is accessed as an // We only check the first three dims, since the depth is accessed as an
// int64 below. // int64 below.
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i), OP_REQUIRES(
std::numeric_limits<int>::max()), context,
FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
errors::InvalidArgument("filter too large")); errors::InvalidArgument("filter too large"));
} }
// The last dimension for input is in_depth. It must be the same as the // The last dimension for input is in_depth. It must be the same as the
// filter's in_depth. // filter's in_depth.
const int64 in_depth = padded_shape.dim_size(3); const int64 in_depth = padded_shape.dim_size(3);
OP_REQUIRES( OP_REQUIRES(context, in_depth == filter.dim_size(2),
context, in_depth == filter.dim_size(2), errors::InvalidArgument(
errors::InvalidArgument("input and filter must have the same depth: ", "input and filter must have the same depth: ", in_depth,
in_depth, " vs ", filter.dim_size(2))); " vs ", filter.dim_size(2)));
// The last dimension for filter is out_depth. // The last dimension for filter is out_depth.
const int out_depth = static_cast<int>(filter.dim_size(3)); const int out_depth = static_cast<int>(filter.dim_size(3));
@@ -786,8 +790,9 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
// The second dimension for input is rows/height. // The second dimension for input is rows/height.
// The first dimension for filter is rows/height. // The first dimension for filter is rows/height.
const int64 padded_rows_raw = padded_shape.dim_size(1); const int64 padded_rows_raw = padded_shape.dim_size(1);
OP_REQUIRES(context, FastBoundsCheck(padded_rows_raw, OP_REQUIRES(
std::numeric_limits<int>::max()), context,
FastBoundsCheck(padded_rows_raw, std::numeric_limits<int>::max()),
errors::InvalidArgument("Input rows too large")); errors::InvalidArgument("Input rows too large"));
const int padded_rows = static_cast<int>(padded_rows_raw); const int padded_rows = static_cast<int>(padded_rows_raw);
const int filter_rows = static_cast<int>(filter.dim_size(0)); const int filter_rows = static_cast<int>(filter.dim_size(0));
@@ -796,8 +801,9 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
// The third dimension for input is columns/width. // The third dimension for input is columns/width.
// The second dimension for filter is columns/width. // The second dimension for filter is columns/width.
const int64 padded_cols_raw = padded_shape.dim_size(2); const int64 padded_cols_raw = padded_shape.dim_size(2);
OP_REQUIRES(context, FastBoundsCheck(padded_cols_raw, OP_REQUIRES(
std::numeric_limits<int>::max()), context,
FastBoundsCheck(padded_cols_raw, std::numeric_limits<int>::max()),
errors::InvalidArgument("Input cols too large")); errors::InvalidArgument("Input cols too large"));
const int padded_cols = static_cast<int>(padded_cols_raw); const int padded_cols = static_cast<int>(padded_cols_raw);
const int filter_cols = static_cast<int>(filter.dim_size(1)); const int filter_cols = static_cast<int>(filter.dim_size(1));
@@ -870,7 +876,8 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
.Device(DEVICE_CPU) \ .Device(DEVICE_CPU) \
.TypeConstraint<T>("T"), \ .TypeConstraint<T>("T"), \
FusedResizeConv2DUsingGemmOp< \ FusedResizeConv2DUsingGemmOp< \
T, FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \ T, \
FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
BILINEAR>, \ BILINEAR>, \
true>); true>);
@@ -880,7 +887,8 @@ TF_CALL_float(REGISTER_FUSED);
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
FusedResizeConv2DUsingGemmOp< \ FusedResizeConv2DUsingGemmOp< \
T, FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \ T, \
FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
NEAREST>, \ NEAREST>, \
false>); false>);

@@ -27,7 +27,6 @@ limitations under the License.
namespace tensorflow { namespace tensorflow {
// Get the Cudnn workspace limit from the environment variable, which is in MB. // Get the Cudnn workspace limit from the environment variable, which is in MB.
// Return the workspace memory limit in bytes. If no value is set, return the // Return the workspace memory limit in bytes. If no value is set, return the
// default value. // default value.

@@ -25,9 +25,9 @@ limitations under the License.
#include "cuda/include/cuda.h" #include "cuda/include/cuda.h"
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/conv_2d.h" #include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/lib/math/math_util.h"
#include "tensorflow/core/util/cuda_kernel_helper.h" #include "tensorflow/core/util/cuda_kernel_helper.h"
#include "tensorflow/core/util/tensor_format.h" #include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/lib/math/math_util.h"
namespace tensorflow { namespace tensorflow {
@@ -252,11 +252,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
int x = threadIdx.x; int x = threadIdx.x;
Dimension<3> output_dims = { Dimension<3> output_dims = {
input_dims[0], input_dims[2], input_dims[1], input_dims[0],
input_dims[2],
input_dims[1],
}; };
Dimension<3> input_dims_in_tiles = { Dimension<3> input_dims_in_tiles = {
input_dims[0], (input_dims[1] + TileSizeI - 1) / TileSizeI, input_dims[0],
(input_dims[1] + TileSizeI - 1) / TileSizeI,
(input_dims[2] + TileSizeJ - 1) / TileSizeJ, (input_dims[2] + TileSizeJ - 1) / TileSizeJ,
}; };
@@ -264,7 +267,8 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
FlatToTensorIndex(blockIdx.x, input_dims_in_tiles); FlatToTensorIndex(blockIdx.x, input_dims_in_tiles);
Index<3> input_tile_origin = { Index<3> input_tile_origin = {
input_tile_index[0], input_tile_index[1] * TileSizeI, input_tile_index[0],
input_tile_index[1] * TileSizeI,
input_tile_index[2] * TileSizeJ, input_tile_index[2] * TileSizeJ,
}; };
@@ -322,11 +326,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
__syncthreads(); __syncthreads();
Index<3> output_tile_index = { Index<3> output_tile_index = {
input_tile_index[0], input_tile_index[2], input_tile_index[1], input_tile_index[0],
input_tile_index[2],
input_tile_index[1],
}; };
Index<3> output_tile_origin = { Index<3> output_tile_origin = {
output_tile_index[0], output_tile_index[1] * TileSizeJ, output_tile_index[0],
output_tile_index[1] * TileSizeJ,
output_tile_index[2] * TileSizeI, output_tile_index[2] * TileSizeI,
}; };
@@ -902,19 +909,21 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input,
constexpr int kNumThreads = 256; constexpr int kNumThreads = 256;
Dimension<3> input_dims_in_tiles = { Dimension<3> input_dims_in_tiles = {
input_dims[0], MathUtil::CeilOfRatio<int>(input_dims[1], kTileSize), input_dims[0],
MathUtil::CeilOfRatio<int>(input_dims[1], kTileSize),
MathUtil::CeilOfRatio<int>(input_dims[2], kTileSize), MathUtil::CeilOfRatio<int>(input_dims[2], kTileSize),
}; };
int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] * int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] *
input_dims_in_tiles[2]; input_dims_in_tiles[2];
SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize, kTileSize, conjugate> SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize, kTileSize,
conjugate>
<<<total_tiles_count, kNumThreads, 0, d.stream()>>>(input, input_dims, <<<total_tiles_count, kNumThreads, 0, d.stream()>>>(input, input_dims,
output); output);
} else if (narrow_matrix) { } else if (narrow_matrix) {
SwapDimension1And2InTensor3WithNarrowMatrices<T, conjugate>(d, input, input_dims, output, SwapDimension1And2InTensor3WithNarrowMatrices<T, conjugate>(
kMinDimensionToUseTiles); d, input, input_dims, output, kMinDimensionToUseTiles);
} else { } else {
int total_element_count = input_dims[0] * input_dims[1] * input_dims[2]; int total_element_count = input_dims[0] * input_dims[1] * input_dims[2];
CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d); CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d);
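// --- Illustrative aside (not part of the diff above) ----------------------
// The tiled launch in RunSwapDimension1And2InTensor3 covers each [dim1, dim2]
// plane with ceil(dim1/kTileSize) x ceil(dim2/kTileSize) tiles and launches
// one CUDA block per tile. A minimal sketch of that tile-count arithmetic
// follows; the tile size of 32 is an illustrative value, not taken from this
// change.
#include <cstdint>

constexpr int64_t CeilOfRatio(int64_t a, int64_t b) { return (a + b - 1) / b; }

int64_t TotalTileCount(int64_t batch, int64_t dim1, int64_t dim2,
                       int64_t tile_size = 32) {
  const int64_t tiles_dim1 = CeilOfRatio(dim1, tile_size);
  const int64_t tiles_dim2 = CeilOfRatio(dim2, tile_size);
  return batch * tiles_dim1 * tiles_dim2;  // number of blocks in the grid
}
// ---------------------------------------------------------------------------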

@@ -468,18 +468,19 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
filter.shape().DebugString())); filter.shape().DebugString()));
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i), OP_REQUIRES(
std::numeric_limits<int>::max()), context,
FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
errors::InvalidArgument("filter too large")); errors::InvalidArgument("filter too large"));
} }
// The last dimension for input is in_depth. It must be the same as the // The last dimension for input is in_depth. It must be the same as the
// filter's in_depth. // filter's in_depth.
const int64 in_depth = GetTensorDim(input, data_format_, 'C'); const int64 in_depth = GetTensorDim(input, data_format_, 'C');
OP_REQUIRES( OP_REQUIRES(context, in_depth == filter.dim_size(2),
context, in_depth == filter.dim_size(2), errors::InvalidArgument(
errors::InvalidArgument("input and filter must have the same depth: ", "input and filter must have the same depth: ", in_depth,
in_depth, " vs ", filter.dim_size(2))); " vs ", filter.dim_size(2)));
// The last dimension for filter is out_depth. // The last dimension for filter is out_depth.
const int out_depth = static_cast<int>(filter.dim_size(3)); const int out_depth = static_cast<int>(filter.dim_size(3));
@@ -487,8 +488,9 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
// The second dimension for input is rows/height. // The second dimension for input is rows/height.
// The first dimension for filter is rows/height. // The first dimension for filter is rows/height.
const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H'); const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H');
OP_REQUIRES(context, FastBoundsCheck(input_rows_raw, OP_REQUIRES(
std::numeric_limits<int>::max()), context,
FastBoundsCheck(input_rows_raw, std::numeric_limits<int>::max()),
errors::InvalidArgument("Input rows too large")); errors::InvalidArgument("Input rows too large"));
const int input_rows = static_cast<int>(input_rows_raw); const int input_rows = static_cast<int>(input_rows_raw);
const int filter_rows = static_cast<int>(filter.dim_size(0)); const int filter_rows = static_cast<int>(filter.dim_size(0));
@@ -496,8 +498,9 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
// The third dimension for input is columns/width. // The third dimension for input is columns/width.
// The second dimension for filter is columns/width. // The second dimension for filter is columns/width.
const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W'); const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W');
OP_REQUIRES(context, FastBoundsCheck(input_cols_raw, OP_REQUIRES(
std::numeric_limits<int>::max()), context,
FastBoundsCheck(input_cols_raw, std::numeric_limits<int>::max()),
errors::InvalidArgument("Input cols too large")); errors::InvalidArgument("Input cols too large"));
const int input_cols = static_cast<int>(input_cols_raw); const int input_cols = static_cast<int>(input_cols_raw);
const int filter_cols = static_cast<int>(filter.dim_size(1)); const int filter_cols = static_cast<int>(filter.dim_size(1));

@@ -17,8 +17,8 @@ limitations under the License.
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "tensorflow/core/kernels/cross_op.h"
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/cross_op.h"
namespace tensorflow { namespace tensorflow {

@@ -19,13 +19,13 @@ limitations under the License.
#include <limits> #include <limits>
#include "tensorflow/core/util/ctc/ctc_beam_search.h"
#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/util/ctc/ctc_beam_search.h"
#include "tensorflow/core/util/sparse/sparse_tensor.h" #include "tensorflow/core/util/sparse/sparse_tensor.h"
namespace tensorflow { namespace tensorflow {
@@ -80,16 +80,17 @@ class CTCDecodeHelper {
if (!(batch_size == (*seq_len)->dim_size(0))) { if (!(batch_size == (*seq_len)->dim_size(0))) {
return errors::FailedPrecondition( return errors::FailedPrecondition(
"len(sequence_length) != batch_size. ", "len(sequence_length): ", "len(sequence_length) != batch_size. ",
(*seq_len)->dim_size(0), " batch_size: ", batch_size); "len(sequence_length): ", (*seq_len)->dim_size(0),
" batch_size: ", batch_size);
} }
auto seq_len_t = (*seq_len)->vec<int32>(); auto seq_len_t = (*seq_len)->vec<int32>();
for (int b = 0; b < batch_size; ++b) { for (int b = 0; b < batch_size; ++b) {
if (!(seq_len_t(b) <= max_time)) { if (!(seq_len_t(b) <= max_time)) {
return errors::FailedPrecondition("sequence_length(", b, ") <= ", return errors::FailedPrecondition("sequence_length(", b,
max_time); ") <= ", max_time);
} }
} }

@@ -113,8 +113,8 @@ class CTCLossOp : public OpKernel {
const int64 batch_indices = g.group()[0]; const int64 batch_indices = g.group()[0];
OP_REQUIRES(ctx, FastBoundsCheck(batch_indices, batch_size), OP_REQUIRES(ctx, FastBoundsCheck(batch_indices, batch_size),
errors::InvalidArgument("labels batch index must be between ", errors::InvalidArgument("labels batch index must be between ",
0, " and ", batch_size, " but saw: ", 0, " and ", batch_size,
batch_indices)); " but saw: ", batch_indices));
auto values = g.values<int32>(); auto values = g.values<int32>();
std::vector<int>* b_values = &labels_t[batch_indices]; std::vector<int>* b_values = &labels_t[batch_indices];

@@ -17,8 +17,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_gradients.h" #include "tensorflow/core/kernels/cwise_ops_gradients.h"
namespace tensorflow { namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double, REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double, complex64,
complex64, complex128); complex128);
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Acosh", functor::acosh, float, double); REGISTER2(UnaryOp, SYCL, "Acosh", functor::acosh, float, double);

@@ -44,7 +44,6 @@ REGISTER_KERNEL_BUILDER(Name("AddV2")
BinaryOp<CPUDevice, functor::add<int32>>); BinaryOp<CPUDevice, functor::add<int32>>);
#endif #endif
#if TENSORFLOW_USE_SYCL #if TENSORFLOW_USE_SYCL
#define REGISTER_KERNEL(type) \ #define REGISTER_KERNEL(type) \
REGISTER(BinaryOp, SYCL, "Add", functor::add, type); \ REGISTER(BinaryOp, SYCL, "Add", functor::add, type); \

@@ -22,8 +22,8 @@ namespace tensorflow {
// sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__. // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
#if !defined(__ANDROID_TYPES_SLIM__) #if !defined(__ANDROID_TYPES_SLIM__)
REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64, REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64, uint8,
uint8, complex128, string); complex128, string);
// Notice: String is excluded to allow marking AddV2 is_commutative and // Notice: String is excluded to allow marking AddV2 is_commutative and
// is_aggregate. // is_aggregate.
REGISTER5(BinaryOp, CPU, "AddV2", functor::add, int8, int16, complex64, uint8, REGISTER5(BinaryOp, CPU, "AddV2", functor::add, int8, int16, complex64, uint8,

@@ -17,8 +17,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_gradients.h" #include "tensorflow/core/kernels/cwise_ops_gradients.h"
namespace tensorflow { namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double, REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double, complex64,
complex64, complex128); complex128);
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Asinh", functor::asinh, float, double); REGISTER2(UnaryOp, SYCL, "Asinh", functor::asinh, float, double);

@@ -17,8 +17,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_gradients.h" #include "tensorflow/core/kernels/cwise_ops_gradients.h"
namespace tensorflow { namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double, REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double, complex64,
complex64, complex128); complex128);
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Atanh", functor::atanh, float, double); REGISTER2(UnaryOp, SYCL, "Atanh", functor::atanh, float, double);

@@ -16,15 +16,13 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h" #include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow { namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double, REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double, complex64,
complex64, complex128); complex128);
#if TENSORFLOW_USE_SYCL #if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \ #define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
Name("Cosh") \ Name("Cosh").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::cosh<TYPE>>); UnaryOp<SYCLDevice, functor::cosh<TYPE>>);
REGISTER_SYCL_KERNEL(float); REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double); REGISTER_SYCL_KERNEL(double);

@@ -15,8 +15,10 @@ limitations under the License.
#if GOOGLE_CUDA #if GOOGLE_CUDA
#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" #define EIGEN_USE_GPU
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
namespace tensorflow { namespace tensorflow {
namespace functor { namespace functor {
@@ -38,7 +40,6 @@ struct SelectScalarFunctor<GPUDevice, T> {
typename TTypes<bool>::ConstScalar cond, typename TTypes<bool>::ConstScalar cond,
typename TTypes<T>::ConstFlat then_flat, typename TTypes<T>::ConstFlat then_flat,
typename TTypes<T>::ConstFlat else_flat) { typename TTypes<T>::ConstFlat else_flat) {
#if !defined(EIGEN_HAS_INDEX_LIST) #if !defined(EIGEN_HAS_INDEX_LIST)
Eigen::array<int, 1> rank1{1}; Eigen::array<int, 1> rank1{1};
#else #else
@@ -50,7 +51,6 @@ struct SelectScalarFunctor<GPUDevice, T> {
To32Bit(out).device(d) = cond.reshape(rank1) To32Bit(out).device(d) = cond.reshape(rank1)
.broadcast(broadcast_dims) .broadcast(broadcast_dims)
.select(then_flat, else_flat); .select(then_flat, else_flat);
} }
}; };

@@ -35,7 +35,8 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
#endif #endif
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float, double); REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float,
double);
REGISTER_KERNEL_BUILDER(Name("GreaterEqual") REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
.Device(DEVICE_SYCL) .Device(DEVICE_SYCL)

@@ -17,8 +17,8 @@ limitations under the License.
namespace tensorflow { namespace tensorflow {
REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, uint8,
uint8, int32); int32);
#if defined(__ANDROID_TYPES_SLIM__) #if defined(__ANDROID_TYPES_SLIM__)
// We only register the first type when we have multi-argument calls in the // We only register the first type when we have multi-argument calls in the
// case where we're trying to reduce executable size, but it turns out that the // case where we're trying to reduce executable size, but it turns out that the

@@ -22,8 +22,8 @@ namespace tensorflow {
// sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__. // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
#if !defined(__ANDROID_TYPES_SLIM__) #if !defined(__ANDROID_TYPES_SLIM__)
REGISTER6(BinaryOp, CPU, "Mul", functor::mul, REGISTER6(BinaryOp, CPU, "Mul", functor::mul, int8, uint16, int16, int64,
int8, uint16, int16, int64, complex64, complex128); complex64, complex128);
#if GOOGLE_CUDA #if GOOGLE_CUDA
REGISTER6(BinaryOp, GPU, "Mul", functor::mul, int8, uint16, int16, int64, REGISTER6(BinaryOp, GPU, "Mul", functor::mul, int8, uint16, int16, int64,
complex64, complex128); complex64, complex128);

@@ -201,12 +201,10 @@ struct SelectFunctorBase {
}; };
template <typename T> template <typename T>
struct SelectFunctor<CPUDevice, T> struct SelectFunctor<CPUDevice, T> : SelectFunctorBase<CPUDevice, T> {};
: SelectFunctorBase<CPUDevice, T> {};
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
template <typename T> template <typename T>
struct SelectFunctor<SYCLDevice, T> struct SelectFunctor<SYCLDevice, T> : SelectFunctorBase<SYCLDevice, T> {};
: SelectFunctorBase<SYCLDevice, T> {};
#endif // TENSORFLOW_USE_SYCL #endif // TENSORFLOW_USE_SYCL
template <typename Device, typename T> template <typename Device, typename T>
@@ -257,8 +255,8 @@ struct BatchSelectFunctorBase {
}; };
template <typename T> template <typename T>
struct BatchSelectFunctor<CPUDevice, T> struct BatchSelectFunctor<CPUDevice, T> : BatchSelectFunctorBase<CPUDevice, T> {
: BatchSelectFunctorBase<CPUDevice, T> {}; };
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
template <typename T> template <typename T>
struct BatchSelectFunctor<SYCLDevice, T> struct BatchSelectFunctor<SYCLDevice, T>

@@ -16,15 +16,13 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h" #include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow { namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double, REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double, complex64,
complex64, complex128); complex128);
#if TENSORFLOW_USE_SYCL #if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \ #define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
Name("Sinh") \ Name("Sinh").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::sinh<TYPE>>); UnaryOp<SYCLDevice, functor::sinh<TYPE>>);
REGISTER_SYCL_KERNEL(float); REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double); REGISTER_SYCL_KERNEL(double);

@@ -57,8 +57,8 @@ BinaryOpShared::BinaryOpState::BinaryOpState(OpKernelContext* ctx)
in1(ctx->input(1)), in1(ctx->input(1)),
bcast(BCast::FromShape(in0.shape()), BCast::FromShape(in1.shape())) { bcast(BCast::FromShape(in0.shape()), BCast::FromShape(in1.shape())) {
if (!bcast.IsValid()) { if (!bcast.IsValid()) {
ctx->SetStatus(errors::InvalidArgument("Incompatible shapes: ", ctx->SetStatus(errors::InvalidArgument(
in0.shape().DebugString(), " vs. ", "Incompatible shapes: ", in0.shape().DebugString(), " vs. ",
in1.shape().DebugString())); in1.shape().DebugString()));
return; return;
} }

@@ -171,7 +171,6 @@ struct SimpleBinaryFunctor<CPUDevice, Functor> {
} }
}; };
#ifdef TENSORFLOW_USE_SYCL #ifdef TENSORFLOW_USE_SYCL
// Partial specialization of BinaryFunctor for SYCL devices // Partial specialization of BinaryFunctor for SYCL devices
typedef Eigen::SyclDevice SYCLDevice; typedef Eigen::SyclDevice SYCLDevice;

@@ -51,7 +51,8 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
void operator()(const SYCLDevice& d, typename Functor::tout_type out, void operator()(const SYCLDevice& d, typename Functor::tout_type out,
typename Functor::tin_type in0, typename Functor::tin_type in0,
typename Functor::tin_type in1, bool* error) { typename Functor::tin_type in1, bool* error) {
To32Bit(out).device(d) = To32Bit(in0).binaryExpr(To32Bit(in1), typename Functor::func()); To32Bit(out).device(d) =
To32Bit(in0).binaryExpr(To32Bit(in1), typename Functor::func());
} }
void Left(const SYCLDevice& d, typename Functor::tout_type out, void Left(const SYCLDevice& d, typename Functor::tout_type out,
@@ -61,7 +62,9 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
constexpr int NumDims = Functor::tin_type::NumDimensions; constexpr int NumDims = Functor::tin_type::NumDimensions;
static_assert(NumDims == 1, "Unexpected size"); static_assert(NumDims == 1, "Unexpected size");
Eigen::Sizes<1> scalar_dim; Eigen::Sizes<1> scalar_dim;
out.device(d) = scalar.reshape(scalar_dim).broadcast(in.dimensions()).binaryExpr(in, Binary()); out.device(d) = scalar.reshape(scalar_dim)
.broadcast(in.dimensions())
.binaryExpr(in, Binary());
} }
void Right(const SYCLDevice& d, typename Functor::tout_type out, void Right(const SYCLDevice& d, typename Functor::tout_type out,
@@ -71,7 +74,8 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
constexpr int NumDims = Functor::tin_type::NumDimensions; constexpr int NumDims = Functor::tin_type::NumDimensions;
static_assert(NumDims == 1, "Unexpected size"); static_assert(NumDims == 1, "Unexpected size");
Eigen::Sizes<1> scalar_dim; Eigen::Sizes<1> scalar_dim;
out.device(d) = in.binaryExpr(scalar.reshape(scalar_dim).broadcast(in.dimensions()), Binary()); out.device(d) = in.binaryExpr(
scalar.reshape(scalar_dim).broadcast(in.dimensions()), Binary());
} }
void BCast(const SYCLDevice& d, void BCast(const SYCLDevice& d,

@@ -430,13 +430,10 @@ class IteratorStateVariant {
REGISTER_UNARY_VARIANT_DECODE_FUNCTION(IteratorStateVariant, REGISTER_UNARY_VARIANT_DECODE_FUNCTION(IteratorStateVariant,
kIteratorVariantTypeName); kIteratorVariantTypeName);
// TODO(mrry): Can we simply use the template kernel here?
class IteratorHandleOp : public OpKernel { class IteratorHandleOp : public OpKernel {
public: public:
explicit IteratorHandleOp(OpKernelConstruction* ctx) explicit IteratorHandleOp(OpKernelConstruction* ctx)
: OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) { : OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) {
OP_REQUIRES_OK(ctx, ctx->allocate_persistent(DT_STRING, TensorShape({2}),
&handle_, nullptr));
OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_));
OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_));
@@ -460,20 +457,21 @@ class IteratorHandleOp : public OpKernel {
} }
void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) { void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) {
{
mutex_lock l(mu_); mutex_lock l(mu_);
if (resource_ == nullptr) {
FunctionLibraryRuntime* lib = context->function_library(); FunctionLibraryRuntime* lib = context->function_library();
std::unique_ptr<DeviceMgr> device_mgr(nullptr); std::unique_ptr<DeviceMgr> device_mgr(nullptr);
std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr); std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr); std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
// If the iterator is shared then we construct a new FLR, and pass that in. // If the iterator is shared then we construct a new FLR, and pass that
// NOTE(mrry,rohanj): In this case it is not possible to call remote // in. NOTE(mrry,rohanj): In this case it is not possible to call remote
// functions from the iterator. We may add this functionality if there // functions from the iterator. We may add this functionality if there
// is sufficient demand, but it will require a significant refactoring. // is sufficient demand, but it will require a significant refactoring.
if (!name_.empty()) { if (!name_.empty()) {
lib = CreateFLR(context, &device_mgr, &flib_def, &pflr); lib = CreatePrivateFLR(context, &device_mgr, &flib_def, &pflr);
} }
if (resource_ == nullptr) {
ResourceMgr* mgr = context->resource_manager(); ResourceMgr* mgr = context->resource_manager();
OP_REQUIRES_OK(context, cinfo_.Init(mgr, def())); OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
@@ -482,8 +480,8 @@ class IteratorHandleOp : public OpKernel {
context, context,
mgr->LookupOrCreate<IteratorResource>( mgr->LookupOrCreate<IteratorResource>(
cinfo_.container(), cinfo_.name(), &resource, cinfo_.container(), cinfo_.name(), &resource,
[lib, &device_mgr, &flib_def, &pflr, this](IteratorResource** ret) [lib, &device_mgr, &flib_def, &pflr,
EXCLUSIVE_LOCKS_REQUIRED(mu_) { this](IteratorResource** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
*ret = new IteratorResource( *ret = new IteratorResource(
output_dtypes_, output_shapes_, graph_def_version_, output_dtypes_, output_shapes_, graph_def_version_,
std::move(device_mgr), std::move(flib_def), std::move(device_mgr), std::move(flib_def),
@@ -498,18 +496,12 @@ class IteratorHandleOp : public OpKernel {
return; return;
} }
auto h = handle_.AccessTensor(context)->template flat<string>();
h(0) = cinfo_.container();
h(1) = cinfo_.name();
resource_ = resource; resource_ = resource;
} }
if (context->expected_output_dtype(0) == DT_RESOURCE) { }
OP_REQUIRES_OK(context, MakeResourceHandleToOutput( OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
context, 0, cinfo_.container(), cinfo_.name(), context, 0, cinfo_.container(), cinfo_.name(),
MakeTypeIndex<IteratorResource>())); MakeTypeIndex<IteratorResource>()));
} else {
context->set_output_ref(0, &mu_, handle_.AccessTensor(context));
}
} }
private: private:
@@ -526,7 +518,7 @@ class IteratorHandleOp : public OpKernel {
return Status::OK(); return Status::OK();
} }
FunctionLibraryRuntime* CreateFLR( FunctionLibraryRuntime* CreatePrivateFLR(
OpKernelContext* ctx, std::unique_ptr<DeviceMgr>* device_mgr, OpKernelContext* ctx, std::unique_ptr<DeviceMgr>* device_mgr,
std::unique_ptr<FunctionLibraryDefinition>* flib_def, std::unique_ptr<FunctionLibraryDefinition>* flib_def,
std::unique_ptr<ProcessFunctionLibraryRuntime>* pflr) { std::unique_ptr<ProcessFunctionLibraryRuntime>* pflr) {
@@ -546,9 +538,8 @@ class IteratorHandleOp : public OpKernel {
} }
mutex mu_; mutex mu_;
ContainerInfo cinfo_ GUARDED_BY(mu_); ContainerInfo cinfo_; // Written once under mu_ then constant afterwards.
IteratorResource* resource_ GUARDED_BY(mu_) = nullptr; IteratorResource* resource_ GUARDED_BY(mu_) = nullptr;
PersistentTensor handle_ GUARDED_BY(mu_);
DataTypeVector output_dtypes_; DataTypeVector output_dtypes_;
std::vector<PartialTensorShape> output_shapes_; std::vector<PartialTensorShape> output_shapes_;
const int graph_def_version_; const int graph_def_version_;
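// --- Illustrative aside (not part of the diff above) ----------------------
// The reworked Compute() above follows a "construct the resource at most once
// under the lock, emit a handle on every call" shape. A generic,
// self-contained sketch of that pattern follows; HandleOp, Resource, and the
// string handle are illustrative stand-ins, not the TensorFlow classes.
#include <iostream>
#include <mutex>
#include <string>

struct Resource {
  std::string name;
};

class HandleOp {
 public:
  void Compute() {
    Resource* r = nullptr;
    {
      std::lock_guard<std::mutex> lock(mu_);
      if (resource_ == nullptr) {
        // Slow path: runs only for the first call; racing callers are
        // serialized by the lock, so construction still happens once.
        resource_ = new Resource{"iterator_0"};
      }
      r = resource_;
    }
    // Fast path: every call ends here with an already-constructed resource.
    std::cout << "handle -> " << r->name << "\n";
  }

 private:
  std::mutex mu_;
  Resource* resource_ = nullptr;  // written once under mu_, never freed here
};
// ---------------------------------------------------------------------------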

@@ -91,9 +91,9 @@ class DecodeCSVOp : public OpKernel {
} else { } else {
int32 value; int32 value;
OP_REQUIRES(ctx, strings::safe_strto32(fields[f], &value), OP_REQUIRES(ctx, strings::safe_strto32(fields[f], &value),
errors::InvalidArgument("Field ", f, " in record ", i, errors::InvalidArgument(
" is not a valid int32: ", "Field ", f, " in record ", i,
fields[f])); " is not a valid int32: ", fields[f]));
output[f]->flat<int32>()(i) = value; output[f]->flat<int32>()(i) = value;
} }
break; break;
@@ -111,9 +111,9 @@ class DecodeCSVOp : public OpKernel {
} else { } else {
int64 value; int64 value;
OP_REQUIRES(ctx, strings::safe_strto64(fields[f], &value), OP_REQUIRES(ctx, strings::safe_strto64(fields[f], &value),
errors::InvalidArgument("Field ", f, " in record ", i, errors::InvalidArgument(
" is not a valid int64: ", "Field ", f, " in record ", i,
fields[f])); " is not a valid int64: ", fields[f]));
output[f]->flat<int64>()(i) = value; output[f]->flat<int64>()(i) = value;
} }
break; break;
@@ -130,9 +130,9 @@ class DecodeCSVOp : public OpKernel {
} else { } else {
float value; float value;
OP_REQUIRES(ctx, strings::safe_strtof(fields[f].c_str(), &value), OP_REQUIRES(ctx, strings::safe_strtof(fields[f].c_str(), &value),
errors::InvalidArgument("Field ", f, " in record ", i, errors::InvalidArgument(
" is not a valid float: ", "Field ", f, " in record ", i,
fields[f])); " is not a valid float: ", fields[f]));
output[f]->flat<float>()(i) = value; output[f]->flat<float>()(i) = value;
} }
break; break;
@@ -150,9 +150,9 @@ class DecodeCSVOp : public OpKernel {
} else { } else {
double value; double value;
OP_REQUIRES(ctx, strings::safe_strtod(fields[f].c_str(), &value), OP_REQUIRES(ctx, strings::safe_strtod(fields[f].c_str(), &value),
errors::InvalidArgument("Field ", f, " in record ", i, errors::InvalidArgument(
" is not a valid double: ", "Field ", f, " in record ", i,
fields[f])); " is not a valid double: ", fields[f]));
output[f]->flat<double>()(i) = value; output[f]->flat<double>()(i) = value;
} }
break; break;
@@ -208,7 +208,8 @@ class DecodeCSVOp : public OpKernel {
if (!quoted) { if (!quoted) {
while (static_cast<size_t>(current_idx) < input.size() && while (static_cast<size_t>(current_idx) < input.size() &&
input[current_idx] != delim_) { input[current_idx] != delim_) {
OP_REQUIRES(ctx, (!use_quote_delim_ || input[current_idx] != '"') && OP_REQUIRES(ctx,
(!use_quote_delim_ || input[current_idx] != '"') &&
input[current_idx] != '\n' && input[current_idx] != '\n' &&
input[current_idx] != '\r', input[current_idx] != '\r',
errors::InvalidArgument( errors::InvalidArgument(
@@ -238,7 +239,8 @@ class DecodeCSVOp : public OpKernel {
} }
OP_REQUIRES( OP_REQUIRES(
ctx, (static_cast<size_t>(current_idx) < input.size() && ctx,
(static_cast<size_t>(current_idx) < input.size() &&
input[current_idx] == '"' && input[current_idx] == '"' &&
(static_cast<size_t>(current_idx) == input.size() - 1 || (static_cast<size_t>(current_idx) == input.size() - 1 ||
input[current_idx + 1] == delim_)), input[current_idx + 1] == delim_)),

@@ -87,10 +87,11 @@ class DecodeImageOp : public OpKernel {
channels_ = 3; channels_ = 3;
} else { } else {
OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_)); OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_));
OP_REQUIRES(context, channels_ == 0 || channels_ == 1 || channels_ == 3 || OP_REQUIRES(
channels_ == 4, context,
errors::InvalidArgument( channels_ == 0 || channels_ == 1 || channels_ == 3 || channels_ == 4,
"channels must be 0, 1, 3, or 4, got ", channels_)); errors::InvalidArgument("channels must be 0, 1, 3, or 4, got ",
channels_));
} }
flags_.components = channels_; flags_.components = channels_;
@@ -114,8 +115,9 @@ class DecodeImageOp : public OpKernel {
if (format_ == kJpgFormat) { if (format_ == kJpgFormat) {
OP_REQUIRES_OK(context, context->GetAttr("ratio", &flags_.ratio)); OP_REQUIRES_OK(context, context->GetAttr("ratio", &flags_.ratio));
OP_REQUIRES(context, flags_.ratio == 1 || flags_.ratio == 2 || OP_REQUIRES(context,
flags_.ratio == 4 || flags_.ratio == 8, flags_.ratio == 1 || flags_.ratio == 2 || flags_.ratio == 4 ||
flags_.ratio == 8,
errors::InvalidArgument("ratio must be 1, 2, 4, or 8, got ", errors::InvalidArgument("ratio must be 1, 2, 4, or 8, got ",
flags_.ratio)); flags_.ratio));
OP_REQUIRES_OK(context, context->GetAttr("fancy_upscaling", OP_REQUIRES_OK(context, context->GetAttr("fancy_upscaling",
@@ -130,7 +132,8 @@ class DecodeImageOp : public OpKernel {
string dct_method; string dct_method;
OP_REQUIRES_OK(context, context->GetAttr("dct_method", &dct_method)); OP_REQUIRES_OK(context, context->GetAttr("dct_method", &dct_method));
OP_REQUIRES( OP_REQUIRES(
context, (dct_method.empty() || dct_method == "INTEGER_FAST" || context,
(dct_method.empty() || dct_method == "INTEGER_FAST" ||
dct_method == "INTEGER_ACCURATE"), dct_method == "INTEGER_ACCURATE"),
errors::InvalidArgument("dct_method must be one of " errors::InvalidArgument("dct_method must be one of "
"{'', 'INTEGER_FAST', 'INTEGER_ACCURATE'}")); "{'', 'INTEGER_FAST', 'INTEGER_ACCURATE'}"));
@@ -157,9 +160,9 @@ class DecodeImageOp : public OpKernel {
errors::InvalidArgument("Expected image (JPEG, PNG, or GIF), got ", errors::InvalidArgument("Expected image (JPEG, PNG, or GIF), got ",
FileFormatString(magic, input))); FileFormatString(magic, input)));
OP_REQUIRES(context, input.size() <= std::numeric_limits<int>::max(), OP_REQUIRES(context, input.size() <= std::numeric_limits<int>::max(),
errors::InvalidArgument(FileFormatString(magic, input), errors::InvalidArgument(
" contents are too large for int: ", FileFormatString(magic, input),
input.size())); " contents are too large for int: ", input.size()));
OP_REQUIRES(context, magic == kPngFormat || channel_bits_ == 8, OP_REQUIRES(context, magic == kPngFormat || channel_bits_ == 8,
errors::InvalidArgument(FileFormatString(magic, input), errors::InvalidArgument(FileFormatString(magic, input),
" does not support uint16 output")); " does not support uint16 output"));
@@ -212,7 +215,8 @@ class DecodeImageOp : public OpKernel {
input.data(), input.size(), flags, nullptr /* nwarn */, input.data(), input.size(), flags, nullptr /* nwarn */,
[=, &output](int width, int height, int channels) -> uint8* { [=, &output](int width, int height, int channels) -> uint8* {
Status status(context->allocate_output( Status status(context->allocate_output(
0, format_ == kGifFormat 0,
format_ == kGifFormat
? TensorShape({1, height, width, channels}) ? TensorShape({1, height, width, channels})
: TensorShape({height, width, channels}), : TensorShape({height, width, channels}),
&output)); &output));

@@ -120,8 +120,8 @@ bool CanUseDeepConv2D(int stride_rows, int stride_cols, int filter_rows,
VLOG(2) << "CanUseDeepConv2D" VLOG(2) << "CanUseDeepConv2D"
<< " deep_conv_cost: " << deep_conv_cost << " deep_conv_cost: " << deep_conv_cost
<< " direct_conv_cost: " << direct_conv_cost << " direct_conv_cost: " << direct_conv_cost << " deep_direct_ratio: "
<< " deep_direct_ratio: " << (static_cast<float>(deep_conv_cost) / << (static_cast<float>(deep_conv_cost) /
static_cast<float>(direct_conv_cost)) static_cast<float>(direct_conv_cost))
<< " use_deep_conv: " << (deep_conv_cost < direct_conv_cost); << " use_deep_conv: " << (deep_conv_cost < direct_conv_cost);
return deep_conv_cost < direct_conv_cost; return deep_conv_cost < direct_conv_cost;

@@ -308,10 +308,10 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
// in_depth for input and filter must match. // in_depth for input and filter must match.
const int64 in_depth = GetTensorDim(input, data_format_, 'C'); const int64 in_depth = GetTensorDim(input, data_format_, 'C');
OP_REQUIRES( OP_REQUIRES(context, in_depth == filter.dim_size(2),
context, in_depth == filter.dim_size(2), errors::InvalidArgument(
errors::InvalidArgument("input and filter must have the same depth: ", "input and filter must have the same depth: ", in_depth,
in_depth, " vs ", filter.dim_size(2))); " vs ", filter.dim_size(2)));
// The last dimension for filter is depth multiplier. // The last dimension for filter is depth multiplier.
const int32 depth_multiplier = filter.dim_size(3); const int32 depth_multiplier = filter.dim_size(3);
@@ -430,8 +430,9 @@ TF_CALL_double(REGISTER_CPU_KERNEL);
#endif #endif
#if GOOGLE_CUDA #if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER( REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative")
Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"), .Device(DEVICE_GPU)
.TypeConstraint<Eigen::half>("T"),
DepthwiseConv2dNativeOp<GPUDevice, Eigen::half>); DepthwiseConv2dNativeOp<GPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER( REGISTER_KERNEL_BUILDER(

@@ -17,12 +17,12 @@ limitations under the License.
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "external/cub_archive/cub/util_ptx.cuh"
#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/kernels/depthwise_conv_op.h" #include "tensorflow/core/kernels/depthwise_conv_op.h"
#include "tensorflow/core/platform/types.h" #include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h" #include "tensorflow/core/util/cuda_kernel_helper.h"
#include "tensorflow/core/util/tensor_format.h" #include "tensorflow/core/util/tensor_format.h"
#include "external/cub_archive/cub/util_ptx.cuh"
#if !defined(_MSC_VER) #if !defined(_MSC_VER)
#define UNROLL _Pragma("unroll") #define UNROLL _Pragma("unroll")

@@ -29,8 +29,8 @@ limitations under the License.
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/work_sharder.h" #include "tensorflow/core/util/work_sharder.h"
namespace tensorflow { namespace tensorflow {
@@ -47,8 +47,9 @@ class DiagOp : public OpKernel {
void Compute(OpKernelContext* context) override { void Compute(OpKernelContext* context) override {
const Tensor& diagonal = context->input(0); const Tensor& diagonal = context->input(0);
const int num_dims = diagonal.dims(); const int num_dims = diagonal.dims();
OP_REQUIRES(context, 0 != num_dims, errors::InvalidArgument( OP_REQUIRES(
"Input must be at least rank 1, got 0")); context, 0 != num_dims,
errors::InvalidArgument("Input must be at least rank 1, got 0"));
TensorShape out_shape; TensorShape out_shape;
for (int i = 0; i < num_dims; ++i) { for (int i = 0; i < num_dims; ++i) {
out_shape.AddDim(diagonal.dim_size(i)); out_shape.AddDim(diagonal.dim_size(i));
@@ -60,9 +61,8 @@ class DiagOp : public OpKernel {
OP_REQUIRES_OK(context, OP_REQUIRES_OK(context,
context->allocate_output(0, out_shape, &output_tensor)); context->allocate_output(0, out_shape, &output_tensor));
functor::DiagFunctor<Device, T> diagFunc; functor::DiagFunctor<Device, T> diagFunc;
Status s = diagFunc(context, Status s =
diagonal.NumElements(), diagFunc(context, diagonal.NumElements(), diagonal.flat<T>().data(),
diagonal.flat<T>().data(),
output_tensor->flat<T>().data()); output_tensor->flat<T>().data());
OP_REQUIRES_OK(context, s); OP_REQUIRES_OK(context, s);
} }
@@ -83,11 +83,11 @@ class DiagPartOp : public OpKernel {
even and positive, got shape ", even and positive, got shape ",
tensor.shape().DebugString())); tensor.shape().DebugString()));
for (int i = 0; i < out_dims; i++) { for (int i = 0; i < out_dims; i++) {
OP_REQUIRES(context, tensor.dim_size(i) == tensor.dim_size(i + out_dims), OP_REQUIRES(
errors::InvalidArgument( context, tensor.dim_size(i) == tensor.dim_size(i + out_dims),
"Invalid shape ", tensor.shape().DebugString(), errors::InvalidArgument("Invalid shape ",
": dimensions ", i, " and ", i + out_dims, " do not match.") tensor.shape().DebugString(), ": dimensions ",
); i, " and ", i + out_dims, " do not match."));
} }
TensorShape out_shape; TensorShape out_shape;
@@ -96,13 +96,10 @@ class DiagPartOp : public OpKernel {
} }
Tensor* output = nullptr; Tensor* output = nullptr;
OP_REQUIRES_OK(context, OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
context->allocate_output(0, out_shape, &output));
functor::DiagPartFunctor<Device, T> diagPartFunc; functor::DiagPartFunctor<Device, T> diagPartFunc;
Status s = diagPartFunc(context, Status s = diagPartFunc(context, out_shape.num_elements(),
out_shape.num_elements(), tensor.flat<T>().data(), output->flat<T>().data());
tensor.flat<T>().data(),
output->flat<T>().data());
OP_REQUIRES_OK(context, s); OP_REQUIRES_OK(context, s);
} }
}; };
@@ -129,9 +126,8 @@ class DiagPartOp : public OpKernel {
namespace functor { namespace functor {
template <typename T> template <typename T>
struct DiagFunctor<CPUDevice, T> { struct DiagFunctor<CPUDevice, T> {
EIGEN_ALWAYS_INLINE Status EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
operator() (OpKernelContext* context, const int64 size, const int64 size, const T* in, T* out) {
const T* in, T* out) {
// This subprocess is responsible for writing values in index range // This subprocess is responsible for writing values in index range
// [start*size, limit*size) // [start*size, limit*size)
auto subDiag = [in, out, size](int64 start, int64 limit) { auto subDiag = [in, out, size](int64 start, int64 limit) {
@@ -143,17 +139,16 @@ struct DiagFunctor<CPUDevice, T> {
// Here, 5 is a empirical factor of cost_per_unit. // Here, 5 is a empirical factor of cost_per_unit.
auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
Shard(worker_threads.num_threads, worker_threads.workers, size, Shard(worker_threads.num_threads, worker_threads.workers, size, 5 * size,
5 * size, subDiag); subDiag);
return Status::OK(); return Status::OK();
} }
}; };
template <typename T> template <typename T>
struct DiagPartFunctor<CPUDevice, T> { struct DiagPartFunctor<CPUDevice, T> {
EIGEN_ALWAYS_INLINE Status EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
operator() (OpKernelContext* context, const int64 size, const int64 size, const T* in, T* out) {
const T* in, T* out) {
// This subprocess is responsible for extracting values in index range // This subprocess is responsible for extracting values in index range
// [start, limit) // [start, limit)
auto subDiagPart = [in, out, size](int64 start, int64 limit) { auto subDiagPart = [in, out, size](int64 start, int64 limit) {
@@ -164,14 +159,13 @@ struct DiagPartFunctor<CPUDevice, T> {
// Here, 5 is a empirical factor of cost_per_unit. // Here, 5 is a empirical factor of cost_per_unit.
auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
Shard(worker_threads.num_threads, worker_threads.workers, size, Shard(worker_threads.num_threads, worker_threads.workers, size, 5,
5, subDiagPart); subDiagPart);
return Status::OK(); return Status::OK();
} }
}; };
} // namespace functor } // namespace functor
// Register the CPU kernels. // Register the CPU kernels.
#define REGISTER_DIAGOP(T) \ #define REGISTER_DIAGOP(T) \
REGISTER_KERNEL_BUILDER( \ REGISTER_KERNEL_BUILDER( \
@@ -250,6 +244,4 @@ TF_CALL_complex128(REGISTER_DIAGPARTOP_GPU);
#endif // GOOGLE_CUDA #endif // GOOGLE_CUDA
} // namespace tensorflow } // namespace tensorflow
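// --- Illustrative aside (not part of the diff above) ----------------------
// Each shard handed to Shard() by DiagFunctor<CPUDevice, T> writes the
// diagonal entries for its index range [start, limit): the output is a
// size x size matrix stored contiguously, so input element i lands at flat
// offset i * (size + 1), matching the `index % (1 + size) == 0` test in the
// GPU kernel below. A minimal sketch of that indexing follows (the zero-fill
// of off-diagonal entries and the Shard() call itself are omitted);
// FillDiagonalRange is an illustrative name.
#include <cstdint>
#include <vector>

template <typename T>
void FillDiagonalRange(const std::vector<T>& in, std::vector<T>* out,
                       int64_t size, int64_t start, int64_t limit) {
  for (int64_t i = start; i < limit; ++i) {
    (*out)[i * (size + 1)] = in[i];  // row i, column i of the flattened matrix
  }
}
// ---------------------------------------------------------------------------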

@@ -26,14 +26,14 @@ namespace functor {
template <typename Device, typename T> template <typename Device, typename T>
struct DiagFunctor { struct DiagFunctor {
Status operator() (OpKernelContext* context, const int64 size, Status operator()(OpKernelContext* context, const int64 size, const T* in,
const T* in, T* out); T* out);
}; };
template <typename Device, typename T> template <typename Device, typename T>
struct DiagPartFunctor { struct DiagPartFunctor {
Status operator() (OpKernelContext* context, const int64 size, Status operator()(OpKernelContext* context, const int64 size, const T* in,
const T* in, T* out); T* out);
}; };
} // namespace functor } // namespace functor

View File

@@ -19,8 +19,8 @@ limitations under the License.
 
 #include <complex>
 
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
 #include "tensorflow/core/kernels/diag_op.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
 namespace functor {
@@ -28,10 +28,8 @@ namespace functor {
 typedef Eigen::GpuDevice GPUDevice;
 
 template <typename T>
-__global__ void DiagCudaKernel(const int num_threads,
-                               const int64 size,
-                               const T* in,
-                               T* out) {
+__global__ void DiagCudaKernel(const int num_threads, const int64 size,
+                               const T* in, T* out) {
   CUDA_1D_KERNEL_LOOP(index, num_threads) {
     // Fill the diagonal elements or set to zero in other place.
     if (index % (1 + size) == 0) {
@@ -44,9 +42,8 @@ __global__ void DiagCudaKernel(const int num_threads,
 template <typename T>
 struct DiagFunctor<GPUDevice, T> {
-  EIGEN_ALWAYS_INLINE Status
-  operator() (OpKernelContext* context, const int64 size,
-              const T* in, T* out) {
+  EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+                                        const int64 size, const T* in, T* out) {
     // Empty tensor couldn't launch the kernel.
     if (size == 0) {
       return Status::OK();
@@ -56,25 +53,22 @@ struct DiagFunctor<GPUDevice, T> {
     // so this may overflow for `size*size` in extreme cases,
     // here is checking the multiplication overflow for integer.
     if (size && (int(size * size) / size) != size) {
-      return errors::Internal(
-          "DiagOp got input size too large.");
+      return errors::Internal("DiagOp got input size too large.");
     }
     int virtual_thread_count = int(size * size);
 
     // Launch the GPU kernel.
     const GPUDevice& device = context->eigen_device<GPUDevice>();
-    CudaLaunchConfig diag_config = GetCudaLaunchConfig(
-        virtual_thread_count, device);
-    DiagCudaKernel<<<diag_config.block_count,
-                     diag_config.thread_per_block,
-                     0, device.stream()>>>(
-        diag_config.virtual_thread_count, size, in, out);
+    CudaLaunchConfig diag_config =
+        GetCudaLaunchConfig(virtual_thread_count, device);
+    DiagCudaKernel<<<diag_config.block_count, diag_config.thread_per_block, 0,
+                     device.stream()>>>(diag_config.virtual_thread_count, size,
+                                        in, out);
 
     auto err = cudaGetLastError();
     if (err != cudaSuccess) {
-      return errors::Internal(
-          "Could not launch DiagOp kernel: ",
-          cudaGetErrorString(err), ".");
+      return errors::Internal(
+          "Could not launch DiagOp kernel: ", cudaGetErrorString(err), ".");
     }
     return Status::OK();
   }
@@ -87,12 +81,9 @@ template struct DiagFunctor<GPUDevice, int64>;
 template struct DiagFunctor<GPUDevice, complex64>;
 template struct DiagFunctor<GPUDevice, complex128>;
 
 template <typename T>
-__global__ void DiagPartCudaKernel(const int num_threads,
-                                   const int64 size,
-                                   const T* in,
-                                   T* out) {
+__global__ void DiagPartCudaKernel(const int num_threads, const int64 size,
+                                   const T* in, T* out) {
   CUDA_1D_KERNEL_LOOP(index, num_threads) {
     out[index] = in[(1 + size) * index];
   }
@@ -100,9 +91,8 @@ __global__ void DiagPartCudaKernel(const int num_threads,
 template <typename T>
 struct DiagPartFunctor<GPUDevice, T> {
-  EIGEN_ALWAYS_INLINE Status
-  operator() (OpKernelContext* context, const int64 size,
-              const T* in, T* out) {
+  EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+                                        const int64 size, const T* in, T* out) {
     // Empty tensor couldn't launch the kernel.
     if (size == 0) {
       return Status::OK();
@@ -111,16 +101,14 @@ struct DiagPartFunctor<GPUDevice, T> {
     // Extract the diagonal elements.
     CudaLaunchConfig diag_config = GetCudaLaunchConfig(size, device);
-    DiagPartCudaKernel<<<diag_config.block_count,
-                         diag_config.thread_per_block,
-                         0, device.stream()>>>(
-        diag_config.virtual_thread_count, size, in, out);
+    DiagPartCudaKernel<<<diag_config.block_count, diag_config.thread_per_block,
+                         0, device.stream()>>>(diag_config.virtual_thread_count,
+                                               size, in, out);
 
     auto err = cudaGetLastError();
     if (err != cudaSuccess) {
-      return errors::Internal(
-          "Could not launch DiagPartOp kernel: ",
-          cudaGetErrorString(err), ".");
+      return errors::Internal(
+          "Could not launch DiagPartOp kernel: ", cudaGetErrorString(err), ".");
     }
     return Status::OK();
   }
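The GPU path computes a CudaLaunchConfig and iterates with CUDA_1D_KERNEL_LOOP, which expands to a grid-stride loop. Below is a hedged standalone CUDA sketch of that pattern for the diagonal extraction (plain CUDA, no TensorFlow helpers; the fixed <<<32, 256>>> launch shape merely stands in for GetCudaLaunchConfig's heuristics):

// Standalone CUDA sketch (hypothetical names): the grid-stride pattern that
// CUDA_1D_KERNEL_LOOP expands to, extracting the diagonal with
// out[i] = in[(1 + size) * i] as DiagPartCudaKernel does above.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void DiagPartSketch(int num_threads, long long size,
                               const float* in, float* out) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num_threads;
       i += gridDim.x * blockDim.x) {
    out[i] = in[(1 + size) * i];
  }
}

int main() {
  const long long size = 256;
  float *in = nullptr, *out = nullptr;
  cudaMalloc(&in, size * size * sizeof(float));
  cudaMalloc(&out, size * sizeof(float));
  // A fixed launch shape stands in for GetCudaLaunchConfig's heuristics.
  DiagPartSketch<<<32, 256>>>(static_cast<int>(size), size, in, out);
  if (cudaGetLastError() != cudaSuccess) printf("launch failed\n");
  cudaFree(in);
  cudaFree(out);
  return 0;
}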

View File

@@ -51,4 +51,3 @@ BM_Diag(128);
 BM_Diag(512);
 
 }  // end namespace tensorflow

View File

@ -91,10 +91,10 @@ void ParseSizes(OpKernelContext* context, const std::vector<int32>& strides,
filter.shape().DebugString())); filter.shape().DebugString()));
const int filter_rows = filter.dim_size(0); const int filter_rows = filter.dim_size(0);
const int filter_cols = filter.dim_size(1); const int filter_cols = filter.dim_size(1);
OP_REQUIRES( OP_REQUIRES(context, depth == filter.dim_size(2),
context, depth == filter.dim_size(2), errors::InvalidArgument(
errors::InvalidArgument("input and filter must have the same depth: ", "input and filter must have the same depth: ", depth, " vs ",
depth, " vs ", filter.dim_size(2))); filter.dim_size(2)));
// Effective filter size, after introducing rate - 1 zeros between each // Effective filter size, after introducing rate - 1 zeros between each
// non-zero filter element. // non-zero filter element.
@ -234,7 +234,8 @@ class DilationBackpropInputOp : public OpKernel {
// [ batch, out_rows, out_cols, depth ] // [ batch, out_rows, out_cols, depth ]
const int batch = input.dim_size(0); const int batch = input.dim_size(0);
const int depth = input.dim_size(3); const int depth = input.dim_size(3);
OP_REQUIRES(context, batch == out_backprop.dim_size(0) && OP_REQUIRES(context,
batch == out_backprop.dim_size(0) &&
out_rows == out_backprop.dim_size(1) && out_rows == out_backprop.dim_size(1) &&
out_cols == out_backprop.dim_size(2) && out_cols == out_backprop.dim_size(2) &&
depth == out_backprop.dim_size(3), depth == out_backprop.dim_size(3),
@ -353,7 +354,8 @@ class DilationBackpropFilterOp : public OpKernel {
// [ batch, out_rows, out_cols, depth ] // [ batch, out_rows, out_cols, depth ]
const int batch = input.dim_size(0); const int batch = input.dim_size(0);
const int depth = input.dim_size(3); const int depth = input.dim_size(3);
OP_REQUIRES(context, batch == out_backprop.dim_size(0) && OP_REQUIRES(context,
batch == out_backprop.dim_size(0) &&
out_rows == out_backprop.dim_size(1) && out_rows == out_backprop.dim_size(1) &&
out_cols == out_backprop.dim_size(2) && out_cols == out_backprop.dim_size(2) &&
depth == out_backprop.dim_size(3), depth == out_backprop.dim_size(3),

View File

@ -61,9 +61,8 @@ __global__ void DilationKernel(const int32 nthreads, const T* input_ptr,
const int w_in = w_beg + w * rate_cols; const int w_in = w_beg + w * rate_cols;
if (w_in >= 0 && w_in < input_cols) { if (w_in >= 0 && w_in < input_cols) {
const T val = const T val =
input_ptr[d + input_ptr[d + depth * (w_in +
depth * input_cols * (h_in + input_rows * b))] +
(w_in + input_cols * (h_in + input_rows * b))] +
filter_ptr[d + depth * (w + filter_cols * h)]; filter_ptr[d + depth * (w + filter_cols * h)];
if (val > cur_val) { if (val > cur_val) {
cur_val = val; cur_val = val;
@ -106,9 +105,8 @@ __global__ void DilationBackpropInputKernel(
const int w_in = w_beg + w * rate_cols; const int w_in = w_beg + w * rate_cols;
if (w_in >= 0 && w_in < input_cols) { if (w_in >= 0 && w_in < input_cols) {
const T val = const T val =
input_ptr[d + input_ptr[d + depth * (w_in +
depth * input_cols * (h_in + input_rows * b))] +
(w_in + input_cols * (h_in + input_rows * b))] +
filter_ptr[d + depth * (w + filter_cols * h)]; filter_ptr[d + depth * (w + filter_cols * h)];
if (val > cur_val) { if (val > cur_val) {
cur_val = val; cur_val = val;
@ -156,9 +154,8 @@ __global__ void DilationBackpropFilterKernel(
const int w_in = w_beg + w * rate_cols; const int w_in = w_beg + w * rate_cols;
if (w_in >= 0 && w_in < input_cols) { if (w_in >= 0 && w_in < input_cols) {
const T val = const T val =
input_ptr[d + input_ptr[d + depth * (w_in +
depth * input_cols * (h_in + input_rows * b))] +
(w_in + input_cols * (h_in + input_rows * b))] +
filter_ptr[d + depth * (w + filter_cols * h)]; filter_ptr[d + depth * (w + filter_cols * h)];
if (val > cur_val) { if (val > cur_val) {
cur_val = val; cur_val = val;

View File

@@ -29,8 +29,7 @@ template <class T>
 class DrawBoundingBoxesOp : public OpKernel {
  public:
   explicit DrawBoundingBoxesOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-  }
+      : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
     const Tensor& images = context->input(0);
@@ -94,35 +93,28 @@ class DrawBoundingBoxesOp : public OpKernel {
       int64 color_index = bb % color_table_length;
       const int64 min_box_row =
           static_cast<float>(tboxes(b, bb, 0)) * (height - 1);
-      const int64 min_box_row_clamp =
-          std::max<int64>(min_box_row, 0);
+      const int64 min_box_row_clamp = std::max<int64>(min_box_row, 0);
       const int64 max_box_row =
           static_cast<float>(tboxes(b, bb, 2)) * (height - 1);
       const int64 max_box_row_clamp =
           std::min<int64>(max_box_row, height - 1);
       const int64 min_box_col =
           static_cast<float>(tboxes(b, bb, 1)) * (width - 1);
-      const int64 min_box_col_clamp =
-          std::max<int64>(min_box_col, 0);
+      const int64 min_box_col_clamp = std::max<int64>(min_box_col, 0);
       const int64 max_box_col =
           static_cast<float>(tboxes(b, bb, 3)) * (width - 1);
-      const int64 max_box_col_clamp =
-          std::min<int64>(max_box_col, width - 1);
+      const int64 max_box_col_clamp = std::min<int64>(max_box_col, width - 1);
 
       if (min_box_row > max_box_row || min_box_col > max_box_col) {
-        LOG(WARNING) << "Bounding box (" << min_box_row
-                     << "," << min_box_col
-                     << "," << max_box_row
-                     << "," << max_box_col
+        LOG(WARNING) << "Bounding box (" << min_box_row << "," << min_box_col
+                     << "," << max_box_row << "," << max_box_col
                      << ") is inverted and will not be drawn.";
         continue;
       }
-      if (min_box_row >= height || max_box_row < 0 ||
-          min_box_col >= width || max_box_col < 0) {
-        LOG(WARNING) << "Bounding box (" << min_box_row
-                     << "," << min_box_col
-                     << "," << max_box_row
-                     << "," << max_box_col
+      if (min_box_row >= height || max_box_row < 0 || min_box_col >= width ||
+          max_box_col < 0) {
+        LOG(WARNING) << "Bounding box (" << min_box_row << "," << min_box_col
+                     << "," << max_box_row << "," << max_box_col
                      << ") is completely outside the image"
                      << " and will not be drawn.";
         continue;
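The coordinate math above scales each normalized box corner by (dim - 1) and clamps it to the image, so inverted or fully-outside boxes can be detected and skipped. A standalone sketch of the row computation, with hypothetical names, assuming boxes arrive as [y_min, x_min, y_max, x_max] in [0, 1]:

// Standalone sketch (hypothetical names): scale normalized row coordinates to
// pixel rows by (height - 1) and clamp them to the image bounds.
#include <algorithm>
#include <cstdint>

struct BoxRows {
  int64_t min_clamped;
  int64_t max_clamped;
};

BoxRows ScaleAndClampRows(float y_min, float y_max, int64_t height) {
  const int64_t min_box_row = static_cast<int64_t>(y_min * (height - 1));
  const int64_t max_box_row = static_cast<int64_t>(y_max * (height - 1));
  return {std::max<int64_t>(min_box_row, 0),
          std::min<int64_t>(max_box_row, height - 1)};
}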

View File

@ -103,7 +103,8 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared {
// Walk through data and copy the data to the appropriate output tensor // Walk through data and copy the data to the appropriate output tensor
const auto data_flat = data->flat<T>(); const auto data_flat = data->flat<T>();
std::vector<Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>, std::vector<Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>,
Eigen::Aligned> > out_vec; Eigen::Aligned> >
out_vec;
out_vec.reserve(num_partitions_); out_vec.reserve(num_partitions_);
for (int p = 0; p < num_partitions_; p++) { for (int p = 0; p < num_partitions_; p++) {
out_vec.push_back(outputs[p]->vec<T>()); out_vec.push_back(outputs[p]->vec<T>());
@ -124,7 +125,8 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared {
} else { } else {
// If data has extra dimensions, use Eigen slices // If data has extra dimensions, use Eigen slices
std::vector<Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, std::vector<Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>,
Eigen::Aligned> > out_flat; Eigen::Aligned> >
out_flat;
out_flat.reserve(num_partitions_); out_flat.reserve(num_partitions_);
for (int p = 0; p < num_partitions_; p++) { for (int p = 0; p < num_partitions_; p++) {
out_flat.push_back(outputs[p]->flat_outer_dims<T>()); out_flat.push_back(outputs[p]->flat_outer_dims<T>());

View File

@ -79,8 +79,8 @@ template <typename T>
void RangeInit(const GPUDevice& d, const T start, const T delta, void RangeInit(const GPUDevice& d, const T start, const T delta,
const int32 size, typename TTypes<T>::Flat out) { const int32 size, typename TTypes<T>::Flat out) {
CudaLaunchConfig config = GetCudaLaunchConfig(size, d); CudaLaunchConfig config = GetCudaLaunchConfig(size, d);
RangeInitKernel< RangeInitKernel<T>
T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>( <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
start, delta, size, out.data()); start, delta, size, out.data());
} }
@ -103,9 +103,8 @@ void CallGatherKernel(const GPUDevice& d, const T* params, const int32* indices,
T* out, int64 gather_dim_size, int64 indices_size, T* out, int64 gather_dim_size, int64 indices_size,
int64 slice_size, int64 out_size) { int64 slice_size, int64 out_size) {
CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d); CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d);
GatherOpKernel< GatherOpKernel<T, int32, true>
T, int32, <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
true><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
params, indices, out, gather_dim_size, indices_size, slice_size, params, indices, out, gather_dim_size, indices_size, slice_size,
out_size); out_size);
} }
@ -231,10 +230,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
OP_REQUIRES_ASYNC( OP_REQUIRES_ASYNC(
c, TensorShapeUtils::StartsWith(data.shape(), partitions.shape()), c, TensorShapeUtils::StartsWith(data.shape(), partitions.shape()),
errors::InvalidArgument("data.shape must start with partitions.shape, ", errors::InvalidArgument(
"data.shape must start with partitions.shape, ",
"got data.shape = ", data.shape().DebugString(), "got data.shape = ", data.shape().DebugString(),
", partitions.shape = ", ", partitions.shape = ", partitions.shape().DebugString()),
partitions.shape().DebugString()),
done); done);
Tensor partition_count; Tensor partition_count;
@ -245,7 +244,8 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
AllocatorAttributes alloc_attr; AllocatorAttributes alloc_attr;
alloc_attr.set_on_host(true); alloc_attr.set_on_host(true);
OP_REQUIRES_OK_ASYNC( OP_REQUIRES_OK_ASYNC(
c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), c,
c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
&partition_count, alloc_attr), &partition_count, alloc_attr),
done); done);
auto e_part_count = partition_count.flat<int32>(); auto e_part_count = partition_count.flat<int32>();
@ -259,7 +259,8 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
// Prepare for counting. // Prepare for counting.
OP_REQUIRES_OK_ASYNC( OP_REQUIRES_OK_ASYNC(
c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), c,
c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
&partition_count), &partition_count),
done); done);
Tensor indices_out; Tensor indices_out;
@ -280,7 +281,8 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
alloc_attr.set_on_host(true); alloc_attr.set_on_host(true);
alloc_attr.set_gpu_compatible(true); alloc_attr.set_gpu_compatible(true);
OP_REQUIRES_OK_ASYNC( OP_REQUIRES_OK_ASYNC(
c, c->allocate_temp(partition_count.dtype(), partition_count.shape(), c,
c->allocate_temp(partition_count.dtype(), partition_count.shape(),
&cpu_tensor, alloc_attr), &cpu_tensor, alloc_attr),
done); done);
perftools::gputools::DeviceMemoryBase wrapped( perftools::gputools::DeviceMemoryBase wrapped(
@ -340,8 +342,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
indices_in_ptr, indices_out_ptr, N, 0, sizeof(int32) * 8, cu_stream); indices_in_ptr, indices_out_ptr, N, 0, sizeof(int32) * 8, cu_stream);
// Allocate temporary storage. // Allocate temporary storage.
OP_REQUIRES_OK_ASYNC( OP_REQUIRES_OK_ASYNC(
c, c->allocate_temp( c,
DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}), c->allocate_temp(DT_INT8,
TensorShape({static_cast<int64>(temp_storage_bytes)}),
&cub_temp_storage), &cub_temp_storage),
done); done);
// Radix-sort the partition information. // Radix-sort the partition information.
@ -376,7 +379,8 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
zero_functor(device, partition_count->flat<int32>()); zero_functor(device, partition_count->flat<int32>());
// Allocate memory for aggregates_out. // Allocate memory for aggregates_out.
OP_REQUIRES_OK_ASYNC( OP_REQUIRES_OK_ASYNC(
c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), c,
c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
&aggregates_out), &aggregates_out),
done); done);
// Obtain the pointers to inner buffers. // Obtain the pointers to inner buffers.
@ -408,8 +412,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
num_runs_ptr, reduction_op, N, cu_stream); num_runs_ptr, reduction_op, N, cu_stream);
// Allocate temporary storage. // Allocate temporary storage.
OP_REQUIRES_OK_ASYNC( OP_REQUIRES_OK_ASYNC(
c, c->allocate_temp( c,
DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}), c->allocate_temp(DT_INT8,
TensorShape({static_cast<int64>(temp_storage_bytes)}),
&cub_temp_storage), &cub_temp_storage),
done); done);
// Run reduce-by-key. The effect is that we count how many times // Run reduce-by-key. The effect is that we count how many times
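The allocate_temp calls above follow CUB's two-phase convention: a first call with a null workspace only reports temp_storage_bytes, the caller allocates that many bytes (here as a DT_INT8 scratch tensor), and an identical second call does the real work. A hedged standalone sketch of the same pattern with cub::DeviceRadixSort::SortPairs, using plain cudaMalloc instead of allocate_temp (all pointers are assumed to be device memory):

// Standalone sketch (hypothetical names): CUB's two-phase size-query /
// execute pattern for a key-value radix sort on the GPU.
#include <cub/cub.cuh>
#include <cuda_runtime.h>

void SortPairsSketch(const int* keys_in, int* keys_out, const int* vals_in,
                     int* vals_out, int n, cudaStream_t stream) {
  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  // Phase 1: size query only; no sorting happens, temp_storage_bytes is set.
  cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, keys_in,
                                  keys_out, vals_in, vals_out, n, 0,
                                  sizeof(int) * 8, stream);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  // Phase 2: the real sort, reusing the exact same argument list.
  cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, keys_in,
                                  keys_out, vals_in, vals_out, n, 0,
                                  sizeof(int) * 8, stream);
  cudaFree(d_temp_storage);
}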

View File

@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenBackwardSpatialConvolutionsTest, SigmoidFastDerivative) {
   const ptrdiff_t depth = 3;

View File

@@ -25,31 +25,43 @@ namespace Eigen {
  *
  * \brief Extract glimpses from an input tensor.
  *
- * The input parameter is expected to be a col-major tensor with a rank of 4 (depth, x, y, and batch).
- * The width and height parameters specify the extension of the returned glimpses.
- * The offsets parameter specifies the x, y locations of the center of the glimpses relative to the center of the input image. The vector is expected to contain one IndexPair for each image in the batch dimension.
- * The normalized boolean indicates if incoming coordinates are normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each height and width dimension.
- * The centered boolean indicates if incoming coordinates are centered relative to the image, in which case -1.0 and 1.0 correspond to minimum and maximum of each dimension while 0.0 corresponds to the center.
+ * The input parameter is expected to be a col-major tensor with a rank of 4
+ * (depth, x, y, and batch). The width and height parameters specify the
+ * extension of the returned glimpses. The offsets parameter specifies the x, y
+ * locations of the center of the glimpses relative to the center of the input
+ * image. The vector is expected to contain one IndexPair for each image in the
+ * batch dimension. The normalized boolean indicates if incoming coordinates are
+ * normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each
+ * height and width dimension. The centered boolean indicates if incoming
+ * coordinates are centered relative to the image, in which case -1.0 and 1.0
+ * correspond to minimum and maximum of each dimension while 0.0 corresponds to
+ * the center.
  *
- * The result can be assigned to a tensor of rank equal to that of the input. The result will be laid out in col-major order (depth, x, y, batch).
- * The dimensions of the result will be equal to the dimensions of the input except for width and height which will be equal to the requested glimpse size.
+ * The result can be assigned to a tensor of rank equal to that of the input.
+ * The result will be laid out in col-major order (depth, x, y, batch). The
+ * dimensions of the result will be equal to the dimensions of the input except
+ * for width and height which will be equal to the requested glimpse size.
  */
 namespace {
 template <typename Index>
 struct GlimpseExtractionOp {
   GlimpseExtractionOp(const Index width, const Index height,
                       const std::vector<IndexPair<float> >& offsets,
-                      const bool normalized,
-                      const bool centered,
-                      const bool uniform_noise) :
-    width_(width), height_(height), offsets_(offsets),
-    normalized_(normalized), centered_(centered), uniform_noise_(uniform_noise) { }
+                      const bool normalized, const bool centered,
+                      const bool uniform_noise)
+      : width_(width),
+        height_(height),
+        offsets_(offsets),
+        normalized_(normalized),
+        centered_(centered),
+        uniform_noise_(uniform_noise) {}
 
   template <typename Input>
   DSizes<Index, 4> dimensions(const Input& input) const {
     typedef typename internal::traits<Input>::Index IndexType;
     typedef TensorRef<Tensor<typename internal::traits<Input>::Scalar, 4,
-                             internal::traits<Input>::Layout, IndexType> > Ref;
+                             internal::traits<Input>::Layout, IndexType> >
+        Ref;
     Ref in(input);
 
     DSizes<Index, 4> dims = in.dimensions();
@@ -62,12 +74,12 @@ struct GlimpseExtractionOp {
   }
 
   template <typename Input, typename Output, typename Device>
-  EIGEN_DEVICE_FUNC
-  void eval(const Input& input, Output& output, const Device& device) const
-  {
+  EIGEN_DEVICE_FUNC void eval(const Input& input, Output& output,
+                              const Device& device) const {
     typedef typename internal::traits<Input>::Index IndexType;
     typedef TensorRef<Tensor<typename internal::traits<Input>::Scalar, 4,
-                             internal::traits<Input>::Layout, IndexType> > Ref;
+                             internal::traits<Input>::Layout, IndexType> >
+        Ref;
     Ref in(input);
     const Index num_channels = in.dimension(0);
     const Index input_width = in.dimension(1);
@@ -215,21 +227,22 @@ struct GlimpseExtractionOp {
   const bool centered_;
   const bool uniform_noise_;
 };
-}
+}  // namespace
 
 template <typename Input>
-EIGEN_ALWAYS_INLINE
-static const TensorCustomUnaryOp<const GlimpseExtractionOp<typename internal::traits<Input>::Index>, const Input>
+EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp<
+    const GlimpseExtractionOp<typename internal::traits<Input>::Index>,
+    const Input>
 ExtractGlimpses(const Input& input,
                 const typename internal::traits<Input>::Index width,
                 const typename internal::traits<Input>::Index height,
                 const std::vector<IndexPair<float> >& offsets,
                 const bool normalized = true, const bool centered = true,
-                const bool uniform_noise = true)
-{
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
+                const bool uniform_noise = true) {
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
 
   typedef typename internal::traits<Input>::Index Index;
   const GlimpseExtractionOp<Index> op(width, height, offsets, normalized,
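A hedged usage sketch of the ExtractGlimpses helper documented above. The include path and the concrete sizes are assumptions; the argument order follows the signature shown in this diff:

// Hedged usage sketch; the header path below is an assumption, since the file
// name is not shown in this diff.
#include <vector>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/kernels/eigen_attention.h"

void GlimpseExample() {
  // Col-major input with layout (depth, width, height, batch).
  Eigen::Tensor<float, 4> input(3, 64, 64, 2);
  input.setRandom();
  // One (x, y) offset per batch image; centered coordinates live in [-1, 1].
  std::vector<Eigen::IndexPair<float>> offsets = {{0.0f, 0.0f}, {0.5f, -0.5f}};
  // The result keeps depth and batch but uses the requested 16 x 16 extent.
  Eigen::Tensor<float, 4> glimpses =
      Eigen::ExtractGlimpses(input, 16, 16, offsets,
                             /*normalized=*/true, /*centered=*/true,
                             /*uniform_noise=*/true);
}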

View File

@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenAttentionTest, Simple) {
   const ptrdiff_t depth = 3;

View File

@@ -25,7 +25,7 @@ void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
 static int ceil_div(int a, int b) { return (a + b - 1) / b; }
-}
+}  // namespace
 
 TEST(EigenBackwardSpatialConvolutionsTest,
      test_simple_spatial_convolution_backward_input_valid) {

View File

@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenPoolingTest, Simple) {
   const int depth = 10;

View File

@@ -25,9 +25,11 @@ namespace Eigen {
  *
  * \brief Applies a softmax
  *
- * The input parameter is expected to be a col-major tensor with a rank of 2 (depth and other).
+ * The input parameter is expected to be a col-major tensor with a rank of 2
+ * (depth and other).
  *
- * The result can be assigned to a tensor of rank and dimensions equal to that of the input. The result will be laid out in col-major order.
+ * The result can be assigned to a tensor of rank and dimensions equal to that
+ * of the input. The result will be laid out in col-major order.
  *
  */
@@ -41,8 +43,7 @@ struct SoftmaxOp {
   }
 
   template <typename Input, typename Output, typename Device>
-  void eval(const Input& input, Output& output, const Device& device) const
-  {
+  void eval(const Input& input, Output& output, const Device& device) const {
 #if !defined(EIGEN_HAS_INDEX_LIST)
     // nvcc doesn't support cxx11
     Eigen::array<typename internal::traits<Input>::Index, 1> depth_dim;
@@ -59,27 +60,35 @@ struct SoftmaxOp {
     Eigen::IndexList<Eigen::type2index<0> > depth_dim;
     Eigen::IndexList<int, Eigen::type2index<1> > bcast;
     bcast.set(0, dimensions(input)[0]);
-    Eigen::IndexList<Eigen::type2index<1>, typename internal::traits<Input>::Index> dims2d;
+    Eigen::IndexList<Eigen::type2index<1>,
+                     typename internal::traits<Input>::Index>
+        dims2d;
     dims2d.set(1, dimensions(input)[1]);
 #endif
 
-    output.device(device) = ((input - input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) * beta_).exp();
-    output.device(device) = output / (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
+    output.device(device) =
+        ((input -
+          input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) *
+         beta_)
+            .exp();
+    output.device(device) =
+        output /
+        (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
   }
 
  private:
   const float beta_;
 };
-}
+}  // namespace
 
 template <typename Input>
-EIGEN_ALWAYS_INLINE
-static const TensorCustomUnaryOp<const SoftmaxOp, const Input>
-SoftMax(const Input& input, const float beta)
-{
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
+EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp<const SoftmaxOp,
+                                                     const Input>
+SoftMax(const Input& input, const float beta) {
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 2,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
 
   const SoftmaxOp op(beta);
   return input.customOp(op);
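A hedged usage sketch of the SoftMax helper above. The include path is an assumption; beta = 1.0f gives the plain softmax over the depth dimension described in the comment:

// Hedged usage sketch; the header path below is an assumption, since the file
// name is not shown in this diff.
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/kernels/eigen_softmax.h"

void SoftmaxExample() {
  // Col-major rank-2 input with layout (depth, other), as the comment requires.
  Eigen::Tensor<float, 2> logits(10, 32);
  logits.setRandom();
  // beta scales the logits before exponentiation; 1.0f is the plain softmax.
  Eigen::Tensor<float, 2> probs = Eigen::SoftMax(logits, 1.0f);
}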

View File

@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenSoftmaxTest, Simple) {
   const int depth = 1024;

View File

@ -80,8 +80,9 @@ class EncodeJpegOp : public OpKernel {
errors::InvalidArgument("image must be 3-dimensional", errors::InvalidArgument("image must be 3-dimensional",
image.shape().DebugString())); image.shape().DebugString()));
OP_REQUIRES(context, FastBoundsCheck(image.NumElements(), OP_REQUIRES(
std::numeric_limits<int32>::max()), context,
FastBoundsCheck(image.NumElements(), std::numeric_limits<int32>::max()),
errors::InvalidArgument( errors::InvalidArgument(
"Cannot encode images with >= max int32 elements")); "Cannot encode images with >= max int32 elements"));
@ -100,8 +101,9 @@ class EncodeJpegOp : public OpKernel {
} else if (channels == 3) { } else if (channels == 3) {
adjusted_flags.format = jpeg::FORMAT_RGB; adjusted_flags.format = jpeg::FORMAT_RGB;
} else { } else {
OP_REQUIRES(context, false, errors::InvalidArgument( OP_REQUIRES(
"image must have 1 or 3 channels, got ", context, false,
errors::InvalidArgument("image must have 1 or 3 channels, got ",
image.shape().DebugString())); image.shape().DebugString()));
} }
} else { } else {

View File

@ -346,7 +346,8 @@ class SingleSequenceExampleParserOp : public OpKernel {
feature_list_sparse_keys[di].scalar<string>()(); feature_list_sparse_keys[di].scalar<string>()();
} }
OP_REQUIRES( OP_REQUIRES(
ctx, TensorShapeUtils::IsVector( ctx,
TensorShapeUtils::IsVector(
feature_list_dense_missing_assumed_empty->shape()), feature_list_dense_missing_assumed_empty->shape()),
errors::InvalidArgument( errors::InvalidArgument(
"Expected feature_list_dense_missing_assumed_empty ", "Expected feature_list_dense_missing_assumed_empty ",
@ -386,12 +387,12 @@ class SingleSequenceExampleParserOp : public OpKernel {
required[d] = (def_value.NumElements() == 0); // No default provided. required[d] = (def_value.NumElements() == 0); // No default provided.
if (def_value.NumElements() > 0) { if (def_value.NumElements() > 0) {
OP_REQUIRES( OP_REQUIRES(ctx, def_value.shape() == attrs_.context_dense_shapes[d],
ctx, def_value.shape() == attrs_.context_dense_shapes[d],
errors::InvalidArgument( errors::InvalidArgument(
"def_value[", d, "].shape() == ", "def_value[", d,
def_value.shape().DebugString(), " != context_dense_shapes_[", "].shape() == ", def_value.shape().DebugString(),
d, "] == ", attrs_.context_dense_shapes[d].DebugString())); " != context_dense_shapes_[", d,
"] == ", attrs_.context_dense_shapes[d].DebugString()));
OP_REQUIRES( OP_REQUIRES(
ctx, def_value.dtype() == attrs_.context_dense_types[d], ctx, def_value.dtype() == attrs_.context_dense_types[d],
errors::InvalidArgument( errors::InvalidArgument(
@ -576,12 +577,12 @@ class SingleSequenceExampleParserOp : public OpKernel {
const Feature& f = fl.feature(t); const Feature& f = fl.feature(t);
bool types_match; bool types_match;
OP_REQUIRES_OK(ctx, CheckTypesMatch(f, dtype, &types_match)); OP_REQUIRES_OK(ctx, CheckTypesMatch(f, dtype, &types_match));
OP_REQUIRES( OP_REQUIRES(ctx, types_match,
ctx, types_match,
errors::InvalidArgument( errors::InvalidArgument(
"Name: ", name, ", Feature list: ", key, ", Index: ", t, "Name: ", name, ", Feature list: ", key, ", Index: ", t,
". Data types don't match. ", "Expected type: ", ". Data types don't match. ",
DataTypeString(dtype), " Feature is: ", ProtoDebugString(f))); "Expected type: ", DataTypeString(dtype),
" Feature is: ", ProtoDebugString(f)));
OP_REQUIRES_OK(ctx, FeatureDenseCopy(t, name, key, dtype, shape, f, OP_REQUIRES_OK(ctx, FeatureDenseCopy(t, name, key, dtype, shape, f,
feature_list_dense_values[d])); feature_list_dense_values[d]));
} }

View File

@@ -122,13 +122,9 @@ static string D(const char* s) {
   return ret;
 }
 
-REGISTER_KERNEL_BUILDER(Name("Fact")
-                            .Device(DEVICE_CPU)
-                            .Label(D("Yoxmos").c_str()),
-                        FactOpKernel2);
-REGISTER_KERNEL_BUILDER(Name("Fact")
-                            .Device(DEVICE_CPU)
-                            .Label(D("yoxmos").c_str()),
-                        FactOpKernel2);
+REGISTER_KERNEL_BUILDER(
+    Name("Fact").Device(DEVICE_CPU).Label(D("Yoxmos").c_str()), FactOpKernel2);
+REGISTER_KERNEL_BUILDER(
+    Name("Fact").Device(DEVICE_CPU).Label(D("yoxmos").c_str()), FactOpKernel2);
 
 }  // namespace tensorflow

View File

@ -378,8 +378,7 @@ TEST_F(QuantOpsTest, WithArgsGradient_RegularRange) {
Tensor* output = GetOutput(0); Tensor* output = GetOutput(0);
auto input_flat = GetInput(0).flat<float>(); auto input_flat = GetInput(0).flat<float>();
Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3})); Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
FillValues<float>(&expected, FillValues<float>(&expected, {0.0f, input_flat(1), input_flat(2),
{0.0f, input_flat(1), input_flat(2),
input_flat(3), input_flat(4), 0.0f}); input_flat(3), input_flat(4), 0.0f});
ExpectClose(expected, *output); ExpectClose(expected, *output);
} }
@ -2167,21 +2166,19 @@ TEST_F(QuantOpsTest,
Tensor* output_bprop_wrt_input = GetOutput(0); Tensor* output_bprop_wrt_input = GetOutput(0);
Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3})); Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
auto grad_flat = GetInput(0).flat<float>(); auto grad_flat = GetInput(0).flat<float>();
FillValues<float>(&expected_bprop_wrt_input, FillValues<float>(
{0.0f, grad_flat(1), grad_flat(2), &expected_bprop_wrt_input,
grad_flat(3), grad_flat(4), 0.0f}); {0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f});
ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input); ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
Tensor* output_bprop_wrt_min = GetOutput(1); Tensor* output_bprop_wrt_min = GetOutput(1);
Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3})); Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3}));
FillValues<float>(&expected_bprop_wrt_min, FillValues<float>(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f});
{grad_flat(0), 0.0f, 0.0f});
ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min); ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
Tensor* output_bprop_wrt_max = GetOutput(2); Tensor* output_bprop_wrt_max = GetOutput(2);
Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3})); Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3}));
FillValues<float>(&expected_bprop_wrt_max, FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)});
{0.0f, 0.0f, grad_flat(5)});
ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max); ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
} }
@ -2215,21 +2212,19 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedUp_4Bits_NarrowRange) {
Tensor* output_bprop_wrt_input = GetOutput(0); Tensor* output_bprop_wrt_input = GetOutput(0);
Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3})); Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
auto grad_flat = GetInput(0).flat<float>(); auto grad_flat = GetInput(0).flat<float>();
FillValues<float>(&expected_bprop_wrt_input, FillValues<float>(
{0.0f, grad_flat(1), grad_flat(2), &expected_bprop_wrt_input,
grad_flat(3), grad_flat(4), 0.0f}); {0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f});
ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input); ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
Tensor* output_bprop_wrt_min = GetOutput(1); Tensor* output_bprop_wrt_min = GetOutput(1);
Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3})); Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3}));
FillValues<float>(&expected_bprop_wrt_min, FillValues<float>(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f});
{grad_flat(0), 0.0f, 0.0f});
ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min); ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
Tensor* output_bprop_wrt_max = GetOutput(2); Tensor* output_bprop_wrt_max = GetOutput(2);
Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3})); Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3}));
FillValues<float>(&expected_bprop_wrt_max, FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)});
{0.0f, 0.0f, grad_flat(5)});
ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max); ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
} }
@ -2270,8 +2265,7 @@ TEST_F(QuantOpsTest,
Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT,
TensorShape({1, 2, 3, 4})); TensorShape({1, 2, 3, 4}));
auto grad_flat = GetInput(0).flat<float>(); auto grad_flat = GetInput(0).flat<float>();
FillValues<float>( FillValues<float>(&expected_bprop_wrt_input,
&expected_bprop_wrt_input,
{0.0f, grad_flat(1), grad_flat(2), 0.0f, {0.0f, grad_flat(1), grad_flat(2), 0.0f,
0.0f, grad_flat(5), grad_flat(6), 0.0f, 0.0f, grad_flat(5), grad_flat(6), 0.0f,
0.0f, grad_flat(9), grad_flat(10), 0.0f, 0.0f, grad_flat(9), grad_flat(10), 0.0f,

View File

@ -255,8 +255,8 @@ void FIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
// TODO(josh11b): This makes two copies of callback, avoid this if possible. // TODO(josh11b): This makes two copies of callback, avoid this if possible.
dequeue_attempts_.emplace_back( dequeue_attempts_.emplace_back(
num_elements, [callback]() { callback(Tuple()); }, ctx, cm, token, num_elements, [callback]() { callback(Tuple()); }, ctx, cm, token,
[callback, allow_small_batch, this](Attempt* attempt) [callback, allow_small_batch,
EXCLUSIVE_LOCKS_REQUIRED(mu_) { this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
int64 queue_size = queues_[0].size(); int64 queue_size = queues_[0].size();
if (closed_ && queue_size < attempt->elements_requested) { if (closed_ && queue_size < attempt->elements_requested) {
@ -317,9 +317,8 @@ void FIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
const TensorShape shape = const TensorShape shape =
ManyOutShape(i, attempt->elements_requested); ManyOutShape(i, attempt->elements_requested);
Tensor element; Tensor element;
attempt->context->SetStatus( attempt->context->SetStatus(attempt->context->allocate_temp(
attempt->context->allocate_temp(component_dtypes_[i], component_dtypes_[i], shape, &element));
shape, &element));
if (!attempt->context->status().ok()) return kComplete; if (!attempt->context->status().ok()) return kComplete;
attempt->tuple.emplace_back(element); attempt->tuple.emplace_back(element);
} }
@ -327,8 +326,8 @@ void FIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
result = kProgress; result = kProgress;
Tuple tuple; Tuple tuple;
DequeueLocked(attempt->context, &tuple); DequeueLocked(attempt->context, &tuple);
const int64 index = attempt->tuple[0].dim_size(0) - const int64 index =
attempt->elements_requested; attempt->tuple[0].dim_size(0) - attempt->elements_requested;
for (int i = 0; i < num_components(); ++i) { for (int i = 0; i < num_components(); ++i) {
attempt->context->SetStatus(batch_util::CopyElementToSlice( attempt->context->SetStatus(batch_util::CopyElementToSlice(
std::move(tuple[i]), &attempt->tuple[i], index)); std::move(tuple[i]), &attempt->tuple[i], index));

View File

@ -18,8 +18,8 @@ limitations under the License.
#define EIGEN_USE_THREADS #define EIGEN_USE_THREADS
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.h"
#include "tensorflow/core/framework/variant_encode_decode.h" #include "tensorflow/core/framework/variant_encode_decode.h"
@ -118,7 +118,8 @@ DEFINE_SETONE_SYCL(double);
template <typename T> template <typename T>
struct FillFunctor<Eigen::ThreadPoolDevice, T> { struct FillFunctor<Eigen::ThreadPoolDevice, T> {
void operator()(const Eigen::ThreadPoolDevice& d, typename TTypes<T>::Flat out, void operator()(const Eigen::ThreadPoolDevice& d,
typename TTypes<T>::Flat out,
typename TTypes<T>::ConstScalar in) { typename TTypes<T>::ConstScalar in) {
out.device(d) = out.constant(in()); out.device(d) = out.constant(in());
} }
@ -150,8 +151,7 @@ struct FillFunctor<Eigen::SyclDevice, T> {
} }
}; };
#define DEFINE_FILL_SYCL(T) \ #define DEFINE_FILL_SYCL(T) template struct FillFunctor<Eigen::SyclDevice, T>;
template struct FillFunctor<Eigen::SyclDevice, T>;
DEFINE_FILL_SYCL(float); DEFINE_FILL_SYCL(float);
DEFINE_FILL_SYCL(double); DEFINE_FILL_SYCL(double);
TF_CALL_INTEGRAL_TYPES(DEFINE_FILL_SYCL) TF_CALL_INTEGRAL_TYPES(DEFINE_FILL_SYCL)

View File

@@ -232,7 +232,8 @@ class FractionalAvgPoolGradOp : public OpKernel {
     // Grab the inputs.
     const Tensor& orig_input_tensor_shape = context->input(0);
-    OP_REQUIRES(context, orig_input_tensor_shape.dims() == 1 &&
+    OP_REQUIRES(context,
+                orig_input_tensor_shape.dims() == 1 &&
                     orig_input_tensor_shape.NumElements() == 4,
                 errors::InvalidArgument("original input tensor shape must be"
                                         "1-dimensional and 4 elements"));

View File

@@ -253,8 +253,7 @@ class SymbolicGradientOp : public AsyncOpKernel {
       args.push_back(ctx->input(i));
     }
     std::vector<Tensor>* rets = new std::vector<Tensor>;
-    lib->Run(
-        opts, handle, args, rets, [ctx, done, rets](const Status& status) {
+    lib->Run(opts, handle, args, rets, [ctx, done, rets](const Status& status) {
       if (!status.ok()) {
         ctx->SetStatus(status);
       } else if (rets->size() != ctx->num_outputs()) {

View File

@@ -68,7 +68,8 @@ void InvVarianceToVariance<T>::operator()(const Eigen::GpuDevice& d,
 template <class T>
 void SetNanFunctor<T>::operator()(const Eigen::GpuDevice& d,
                                   typename TTypes<T>::Flat out) {
-  To32Bit(out).device(d) = To32Bit(out).constant(Eigen::NumTraits<T>::quiet_NaN());
+  To32Bit(out).device(d) =
+      To32Bit(out).constant(Eigen::NumTraits<T>::quiet_NaN());
 }
 
 template class VarianceToInvVariance<float>;

View File

@@ -18,12 +18,12 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/prefetch.h"
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
@@ -52,7 +52,8 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
   const size_t slice_bytes = slice_elems * sizeof(T);
   auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
   mutex mu;
-  // Store the value of invalidate index for printing error information, it's a shared variable.
+  // Store the value of invalidate index for printing error information, it's a
+  // shared variable.
   SliceIndex result = -1;
   auto work = [&](int64 start, int64 end) {
     SliceIndex batch_idx = static_cast<SliceIndex>(start / indices_size);
@@ -66,7 +67,8 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
       SliceIndex b_next = batch_idx + 1;
       if ((batch_idx == batch_idx_end && i_next < indices_idx_end) ||
           (i_next < indices_size)) {
-        port::prefetch<port::PREFETCH_HINT_T0>(&params(batch_idx, indices(i_next), 0));
+        port::prefetch<port::PREFETCH_HINT_T0>(
+            &params(batch_idx, indices(i_next), 0));
         port::prefetch<port::PREFETCH_HINT_T0>(&out(batch_idx, i_next, 0));
         b_next = batch_idx;
       } else if (b_next <= batch_idx_end) {
@@ -85,7 +87,8 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
       // ahead-of-time compilation binary size).
       if (is_simple_type<T>::value) {
         // Avoid auto-promotion to Index from SliceIndex by casting.
-        memcpy(out_base + (batch_idx * indices_size + indices_idx) * slice_elems,
+        memcpy(
+            out_base + (batch_idx * indices_size + indices_idx) * slice_elems,
                params_base + (batch_idx * static_cast<SliceIndex>(limit) +
                               static_cast<SliceIndex>(index)) *
                                  slice_elems,
@@ -99,8 +102,8 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
     }
   };
 
-  Shard(worker_threads->num_threads, worker_threads->workers, batch_size*indices_size,
-        slice_elems * sizeof(T), work);
+  Shard(worker_threads->num_threads, worker_threads->workers,
+        batch_size * indices_size, slice_elems * sizeof(T), work);
 
   return result;
 }
@@ -143,7 +146,8 @@ struct GatherFunctorCPU {
 
 template <typename Device, typename T, typename Index>
 struct GatherFunctor {
-  int64 operator()(OpKernelContext* ctx, typename TTypes<T, 3>::ConstTensor params,
+  int64 operator()(OpKernelContext* ctx,
+                   typename TTypes<T, 3>::ConstTensor params,
                    typename TTypes<Index>::ConstFlat indices,
                    typename TTypes<T, 3>::Tensor out);
 };
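HandleCopies shards batch_size * indices_size units of work, prefetches the next params and output slices, and for simple types copies one slice per index with memcpy. A standalone sketch of that core per-index copy (hypothetical names, not the TensorFlow functor):

// Standalone sketch (hypothetical names): for each gathered index, copy one
// slice of `slice_elems` elements from params into the packed output buffer.
#include <cstdint>
#include <cstring>
#include <vector>

template <typename T>
void GatherSlices(const T* params, const int64_t* indices,
                  int64_t indices_size, int64_t slice_elems, T* out) {
  for (int64_t i = 0; i < indices_size; ++i) {
    const int64_t index = indices[i];  // bounds-checked in the real kernel
    std::memcpy(out + i * slice_elems, params + index * slice_elems,
                slice_elems * sizeof(T));
  }
}

int main() {
  std::vector<float> params = {0, 0, 1, 1, 2, 2, 3, 3};  // 4 rows x 2 columns
  std::vector<int64_t> indices = {3, 1, 0};
  std::vector<float> out(indices.size() * 2);
  GatherSlices(params.data(), indices.data(), 3, 2, out.data());
  return 0;
}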

View File

@@ -106,8 +106,7 @@ class GatherOp : public OpKernel {
     auto out_flat = out->shaped<T, 3>({outer_size, N, inner_size});
 
     functor::GatherFunctor<Device, T, Index> functor;
-    int64 bad_i = functor(c, params_flat,
-                          indices_flat, out_flat);
+    int64 bad_i = functor(c, params_flat, indices_flat, out_flat);
 
     OP_REQUIRES(
         c, bad_i < 0,

View File

@@ -50,9 +50,8 @@ class HingeLossUpdater : public DualLossUpdater {
     // valid value for new dual = 0
     // c. new optimal value > 1.0. Then new optimal value should be set to 1.0.
     const double candidate_optimal_dual =
-        current_dual +
-        (label - wx) /
-            (num_loss_partitions * example_weight * weighted_example_norm);
+        current_dual + (label - wx) / (num_loss_partitions * example_weight *
+                                       weighted_example_norm);
     if (label * candidate_optimal_dual < 0) {
       return 0.0;
     }

View File

@ -17,16 +17,16 @@ limitations under the License.
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "tensorflow/core/kernels/histogram_op.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "external/cub_archive/cub/device/device_histogram.cuh" #include "external/cub_archive/cub/device/device_histogram.cuh"
#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/kernels/histogram_op.h"
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h" #include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h" #include "tensorflow/core/util/cuda_kernel_helper.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
namespace tensorflow { namespace tensorflow {
@ -104,8 +104,8 @@ struct HistogramFixedWidthFunctor<GPUDevice, T, Tout> {
/* num_samples */ num_samples, /* num_samples */ num_samples,
/* stream */ stream); /* stream */ stream);
if (err != cudaSuccess) { if (err != cudaSuccess) {
return errors::Internal("Could not launch HistogramRange: ", return errors::Internal(
cudaGetErrorString(err), "."); "Could not launch HistogramRange: ", cudaGetErrorString(err), ".");
} }
return Status::OK(); return Status::OK();

View File

@@ -109,7 +109,8 @@ struct ImageResizerState {
     ValidateAndCalculateOutputSize(context, input);
     if (!context->status().ok()) return;
     OP_REQUIRES_OK(context, context->allocate_output(
-                                0, TensorShape({input.dim_size(0), out_height,
+                                0,
+                                TensorShape({input.dim_size(0), out_height,
                                                 out_width, input.dim_size(3)}),
                                 &output));
   }
@@ -168,7 +169,8 @@ struct ImageResizerGradientState {
         CalculateResizeScale(original_width, resized_width, align_corners_);
     output = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(
-                                0, TensorShape({batch_size, original_height,
+                                0,
+                                TensorShape({batch_size, original_height,
                                                 original_width, channels}),
                                 &output));
   }
