Branch 183429339 (#16469)

* Change `reduce_logsumexp` to internally use `reshape` rather than `squeeze`, since the latter requires the `axis` arg to be a Python `list`. PiperOrigin-RevId: 183396533
* Kernel utils to support broadcast add and mul. PiperOrigin-RevId: 183397494
* Updating sparsify_gather. PiperOrigin-RevId: 183402917
* [tf.data] Move slow-path-related code into the slow path in IteratorHandleOp::Compute(). This slightly reduces the amount of work performed when an iterator is accessed (after the first access), and potentially reduces contention if concurrent steps are accessing the same iterator. PiperOrigin-RevId: 183406221
* Cleanup: Ran clang-format on all *.{cc,h} files under grappler. PiperOrigin-RevId: 183406440
* Increase the shard count of //third_party/tensorflow/python:nn_batchnorm_test to avoid timeouts. When run under asan, the test runs for about 5 minutes, and sometimes longer, causing frequent timeouts. This change increases the shard count of the test to 4, which brings the run time of the longest-running shard under asan to about 2 minutes. PiperOrigin-RevId: 183414888
* Add available choices to toco flags and fix minor formatting issues. PiperOrigin-RevId: 183415713
* Performance improvements to some GPU code to use shared locks instead of unique locks for some hotspot cases. PiperOrigin-RevId: 183418559
* [XLA] Improve error message for bad slices. PiperOrigin-RevId: 183420038
* Fix py3 build rules for all py tests under py2tf. PiperOrigin-RevId: 183422144
* Fix bug with Operation._control_inputs setter. PiperOrigin-RevId: 183422192
* Make softmax_op_test.py work with the C API enabled. PiperOrigin-RevId: 183422829
* Cleanup: Ran clang-format on all *.{cc,h} files in tensorflow/core/kernels. PiperOrigin-RevId: 183423961
* Fix the documentation for the dense layer for how rank > 2 inputs are handled. PiperOrigin-RevId: 183425868
* Cleanup: Ran clang-format on all *.{cc,h} files in tensorflow/core/ops. PiperOrigin-RevId: 183429339
Parent: f84623507b
Commit: 982549ea34
@@ -37,6 +37,9 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 
+using tensorflow::str_util::Join;
+using tensorflow::strings::Printf;
+
 namespace xla {
 
 namespace {
@@ -934,7 +937,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       "inferring shape for <%s>(%s, %s) with broadcast_dimensions={%s}",
       BinaryOperation_Name(operation).c_str(),
       ShapeUtil::HumanString(lhs).c_str(), ShapeUtil::HumanString(rhs).c_str(),
-      tensorflow::str_util::Join(broadcast_dimensions, ", ").c_str());
+      Join(broadcast_dimensions, ", ").c_str());
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs));
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs));
 
@@ -1097,7 +1100,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "Map operation requires all operands to have the same shape; got: "
        "%s",
-        tensorflow::str_util::Join(pieces, ", ").c_str());
+        Join(pieces, ", ").c_str());
   }
 
   // Check that dimensions.size == arg_shape.dimensions_size() (we currently
@@ -1114,7 +1117,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     if (dimensions[i] != i) {
       return InvalidArgument(
           "Map requires monotonically increasing dimension numbers, found: %s ",
-          tensorflow::str_util::Join(dimensions, ", ").c_str());
+          Join(dimensions, ", ").c_str());
     }
   }
 
@@ -1914,21 +1917,28 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     const Shape& arg, tensorflow::gtl::ArraySlice<int64> starts,
     tensorflow::gtl::ArraySlice<int64> limits,
     tensorflow::gtl::ArraySlice<int64> strides) {
+  auto error = [&](const string& message) {
+    return InvalidArgument(
+        "%s in slice operation; argument shape: %s; starts: {%s}; limits: "
+        "{%s}; strides: {%s}",
+        message.c_str(), ShapeUtil::HumanString(arg).c_str(),
+        Join(starts, ",").c_str(), Join(limits, ",").c_str(),
+        Join(strides, ",").c_str());
+  };
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(arg, "operand of slice"));
   VLOG(2) << tensorflow::strings::Printf(
       "slicing shape %s starts={%s} limits={%s}",
-      ShapeUtil::HumanString(arg).c_str(),
-      tensorflow::str_util::Join(starts, ", ").c_str(),
-      tensorflow::str_util::Join(limits, ", ").c_str());
+      ShapeUtil::HumanString(arg).c_str(), Join(starts, ", ").c_str(),
+      Join(limits, ", ").c_str());
 
   if (starts.size() != limits.size()) {
-    return InvalidArgument("slice start and limit sizes differ: %zu vs %zu",
-                           starts.size(), limits.size());
+    return error(Printf("slice start and limit sizes differ: %zu vs %zu",
+                        starts.size(), limits.size()));
   }
 
   if (starts.size() != strides.size()) {
-    return InvalidArgument("slice start and strides sizes differ: %zu vs %zu",
-                           starts.size(), strides.size());
+    return error(Printf("slice start and strides sizes differ: %zu vs %zu",
+                        starts.size(), strides.size()));
   }
 
   if (starts.size() != ShapeUtil::Rank(arg)) {
@@ -1947,20 +1957,20 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
                                              start_index);
     }
     if (limit_index > arg.dimensions(dimension)) {
-      return InvalidArgument(
-          "limit index (%lld) must be less than or equal to dimension "
-          "size (%lld)",
-          limit_index, arg.dimensions(dimension));
+      return error(
+          Printf("limit index (%lld) must be less than or equal to dimension "
+                 "size (%lld)",
+                 limit_index, arg.dimensions(dimension)));
     }
     VLOG(2) << tensorflow::strings::Printf("starts[%lld] = %lld", dimension,
                                            start_index);
     VLOG(2) << tensorflow::strings::Printf("limits[%lld] = %lld", dimension,
                                            limit_index);
     if (start_index > limit_index) {
-      return InvalidArgument(
-          "limit index (%lld) must be greater or equal to "
-          "start index (%lld) in slice with positive stride",
-          limit_index, start_index);
+      return error(
+          Printf("limit index (%lld) must be greater or equal to "
                 "start index (%lld) in slice with positive stride",
+                 limit_index, start_index));
     }
     if (stride <= 0) {
       return InvalidArgument("stride (%lld) must be positive", stride);
@@ -1983,7 +1993,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       "slicing shape %s at dynamic start_indices %s with slice_sizes={%s}",
       ShapeUtil::HumanString(operand_shape).c_str(),
       ShapeUtil::HumanString(start_indices_shape).c_str(),
-      tensorflow::str_util::Join(slice_sizes, ", ").c_str());
+      Join(slice_sizes, ", ").c_str());
 
   if (ShapeUtil::Rank(start_indices_shape) != 1) {
     return InvalidArgument(
@@ -2280,8 +2290,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "Reshape dimensions [%s] are not a permutation of the operand "
        "dimensions (operand shape is %s).",
-        tensorflow::str_util::Join(dimensions, ",").c_str(),
-        ShapeUtil::HumanString(operand).c_str());
+        Join(dimensions, ",").c_str(), ShapeUtil::HumanString(operand).c_str());
   }
 
   return inferred_shape;
@@ -2373,8 +2382,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   // The applied function's arity equals the number of arguments.
   if (arg_shapes.size() != to_apply.parameters_size()) {
     string computation_signature = ShapeUtil::HumanString(to_apply);
-    string argument_shapes = tensorflow::str_util::Join(
-        arg_shapes, ", ", [](string* out, const Shape* shape) {
+    string argument_shapes =
+        Join(arg_shapes, ", ", [](string* out, const Shape* shape) {
          tensorflow::strings::StrAppend(out, ShapeUtil::HumanString(*shape));
        });
     return InvalidArgument(
@@ -1512,5 +1512,20 @@ TEST_F(ShapeInferenceTest, Conditional) {
                            "must have the same shape"));
 }
 
+TEST_F(ShapeInferenceTest, BadSlice) {
+  auto arg = ShapeUtil::MakeShape(F32, {4});
+  StatusOr<Shape> statusor =
+      ShapeInference::InferSliceShape(arg, {0}, {5}, {1});
+  ASSERT_FALSE(statusor.ok());
+
+  LOG(INFO) << statusor.status();
+
+  EXPECT_THAT(statusor.status().error_message(),
+              HasSubstr("less than or equal to dimension size"))
+      << statusor.status();
+  EXPECT_THAT(statusor.status().error_message(), HasSubstr("argument shape"))
+      << statusor.status();
+}
+
 }  // namespace
 }  // namespace xla
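The slice-validation hunks above funnel every failure through a local error lambda, so each message carries the full slice configuration (argument shape, starts, limits, strides), which is what the new BadSlice test checks for. As a rough, generic sketch of that pattern — standalone C++, not the XLA code; the function and messages here are made up for illustration:

#include <cstdio>
#include <string>

// Every early exit goes through one lambda that appends the shared context
// (here: the slice bounds) to the specific failure message.
std::string ValidateSlice(int start, int limit, int dim_size) {
  auto error = [&](const std::string& message) {
    char buf[256];
    std::snprintf(buf, sizeof(buf),
                  "%s in slice operation; starts: {%d}; limits: {%d}",
                  message.c_str(), start, limit);
    return std::string(buf);
  };
  if (limit > dim_size) {
    return error("limit index must be less than or equal to dimension size");
  }
  if (start > limit) {
    return error("limit index must be greater or equal to start index");
  }
  return "ok";
}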
@@ -71,6 +71,32 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "kernel_util",
+    srcs = [
+        "kernel_util.cc",
+    ],
+    hdrs = [
+        "kernel_util.h",
+    ],
+    deps = [
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite:context",
+        "//tensorflow/contrib/lite/kernels/internal:round",
+    ],
+)
+
+tf_cc_test(
+    name = "kernel_util_test",
+    size = "small",
+    srcs = ["kernel_util_test.cc"],
+    deps = [
+        ":kernel_util",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_library(
     name = "builtin_ops",
     srcs = [
@@ -87,7 +113,6 @@ cc_library(
         "fully_connected.cc",
         "gather.cc",
         "hashtable_lookup.cc",
-        "kernel_util.cc",
         "l2norm.cc",
         "local_response_norm.cc",
         "lsh_projection.cc",
@@ -111,7 +136,6 @@ cc_library(
         "unidirectional_sequence_rnn.cc",
     ],
     hdrs = [
-        "kernel_util.h",
         "padding.h",
         "register.h",
     ],
@@ -125,6 +149,7 @@ cc_library(
     }),
     deps = [
         ":activation_functor",
+        ":kernel_util",
         ":op_macros",
         "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite:framework",
@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/kernels/kernel_util.h"
 
 #include <algorithm>
 #include <cmath>
+#include <memory>
+
 #include "tensorflow/contrib/lite/kernels/internal/round.h"
 
 namespace tflite {
@@ -84,4 +87,27 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
   }
 }
 
+bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2) {
+  return TfLiteIntArrayEqual(input1->dims, input2->dims);
+}
+
+TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
+                                        TfLiteTensor* input1,
+                                        TfLiteTensor* input2,
+                                        TfLiteIntArray** output_shape) {
+  int64_t dims1 = NumDimensions(input1);
+  int64_t dims2 = NumDimensions(input2);
+  int64_t out_dims = std::max(dims1, dims2);
+  std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> shape(
+      TfLiteIntArrayCreate(out_dims), TfLiteIntArrayFree);
+  for (int i = 0; i < out_dims; ++i) {
+    int64_t d1 = i >= dims1 ? 1 : SizeOfDimension(input1, dims1 - i - 1);
+    int64_t d2 = i >= dims2 ? 1 : SizeOfDimension(input2, dims2 - i - 1);
+    TF_LITE_ENSURE(context, d1 == d2 || d1 == 1 || d2 == 1);
+    shape->data[out_dims - i - 1] = std::max(d1, d2);
+  }
+  *output_shape = shape.release();
+  return kTfLiteOk;
+}
+
 }  // namespace tflite
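For orientation, CalculateShapeForBroadcast implements the usual NumPy-style rule: dimensions are compared right to left, a dimension of 1 (or a missing leading dimension) stretches to match the other input, and any other mismatch trips the TF_LITE_ENSURE check. For example, shapes {1, 2} and {3, 1, 1} broadcast to {3, 1, 2}, while {1, 2} against {1, 3} fails; the new kernel_util_test.cc added below exercises exactly these cases.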
@@ -35,6 +35,14 @@ inline TfLiteTensor* GetOutput(TfLiteContext* context, TfLiteNode* node,
 inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; }
 inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; }
 
+inline int64_t NumElements(const TfLiteTensor* t) {
+  int64_t count = 1;
+  for (int i = 0; i < NumDimensions(t); ++i) {
+    count *= SizeOfDimension(t, i);
+  }
+  return count;
+}
+
 inline TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context,
                                             const TfLiteNode* node, int index) {
   const bool use_tensor = node->inputs->data[index] != kOptionalTensor;
@@ -76,6 +84,15 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
                                    float* activation_min,
                                    float* activation_max);
 
+// Return true if the given tensors have the same shape.
+bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2);
+
+// Calculate the output_shape that is necessary for element-wise operations
+// with broadcasting involving the two input tensors.
+TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
+                                        TfLiteTensor* input1,
+                                        TfLiteTensor* input2,
+                                        TfLiteIntArray** output_shape);
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
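A hedged usage sketch of these helpers — roughly how a broadcasting binary op's Prepare function might call them. This is not code from the commit: the op name and tensor indices are hypothetical, and GetInput/GetOutput, TF_LITE_ENSURE_OK, TfLiteIntArrayCopy, and context->ResizeTensor are assumed to be available in this version of the TF Lite C API.

// Hypothetical Prepare() for a broadcasting binary op (illustration only).
TfLiteStatus PrepareBroadcastAdd(TfLiteContext* context, TfLiteNode* node) {
  TfLiteTensor* input1 = GetInput(context, node, 0);
  TfLiteTensor* input2 = GetInput(context, node, 1);
  TfLiteTensor* output = GetOutput(context, node, 0);

  if (HaveSameShapes(input1, input2)) {
    // No broadcasting: the output simply copies input1's shape.
    return context->ResizeTensor(context, output,
                                 TfLiteIntArrayCopy(input1->dims));
  }

  TfLiteIntArray* output_shape = nullptr;
  TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(context, input1,
                                                        input2, &output_shape));
  // ResizeTensor takes ownership of output_shape.
  return context->ResizeTensor(context, output, output_shape);
}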
tensorflow/contrib/lite/kernels/kernel_util_test.cc (new file, 150 lines)
@@ -0,0 +1,150 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/lite/kernels/kernel_util.h"

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/contrib/lite/testing/util.h"

namespace tflite {
namespace {

void ReportError(TfLiteContext* context, const char* format, ...) {}

class KernelUtilTest : public ::testing::Test {
 public:
  KernelUtilTest() {
    context_.ReportError = ReportError;

    tensor1_.dims = nullptr;
    tensor2_.dims = nullptr;
  }
  ~KernelUtilTest() {
    TfLiteTensorFree(&tensor1_);
    TfLiteTensorFree(&tensor2_);
  }

  void SetShape(TfLiteTensor* tensor, std::initializer_list<int> dims) {
    TfLiteTensorFree(tensor);
    tensor->dims = TfLiteIntArrayCreate(dims.size());
    int i = 0;
    for (int d : dims) {
      tensor->dims->data[i] = d;
      ++i;
    }
  }

  std::vector<int> GetShape(TfLiteIntArray* dims) {
    std::vector<int> result;
    for (int i = 0; i < dims->size; ++i) {
      result.push_back(dims->data[i]);
    }
    return result;
  }

 protected:
  TfLiteContext context_;
  TfLiteTensor tensor1_;
  TfLiteTensor tensor2_;
};

TEST_F(KernelUtilTest, SameShapeEmpty) {
  EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));

  SetShape(&tensor1_, {1, 2, 3});
  EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));

  SetShape(&tensor2_, {1, 2});
  EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));

  SetShape(&tensor2_, {1, 2, 3, 4});
  EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));

  SetShape(&tensor2_, {1, 2, 3});
  EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));

  SetShape(&tensor2_, {});
  EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));

  SetShape(&tensor1_, {});
  EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
}

TEST_F(KernelUtilTest, BroadcastShapeIncompatibleDim) {
  TfLiteIntArray* output = nullptr;
  SetShape(&tensor1_, {1, 2});
  SetShape(&tensor2_, {1, 3});
  EXPECT_NE(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
                                                  &tensor2_, &output));
  EXPECT_EQ(output, nullptr);
}

TEST_F(KernelUtilTest, BroadcastShapeOnes) {
  TfLiteIntArray* output = nullptr;
  SetShape(&tensor1_, {1, 1});
  SetShape(&tensor2_, {1, 3});
  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
                                                  &tensor2_, &output));
  TfLiteIntArrayFree(output);

  SetShape(&tensor1_, {1, 2});
  SetShape(&tensor2_, {1, 1});
  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
                                                  &tensor2_, &output));
  TfLiteIntArrayFree(output);
}

TEST_F(KernelUtilTest, BroadcastShapeScalars) {
  TfLiteIntArray* output = nullptr;
  SetShape(&tensor1_, {1, 2});
  SetShape(&tensor2_, {});
  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
                                                  &tensor2_, &output));
  EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2));
  TfLiteIntArrayFree(output);

  SetShape(&tensor1_, {});
  SetShape(&tensor2_, {2});
  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
                                                  &tensor2_, &output));
  EXPECT_THAT(GetShape(output), ::testing::ElementsAre(2));
  TfLiteIntArrayFree(output);
}

TEST_F(KernelUtilTest, BroadcastShapeDifferentSizes) {
  TfLiteIntArray* output = nullptr;
  SetShape(&tensor1_, {1, 2});
  SetShape(&tensor2_, {3, 1, 1});
  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
                                                  &tensor2_, &output));
  EXPECT_THAT(GetShape(output), ::testing::ElementsAre(3, 1, 2));
  TfLiteIntArrayFree(output);

  SetShape(&tensor1_, {1, 2, 3, 4});
  SetShape(&tensor2_, {1, 3, 1});
  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
                                                  &tensor2_, &output));
  EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2, 3, 4));
  TfLiteIntArrayFree(output);
}

}  // namespace
}  // namespace tflite

int main(int argc, char** argv) {
  ::tflite::LogToStderr();
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
@@ -44,9 +44,11 @@ bool ParseTocoFlagsFromCommandLineFlags(
           "For Protobuf formats, the binary format will be used."),
      Flag("input_format", parsed_flags.input_format.bind(),
           parsed_flags.input_format.default_value(),
-          "Input file format. One of: tensorflow_graphdef, "),
+          "Input file format. One of: TENSORFLOW_GRAPHDEF, TFLITE."),
      Flag("output_format", parsed_flags.output_format.bind(),
-          parsed_flags.output_format.default_value(), "Output file format."),
+          parsed_flags.output_format.default_value(),
+          "Output file format. "
+          "One of TENSORFLOW_GRAPHDEF, TFLITE, GRAPHVIZ_DOT."),
      Flag("default_ranges_min", parsed_flags.default_ranges_min.bind(),
           parsed_flags.default_ranges_min.default_value(),
           "If defined, will be used as the default value for the min bound "
@@ -58,11 +60,13 @@ bool ParseTocoFlagsFromCommandLineFlags(
      Flag("inference_type", parsed_flags.inference_type.bind(),
           parsed_flags.inference_type.default_value(),
           "Target data type of arrays in the output file (for input_arrays, "
-          "this may be overridden by inference_input_type)."),
+          "this may be overridden by inference_input_type). "
+          "One of FLOAT, QUANTIZED_UINT8."),
      Flag("inference_input_type", parsed_flags.inference_input_type.bind(),
           parsed_flags.inference_input_type.default_value(),
-          "Target data type of input arrays. If not specified, inference_type "
-          "is used."),
+          "Target data type of input arrays. "
+          "If not specified, inference_type is used. "
+          "One of FLOAT, QUANTIZED_UINT8."),
      Flag("input_type", parsed_flags.input_type.bind(),
           parsed_flags.input_type.default_value(),
           "Deprecated ambiguous flag that set both --input_data_types and "
@@ -76,35 +80,31 @@ bool ParseTocoFlagsFromCommandLineFlags(
 
      Flag("drop_fake_quant", parsed_flags.drop_fake_quant.bind(),
           parsed_flags.drop_fake_quant.default_value(),
-          "Ignore and discard FakeQuant nodes. For instance, that can be used "
-          "to "
-          "generate plain float code without fake-quantization from a "
-          "quantized "
-          "graph."),
+          "Ignore and discard FakeQuant nodes. For instance, to "
+          "generate plain float code without fake-quantization from a "
+          "quantized graph."),
      Flag(
          "reorder_across_fake_quant",
          parsed_flags.reorder_across_fake_quant.bind(),
          parsed_flags.reorder_across_fake_quant.default_value(),
          "Normally, FakeQuant nodes must be strict boundaries for graph "
          "transformations, in order to ensure that quantized inference has "
-         "the "
-         "exact same arithmetic behavior as quantized training --- which is "
-         "the "
-         "whole point of quantized training and of FakeQuant nodes in the "
-         "first "
-         "place. However, that entails subtle requirements on where exactly "
+         "the exact same arithmetic behavior as quantized training --- which "
+         "is the whole point of quantized training and of FakeQuant nodes in "
+         "the first place. "
+         "However, that entails subtle requirements on where exactly "
          "FakeQuant nodes must be placed in the graph. Some quantized graphs "
          "have FakeQuant nodes at unexpected locations, that prevent graph "
          "transformations that are necessary in order to generate inference "
          "code for these graphs. Such graphs should be fixed, but as a "
          "temporary work-around, setting this reorder_across_fake_quant flag "
-         "allows toco to perform necessary graph transformaitons on them, "
+         "allows TOCO to perform necessary graph transformaitons on them, "
          "at the cost of no longer faithfully matching inference and training "
          "arithmetic."),
      Flag("allow_custom_ops", parsed_flags.allow_custom_ops.bind(),
           parsed_flags.allow_custom_ops.default_value(),
           "If true, allow TOCO to create TF Lite Custom operators for all the "
-          "unsupported Tensorflow ops."),
+          "unsupported TensorFlow ops."),
      Flag(
          "drop_control_dependency",
          parsed_flags.drop_control_dependency.bind(),
The next group of hunks, spread across the py2tf BUILD files, all make the same one-line change: each py_test rule gains a srcs_version = "PY2AND3" attribute so the py2tf tests also build under Python 3. A representative hunk:

@@ -57,6 +57,7 @@ py_library(
 py_test(
     name = "api_test",
     srcs = ["api_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":py2tf_internal",
         "//tensorflow/python:client_testlib",

The identical line is added to these py_test targets: api_test, conversion_test, and naming_test (deps include ":py2tf_internal" and "//tensorflow/python:client_testlib"); break_canonicalization_test, call_trees_test, continue_canonicalization_test, control_flow_test, builtin_functions_test, logical_expressions_test, print_functions_test, and side_effect_guards_test (deps include ":test_lib" and "//tensorflow/contrib/py2tf/pyct"); anno_test, compiler_test, parser_test, pretty_printer_test, and templates_test (deps include ":pyct"); and access_test, live_values_test, and type_info_test (deps include ":static_analysis" and "//tensorflow/contrib/py2tf/pyct").
@@ -230,8 +230,24 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
   // TODO(tucker): actually maintain separate CPUAllocators for
   // different numa_nodes. For now, just one.
   numa_node = 0;
-  mutex_lock lock(mu_);
+
+  {
+    // Here we optimize the most common use case where cuda_host_allocators_
+    // and cuda_al_ have already been populated and since we're only reading
+    // these vectors, we can get by with a shared lock. In the slower case,
+    // we take a unique lock and populate these vectors.
+    tf_shared_lock lock(mu_);
+
+    if (FLAGS_brain_gpu_record_mem_types &&
+        static_cast<int>(cuda_al_.size()) > 0) {
+      return cuda_al_[0];
+    }
+    if (static_cast<int>(cuda_host_allocators_.size()) > numa_node) {
+      return cuda_host_allocators_[0];
+    }
+  }
+
+  mutex_lock lock(mu_);
   // Find the first valid StreamExecutor to request CUDA host memory
   // through, since any will work.
   //
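The comment in the hunk above describes a read-mostly fast path: take a shared lock to check whether the allocator vectors are already populated, and only fall back to an exclusive lock when they still have to be created. A generic sketch of that double-checked pattern using standard C++17 primitives (not the TensorFlow mutex types, and the cached value is a placeholder):

#include <mutex>
#include <shared_mutex>
#include <vector>

class AllocatorCache {
 public:
  int* Get() {
    {
      std::shared_lock<std::shared_mutex> lock(mu_);  // fast path: readers only
      if (!allocators_.empty()) return &allocators_[0];
    }
    std::unique_lock<std::shared_mutex> lock(mu_);    // slow path: may populate
    if (allocators_.empty()) allocators_.push_back(0);  // placeholder init
    return &allocators_[0];
  }

 private:
  std::shared_mutex mu_;
  std::vector<int> allocators_;
};

The re-check under the exclusive lock matters: another thread may have populated the vector between releasing the shared lock and acquiring the unique one.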
@@ -23,8 +23,7 @@ Cluster::Cluster(int timeout_s) : timeout_s_(timeout_s) {
   DisableDetailedStats(false);
 }
 
-Cluster::~Cluster() {
-}
+Cluster::~Cluster() {}
 
 void Cluster::AllowSoftPlacement(bool soft_placement_state) {
   options_.config.set_allow_soft_placement(soft_placement_state);
@@ -16,8 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
 #define TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
 
-#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/framework/variable.pb.h"
+#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
@@ -40,8 +40,8 @@ typedef Eigen::SyclDevice SYCLDevice;
 template <typename Device, typename T>
 class AdjustContrastOp : public OpKernel {
  public:
-  explicit AdjustContrastOp(OpKernelConstruction* context) : OpKernel(context) {
-  }
+  explicit AdjustContrastOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
     const Tensor& input = context->input(0);
@@ -29,8 +29,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-class AdjustContrastOpTest : public OpsTestBase {
-};
+class AdjustContrastOpTest : public OpsTestBase {};
 
 TEST_F(AdjustContrastOpTest, Simple_1113) {
   TF_EXPECT_OK(NodeDefBuilder("adjust_contrast_op", "AdjustContrastv2")
@@ -192,7 +192,8 @@ class AdjustSaturationOp<CPUDevice> : public AdjustSaturationOpBase {
     const DeviceBase::CpuWorkerThreads& worker_threads =
         *context->device()->tensorflow_cpu_worker_threads();
     Shard(worker_threads.num_threads, worker_threads.workers, channel_count,
-          kCostPerChannel, [channel_count, &input_data, &output_data, scale_h](
+          kCostPerChannel,
+          [channel_count, &input_data, &output_data, scale_h](
               int64 start_channel, int64 end_channel) {
             const float* p = input_data.data() + start_channel * kChannelSize;
             float* q = output_data.data() + start_channel * kChannelSize;
@@ -52,7 +52,8 @@ class ExtractGlimpseOp : public OpKernel {
     const int64 batch_size = input_shape.dim_size(0);
 
     const Tensor& window_size = context->input(1);
-    OP_REQUIRES(context, (window_size.shape().dims() == 1) &&
+    OP_REQUIRES(context,
+                (window_size.shape().dims() == 1) &&
                     window_size.shape().dim_size(0) == 2,
                 errors::InvalidArgument(
                     "input must be a vector of size 2 (height, width)",
@@ -48,9 +48,8 @@ struct SpatialAvgPooling {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-// Launch a custom GPU kernels from Yanqing for the avgpooling backward operation
-// that works NHWC data formats.
-// Arguments:
+// Launch a custom GPU kernels from Yanqing for the avgpooling backward
+// operation that works NHWC data formats. Arguments:
 //    top_diff: backprop to the output of the pooling layer
 //    num: number of input batches
 //    height: input height
@@ -71,8 +71,8 @@ __global__ void AvePoolBackwardNHWC(const int nthreads,
         hstart = max(hstart, 0);
         wstart = max(wstart, 0);
         int pool_size = (hend - hstart) * (wend - wstart);
-        gradient +=
-            top_diff_slice[(ph * pooled_width + pw) * channels] / dtype(pool_size);
+        gradient += top_diff_slice[(ph * pooled_width + pw) * channels] /
+                    dtype(pool_size);
       }
     }
     bottom_diff[index] = gradient;
@@ -90,8 +90,8 @@ bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num,
                             const GPUDevice& d) {
   int x_size = num * height * width * channels;
   CudaLaunchConfig config = GetCudaLaunchConfig(x_size, d);
-  AvePoolBackwardNHWC<
-      T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+  AvePoolBackwardNHWC<T>
+      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
          config.virtual_thread_count, top_diff, num, height, width, channels,
          pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w,
          pad_t, pad_t, bottom_diff);
@@ -111,13 +111,14 @@ class Barrier : public ResourceBase {
     mutex_lock lock(mu_);
     if (closed_) {
       OP_REQUIRES_ASYNC(
-          ctx, !cancel_pending_enqueues_ &&
-                   (num_inserted == 0 || !incomplete_.empty()),
+          ctx,
+          !cancel_pending_enqueues_ &&
+              (num_inserted == 0 || !incomplete_.empty()),
           errors::Cancelled(
               "Barrier ", name_, " is closed. Pending enqueues cancelled: ",
-              cancel_pending_enqueues_, ". Number of new insertions: ",
-              num_inserted, ". Number of incomplete keys: ",
-              incomplete_.size(), "."),
+              cancel_pending_enqueues_,
+              ". Number of new insertions: ", num_inserted,
+              ". Number of incomplete keys: ", incomplete_.size(), "."),
           callback);
     }
 
@@ -128,7 +129,8 @@ class Barrier : public ResourceBase {
 
     for (int i = 0; i < num_inserted; ++i) {
       OP_REQUIRES_OK_ASYNC(
-          ctx, InsertOneLocked<T>(ctx, keys, values, element_shape,
+          ctx,
+          InsertOneLocked<T>(ctx, keys, values, element_shape,
                              component_index, i, &ready_tuples,
                              &new_elements),
           callback);
@@ -317,8 +319,9 @@ class Barrier : public ResourceBase {
         return errors::Cancelled(
             "Barrier ", name_,
             " is closed, but attempted to insert a brand new key: ",
-            keys_vec(i), ". Pending enqueues cancelled: ",
-            cancel_pending_enqueues_, ". Insertion index: ", i,
+            keys_vec(i),
+            ". Pending enqueues cancelled: ", cancel_pending_enqueues_,
+            ". Insertion index: ", i,
             ". Number of incomplete keys: ", incomplete_.size(), ".");
       }
     } else {
@@ -532,11 +535,12 @@ class InsertManyOp : public BarrierOpKernel {
     OP_REQUIRES_ASYNC(
         ctx, component_index_ < barrier->num_components(),
         errors::InvalidArgument("The component ID is out of range ",
-                                component_index_, " > num_components", " (= ",
-                                barrier->num_components(), ")"),
+                                component_index_, " > num_components",
+                                " (= ", barrier->num_components(), ")"),
        callback);
     OP_REQUIRES_OK_ASYNC(
-        ctx, ctx->MatchSignature({DT_STRING_REF, DT_STRING,
+        ctx,
+        ctx->MatchSignature({DT_STRING_REF, DT_STRING,
                              barrier->component_type(component_index_)},
                             {}),
         callback);
@@ -13,22 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_util.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
 #include "tensorflow/core/kernels/batching_util/periodic_function.h"
+#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/split_lib.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -429,11 +429,10 @@ template <typename Scalar>
 struct LaunchBatchMatMul<SYCLDevice, Scalar> {
   static void Launch(OpKernelContext* context, const Tensor& in_x,
                      const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
-
     // Number of matrix multiplies i.e. size of the batch.
     const int64 batch_size = in_x.dim_size(0);
-    ParallelMatMulKernelSYCL<Scalar>::Run(context, in_x, in_y, adj_x, adj_y, out,
-                                          0, batch_size);
+    ParallelMatMulKernelSYCL<Scalar>::Run(context, in_x, in_y, adj_x, adj_y,
+                                          out, 0, batch_size);
   }
 };
 #endif // TENSORFLOW_USE_SYCL
@@ -462,9 +461,9 @@ class BatchMatMul : public OpKernel {
     TensorShape out_shape;
     for (int i = 0; i < ndims - 2; ++i) {
       OP_REQUIRES(ctx, in0.dim_size(i) == in1.dim_size(i),
-                  errors::InvalidArgument("In[0].dim(", i, ") and In[1].dim(",
-                                          i, ") must be the same: ",
-                                          in0.shape().DebugString(), " vs ",
+                  errors::InvalidArgument(
+                      "In[0].dim(", i, ") and In[1].dim(", i,
+                      ") must be the same: ", in0.shape().DebugString(), " vs ",
                       in1.shape().DebugString()));
       out_shape.AddDim(in0.dim_size(i));
     }
@@ -54,7 +54,8 @@ static Graph* BatchMatmul(int b, int m, int k, int n, bool adjoint_a,
 // BM_BatchMatmulDev(B, M, K, N, TA, TB, std::complex<float>, DT_COMPLEX64,
 // gpu);
 // BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, cpu); \
-// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, cpu); \
+// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, cpu);
+// \
 // BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, gpu); \
 // BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, gpu);
 
@@ -56,7 +56,8 @@ static void BatchToSpaceOpCompute(OpKernelContext* context,
               errors::InvalidArgument("input rank should be >= ", 1 + block_dims,
                                       " instead of ", orig_input_tensor.dims()));
 
-  OP_REQUIRES(context, TensorShapeUtils::IsMatrix(orig_crops.shape()) &&
+  OP_REQUIRES(context,
+              TensorShapeUtils::IsMatrix(orig_crops.shape()) &&
                   block_dims == orig_crops.dim_size(0) &&
                   2 == orig_crops.dim_size(1),
              errors::InvalidArgument("crops should have shape [", block_dims,
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/util/bcast.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/bcast.h"
 
 namespace tensorflow {
 
@@ -77,12 +77,12 @@ void BiasGPU<T>::compute(const GPUDevice& d, const T* input, const T* bias,
 }
 CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
 if (data_format == FORMAT_NHWC) {
-BiasNHWCKernel<
-T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+BiasNHWCKernel<T>
+<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
 config.virtual_thread_count, input, bias, output, bias_size);
 } else {
-BiasNCHWKernel<
-T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+BiasNCHWKernel<T>
+<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
 config.virtual_thread_count, input, bias, output, bias_size,
 image_size);
 }

@@ -206,8 +206,8 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
 // Check if we have enough shared memory.
 if (shared_memory_size <= max_shared_memory_size) {
 if (data_format == FORMAT_NHWC) {
-BiasGradNHWC_SharedAtomics<
-T><<<config.block_count, config.thread_per_block, shared_memory_size,
+BiasGradNHWC_SharedAtomics<T>
+<<<config.block_count, config.thread_per_block, shared_memory_size,
 d.stream()>>>(total_count, output_backprop, bias_backprop,
 bias_size);
 } else {

@@ -217,8 +217,8 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
 if (config.thread_per_block < kWarpSize) {
 config.thread_per_block = kWarpSize;
 }
-BiasGradNCHW_SharedAtomics<
-T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+BiasGradNCHW_SharedAtomics<T>
+<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
 output_backprop, bias_backprop, batch, bias_size, image_size,
 group_size);
 }

@@ -227,13 +227,14 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
 // output block, it is possible to process one group of elements at a time.
 // But for now, we simply fall back to the naive implementation.
 if (data_format == FORMAT_NHWC) {
-BiasGradNHWC_Naive<
-T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+BiasGradNHWC_Naive<T>
+<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
 total_count, output_backprop, bias_backprop, bias_size);
 } else {
-BiasGradNCHW_Naive<
-T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-total_count, output_backprop, bias_backprop, bias_size, image_size);
+BiasGradNCHW_Naive<T>
+<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+total_count, output_backprop, bias_backprop, bias_size,
+image_size);
 }
 }
 }
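Note: the bias hunks above only re-wrap the CUDA triple-chevron launches; the launch configuration is unchanged. As a minimal, self-contained sketch of how a block count is typically derived from an element count (illustrative only; the GetCudaLaunchConfig helper used above lives in cuda_kernel_helper.h and has more logic than this):

    #include <algorithm>
    #include <cstdio>

    // Hypothetical stand-in for the launch-config arithmetic: given a total
    // element count, pick a thread-block size and a grid size that covers it.
    struct LaunchConfig {
      int block_count;
      int thread_per_block;
      int virtual_thread_count;
    };

    LaunchConfig MakeLaunchConfig(int total_count,
                                  int max_threads_per_block = 1024,
                                  int max_blocks = 65535) {
      LaunchConfig config;
      config.thread_per_block = std::min(total_count, max_threads_per_block);
      // Ceiling division so every element is covered by some thread.
      config.block_count = std::min(
          (total_count + config.thread_per_block - 1) / config.thread_per_block,
          max_blocks);
      config.virtual_thread_count = total_count;
      return config;
    }

    int main() {
      LaunchConfig c = MakeLaunchConfig(1000000);
      std::printf("blocks=%d threads=%d\n", c.block_count, c.thread_per_block);
    }

The real helper also grid-strides when the element count exceeds the grid, which is why the kernels above loop over config.virtual_thread_count.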
@@ -48,7 +48,7 @@ EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC const T SubtleMustCopy(const T &x) {
 auto *to_x = reinterpret_cast<const volatile T *>(&x);
 return *to_x;
 }
-} // namespace tensorflow::internal
+} // namespace internal
 } // namespace tensorflow

 #endif // TENSORFLOW_UTIL_BOUNDS_CHECK_H_

@@ -126,12 +126,12 @@ REGISTER_KERNEL_BUILDER(Name("UniformCandidateSampler").Device(DEVICE_CPU),
 REGISTER_KERNEL_BUILDER(Name("LogUniformCandidateSampler").Device(DEVICE_CPU),
 SimpleCandidateSamplerOp<LogUniformSampler>);

-REGISTER_KERNEL_BUILDER(Name("LearnedUnigramCandidateSampler")
-.Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(
+Name("LearnedUnigramCandidateSampler").Device(DEVICE_CPU),
 SimpleCandidateSamplerOp<UnigramSampler>);

-REGISTER_KERNEL_BUILDER(Name("ThreadUnsafeUnigramCandidateSampler")
-.Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(
+Name("ThreadUnsafeUnigramCandidateSampler").Device(DEVICE_CPU),
 SimpleCandidateSamplerOp<ThreadUnsafeUnigramSampler>);

 class AllCandidateSamplerOp : public BaseCandidateSamplerOp {

@@ -197,7 +197,8 @@ class ComputeAccidentalHitsOp : public OpKernel {
 void Compute(OpKernelContext* context) override {
 const Tensor& in_true_candidates = context->input(0);
 const TensorShape& in_true_candidates_shape = in_true_candidates.shape();
-OP_REQUIRES(context, TensorShapeUtils::IsMatrix(in_true_candidates_shape) &&
+OP_REQUIRES(context,
+TensorShapeUtils::IsMatrix(in_true_candidates_shape) &&
 in_true_candidates_shape.dim_size(1) == num_true_,
 errors::InvalidArgument(
 "true_candidates must be a batch_size * num_true matrix"));

@@ -252,4 +252,3 @@ REGISTER_KERNEL_BUILDER(
 CpuCastOp);
 #endif // TENSORFLOW_USE_SYCL
 } // end namespace tensorflow
-

@@ -131,7 +131,8 @@ struct scalar_cast_op<::tensorflow::bfloat16, float> {
 p[0] = a.value;
 p[1] = 0;
 #else
-static_assert(::tensorflow::port::kLittleEndian, "Not a little endian system!");
+static_assert(::tensorflow::port::kLittleEndian,
+"Not a little endian system!");
 p[0] = 0;
 p[1] = a.value;
 #endif
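Note: the scalar_cast_op hunk is only re-wrapping the static_assert, but the surrounding code is the usual bfloat16 trick: a bfloat16 is the upper 16 bits of an IEEE float, so which uint16 half of the float's storage to write (p[0] or p[1]) depends on endianness. A self-contained sketch of the same idea, not the TensorFlow implementation:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Truncate a float to a bfloat16 payload by keeping its high 16 bits.
    uint16_t FloatToBfloat16(float f) {
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof(bits));
      return static_cast<uint16_t>(bits >> 16);
    }

    // Expand a bfloat16 payload back to float by placing it in the high 16
    // bits. On a little-endian machine those are the second uint16 of the
    // float's storage, which is what the p[0]/p[1] selection above encodes.
    float Bfloat16ToFloat(uint16_t b) {
      uint32_t bits = static_cast<uint32_t>(b) << 16;
      float f;
      std::memcpy(&f, &bits, sizeof(f));
      return f;
    }

    int main() {
      float x = 3.14159f;
      float y = Bfloat16ToFloat(FloatToBfloat16(x));
      std::printf("%f -> %f (reduced precision)\n", x, y);
    }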
@@ -108,12 +108,12 @@ class HSVToRGBOp : public OpKernel {
 };

 #define REGISTER_CPU(T) \
-REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_CPU) \
-.TypeConstraint<T>("T"), \
+REGISTER_KERNEL_BUILDER( \
+Name("RGBToHSV").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
 RGBToHSVOp<CPUDevice, T>); \
 template class RGBToHSVOp<CPUDevice, T>; \
-REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_CPU) \
-.TypeConstraint<T>("T"), \
+REGISTER_KERNEL_BUILDER( \
+Name("HSVToRGB").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
 HSVToRGBOp<CPUDevice, T>); \
 template class HSVToRGBOp<CPUDevice, T>;
 TF_CALL_float(REGISTER_CPU);

@@ -125,25 +125,24 @@ TF_CALL_double(REGISTER_CPU);
 namespace functor {
 #define DECLARE_GPU(T) \
 template <> \
-void RGBToHSV<GPUDevice, T>::operator()(const GPUDevice& d, \
-TTypes<T, 2>::ConstTensor input_data, \
-TTypes<T, 1>::Tensor range, \
-TTypes<T, 2>::Tensor output_data); \
+void RGBToHSV<GPUDevice, T>::operator()( \
+const GPUDevice& d, TTypes<T, 2>::ConstTensor input_data, \
+TTypes<T, 1>::Tensor range, TTypes<T, 2>::Tensor output_data); \
 extern template struct RGBToHSV<GPUDevice, T>; \
 template <> \
-void HSVToRGB<GPUDevice, T>::operator()(const GPUDevice& d, \
-TTypes<T, 2>::ConstTensor input_data, \
+void HSVToRGB<GPUDevice, T>::operator()( \
+const GPUDevice& d, TTypes<T, 2>::ConstTensor input_data, \
 TTypes<T, 2>::Tensor output_data); \
 extern template struct HSVToRGB<GPUDevice, T>;
 TF_CALL_float(DECLARE_GPU);
 TF_CALL_double(DECLARE_GPU);
 } // namespace functor
 #define REGISTER_GPU(T) \
-REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_GPU) \
-.TypeConstraint<T>("T"), \
+REGISTER_KERNEL_BUILDER( \
+Name("RGBToHSV").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
 RGBToHSVOp<GPUDevice, T>); \
-REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_GPU) \
-.TypeConstraint<T>("T"), \
+REGISTER_KERNEL_BUILDER( \
+Name("HSVToRGB").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
 HSVToRGBOp<GPUDevice, T>);
 TF_CALL_float(REGISTER_GPU);
 TF_CALL_double(REGISTER_GPU);

@@ -151,11 +150,11 @@ TF_CALL_double(REGISTER_GPU);

 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL(T) \
-REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_SYCL) \
-.TypeConstraint<T>("T"), \
+REGISTER_KERNEL_BUILDER( \
+Name("RGBToHSV").Device(DEVICE_SYCL).TypeConstraint<T>("T"), \
 RGBToHSVOp<SYCLDevice, T>); \
-REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_SYCL) \
-.TypeConstraint<T>("T"), \
+REGISTER_KERNEL_BUILDER( \
+Name("HSVToRGB").Device(DEVICE_SYCL).TypeConstraint<T>("T"), \
 HSVToRGBOp<SYCLDevice, T>);
 TF_CALL_float(REGISTER_SYCL);
 TF_CALL_double(REGISTER_SYCL);
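Note: the three hunks above reflow the per-type registration macros without changing what gets registered. The TF_CALL_float/TF_CALL_double pattern is an X-macro: each TF_CALL_* invokes whatever macro you hand it once per type. A self-contained sketch of that idiom with hypothetical names (the registry, Registrar, and CALL_* macros here are illustrations, not TensorFlow's):

    #include <cstdio>
    #include <string>
    #include <vector>

    // Hypothetical registry: each "kernel" is just a name/type string here.
    std::vector<std::string>& Registry() {
      static std::vector<std::string> r;
      return r;
    }

    struct Registrar {
      Registrar(const char* op, const char* type) {
        Registry().push_back(std::string(op) + "<" + type + ">");
      }
    };

    // X-macro style dispatch: CALL_FLOAT / CALL_DOUBLE expand a macro per type,
    // mirroring how TF_CALL_float(REGISTER_CPU) expands REGISTER_CPU(float).
    #define CALL_FLOAT(m) m(float)
    #define CALL_DOUBLE(m) m(double)

    #define REGISTER_RGB_TO_HSV(T) \
      static Registrar registrar_rgb_to_hsv_##T("RGBToHSV", #T);

    CALL_FLOAT(REGISTER_RGB_TO_HSV)
    CALL_DOUBLE(REGISTER_RGB_TO_HSV)

    int main() {
      for (const auto& k : Registry()) std::printf("%s\n", k.c_str());
    }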
@@ -54,9 +54,8 @@ struct RGBToHSV {
 // TODO(wicke): all these assignments are only necessary because a combined
 // expression is larger than kernel parameter space. A custom kernel is
 // probably in order.
-H.device(d) = (R == V).select(norm * (G - B),
-(G == V).select(
-norm * (B - R) + T(2) / T(6),
+H.device(d) = (R == V).select(
+norm * (G - B), (G == V).select(norm * (B - R) + T(2) / T(6),
 norm * (R - G) + T(4) / T(6)));
 H.device(d) = (range > T(0)).select(H, H.constant(T(0)));
 H.device(d) = (H < T(0)).select(H + T(1), H);

@@ -17,8 +17,8 @@ limitations under the License.

 #define EIGEN_USE_GPU

-#include "tensorflow/core/kernels/colorspace_op.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/colorspace_op.h"

 namespace tensorflow {

@@ -29,6 +29,6 @@ typedef Eigen::GpuDevice GPUDevice;
 template class functor::HSVToRGB<GPUDevice, T>;
 TF_CALL_float(INSTANTIATE_GPU);
 TF_CALL_double(INSTANTIATE_GPU);
-}
+} // namespace tensorflow

 #endif // GOOGLE_CUDA
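Note: the nested selects in the RGBToHSV hunk implement the standard piecewise hue formula (the branch depends on which channel is the maximum, and negative hues are wrapped into [0, 1)). A plain scalar version of the same computation, as a self-contained illustration rather than the Eigen expression above:

    #include <algorithm>
    #include <cstdio>

    // Scalar mirror of the piecewise hue computation: pick a branch by which
    // channel equals the max, scale by 1/(6*range), then wrap into [0, 1).
    float HueFromRGB(float r, float g, float b) {
      float v = std::max({r, g, b});
      float range = v - std::min({r, g, b});
      if (range <= 0.f) return 0.f;  // grey: hue is defined as 0
      float norm = 1.f / (6.f * range);
      float h;
      if (r == v) {
        h = norm * (g - b);
      } else if (g == v) {
        h = norm * (b - r) + 2.f / 6.f;
      } else {
        h = norm * (r - g) + 4.f / 6.f;
      }
      return h < 0.f ? h + 1.f : h;  // wrap negative hues around
    }

    int main() {
      std::printf("hue(1,0,0)=%.3f hue(0,1,0)=%.3f hue(0,0,1)=%.3f\n",
                  HueFromRGB(1, 0, 0), HueFromRGB(0, 1, 0), HueFromRGB(0, 0, 1));
    }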
@@ -41,9 +41,10 @@ namespace tensorflow {

 // Assumes all inputs are nonempty
 template <typename T>
-void ConcatCPU(DeviceBase* d,
-const std::vector<
-std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
+void ConcatCPU(
+DeviceBase* d,
+const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+inputs,
 typename TTypes<T, 2>::Matrix* output);
 #if GOOGLE_CUDA
 template <typename T>

@@ -57,9 +58,10 @@ void ConcatGPU(

 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
-void ConcatSYCL(const Eigen::SyclDevice& d,
-const std::vector<
-std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
+void ConcatSYCL(
+const Eigen::SyclDevice& d,
+const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+inputs,
 typename TTypes<T, 2>::Matrix* output);
 #endif // TENSORFLOW_USE_SYCL
 } // namespace tensorflow

@@ -48,9 +48,10 @@ struct MemCpyCopier<ResourceHandle> {
 } // namespace

 template <typename T>
-void ConcatCPU(DeviceBase* d,
-const std::vector<
-std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
+void ConcatCPU(
+DeviceBase* d,
+const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+inputs,
 typename TTypes<T, 2>::Matrix* output) {
 if (std::is_same<T, string>::value) {
 // use a large cost here to force strings to be handled by separate threads

@@ -86,9 +87,10 @@ TF_CALL_variant(REGISTER)

 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
-void ConcatSYCL(const Eigen::SyclDevice& d,
-const std::vector<
-std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
+void ConcatSYCL(
+const Eigen::SyclDevice& d,
+const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+inputs,
 typename TTypes<T, 2>::Matrix* output) {
 ConcatSYCLImpl<T>(d, inputs, sizeof(T) /* cost_per_unit */, MemCpyCopier<T>(),
 output);
@@ -15,9 +15,9 @@ limitations under the License.

 #define EIGEN_USE_THREADS

-#include "tensorflow/core/kernels/concat_lib.h"
 #include <vector>
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/util/work_sharder.h"

 namespace tensorflow {

@@ -71,7 +71,8 @@ class ConcatBaseOp : public OpKernel {
 const TensorShape& input_shape = values[0].shape();

 int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim;
-OP_REQUIRES(c, (0 <= axis && axis < input_dims) ||
+OP_REQUIRES(c,
+(0 <= axis && axis < input_dims) ||
 (allow_legacy_scalars() && concat_dim == 0),
 errors::InvalidArgument(
 "ConcatOp : Expected concatenating dimensions in the range "

@@ -97,8 +98,8 @@ class ConcatBaseOp : public OpKernel {
 c, in.dims() == input_dims || (input_is_scalar && in_is_scalar),
 errors::InvalidArgument(
 "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
-input_shape.DebugString(), " vs. shape[", i, "] = ",
-in.shape().DebugString()));
+input_shape.DebugString(), " vs. shape[", i,
+"] = ", in.shape().DebugString()));
 for (int j = 0; j < input_dims; ++j) {
 if (j == axis) {
 continue;

@@ -107,8 +108,8 @@ class ConcatBaseOp : public OpKernel {
 c, in.dim_size(j) == input_shape.dim_size(j),
 errors::InvalidArgument(
 "ConcatOp : Dimensions of inputs should match: shape[0] = ",
-input_shape.DebugString(), " vs. shape[", i, "] = ",
-in.shape().DebugString()));
+input_shape.DebugString(), " vs. shape[", i,
+"] = ", in.shape().DebugString()));
 }
 if (in.NumElements() > 0) {
 int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0;

@@ -157,7 +157,8 @@ BENCHMARK(BM_MemcpyAlternativeDim0)->Arg(1000)->Arg(100000)->Arg(1000000);
 BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000);

 typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>,
-Eigen::Unaligned> EigenMap;
+Eigen::Unaligned>
+EigenMap;
 static void MemcpyManyAlternative1(int iters, int dim2) {
 testing::StopTiming();

@@ -99,8 +99,9 @@ class AccumulatorTakeGradientOp
 ConditionalAccumulatorBase* accumulator,
 DoneCallback callback) override {
 // Check signature
-OP_REQUIRES_OK_ASYNC(ctx, ctx->MatchSignature({DT_STRING_REF, DT_INT32},
-{accumulator->dtype()}),
+OP_REQUIRES_OK_ASYNC(
+ctx,
+ctx->MatchSignature({DT_STRING_REF, DT_INT32}, {accumulator->dtype()}),
 callback);
 }

@@ -111,5 +112,4 @@ class AccumulatorTakeGradientOp
 REGISTER_KERNEL_BUILDER(Name("AccumulatorTakeGradient").Device(DEVICE_CPU),
 AccumulatorTakeGradientOp);

-
 } // namespace tensorflow

@@ -146,7 +146,6 @@ typedef Eigen::GpuDevice GPUDevice;
 typedef Eigen::SyclDevice SYCLDevice;
 #endif // TENSORFLOW_USE_SYCL

-
 template <typename Device, typename T, typename Index>
 class FillOp : public OpKernel {
 public:

@@ -91,6 +91,7 @@ class KilledBySignal {
 public:
 explicit KilledBySignal(int signum) : signum_(signum) {}
 bool operator()(int exit_status) const { return exit_status == signum_; }
+
 private:
 const int signum_;
 };
@@ -679,7 +679,8 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {

 const int dims = resized_shape.dims();
 OP_REQUIRES(
-context, TensorShapeUtils::IsMatrix(paddings.shape()) &&
+context,
+TensorShapeUtils::IsMatrix(paddings.shape()) &&
 paddings.dim_size(1) == 2,
 errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
 paddings.shape().DebugString()));

@@ -715,11 +716,12 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
 const int32 after =
 paddings_matrix(d, 1); // Pad after existing elements.
 OP_REQUIRES(context, before >= 0 && after >= 0,
-errors::InvalidArgument("paddings must be non-negative: ",
-before, " ", after));
+errors::InvalidArgument(
+"paddings must be non-negative: ", before, " ", after));
 if (offset_ == 0) { // SYMMETRIC mode.
 OP_REQUIRES(
-context, before <= resized_shape.dim_size(d) &&
+context,
+before <= resized_shape.dim_size(d) &&
 after <= resized_shape.dim_size(d),
 errors::InvalidArgument("paddings must be no greater "
 "than the dimension size: ",

@@ -727,7 +729,8 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
 resized_shape.dim_size(d)));
 } else if (offset_ == 1) { // REFLECT mode.
 OP_REQUIRES(
-context, before < resized_shape.dim_size(d) &&
+context,
+before < resized_shape.dim_size(d) &&
 after < resized_shape.dim_size(d),
 errors::InvalidArgument("paddings must be less than"
 " the dimension size: ",

@@ -767,18 +770,19 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
 // We only check the first three dims, since the depth is accessed as an
 // int64 below.
 for (int i = 0; i < 3; i++) {
-OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i),
-std::numeric_limits<int>::max()),
+OP_REQUIRES(
+context,
+FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
 errors::InvalidArgument("filter too large"));
 }

 // The last dimension for input is in_depth. It must be the same as the
 // filter's in_depth.
 const int64 in_depth = padded_shape.dim_size(3);
-OP_REQUIRES(
-context, in_depth == filter.dim_size(2),
-errors::InvalidArgument("input and filter must have the same depth: ",
-in_depth, " vs ", filter.dim_size(2)));
+OP_REQUIRES(context, in_depth == filter.dim_size(2),
+errors::InvalidArgument(
+"input and filter must have the same depth: ", in_depth,
+" vs ", filter.dim_size(2)));

 // The last dimension for filter is out_depth.
 const int out_depth = static_cast<int>(filter.dim_size(3));

@@ -786,8 +790,9 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
 // The second dimension for input is rows/height.
 // The first dimension for filter is rows/height.
 const int64 padded_rows_raw = padded_shape.dim_size(1);
-OP_REQUIRES(context, FastBoundsCheck(padded_rows_raw,
-std::numeric_limits<int>::max()),
+OP_REQUIRES(
+context,
+FastBoundsCheck(padded_rows_raw, std::numeric_limits<int>::max()),
 errors::InvalidArgument("Input rows too large"));
 const int padded_rows = static_cast<int>(padded_rows_raw);
 const int filter_rows = static_cast<int>(filter.dim_size(0));

@@ -796,8 +801,9 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
 // The third dimension for input is columns/width.
 // The second dimension for filter is columns/width.
 const int64 padded_cols_raw = padded_shape.dim_size(2);
-OP_REQUIRES(context, FastBoundsCheck(padded_cols_raw,
-std::numeric_limits<int>::max()),
+OP_REQUIRES(
+context,
+FastBoundsCheck(padded_cols_raw, std::numeric_limits<int>::max()),
 errors::InvalidArgument("Input cols too large"));
 const int padded_cols = static_cast<int>(padded_cols_raw);
 const int filter_cols = static_cast<int>(filter.dim_size(1));

@@ -870,7 +876,8 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
 .Device(DEVICE_CPU) \
 .TypeConstraint<T>("T"), \
 FusedResizeConv2DUsingGemmOp< \
-T, FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
+T, \
+FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
 BILINEAR>, \
 true>);

@@ -880,7 +887,8 @@ TF_CALL_float(REGISTER_FUSED);
 REGISTER_KERNEL_BUILDER( \
 Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
 FusedResizeConv2DUsingGemmOp< \
-T, FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
+T, \
+FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
 NEAREST>, \
 false>);

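Note: the repeated OP_REQUIRES(context, FastBoundsCheck(x, std::numeric_limits<int>::max()), ...) guards above verify that an int64 dimension fits in an int before the static_cast. A self-contained sketch of that kind of check (an illustration of the idea behind the FastBoundsCheck helper in bounds_check.h, not its actual code):

    #include <cstdint>
    #include <cstdio>
    #include <limits>

    // Returns true iff 0 <= index < limit. Casting both sides to an unsigned
    // type folds the negative case and the too-large case into one comparison.
    inline bool BoundsCheck(int64_t index, int64_t limit) {
      return static_cast<uint64_t>(index) < static_cast<uint64_t>(limit);
    }

    int main() {
      int64_t dim = 5000000000LL;  // does not fit in a 32-bit int
      if (!BoundsCheck(dim, std::numeric_limits<int>::max())) {
        std::printf("dimension too large for int\n");
        return 1;
      }
      int as_int = static_cast<int>(dim);  // only safe after the check
      std::printf("ok: %d\n", as_int);
    }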
@@ -27,7 +27,6 @@ limitations under the License.

 namespace tensorflow {

-
 // Get the Cudnn workspace limit from the environment variable, which is in MB.
 // Return the workspace memory limit in bytes. If no value is set, return the
 // default value.

@@ -25,9 +25,9 @@ limitations under the License.
 #include "cuda/include/cuda.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 #include "tensorflow/core/util/tensor_format.h"
-#include "tensorflow/core/lib/math/math_util.h"

 namespace tensorflow {

@@ -252,11 +252,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
 int x = threadIdx.x;

 Dimension<3> output_dims = {
-input_dims[0], input_dims[2], input_dims[1],
+input_dims[0],
+input_dims[2],
+input_dims[1],
 };

 Dimension<3> input_dims_in_tiles = {
-input_dims[0], (input_dims[1] + TileSizeI - 1) / TileSizeI,
+input_dims[0],
+(input_dims[1] + TileSizeI - 1) / TileSizeI,
 (input_dims[2] + TileSizeJ - 1) / TileSizeJ,
 };

@@ -264,7 +267,8 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
 FlatToTensorIndex(blockIdx.x, input_dims_in_tiles);

 Index<3> input_tile_origin = {
-input_tile_index[0], input_tile_index[1] * TileSizeI,
+input_tile_index[0],
+input_tile_index[1] * TileSizeI,
 input_tile_index[2] * TileSizeJ,
 };

@@ -322,11 +326,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
 __syncthreads();

 Index<3> output_tile_index = {
-input_tile_index[0], input_tile_index[2], input_tile_index[1],
+input_tile_index[0],
+input_tile_index[2],
+input_tile_index[1],
 };

 Index<3> output_tile_origin = {
-output_tile_index[0], output_tile_index[1] * TileSizeJ,
+output_tile_index[0],
+output_tile_index[1] * TileSizeJ,
 output_tile_index[2] * TileSizeI,
 };

@@ -902,19 +909,21 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input,
 constexpr int kNumThreads = 256;

 Dimension<3> input_dims_in_tiles = {
-input_dims[0], MathUtil::CeilOfRatio<int>(input_dims[1], kTileSize),
+input_dims[0],
+MathUtil::CeilOfRatio<int>(input_dims[1], kTileSize),
 MathUtil::CeilOfRatio<int>(input_dims[2], kTileSize),
 };

 int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] *
 input_dims_in_tiles[2];
-SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize, kTileSize, conjugate>
+SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize, kTileSize,
+conjugate>
 <<<total_tiles_count, kNumThreads, 0, d.stream()>>>(input, input_dims,
 output);

 } else if (narrow_matrix) {
-SwapDimension1And2InTensor3WithNarrowMatrices<T, conjugate>(d, input, input_dims, output,
-kMinDimensionToUseTiles);
+SwapDimension1And2InTensor3WithNarrowMatrices<T, conjugate>(
+d, input, input_dims, output, kMinDimensionToUseTiles);
 } else {
 int total_element_count = input_dims[0] * input_dims[1] * input_dims[2];
 CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d);
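Note: the kernel touched above swaps dimensions 1 and 2 of a rank-3 tensor in shared-memory tiles; the hunks only reflow the brace-initialized Dimension/Index lists. The underlying index mapping, stripped of the tiling and written for the CPU, looks roughly like this (illustrative sketch only, not the GPU code):

    #include <cstdio>
    #include <vector>

    // Swap dims 1 and 2 of a [d0, d1, d2] tensor stored row-major in a flat
    // array: output[b][j][i] = input[b][i][j].
    std::vector<float> SwapDimension1And2(const std::vector<float>& in, int d0,
                                          int d1, int d2) {
      std::vector<float> out(in.size());
      for (int b = 0; b < d0; ++b)
        for (int i = 0; i < d1; ++i)
          for (int j = 0; j < d2; ++j)
            out[(b * d2 + j) * d1 + i] = in[(b * d1 + i) * d2 + j];
      return out;
    }

    int main() {
      std::vector<float> in = {0, 1, 2, 3, 4, 5};  // shape [1, 2, 3]
      std::vector<float> out = SwapDimension1And2(in, 1, 2, 3);
      for (float v : out) std::printf("%g ", v);   // 0 3 1 4 2 5, shape [1, 3, 2]
      std::printf("\n");
    }

The tiled GPU version does the same mapping per TileSizeI x TileSizeJ tile so that both the read and the write are coalesced, which is why the grid size is the ceil-of-ratio tile counts computed above.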
@@ -468,18 +468,19 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
 filter.shape().DebugString()));

 for (int i = 0; i < 3; i++) {
-OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i),
-std::numeric_limits<int>::max()),
+OP_REQUIRES(
+context,
+FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
 errors::InvalidArgument("filter too large"));
 }

 // The last dimension for input is in_depth. It must be the same as the
 // filter's in_depth.
 const int64 in_depth = GetTensorDim(input, data_format_, 'C');
-OP_REQUIRES(
-context, in_depth == filter.dim_size(2),
-errors::InvalidArgument("input and filter must have the same depth: ",
-in_depth, " vs ", filter.dim_size(2)));
+OP_REQUIRES(context, in_depth == filter.dim_size(2),
+errors::InvalidArgument(
+"input and filter must have the same depth: ", in_depth,
+" vs ", filter.dim_size(2)));

 // The last dimension for filter is out_depth.
 const int out_depth = static_cast<int>(filter.dim_size(3));

@@ -487,8 +488,9 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
 // The second dimension for input is rows/height.
 // The first dimension for filter is rows/height.
 const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H');
-OP_REQUIRES(context, FastBoundsCheck(input_rows_raw,
-std::numeric_limits<int>::max()),
+OP_REQUIRES(
+context,
+FastBoundsCheck(input_rows_raw, std::numeric_limits<int>::max()),
 errors::InvalidArgument("Input rows too large"));
 const int input_rows = static_cast<int>(input_rows_raw);
 const int filter_rows = static_cast<int>(filter.dim_size(0));

@@ -496,8 +498,9 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
 // The third dimension for input is columns/width.
 // The second dimension for filter is columns/width.
 const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W');
-OP_REQUIRES(context, FastBoundsCheck(input_cols_raw,
-std::numeric_limits<int>::max()),
+OP_REQUIRES(
+context,
+FastBoundsCheck(input_cols_raw, std::numeric_limits<int>::max()),
 errors::InvalidArgument("Input cols too large"));
 const int input_cols = static_cast<int>(input_cols_raw);
 const int filter_cols = static_cast<int>(filter.dim_size(1));
@@ -17,8 +17,8 @@ limitations under the License.

 #define EIGEN_USE_GPU

-#include "tensorflow/core/kernels/cross_op.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/cross_op.h"

 namespace tensorflow {

@@ -19,13 +19,13 @@ limitations under the License.

 #include <limits>

-#include "tensorflow/core/util/ctc/ctc_beam_search.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/util/ctc/ctc_beam_search.h"
 #include "tensorflow/core/util/sparse/sparse_tensor.h"

 namespace tensorflow {

@@ -80,16 +80,17 @@ class CTCDecodeHelper {

 if (!(batch_size == (*seq_len)->dim_size(0))) {
 return errors::FailedPrecondition(
-"len(sequence_length) != batch_size. ", "len(sequence_length): ",
-(*seq_len)->dim_size(0), " batch_size: ", batch_size);
+"len(sequence_length) != batch_size. ",
+"len(sequence_length): ", (*seq_len)->dim_size(0),
+" batch_size: ", batch_size);
 }

 auto seq_len_t = (*seq_len)->vec<int32>();

 for (int b = 0; b < batch_size; ++b) {
 if (!(seq_len_t(b) <= max_time)) {
-return errors::FailedPrecondition("sequence_length(", b, ") <= ",
-max_time);
+return errors::FailedPrecondition("sequence_length(", b,
+") <= ", max_time);
 }
 }

@@ -113,8 +113,8 @@ class CTCLossOp : public OpKernel {
 const int64 batch_indices = g.group()[0];
 OP_REQUIRES(ctx, FastBoundsCheck(batch_indices, batch_size),
 errors::InvalidArgument("labels batch index must be between ",
-0, " and ", batch_size, " but saw: ",
-batch_indices));
+0, " and ", batch_size,
+" but saw: ", batch_indices));

 auto values = g.values<int32>();
 std::vector<int>* b_values = &labels_t[batch_indices];

@@ -17,8 +17,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_gradients.h"

 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double,
-complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double, complex64,
+complex128);

 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Acosh", functor::acosh, float, double);

@@ -44,7 +44,6 @@ REGISTER_KERNEL_BUILDER(Name("AddV2")
 BinaryOp<CPUDevice, functor::add<int32>>);
 #endif

-
 #if TENSORFLOW_USE_SYCL
 #define REGISTER_KERNEL(type) \
 REGISTER(BinaryOp, SYCL, "Add", functor::add, type); \

@@ -22,8 +22,8 @@ namespace tensorflow {
 // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
 #if !defined(__ANDROID_TYPES_SLIM__)

-REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64,
-uint8, complex128, string);
+REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64, uint8,
+complex128, string);
 // Notice: String is excluded to allow marking AddV2 is_commutative and
 // is_aggregate.
 REGISTER5(BinaryOp, CPU, "AddV2", functor::add, int8, int16, complex64, uint8,

@@ -17,8 +17,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_gradients.h"

 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double,
-complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double, complex64,
+complex128);

 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Asinh", functor::asinh, float, double);

@@ -17,8 +17,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_gradients.h"

 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double,
-complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double, complex64,
+complex128);

 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Atanh", functor::atanh, float, double);
@@ -16,15 +16,13 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"

 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double,
-complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double, complex64,
+complex128);

 #if TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(TYPE) \
 REGISTER_KERNEL_BUILDER( \
-Name("Cosh") \
-.Device(DEVICE_SYCL) \
-.TypeConstraint<TYPE>("T"), \
+Name("Cosh").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
 UnaryOp<SYCLDevice, functor::cosh<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
 REGISTER_SYCL_KERNEL(double);

@@ -15,8 +15,10 @@ limitations under the License.

 #if GOOGLE_CUDA

-#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+#define EIGEN_USE_GPU
+
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"

 namespace tensorflow {
 namespace functor {

@@ -38,7 +40,6 @@ struct SelectScalarFunctor<GPUDevice, T> {
 typename TTypes<bool>::ConstScalar cond,
 typename TTypes<T>::ConstFlat then_flat,
 typename TTypes<T>::ConstFlat else_flat) {
-
 #if !defined(EIGEN_HAS_INDEX_LIST)
 Eigen::array<int, 1> rank1{1};
 #else

@@ -50,7 +51,6 @@ struct SelectScalarFunctor<GPUDevice, T> {
 To32Bit(out).device(d) = cond.reshape(rank1)
 .broadcast(broadcast_dims)
 .select(then_flat, else_flat);
-
 }
 };

@@ -35,7 +35,8 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
 #endif

 #ifdef TENSORFLOW_USE_SYCL
-REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float, double);
+REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float,
+double);

 REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
 .Device(DEVICE_SYCL)

@@ -17,8 +17,8 @@ limitations under the License.

 namespace tensorflow {

-REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double,
-uint8, int32);
+REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, uint8,
+int32);
 #if defined(__ANDROID_TYPES_SLIM__)
 // We only register the first type when we have multi-argument calls in the
 // case where we're trying to reduce executable size, but it turns out that the

@@ -22,8 +22,8 @@ namespace tensorflow {
 // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
 #if !defined(__ANDROID_TYPES_SLIM__)

-REGISTER6(BinaryOp, CPU, "Mul", functor::mul,
-int8, uint16, int16, int64, complex64, complex128);
+REGISTER6(BinaryOp, CPU, "Mul", functor::mul, int8, uint16, int16, int64,
+complex64, complex128);
 #if GOOGLE_CUDA
 REGISTER6(BinaryOp, GPU, "Mul", functor::mul, int8, uint16, int16, int64,
 complex64, complex128);

@@ -201,12 +201,10 @@ struct SelectFunctorBase {
 };

 template <typename T>
-struct SelectFunctor<CPUDevice, T>
-: SelectFunctorBase<CPUDevice, T> {};
+struct SelectFunctor<CPUDevice, T> : SelectFunctorBase<CPUDevice, T> {};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
-struct SelectFunctor<SYCLDevice, T>
-: SelectFunctorBase<SYCLDevice, T> {};
+struct SelectFunctor<SYCLDevice, T> : SelectFunctorBase<SYCLDevice, T> {};
 #endif // TENSORFLOW_USE_SYCL

 template <typename Device, typename T>

@@ -257,8 +255,8 @@ struct BatchSelectFunctorBase {
 };

 template <typename T>
-struct BatchSelectFunctor<CPUDevice, T>
-: BatchSelectFunctorBase<CPUDevice, T> {};
+struct BatchSelectFunctor<CPUDevice, T> : BatchSelectFunctorBase<CPUDevice, T> {
+};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct BatchSelectFunctor<SYCLDevice, T>

@@ -16,15 +16,13 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"

 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double,
-complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double, complex64,
+complex128);

 #if TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(TYPE) \
 REGISTER_KERNEL_BUILDER( \
-Name("Sinh") \
-.Device(DEVICE_SYCL) \
-.TypeConstraint<TYPE>("T"), \
+Name("Sinh").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
 UnaryOp<SYCLDevice, functor::sinh<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
 REGISTER_SYCL_KERNEL(double);
@@ -57,8 +57,8 @@ BinaryOpShared::BinaryOpState::BinaryOpState(OpKernelContext* ctx)
 in1(ctx->input(1)),
 bcast(BCast::FromShape(in0.shape()), BCast::FromShape(in1.shape())) {
 if (!bcast.IsValid()) {
-ctx->SetStatus(errors::InvalidArgument("Incompatible shapes: ",
-in0.shape().DebugString(), " vs. ",
+ctx->SetStatus(errors::InvalidArgument(
+"Incompatible shapes: ", in0.shape().DebugString(), " vs. ",
 in1.shape().DebugString()));
 return;
 }

@@ -171,7 +171,6 @@ struct SimpleBinaryFunctor<CPUDevice, Functor> {
 }
 };

-
 #ifdef TENSORFLOW_USE_SYCL
 // Partial specialization of BinaryFunctor for SYCL devices
 typedef Eigen::SyclDevice SYCLDevice;

@@ -51,7 +51,8 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
 void operator()(const SYCLDevice& d, typename Functor::tout_type out,
 typename Functor::tin_type in0,
 typename Functor::tin_type in1, bool* error) {
-To32Bit(out).device(d) = To32Bit(in0).binaryExpr(To32Bit(in1), typename Functor::func());
+To32Bit(out).device(d) =
+To32Bit(in0).binaryExpr(To32Bit(in1), typename Functor::func());
 }

 void Left(const SYCLDevice& d, typename Functor::tout_type out,

@@ -61,7 +62,9 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
 constexpr int NumDims = Functor::tin_type::NumDimensions;
 static_assert(NumDims == 1, "Unexpected size");
 Eigen::Sizes<1> scalar_dim;
-out.device(d) = scalar.reshape(scalar_dim).broadcast(in.dimensions()).binaryExpr(in, Binary());
+out.device(d) = scalar.reshape(scalar_dim)
+.broadcast(in.dimensions())
+.binaryExpr(in, Binary());
 }

 void Right(const SYCLDevice& d, typename Functor::tout_type out,

@@ -71,7 +74,8 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
 constexpr int NumDims = Functor::tin_type::NumDimensions;
 static_assert(NumDims == 1, "Unexpected size");
 Eigen::Sizes<1> scalar_dim;
-out.device(d) = in.binaryExpr(scalar.reshape(scalar_dim).broadcast(in.dimensions()), Binary());
+out.device(d) = in.binaryExpr(
+scalar.reshape(scalar_dim).broadcast(in.dimensions()), Binary());
 }

 void BCast(const SYCLDevice& d,
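Note: the Left()/Right() hunks only re-wrap an Eigen tensor expression: a one-element "scalar" tensor is reshaped to a fixed shape, broadcast to the other operand's dimensions, and combined elementwise with binaryExpr. A self-contained sketch of that pattern against plain Eigen (illustrative; the Plus functor and the variable names are mine, not TensorFlow's, and it assumes Eigen's unsupported Tensor module is on the include path):

    #include <iostream>
    #include <unsupported/Eigen/CXX11/Tensor>

    // Simple binary functor standing in for Functor::func() above.
    struct Plus {
      float operator()(float a, float b) const { return a + b; }
    };

    int main() {
      Eigen::Tensor<float, 1> in(4);
      in.setValues({1.f, 2.f, 3.f, 4.f});
      Eigen::Tensor<float, 1> scalar(1);
      scalar.setValues({10.f});

      // Mirror of Left(): reshape the scalar to a fixed 1-element shape,
      // broadcast it to the input's length, then apply the binary op.
      Eigen::Sizes<1> scalar_dim;
      Eigen::array<Eigen::Index, 1> bcast{in.dimension(0)};
      Eigen::Tensor<float, 1> out =
          scalar.reshape(scalar_dim).broadcast(bcast).binaryExpr(in, Plus());

      for (int i = 0; i < out.dimension(0); ++i) std::cout << out(i) << " ";
      std::cout << "\n";  // 11 12 13 14
    }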
|
@ -430,13 +430,10 @@ class IteratorStateVariant {
|
|||||||
REGISTER_UNARY_VARIANT_DECODE_FUNCTION(IteratorStateVariant,
|
REGISTER_UNARY_VARIANT_DECODE_FUNCTION(IteratorStateVariant,
|
||||||
kIteratorVariantTypeName);
|
kIteratorVariantTypeName);
|
||||||
|
|
||||||
// TODO(mrry): Can we simply use the template kernel here?
|
|
||||||
class IteratorHandleOp : public OpKernel {
|
class IteratorHandleOp : public OpKernel {
|
||||||
public:
|
public:
|
||||||
explicit IteratorHandleOp(OpKernelConstruction* ctx)
|
explicit IteratorHandleOp(OpKernelConstruction* ctx)
|
||||||
: OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) {
|
: OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) {
|
||||||
OP_REQUIRES_OK(ctx, ctx->allocate_persistent(DT_STRING, TensorShape({2}),
|
|
||||||
&handle_, nullptr));
|
|
||||||
OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_));
|
OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_));
|
||||||
OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
|
OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
|
||||||
OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_));
|
OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_));
|
||||||
@ -460,20 +457,21 @@ class IteratorHandleOp : public OpKernel {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) {
|
void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) {
|
||||||
|
{
|
||||||
mutex_lock l(mu_);
|
mutex_lock l(mu_);
|
||||||
|
if (resource_ == nullptr) {
|
||||||
FunctionLibraryRuntime* lib = context->function_library();
|
FunctionLibraryRuntime* lib = context->function_library();
|
||||||
std::unique_ptr<DeviceMgr> device_mgr(nullptr);
|
std::unique_ptr<DeviceMgr> device_mgr(nullptr);
|
||||||
std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
|
std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
|
||||||
std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
|
std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
|
||||||
// If the iterator is shared then we construct a new FLR, and pass that in.
|
// If the iterator is shared then we construct a new FLR, and pass that
|
||||||
// NOTE(mrry,rohanj): In this case it is not possible to call remote
|
// in. NOTE(mrry,rohanj): In this case it is not possible to call remote
|
||||||
// functions from the iterator. We may add this functionality if there
|
// functions from the iterator. We may add this functionality if there
|
||||||
// is sufficient demand, but it will require a significant refactoring.
|
// is sufficient demand, but it will require a significant refactoring.
|
||||||
if (!name_.empty()) {
|
if (!name_.empty()) {
|
||||||
lib = CreateFLR(context, &device_mgr, &flib_def, &pflr);
|
lib = CreatePrivateFLR(context, &device_mgr, &flib_def, &pflr);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (resource_ == nullptr) {
|
|
||||||
ResourceMgr* mgr = context->resource_manager();
|
ResourceMgr* mgr = context->resource_manager();
|
||||||
OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
|
OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
|
||||||
|
|
||||||
@ -482,8 +480,8 @@ class IteratorHandleOp : public OpKernel {
|
|||||||
context,
|
context,
|
||||||
mgr->LookupOrCreate<IteratorResource>(
|
mgr->LookupOrCreate<IteratorResource>(
|
||||||
cinfo_.container(), cinfo_.name(), &resource,
|
cinfo_.container(), cinfo_.name(), &resource,
|
||||||
[lib, &device_mgr, &flib_def, &pflr, this](IteratorResource** ret)
|
[lib, &device_mgr, &flib_def, &pflr,
|
||||||
EXCLUSIVE_LOCKS_REQUIRED(mu_) {
|
this](IteratorResource** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
|
||||||
*ret = new IteratorResource(
|
*ret = new IteratorResource(
|
||||||
output_dtypes_, output_shapes_, graph_def_version_,
|
output_dtypes_, output_shapes_, graph_def_version_,
|
||||||
std::move(device_mgr), std::move(flib_def),
|
std::move(device_mgr), std::move(flib_def),
|
||||||
@ -498,18 +496,12 @@ class IteratorHandleOp : public OpKernel {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto h = handle_.AccessTensor(context)->template flat<string>();
|
|
||||||
h(0) = cinfo_.container();
|
|
||||||
h(1) = cinfo_.name();
|
|
||||||
resource_ = resource;
|
resource_ = resource;
|
||||||
}
|
}
|
||||||
if (context->expected_output_dtype(0) == DT_RESOURCE) {
|
}
|
||||||
OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
|
OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
|
||||||
context, 0, cinfo_.container(), cinfo_.name(),
|
context, 0, cinfo_.container(), cinfo_.name(),
|
||||||
MakeTypeIndex<IteratorResource>()));
|
MakeTypeIndex<IteratorResource>()));
|
||||||
} else {
|
|
||||||
context->set_output_ref(0, &mu_, handle_.AccessTensor(context));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -526,7 +518,7 @@ class IteratorHandleOp : public OpKernel {
|
|||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
||||||
FunctionLibraryRuntime* CreateFLR(
|
FunctionLibraryRuntime* CreatePrivateFLR(
|
||||||
OpKernelContext* ctx, std::unique_ptr<DeviceMgr>* device_mgr,
|
OpKernelContext* ctx, std::unique_ptr<DeviceMgr>* device_mgr,
|
||||||
std::unique_ptr<FunctionLibraryDefinition>* flib_def,
|
std::unique_ptr<FunctionLibraryDefinition>* flib_def,
|
||||||
std::unique_ptr<ProcessFunctionLibraryRuntime>* pflr) {
|
std::unique_ptr<ProcessFunctionLibraryRuntime>* pflr) {
|
||||||
@ -546,9 +538,8 @@ class IteratorHandleOp : public OpKernel {
|
|||||||
}
|
}
|
||||||
|
|
||||||
mutex mu_;
|
mutex mu_;
|
||||||
ContainerInfo cinfo_ GUARDED_BY(mu_);
|
ContainerInfo cinfo_; // Written once under mu_ then constant afterwards.
|
||||||
IteratorResource* resource_ GUARDED_BY(mu_) = nullptr;
|
IteratorResource* resource_ GUARDED_BY(mu_) = nullptr;
|
||||||
PersistentTensor handle_ GUARDED_BY(mu_);
|
|
||||||
DataTypeVector output_dtypes_;
|
DataTypeVector output_dtypes_;
|
||||||
std::vector<PartialTensorShape> output_shapes_;
|
std::vector<PartialTensorShape> output_shapes_;
|
||||||
const int graph_def_version_;
|
const int graph_def_version_;
|
||||||
|
@@ -91,9 +91,9 @@ class DecodeCSVOp : public OpKernel {
} else {
int32 value;
OP_REQUIRES(ctx, strings::safe_strto32(fields[f], &value),
-errors::InvalidArgument("Field ", f, " in record ", i,
-" is not a valid int32: ",
-fields[f]));
+errors::InvalidArgument(
+"Field ", f, " in record ", i,
+" is not a valid int32: ", fields[f]));
output[f]->flat<int32>()(i) = value;
}
break;
@@ -111,9 +111,9 @@ class DecodeCSVOp : public OpKernel {
} else {
int64 value;
OP_REQUIRES(ctx, strings::safe_strto64(fields[f], &value),
-errors::InvalidArgument("Field ", f, " in record ", i,
-" is not a valid int64: ",
-fields[f]));
+errors::InvalidArgument(
+"Field ", f, " in record ", i,
+" is not a valid int64: ", fields[f]));
output[f]->flat<int64>()(i) = value;
}
break;
@@ -130,9 +130,9 @@ class DecodeCSVOp : public OpKernel {
} else {
float value;
OP_REQUIRES(ctx, strings::safe_strtof(fields[f].c_str(), &value),
-errors::InvalidArgument("Field ", f, " in record ", i,
-" is not a valid float: ",
-fields[f]));
+errors::InvalidArgument(
+"Field ", f, " in record ", i,
+" is not a valid float: ", fields[f]));
output[f]->flat<float>()(i) = value;
}
break;
@@ -150,9 +150,9 @@ class DecodeCSVOp : public OpKernel {
} else {
double value;
OP_REQUIRES(ctx, strings::safe_strtod(fields[f].c_str(), &value),
-errors::InvalidArgument("Field ", f, " in record ", i,
-" is not a valid double: ",
-fields[f]));
+errors::InvalidArgument(
+"Field ", f, " in record ", i,
+" is not a valid double: ", fields[f]));
output[f]->flat<double>()(i) = value;
}
break;
@@ -208,7 +208,8 @@ class DecodeCSVOp : public OpKernel {
if (!quoted) {
while (static_cast<size_t>(current_idx) < input.size() &&
input[current_idx] != delim_) {
-OP_REQUIRES(ctx, (!use_quote_delim_ || input[current_idx] != '"') &&
+OP_REQUIRES(ctx,
+(!use_quote_delim_ || input[current_idx] != '"') &&
input[current_idx] != '\n' &&
input[current_idx] != '\r',
errors::InvalidArgument(
@@ -238,7 +239,8 @@ class DecodeCSVOp : public OpKernel {
}

OP_REQUIRES(
-ctx, (static_cast<size_t>(current_idx) < input.size() &&
+ctx,
+(static_cast<size_t>(current_idx) < input.size() &&
input[current_idx] == '"' &&
(static_cast<size_t>(current_idx) == input.size() - 1 ||
input[current_idx + 1] == delim_)),
@@ -87,10 +87,11 @@ class DecodeImageOp : public OpKernel {
channels_ = 3;
} else {
OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_));
-OP_REQUIRES(context, channels_ == 0 || channels_ == 1 || channels_ == 3 ||
-channels_ == 4,
-errors::InvalidArgument(
-"channels must be 0, 1, 3, or 4, got ", channels_));
+OP_REQUIRES(
+context,
+channels_ == 0 || channels_ == 1 || channels_ == 3 || channels_ == 4,
+errors::InvalidArgument("channels must be 0, 1, 3, or 4, got ",
+channels_));
}
flags_.components = channels_;

@@ -114,8 +115,9 @@ class DecodeImageOp : public OpKernel {

if (format_ == kJpgFormat) {
OP_REQUIRES_OK(context, context->GetAttr("ratio", &flags_.ratio));
-OP_REQUIRES(context, flags_.ratio == 1 || flags_.ratio == 2 ||
-flags_.ratio == 4 || flags_.ratio == 8,
+OP_REQUIRES(context,
+flags_.ratio == 1 || flags_.ratio == 2 || flags_.ratio == 4 ||
+flags_.ratio == 8,
errors::InvalidArgument("ratio must be 1, 2, 4, or 8, got ",
flags_.ratio));
OP_REQUIRES_OK(context, context->GetAttr("fancy_upscaling",
@@ -130,7 +132,8 @@ class DecodeImageOp : public OpKernel {
string dct_method;
OP_REQUIRES_OK(context, context->GetAttr("dct_method", &dct_method));
OP_REQUIRES(
-context, (dct_method.empty() || dct_method == "INTEGER_FAST" ||
+context,
+(dct_method.empty() || dct_method == "INTEGER_FAST" ||
dct_method == "INTEGER_ACCURATE"),
errors::InvalidArgument("dct_method must be one of "
"{'', 'INTEGER_FAST', 'INTEGER_ACCURATE'}"));
@@ -157,9 +160,9 @@ class DecodeImageOp : public OpKernel {
errors::InvalidArgument("Expected image (JPEG, PNG, or GIF), got ",
FileFormatString(magic, input)));
OP_REQUIRES(context, input.size() <= std::numeric_limits<int>::max(),
-errors::InvalidArgument(FileFormatString(magic, input),
-" contents are too large for int: ",
-input.size()));
+errors::InvalidArgument(
+FileFormatString(magic, input),
+" contents are too large for int: ", input.size()));
OP_REQUIRES(context, magic == kPngFormat || channel_bits_ == 8,
errors::InvalidArgument(FileFormatString(magic, input),
" does not support uint16 output"));
@@ -212,7 +215,8 @@ class DecodeImageOp : public OpKernel {
input.data(), input.size(), flags, nullptr /* nwarn */,
[=, &output](int width, int height, int channels) -> uint8* {
Status status(context->allocate_output(
-0, format_ == kGifFormat
+0,
+format_ == kGifFormat
? TensorShape({1, height, width, channels})
: TensorShape({height, width, channels}),
&output));
@@ -120,8 +120,8 @@ bool CanUseDeepConv2D(int stride_rows, int stride_cols, int filter_rows,

VLOG(2) << "CanUseDeepConv2D"
<< " deep_conv_cost: " << deep_conv_cost
-<< " direct_conv_cost: " << direct_conv_cost
-<< " deep_direct_ratio: " << (static_cast<float>(deep_conv_cost) /
+<< " direct_conv_cost: " << direct_conv_cost << " deep_direct_ratio: "
+<< (static_cast<float>(deep_conv_cost) /
static_cast<float>(direct_conv_cost))
<< " use_deep_conv: " << (deep_conv_cost < direct_conv_cost);
return deep_conv_cost < direct_conv_cost;
@@ -308,10 +308,10 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {

// in_depth for input and filter must match.
const int64 in_depth = GetTensorDim(input, data_format_, 'C');
-OP_REQUIRES(
-context, in_depth == filter.dim_size(2),
-errors::InvalidArgument("input and filter must have the same depth: ",
-in_depth, " vs ", filter.dim_size(2)));
+OP_REQUIRES(context, in_depth == filter.dim_size(2),
+errors::InvalidArgument(
+"input and filter must have the same depth: ", in_depth,
+" vs ", filter.dim_size(2)));

// The last dimension for filter is depth multiplier.
const int32 depth_multiplier = filter.dim_size(3);
@@ -430,8 +430,9 @@ TF_CALL_double(REGISTER_CPU_KERNEL);
#endif

#if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(
-Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
+REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative")
+.Device(DEVICE_GPU)
+.TypeConstraint<Eigen::half>("T"),
DepthwiseConv2dNativeOp<GPUDevice, Eigen::half>);

REGISTER_KERNEL_BUILDER(
@@ -17,12 +17,12 @@ limitations under the License.
#define EIGEN_USE_GPU

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "external/cub_archive/cub/util_ptx.cuh"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/kernels/depthwise_conv_op.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"
#include "tensorflow/core/util/tensor_format.h"
-#include "external/cub_archive/cub/util_ptx.cuh"

#if !defined(_MSC_VER)
#define UNROLL _Pragma("unroll")
@@ -29,8 +29,8 @@ limitations under the License.
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/work_sharder.h"

namespace tensorflow {
@@ -47,8 +47,9 @@ class DiagOp : public OpKernel {
void Compute(OpKernelContext* context) override {
const Tensor& diagonal = context->input(0);
const int num_dims = diagonal.dims();
-OP_REQUIRES(context, 0 != num_dims, errors::InvalidArgument(
-"Input must be at least rank 1, got 0"));
+OP_REQUIRES(
+context, 0 != num_dims,
+errors::InvalidArgument("Input must be at least rank 1, got 0"));
TensorShape out_shape;
for (int i = 0; i < num_dims; ++i) {
out_shape.AddDim(diagonal.dim_size(i));
@@ -60,9 +61,8 @@ class DiagOp : public OpKernel {
OP_REQUIRES_OK(context,
context->allocate_output(0, out_shape, &output_tensor));
functor::DiagFunctor<Device, T> diagFunc;
-Status s = diagFunc(context,
-diagonal.NumElements(),
-diagonal.flat<T>().data(),
+Status s =
+diagFunc(context, diagonal.NumElements(), diagonal.flat<T>().data(),
output_tensor->flat<T>().data());
OP_REQUIRES_OK(context, s);
}
@@ -83,11 +83,11 @@ class DiagPartOp : public OpKernel {
even and positive, got shape ",
tensor.shape().DebugString()));
for (int i = 0; i < out_dims; i++) {
-OP_REQUIRES(context, tensor.dim_size(i) == tensor.dim_size(i + out_dims),
-errors::InvalidArgument(
-"Invalid shape ", tensor.shape().DebugString(),
-": dimensions ", i, " and ", i + out_dims, " do not match.")
-);
+OP_REQUIRES(
+context, tensor.dim_size(i) == tensor.dim_size(i + out_dims),
+errors::InvalidArgument("Invalid shape ",
+tensor.shape().DebugString(), ": dimensions ",
+i, " and ", i + out_dims, " do not match."));
}

TensorShape out_shape;
@@ -96,13 +96,10 @@ class DiagPartOp : public OpKernel {
}

Tensor* output = nullptr;
-OP_REQUIRES_OK(context,
-context->allocate_output(0, out_shape, &output));
+OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
functor::DiagPartFunctor<Device, T> diagPartFunc;
-Status s = diagPartFunc(context,
-out_shape.num_elements(),
-tensor.flat<T>().data(),
-output->flat<T>().data());
+Status s = diagPartFunc(context, out_shape.num_elements(),
+tensor.flat<T>().data(), output->flat<T>().data());
OP_REQUIRES_OK(context, s);
}
};
@@ -129,9 +126,8 @@ class DiagPartOp : public OpKernel {
namespace functor {
template <typename T>
struct DiagFunctor<CPUDevice, T> {
-EIGEN_ALWAYS_INLINE Status
-operator() (OpKernelContext* context, const int64 size,
-const T* in, T* out) {
+EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+const int64 size, const T* in, T* out) {
// This subprocess is responsible for writing values in index range
// [start*size, limit*size)
auto subDiag = [in, out, size](int64 start, int64 limit) {
@@ -143,17 +139,16 @@ struct DiagFunctor<CPUDevice, T> {

// Here, 5 is a empirical factor of cost_per_unit.
auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
-Shard(worker_threads.num_threads, worker_threads.workers, size,
-5 * size, subDiag);
+Shard(worker_threads.num_threads, worker_threads.workers, size, 5 * size,
+subDiag);
return Status::OK();
}
};

template <typename T>
struct DiagPartFunctor<CPUDevice, T> {
-EIGEN_ALWAYS_INLINE Status
-operator() (OpKernelContext* context, const int64 size,
-const T* in, T* out) {
+EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+const int64 size, const T* in, T* out) {
// This subprocess is responsible for extracting values in index range
// [start, limit)
auto subDiagPart = [in, out, size](int64 start, int64 limit) {
@@ -164,14 +159,13 @@ struct DiagPartFunctor<CPUDevice, T> {

// Here, 5 is a empirical factor of cost_per_unit.
auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
-Shard(worker_threads.num_threads, worker_threads.workers, size,
-5, subDiagPart);
+Shard(worker_threads.num_threads, worker_threads.workers, size, 5,
+subDiagPart);
return Status::OK();
}
};
} // namespace functor


// Register the CPU kernels.
#define REGISTER_DIAGOP(T) \
REGISTER_KERNEL_BUILDER( \
@@ -250,6 +244,4 @@ TF_CALL_complex128(REGISTER_DIAGPARTOP_GPU);

#endif // GOOGLE_CUDA


} // namespace tensorflow
@@ -26,14 +26,14 @@ namespace functor {

template <typename Device, typename T>
struct DiagFunctor {
-Status operator() (OpKernelContext* context, const int64 size,
-const T* in, T* out);
+Status operator()(OpKernelContext* context, const int64 size, const T* in,
+T* out);
};

template <typename Device, typename T>
struct DiagPartFunctor {
-Status operator() (OpKernelContext* context, const int64 size,
-const T* in, T* out);
+Status operator()(OpKernelContext* context, const int64 size, const T* in,
+T* out);
};

} // namespace functor
@@ -19,8 +19,8 @@ limitations under the License.

#include <complex>
#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
#include "tensorflow/core/kernels/diag_op.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"

namespace tensorflow {
namespace functor {
@@ -28,10 +28,8 @@ namespace functor {
typedef Eigen::GpuDevice GPUDevice;

template <typename T>
-__global__ void DiagCudaKernel(const int num_threads,
-const int64 size,
-const T* in,
-T* out) {
+__global__ void DiagCudaKernel(const int num_threads, const int64 size,
+const T* in, T* out) {
CUDA_1D_KERNEL_LOOP(index, num_threads) {
// Fill the diagonal elements or set to zero in other place.
if (index % (1 + size) == 0) {
@@ -44,9 +42,8 @@ __global__ void DiagCudaKernel(const int num_threads,

template <typename T>
struct DiagFunctor<GPUDevice, T> {
-EIGEN_ALWAYS_INLINE Status
-operator() (OpKernelContext* context, const int64 size,
-const T* in, T* out) {
+EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+const int64 size, const T* in, T* out) {
// Empty tensor couldn't launch the kernel.
if (size == 0) {
return Status::OK();
@@ -56,25 +53,22 @@ struct DiagFunctor<GPUDevice, T> {
// so this may overflow for `size*size` in extreme cases,
// here is checking the multiplication overflow for integer.
if (size && (int(size * size) / size) != size) {
-return errors::Internal(
-"DiagOp got input size too large.");
+return errors::Internal("DiagOp got input size too large.");
}
int virtual_thread_count = int(size * size);

// Launch the GPU kernel.
const GPUDevice& device = context->eigen_device<GPUDevice>();
-CudaLaunchConfig diag_config = GetCudaLaunchConfig(
-virtual_thread_count, device);
-DiagCudaKernel<<<diag_config.block_count,
-diag_config.thread_per_block,
-0, device.stream()>>>(
-diag_config.virtual_thread_count, size, in, out);
+CudaLaunchConfig diag_config =
+GetCudaLaunchConfig(virtual_thread_count, device);
+DiagCudaKernel<<<diag_config.block_count, diag_config.thread_per_block, 0,
+device.stream()>>>(diag_config.virtual_thread_count, size,
+in, out);

auto err = cudaGetLastError();
if (err != cudaSuccess) {
return errors::Internal(
-"Could not launch DiagOp kernel: ",
-cudaGetErrorString(err), ".");
+"Could not launch DiagOp kernel: ", cudaGetErrorString(err), ".");
}
return Status::OK();
}
@@ -87,12 +81,9 @@ template struct DiagFunctor<GPUDevice, int64>;
template struct DiagFunctor<GPUDevice, complex64>;
template struct DiagFunctor<GPUDevice, complex128>;


template <typename T>
-__global__ void DiagPartCudaKernel(const int num_threads,
-const int64 size,
-const T* in,
-T* out) {
+__global__ void DiagPartCudaKernel(const int num_threads, const int64 size,
+const T* in, T* out) {
CUDA_1D_KERNEL_LOOP(index, num_threads) {
out[index] = in[(1 + size) * index];
}
@@ -100,9 +91,8 @@ __global__ void DiagPartCudaKernel(const int num_threads,

template <typename T>
struct DiagPartFunctor<GPUDevice, T> {
-EIGEN_ALWAYS_INLINE Status
-operator() (OpKernelContext* context, const int64 size,
-const T* in, T* out) {
+EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+const int64 size, const T* in, T* out) {
// Empty tensor couldn't launch the kernel.
if (size == 0) {
return Status::OK();
@@ -111,16 +101,14 @@ struct DiagPartFunctor<GPUDevice, T> {

// Extract the diagonal elements.
CudaLaunchConfig diag_config = GetCudaLaunchConfig(size, device);
-DiagPartCudaKernel<<<diag_config.block_count,
-diag_config.thread_per_block,
-0, device.stream()>>>(
-diag_config.virtual_thread_count, size, in, out);
+DiagPartCudaKernel<<<diag_config.block_count, diag_config.thread_per_block,
+0, device.stream()>>>(diag_config.virtual_thread_count,
+size, in, out);

auto err = cudaGetLastError();
if (err != cudaSuccess) {
return errors::Internal(
-"Could not launch DiagPartOp kernel: ",
-cudaGetErrorString(err), ".");
+"Could not launch DiagPartOp kernel: ", cudaGetErrorString(err), ".");
}
return Status::OK();
}
@@ -51,4 +51,3 @@ BM_Diag(128);
BM_Diag(512);

} // end namespace tensorflow
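The DiagFunctor<GPUDevice, T> hunk above keeps the guard if (size && (int(size * size) / size) != size) so that computing size * size kernel threads does not silently wrap. As a hedged aside, the standalone sketch below illustrates the same divide-back idiom; it uses unsigned 64-bit arithmetic so the wraparound is well defined, and the function name and main() driver are illustrative only, not part of the commit.

#include <cstdint>
#include <iostream>

// Returns true when size * size no longer fits in 64 bits: if the product
// wrapped around, dividing it by size does not give size back.
bool SquareOverflows(uint64_t size) {
  return size != 0 && (size * size) / size != size;
}

int main() {
  std::cout << SquareOverflows(uint64_t{1} << 20) << "\n";  // 0: 2^40 fits
  std::cout << SquareOverflows(uint64_t{1} << 40) << "\n";  // 1: 2^80 wraps
  return 0;
}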
@@ -91,10 +91,10 @@ void ParseSizes(OpKernelContext* context, const std::vector<int32>& strides,
filter.shape().DebugString()));
const int filter_rows = filter.dim_size(0);
const int filter_cols = filter.dim_size(1);
-OP_REQUIRES(
-context, depth == filter.dim_size(2),
-errors::InvalidArgument("input and filter must have the same depth: ",
-depth, " vs ", filter.dim_size(2)));
+OP_REQUIRES(context, depth == filter.dim_size(2),
+errors::InvalidArgument(
+"input and filter must have the same depth: ", depth, " vs ",
+filter.dim_size(2)));

// Effective filter size, after introducing rate - 1 zeros between each
// non-zero filter element.
@@ -234,7 +234,8 @@ class DilationBackpropInputOp : public OpKernel {
// [ batch, out_rows, out_cols, depth ]
const int batch = input.dim_size(0);
const int depth = input.dim_size(3);
-OP_REQUIRES(context, batch == out_backprop.dim_size(0) &&
+OP_REQUIRES(context,
+batch == out_backprop.dim_size(0) &&
out_rows == out_backprop.dim_size(1) &&
out_cols == out_backprop.dim_size(2) &&
depth == out_backprop.dim_size(3),
@@ -353,7 +354,8 @@ class DilationBackpropFilterOp : public OpKernel {
// [ batch, out_rows, out_cols, depth ]
const int batch = input.dim_size(0);
const int depth = input.dim_size(3);
-OP_REQUIRES(context, batch == out_backprop.dim_size(0) &&
+OP_REQUIRES(context,
+batch == out_backprop.dim_size(0) &&
out_rows == out_backprop.dim_size(1) &&
out_cols == out_backprop.dim_size(2) &&
depth == out_backprop.dim_size(3),
@@ -61,9 +61,8 @@ __global__ void DilationKernel(const int32 nthreads, const T* input_ptr,
const int w_in = w_beg + w * rate_cols;
if (w_in >= 0 && w_in < input_cols) {
const T val =
-input_ptr[d +
-depth *
-(w_in + input_cols * (h_in + input_rows * b))] +
+input_ptr[d + depth * (w_in +
+input_cols * (h_in + input_rows * b))] +
filter_ptr[d + depth * (w + filter_cols * h)];
if (val > cur_val) {
cur_val = val;
@@ -106,9 +105,8 @@ __global__ void DilationBackpropInputKernel(
const int w_in = w_beg + w * rate_cols;
if (w_in >= 0 && w_in < input_cols) {
const T val =
-input_ptr[d +
-depth *
-(w_in + input_cols * (h_in + input_rows * b))] +
+input_ptr[d + depth * (w_in +
+input_cols * (h_in + input_rows * b))] +
filter_ptr[d + depth * (w + filter_cols * h)];
if (val > cur_val) {
cur_val = val;
@@ -156,9 +154,8 @@ __global__ void DilationBackpropFilterKernel(
const int w_in = w_beg + w * rate_cols;
if (w_in >= 0 && w_in < input_cols) {
const T val =
-input_ptr[d +
-depth *
-(w_in + input_cols * (h_in + input_rows * b))] +
+input_ptr[d + depth * (w_in +
+input_cols * (h_in + input_rows * b))] +
filter_ptr[d + depth * (w + filter_cols * h)];
if (val > cur_val) {
cur_val = val;
@@ -29,8 +29,7 @@ template <class T>
class DrawBoundingBoxesOp : public OpKernel {
public:
explicit DrawBoundingBoxesOp(OpKernelConstruction* context)
-: OpKernel(context) {
-}
+: OpKernel(context) {}

void Compute(OpKernelContext* context) override {
const Tensor& images = context->input(0);
@@ -94,35 +93,28 @@ class DrawBoundingBoxesOp : public OpKernel {
int64 color_index = bb % color_table_length;
const int64 min_box_row =
static_cast<float>(tboxes(b, bb, 0)) * (height - 1);
-const int64 min_box_row_clamp =
-std::max<int64>(min_box_row, 0);
+const int64 min_box_row_clamp = std::max<int64>(min_box_row, 0);
const int64 max_box_row =
static_cast<float>(tboxes(b, bb, 2)) * (height - 1);
const int64 max_box_row_clamp =
std::min<int64>(max_box_row, height - 1);
const int64 min_box_col =
static_cast<float>(tboxes(b, bb, 1)) * (width - 1);
-const int64 min_box_col_clamp =
-std::max<int64>(min_box_col, 0);
+const int64 min_box_col_clamp = std::max<int64>(min_box_col, 0);
const int64 max_box_col =
static_cast<float>(tboxes(b, bb, 3)) * (width - 1);
-const int64 max_box_col_clamp =
-std::min<int64>(max_box_col, width - 1);
+const int64 max_box_col_clamp = std::min<int64>(max_box_col, width - 1);

if (min_box_row > max_box_row || min_box_col > max_box_col) {
-LOG(WARNING) << "Bounding box (" << min_box_row
-<< "," << min_box_col
-<< "," << max_box_row
-<< "," << max_box_col
+LOG(WARNING) << "Bounding box (" << min_box_row << "," << min_box_col
+<< "," << max_box_row << "," << max_box_col
<< ") is inverted and will not be drawn.";
continue;
}
-if (min_box_row >= height || max_box_row < 0 ||
-min_box_col >= width || max_box_col < 0) {
-LOG(WARNING) << "Bounding box (" << min_box_row
-<< "," << min_box_col
-<< "," << max_box_row
-<< "," << max_box_col
+if (min_box_row >= height || max_box_row < 0 || min_box_col >= width ||
+max_box_col < 0) {
+LOG(WARNING) << "Bounding box (" << min_box_row << "," << min_box_col
+<< "," << max_box_row << "," << max_box_col
<< ") is completely outside the image"
<< " and will not be drawn.";
continue;
@@ -103,7 +103,8 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared {
// Walk through data and copy the data to the appropriate output tensor
const auto data_flat = data->flat<T>();
std::vector<Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>,
-Eigen::Aligned> > out_vec;
+Eigen::Aligned> >
+out_vec;
out_vec.reserve(num_partitions_);
for (int p = 0; p < num_partitions_; p++) {
out_vec.push_back(outputs[p]->vec<T>());
@@ -124,7 +125,8 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared {
} else {
// If data has extra dimensions, use Eigen slices
std::vector<Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>,
-Eigen::Aligned> > out_flat;
+Eigen::Aligned> >
+out_flat;
out_flat.reserve(num_partitions_);
for (int p = 0; p < num_partitions_; p++) {
out_flat.push_back(outputs[p]->flat_outer_dims<T>());
@@ -79,8 +79,8 @@ template <typename T>
void RangeInit(const GPUDevice& d, const T start, const T delta,
const int32 size, typename TTypes<T>::Flat out) {
CudaLaunchConfig config = GetCudaLaunchConfig(size, d);
-RangeInitKernel<
-T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+RangeInitKernel<T>
+<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
start, delta, size, out.data());
}

@@ -103,9 +103,8 @@ void CallGatherKernel(const GPUDevice& d, const T* params, const int32* indices,
T* out, int64 gather_dim_size, int64 indices_size,
int64 slice_size, int64 out_size) {
CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d);
-GatherOpKernel<
-T, int32,
-true><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+GatherOpKernel<T, int32, true>
+<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
params, indices, out, gather_dim_size, indices_size, slice_size,
out_size);
}
@@ -231,10 +230,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {

OP_REQUIRES_ASYNC(
c, TensorShapeUtils::StartsWith(data.shape(), partitions.shape()),
-errors::InvalidArgument("data.shape must start with partitions.shape, ",
+errors::InvalidArgument(
+"data.shape must start with partitions.shape, ",
"got data.shape = ", data.shape().DebugString(),
-", partitions.shape = ",
-partitions.shape().DebugString()),
+", partitions.shape = ", partitions.shape().DebugString()),
done);

Tensor partition_count;
@@ -245,7 +244,8 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
AllocatorAttributes alloc_attr;
alloc_attr.set_on_host(true);
OP_REQUIRES_OK_ASYNC(
-c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
+c,
+c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
&partition_count, alloc_attr),
done);
auto e_part_count = partition_count.flat<int32>();
@@ -259,7 +259,8 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {

// Prepare for counting.
OP_REQUIRES_OK_ASYNC(
-c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
+c,
+c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
&partition_count),
done);
Tensor indices_out;
@@ -280,7 +281,8 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
alloc_attr.set_on_host(true);
alloc_attr.set_gpu_compatible(true);
OP_REQUIRES_OK_ASYNC(
-c, c->allocate_temp(partition_count.dtype(), partition_count.shape(),
+c,
+c->allocate_temp(partition_count.dtype(), partition_count.shape(),
&cpu_tensor, alloc_attr),
done);
perftools::gputools::DeviceMemoryBase wrapped(
@@ -340,8 +342,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
indices_in_ptr, indices_out_ptr, N, 0, sizeof(int32) * 8, cu_stream);
// Allocate temporary storage.
OP_REQUIRES_OK_ASYNC(
-c, c->allocate_temp(
-DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
+c,
+c->allocate_temp(DT_INT8,
+TensorShape({static_cast<int64>(temp_storage_bytes)}),
&cub_temp_storage),
done);
// Radix-sort the partition information.
@@ -376,7 +379,8 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
zero_functor(device, partition_count->flat<int32>());
// Allocate memory for aggregates_out.
OP_REQUIRES_OK_ASYNC(
-c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
+c,
+c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
&aggregates_out),
done);
// Obtain the pointers to inner buffers.
@@ -408,8 +412,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
num_runs_ptr, reduction_op, N, cu_stream);
// Allocate temporary storage.
OP_REQUIRES_OK_ASYNC(
-c, c->allocate_temp(
-DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
+c,
+c->allocate_temp(DT_INT8,
+TensorShape({static_cast<int64>(temp_storage_bytes)}),
&cub_temp_storage),
done);
// Run reduce-by-key. The effect is that we count how many times
@@ -23,7 +23,7 @@ namespace {
void EigenApprox(float a, float b) {
ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
}
-}
+} // namespace

TEST(EigenBackwardSpatialConvolutionsTest, SigmoidFastDerivative) {
const ptrdiff_t depth = 3;
@@ -25,31 +25,43 @@ namespace Eigen {
*
* \brief Extract glimpses from an input tensor.
*
-* The input parameter is expected to be a col-major tensor with a rank of 4 (depth, x, y, and batch).
-* The width and height parameters specify the extension of the returned glimpses.
-* The offsets parameter specifies the x, y locations of the center of the glimpses relative to the center of the input image. The vector is expected to contain one IndexPair for each image in the batch dimension.
-* The normalized boolean indicates if incoming coordinates are normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each height and width dimension.
-* The centered boolean indicates if incoming coordinates are centered relative to the image, in which case -1.0 and 1.0 correspond to minimum and maximum of each dimension while 0.0 corresponds to the center.
+* The input parameter is expected to be a col-major tensor with a rank of 4
+* (depth, x, y, and batch). The width and height parameters specify the
+* extension of the returned glimpses. The offsets parameter specifies the x, y
+* locations of the center of the glimpses relative to the center of the input
+* image. The vector is expected to contain one IndexPair for each image in the
+* batch dimension. The normalized boolean indicates if incoming coordinates are
+* normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each
+* height and width dimension. The centered boolean indicates if incoming
+* coordinates are centered relative to the image, in which case -1.0 and 1.0
+* correspond to minimum and maximum of each dimension while 0.0 corresponds to
+* the center.
*
-* The result can be assigned to a tensor of rank equal to that of the input. The result will be laid out in col-major order (depth, x, y, batch).
-* The dimensions of the result will be equal to the dimensions of the input except for width and height which will be equal to the requested glimpse size.
+* The result can be assigned to a tensor of rank equal to that of the input.
+* The result will be laid out in col-major order (depth, x, y, batch). The
+* dimensions of the result will be equal to the dimensions of the input except
+* for width and height which will be equal to the requested glimpse size.
*/
namespace {
template <typename Index>
struct GlimpseExtractionOp {
GlimpseExtractionOp(const Index width, const Index height,
const std::vector<IndexPair<float> >& offsets,
-const bool normalized,
-const bool centered,
-const bool uniform_noise) :
-width_(width), height_(height), offsets_(offsets),
-normalized_(normalized), centered_(centered), uniform_noise_(uniform_noise) { }
+const bool normalized, const bool centered,
+const bool uniform_noise)
+: width_(width),
+height_(height),
+offsets_(offsets),
+normalized_(normalized),
+centered_(centered),
+uniform_noise_(uniform_noise) {}

template <typename Input>
DSizes<Index, 4> dimensions(const Input& input) const {
typedef typename internal::traits<Input>::Index IndexType;
typedef TensorRef<Tensor<typename internal::traits<Input>::Scalar, 4,
-internal::traits<Input>::Layout, IndexType> > Ref;
+internal::traits<Input>::Layout, IndexType> >
+Ref;
Ref in(input);

DSizes<Index, 4> dims = in.dimensions();
@@ -62,12 +74,12 @@ struct GlimpseExtractionOp {
}

template <typename Input, typename Output, typename Device>
-EIGEN_DEVICE_FUNC
-void eval(const Input& input, Output& output, const Device& device) const
-{
+EIGEN_DEVICE_FUNC void eval(const Input& input, Output& output,
+const Device& device) const {
typedef typename internal::traits<Input>::Index IndexType;
typedef TensorRef<Tensor<typename internal::traits<Input>::Scalar, 4,
-internal::traits<Input>::Layout, IndexType> > Ref;
+internal::traits<Input>::Layout, IndexType> >
+Ref;
Ref in(input);
const Index num_channels = in.dimension(0);
const Index input_width = in.dimension(1);
@@ -215,21 +227,22 @@ struct GlimpseExtractionOp {
const bool centered_;
const bool uniform_noise_;
};
-}
+} // namespace


template <typename Input>
-EIGEN_ALWAYS_INLINE
-static const TensorCustomUnaryOp<const GlimpseExtractionOp<typename internal::traits<Input>::Index>, const Input>
+EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp<
+const GlimpseExtractionOp<typename internal::traits<Input>::Index>,
+const Input>
ExtractGlimpses(const Input& input,
const typename internal::traits<Input>::Index width,
const typename internal::traits<Input>::Index height,
const std::vector<IndexPair<float> >& offsets,
const bool normalized = true, const bool centered = true,
-const bool uniform_noise = true)
-{
-EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE);
-EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
+const bool uniform_noise = true) {
+EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor,
+YOU_MADE_A_PROGRAMMING_MISTAKE);
+EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4,
+YOU_MADE_A_PROGRAMMING_MISTAKE);

typedef typename internal::traits<Input>::Index Index;
const GlimpseExtractionOp<Index> op(width, height, offsets, normalized,
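The reflowed comment above describes Eigen::ExtractGlimpses: it takes a col-major (depth, x, y, batch) tensor plus one center offset per batch image and returns same-rank glimpses of the requested width and height. As a hedged illustration only (the include path, the GlimpseSketch wrapper, and the tensor sizes below are assumptions, not part of the commit), a call might look like this:

#include <vector>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/kernels/eigen_attention.h"  // assumed header location

void GlimpseSketch() {
  // Col-major input laid out as (depth, x, y, batch), per the comment above.
  Eigen::Tensor<float, 4> input(3, 64, 64, 2);
  input.setRandom();
  // One center offset per batch image, relative to the image center.
  std::vector<Eigen::IndexPair<float> > offsets;
  offsets.push_back(Eigen::IndexPair<float>(0.0f, 0.0f));
  offsets.push_back(Eigen::IndexPair<float>(0.2f, -0.1f));
  // The result keeps depth and batch; width and height become the requested 16x16.
  Eigen::Tensor<float, 4> glimpses(3, 16, 16, 2);
  glimpses = Eigen::ExtractGlimpses(input, 16, 16, offsets);
}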
@@ -23,7 +23,7 @@ namespace {
void EigenApprox(float a, float b) {
ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
}
-}
+} // namespace

TEST(EigenAttentionTest, Simple) {
const ptrdiff_t depth = 3;
@@ -25,7 +25,7 @@ void EigenApprox(float a, float b) {
ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
}
static int ceil_div(int a, int b) { return (a + b - 1) / b; }
-}
+} // namespace

TEST(EigenBackwardSpatialConvolutionsTest,
test_simple_spatial_convolution_backward_input_valid) {
@@ -23,7 +23,7 @@ namespace {
void EigenApprox(float a, float b) {
ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
}
-}
+} // namespace

TEST(EigenPoolingTest, Simple) {
const int depth = 10;
@@ -25,9 +25,11 @@ namespace Eigen {
*
* \brief Applies a softmax
*
-* The input parameter is expected to be a col-major tensor with a rank of 2 (depth and other).
+* The input parameter is expected to be a col-major tensor with a rank of 2
+* (depth and other).
*
-* The result can be assigned to a tensor of rank and dimensions equal to that of the input. The result will be laid out in col-major order.
+* The result can be assigned to a tensor of rank and dimensions equal to that
+* of the input. The result will be laid out in col-major order.
*
*/

@@ -41,8 +43,7 @@ struct SoftmaxOp {
}

template <typename Input, typename Output, typename Device>
-void eval(const Input& input, Output& output, const Device& device) const
-{
+void eval(const Input& input, Output& output, const Device& device) const {
#if !defined(EIGEN_HAS_INDEX_LIST)
// nvcc doesn't support cxx11
Eigen::array<typename internal::traits<Input>::Index, 1> depth_dim;
@@ -59,27 +60,35 @@ struct SoftmaxOp {
Eigen::IndexList<Eigen::type2index<0> > depth_dim;
Eigen::IndexList<int, Eigen::type2index<1> > bcast;
bcast.set(0, dimensions(input)[0]);
-Eigen::IndexList<Eigen::type2index<1>, typename internal::traits<Input>::Index> dims2d;
+Eigen::IndexList<Eigen::type2index<1>,
+typename internal::traits<Input>::Index>
+dims2d;
dims2d.set(1, dimensions(input)[1]);
#endif

-output.device(device) = ((input - input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) * beta_).exp();
-output.device(device) = output / (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
+output.device(device) =
+((input -
+input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) *
+beta_)
+.exp();
+output.device(device) =
+output /
+(output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
}

private:
const float beta_;
};
-}
+} // namespace


template <typename Input>
-EIGEN_ALWAYS_INLINE
-static const TensorCustomUnaryOp<const SoftmaxOp, const Input>
-SoftMax(const Input& input, const float beta)
-{
-EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE);
-EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
+EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp<const SoftmaxOp,
+const Input>
+SoftMax(const Input& input, const float beta) {
+EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor,
+YOU_MADE_A_PROGRAMMING_MISTAKE);
+EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 2,
+YOU_MADE_A_PROGRAMMING_MISTAKE);

const SoftmaxOp op(beta);
return input.customOp(op);
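The comment reflowed above documents Eigen::SoftMax as operating on a col-major rank-2 (depth, other) tensor, with beta scaling the logits before exponentiation. A minimal hedged sketch of a call site follows; the include path and the SoftmaxSketch wrapper are assumptions for illustration, not part of the commit.

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/kernels/eigen_softmax.h"  // assumed header location

void SoftmaxSketch() {
  // Col-major (depth, other) input, as described in the comment above.
  Eigen::Tensor<float, 2> logits(10, 32);
  logits.setRandom();
  Eigen::Tensor<float, 2> probs(10, 32);
  // Each column of probs sums to 1 after the normalization inside SoftmaxOp.
  probs = Eigen::SoftMax(logits, 1.0f);
}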
@@ -23,7 +23,7 @@ namespace {
void EigenApprox(float a, float b) {
ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
}
-}
+} // namespace

TEST(EigenSoftmaxTest, Simple) {
const int depth = 1024;
|
@ -80,8 +80,9 @@ class EncodeJpegOp : public OpKernel {
|
|||||||
errors::InvalidArgument("image must be 3-dimensional",
|
errors::InvalidArgument("image must be 3-dimensional",
|
||||||
image.shape().DebugString()));
|
image.shape().DebugString()));
|
||||||
|
|
||||||
OP_REQUIRES(context, FastBoundsCheck(image.NumElements(),
|
OP_REQUIRES(
|
||||||
std::numeric_limits<int32>::max()),
|
context,
|
||||||
|
FastBoundsCheck(image.NumElements(), std::numeric_limits<int32>::max()),
|
||||||
errors::InvalidArgument(
|
errors::InvalidArgument(
|
||||||
"Cannot encode images with >= max int32 elements"));
|
"Cannot encode images with >= max int32 elements"));
|
||||||
|
|
||||||
@ -100,8 +101,9 @@ class EncodeJpegOp : public OpKernel {
|
|||||||
} else if (channels == 3) {
|
} else if (channels == 3) {
|
||||||
adjusted_flags.format = jpeg::FORMAT_RGB;
|
adjusted_flags.format = jpeg::FORMAT_RGB;
|
||||||
} else {
|
} else {
|
||||||
OP_REQUIRES(context, false, errors::InvalidArgument(
|
OP_REQUIRES(
|
||||||
"image must have 1 or 3 channels, got ",
|
context, false,
|
||||||
|
errors::InvalidArgument("image must have 1 or 3 channels, got ",
|
||||||
image.shape().DebugString()));
|
image.shape().DebugString()));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -346,7 +346,8 @@ class SingleSequenceExampleParserOp : public OpKernel {
|
|||||||
feature_list_sparse_keys[di].scalar<string>()();
|
feature_list_sparse_keys[di].scalar<string>()();
|
||||||
}
|
}
|
||||||
OP_REQUIRES(
|
OP_REQUIRES(
|
||||||
ctx, TensorShapeUtils::IsVector(
|
ctx,
|
||||||
|
TensorShapeUtils::IsVector(
|
||||||
feature_list_dense_missing_assumed_empty->shape()),
|
feature_list_dense_missing_assumed_empty->shape()),
|
||||||
errors::InvalidArgument(
|
errors::InvalidArgument(
|
||||||
"Expected feature_list_dense_missing_assumed_empty ",
|
"Expected feature_list_dense_missing_assumed_empty ",
|
||||||
@ -386,12 +387,12 @@ class SingleSequenceExampleParserOp : public OpKernel {
|
|||||||
required[d] = (def_value.NumElements() == 0); // No default provided.
|
required[d] = (def_value.NumElements() == 0); // No default provided.
|
||||||
|
|
||||||
if (def_value.NumElements() > 0) {
|
if (def_value.NumElements() > 0) {
|
||||||
OP_REQUIRES(
|
OP_REQUIRES(ctx, def_value.shape() == attrs_.context_dense_shapes[d],
|
||||||
ctx, def_value.shape() == attrs_.context_dense_shapes[d],
|
|
||||||
errors::InvalidArgument(
|
errors::InvalidArgument(
|
||||||
"def_value[", d, "].shape() == ",
|
"def_value[", d,
|
||||||
def_value.shape().DebugString(), " != context_dense_shapes_[",
|
"].shape() == ", def_value.shape().DebugString(),
|
||||||
d, "] == ", attrs_.context_dense_shapes[d].DebugString()));
|
" != context_dense_shapes_[", d,
|
||||||
|
"] == ", attrs_.context_dense_shapes[d].DebugString()));
|
||||||
OP_REQUIRES(
|
OP_REQUIRES(
|
||||||
ctx, def_value.dtype() == attrs_.context_dense_types[d],
|
ctx, def_value.dtype() == attrs_.context_dense_types[d],
|
||||||
errors::InvalidArgument(
|
errors::InvalidArgument(
|
||||||
@ -576,12 +577,12 @@ class SingleSequenceExampleParserOp : public OpKernel {
|
|||||||
const Feature& f = fl.feature(t);
|
const Feature& f = fl.feature(t);
|
||||||
bool types_match;
|
bool types_match;
|
||||||
OP_REQUIRES_OK(ctx, CheckTypesMatch(f, dtype, &types_match));
|
OP_REQUIRES_OK(ctx, CheckTypesMatch(f, dtype, &types_match));
|
||||||
OP_REQUIRES(
|
OP_REQUIRES(ctx, types_match,
|
||||||
ctx, types_match,
|
|
||||||
errors::InvalidArgument(
|
errors::InvalidArgument(
|
||||||
"Name: ", name, ", Feature list: ", key, ", Index: ", t,
|
"Name: ", name, ", Feature list: ", key, ", Index: ", t,
|
||||||
". Data types don't match. ", "Expected type: ",
|
". Data types don't match. ",
|
||||||
DataTypeString(dtype), " Feature is: ", ProtoDebugString(f)));
|
"Expected type: ", DataTypeString(dtype),
|
||||||
|
" Feature is: ", ProtoDebugString(f)));
|
||||||
OP_REQUIRES_OK(ctx, FeatureDenseCopy(t, name, key, dtype, shape, f,
|
OP_REQUIRES_OK(ctx, FeatureDenseCopy(t, name, key, dtype, shape, f,
|
||||||
feature_list_dense_values[d]));
|
feature_list_dense_values[d]));
|
||||||
}
|
}
|
||||||
|
@ -122,13 +122,9 @@ static string D(const char* s) {
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
REGISTER_KERNEL_BUILDER(Name("Fact")
|
REGISTER_KERNEL_BUILDER(
|
||||||
.Device(DEVICE_CPU)
|
Name("Fact").Device(DEVICE_CPU).Label(D("Yoxmos").c_str()), FactOpKernel2);
|
||||||
.Label(D("Yoxmos").c_str()),
|
REGISTER_KERNEL_BUILDER(
|
||||||
FactOpKernel2);
|
Name("Fact").Device(DEVICE_CPU).Label(D("yoxmos").c_str()), FactOpKernel2);
|
||||||
REGISTER_KERNEL_BUILDER(Name("Fact")
|
|
||||||
.Device(DEVICE_CPU)
|
|
||||||
.Label(D("yoxmos").c_str()),
|
|
||||||
FactOpKernel2);
|
|
||||||
|
|
||||||
} // namespace tensorflow
|
} // namespace tensorflow
|
||||||
|
@ -378,8 +378,7 @@ TEST_F(QuantOpsTest, WithArgsGradient_RegularRange) {
|
|||||||
Tensor* output = GetOutput(0);
|
Tensor* output = GetOutput(0);
|
||||||
auto input_flat = GetInput(0).flat<float>();
|
auto input_flat = GetInput(0).flat<float>();
|
||||||
Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
|
Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
|
||||||
FillValues<float>(&expected,
|
FillValues<float>(&expected, {0.0f, input_flat(1), input_flat(2),
|
||||||
{0.0f, input_flat(1), input_flat(2),
|
|
||||||
input_flat(3), input_flat(4), 0.0f});
|
input_flat(3), input_flat(4), 0.0f});
|
||||||
ExpectClose(expected, *output);
|
ExpectClose(expected, *output);
|
||||||
}
|
}
|
||||||
@ -2167,21 +2166,19 @@ TEST_F(QuantOpsTest,
|
|||||||
Tensor* output_bprop_wrt_input = GetOutput(0);
|
Tensor* output_bprop_wrt_input = GetOutput(0);
|
||||||
Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
|
Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
|
||||||
auto grad_flat = GetInput(0).flat<float>();
|
auto grad_flat = GetInput(0).flat<float>();
|
||||||
FillValues<float>(&expected_bprop_wrt_input,
|
FillValues<float>(
|
||||||
{0.0f, grad_flat(1), grad_flat(2),
|
&expected_bprop_wrt_input,
|
||||||
grad_flat(3), grad_flat(4), 0.0f});
|
{0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f});
|
||||||
ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
|
ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
|
||||||
|
|
||||||
Tensor* output_bprop_wrt_min = GetOutput(1);
|
Tensor* output_bprop_wrt_min = GetOutput(1);
|
||||||
Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3}));
|
Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3}));
|
||||||
FillValues<float>(&expected_bprop_wrt_min,
|
FillValues<float>(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f});
|
||||||
{grad_flat(0), 0.0f, 0.0f});
|
|
||||||
ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
|
ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
|
||||||
|
|
||||||
Tensor* output_bprop_wrt_max = GetOutput(2);
|
Tensor* output_bprop_wrt_max = GetOutput(2);
|
||||||
Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3}));
|
Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3}));
|
||||||
FillValues<float>(&expected_bprop_wrt_max,
|
FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)});
|
||||||
{0.0f, 0.0f, grad_flat(5)});
|
|
||||||
ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
|
ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2215,21 +2212,19 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedUp_4Bits_NarrowRange) {
|
|||||||
Tensor* output_bprop_wrt_input = GetOutput(0);
|
Tensor* output_bprop_wrt_input = GetOutput(0);
|
||||||
Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
|
Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
|
||||||
auto grad_flat = GetInput(0).flat<float>();
|
auto grad_flat = GetInput(0).flat<float>();
|
||||||
FillValues<float>(&expected_bprop_wrt_input,
|
FillValues<float>(
|
||||||
{0.0f, grad_flat(1), grad_flat(2),
|
&expected_bprop_wrt_input,
|
||||||
grad_flat(3), grad_flat(4), 0.0f});
|
{0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f});
|
||||||
ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
|
ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
|
||||||
|
|
||||||
Tensor* output_bprop_wrt_min = GetOutput(1);
|
Tensor* output_bprop_wrt_min = GetOutput(1);
|
||||||
Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3}));
|
Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3}));
|
||||||
FillValues<float>(&expected_bprop_wrt_min,
|
FillValues<float>(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f});
|
||||||
{grad_flat(0), 0.0f, 0.0f});
|
|
||||||
ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
|
ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
|
||||||
|
|
||||||
Tensor* output_bprop_wrt_max = GetOutput(2);
|
Tensor* output_bprop_wrt_max = GetOutput(2);
|
||||||
Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3}));
|
Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3}));
|
||||||
FillValues<float>(&expected_bprop_wrt_max,
|
FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)});
|
||||||
{0.0f, 0.0f, grad_flat(5)});
|
|
||||||
ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
|
ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2270,8 +2265,7 @@ TEST_F(QuantOpsTest,
|
|||||||
Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT,
|
Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT,
|
||||||
TensorShape({1, 2, 3, 4}));
|
TensorShape({1, 2, 3, 4}));
|
||||||
auto grad_flat = GetInput(0).flat<float>();
|
auto grad_flat = GetInput(0).flat<float>();
|
||||||
FillValues<float>(
|
FillValues<float>(&expected_bprop_wrt_input,
|
||||||
&expected_bprop_wrt_input,
|
|
||||||
{0.0f, grad_flat(1), grad_flat(2), 0.0f,
|
{0.0f, grad_flat(1), grad_flat(2), 0.0f,
|
||||||
0.0f, grad_flat(5), grad_flat(6), 0.0f,
|
0.0f, grad_flat(5), grad_flat(6), 0.0f,
|
||||||
0.0f, grad_flat(9), grad_flat(10), 0.0f,
|
0.0f, grad_flat(9), grad_flat(10), 0.0f,
|
||||||
|
@ -255,8 +255,8 @@ void FIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
|
|||||||
// TODO(josh11b): This makes two copies of callback, avoid this if possible.
|
// TODO(josh11b): This makes two copies of callback, avoid this if possible.
|
||||||
dequeue_attempts_.emplace_back(
|
dequeue_attempts_.emplace_back(
|
||||||
num_elements, [callback]() { callback(Tuple()); }, ctx, cm, token,
|
num_elements, [callback]() { callback(Tuple()); }, ctx, cm, token,
|
||||||
[callback, allow_small_batch, this](Attempt* attempt)
|
[callback, allow_small_batch,
|
||||||
EXCLUSIVE_LOCKS_REQUIRED(mu_) {
|
this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
|
||||||
int64 queue_size = queues_[0].size();
|
int64 queue_size = queues_[0].size();
|
||||||
|
|
||||||
if (closed_ && queue_size < attempt->elements_requested) {
|
if (closed_ && queue_size < attempt->elements_requested) {
|
||||||
@ -317,9 +317,8 @@ void FIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
|
|||||||
const TensorShape shape =
|
const TensorShape shape =
|
||||||
ManyOutShape(i, attempt->elements_requested);
|
ManyOutShape(i, attempt->elements_requested);
|
||||||
Tensor element;
|
Tensor element;
|
||||||
attempt->context->SetStatus(
|
attempt->context->SetStatus(attempt->context->allocate_temp(
|
||||||
attempt->context->allocate_temp(component_dtypes_[i],
|
component_dtypes_[i], shape, &element));
|
||||||
shape, &element));
|
|
||||||
if (!attempt->context->status().ok()) return kComplete;
|
if (!attempt->context->status().ok()) return kComplete;
|
||||||
attempt->tuple.emplace_back(element);
|
attempt->tuple.emplace_back(element);
|
||||||
}
|
}
|
||||||
@ -327,8 +326,8 @@ void FIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
|
|||||||
result = kProgress;
|
result = kProgress;
|
||||||
Tuple tuple;
|
Tuple tuple;
|
||||||
DequeueLocked(attempt->context, &tuple);
|
DequeueLocked(attempt->context, &tuple);
|
||||||
const int64 index = attempt->tuple[0].dim_size(0) -
|
const int64 index =
|
||||||
attempt->elements_requested;
|
attempt->tuple[0].dim_size(0) - attempt->elements_requested;
|
||||||
for (int i = 0; i < num_components(); ++i) {
|
for (int i = 0; i < num_components(); ++i) {
|
||||||
attempt->context->SetStatus(batch_util::CopyElementToSlice(
|
attempt->context->SetStatus(batch_util::CopyElementToSlice(
|
||||||
std::move(tuple[i]), &attempt->tuple[i], index));
|
std::move(tuple[i]), &attempt->tuple[i], index));
|
||||||
|
@ -18,8 +18,8 @@ limitations under the License.
|
|||||||
#define EIGEN_USE_THREADS
|
#define EIGEN_USE_THREADS
|
||||||
|
|
||||||
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
|
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
|
||||||
#include "tensorflow/core/framework/tensor_types.h"
|
|
||||||
#include "tensorflow/core/framework/register_types.h"
|
#include "tensorflow/core/framework/register_types.h"
|
||||||
|
#include "tensorflow/core/framework/tensor_types.h"
|
||||||
#include "tensorflow/core/framework/types.h"
|
#include "tensorflow/core/framework/types.h"
|
||||||
#include "tensorflow/core/framework/variant_encode_decode.h"
|
#include "tensorflow/core/framework/variant_encode_decode.h"
|
||||||
|
|
||||||
@ -118,7 +118,8 @@ DEFINE_SETONE_SYCL(double);
|
|||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
struct FillFunctor<Eigen::ThreadPoolDevice, T> {
|
struct FillFunctor<Eigen::ThreadPoolDevice, T> {
|
||||||
void operator()(const Eigen::ThreadPoolDevice& d, typename TTypes<T>::Flat out,
|
void operator()(const Eigen::ThreadPoolDevice& d,
|
||||||
|
typename TTypes<T>::Flat out,
|
||||||
typename TTypes<T>::ConstScalar in) {
|
typename TTypes<T>::ConstScalar in) {
|
||||||
out.device(d) = out.constant(in());
|
out.device(d) = out.constant(in());
|
||||||
}
|
}
|
||||||
@ -150,8 +151,7 @@ struct FillFunctor<Eigen::SyclDevice, T> {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
#define DEFINE_FILL_SYCL(T) \
|
#define DEFINE_FILL_SYCL(T) template struct FillFunctor<Eigen::SyclDevice, T>;
|
||||||
template struct FillFunctor<Eigen::SyclDevice, T>;
|
|
||||||
DEFINE_FILL_SYCL(float);
|
DEFINE_FILL_SYCL(float);
|
||||||
DEFINE_FILL_SYCL(double);
|
DEFINE_FILL_SYCL(double);
|
||||||
TF_CALL_INTEGRAL_TYPES(DEFINE_FILL_SYCL)
|
TF_CALL_INTEGRAL_TYPES(DEFINE_FILL_SYCL)
|
||||||
|
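The CPU fill functor above boils down to a single statement, `out.device(d) = out.constant(in());`. A small self-contained illustration of that pattern on an Eigen thread-pool device follows; the pool setup and shapes are assumptions for the sketch and are not taken from TensorFlow's code:

#define EIGEN_USE_THREADS
#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // A 4-thread pool standing in for the device TensorFlow would hand the functor.
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, 4);

  Eigen::Tensor<float, 1> out(8);
  const float in = 3.0f;

  // Same statement as the functor body: fill the tensor with a constant,
  // evaluated on the thread-pool device.
  out.device(device) = out.constant(in);

  std::cout << out << std::endl;  // prints eight values equal to 3
  return 0;
}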
@@ -232,7 +232,8 @@ class FractionalAvgPoolGradOp : public OpKernel {

 // Grab the inputs.
 const Tensor& orig_input_tensor_shape = context->input(0);
-OP_REQUIRES(context, orig_input_tensor_shape.dims() == 1 &&
+OP_REQUIRES(context,
+orig_input_tensor_shape.dims() == 1 &&
 orig_input_tensor_shape.NumElements() == 4,
 errors::InvalidArgument("original input tensor shape must be"
 "1-dimensional and 4 elements"));
@@ -253,8 +253,7 @@ class SymbolicGradientOp : public AsyncOpKernel {
 args.push_back(ctx->input(i));
 }
 std::vector<Tensor>* rets = new std::vector<Tensor>;
-lib->Run(
-opts, handle, args, rets, [ctx, done, rets](const Status& status) {
+lib->Run(opts, handle, args, rets, [ctx, done, rets](const Status& status) {
 if (!status.ok()) {
 ctx->SetStatus(status);
 } else if (rets->size() != ctx->num_outputs()) {
@@ -68,7 +68,8 @@ void InvVarianceToVariance<T>::operator()(const Eigen::GpuDevice& d,
 template <class T>
 void SetNanFunctor<T>::operator()(const Eigen::GpuDevice& d,
 typename TTypes<T>::Flat out) {
-To32Bit(out).device(d) = To32Bit(out).constant(Eigen::NumTraits<T>::quiet_NaN());
+To32Bit(out).device(d) =
+To32Bit(out).constant(Eigen::NumTraits<T>::quiet_NaN());
 }

 template class VarianceToInvVariance<float>;
@@ -18,12 +18,12 @@ limitations under the License.

 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/prefetch.h"
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/util/work_sharder.h"

 namespace tensorflow {
@@ -52,7 +52,8 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
 const size_t slice_bytes = slice_elems * sizeof(T);
 auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
 mutex mu;
-// Store the value of invalidate index for printing error information, it's a shared variable.
+// Store the value of invalidate index for printing error information, it's a
+// shared variable.
 SliceIndex result = -1;
 auto work = [&](int64 start, int64 end) {
 SliceIndex batch_idx = static_cast<SliceIndex>(start / indices_size);
@@ -66,7 +67,8 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
 SliceIndex b_next = batch_idx + 1;
 if ((batch_idx == batch_idx_end && i_next < indices_idx_end) ||
 (i_next < indices_size)) {
-port::prefetch<port::PREFETCH_HINT_T0>(&params(batch_idx, indices(i_next), 0));
+port::prefetch<port::PREFETCH_HINT_T0>(
+&params(batch_idx, indices(i_next), 0));
 port::prefetch<port::PREFETCH_HINT_T0>(&out(batch_idx, i_next, 0));
 b_next = batch_idx;
 } else if (b_next <= batch_idx_end) {
@@ -85,7 +87,8 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
 // ahead-of-time compilation binary size).
 if (is_simple_type<T>::value) {
 // Avoid auto-promotion to Index from SliceIndex by casting.
-memcpy(out_base + (batch_idx * indices_size + indices_idx) * slice_elems,
+memcpy(
+out_base + (batch_idx * indices_size + indices_idx) * slice_elems,
 params_base + (batch_idx * static_cast<SliceIndex>(limit) +
 static_cast<SliceIndex>(index)) *
 slice_elems,
@@ -99,8 +102,8 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
 }
 };

-Shard(worker_threads->num_threads, worker_threads->workers, batch_size*indices_size,
-slice_elems * sizeof(T), work);
+Shard(worker_threads->num_threads, worker_threads->workers,
+batch_size * indices_size, slice_elems * sizeof(T), work);
 return result;
 }

@@ -143,7 +146,8 @@ struct GatherFunctorCPU {

 template <typename Device, typename T, typename Index>
 struct GatherFunctor {
-int64 operator()(OpKernelContext* ctx, typename TTypes<T, 3>::ConstTensor params,
+int64 operator()(OpKernelContext* ctx,
+typename TTypes<T, 3>::ConstTensor params,
 typename TTypes<Index>::ConstFlat indices,
 typename TTypes<T, 3>::Tensor out);
 };
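The Shard(...) call above spreads batch_size * indices_size units of gather work across the worker threads, using slice_elems * sizeof(T) as a per-unit cost hint. The following is a simplified stand-in for that pattern, not TensorFlow's Shard implementation: it only shows how work(start, end) is invoked over disjoint ranges, and it omits the cost-based range sizing:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <thread>
#include <vector>

// Split [0, total) into contiguous ranges and hand each one to work(start, end)
// on its own thread. Output ordering is nondeterministic.
template <typename Work>
void ShardSketch(int num_threads, int64_t total, const Work& work) {
  std::vector<std::thread> threads;
  const int64_t block = (total + num_threads - 1) / num_threads;
  for (int64_t start = 0; start < total; start += block) {
    const int64_t end = std::min(start + block, total);
    threads.emplace_back([=] { work(start, end); });
  }
  for (auto& t : threads) t.join();
}

int main() {
  const int64_t batch_size = 4, indices_size = 8;
  ShardSketch(3, batch_size * indices_size, [](int64_t start, int64_t end) {
    // In HandleCopies this is where slices are prefetched and copied.
    std::cout << "range [" << start << ", " << end << ")\n";
  });
  return 0;
}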
@@ -106,8 +106,7 @@ class GatherOp : public OpKernel {
 auto out_flat = out->shaped<T, 3>({outer_size, N, inner_size});

 functor::GatherFunctor<Device, T, Index> functor;
-int64 bad_i = functor(c, params_flat,
-indices_flat, out_flat);
+int64 bad_i = functor(c, params_flat, indices_flat, out_flat);

 OP_REQUIRES(
 c, bad_i < 0,
@@ -50,9 +50,8 @@ class HingeLossUpdater : public DualLossUpdater {
 // valid value for new dual = 0
 // c. new optimal value > 1.0. Then new optimal value should be set to 1.0.
 const double candidate_optimal_dual =
-current_dual +
-(label - wx) /
-(num_loss_partitions * example_weight * weighted_example_norm);
+current_dual + (label - wx) / (num_loss_partitions * example_weight *
+weighted_example_norm);
 if (label * candidate_optimal_dual < 0) {
 return 0.0;
 }
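Written out, the candidate dual computed in the hunk above is the expression below, reading current_dual as \alpha, label as y, and wx as the prediction \langle w, x \rangle (the other identifiers are kept by name); per the guard shown in the hunk, the update then returns 0 when y \cdot \alpha_{\text{cand}} < 0:

\alpha_{\text{cand}} \;=\; \alpha \;+\;
  \frac{y - \langle w, x \rangle}
       {\texttt{num\_loss\_partitions} \cdot \texttt{example\_weight} \cdot \texttt{weighted\_example\_norm}}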
@@ -17,16 +17,16 @@ limitations under the License.

 #define EIGEN_USE_GPU

-#include "tensorflow/core/kernels/histogram_op.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "external/cub_archive/cub/device/device_histogram.cuh"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/histogram_op.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

 namespace tensorflow {

@@ -104,8 +104,8 @@ struct HistogramFixedWidthFunctor<GPUDevice, T, Tout> {
 /* num_samples */ num_samples,
 /* stream */ stream);
 if (err != cudaSuccess) {
-return errors::Internal("Could not launch HistogramRange: ",
-cudaGetErrorString(err), ".");
+return errors::Internal(
+"Could not launch HistogramRange: ", cudaGetErrorString(err), ".");
 }

 return Status::OK();
@@ -109,7 +109,8 @@ struct ImageResizerState {
 ValidateAndCalculateOutputSize(context, input);
 if (!context->status().ok()) return;
 OP_REQUIRES_OK(context, context->allocate_output(
-0, TensorShape({input.dim_size(0), out_height,
+0,
+TensorShape({input.dim_size(0), out_height,
 out_width, input.dim_size(3)}),
 &output));
 }
@@ -168,7 +169,8 @@ struct ImageResizerGradientState {
 CalculateResizeScale(original_width, resized_width, align_corners_);
 output = nullptr;
 OP_REQUIRES_OK(context, context->allocate_output(
-0, TensorShape({batch_size, original_height,
+0,
+TensorShape({batch_size, original_height,
 original_width, channels}),
 &output));
 }
Some files were not shown because too many files have changed in this diff.