Merge changes from github.
END_PUBLIC

--- Commit c2b8927f2 authored by Dandelion Mané <dandelion@google.com>, committed by TensorFlower Gardener <gardener@tensorflow.org>:
Fix another d3v4 regression in the graph visualizer. PiperOrigin-RevId: 156343038

--- Commit 170f0b350 authored by Peter Hawkins <phawkins@google.com>, committed by TensorFlower Gardener <gardener@tensorflow.org>:
[TF:XLA] Add XLA implementation of ResourceStridedSliceAssign. PiperOrigin-RevId: 156341053

--- Commit 1390dd68f authored by Vijay Vasudevan <vrv@google.com>, committed by TensorFlower Gardener <gardener@tensorflow.org>:
When Op Type is not registered, log the hostname of the machine that it is running on in the error message, since the message could be routed back during a failure on a remote binary, and it is hard to tell which machine it came from. Ideally, we'd somehow log the name of the binary running instead, but we don't have a function to get that right now. PiperOrigin-RevId: 156337679

--- Commit 9ca8a151b authored by A. Unique TensorFlower <gardener@tensorflow.org>, committed by TensorFlower Gardener <gardener@tensorflow.org>:
Internal change. PiperOrigin-RevId: 156335942

--- Commit 40255434c authored by Martin Wicke <wicke@google.com>, committed by TensorFlower Gardener <gardener@tensorflow.org>:
Deprecate contrib/learn/dataframe. To be removed June 15. PiperOrigin-RevId: 156333930

--- Commit 7f71b7fbe authored by A. Unique TensorFlower <gardener@tensorflow.org>, committed by TensorFlower Gardener <gardener@tensorflow.org>:
BEGIN_PUBLIC Automated g4 rollback of changelist 156123287

PiperOrigin-RevId: 156503903
This commit is contained in:
parent c311af00f2
commit 53cb26d05a
Changed paths (from the commit's file tree):
tensorflow/
  c/
  cc/
  compiler/
    aot/ (incl. tests/)
    tf2xla/: BUILD; kernels/: gather_op_kernel_float_int32.cc, gather_op_kernel_float_int64.cc, index_ops_kernel_argmax_float_1d.cc, index_ops_kernel_argmax_float_2d.cc, strided_slice_op.cc; xla_helpers.cc, xla_local_runtime_context.h
    xla/
  contrib/: boosted_trees/lib/quantiles/, cmake/, factorization/, ffmpeg/default/, keras/api/keras/callbacks/, learn/python/learn/estimators/, makefile/, opt/, rnn/python/ops/, seq2seq/python/ops/, verbs/
  core/: BUILD, common_runtime/gpu/, distributed_runtime/, grappler/costs/
    kernels/: BUILD, basic_ops_benchmark_test.cc, decode_bmp_op.cc, deep_conv2d.cc, depthwise_conv_op.cc, fft_ops.cc, hexagon/, non_max_suppression_op.cc, non_max_suppression_op_test.cc, remote_fused_graph_execute_op_test.cc, training_ops.cc, training_ops.h, training_ops_gpu.cu.cc
    lib/, ops/, platform/cloud/, public/
  docs_src/
  examples/
  java/src/main/: java/org/tensorflow/, native/
tensorflow/c/generate-pc.sh (new executable file, 63 lines)
@@ -0,0 +1,63 @@
#!/usr/bin/env bash
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

TF_PREFIX='/usr/local'

usage() {
  echo "Usage: $0 OPTIONS"
  echo -e "-p, --prefix\tset installation prefix (default: /usr/local)"
  echo -e "-v, --version\tset TensorFlow version"
  echo -e "-h, --help\tdisplay this message"
}

# read the options
ARGS=`getopt -o p:v:h --long prefix:,version:,help -n $0 -- "$@"`
eval set -- "$ARGS"

# extract options and their arguments into variables.
while true ; do
  case "$1" in
    -h|--help) usage ; exit ;;
    -p|--prefix)
      case "$2" in
        "") shift 2 ;;
        *) TF_PREFIX=$2 ; shift 2 ;;
      esac ;;
    -v|--version)
      case "$2" in
        "") shift 2 ;;
        *) TF_VERSION=$2 ; shift 2 ;;
      esac ;;
    --) shift ; echo "Try '$0 --help' for more information."; exit 1 ;;
    *) echo "Internal error! Try '$0 --help' for more information." ; exit 1 ;;
  esac
done

echo "Generating pkgconfig file for TensorFlow $TF_VERSION in $TF_PREFIX"

cat << EOF > tensorflow.pc
prefix=${TF_PREFIX}
exec_prefix=\${prefix}
libdir=\${exec_prefix}/lib
includedir=\${prefix}/include

Name: TensorFlow
Version: ${TF_VERSION}
Description: Library for computation using data flow graphs for scalable machine learning
Requires:
Libs: -L\${libdir} -ltensorflow
Cflags: -I\${includedir}
EOF
@@ -260,7 +260,7 @@ TEST_F(GradientsTest, StackUnstack_StopBackprop) {
}

TEST_F(GradientsTest, DependentGradOutputs) {
-  // Tests that dependant gradients (in this case the gradients w.r.t to the
+  // Tests that dependent gradients (in this case the gradients w.r.t to the
  // output and one input of MatMul) are computed properly.

  // Create two chained MatMul ops.
@@ -36,7 +36,7 @@ auto* load_attempt_count = monitoring::Counter<2>::New(
    "status");
auto* load_latency = monitoring::Counter<1>::New(
    "/tensorflow/cc/saved_model/load_latency",
-    "Latency in microseconds for SavedModels that were succesfully loaded.",
+    "Latency in microseconds for SavedModels that were successfully loaded.",
    "model_path");
constexpr char kLoadAttemptFail[] = "fail";
constexpr char kLoadAttemptSuccess[] = "success";
@@ -365,7 +365,7 @@ Status GenerateHeader(const HeaderOpts& opts, const Config& config,
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"

-namespace Eigen { class ThreadPoolDevice; }
+namespace Eigen { struct ThreadPoolDevice; }

// (Implementation detail) Entry point to the function in the object file.
extern "C" void {{ENTRY}}(
@@ -15,7 +15,7 @@
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"

-namespace Eigen { class ThreadPoolDevice; }
+namespace Eigen { struct ThreadPoolDevice; }

// (Implementation detail) Entry point to the function in the object file.
extern "C" void entry_point(
@@ -31,6 +31,8 @@ namespace {
inline void* aligned_malloc(size_t size, int minimum_alignment) {
#if defined(__ANDROID__) || defined(OS_ANDROID) || defined(OS_CYGWIN)
  return memalign(minimum_alignment, size);
+#elif defined(COMPILER_MSVC)
+  return _aligned_malloc(size, minimum_alignment);
#else  // !__ANDROID__ && !OS_ANDROID && !OS_CYGWIN
  void* ptr = nullptr;
  // posix_memalign requires that the requested alignment be at least
@@ -45,7 +47,13 @@ inline void* aligned_malloc(size_t size, int minimum_alignment) {
#endif
}

-inline void aligned_free(void* aligned_memory) { free(aligned_memory); }
+inline void aligned_free(void* aligned_memory) {
+#if defined(COMPILER_MSVC)
+  _aligned_free(aligned_memory);
+#else
+  free(aligned_memory);
+#endif
+}

size_t align_to(size_t n, size_t align) {
  return (((n - 1) / align) + 1) * align;
@@ -170,6 +170,19 @@ tf_xla_py_test(
    ],
)

+tf_xla_py_test(
+    name = "slice_ops_test",
+    size = "small",
+    srcs = ["slice_ops_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
tf_xla_py_test(
    name = "function_test",
    size = "small",
tensorflow/compiler/tests/slice_ops_test.py (new file, 132 lines)
@@ -0,0 +1,132 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for slicing."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

from tensorflow.compiler.tests.xla_test import XLATestCase
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import array_ops
from tensorflow.python.platform import googletest


class SliceTest(XLATestCase):

  def test1D(self):
    for dtype in self.numeric_types:
      with self.test_session():
        i = array_ops.placeholder(dtype, shape=[10])
        with self.test_scope():
          o = array_ops.slice(i, [2], [4])
        params = {
            i: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        }
        result = o.eval(feed_dict=params)

        self.assertAllEqual([2, 3, 4, 5], result)

  def test3D(self):
    for dtype in self.numeric_types:
      with self.test_session():
        i = array_ops.placeholder(dtype, shape=[3, 3, 10])
        with self.test_scope():
          o = array_ops.slice(i, [1, 2, 2], [1, 1, 4])
        params = {
            i: [[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
                 [5, 3, 1, 7, 9, 2, 4, 6, 8, 0]],
                [[5, 5, 5, 5, 5, 5, 5, 5, 5, 5], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                 [8, 7, 6, 5, 4, 3, 2, 1, 8, 7]],
                [[7, 5, 7, 5, 7, 5, 7, 5, 7, 5], [1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
                 [9, 8, 7, 9, 8, 7, 9, 8, 7, 9]]]
        }
        result = o.eval(feed_dict=params)

        self.assertAllEqual([[[6, 5, 4, 3]]], result)


class StridedSliceTest(XLATestCase):

  def test1D(self):
    for dtype in self.numeric_types:
      with self.test_session():
        i = array_ops.placeholder(dtype, shape=[10])
        with self.test_scope():
          o = array_ops.strided_slice(i, [2], [6], [2])
        params = {
            i: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        }
        result = o.eval(feed_dict=params)

        self.assertAllEqual([2, 4], result)

  def test1DNegtiveStride(self):
    for dtype in self.numeric_types:
      with self.test_session():
        i = array_ops.placeholder(dtype, shape=[10])
        with self.test_scope():
          o = array_ops.strided_slice(i, [6], [2], [-2])
        params = {
            i: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        }
        result = o.eval(feed_dict=params)

        self.assertAllEqual([6, 4], result)

  def test3D(self):
    for dtype in self.numeric_types:
      with self.test_session():
        i = array_ops.placeholder(dtype, shape=[3, 3, 10])
        with self.test_scope():
          o = array_ops.strided_slice(i, [0, 2, 2], [2, 3, 6], [1, 1, 2])
        params = {
            i: [[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
                 [5, 3, 1, 7, 9, 2, 4, 6, 8, 0]],
                [[5, 5, 5, 5, 5, 5, 5, 5, 5, 5], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                 [8, 7, 6, 5, 4, 3, 2, 1, 8, 7]],
                [[7, 5, 7, 5, 7, 5, 7, 5, 7, 5], [1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
                 [9, 8, 7, 9, 8, 7, 9, 8, 7, 9]]]
        }
        result = o.eval(feed_dict=params)

        self.assertAllEqual([[[1, 9]], [[6, 4]]], result)

  def test3DNegativeStride(self):
    for dtype in self.numeric_types:
      with self.test_session():
        i = array_ops.placeholder(dtype, shape=[3, 4, 10])
        with self.test_scope():
          o = array_ops.strided_slice(i, [2, 2, 6], [0, 0, 2], [-1, -1, -2])
        params = {
            i: [[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
                 [5, 3, 1, 7, 9, 2, 4, 6, 8, 0], [4, 5, 2, 4, 3, 7, 6, 8, 9, 4]],
                [[5, 5, 5, 5, 5, 5, 5, 5, 5, 5], [4, 3, 4, 5, 7, 6, 5, 3, 4, 5],
                 [8, 7, 6, 5, 4, 3, 2, 1, 8, 7], [7, 1, 7, 1, 8, 1, 8, 1, 3, 1]],
                [[7, 5, 7, 5, 7, 5, 7, 5, 7, 5], [1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
                 [9, 8, 7, 9, 8, 7, 9, 8, 7, 9], [9, 9, 5, 5, 6, 6, 3, 3, 6, 6]]]
        }
        result = o.eval(feed_dict=params)

        self.assertAllEqual([[[9, 8], [1, 1]], [[2, 4], [5, 7]]], result)


if __name__ == "__main__":
  googletest.main()
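For readers cross-checking the expected values in the 1D tests above, the slice semantics being exercised match NumPy's basic slicing. A minimal sketch (not part of the commit, and using plain NumPy rather than the XLA path):

import numpy as np

x = np.arange(10)   # [0, 1, ..., 9]
print(x[2:6])       # slice(i, [2], [4]): begin 2, size 4        -> [2 3 4 5]
print(x[2:6:2])     # strided_slice(i, [2], [6], [2])            -> [2 4]
print(x[6:2:-2])    # strided_slice(i, [6], [2], [-2])           -> [6 4]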
@@ -45,6 +45,7 @@ cc_library(
        "//tensorflow/compiler/xla:literal_util",
        "//tensorflow/compiler/xla:shape_util",
        "//tensorflow/compiler/xla:statusor",
+       "//tensorflow/compiler/xla:types",
        "//tensorflow/compiler/xla:xla_data_proto",
        "//tensorflow/compiler/xla/client:client_library",
        "//tensorflow/compiler/xla/client:computation",
@@ -20,6 +20,7 @@ limitations under the License.
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/gather_functor.h"
#include "tensorflow/core/platform/dynamic_annotations.h"
+#include "tensorflow/core/platform/macros.h"

namespace tensorflow {

@@ -63,7 +64,6 @@ EIGEN_STRONG_INLINE void gather_float_int32_xla_impl(float* out, void** data) {

// Implements gather on CPU. This is called by an XLA custom call, set up by
// gather_op.cc.
-extern "C" void __attribute__((visibility("default")))
-gather_float_int32_xla_impl(float* out, void** data) {
+extern "C" void TF_EXPORT gather_float_int32_xla_impl(float* out, void** data) {
  tensorflow::gather_float_int32_xla_impl(out, data);
}

@@ -20,6 +20,7 @@ limitations under the License.
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/gather_functor.h"
#include "tensorflow/core/platform/dynamic_annotations.h"
+#include "tensorflow/core/platform/macros.h"

namespace tensorflow {

@@ -63,7 +64,6 @@ EIGEN_STRONG_INLINE void gather_float_int64_xla_impl(float* out, void** data) {

// Implements gather on CPU. This is called by an XLA custom call, set up by
// gather_op.cc.
-extern "C" void __attribute__((visibility("default")))
-gather_float_int64_xla_impl(float* out, void** data) {
+extern "C" void TF_EXPORT gather_float_int64_xla_impl(float* out, void** data) {
  tensorflow::gather_float_int64_xla_impl(out, data);
}

@@ -18,6 +18,7 @@ limitations under the License.
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/dynamic_annotations.h"
+#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

@@ -43,7 +44,6 @@ EIGEN_STRONG_INLINE void argmax_float_1d_xla_impl(void* out, void** data) {

// Implements argmax on CPU. This is called by an XLA custom call, set up by
// index_ops.cc.
-extern "C" void __attribute__((visibility("default")))
-argmax_float_1d_xla_impl(void* out, void** data) {
+extern "C" void TF_EXPORT argmax_float_1d_xla_impl(void* out, void** data) {
  tensorflow::argmax_float_1d_xla_impl(out, data);
}

@@ -18,6 +18,7 @@ limitations under the License.
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/dynamic_annotations.h"
+#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

@@ -45,7 +46,6 @@ EIGEN_STRONG_INLINE void argmax_float_2d_xla_impl(void* out, void** data) {

// Implements argmax on CPU. This is called by an XLA custom call, set up by
// index_ops.cc.
-extern "C" void __attribute__((visibility("default")))
-argmax_float_2d_xla_impl(void* out, void** data) {
+extern "C" void TF_EXPORT argmax_float_2d_xla_impl(void* out, void** data) {
  tensorflow::argmax_float_2d_xla_impl(out, data);
}
@@ -77,11 +77,9 @@ class StridedSliceOp : public XlaOpKernel {

    gtl::InlinedVector<int64, 4> dimensions_to_reverse;
    gtl::InlinedVector<int64, 4> slice_begin, slice_end;
+   bool simple_strides = true;
    for (int i = 0; i < begin.size(); ++i) {
-     // TODO(phawkins): implement strides != 1 when b/30878775 is fixed.
-     OP_REQUIRES(
-         ctx, strides[i] == 1 || strides[i] == -1,
-         errors::Unimplemented("Strides != 1 or -1 are not yet implemented"));
+     simple_strides &= (std::abs(strides[i]) == 1);
      if (strides[i] > 0) {
        slice_begin.push_back(begin[i]);
        slice_end.push_back(end[i]);
@@ -99,6 +97,35 @@ class StridedSliceOp : public XlaOpKernel {
      slice = ctx->builder()->Rev(slice, dimensions_to_reverse);
    }

+   // If at least one of the strides is > 1 (or < -1) then use Slice
+   // to pull out each of the strided slices, and Concat to put them
+   // together again.
+   if (!simple_strides) {
+     // Re-adjust the begin and end now that the periphery has been
+     // sliced away.
+     for (int d = 0; d < strides.size(); ++d) {
+       slice_end[d] -= slice_begin[d];
+       slice_begin[d] = 0;
+     }
+
+     for (int d = 0; d < strides.size(); ++d) {
+       int64 stride = std::abs(strides[d]);
+       if (stride > 1) {
+         std::vector<xla::ComputationDataHandle> to_concat;
+         int64 end = slice_end[d];
+         for (int64 i = 0; i < end; i += stride) {
+           slice_begin[d] = i;
+           slice_end[d] = i + 1;
+           to_concat.push_back(
+               ctx->builder()->Slice(slice, slice_begin, slice_end));
+         }
+         slice = ctx->builder()->ConcatInDim(to_concat, d);
+         slice_begin[d] = 0;
+         slice_end[d] = to_concat.size();
+       }
+     }
+   }
+
    slice = ctx->builder()->Reshape(slice, final_shape.dim_sizes());
    ctx->SetOutput(0, slice);
  }
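The added comment block describes the lowering: after reversing the negative-stride dimensions, any dimension whose stride magnitude exceeds 1 is handled by pulling out every stride-th element with unit-width slices and concatenating them back together. A small NumPy sketch of that idea (illustrative only; the real kernel emits XLA Slice/ConcatInDim ops, and the helper name here is made up):

import numpy as np

def strided_via_slice_concat(x, axis, stride):
    # Equivalent to taking every stride-th element along `axis`, built only
    # from unit-width slices plus one concatenation, mirroring the XLA lowering.
    pieces = [np.take(x, [i], axis=axis) for i in range(0, x.shape[axis], stride)]
    return np.concatenate(pieces, axis=axis)

x = np.arange(12).reshape(3, 4)
assert (strided_via_slice_concat(x, axis=1, stride=2) == x[:, ::2]).all()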
@@ -20,6 +20,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/type_util.h"
#include "tensorflow/compiler/tf2xla/xla_context.h"
#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/gtl/array_slice.h"

@@ -89,7 +90,9 @@ xla::ComputationDataHandle XlaHelpers::IntegerLiteral(
    case xla::U16:
      LOG(FATAL) << "u16/s16 literals not yet implemented";
    case xla::F16:
-     LOG(FATAL) << "f16 literals not yet implemented";
+     literal =
+         *xla::LiteralUtil::CreateR0<xla::half>(static_cast<xla::half>(value));
+     break;
    case xla::TUPLE:
      LOG(FATAL) << "tuple element type is not integral";
    case xla::OPAQUE:
@@ -107,6 +110,9 @@ xla::ComputationDataHandle XlaHelpers::FloatLiteral(xla::ComputationBuilder* b,
  xla::PrimitiveType type;
  TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type));
  switch (type) {
+   case xla::F16:
+     return b->ConstantR0<xla::half>(static_cast<xla::half>(value));
+     break;
    case xla::F32:
      return b->ConstantR0<float>(static_cast<float>(value));
      break;
@@ -23,7 +23,7 @@ limitations under the License.
// actually used. E.g. some ahead-of-time compiled computations don't need a
// thread pool.
namespace Eigen {
-class ThreadPoolDevice;
+struct ThreadPoolDevice;
}

namespace tensorflow {
@@ -59,7 +59,10 @@ cc_library(
    name = "types",
    hdrs = ["types.h"],
    visibility = [":friends"],
-   deps = ["//tensorflow/core:lib"],
+   deps = [
+       "//tensorflow/core:lib",
+       "//third_party/eigen3",
+   ],
)

cc_library(
@@ -148,6 +148,9 @@ template <typename T>
    case S64:
      return CopyRange<int64>(src_literal, src_base, dest_literal, dest_base, copy_size);
+   case F16:
+     return CopyRange<half>(src_literal, src_base, dest_literal, dest_base, copy_size);
    case F32:
      return CopyRange<float>(src_literal, src_base, dest_literal, dest_base, copy_size);
@@ -178,6 +181,8 @@ template <typename T>
      return *LiteralUtil::CreateR0<int32>(0);
    case S64:
      return *LiteralUtil::CreateR0<int64>(0);
+   case F16:
+     return *LiteralUtil::CreateR0<half>(static_cast<half>(0.0f));
    case F32:
      return *LiteralUtil::CreateR0<float>(0);
    case F64:
@@ -187,8 +192,6 @@ template <typename T>
    case S16:
    case U16:
      LOG(FATAL) << "u16/s16 literals not yet implemented";
-   case F16:
-     LOG(FATAL) << "f16 literals not yet implemented";
    case TUPLE:
      LOG(FATAL) << "tuple element type cannot take on value of 0";
    case OPAQUE:
@@ -222,7 +225,7 @@ template <typename T>
    case U16:
      LOG(FATAL) << "u16/s16 literals not yet implemented";
    case F16:
-     LOG(FATAL) << "f16 literals not yet implemented";
+     return *LiteralUtil::CreateR0<half>(static_cast<half>(1.0f));
    case TUPLE:
      LOG(FATAL) << "tuple element type cannot take on value of 1";
    case OPAQUE:
@@ -258,7 +261,8 @@ template <typename T>
    case U16:
      LOG(FATAL) << "u16/s16 literals not yet implemented";
    case F16:
-     LOG(FATAL) << "f16 literals not yet implemented";
+     return *LiteralUtil::CreateR0<half>(
+         static_cast<half>(-std::numeric_limits<float>::infinity()));
    case TUPLE:
      LOG(FATAL) << "tuple element type has no minimum value";
    case OPAQUE:
@@ -294,7 +298,8 @@ template <typename T>
    case U16:
      LOG(FATAL) << "u16/s16 literals not yet implemented";
    case F16:
-     LOG(FATAL) << "f16 literals not yet implemented";
+     return *LiteralUtil::CreateR0<half>(
+         static_cast<half>(std::numeric_limits<float>::infinity()));
    case TUPLE:
      LOG(FATAL) << "tuple element type has no maximum value";
    case OPAQUE:
@@ -498,6 +503,8 @@ template <typename T>
      return tensorflow::strings::StrCat(Get<float>(literal, multi_index));
    case F64:
      return tensorflow::strings::StrCat(Get<double>(literal, multi_index));
+   case F16:
+     return tensorflow::strings::StrCat(Get<half>(literal, multi_index));
    default:
      return tensorflow::strings::StrCat(
          "[", PrimitiveType_Name(literal.shape().element_type()), "]");
@@ -652,6 +659,8 @@ template <typename T>
      return reinterpret_cast<const void*>(literal.f32s().data());
    case F64:
      return reinterpret_cast<const void*>(literal.f64s().data());
+   case F16:
+     return reinterpret_cast<const void*>(literal.f16s().data());
    default:
      LOG(FATAL) << "primitive type not supported in literals: "
                 << PrimitiveType_Name(literal.shape().element_type());
@@ -692,6 +701,9 @@ template <typename T>
    case F64:
      Resize<double>(num_elements, 0, literal);
      break;
+   case F16:
+     Resize<half>(num_elements, static_cast<half>(0.0f), literal);
+     break;
    default:
      LOG(FATAL) << "primitive type not supported in literals: "
                 << PrimitiveType_Name(literal->shape().element_type());
@@ -728,6 +740,9 @@ template <typename T>
    case F64:
      actual = literal.f64s_size();
      break;
+   case F16:
+     actual = literal.f16s().size() / sizeof(half);
+     break;
    default:
      return tensorflow::errors::Unimplemented(
          "unhandled element type for literal validation: " +
@@ -818,6 +833,8 @@ bool EqualElements(const Literal& literal1, const Literal& literal2,
      return EqualElements<float>(literal1, literal2, 0, &multi_index);
    case F64:
      return EqualElements<double>(literal1, literal2, 0, &multi_index);
+   case F16:
+     return EqualElements<half>(literal1, literal2, 0, &multi_index);
    default:
      LOG(FATAL) << "Unimplemented: LiteralUtil::Equal for type "
                 << PrimitiveType_Name(literal1.shape().element_type());
@@ -916,6 +933,18 @@ LiteralUtil::GetMutableArraySlice(Literal* literal) {
                                                      values->size());
}

+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<half>
+LiteralUtil::GetMutableArraySlice<half>(Literal* literal) {
+  // C++11 standard, basic_string 21.4.1.5, values should be stored
+  // contiguously. From C++17 a mutable data() member will be provided.
+  // TODO - there is an endianess problem here. fix it, or wait for uint16
+  // support in protobuf
+  auto values = literal->mutable_f16s();
+  return tensorflow::gtl::MutableArraySlice<half>(
+      reinterpret_cast<half*>(&(*values)[0]), values->size() / sizeof(half));
+}
+
template <>
/* static */ tensorflow::gtl::ArraySlice<bool> LiteralUtil::GetArraySlice<bool>(
    const Literal& literal) {
@@ -976,6 +1005,15 @@ LiteralUtil::GetArraySlice<double>(const Literal& literal) {
  return literal.f64s();
}

+template <>
+/* static */ tensorflow::gtl::ArraySlice<half> LiteralUtil::GetArraySlice<half>(
+    const Literal& literal) {
+  CHECK_EQ(literal.shape().element_type(), F16);
+  return tensorflow::gtl::ArraySlice<half>(
+      reinterpret_cast<const half*>(literal.f16s().data()),
+      literal.f16s().size() / sizeof(half));
+}
+
template <typename NativeT>
static bool AllElementsEqualValue(const Literal& literal, NativeT value) {
  for (int64 i = 0; i < ShapeUtil::ElementsIn(literal.shape()); ++i) {
@@ -1015,6 +1053,8 @@ static bool AllElementsEqualValue(const Literal& literal, NativeT value) {
      return AllElementsEqualValue<float>(literal, value);
    case F64:
      return AllElementsEqualValue<double>(literal, value);
+   case F16:
+     return AllElementsEqualValue<half>(literal, static_cast<half>(value));
    case PRED:
      if (value == 0) {
        return AllElementsEqualValue<bool>(literal, false);
@@ -1034,6 +1074,8 @@ static bool AllElementsEqualValue(const Literal& literal, NativeT value) {
      return AllElementsEqualValue<float>(literal, value);
    case F64:
      return AllElementsEqualValue<double>(literal, value);
+   case F16:
+     return AllElementsEqualValue<half>(literal, static_cast<half>(value));
    default:
      return false;
  }
@@ -1058,6 +1100,8 @@ static bool AllElementsEqualValue(const Literal& literal, NativeT value) {
      return Get<float>(literal, indices) == 0.0f;
    case F64:
      return Get<double>(literal, indices) == 0.0;
+   case F16:
+     return Get<half>(literal, indices) == static_cast<half>(0.0f);
    case PRED:
      return Get<bool>(literal, indices) == false;
    default:
@@ -1128,4 +1172,15 @@ template <>
  literal->mutable_f64s()->Resize(num_elements, value);
}

+template <>
+/* static */ void LiteralUtil::Resize<half>(int64 num_elements, half value,
+                                            Literal* literal) {
+  CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
+  literal->mutable_f16s()->resize(num_elements * sizeof(half));
+  auto data = GetMutableArraySlice<half>(literal);
+  for (int i = 0; i < num_elements; i++) {
+    data[i] = value;
+  }
+}
+
}  // namespace xla
@@ -505,6 +505,10 @@ template <>
/* static */ tensorflow::gtl::ArraySlice<double>
LiteralUtil::GetArraySlice<double>(const Literal& literal);

+template <>
+/* static */ tensorflow::gtl::ArraySlice<half> LiteralUtil::GetArraySlice<half>(
+    const Literal& literal);
+
template <>
/* static */ tensorflow::gtl::MutableArraySlice<bool>
LiteralUtil::GetMutableArraySlice(Literal* literal);
@@ -541,6 +545,50 @@ template <>
/* static */ tensorflow::gtl::MutableArraySlice<double>
LiteralUtil::GetMutableArraySlice(Literal* literal);

+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<half>
+LiteralUtil::GetMutableArraySlice(Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<bool>(int64 num_elements, bool value, Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<int8>(int64 num_elements, int8 value, Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<uint8>(int64 num_elements, uint8 value, Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<int32>(int64 num_elements, int32 value, Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<uint32>(int64 num_elements, uint32 value, Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<int64>(int64 num_elements, int64 value, Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<uint64>(int64 num_elements, uint64 value, Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<float>(int64 num_elements, float value, Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<double>(int64 num_elements, double value, Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<half>(int64 num_elements, half value, Literal* literal);
+
template <typename NativeT>
/* static */ std::unique_ptr<Literal> LiteralUtil::CreateR0(NativeT value) {
  auto literal = MakeUnique<Literal>();
@@ -770,6 +818,14 @@ template <>
  return literal.u8s()[linear_index];
}

+template <>
+/* static */ inline half LiteralUtil::Get<half>(
+    const Literal& literal, tensorflow::gtl::ArraySlice<int64> multi_index) {
+  CHECK(literal.shape().element_type() == F16);
+  int64 linear_index = LinearIndex(literal, multi_index);
+  return GetArraySlice<half>(literal)[linear_index];
+}
+
template <typename NativeT>
/* static */ void LiteralUtil::Set(
    Literal* literal, tensorflow::gtl::ArraySlice<int64> multi_index,
@@ -834,76 +890,12 @@ template <typename NativeT>
  } while (IndexUtil::BumpIndices(literal.shape(), &indices));
}

-template <>
-/* static */ inline void LiteralUtil::PopulateR0<bool>(bool value, Literal* literal) {
-  *literal->mutable_shape() =
-      ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<bool>(), {});
-  literal->mutable_preds()->Add(value);
-}
-
-template <>
-/* static */ inline void LiteralUtil::PopulateR0<uint8>(uint8 value, Literal* literal) {
-  *literal->mutable_shape() =
-      ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<uint8>(), {});
-  literal->mutable_u8s()->push_back(value);
-}
-
-template <>
-/* static */ inline void LiteralUtil::PopulateR0<int8>(int8 value, Literal* literal) {
-  *literal->mutable_shape() =
-      ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<int8>(), {});
-  literal->mutable_u8s()->push_back(value);
-}
-
-template <>
-/* static */ inline void LiteralUtil::PopulateR0<uint32>(uint32 value, Literal* literal) {
-  *literal->mutable_shape() =
-      ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<uint32>(), {});
-  literal->mutable_u32s()->Add(value);
-}
-
-template <>
-/* static */ inline void LiteralUtil::PopulateR0<int32>(int32 value, Literal* literal) {
-  *literal->mutable_shape() =
-      ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<int32>(), {});
-  literal->mutable_s32s()->Add(value);
-}
-
-template <>
-/* static */ inline void LiteralUtil::PopulateR0<uint64>(uint64 value, Literal* literal) {
-  *literal->mutable_shape() =
-      ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<uint64>(), {});
-  literal->mutable_u64s()->Add(value);
-}
-
-template <>
-/* static */ inline void LiteralUtil::PopulateR0<int64>(int64 value, Literal* literal) {
-  *literal->mutable_shape() =
-      ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<int64>(), {});
-  literal->mutable_s64s()->Add(value);
-}
-
-template <>
-/* static */ inline void LiteralUtil::PopulateR0<float>(float value, Literal* literal) {
-  *literal->mutable_shape() =
-      ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<float>(), {});
-  literal->mutable_f32s()->Add(value);
-}
-
-template <>
-/* static */ inline void LiteralUtil::PopulateR0<double>(double value, Literal* literal) {
-  *literal->mutable_shape() =
-      ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<double>(), {});
-  literal->mutable_f64s()->Add(value);
-}
+template <typename NativeT>
+/* static */ inline void LiteralUtil::PopulateR0(NativeT value, Literal* literal) {
+  *literal->mutable_shape() = ShapeUtil::MakeShape(
+      primitive_util::NativeToPrimitiveType<NativeT>(), {});
+  Resize<NativeT>(1, value, literal);
+}

template <typename NativeT>
@@ -1116,42 +1108,6 @@ template <typename NativeSrcT, typename NativeDestT>
  return result_literal;
}

-template <>
-/* static */ void LiteralUtil::Resize<bool>(int64 num_elements, bool value, Literal* literal);
-
-template <>
-/* static */ void LiteralUtil::Resize<int8>(int64 num_elements, int8 value, Literal* literal);
-
-template <>
-/* static */ void LiteralUtil::Resize<uint8>(int64 num_elements, uint8 value, Literal* literal);
-
-template <>
-/* static */ void LiteralUtil::Resize<int32>(int64 num_elements, int32 value, Literal* literal);
-
-template <>
-/* static */ void LiteralUtil::Resize<uint32>(int64 num_elements, uint32 value, Literal* literal);
-
-template <>
-/* static */ void LiteralUtil::Resize<int64>(int64 num_elements, int64 value, Literal* literal);
-
-template <>
-/* static */ void LiteralUtil::Resize<uint64>(int64 num_elements, uint64 value, Literal* literal);
-
-template <>
-/* static */ void LiteralUtil::Resize<float>(int64 num_elements, float value, Literal* literal);
-
-template <>
-/* static */ void LiteralUtil::Resize<double>(int64 num_elements, double value, Literal* literal);
-
template <typename NativeT>
/* static */ std::unique_ptr<Literal>
LiteralUtil::CreateFullWithMonotonicDim0MajorLayout(
@@ -105,6 +105,9 @@ TEST_F(LiteralUtilTest, LiteralScalarToString) {

  auto f32_lit = LiteralUtil::CreateR0<float>(3.14f);
  ASSERT_EQ("3.14", LiteralUtil::ToString(*f32_lit));
+
+  auto f16_lit = LiteralUtil::CreateR0<half>(static_cast<half>(0.5f));
+  ASSERT_EQ("0.5", LiteralUtil::ToString(*f16_lit));
}

TEST_F(LiteralUtilTest, LiteralVectorToString) {
@@ -373,6 +376,15 @@ TEST_F(LiteralUtilTest, IsAll) {
  EXPECT_FALSE(
      LiteralUtil::IsAll(*LiteralUtil::CreateR2<uint64>({{9, 8}, {8, 8}}), 8));

+  half h8(8.0f);
+  half h9(9.0f);
+  EXPECT_TRUE(
+      LiteralUtil::IsAll(*LiteralUtil::CreateR2<half>({{h8}, {h8}}), 8));
+  EXPECT_FALSE(
+      LiteralUtil::IsAll(*LiteralUtil::CreateR2<half>({{h8}, {h9}}), 8));
+  EXPECT_FALSE(
+      LiteralUtil::IsAll(*LiteralUtil::CreateR2<half>({{h9}, {h8}}), 8));
+
  auto uint64_max = std::numeric_limits<uint64>::max();
  EXPECT_FALSE(LiteralUtil::IsAll(
      *LiteralUtil::CreateR2<uint64>(
@@ -659,6 +671,30 @@ TEST_F(LiteralUtilTest, PopulateWithValueR2U64) {
  EXPECT_TRUE(LiteralUtil::Equal(output, *expected));
}

+TEST_F(LiteralUtilTest, PopulateWithValueR0F16) {
+  Literal output;
+  half h(0.25f);
+  LiteralUtil::PopulateWithValue<half>(h, {}, &output);
+  auto expected = LiteralUtil::CreateR0<half>(h);
+  EXPECT_TRUE(LiteralUtil::Equal(output, *expected));
+}
+
+TEST_F(LiteralUtilTest, PopulateWithValueR1F16) {
+  Literal output;
+  half h(0.5f);
+  LiteralUtil::PopulateWithValue<half>(h, {3}, &output);
+  auto expected = LiteralUtil::CreateR1<half>({h, h, h});
+  EXPECT_TRUE(LiteralUtil::Equal(output, *expected));
+}
+
+TEST_F(LiteralUtilTest, PopulateWithValueR2F16) {
+  Literal output;
+  half h(2.0f);
+  LiteralUtil::PopulateWithValue<half>(h, {2, 2}, &output);
+  auto expected = LiteralUtil::CreateR2<half>({{h, h}, {h, h}});
+  EXPECT_TRUE(LiteralUtil::Equal(output, *expected));
+}
+
TEST_F(LiteralUtilTest, ReplicateR2U32) {
  auto input = LiteralUtil::CreateR2<uint32>(
      {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}});
@@ -730,6 +766,41 @@ TEST_F(LiteralUtilTest, CopyScalars) {
  EXPECT_EQ(LiteralUtil::Get<uint32>(*vect, {4}), 17);
}

+TEST_F(LiteralUtilTest, F16) {
+  // Verify that the internal data views are consistent and that they
+  // are in little endian format
+  // TODO - modify if we make the data format machine endianess dependent
+  auto m1 = LiteralUtil::CreateFromShape(ShapeUtil::MakeShape(F16, {2, 2}));
+  Literal* l1 = m1.get();
+  const char* d1 = (const char*)LiteralUtil::InternalData(*l1);
+  EXPECT_EQ(d1[0], 0);
+  EXPECT_EQ(d1[1], 0);
+  EXPECT_EQ(d1[2], 0);
+  EXPECT_EQ(d1[3], 0);
+  EXPECT_EQ(d1[4], 0);
+  EXPECT_EQ(d1[5], 0);
+  EXPECT_EQ(d1[6], 0);
+  EXPECT_EQ(d1[7], 0);
+  EXPECT_EQ(LiteralUtil::InternalData(*l1),
+            LiteralUtil::MutableInternalData(l1));
+
+  half h1(1.0f);
+  half h2(2.0f);
+  auto m2 = LiteralUtil::CreateR2<half>({{h1, h2}, {h2, h1}});
+  Literal* l2 = m2.get();
+  const char* d2 = (const char*)LiteralUtil::InternalData(*l2);
+  EXPECT_EQ(d2[0], 0);
+  EXPECT_EQ(d2[1], 0x3C);
+  EXPECT_EQ(d2[2], 0);
+  EXPECT_EQ(d2[3], 0x40);
+  EXPECT_EQ(d2[4], 0);
+  EXPECT_EQ(d2[5], 0x40);
+  EXPECT_EQ(d2[6], 0);
+  EXPECT_EQ(d2[7], 0x3C);
+  EXPECT_EQ(LiteralUtil::InternalData(*l2),
+            LiteralUtil::MutableInternalData(l2));
+}
+
TEST_F(LiteralUtilTest, Populate) {
  struct PopulateData {
    std::vector<int64> dimensions;
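The byte values asserted in the F16 test above follow from the IEEE half-precision encoding stored little-endian: 1.0 encodes as 0x3C00 and 2.0 as 0x4000, so each value's low byte (0x00) comes first. A quick way to check this outside the test (illustrative Python, not part of the commit; requires Python 3.6+ for the half-float struct format):

import struct

print(struct.pack('<e', 1.0).hex())  # '003c' -> bytes 0x00, 0x3C
print(struct.pack('<e', 2.0).hex())  # '0040' -> bytes 0x00, 0x40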
@@ -78,6 +78,11 @@ PrimitiveType NativeToPrimitiveType<double>() {
  return F64;
}

+template <>
+PrimitiveType NativeToPrimitiveType<half>() {
+  return F16;
+}
+
bool IsFloatingPointType(PrimitiveType type) {
  return type == F16 || type == F32 || type == F64;
}

@@ -75,6 +75,8 @@ template <>
PrimitiveType NativeToPrimitiveType<float>();
template <>
PrimitiveType NativeToPrimitiveType<double>();
+template <>
+PrimitiveType NativeToPrimitiveType<half>();

bool IsFloatingPointType(PrimitiveType type);

@@ -150,6 +152,10 @@ template <>
struct PrimitiveTypeToNative<F64> {
  using type = double;
};
+template <>
+struct PrimitiveTypeToNative<F16> {
+  using type = half;
+};

}  // namespace primitive_util
}  // namespace xla
@@ -15,7 +15,6 @@ limitations under the License.

#include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"

-#include <sched.h>
#include <functional>

#include "tensorflow/core/platform/logging.h"
@@ -54,7 +54,7 @@ void MatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m,
  int lhs_contract_dim = transpose_lhs ? 0 : 1;
  int rhs_contract_dim = transpose_rhs ? 1 : 0;
  const Eigen::array<DimPair, 1> dims(
-      DimPair(lhs_contract_dim, rhs_contract_dim));
+      {DimPair(lhs_contract_dim, rhs_contract_dim)});

  // Matrix multiply is a special case of the "contract" operation where
  // the contraction is performed along dimension 1 of the lhs and dimension

@@ -48,7 +48,7 @@ void MatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m,
  int lhs_contract_dim = transpose_lhs ? 0 : 1;
  int rhs_contract_dim = transpose_rhs ? 1 : 0;
  const Eigen::array<DimPair, 1> dims(
-      DimPair(lhs_contract_dim, rhs_contract_dim));
+      {DimPair(lhs_contract_dim, rhs_contract_dim)});

  // Matrix multiply is a special case of the "contract" operation where
  // the contraction is performed along dimension 1 of the lhs and dimension
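The `dims` array built here only tells Eigen which axes to contract; the fix changes the initializer syntax, not the semantics. As a plain NumPy illustration of the same idea (not the Eigen code itself, and the helper name is made up), a matrix multiply is a contraction over dimension 1 of the lhs and dimension 0 of the rhs, with the transpose flags simply moving which axis gets contracted:

import numpy as np

def matmul_as_contraction(lhs, rhs, transpose_lhs=False, transpose_rhs=False):
    lhs_contract_dim = 0 if transpose_lhs else 1
    rhs_contract_dim = 1 if transpose_rhs else 0
    return np.tensordot(lhs, rhs, axes=([lhs_contract_dim], [rhs_contract_dim]))

a = np.arange(6.0).reshape(2, 3)
b = np.arange(12.0).reshape(3, 4)
assert np.allclose(matmul_as_contraction(a, b), a @ b)
assert np.allclose(matmul_as_contraction(a.T, b, transpose_lhs=True), a @ b)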
@@ -113,7 +113,7 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitMathCall(
    tensorflow::gtl::ArraySlice<llvm::Value*> operands,
    tensorflow::gtl::ArraySlice<PrimitiveType> input_types,
    PrimitiveType output_type) const {
-  // Binary math functions tranform are of type [T] -> T.
+  // Binary math functions transform are of type [T] -> T.
  for (PrimitiveType input_type : input_types) {
    if (output_type != input_type) {
      return Unimplemented("Input type ≠ output type: %s ≠ %s",

@@ -399,7 +399,7 @@ Status IrEmitter::HandleDot(HloInstruction* dot,
  llvm::Type* accum_type = target_array.GetElementLlvmType();
  llvm::Value* accum_address = llvm_ir::EmitAllocaAtFunctionEntry(
      accum_type,       // The pointee type of the alloca instruction.
-      "accum_address",  // The name of the alloca instuction.
+      "accum_address",  // The name of the alloca instruction.
      &ir_builder_);

  // Initialize the accumulator in the preheader to zero.

@@ -396,7 +396,7 @@ StatusOr<string> CompileModuleToPtx(llvm::Module* module,

  // The LLVM IR verifier performs sanity checking on the IR. This helps
  // discover problems and report them in a meaningful manner, rather than let
-  // later passes report obscure assertions becasue of unfulfilled invariants.
+  // later passes report obscure assertions because of unfulfilled invariants.
  module_passes.add(llvm::createVerifierPass());

  // Create the function-level pass manager. It needs data layout information

@@ -168,7 +168,7 @@ class MatcherBase {
  virtual ~MatcherBase() {}

  // Attempts to match each ExprTree in 'expr_trees_'.
-  // Returns OK on the first succesful match, error status otherwise.
+  // Returns OK on the first successful match, error status otherwise.
  virtual tensorflow::Status Run() {
    Status status;
    for (const ExprTree& expr_tree : expr_trees_) {

@@ -1081,7 +1081,7 @@ StatusOr<Layout> InferArrayLayout(
                              *first_buffer_layout)) {
      // The points-to set is ambiguous for this index and the different source
      // buffers have different layouts. This case is possible in valid XLA
-      // computations because we do not propagate BufferLayoutConstaints to all
+      // computations because we do not propagate BufferLayoutConstraints to all
      // LogicalBuffers which may alias the constrained LogicalBuffer at some
      // point in the computation.
      return FailedPrecondition(

@@ -1294,7 +1294,7 @@ Status LayoutAssignment::RunOnComputation(
  TF_ASSIGN_OR_RETURN(auto points_to_analysis,
                      TuplePointsToAnalysis::Run(computation->parent()));

-  // Construct LayoutConstaints with all layout constraints of the computation.
+  // Construct LayoutConstraints with all layout constraints of the computation.
  LayoutConstraints constraints(*points_to_analysis, computation);

  // Add constraints required for correctness on all backends (eg, entry
@@ -29,23 +29,21 @@ limitations under the License.
#include "tensorflow/compiler/xla/tests/test_macros.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/dynamic_annotations.h"
+#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/test.h"

-extern "C" void __attribute__((visibility("default")))
-R0F32Add2(float* out, float** in) {
+extern "C" void TF_EXPORT R0F32Add2(float* out, float** in) {
  TF_ANNOTATE_MEMORY_IS_INITIALIZED(in, sizeof(float*));
  *out = **in + 2.0f;
}

-extern "C" void __attribute__((visibility("default")))
-R2F32ReduceSum(float* out, float** in) {
+extern "C" void TF_EXPORT R2F32ReduceSum(float* out, float** in) {
  TF_ANNOTATE_MEMORY_IS_INITIALIZED(in, sizeof(float) * 4);
  float* array = in[0];
  *out = array[0] + array[1] + array[2] + array[3];
}

-extern "C" void __attribute__((visibility("default")))
-Add1ToValues(float* out, float** in) {
+extern "C" void TF_EXPORT Add1ToValues(float* out, float** in) {
  TF_ANNOTATE_MEMORY_IS_INITIALIZED(in, sizeof(float) * 4);
  float* array = in[0];
  out[0] = array[0] + 1;
@@ -52,7 +52,7 @@ TEST_F(DeallocationTest, DeallocateScalar) {
  builder.ConstantR0<float>(42.0);
  auto global_data = ExecuteAndCheckTransfer(&builder, {});

-  // A result can be transfered an arbitrary number of times. Add an extra
+  // A result can be transferred an arbitrary number of times. Add an extra
  // transfer here so we're not just testing that a second call to Transfer
  // fails.
  ASSERT_IS_OK(client_->Transfer(*global_data).status());

@@ -314,7 +314,7 @@ class NearComparator {

 private:
  // EXPECTs that the two given scalar values are within the error bound. Keeps
-  // track of how many mismatches have occured to keep the size of the output
+  // track of how many mismatches have occurred to keep the size of the output
  // manageable.
  template <typename NativeT>
  bool ExpectValuesNear(NativeT expected, NativeT actual) {

@@ -61,7 +61,7 @@ namespace {
class ReduceTest : public ClientLibraryTestBase {
 protected:
  ReduceTest() {
-    // Implementation note: layed out z >> y >> x by default.
+    // Implementation note: laid out z >> y >> x by default.
    // clang-format off
    literal_2d_ = LiteralUtil::CreateR2<float>({
        // x0 x1 x2
@@ -16,6 +16,7 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_XLA_TYPES_H_
#define TENSORFLOW_COMPILER_XLA_TYPES_H_

+#include "third_party/eigen3/Eigen/Core"
#include "tensorflow/core/platform/types.h"

namespace xla {
@@ -32,6 +33,8 @@ using ::tensorflow::uint16;
using ::tensorflow::uint32;
using ::tensorflow::uint64;

+using ::Eigen::half;
+
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_TYPES_H_
@@ -286,6 +286,7 @@ message Literal {
  repeated float f32s = 8;
  repeated double f64s = 9;
  repeated Literal tuple_literals = 10;
+  bytes f16s = 11;  // Note: the F16s are encoded in little endian byte order
}

message WindowDimension {
@@ -75,7 +75,7 @@ TEST_F(WeightedQuantilesSummaryTest, BuildFromBuffer) {
  Summary summary;
  summary.BuildFromBufferEntries(buffer1_->GenerateEntryList());

-  // We expect no approximation error because no compress operation occured.
+  // We expect no approximation error because no compress operation occurred.
  EXPECT_EQ(summary.ApproximationError(), 0);

  // Check first and last elements in the summary.
@@ -61,15 +61,18 @@ add_definitions(-DEIGEN_AVOID_STL_ARRAY)
if(WIN32)
  add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC)
  add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64 -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS)
-  add_definitions(-DTENSORFLOW_USE_EIGEN_THREADPOOL -DEIGEN_HAS_C99_MATH -D_ITERATOR_DEBUG_LEVEL=0)
+  add_definitions(-DTENSORFLOW_USE_EIGEN_THREADPOOL -DEIGEN_HAS_C99_MATH)
  add_definitions(-DTF_COMPILE_LIBRARY)
  add_definitions(-DNDEBUG /O2) # Equivalent of -c opt in Bazel.
  add_definitions(/bigobj /nologo /EHsc /GF /FC /MP /Gm-)
  # Suppress warnings to reduce build log size.
  add_definitions(/wd4267 /wd4244 /wd4800 /wd4503 /wd4554 /wd4996 /wd4348 /wd4018)
  add_definitions(/wd4099 /wd4146 /wd4267 /wd4305 /wd4307)
  add_definitions(/wd4715 /wd4722 /wd4723 /wd4838 /wd4309 /wd4334)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
+  set(CMAKE_CXX_FLAGS_DEBUG "/D_DEBUG /MDd /Ob0")
+  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /D_ITERATOR_DEBUG_LEVEL=0")
+  set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /D_ITERATOR_DEBUG_LEVEL=0")
+  set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /D_ITERATOR_DEBUG_LEVEL=0")
endif()

if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
@@ -21,7 +21,7 @@ set(googletest_TAG ec44c6c1675c25b9827aacd08c02433cccde7780)

if(WIN32)
  set(googletest_STATIC_LIBRARIES
-      ${CMAKE_CURRENT_BINARY_DIR}/googletest/src/googletest/googletest/${CMAKE_BUILD_TYPE}/gtest.lib)
+      ${CMAKE_CURRENT_BINARY_DIR}/googletest/src/googletest/googletest/$(Configuration)/gtest.lib)
else()
  set(googletest_STATIC_LIBRARIES
      ${CMAKE_CURRENT_BINARY_DIR}/googletest/src/googletest/googletest/${CMAKE_BUILD_TYPE}/gtest.a)
tensorflow/contrib/cmake/external/grpc.cmake (vendored, 6 lines changed)
@@ -21,9 +21,9 @@ set(GRPC_TAG 3bc78cd0b5bd784a235c01612d634b1ec5f8fb97)

if(WIN32)
  set(grpc_STATIC_LIBRARIES
-      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/${CMAKE_BUILD_TYPE}/grpc++_unsecure.lib
-      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/${CMAKE_BUILD_TYPE}/grpc_unsecure.lib
-      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/${CMAKE_BUILD_TYPE}/gpr.lib)
+      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/$(Configuration)/grpc++_unsecure.lib
+      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/$(Configuration)/grpc_unsecure.lib
+      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/$(Configuration)/gpr.lib)
else()
  set(grpc_STATIC_LIBRARIES
      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc++_unsecure.a
@@ -23,7 +23,7 @@ set(jsoncpp_LIBRARIES ${jsoncpp_BUILD}/obj/so/libjsoncpp.so)
set(jsoncpp_INCLUDES ${jsoncpp_BUILD})

if(WIN32)
-  set(jsoncpp_STATIC_LIBRARIES ${jsoncpp_BUILD}/${CMAKE_BUILD_TYPE}/jsoncpp.lib)
+  set(jsoncpp_STATIC_LIBRARIES ${jsoncpp_BUILD}/$(Configuration)/jsoncpp.lib)
else()
  set(jsoncpp_STATIC_LIBRARIES ${jsoncpp_BUILD}/libjsoncpp.a)
endif()
tensorflow/contrib/cmake/external/png.cmake (vendored, 4 lines changed)
@@ -21,7 +21,9 @@ set(png_BUILD ${CMAKE_BINARY_DIR}/png/src/png)
set(png_INSTALL ${CMAKE_BINARY_DIR}/png/install)

if(WIN32)
-  set(png_STATIC_LIBRARIES ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_static.lib)
+  set(png_STATIC_LIBRARIES
+      debug ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_staticd.lib
+      optimized ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_static.lib)
else()
  set(png_STATIC_LIBRARIES ${CMAKE_BINARY_DIR}/png/install/lib/libpng12.a)
endif()
@@ -19,8 +19,10 @@ set(PROTOBUF_URL https://github.com/mrry/protobuf.git) # Includes MSVC fix.
set(PROTOBUF_TAG 1d2c7b6c7376f396c8c7dd9b6afd2d4f83f3cb05)

if(WIN32)
-  set(protobuf_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/${CMAKE_BUILD_TYPE}/libprotobuf.lib)
-  set(PROTOBUF_PROTOC_EXECUTABLE ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/${CMAKE_BUILD_TYPE}/protoc.exe)
+  set(protobuf_STATIC_LIBRARIES
+      debug ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/$(Configuration)/libprotobufd.lib
+      optimized ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/$(Configuration)/libprotobuf.lib)
+  set(PROTOBUF_PROTOC_EXECUTABLE ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/$(Configuration)/protoc.exe)
  set(PROTOBUF_ADDITIONAL_CMAKE_OPTIONS -Dprotobuf_MSVC_STATIC_RUNTIME:BOOL=OFF -A x64)
else()
  set(protobuf_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/libprotobuf.a)
tensorflow/contrib/cmake/external/zlib.cmake (vendored, 3 lines changed)
@@ -22,7 +22,8 @@ set(ZLIB_TAG 50893291621658f355bc5b4d450a8d06a563053d)

if(WIN32)
  set(zlib_STATIC_LIBRARIES
-      ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
+      debug ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstaticd.lib
+      optimized ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
else()
  set(zlib_STATIC_LIBRARIES
      ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/libz.a)
@@ -861,9 +861,9 @@ add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
                                 ${CMAKE_CURRENT_BINARY_DIR}/tf_python/)
if(WIN32)
  add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.dll
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.dll
            ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
-    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.lib
            ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
else()
  add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
@@ -208,6 +208,7 @@ if (tensorflow_BUILD_PYTHON_TESTS)
      "${tensorflow_source_dir}/tensorflow/tensorboard/lib/python/http_util_test.py"
      # Broken tensorboard test due to cmake issues.
      "${tensorflow_source_dir}/tensorflow/tensorboard/plugins/debugger/plugin_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py"
      # tensor_forest tests (also note that we exclude the hybrid tests for now)
      "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/python/kernel_tests/count_extremely_random_stats_op_test.py" # Results in wrong order.
      "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/python/kernel_tests/sample_inputs_op_test.py" # Results in wrong order.
@@ -138,6 +138,7 @@ tf_py_test(
        "//tensorflow/python:platform_test",
    ],
    tags = [
+        "no_pip",  # b/38283730
        "notsan",  # Flaky: b/30756419
    ],
)
@@ -70,8 +70,7 @@ bool IsBinaryInstalled(const string& binary_name) {
    const string binary_path = io::JoinPath(dir, binary_name);
    char absolute_path[PATH_MAX + 1];
    if (::realpath(binary_path.c_str(), absolute_path) == NULL) {
-      LOG(ERROR) << "Invalid binary path: " << binary_path;
-      return false;
+      continue;
    }
    struct stat statinfo;
    int result = ::stat(absolute_path, &statinfo);
@@ -29,6 +29,7 @@ from tensorflow.contrib.keras.python.keras.callbacks import ModelCheckpoint
from tensorflow.contrib.keras.python.keras.callbacks import ProgbarLogger
from tensorflow.contrib.keras.python.keras.callbacks import ReduceLROnPlateau
from tensorflow.contrib.keras.python.keras.callbacks import RemoteMonitor
+from tensorflow.contrib.keras.python.keras.callbacks import TensorBoard

del absolute_import
del division
@ -597,8 +597,8 @@ class DynamicRnnEstimator(estimator.Estimator):
|
||||
`ProblemType.CLASSIFICATION` or `ProblemType.LINEAR_REGRESSION`.
|
||||
prediction_type: whether the `Estimator` should return a value for each
|
||||
step in the sequence, or just a single value for the final time step.
|
||||
Must be one of `ProblemType.SINGLE_VALUE` or
|
||||
`ProblemType.MULTIPLE_VALUE`.
|
||||
Must be one of `PredictionType.SINGLE_VALUE` or
|
||||
`PredictionType.MULTIPLE_VALUE`.
|
||||
sequence_feature_columns: An iterable containing all the feature columns
|
||||
describing sequence features. All items in the iterable should be
|
||||
instances of classes derived from `FeatureColumn`.
|
||||
|
@ -331,14 +331,21 @@ def _write_dict_to_summary(output_dir,
|
||||
for key in dictionary:
|
||||
if dictionary[key] is None:
|
||||
continue
|
||||
if key == 'global_step':
|
||||
continue
|
||||
value = summary_proto.value.add()
|
||||
value.tag = key
|
||||
if (isinstance(dictionary[key], np.float32) or
|
||||
isinstance(dictionary[key], float)):
|
||||
value.simple_value = float(dictionary[key])
|
||||
elif (isinstance(dictionary[key], np.int64) or
|
||||
isinstance(dictionary[key], np.int32) or
|
||||
isinstance(dictionary[key], int)):
|
||||
value.simple_value = int(dictionary[key])
|
||||
else:
|
||||
logging.warn('Skipping summary for %s, must be a float or np.float32.',
|
||||
key)
|
||||
logging.warn(
|
||||
'Skipping summary for %s, must be a float, np.float32, np.int64, np.int32 or int.',
|
||||
key)
|
||||
summary_writer.add_summary(summary_proto, current_global_step)
|
||||
summary_writer.flush()
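The hunk above widens the summary writer so integer metrics (np.int64, np.int32, plain int) are written as simple values instead of being skipped. Below is a minimal standalone sketch of that type dispatch; the Summary proto is replaced by a plain dict so it runs without TensorFlow, and the function name is illustrative, not the library API.

import numpy as np

def dict_to_simple_values(dictionary):
    # Mirrors the float/int dispatch added to _write_dict_to_summary above.
    values = {}
    for key, val in dictionary.items():
        if val is None or key == 'global_step':
            continue
        if isinstance(val, (np.float32, float)):
            values[key] = float(val)
        elif isinstance(val, (np.int64, np.int32, int)):
            values[key] = int(val)
        else:
            print('Skipping summary for %s, must be a float, np.float32, '
                  'np.int64, np.int32 or int.' % key)
    return values

print(dict_to_simple_values(
    {'loss': np.float32(0.25), 'step': np.int64(10), 'name': 'model'}))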
|
||||
|
||||
|
@ -110,6 +110,7 @@ tensorflow/core/kernels/fake_quant_ops.cc
|
||||
tensorflow/core/kernels/example_parsing_ops.cc
|
||||
tensorflow/core/kernels/dynamic_stitch_op.cc
|
||||
tensorflow/core/kernels/dynamic_partition_op.cc
|
||||
tensorflow/core/kernels/decode_bmp_op.cc
|
||||
tensorflow/core/kernels/depthtospace_op.cc
|
||||
tensorflow/core/kernels/spacetodepth_op.cc
|
||||
tensorflow/core/kernels/dense_update_ops.cc
|
||||
|
@ -18,6 +18,7 @@ py_library(
|
||||
"python/training/external_optimizer.py",
|
||||
"python/training/lazy_adam_optimizer.py",
|
||||
"python/training/moving_average_optimizer.py",
|
||||
"python/training/nadam_optimizer.py",
|
||||
"python/training/variable_clipping_optimizer.py",
|
||||
],
|
||||
srcs_version = "PY2AND3",
|
||||
@ -106,6 +107,23 @@ py_test(
|
||||
],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "nadam_optimizer_test",
|
||||
srcs = ["python/training/nadam_optimizer_test.py"],
|
||||
srcs_version = "PY2AND3",
|
||||
deps = [
|
||||
":opt_py",
|
||||
"//tensorflow/python:array_ops",
|
||||
"//tensorflow/python:client_testlib",
|
||||
"//tensorflow/python:constant_op",
|
||||
"//tensorflow/python:dtypes",
|
||||
"//tensorflow/python:framework_ops",
|
||||
"//tensorflow/python:math_ops",
|
||||
"//tensorflow/python:variables",
|
||||
"//third_party/py/numpy",
|
||||
],
|
||||
)
|
||||
|
||||
tf_py_test(
|
||||
name = "drop_stale_gradient_optimizer_test",
|
||||
srcs = ["python/training/drop_stale_gradient_optimizer_test.py"],
|
||||
|
@ -23,16 +23,16 @@ from tensorflow.contrib.opt.python.training.drop_stale_gradient_optimizer import
|
||||
from tensorflow.contrib.opt.python.training.external_optimizer import *
|
||||
from tensorflow.contrib.opt.python.training.lazy_adam_optimizer import *
|
||||
from tensorflow.contrib.opt.python.training.moving_average_optimizer import *
|
||||
from tensorflow.contrib.opt.python.training.nadam_optimizer import *
|
||||
from tensorflow.contrib.opt.python.training.variable_clipping_optimizer import *
|
||||
# pylint: enable=wildcard-import
|
||||
|
||||
from tensorflow.python.util.all_util import remove_undocumented
|
||||
|
||||
_allowed_symbols = ['DropStaleGradientOptimizer',
|
||||
'ExternalOptimizerInterface',
|
||||
'LazyAdamOptimizer',
|
||||
'MovingAverageOptimizer',
|
||||
'ScipyOptimizerInterface',
|
||||
'VariableClippingOptimizer']
|
||||
_allowed_symbols = [
|
||||
'DropStaleGradientOptimizer', 'ExternalOptimizerInterface',
|
||||
'LazyAdamOptimizer', 'NadamOptimizer', 'MovingAverageOptimizer',
|
||||
'ScipyOptimizerInterface', 'VariableClippingOptimizer'
|
||||
]
|
||||
|
||||
remove_undocumented(__name__, _allowed_symbols)
|
||||
|
@ -184,7 +184,7 @@ class DropStaleGradientOptimizerTest(test.TestCase):
|
||||
thread_0.join()
|
||||
thread_1.join()
|
||||
|
||||
# With 2 workers and max staleness set to 0, only cheif worker will update
|
||||
# With 2 workers and max staleness set to 0, only chief worker will update
|
||||
# var_0 and var_1.
|
||||
self.assertAllEqual(1, sessions[0].run(global_step))
|
||||
self.assertAllEqual(1.0, sessions[0].run(stale_counter))
|
||||
@ -250,7 +250,7 @@ class DropStaleGradientOptimizerTest(test.TestCase):
|
||||
thread_1.join()
|
||||
thread_2.join()
|
||||
|
||||
# With 3 workers and max staleness set to 0, only cheif worker will update
|
||||
# With 3 workers and max staleness set to 0, only chief worker will update
|
||||
# var_0 and var_1.
|
||||
self.assertAllEqual(1, sessions[0].run(global_step))
|
||||
self.assertAllEqual(2.0, sessions[0].run(stale_counter))
|
||||
|
93 tensorflow/contrib/opt/python/training/nadam_optimizer.py  Normal file
@ -0,0 +1,93 @@
|
||||
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Nadam for TensorFlow."""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from tensorflow.python.framework import ops
|
||||
from tensorflow.python.ops import control_flow_ops
|
||||
from tensorflow.python.ops import math_ops
|
||||
from tensorflow.python.ops import state_ops
|
||||
from tensorflow.python.training import adam
|
||||
from tensorflow.python.training import training_ops
|
||||
|
||||
|
||||
class NadamOptimizer(adam.AdamOptimizer):
|
||||
"""Optimizer that implements the Nadam algorithm.
|
||||
|
||||
See [Dozat, T., 2015](http://cs229.stanford.edu/proj2015/054_report.pdf).
|
||||
"""
|
||||
|
||||
def _apply_dense(self, grad, var):
|
||||
m = self.get_slot(var, "m")
|
||||
v = self.get_slot(var, "v")
|
||||
return training_ops.apply_adam(
|
||||
var,
|
||||
m,
|
||||
v,
|
||||
math_ops.cast(self._beta1_power, var.dtype.base_dtype),
|
||||
math_ops.cast(self._beta2_power, var.dtype.base_dtype),
|
||||
math_ops.cast(self._lr_t, var.dtype.base_dtype),
|
||||
math_ops.cast(self._beta1_t, var.dtype.base_dtype),
|
||||
math_ops.cast(self._beta2_t, var.dtype.base_dtype),
|
||||
math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
|
||||
grad,
|
||||
use_locking=self._use_locking,
|
||||
use_nesterov=True).op
|
||||
|
||||
def _resource_apply_dense(self, grad, var):
|
||||
m = self.get_slot(var, "m")
|
||||
v = self.get_slot(var, "v")
|
||||
return training_ops.resource_apply_adam(
|
||||
var.handle,
|
||||
m.handle,
|
||||
v.handle,
|
||||
math_ops.cast(self._beta1_power, grad.dtype.base_dtype),
|
||||
math_ops.cast(self._beta2_power, grad.dtype.base_dtype),
|
||||
math_ops.cast(self._lr_t, grad.dtype.base_dtype),
|
||||
math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
|
||||
math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
|
||||
math_ops.cast(self._epsilon_t, grad.dtype.base_dtype),
|
||||
grad,
|
||||
use_locking=self._use_locking,
|
||||
use_nesterov=True)
|
||||
|
||||
def _apply_sparse_shared(self, grad, var, indices, scatter_add):
|
||||
beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
|
||||
beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
|
||||
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
|
||||
beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
|
||||
beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
|
||||
epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
|
||||
lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
|
||||
# m_t = beta1 * m + (1 - beta1) * g_t
|
||||
m = self.get_slot(var, "m")
|
||||
m_scaled_g_values = grad * (1 - beta1_t)
|
||||
m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
|
||||
with ops.control_dependencies([m_t]):
|
||||
m_t = scatter_add(m, indices, m_scaled_g_values)
|
||||
# m_bar = (1 - beta1) * g_t + beta1 * m_t
|
||||
m_bar = m_scaled_g_values + beta1_t * m_t
|
||||
# v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
|
||||
v = self.get_slot(var, "v")
|
||||
v_scaled_g_values = (grad * grad) * (1 - beta2_t)
|
||||
v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
|
||||
with ops.control_dependencies([v_t]):
|
||||
v_t = scatter_add(v, indices, v_scaled_g_values)
|
||||
v_sqrt = math_ops.sqrt(v_t)
|
||||
var_update = state_ops.assign_sub(
|
||||
var, lr * m_bar / (v_sqrt + epsilon_t), use_locking=self._use_locking)
|
||||
return control_flow_ops.group(*[var_update, m_bar, v_t])
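For reference, the update that the sparse path above implements can be restated as follows (notation as in the cited report; this summary is not part of the diff):

\[ m_t = \beta_1 m_{t-1} + (1-\beta_1)\, g_t, \qquad \bar{m}_t = \beta_1 m_t + (1-\beta_1)\, g_t \]
\[ v_t = \beta_2 v_{t-1} + (1-\beta_2)\, g_t^2, \qquad \hat{\alpha}_t = \alpha\, \frac{\sqrt{1-\beta_2^t}}{1-\beta_1^t} \]
\[ \theta_t = \theta_{t-1} - \hat{\alpha}_t\, \frac{\bar{m}_t}{\sqrt{v_t} + \epsilon} \]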
|
159 tensorflow/contrib/opt/python/training/nadam_optimizer_test.py  Normal file
@ -0,0 +1,159 @@
|
||||
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Tests for Nadam."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
|
||||
from tensorflow.contrib.opt.python.training import nadam_optimizer
|
||||
from tensorflow.python.client import session
|
||||
from tensorflow.python.framework import constant_op
|
||||
from tensorflow.python.framework import dtypes
|
||||
from tensorflow.python.framework import ops
|
||||
from tensorflow.python.ops import array_ops
|
||||
from tensorflow.python.ops import math_ops
|
||||
from tensorflow.python.ops import resource_variable_ops
|
||||
from tensorflow.python.ops import variables
|
||||
from tensorflow.python.platform import test
|
||||
|
||||
|
||||
def nadam_update_numpy(param,
|
||||
g_t,
|
||||
t,
|
||||
m,
|
||||
v,
|
||||
alpha=0.001,
|
||||
beta1=0.9,
|
||||
beta2=0.999,
|
||||
epsilon=1e-8):
|
||||
alpha_t = alpha * np.sqrt(1 - beta2**t) / (1 - beta1**t)
|
||||
|
||||
m_t = beta1 * m + (1 - beta1) * g_t
|
||||
v_t = beta2 * v + (1 - beta2) * g_t * g_t
|
||||
|
||||
m_bar = (1 - beta1) * g_t + beta1 * m_t
|
||||
|
||||
param_t = param - alpha_t * m_bar / (np.sqrt(v_t) + epsilon)
|
||||
return param_t, m_t, v_t
|
||||
|
||||
|
||||
class NadamOptimizerTest(test.TestCase):
|
||||
|
||||
def doTestSparse(self, use_resource=False):
|
||||
for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
|
||||
with self.test_session():
|
||||
# Initialize variables for numpy implementation.
|
||||
m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
|
||||
var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
|
||||
grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
|
||||
var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
|
||||
grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
|
||||
|
||||
if use_resource:
|
||||
var0 = resource_variable_ops.ResourceVariable(var0_np)
|
||||
var1 = resource_variable_ops.ResourceVariable(var1_np)
|
||||
else:
|
||||
var0 = variables.Variable(var0_np)
|
||||
var1 = variables.Variable(var1_np)
|
||||
grads0_np_indices = np.array([0, 1], dtype=np.int32)
|
||||
grads0 = ops.IndexedSlices(
|
||||
constant_op.constant(grads0_np),
|
||||
constant_op.constant(grads0_np_indices), constant_op.constant([2]))
|
||||
grads1_np_indices = np.array([0, 1], dtype=np.int32)
|
||||
grads1 = ops.IndexedSlices(
|
||||
constant_op.constant(grads1_np),
|
||||
constant_op.constant(grads1_np_indices), constant_op.constant([2]))
|
||||
opt = nadam_optimizer.NadamOptimizer()
|
||||
update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
|
||||
variables.global_variables_initializer().run()
|
||||
|
||||
# Fetch params to validate initial values
|
||||
self.assertAllClose([1.0, 2.0], var0.eval())
|
||||
self.assertAllClose([3.0, 4.0], var1.eval())
|
||||
|
||||
beta1_power, beta2_power = opt._get_beta_accumulators()
|
||||
|
||||
# Run 3 steps of Nadam
|
||||
for t in range(1, 4):
|
||||
self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
|
||||
self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
|
||||
update.run()
|
||||
|
||||
var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0)
|
||||
var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1)
|
||||
|
||||
# Validate updated params
|
||||
self.assertAllCloseAccordingToType(var0_np, var0.eval())
|
||||
self.assertAllCloseAccordingToType(var1_np, var1.eval())
|
||||
|
||||
def testSparse(self):
|
||||
self.doTestSparse(use_resource=False)
|
||||
|
||||
def testResourceSparse(self):
|
||||
self.doTestSparse(use_resource=True)
|
||||
|
||||
def doTestBasic(self, use_resource=False):
|
||||
for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
|
||||
with self.test_session():
|
||||
# Initialize variables for numpy implementation.
|
||||
m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
|
||||
var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
|
||||
grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
|
||||
var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
|
||||
grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
|
||||
|
||||
if use_resource:
|
||||
var0 = resource_variable_ops.ResourceVariable(var0_np)
|
||||
var1 = resource_variable_ops.ResourceVariable(var1_np)
|
||||
else:
|
||||
var0 = variables.Variable(var0_np)
|
||||
var1 = variables.Variable(var1_np)
|
||||
grads0 = constant_op.constant(grads0_np)
|
||||
grads1 = constant_op.constant(grads1_np)
|
||||
opt = nadam_optimizer.NadamOptimizer()
|
||||
update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
|
||||
variables.global_variables_initializer().run()
|
||||
|
||||
# Fetch params to validate initial values
|
||||
self.assertAllClose([1.0, 2.0], var0.eval())
|
||||
self.assertAllClose([3.0, 4.0], var1.eval())
|
||||
|
||||
beta1_power, beta2_power = opt._get_beta_accumulators()
|
||||
|
||||
# Run 3 steps of Nadam
|
||||
for t in range(1, 4):
|
||||
self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
|
||||
self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
|
||||
update.run()
|
||||
|
||||
var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0)
|
||||
var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1)
|
||||
|
||||
# Validate updated params
|
||||
self.assertAllCloseAccordingToType(var0_np, var0.eval())
|
||||
self.assertAllCloseAccordingToType(var1_np, var1.eval())
|
||||
|
||||
def testBasic(self):
|
||||
self.doTestBasic(use_resource=False)
|
||||
|
||||
def testResourceBasic(self):
|
||||
self.doTestBasic(use_resource=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test.main()
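For context, a minimal end-to-end usage sketch of the optimizer exercised by these tests; it assumes a TensorFlow 1.x build that exports the class as tf.contrib.opt.NadamOptimizer, as the contrib/opt __init__.py change above does.

import tensorflow as tf

x = tf.Variable([1.0, 2.0])
loss = tf.reduce_sum(tf.square(x))
opt = tf.contrib.opt.NadamOptimizer(learning_rate=0.1)
train_op = opt.minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(50):
        sess.run(train_op)
    print(sess.run(x))  # values should have moved toward [0., 0.]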
|
@ -88,12 +88,20 @@ class BasicRNNCell(RNNCell):
|
||||
class GRUCell(RNNCell):
|
||||
"""Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078)."""
|
||||
|
||||
def __init__(self, num_units, input_size=None, activation=tanh, reuse=None):
|
||||
def __init__(self,
|
||||
num_units,
|
||||
input_size=None,
|
||||
activation=tanh,
|
||||
reuse=None,
|
||||
kernel_initializer=None,
|
||||
bias_initializer=None):
|
||||
super(GRUCell, self).__init__(_reuse=reuse)
|
||||
if input_size is not None:
|
||||
logging.warn("%s: The input_size parameter is deprecated.", self)
|
||||
self._num_units = num_units
|
||||
self._activation = activation
|
||||
self._kernel_initializer = kernel_initializer
|
||||
self._bias_initializer = bias_initializer
|
||||
|
||||
@property
|
||||
def state_size(self):
|
||||
@ -107,10 +115,18 @@ class GRUCell(RNNCell):
|
||||
"""Gated recurrent unit (GRU) with nunits cells."""
|
||||
with vs.variable_scope("gates"): # Reset gate and update gate.
|
||||
# We start with bias of 1.0 to not reset and not update.
|
||||
value = sigmoid(_linear([inputs, state], 2 * self._num_units, True, 1.0))
|
||||
bias_ones = self._bias_initializer
|
||||
if self._bias_initializer is None:
|
||||
dtype = [a.dtype for a in [inputs, state]][0]
|
||||
bias_ones = init_ops.constant_initializer(1.0, dtype=dtype)
|
||||
value = sigmoid(
|
||||
_linear([inputs, state], 2 * self._num_units, True, bias_ones,
|
||||
self._kernel_initializer))
|
||||
r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
|
||||
with vs.variable_scope("candidate"):
|
||||
c = self._activation(_linear([inputs, r * state], self._num_units, True))
|
||||
c = self._activation(
|
||||
_linear([inputs, r * state], self._num_units, True,
|
||||
self._bias_initializer, self._kernel_initializer))
|
||||
new_h = u * state + (1 - u) * c
|
||||
return new_h, new_h
|
||||
|
||||
@ -968,14 +984,19 @@ class _SlimRNNCell(RNNCell):
|
||||
return output, state
|
||||
|
||||
|
||||
def _linear(args, output_size, bias, bias_start=0.0):
|
||||
def _linear(args,
|
||||
output_size,
|
||||
bias,
|
||||
bias_initializer=None,
|
||||
kernel_initializer=None):
|
||||
"""Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.
|
||||
|
||||
Args:
|
||||
args: a 2D Tensor or a list of 2D, batch x n, Tensors.
|
||||
output_size: int, second dimension of W[i].
|
||||
bias: boolean, whether to add a bias term or not.
|
||||
bias_start: starting value to initialize the bias; 0 by default.
|
||||
bias_initializer: starting value to initialize the bias; None by default.
|
||||
kernel_initializer: starting value to initialize the weight; None by default.
|
||||
|
||||
Returns:
|
||||
A 2D Tensor with shape [batch x output_size] equal to
|
||||
@ -1007,7 +1028,9 @@ def _linear(args, output_size, bias, bias_start=0.0):
|
||||
scope = vs.get_variable_scope()
|
||||
with vs.variable_scope(scope) as outer_scope:
|
||||
weights = vs.get_variable(
|
||||
_WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size], dtype=dtype)
|
||||
_WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size],
|
||||
dtype=dtype,
|
||||
initializer=kernel_initializer)
|
||||
if len(args) == 1:
|
||||
res = math_ops.matmul(args[0], weights)
|
||||
else:
|
||||
@ -1016,8 +1039,10 @@ def _linear(args, output_size, bias, bias_start=0.0):
|
||||
return res
|
||||
with vs.variable_scope(outer_scope) as inner_scope:
|
||||
inner_scope.set_partitioner(None)
|
||||
if bias_initializer is None:
|
||||
bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype)
|
||||
biases = vs.get_variable(
|
||||
_BIAS_VARIABLE_NAME, [output_size],
|
||||
dtype=dtype,
|
||||
initializer=init_ops.constant_initializer(bias_start, dtype=dtype))
|
||||
initializer=bias_initializer)
|
||||
return nn_ops.bias_add(res, biases)
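A pure-numpy sketch of what _linear now computes, with the old scalar bias_start replaced by an initial bias value supplied by the caller (zeros by default, ones for the GRU gates above); the names and initialization scheme here are illustrative only.

import numpy as np

def linear(args, output_size, bias_init=0.0, seed=0):
    rng = np.random.RandomState(seed)
    x = np.concatenate(args, axis=1)              # batch x total_arg_size
    w = rng.randn(x.shape[1], output_size) * 0.1  # kernel_initializer analogue
    b = np.full(output_size, bias_init)           # bias_initializer analogue
    return x.dot(w) + b

inputs = np.ones((2, 3))
state = np.zeros((2, 4))
print(linear([inputs, state], output_size=8, bias_init=1.0).shape)  # (2, 8)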
|
||||
|
@ -497,13 +497,20 @@ def _beam_search_step(time, logits, beam_state, batch_size, beam_width,
|
||||
|
||||
time = ops.convert_to_tensor(time, name="time")
|
||||
# During the first time step we only consider the initial beam
|
||||
scores_shape = array_ops.shape(scores)
|
||||
scores_flat = control_flow_ops.cond(
|
||||
time > 0,
|
||||
lambda: array_ops.reshape(scores, [batch_size, -1]),
|
||||
lambda: scores[:, 0])
|
||||
num_available_beam = control_flow_ops.cond(
|
||||
time > 0, lambda: math_ops.reduce_prod(scores_shape[1:]),
|
||||
lambda: math_ops.reduce_prod(scores_shape[2:]))
|
||||
|
||||
# Pick the next beams according to the specified successors function
|
||||
next_beam_scores, word_indices = nn_ops.top_k(scores_flat, k=beam_width)
|
||||
next_beam_size = math_ops.minimum(
|
||||
ops.convert_to_tensor(beam_width, dtype=dtypes.int32, name="beam_width"),
|
||||
num_available_beam)
|
||||
next_beam_scores, word_indices = nn_ops.top_k(scores_flat, k=next_beam_size)
|
||||
next_beam_scores.set_shape([static_batch_size, beam_width])
|
||||
word_indices.set_shape([static_batch_size, beam_width])
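A numpy sketch of the clamp introduced above: top-k runs over the flattened scores, but k is limited to the number of candidates actually available, which matters at the first step when only the initial beam exists. Shapes and names are illustrative.

import numpy as np

def pick_beams(scores, beam_width, time):
    # scores: [batch, beam_width, vocab]; at time 0 only beam 0 is populated.
    flat = scores.reshape(scores.shape[0], -1) if time > 0 else scores[:, 0]
    k = min(beam_width, flat.shape[1])            # next_beam_size
    word_indices = np.argsort(-flat, axis=1)[:, :k]
    next_beam_scores = np.take_along_axis(flat, word_indices, axis=1)
    return next_beam_scores, word_indices

scores = np.random.RandomState(0).rand(2, 3, 5)   # batch=2, beam=3, vocab=5
print(pick_beams(scores, beam_width=3, time=0)[1].shape)  # (2, 3)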
|
||||
|
||||
@ -561,7 +568,8 @@ def _get_scores(log_probs, sequence_lengths, length_penalty_weight):
|
||||
"""Calculates scores for beam search hypotheses.
|
||||
|
||||
Args:
|
||||
log_probs: The log probabilities with shape [batch_size, beam_width].
|
||||
log_probs: The log probabilities with shape
|
||||
`[batch_size, beam_width, vocab_size]`.
|
||||
sequence_lengths: The array of sequence lengths.
|
||||
length_penalty_weight: Float weight to penalize length. Disabled with 0.0.
|
||||
|
||||
|
@ -476,7 +476,11 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) {
|
||||
struct ibv_qp_attr attr;
|
||||
memset(&attr, 0, sizeof(ibv_qp_attr));
|
||||
attr.qp_state = IBV_QPS_RTR;
|
||||
attr.path_mtu = IBV_MTU_4096;
|
||||
struct ibv_port_attr port_attr;
|
||||
CHECK(!ibv_query_port(adapter_->context_, (uint8_t)1, &port_attr))
|
||||
<< "Query port failed";
|
||||
// This assumes both QP's ports are configured with the same MTU
|
||||
attr.path_mtu = port_attr.active_mtu;
|
||||
attr.dest_qp_num = remoteAddr.qpn;
|
||||
attr.rq_psn = remoteAddr.psn;
|
||||
attr.max_dest_rd_atomic = 1;
|
||||
@ -778,11 +782,8 @@ void RdmaTensorBuffer::SendNextItem() {
|
||||
EnqueueItem(key_with_step_id);
|
||||
}
|
||||
};
|
||||
// Use default session (legacy_session_)
|
||||
// TODO use WorkerSessionForSession
|
||||
// need to pass in session handle
|
||||
channel_->adapter_->worker_env_->session_mgr->LegacySession()
|
||||
->rendezvous_mgr->RecvLocalAsync(step_id, parsed, cb);
|
||||
channel_->adapter_->worker_env_->rendezvous_mgr->RecvLocalAsync(step_id,
|
||||
parsed, cb);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -29,10 +29,8 @@ namespace tensorflow {
|
||||
|
||||
class RdmaRemoteRendezvous : public BaseRemoteRendezvous {
|
||||
public:
|
||||
RdmaRemoteRendezvous(const WorkerEnv* env, const string& worker_name,
|
||||
int64 step_id, RdmaMgr* rdma_mgr)
|
||||
: BaseRemoteRendezvous(env, worker_name, step_id, true),
|
||||
rdma_mgr_(rdma_mgr) {}
|
||||
RdmaRemoteRendezvous(const WorkerEnv* env, int64 step_id, RdmaMgr* rdma_mgr)
|
||||
: BaseRemoteRendezvous(env, step_id, true), rdma_mgr_(rdma_mgr) {}
|
||||
|
||||
protected:
|
||||
void RecvFromRemoteAsync(const Rendezvous::ParsedKey& parsed,
|
||||
@ -133,15 +131,12 @@ void RdmaRemoteRendezvous::RecvFromRemoteAsync(
|
||||
rb->SendNextItem();
|
||||
}
|
||||
|
||||
RdmaRendezvousMgr::RdmaRendezvousMgr(const WorkerEnv* env,
|
||||
const string& worker_name,
|
||||
WorkerCacheInterface* worker_cache)
|
||||
: BaseRendezvousMgr(env, worker_name) {}
|
||||
RdmaRendezvousMgr::RdmaRendezvousMgr(const WorkerEnv* env)
|
||||
: BaseRendezvousMgr(env) {}
|
||||
|
||||
BaseRemoteRendezvous* RdmaRendezvousMgr::Create(int64 step_id,
|
||||
const WorkerEnv* worker_env,
|
||||
const string& worker_name) {
|
||||
return new RdmaRemoteRendezvous(worker_env, worker_name, step_id, rdma_mgr_);
|
||||
const WorkerEnv* worker_env) {
|
||||
return new RdmaRemoteRendezvous(worker_env, step_id, rdma_mgr_);
|
||||
}
|
||||
|
||||
} // end namespace tensorflow
|
||||
|
@ -45,13 +45,12 @@ namespace tensorflow {
|
||||
// RendezvousMgr must have keys generated by Rendezvous::CreateKey.
|
||||
class RdmaRendezvousMgr : public BaseRendezvousMgr {
|
||||
public:
|
||||
explicit RdmaRendezvousMgr(const WorkerEnv* env, const string& worker_name,
|
||||
WorkerCacheInterface* worker_cache);
|
||||
explicit RdmaRendezvousMgr(const WorkerEnv* env);
|
||||
void SetRdmaMgr(RdmaMgr* rdma_mgr) { rdma_mgr_ = rdma_mgr; }
|
||||
|
||||
protected:
|
||||
BaseRemoteRendezvous* Create(int64 step_id, const WorkerEnv* worker_env,
|
||||
const string& worker_name) override;
|
||||
BaseRemoteRendezvous* Create(int64 step_id,
|
||||
const WorkerEnv* worker_env) override;
|
||||
|
||||
private:
|
||||
RdmaMgr* rdma_mgr_;
|
||||
|
@ -27,10 +27,8 @@ namespace tensorflow {
|
||||
|
||||
namespace {
|
||||
// static utility function
|
||||
RendezvousMgrInterface* NewRdmaRendezvousMgr(
|
||||
const WorkerEnv* env, const string& worker_name,
|
||||
WorkerCacheInterface* worker_cache) {
|
||||
return new RdmaRendezvousMgr(env, worker_name, worker_cache);
|
||||
RendezvousMgrInterface* NewRdmaRendezvousMgr(const WorkerEnv* env) {
|
||||
return new RdmaRendezvousMgr(env);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
@ -56,7 +54,7 @@ Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
|
||||
TF_RETURN_IF_ERROR(ParseChannelSpec(server_def, &channel_spec));
|
||||
|
||||
*channel_cache =
|
||||
NewGrpcChannelCache(channel_spec, GetChannelCreationFunction(server_def));
|
||||
NewGrpcChannelCache(channel_spec, GetChannelCreationFunction());
|
||||
|
||||
const string host_port = (*channel_cache)->TranslateTask(name_prefix);
|
||||
int requested_port;
|
||||
@ -86,11 +84,7 @@ Status VerbsServer::Init(ServiceInitFunction service_func,
|
||||
rdma_mgr_ = new RdmaMgr(worker_env(), channel_cache_);
|
||||
// set rdma_mgr for verbs_service and rdma_rendezvous_mgr
|
||||
verbs_service_->SetRdmaMgr(rdma_mgr_);
|
||||
// hardcoded to default session (legacy_session_)
|
||||
// TODO: use WorkerSessionForSession
|
||||
// need to pass in session handle
|
||||
dynamic_cast<RdmaRendezvousMgr*>(
|
||||
worker_env()->session_mgr->LegacySession()->rendezvous_mgr.get())
|
||||
dynamic_cast<RdmaRendezvousMgr*>(worker_env()->rendezvous_mgr)
|
||||
->SetRdmaMgr(rdma_mgr_);
|
||||
}
|
||||
return s;
|
||||
|
@ -2878,6 +2878,8 @@ filegroup(
|
||||
"lib/gif/testdata/scan.gif",
|
||||
# GIF data with optimization
|
||||
"lib/gif/testdata/optimized.gif",
|
||||
# BMP data
|
||||
"lib/bmp/testdata/lena.bmp",
|
||||
],
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
||||
|
@ -227,7 +227,7 @@ void GPUUtil::DeviceToDeviceCopy(DeviceContext* send_dev_context,
|
||||
}
|
||||
// Since we want to use the memory from recv_stream in the
|
||||
// send_device_to_device_stream, add a dependency to make sure the memory is
|
||||
// truely free.
|
||||
// truly free.
|
||||
// TODO(zhengxq): remove this dependency when we switch to a better way
|
||||
// to make sure the memory is free.
|
||||
send_device_to_device_stream->ThenWaitFor(recv_stream);
|
||||
@ -322,7 +322,7 @@ void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor,
|
||||
done(errors::Internal("No send gpu copy-out-stream is available."));
|
||||
return;
|
||||
}
|
||||
// Wait for the recv-stream to make sure the buffer is truely available.
|
||||
// Wait for the recv-stream to make sure the buffer is truly available.
|
||||
recv_host_to_device_stream->ThenWaitFor(recv_stream);
|
||||
|
||||
const int64 total_bytes = cpu_tensor->TotalBytes();
|
||||
|
@ -5,6 +5,4 @@ distributed TensorFlow runtime, using [gRPC](http://grpc.io) for inter-process
|
||||
communication.
|
||||
|
||||
To learn how to use the distributed runtime to create a TensorFlow cluster,
|
||||
see the "Distributed TensorFlow" How To, which is available [in this
|
||||
repository](../../g3doc/how_tos/distributed/index.md), and will be available
|
||||
on the TensorFlow website after the next version is released.
|
||||
see the [Distributed TensorFlow](https://www.tensorflow.org/deploy/distributed) How-To.
|
||||
|
@ -516,6 +516,7 @@ CostGraphDef* InMemoryRunGraphResponse::mutable_cost_graph() {
|
||||
|
||||
RunGraphResponse* InMemoryRunGraphResponse::get_proto() {
|
||||
LOG(FATAL) << "Cannot get a mutable protobuf for an InMemoryRunGraphResponse";
|
||||
return NULL;
|
||||
}
|
||||
|
||||
size_t OwnedProtoRunGraphResponse::num_recvs() const {
|
||||
@ -634,6 +635,7 @@ RunMetadata* InMemoryRunStepResponse::mutable_metadata() { return &metadata_; }
|
||||
|
||||
RunStepResponse* InMemoryRunStepResponse::get_proto() {
|
||||
LOG(FATAL) << "Cannot get a mutable protobuf for an InMemoryRunStepResponse";
|
||||
return NULL;
|
||||
}
|
||||
|
||||
size_t OwnedProtoRunStepResponse::num_tensors() const {
|
||||
|
@ -73,7 +73,7 @@ Status MeasuringCostEstimator::PredictCosts(const GraphDef& optimized_graph,
|
||||
return;
|
||||
}
|
||||
if (!local_status.ok()) {
|
||||
// Discard the data if the run wasn't sucessful.
|
||||
// Discard the data if the run wasn't successful.
|
||||
barrier.DecrementCount();
|
||||
return;
|
||||
}
|
||||
|
@ -47,7 +47,7 @@ static double Median(std::vector<double> &&values) {
|
||||
// nth_element.
|
||||
const auto lower_middle = std::max_element(values.begin(), middle);
|
||||
// Preventing overflow. We know that '*lower_middle <= *middle'.
|
||||
// If both are on oposite sides of zero, the sum won't overflow, otherwise
|
||||
// If both are on opposite sides of zero, the sum won't overflow, otherwise
|
||||
// the difference won't overflow.
|
||||
if (*lower_middle <= 0 && *middle >= 0) {
|
||||
return (*lower_middle + *middle) / 2;
|
||||
|
@ -1625,6 +1625,7 @@ cc_library(
|
||||
":attention_ops",
|
||||
":colorspace_op",
|
||||
":crop_and_resize_op",
|
||||
":decode_bmp_op",
|
||||
":decode_image_op",
|
||||
":draw_bounding_box_op",
|
||||
":encode_jpeg_op",
|
||||
@ -1689,6 +1690,12 @@ tf_kernel_library(
|
||||
deps = IMAGE_DEPS,
|
||||
)
|
||||
|
||||
tf_kernel_library(
|
||||
name = "decode_bmp_op",
|
||||
prefix = "decode_bmp_op",
|
||||
deps = IMAGE_DEPS,
|
||||
)
|
||||
|
||||
tf_kernel_library(
|
||||
name = "decode_image_op",
|
||||
prefix = "decode_image_op",
|
||||
@ -4166,6 +4173,7 @@ filegroup(
|
||||
srcs = [
|
||||
"batchtospace_op.cc",
|
||||
"ctc_decoder_ops.cc",
|
||||
"decode_bmp_op.cc",
|
||||
"depthtospace_op.cc",
|
||||
"dynamic_stitch_op.cc",
|
||||
"in_topk_op.cc",
|
||||
@ -4295,6 +4303,8 @@ filegroup(
|
||||
"decode_image_op.*",
|
||||
"encode_png_op.*",
|
||||
"encode_jpeg_op.*",
|
||||
"decode_jpeg_op.*",
|
||||
"decode_gif_op.*",
|
||||
"identity_reader_op.*",
|
||||
"remote_fused_graph_execute_op.*",
|
||||
"remote_fused_graph_rewriter_transform.*",
|
||||
|
@ -22,7 +22,7 @@ limitations under the License.
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
// We focus on the single thread performance of runing ops.
|
||||
// We focus on the single thread performance of running ops.
|
||||
static SessionOptions InitOptions() {
|
||||
SessionOptions opts;
|
||||
opts.config.set_intra_op_parallelism_threads(1);
|
||||
|
143 tensorflow/core/kernels/decode_bmp_op.cc  Normal file
@ -0,0 +1,143 @@
|
||||
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
// See docs in ../ops/image_ops.cc
|
||||
|
||||
#include <memory>
|
||||
#include "tensorflow/core/framework/op_kernel.h"
|
||||
#include "tensorflow/core/framework/register_types.h"
|
||||
#include "tensorflow/core/framework/tensor.h"
|
||||
#include "tensorflow/core/framework/tensor_shape.h"
|
||||
#include "tensorflow/core/framework/types.h"
|
||||
#include "tensorflow/core/framework/types.pb.h"
|
||||
#include "tensorflow/core/kernels/bounds_check.h"
|
||||
#include "tensorflow/core/lib/core/status.h"
|
||||
#include "tensorflow/core/platform/logging.h"
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
// Decode the contents of a BMP file
|
||||
class DecodeBmpOp : public OpKernel {
|
||||
public:
|
||||
explicit DecodeBmpOp(OpKernelConstruction* context) : OpKernel(context) {
|
||||
OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_));
|
||||
OP_REQUIRES(
|
||||
context, channels_ == 0 || channels_ == 3 || channels_ == 4,
|
||||
errors::InvalidArgument("channels must be 0, 3 or 4, got ", channels_));
|
||||
}
|
||||
|
||||
void Compute(OpKernelContext* context) override {
|
||||
const Tensor& contents = context->input(0);
|
||||
OP_REQUIRES(context, TensorShapeUtils::IsScalar(contents.shape()),
|
||||
errors::InvalidArgument("contents must be scalar, got shape ",
|
||||
contents.shape().DebugString()));
|
||||
|
||||
// Start decoding image to get shape details
|
||||
const StringPiece input = contents.scalar<string>()();
|
||||
|
||||
const uint8* img_bytes = reinterpret_cast<const uint8*>(input.data());
|
||||
const int32 header_size = internal::SubtleMustCopy(
|
||||
*(reinterpret_cast<const int32*>(img_bytes + 10)));
|
||||
const int32 width = internal::SubtleMustCopy(
|
||||
*(reinterpret_cast<const int32*>(img_bytes + 18)));
|
||||
const int32 height = internal::SubtleMustCopy(
|
||||
*(reinterpret_cast<const int32*>(img_bytes + 22)));
|
||||
const int32 bpp = internal::SubtleMustCopy(
|
||||
*(reinterpret_cast<const int32*>(img_bytes + 28)));
|
||||
|
||||
if (channels_) {
|
||||
OP_REQUIRES(context, (channels_ == bpp / 8),
|
||||
errors::InvalidArgument(
|
||||
"channels attribute ", channels_,
|
||||
" does not match bits per pixel from file ", bpp / 8));
|
||||
} else {
|
||||
channels_ = bpp / 8;
|
||||
}
|
||||
|
||||
// Current implementation only supports 3 or 4 channel
|
||||
// bitmaps.
|
||||
OP_REQUIRES(context, (channels_ == 3 || channels_ == 4),
|
||||
errors::InvalidArgument(
|
||||
"Number of channels must be 3 or 4, was ", channels_));
|
||||
|
||||
// if height is negative, data layout is top down
|
||||
// otherwise, it's bottom up
|
||||
bool top_down = (height < 0);
|
||||
|
||||
// Decode image, allocating tensor once the image size is known
|
||||
Tensor* output = nullptr;
|
||||
OP_REQUIRES_OK(
|
||||
context, context->allocate_output(
|
||||
0, TensorShape({abs(height), width, channels_}), &output));
|
||||
|
||||
const uint8* bmp_pixels = &img_bytes[header_size];
|
||||
|
||||
Decode(bmp_pixels, output->flat<uint8>().data(), width, abs(height),
|
||||
channels_, top_down);
|
||||
}
|
||||
|
||||
uint8* Decode(const uint8* input, uint8* const output, const int width,
|
||||
const int height, const int channles, bool top_down);
|
||||
|
||||
private:
|
||||
int channels_;
|
||||
};
|
||||
REGISTER_KERNEL_BUILDER(Name("DecodeBmp").Device(DEVICE_CPU), DecodeBmpOp);
|
||||
|
||||
uint8* DecodeBmpOp::Decode(const uint8* input, uint8* const output,
|
||||
const int width, const int height,
|
||||
const int channels, bool top_down) {
|
||||
// there may be padding bytes when the width is not a multiple of 4 bytes
|
||||
// 8 * channels == bits per pixel
|
||||
int row_size = (8 * channels * width + 31) / 32 * 4;
|
||||
|
||||
for (int i = 0; i < height; i++) {
|
||||
int src_pos;
|
||||
int dst_pos;
|
||||
|
||||
for (int j = 0; j < width; j++) {
|
||||
if (!top_down) {
|
||||
src_pos = ((height - 1 - i) * row_size) + j * channels;
|
||||
} else {
|
||||
src_pos = i * row_size + j * channels;
|
||||
}
|
||||
|
||||
dst_pos = (i * width + j) * channels;
|
||||
|
||||
switch (channels) {
|
||||
case 3:
|
||||
// BGR -> RGB
|
||||
output[dst_pos] = input[src_pos + 2];
|
||||
output[dst_pos + 1] = input[src_pos + 1];
|
||||
output[dst_pos + 2] = input[src_pos];
|
||||
break;
|
||||
case 4:
|
||||
// BGRA -> RGBA
|
||||
output[dst_pos] = input[src_pos + 2];
|
||||
output[dst_pos + 1] = input[src_pos + 1];
|
||||
output[dst_pos + 2] = input[src_pos];
|
||||
output[dst_pos + 3] = input[src_pos + 3];
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Unexpected number of channels: " << channels;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
} // namespace tensorflow
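A standalone numpy sketch of the two details the kernel above handles: BMP rows are padded to a multiple of 4 bytes, and pixels are stored as BGR(A), usually bottom-up. This is a simplified model for illustration, not the kernel itself.

import numpy as np

def decode_bmp_pixels(data, width, height, channels, top_down=False):
    row_size = (8 * channels * width + 31) // 32 * 4        # stride padded to 4 bytes
    out = np.empty((height, width, channels), np.uint8)
    for i in range(height):
        src_row = i if top_down else height - 1 - i
        row = data[src_row * row_size : src_row * row_size + width * channels]
        px = row.reshape(width, channels)
        out[i, :, :3] = px[:, 2::-1]                        # BGR(A) -> RGB(A)
        if channels == 4:
            out[i, :, 3] = px[:, 3]
    return out

# 2x2 all-blue image, 3 channels, bottom-up, rows padded from 6 to 8 bytes each.
raw = np.array([255, 0, 0, 255, 0, 0, 0, 0,
                255, 0, 0, 255, 0, 0, 0, 0], np.uint8)
print(decode_bmp_pixels(raw, 2, 2, 3)[0, 0])                # -> [  0   0 255]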
|
@ -1069,7 +1069,7 @@ struct DeepConv2D<CPUDevice, T> {
|
||||
// Allocate temporary buffer 'buffer2', which is first used for
|
||||
// transformed input tiles, then re-used for transformed output tiles.
|
||||
// Calculate required buffer size for 'buffer2' as max required buffer
|
||||
// between input and output tranform buffer sizes.
|
||||
// between input and output transform buffer sizes.
|
||||
const int64 buffer2_tile_transform_size =
|
||||
tile_spatial_size * num_tiles * in_depth;
|
||||
const int64 buffer2_out_transform_size =
|
||||
|
@ -441,7 +441,9 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
|
||||
DepthwiseConv2dNativeOp<CPUDevice, T>);
|
||||
|
||||
TF_CALL_float(REGISTER_CPU_KERNEL);
|
||||
#if defined(PLATFORM_WINDOWS) && !defined(_DEBUG)
|
||||
TF_CALL_double(REGISTER_CPU_KERNEL);
|
||||
#endif
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
REGISTER_KERNEL_BUILDER(
|
||||
|
@ -15,7 +15,7 @@ limitations under the License.
|
||||
|
||||
#define EIGEN_USE_THREADS
|
||||
|
||||
// See docs in ../ops/fft_ops.cc.
|
||||
// See docs in ../ops/spectral_ops.cc.
|
||||
|
||||
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
|
||||
#include "tensorflow/core/framework/op.h"
|
||||
@ -29,22 +29,13 @@ limitations under the License.
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#include "tensorflow/core/platform/stream_executor.h"
|
||||
#endif
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
namespace {
|
||||
// TODO(vrv/zhifengc): Refactor AsDeviceMemory() into GPUUtil.
|
||||
template <typename T>
|
||||
perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
|
||||
perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
|
||||
perftools::gputools::DeviceMemory<T> typed(wrapped);
|
||||
return typed;
|
||||
}
|
||||
} // end namespace
|
||||
|
||||
class FFTGPUBase : public OpKernel {
|
||||
class FFTBase : public OpKernel {
|
||||
public:
|
||||
explicit FFTGPUBase(OpKernelConstruction* ctx) : OpKernel(ctx) {}
|
||||
explicit FFTBase(OpKernelConstruction* ctx) : OpKernel(ctx) {}
|
||||
|
||||
void Compute(OpKernelContext* ctx) override {
|
||||
const Tensor& in = ctx->input(0);
|
||||
@ -97,9 +88,110 @@ class FFTGPUBase : public OpKernel {
|
||||
virtual bool IsForward() const = 0;
|
||||
virtual bool IsReal() const = 0;
|
||||
|
||||
private:
|
||||
// The function that actually computes the FFT.
|
||||
virtual void DoFFT(OpKernelContext* ctx, const Tensor& in, uint64* fft_shape,
|
||||
Tensor* out) = 0;
|
||||
};
|
||||
|
||||
typedef Eigen::ThreadPoolDevice CPUDevice;
|
||||
|
||||
template <bool Forward, bool _Real, int FFTRank>
|
||||
class FFTCPU : public FFTBase {
|
||||
public:
|
||||
using FFTBase::FFTBase;
|
||||
|
||||
protected:
|
||||
int Rank() const override { return FFTRank; }
|
||||
bool IsForward() const override { return Forward; }
|
||||
bool IsReal() const override { return _Real; }
|
||||
|
||||
void DoFFT(OpKernelContext* ctx, const Tensor& in, uint64* fft_shape,
|
||||
Tensor* out) {
|
||||
Tensor* out) override {
|
||||
// Create the axes (which are always trailing).
|
||||
auto axes = Eigen::ArrayXi::LinSpaced(FFTRank, 1, FFTRank);
|
||||
auto device = ctx->eigen_device<CPUDevice>();
|
||||
|
||||
if (!IsReal()) {
|
||||
auto input = ((Tensor)in).flat_inner_dims<complex64, FFTRank + 1>();
|
||||
// Compute the FFT using eigen.
|
||||
auto output = out->flat_inner_dims<complex64, FFTRank + 1>();
|
||||
output.device(device) = input.template fft < Eigen::BothParts,
|
||||
Forward ? Eigen::FFT_FORWARD : Eigen::FFT_REVERSE > (axes);
|
||||
} else {
|
||||
if (IsForward()) {
|
||||
auto input = ((Tensor)in).flat_inner_dims<float, FFTRank + 1>();
|
||||
auto output = out->flat_inner_dims<complex64, FFTRank + 1>();
|
||||
Eigen::DSizes<Eigen::DenseIndex, FFTRank + 1> startIndices;
|
||||
|
||||
// Compute the full FFT using a temporary tensor.
|
||||
Tensor temp;
|
||||
OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<complex64>::v(),
|
||||
in.shape(), &temp));
|
||||
auto full_fft = temp.flat_inner_dims<complex64, FFTRank + 1>();
|
||||
full_fft.device(device) =
|
||||
input.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(axes);
|
||||
|
||||
// Slice away the negative frequency components.
|
||||
output.device(device) =
|
||||
full_fft.slice(startIndices, output.dimensions());
|
||||
} else {
|
||||
// TODO: reconstruct the full fft and take the inverse.
|
||||
ctx->CtxFailureWithWarning(
|
||||
errors::Unimplemented("IRFFT is not implemented as a CPU kernel"));
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Use labels to distinguish between internal and open source versions
|
||||
// of these kernels.
|
||||
#ifdef PLATFORM_GOOGLE
|
||||
#define FFT_LABEL "eigen"
|
||||
#else
|
||||
#define FFT_LABEL ""
|
||||
#endif
|
||||
|
||||
REGISTER_KERNEL_BUILDER(Name("FFT").Device(DEVICE_CPU).Label(FFT_LABEL),
|
||||
FFTCPU<true, false, 1>);
|
||||
REGISTER_KERNEL_BUILDER(Name("IFFT").Device(DEVICE_CPU).Label(FFT_LABEL),
|
||||
FFTCPU<false, false, 1>);
|
||||
REGISTER_KERNEL_BUILDER(Name("FFT2D").Device(DEVICE_CPU).Label(FFT_LABEL),
|
||||
FFTCPU<true, false, 2>);
|
||||
REGISTER_KERNEL_BUILDER(Name("IFFT2D").Device(DEVICE_CPU).Label(FFT_LABEL),
|
||||
FFTCPU<false, false, 2>);
|
||||
REGISTER_KERNEL_BUILDER(Name("FFT3D").Device(DEVICE_CPU).Label(FFT_LABEL),
|
||||
FFTCPU<true, false, 3>);
|
||||
REGISTER_KERNEL_BUILDER(Name("IFFT3D").Device(DEVICE_CPU).Label(FFT_LABEL),
|
||||
FFTCPU<false, false, 3>);
|
||||
|
||||
REGISTER_KERNEL_BUILDER(Name("RFFT").Device(DEVICE_CPU).Label(FFT_LABEL),
|
||||
FFTCPU<true, true, 1>);
|
||||
REGISTER_KERNEL_BUILDER(Name("RFFT2D").Device(DEVICE_CPU).Label(FFT_LABEL),
|
||||
FFTCPU<true, true, 2>);
|
||||
REGISTER_KERNEL_BUILDER(Name("RFFT3D").Device(DEVICE_CPU).Label(FFT_LABEL),
|
||||
FFTCPU<true, true, 3>);
|
||||
|
||||
#undef FFT_LABEL
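The forward real-FFT path above computes the full complex FFT and then slices away the negative-frequency half; a quick numpy check of that equivalence:

import numpy as np

x = np.random.RandomState(0).randn(8).astype(np.float32)
full = np.fft.fft(x)                        # all 8 complex bins
sliced = full[: len(x) // 2 + 1]            # keep the non-negative frequencies
print(np.allclose(sliced, np.fft.rfft(x)))  # True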
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
|
||||
namespace {
|
||||
// TODO(vrv/zhifengc): Refactor AsDeviceMemory() into GPUUtil.
|
||||
template <typename T>
|
||||
perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
|
||||
perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
|
||||
perftools::gputools::DeviceMemory<T> typed(wrapped);
|
||||
return typed;
|
||||
}
|
||||
} // end namespace
|
||||
|
||||
class FFTGPUBase : public FFTBase {
|
||||
public:
|
||||
using FFTBase::FFTBase;
|
||||
|
||||
protected:
|
||||
void DoFFT(OpKernelContext* ctx, const Tensor& in, uint64* fft_shape,
|
||||
Tensor* out) override {
|
||||
auto* stream = ctx->op_device_context()->stream();
|
||||
OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
|
||||
|
||||
@ -238,7 +330,6 @@ REGISTER_KERNEL_BUILDER(Name("BatchFFT3D").Device(DEVICE_GPU),
|
||||
FFTGPU<true, false, 3>);
|
||||
REGISTER_KERNEL_BUILDER(Name("BatchIFFT3D").Device(DEVICE_GPU),
|
||||
FFTGPU<false, false, 3>);
|
||||
#endif // GOOGLE_CUDA
|
||||
|
||||
} // end namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
|
@ -114,13 +114,13 @@ GraphTransferUtils::BuildRemoteFusedGraphExecuteInfo(
|
||||
for (const std::pair<string, Tensor>& input_node_info : inputs) {
|
||||
const Scope& scope = root.WithOpName(input_node_info.first);
|
||||
Node* ret;
|
||||
const auto unique_name = scope.GetUniqueNameForOp("PlaceholderV2");
|
||||
auto builder = NodeBuilder(unique_name, "PlaceholderV2")
|
||||
const auto unique_name = scope.GetUniqueNameForOp("Placeholder");
|
||||
auto builder = NodeBuilder(unique_name, "Placeholder")
|
||||
.Attr("dtype", input_node_info.second.dtype())
|
||||
.Attr("shape", input_node_info.second.shape());
|
||||
scope.UpdateBuilder(&builder);
|
||||
scope.UpdateStatus(builder.Finalize(scope.graph(), &ret));
|
||||
CHECK(scope.ok());
|
||||
TF_CHECK_OK(scope.status());
|
||||
output_list.emplace_back(Output(ret, 0));
|
||||
input_types.push_back(input_node_info.second.dtype());
|
||||
}
|
||||
|
@ -33,6 +33,7 @@ limitations under the License.
|
||||
#include "tensorflow/core/platform/logging.h"
|
||||
|
||||
namespace tensorflow {
|
||||
namespace {
|
||||
|
||||
typedef Eigen::ThreadPoolDevice CPUDevice;
|
||||
|
||||
@ -89,6 +90,59 @@ static inline float ComputeIOU(typename TTypes<float, 2>::ConstTensor boxes,
|
||||
return intersection_area / (area_i + area_j - intersection_area);
|
||||
}
|
||||
|
||||
void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& boxes,
|
||||
const Tensor& scores, const Tensor& max_output_size,
|
||||
const float iou_threshold) {
|
||||
OP_REQUIRES(context, iou_threshold >= 0 && iou_threshold <= 1,
|
||||
errors::InvalidArgument("iou_threshold must be in [0, 1]"));
|
||||
|
||||
int num_boxes = 0;
|
||||
ParseAndCheckBoxSizes(context, boxes, scores, &num_boxes);
|
||||
if (!context->status().ok()) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int output_size = std::min(max_output_size.scalar<int>()(), num_boxes);
|
||||
typename TTypes<float, 2>::ConstTensor boxes_data = boxes.tensor<float, 2>();
|
||||
|
||||
std::vector<float> scores_data(num_boxes);
|
||||
std::copy_n(scores.flat<float>().data(), num_boxes, scores_data.begin());
|
||||
std::vector<int> sorted_indices;
|
||||
DecreasingArgSort(scores_data, &sorted_indices);
|
||||
|
||||
std::vector<bool> active(num_boxes, true);
|
||||
std::vector<int> selected;
|
||||
int num_active = active.size();
|
||||
for (int i = 0; i < num_boxes; ++i) {
|
||||
if (num_active == 0 || selected.size() >= output_size) break;
|
||||
if (active[i]) {
|
||||
selected.push_back(sorted_indices[i]);
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
for (int j = i + 1; j < num_boxes; ++j) {
|
||||
if (active[j]) {
|
||||
float iou =
|
||||
ComputeIOU(boxes_data, sorted_indices[i], sorted_indices[j]);
|
||||
if (iou > iou_threshold) {
|
||||
active[j] = false;
|
||||
num_active--;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Allocate output tensor
|
||||
Tensor* output = nullptr;
|
||||
TensorShape output_shape({static_cast<int>(selected.size())});
|
||||
OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
|
||||
typename TTypes<int, 1>::Tensor selected_indices_data =
|
||||
output->tensor<int, 1>();
|
||||
std::copy_n(selected.begin(), selected.size(), selected_indices_data.data());
|
||||
}
|
||||
|
||||
} // namespace
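A pure-Python/numpy sketch of the greedy policy that DoNonMaxSuppressionOp factors out above: visit boxes in decreasing score order and keep a box only if its IoU with every already-selected box is at or below the threshold. The selection is equivalent; the bookkeeping is simplified.

import numpy as np

def iou(a, b):
    # Matches ComputeIOU above: corner coordinates may be given in either order.
    ymin = max(min(a[0], a[2]), min(b[0], b[2]))
    xmin = max(min(a[1], a[3]), min(b[1], b[3]))
    ymax = min(max(a[0], a[2]), max(b[0], b[2]))
    xmax = min(max(a[1], a[3]), max(b[1], b[3]))
    inter = max(ymax - ymin, 0.0) * max(xmax - xmin, 0.0)
    area_a = abs(a[2] - a[0]) * abs(a[3] - a[1])
    area_b = abs(b[2] - b[0]) * abs(b[3] - b[1])
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0

def non_max_suppression(boxes, scores, max_output_size, iou_threshold):
    selected = []
    for i in np.argsort(-scores):             # decreasing score order
        if len(selected) >= max_output_size:
            break
        if all(iou(boxes[i], boxes[j]) <= iou_threshold for j in selected):
            selected.append(int(i))
    return selected

boxes = np.array([[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, 10, 1, 11]], np.float32)
scores = np.array([0.9, 0.75, 0.6], np.float32)
print(non_max_suppression(boxes, scores, 3, 0.5))   # [0, 2]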
|
||||
|
||||
template <typename Device>
|
||||
class NonMaxSuppressionOp : public OpKernel {
|
||||
public:
|
||||
@ -98,9 +152,6 @@ class NonMaxSuppressionOp : public OpKernel {
|
||||
}
|
||||
|
||||
void Compute(OpKernelContext* context) override {
|
||||
OP_REQUIRES(context, iou_threshold_ >= 0 && iou_threshold_ <= 1,
|
||||
errors::InvalidArgument("iou_threshold must be in [0, 1]"));
|
||||
|
||||
// boxes: [num_boxes, 4]
|
||||
const Tensor& boxes = context->input(0);
|
||||
// scores: [num_boxes]
|
||||
@ -112,59 +163,48 @@ class NonMaxSuppressionOp : public OpKernel {
|
||||
errors::InvalidArgument("max_output_size must be 0-D, got shape ",
|
||||
max_output_size.shape().DebugString()));
|
||||
|
||||
int num_boxes = 0;
|
||||
ParseAndCheckBoxSizes(context, boxes, scores, &num_boxes);
|
||||
if (!context->status().ok()) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int output_size =
|
||||
std::min(max_output_size.scalar<int>()(), num_boxes);
|
||||
typename TTypes<float, 2>::ConstTensor boxes_data =
|
||||
boxes.tensor<float, 2>();
|
||||
|
||||
std::vector<float> scores_data(num_boxes);
|
||||
std::copy_n(scores.flat<float>().data(), num_boxes, scores_data.begin());
|
||||
std::vector<int> sorted_indices;
|
||||
DecreasingArgSort(scores_data, &sorted_indices);
|
||||
|
||||
std::vector<bool> active(num_boxes, true);
|
||||
std::vector<int> selected;
|
||||
int num_active = active.size();
|
||||
for (int i = 0; i < num_boxes; ++i) {
|
||||
if (num_active == 0 || selected.size() >= output_size) break;
|
||||
if (active[i]) {
|
||||
selected.push_back(sorted_indices[i]);
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
for (int j = i + 1; j < num_boxes; ++j) {
|
||||
if (active[j]) {
|
||||
float iou =
|
||||
ComputeIOU(boxes_data, sorted_indices[i], sorted_indices[j]);
|
||||
if (iou > iou_threshold_) {
|
||||
active[j] = false;
|
||||
num_active--;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Allocate output tensor
|
||||
Tensor* output = nullptr;
|
||||
TensorShape output_shape({static_cast<int>(selected.size())});
|
||||
OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
|
||||
typename TTypes<int, 1>::Tensor selected_indices_data =
|
||||
output->tensor<int, 1>();
|
||||
std::copy_n(selected.begin(), selected.size(),
|
||||
selected_indices_data.data());
|
||||
DoNonMaxSuppressionOp(context, boxes, scores, max_output_size,
|
||||
iou_threshold_);
|
||||
}
|
||||
|
||||
private:
|
||||
float iou_threshold_;
|
||||
};
|
||||
|
||||
template <typename Device>
|
||||
class NonMaxSuppressionV2Op : public OpKernel {
|
||||
public:
|
||||
explicit NonMaxSuppressionV2Op(OpKernelConstruction* context)
|
||||
: OpKernel(context) {}
|
||||
|
||||
void Compute(OpKernelContext* context) override {
|
||||
// boxes: [num_boxes, 4]
|
||||
const Tensor& boxes = context->input(0);
|
||||
// scores: [num_boxes]
|
||||
const Tensor& scores = context->input(1);
|
||||
// max_output_size: scalar
|
||||
const Tensor& max_output_size = context->input(2);
|
||||
OP_REQUIRES(
|
||||
context, TensorShapeUtils::IsScalar(max_output_size.shape()),
|
||||
errors::InvalidArgument("max_output_size must be 0-D, got shape ",
|
||||
max_output_size.shape().DebugString()));
|
||||
// iou_threshold: scalar
|
||||
const Tensor& iou_threshold = context->input(3);
|
||||
OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
|
||||
errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
|
||||
iou_threshold.shape().DebugString()));
|
||||
|
||||
const float iou_threshold_val = iou_threshold.scalar<float>()();
|
||||
|
||||
DoNonMaxSuppressionOp(context, boxes, scores, max_output_size,
|
||||
iou_threshold_val);
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_KERNEL_BUILDER(Name("NonMaxSuppression").Device(DEVICE_CPU),
|
||||
NonMaxSuppressionOp<CPUDevice>);
|
||||
|
||||
REGISTER_KERNEL_BUILDER(Name("NonMaxSuppressionV2").Device(DEVICE_CPU),
|
||||
NonMaxSuppressionV2Op<CPUDevice>);
|
||||
|
||||
} // namespace tensorflow
|
||||
|
@ -173,4 +173,167 @@ TEST_F(NonMaxSuppressionOpTest, TestEmptyInput) {
|
||||
test::ExpectTensorEqual<int>(expected, *GetOutput(0));
|
||||
}
|
||||
|
||||
//
|
||||
// NonMaxSuppressionV2Op Tests
|
||||
//
|
||||
|
||||
class NonMaxSuppressionV2OpTest : public OpsTestBase {
|
||||
protected:
|
||||
void MakeOp() {
|
||||
TF_EXPECT_OK(NodeDefBuilder("non_max_suppression_op", "NonMaxSuppressionV2")
|
||||
.Input(FakeInput(DT_FLOAT))
|
||||
.Input(FakeInput(DT_FLOAT))
|
||||
.Input(FakeInput(DT_INT32))
|
||||
.Input(FakeInput(DT_FLOAT))
|
||||
.Finalize(node_def()));
|
||||
TF_EXPECT_OK(InitOp());
|
||||
}
|
||||
};
|
||||
|
||||
TEST_F(NonMaxSuppressionV2OpTest, TestSelectFromThreeClusters) {
|
||||
MakeOp();
|
||||
AddInputFromArray<float>(
|
||||
TensorShape({6, 4}),
|
||||
{0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f,
|
||||
0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101});
|
||||
AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
|
||||
AddInputFromArray<int>(TensorShape({}), {3});
|
||||
AddInputFromArray<float>(TensorShape({}), {.5f});
|
||||
TF_ASSERT_OK(RunOpKernel());
|
||||
|
||||
Tensor expected(allocator(), DT_INT32, TensorShape({3}));
|
||||
test::FillValues<int>(&expected, {3, 0, 5});
|
||||
test::ExpectTensorEqual<int>(expected, *GetOutput(0));
|
||||
}
|
||||
|
||||
TEST_F(NonMaxSuppressionV2OpTest,
|
||||
TestSelectFromThreeClustersFlippedCoordinates) {
|
||||
MakeOp();
|
||||
AddInputFromArray<float>(TensorShape({6, 4}),
|
||||
      {1, 1, 0, 0, 0, 0.1f, 1, 1.1f, 0, .9f, 1, -0.1f,
       0, 10, 1, 11, 1, 10.1f, 0, 11.1f, 1, 101, 0, 100});
  AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
  AddInputFromArray<int>(TensorShape({}), {3});
  AddInputFromArray<float>(TensorShape({}), {.5f});
  TF_ASSERT_OK(RunOpKernel());

  Tensor expected(allocator(), DT_INT32, TensorShape({3}));
  test::FillValues<int>(&expected, {3, 0, 5});
  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
}

TEST_F(NonMaxSuppressionV2OpTest, TestSelectAtMostTwoBoxesFromThreeClusters) {
  MakeOp();
  AddInputFromArray<float>(
      TensorShape({6, 4}),
      {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f,
       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101});
  AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
  AddInputFromArray<int>(TensorShape({}), {2});
  AddInputFromArray<float>(TensorShape({}), {.5f});
  TF_ASSERT_OK(RunOpKernel());

  Tensor expected(allocator(), DT_INT32, TensorShape({2}));
  test::FillValues<int>(&expected, {3, 0});
  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
}

TEST_F(NonMaxSuppressionV2OpTest,
       TestSelectAtMostThirtyBoxesFromThreeClusters) {
  MakeOp();
  AddInputFromArray<float>(
      TensorShape({6, 4}),
      {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f,
       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101});
  AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
  AddInputFromArray<int>(TensorShape({}), {30});
  AddInputFromArray<float>(TensorShape({}), {.5f});
  TF_ASSERT_OK(RunOpKernel());

  Tensor expected(allocator(), DT_INT32, TensorShape({3}));
  test::FillValues<int>(&expected, {3, 0, 5});
  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
}

TEST_F(NonMaxSuppressionV2OpTest, TestSelectSingleBox) {
  MakeOp();
  AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});
  AddInputFromArray<float>(TensorShape({1}), {.9f});
  AddInputFromArray<int>(TensorShape({}), {3});
  AddInputFromArray<float>(TensorShape({}), {.5f});
  TF_ASSERT_OK(RunOpKernel());

  Tensor expected(allocator(), DT_INT32, TensorShape({1}));
  test::FillValues<int>(&expected, {0});
  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
}

TEST_F(NonMaxSuppressionV2OpTest, TestSelectFromTenIdenticalBoxes) {
  MakeOp();

  int num_boxes = 10;
  std::vector<float> corners(num_boxes * 4);
  std::vector<float> scores(num_boxes);
  for (int i = 0; i < num_boxes; ++i) {
    corners[i * 4 + 0] = 0;
    corners[i * 4 + 1] = 0;
    corners[i * 4 + 2] = 1;
    corners[i * 4 + 3] = 1;
    scores[i] = .9;
  }
  AddInputFromArray<float>(TensorShape({num_boxes, 4}), corners);
  AddInputFromArray<float>(TensorShape({num_boxes}), scores);
  AddInputFromArray<int>(TensorShape({}), {3});
  AddInputFromArray<float>(TensorShape({}), {.5f});
  TF_ASSERT_OK(RunOpKernel());

  Tensor expected(allocator(), DT_INT32, TensorShape({1}));
  test::FillValues<int>(&expected, {0});
  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
}

TEST_F(NonMaxSuppressionV2OpTest, TestInconsistentBoxAndScoreShapes) {
  MakeOp();
  AddInputFromArray<float>(
      TensorShape({6, 4}),
      {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f,
       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101});
  AddInputFromArray<float>(TensorShape({5}), {.9f, .75f, .6f, .95f, .5f});
  AddInputFromArray<int>(TensorShape({}), {30});
  AddInputFromArray<float>(TensorShape({}), {.5f});
  Status s = RunOpKernel();

  ASSERT_FALSE(s.ok());
  EXPECT_TRUE(
      StringPiece(s.ToString()).contains("scores has incompatible shape"))
      << s;
}

TEST_F(NonMaxSuppressionV2OpTest, TestInvalidIOUThreshold) {
  MakeOp();
  AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});
  AddInputFromArray<float>(TensorShape({1}), {.9f});
  AddInputFromArray<int>(TensorShape({}), {3});
  AddInputFromArray<float>(TensorShape({}), {1.2f});
  Status s = RunOpKernel();

  ASSERT_FALSE(s.ok());
  EXPECT_TRUE(
      StringPiece(s.ToString()).contains("iou_threshold must be in [0, 1]"))
      << s;
}

TEST_F(NonMaxSuppressionV2OpTest, TestEmptyInput) {
  MakeOp();
  AddInputFromArray<float>(TensorShape({0, 4}), {});
  AddInputFromArray<float>(TensorShape({0}), {});
  AddInputFromArray<int>(TensorShape({}), {30});
  AddInputFromArray<float>(TensorShape({}), {.5f});
  TF_ASSERT_OK(RunOpKernel());

  Tensor expected(allocator(), DT_INT32, TensorShape({0}));
  test::FillValues<int>(&expected, {});
  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
}

} // namespace tensorflow
@@ -89,8 +89,8 @@ static Output BuildPlaceHolderOp(const string& name, const DataType dt,
                                 const TensorShape& tensor_shape, Scope* root) {
  const Scope& scope = root->WithOpName(name);
  Node* ret;
  const string unique_name = scope.GetUniqueNameForOp("PlaceholderV2");
  NodeBuilder builder = NodeBuilder(unique_name, "PlaceholderV2")
  const string unique_name = scope.GetUniqueNameForOp("Placeholder");
  NodeBuilder builder = NodeBuilder(unique_name, "Placeholder")
                            .Attr("dtype", dt)
                            .Attr("shape", tensor_shape);
  scope.UpdateBuilder(&builder);
@@ -245,12 +245,22 @@ struct ApplyAdamNonCuda {
                  typename TTypes<T>::ConstScalar beta1,
                  typename TTypes<T>::ConstScalar beta2,
                  typename TTypes<T>::ConstScalar epsilon,
                  typename TTypes<T>::ConstFlat grad) {
                  typename TTypes<T>::ConstFlat grad, bool use_nesterov) {
    const T alpha = lr() * Eigen::numext::sqrt(T(1) - beta2_power()) /
                    (T(1) - beta1_power());
    // beta1 == μ
    // beta2 == ν
    // v == n
    // var == θ

    m.device(d) += (grad - m) * (T(1) - beta1());
    v.device(d) += (grad.square() - v) * (T(1) - beta2());
    var.device(d) -= (m * alpha) / (v.sqrt() + epsilon());
    if (use_nesterov) {
      var.device(d) -= ((grad * (T(1) - beta1()) + beta1() * m) * alpha) /
                       (v.sqrt() + epsilon());
    } else {
      var.device(d) -= (m * alpha) / (v.sqrt() + epsilon());
    }
  }
};
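For readers who prefer not to decode the Eigen expressions, here is a minimal NumPy sketch of the dense update the CPU functor above implements, with the optional Nesterov correction guarded by `use_nesterov`. Variable names mirror the kernel's arguments; this is an illustrative reconstruction, not code from the commit.

```python
import numpy as np

def apply_adam(var, m, v, grad, lr, beta1, beta2, epsilon,
               beta1_power, beta2_power, use_nesterov=False):
    """Illustrative NumPy sketch of the dense ApplyAdam update."""
    alpha = lr * np.sqrt(1 - beta2_power) / (1 - beta1_power)
    m += (grad - m) * (1 - beta1)        # m <- beta1*m + (1-beta1)*grad
    v += (grad ** 2 - v) * (1 - beta2)   # v <- beta2*v + (1-beta2)*grad^2
    if use_nesterov:
        # Nesterov-style step: combine the current gradient with the first moment.
        var -= alpha * (grad * (1 - beta1) + beta1 * m) / (np.sqrt(v) + epsilon)
    else:
        var -= alpha * m / (np.sqrt(v) + epsilon)
    return var, m, v
```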
@@ -2248,6 +2258,7 @@ class ApplyAdamOp : public OpKernel {
 public:
  explicit ApplyAdamOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_));
  }

  void Compute(OpKernelContext* ctx) override {
@@ -2318,17 +2329,18 @@ class ApplyAdamOp : public OpKernel {
                            grad.shape().DebugString()));

    const Device& device = ctx->template eigen_device<Device>();
    functor::ApplyAdam<Device, T>()(device, var.flat<T>(), m.flat<T>(),
                                    v.flat<T>(), beta1_power.scalar<T>(),
                                    beta2_power.scalar<T>(), lr.scalar<T>(),
                                    beta1.scalar<T>(), beta2.scalar<T>(),
                                    epsilon.scalar<T>(), grad.flat<T>());
    functor::ApplyAdam<Device, T>()(
        device, var.flat<T>(), m.flat<T>(), v.flat<T>(),
        beta1_power.scalar<T>(), beta2_power.scalar<T>(), lr.scalar<T>(),
        beta1.scalar<T>(), beta2.scalar<T>(), epsilon.scalar<T>(),
        grad.flat<T>(), use_nesterov_);

    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
  }

 private:
  bool use_exclusive_lock_;
  bool use_nesterov_;
};

using CPUDevice = Eigen::ThreadPoolDevice;
@@ -2372,7 +2384,7 @@ namespace functor {
      typename TTypes<T>::ConstScalar beta1, \
      typename TTypes<T>::ConstScalar beta2, \
      typename TTypes<T>::ConstScalar epsilon, \
      typename TTypes<T>::ConstFlat grad); \
      typename TTypes<T>::ConstFlat grad, bool use_nesterov); \
  extern template struct ApplyAdam<GPUDevice, T>;
DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(float);
@@ -123,7 +123,7 @@ struct ApplyAdam {
                  typename TTypes<T>::ConstScalar beta1,
                  typename TTypes<T>::ConstScalar beta2,
                  typename TTypes<T>::ConstScalar epsilon,
                  typename TTypes<T>::ConstFlat grad);
                  typename TTypes<T>::ConstFlat grad, bool use_nesterov);
};

template <typename Device, typename T>
@@ -109,7 +109,7 @@ struct ApplyAdam<GPUDevice, T> {
                  typename TTypes<T>::ConstScalar beta1,
                  typename TTypes<T>::ConstScalar beta2,
                  typename TTypes<T>::ConstScalar epsilon,
                  typename TTypes<T>::ConstFlat grad) {
                  typename TTypes<T>::ConstFlat grad, bool use_nesterov) {
    Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
    bcast[0] = grad.dimension(0);
    Eigen::Sizes<1> single;
@@ -122,11 +122,25 @@ struct ApplyAdam<GPUDevice, T> {
            v +
            (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) *
                (grad.square() - v);
    var.device(d) -= (lr * (beta2_power.constant(one) - beta2_power).sqrt() /
                      (beta1_power.constant(one) - beta1_power))
                         .reshape(single)
                         .broadcast(bcast) *
                     m / (epsilon.reshape(single).broadcast(bcast) + v.sqrt());

    if (use_nesterov) {
      var.device(d) -=
          (lr * (beta2_power.constant(one) - beta2_power).sqrt() /
           (beta1_power.constant(one) - beta1_power))
              .reshape(single)
              .broadcast(bcast) *
          (m * beta1.reshape(single).broadcast(bcast) +
           (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) *
               grad) /
          (epsilon.reshape(single).broadcast(bcast) + v.sqrt());
    } else {
      var.device(d) -= (lr * (beta2_power.constant(one) - beta2_power).sqrt() /
                        (beta1_power.constant(one) - beta1_power))
                           .reshape(single)
                           .broadcast(bcast) *
                       m /
                       (epsilon.reshape(single).broadcast(bcast) + v.sqrt());
    }
  }
};
tensorflow/core/lib/bmp/testdata/lena.bmp: new binary test file (4.1 KiB), contents not shown.
@@ -816,7 +816,7 @@ static void BM_StdVectorFillString(int iters, int len) {
  }
  testing::ItemsProcessed(int64{iters} * len);
  // The purpose of the benchmark is to verify that inlined vector is
  // efficient when moving is more efficent than copying. To do so, we
  // efficient when moving is more efficient than copying. To do so, we
  // use strings that are larger than the small string optimization.
  CHECK(!StringRepresentedInline(strings[0]));
}
@@ -563,6 +563,28 @@ compression: Compression level.
contents: 0-D. PNG-encoded image.
)doc");

// --------------------------------------------------------------------------
REGISTER_OP("DecodeBmp")
    .Input("contents: string")
    .Output("image: uint8")
    .Attr("channels: int = 0")
    .SetShapeFn(DecodeImageShapeFn)
    .Doc(R"doc(
Decode the first frame of a BMP-encoded image to a uint8 tensor.

The attr `channels` indicates the desired number of color channels for the
decoded image.

Accepted values are:

* 0: Use the number of channels in the BMP-encoded image.
* 3: output an RGB image.
* 4: output an RGBA image.

contents: 0-D. The BMP-encoded image.
image: 3-D with shape `[height, width, channels]`. RGB order
)doc");
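Assuming the new op is exposed through the usual generated Python wrapper (for example as `tf.image.decode_bmp`; the Python endpoint is not shown in this diff), decoding a BMP file would look roughly like the other decode ops:

```python
import tensorflow as tf

# Hypothetical file path; decode_bmp is assumed to be the generated Python
# wrapper for the DecodeBmp op registered above.
contents = tf.read_file("/tmp/example.bmp")
image = tf.image.decode_bmp(contents, channels=3)  # 3-D uint8, [height, width, 3]

with tf.Session() as sess:
    print(sess.run(image).shape)
```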
// --------------------------------------------------------------------------
REGISTER_OP("DecodeGif")
    .Input("contents: string")
@@ -992,16 +1014,13 @@ is agnostic to where the origin is in the coordinate system. Note that this
algorithm is invariant to orthogonal transformations and translations
of the coordinate system; thus translating or reflections of the coordinate
system result in the same boxes being selected by the algorithm.

The output of this operation is a set of integers indexing into the input
collection of bounding boxes representing the selected boxes. The bounding
box coordinates corresponding to the selected indices can then be obtained
using the `tf.gather operation`. For example:

  selected_indices = tf.image.non_max_suppression(
      boxes, scores, max_output_size, iou_threshold)
  selected_boxes = tf.gather(boxes, selected_indices)

boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
scores: A 1-D float tensor of shape `[num_boxes]` representing a single
  score corresponding to each box (each row of boxes).
@@ -1013,4 +1032,46 @@ selected_indices: A 1-D integer tensor of shape `[M]` representing the selected
  indices from the boxes tensor, where `M <= max_output_size`.
)doc");
REGISTER_OP("NonMaxSuppressionV2")
|
||||
.Input("boxes: float")
|
||||
.Input("scores: float")
|
||||
.Input("max_output_size: int32")
|
||||
.Input("iou_threshold: float")
|
||||
.Output("selected_indices: int32")
|
||||
.SetShapeFn([](InferenceContext* c) {
|
||||
c->set_output(0, c->Vector(c->UnknownDim()));
|
||||
return Status::OK();
|
||||
})
|
||||
.Doc(R"doc(
|
||||
Greedily selects a subset of bounding boxes in descending order of score,
|
||||
pruning away boxes that have high intersection-over-union (IOU) overlap
|
||||
with previously selected boxes. Bounding boxes are supplied as
|
||||
[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
|
||||
diagonal pair of box corners and the coordinates can be provided as normalized
|
||||
(i.e., lying in the interval [0, 1]) or absolute. Note that this algorithm
|
||||
is agnostic to where the origin is in the coordinate system. Note that this
|
||||
algorithm is invariant to orthogonal transformations and translations
|
||||
of the coordinate system; thus translating or reflections of the coordinate
|
||||
system result in the same boxes being selected by the algorithm.
|
||||
|
||||
The output of this operation is a set of integers indexing into the input
|
||||
collection of bounding boxes representing the selected boxes. The bounding
|
||||
box coordinates corresponding to the selected indices can then be obtained
|
||||
using the `tf.gather operation`. For example:
|
||||
|
||||
selected_indices = tf.image.non_max_suppression_v2(
|
||||
boxes, scores, max_output_size, iou_threshold)
|
||||
selected_boxes = tf.gather(boxes, selected_indices)
|
||||
|
||||
boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
|
||||
scores: A 1-D float tensor of shape `[num_boxes]` representing a single
|
||||
score corresponding to each box (each row of boxes).
|
||||
max_output_size: A scalar integer tensor representing the maximum number of
|
||||
boxes to be selected by non max suppression.
|
||||
iou_threshold: A 0-D float tensor representing the threshold for deciding whether
|
||||
boxes overlap too much with respect to IOU.
|
||||
selected_indices: A 1-D integer tensor of shape `[M]` representing the selected
|
||||
indices from the boxes tensor, where `M <= max_output_size`.
|
||||
)doc");
|
||||
|
||||
} // namespace tensorflow
|
||||
|
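As a reference for what the registered op computes, here is a small NumPy sketch of the greedy selection described in the doc string above: boxes are visited in descending score order, and a box is kept only when its IOU with every previously kept box does not exceed the threshold. This is an illustrative re-implementation, not the kernel's code; it normalizes flipped corners, which the doc explicitly permits.

```python
import numpy as np

def iou(a, b):
    """IOU of two boxes given as [y1, x1, y2, x2] (any diagonal corner pair)."""
    ay1, ax1, ay2, ax2 = min(a[0], a[2]), min(a[1], a[3]), max(a[0], a[2]), max(a[1], a[3])
    by1, bx1, by2, bx2 = min(b[0], b[2]), min(b[1], b[3]), max(b[0], b[2]), max(b[1], b[3])
    inter_h = max(0.0, min(ay2, by2) - max(ay1, by1))
    inter_w = max(0.0, min(ax2, bx2) - max(ax1, bx1))
    inter = inter_h * inter_w
    union = (ay2 - ay1) * (ax2 - ax1) + (by2 - by1) * (bx2 - bx1) - inter
    return inter / union if union > 0 else 0.0

def greedy_nms(boxes, scores, max_output_size, iou_threshold):
    """Greedy non-max suppression; returns indices into `boxes`."""
    selected = []
    for i in np.argsort(scores)[::-1]:      # highest score first
        if len(selected) >= max_output_size:
            break
        if all(iou(boxes[i], boxes[j]) <= iou_threshold for j in selected):
            selected.append(i)
    return np.array(selected, dtype=np.int32)
```

Run against the three-cluster inputs used in the tests above (scores `.9, .75, .6, .95, .5, .3`, `max_output_size=3`, threshold `0.5`), this sketch reproduces the expected output `[3, 0, 5]`.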
@@ -105,7 +105,7 @@ example_weights: a vector which contains the weight associated with each
example_labels: a vector which contains the label/target associated with each
  example.
sparse_indices: a list of vectors where each value is the indices which has
  corresponding weights in sparse_weights. This field maybe ommitted for the
  corresponding weights in sparse_weights. This field maybe omitted for the
  dense approach.
sparse_weights: a list of vectors where each value is the weight associated with
  a sparse feature group.
@@ -1004,7 +1004,7 @@ out: Same as "var".
use_locking: If `True`, updating of the var and accum tensors will be protected
  by a lock; otherwise the behavior is undefined, but may exhibit less
  contention.
use_nesterov: If `True`, the tensor passed to compute grad will be
use_nesterov: If `True`, the tensor passed to compute grad will be
  var - lr * momentum * accum, so in the end, the var you get is actually
  var - lr * momentum * accum.
)doc");
@@ -1043,7 +1043,7 @@ out: Same as "var".
use_locking: If `True`, updating of the var and accum tensors will be protected
  by a lock; otherwise the behavior is undefined, but may exhibit less
  contention.
use_nesterov: If `True`, the tensor passed to compute grad will be
use_nesterov: If `True`, the tensor passed to compute grad will be
  var - lr * momentum * accum, so in the end, the var you get is actually
  var - lr * momentum * accum.
)doc");
@@ -1075,7 +1075,7 @@ momentum: Momentum. Must be a scalar.
use_locking: If `True`, updating of the var and accum tensors will be protected
  by a lock; otherwise the behavior is undefined, but may exhibit less
  contention.
use_nesterov: If `True`, the tensor passed to compute grad will be
use_nesterov: If `True`, the tensor passed to compute grad will be
  var - lr * momentum * accum, so in the end, the var you get is actually
  var - lr * momentum * accum.
)doc");
@@ -1112,7 +1112,7 @@ momentum: Momentum. Must be a scalar.
use_locking: If `True`, updating of the var and accum tensors will be protected
  by a lock; otherwise the behavior is undefined, but may exhibit less
  contention.
use_nesterov: If `True`, the tensor passed to compute grad will be
use_nesterov: If `True`, the tensor passed to compute grad will be
  var - lr * momentum * accum, so in the end, the var you get is actually
  var - lr * momentum * accum.
)doc");
@@ -1150,6 +1150,7 @@ REGISTER_OP("ApplyAdam")
    .Output("out: Ref(T)")
    .Attr("T: numbertype")
    .Attr("use_locking: bool = false")
    .Attr("use_nesterov: bool = false")
    .SetShapeFn([](InferenceContext* c) {
      return ApplyAdamShapeFn(c, false /* sparse */);
    })
@@ -1175,6 +1176,7 @@ out: Same as "var".
use_locking: If `True`, updating of the var, m, and v tensors will be protected
  by a lock; otherwise the behavior is undefined, but may exhibit less
  contention.
use_nesterov: If `True`, uses the nesterov update.
)doc");

REGISTER_OP("ResourceApplyAdam")
@@ -1190,6 +1192,7 @@ REGISTER_OP("ResourceApplyAdam")
    .Input("grad: T")
    .Attr("T: numbertype")
    .Attr("use_locking: bool = false")
    .Attr("use_nesterov: bool = false")
    .SetShapeFn([](InferenceContext* c) {
      return ApplyAdamShapeFn(c, false /* sparse */);
    })
@@ -1214,6 +1217,7 @@ grad: The gradient.
use_locking: If `True`, updating of the var, m, and v tensors will be protected
  by a lock; otherwise the behavior is undefined, but may exhibit less
  contention.
use_nesterov: If `True`, uses the nesterov update.
)doc");

static Status ApplyRMSPropShapeFn(InferenceContext* c, bool sparse) {
@@ -67,7 +67,7 @@ constexpr char kGceTokenUrl[] =
// The authentication token scope to request.
constexpr char kOAuthScope[] = "https://www.googleapis.com/auth/cloud-platform";

// The default intial delay between retries with exponential backoff.
// The default initial delay between retries with exponential backoff.
constexpr int kInitialRetryDelayUsec = 500000;  // 0.5 sec

/// Returns whether the given path points to a readable file.
@@ -87,10 +87,11 @@ limitations under the License.
// 22. Placeholder now can specify and enforce scalar and partial
//     shapes, particularly when restoring a graph from GraphDef
//     produced at version 22 or later. (04/10/2016)
// 23. Remove NonMaxSuppression in favor of NonMaxSuppressionV2.

#define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
#define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
#define TF_GRAPH_DEF_VERSION 22
#define TF_GRAPH_DEF_VERSION 23

// Checkpoint compatibility versions (the versions field in SavedSliceMeta).
//
@@ -317,7 +317,7 @@ or `clang` available on your system. The binary PIP package installs the header
files and the library that you need to compile your op in locations that are
system specific. However, the TensorFlow python library provides the
`get_include` function to get the header directory.
Here is the output of this function on a Ubuntu machine.
Here is the output of this function on an Ubuntu machine.

```bash
$ python
@@ -1215,6 +1215,8 @@ you'll need to specify the path explicitly in the second (g++) command above.
For example, add `-L /usr/local/cuda-8.0/lib64/` if your CUDA is installed in
`/usr/local/cuda-8.0`.

> Note in some linux settings, additional options to `nvcc` compiling step are needed. Add `-D_MWAITXINTRIN_H_INCLUDED` to the `nvcc` command line to avoid errors from `mwaitxintrin.h`.

### Implement the gradient in Python {#implement-gradient}

Given a graph of ops, TensorFlow uses automatic differentiation
@@ -4,7 +4,7 @@

A selection of image classification models were tested across multiple platforms
to create a point of reference for the TensorFlow community. The
[Methodology](#methodology) section details how the test were executed and has
[Methodology](#methodology) section details how the tests were executed and has
links to the scripts used.

## Results for image classification models
@@ -355,7 +355,7 @@ ResNet-50 | distributed_replicated | n/a | True
ResNet-152 | distributed_replicated | n/a | True

To simplify server setup, EC2 instances (p2.8xlarge) running worker servers also
ran parameter servers. Equal numbers of parameter servers and work servers were
ran parameter servers. Equal numbers of parameter servers and worker servers were
used with the following exceptions:

* InceptionV3: 8 instances / 6 parameter servers
@@ -62,12 +62,12 @@ and executed in parallel. The image preprocessing ops include operations such as
image decoding, distortion, and resizing.

Once the images are through preprocessing, they are concatenated together into 8
batch size 32 tensors. Rather than use @{tf.concat} for this purpose, which is
implemented as a single op that waits for all the inputs to be ready before
concatenating them together, @{tf.parallel_stack} is used. @{tf.parallel_stack}
allocates an uninitialized tensor as an output, and each input tensor is written
to its designated portion of the output tensor as soon as the input is
available.
tensors each with a batch-size of 32. Rather than using @{tf.concat} for this
purpose, which is implemented as a single op that waits for all the inputs to be
ready before concatenating them together, @{tf.parallel_stack} is used.
@{tf.parallel_stack} allocates an uninitialized tensor as an output, and each
input tensor is written to its designated portion of the output tensor as soon
as the input is available.

When all the input tensors are finished, the output tensor is passed along in
the graph. This effectively hides all the memory latency with the long tail of
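A minimal sketch of the pattern described above, assuming a list of 32 preprocessed image tensors of identical shape: `tf.parallel_stack` fills each slice of a preallocated output as soon as that input is ready, while `tf.stack`/`tf.concat` wait for every input before producing the batch.

```python
import tensorflow as tf

# Stand-in for 32 preprocessed images of identical shape, e.g. produced by
# independent preprocessing threads.
images = [tf.random_normal([224, 224, 3]) for _ in range(32)]

# Waits for every input before producing the batched tensor.
batch_stack = tf.stack(images)               # shape [32, 224, 224, 3]

# Allocates the output up front and writes each slice as its input completes.
batch_parallel = tf.parallel_stack(images)   # shape [32, 224, 224, 3]
```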
@@ -142,7 +142,7 @@ On GPU, NCHW is faster. But on CPU, NHWC is sometimes faster.

Building a model to support both data formats keeps the model flexible and
capable of operating optimally regardless of platform. Most TensorFlow
operations used by a CNN support both NHWC and NCHW data format. The benchmark
operations used by a CNN support both NHWC and NCHW data formats. The benchmark
script was written to support both NCHW and NHWC. NCHW should always be used
when training with GPUs. NHWC is sometimes faster on CPU. A flexible model can
be trained on GPUs using NCHW with inference done on CPU using NHWC with the
@@ -23,7 +23,7 @@ straight in, feel free to look at the minimalistic implementation in
This basic example contains the code needed to download some data, train on it a
bit and visualize the result. Once you get comfortable with reading and running
the basic version, you can graduate to
[tensorflow_models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py)
[models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py)
which is a more serious implementation that showcases some more advanced
TensorFlow principles about how to efficiently use threads to move data into a
text model, how to checkpoint during training, etc.
@@ -108,7 +108,7 @@ $$

where \\(\text{score}(w_t, h)\\) computes the compatibility of word \\(w_t\\)
with the context \\(h\\) (a dot product is commonly used). We train this model
by maximizing its [log-likelihood](https://en.wikipedia.org/wiki/Likelihood_function)
by maximizing its [log-likelihood](https://en.wikipedia.org/wiki/Likelihood_function)
on the training set, i.e. by maximizing

$$
@@ -130,7 +130,7 @@ context \\(h\\), *at every training step*.

On the other hand, for feature learning in word2vec we do not need a full
probabilistic model. The CBOW and skip-gram models are instead trained using a
binary classification objective ([logistic regression](https://en.wikipedia.org/wiki/Logistic_regression))
binary classification objective ([logistic regression](https://en.wikipedia.org/wiki/Logistic_regression))
to discriminate the real target words \\(w_t\\) from \\(k\\) imaginary (noise) words \\(\tilde w\\), in the
same context. We illustrate this below for a CBOW model. For skip-gram the
direction is simply inverted.
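Concretely, the objective being maximized at each step has roughly the following form, sketched here from the surrounding description rather than copied from the diff; \\(Q_\theta(D=1 | w, h)\\) denotes the model's probability, under the embedding parameters \\(\theta\\), that word \\(w\\) came from the data rather than the noise distribution:

$$J_\text{NEG} = \log Q_\theta(D=1 | w_t, h) + k \mathop{\mathbb{E}}_{\tilde w \sim P_\text{noise}} \left[ \log Q_\theta(D = 0 | \tilde w, h) \right]$$

In practice the expectation is approximated by drawing \\(k\\) contrastive samples from the noise distribution.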
@@ -341,7 +341,7 @@ t-SNE.

Et voila! As expected, words that are similar end up clustering nearby each
other. For a more heavyweight implementation of word2vec that showcases more of
the advanced features of TensorFlow, see the implementation in
[tensorflow_models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py).
[models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py).

## Evaluating Embeddings: Analogical Reasoning

@@ -357,7 +357,7 @@ Download the dataset for this task from

To see how we do this evaluation, have a look at the `build_eval_graph()` and
`eval()` functions in
[tensorflow_models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py).
[models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py).

The choice of hyperparameters can strongly influence the accuracy on this task.
To achieve state-of-the-art performance on this task requires training over a
@@ -385,13 +385,13 @@ your model is seriously bottlenecked on input data, you may want to implement a
custom data reader for your problem, as described in
@{$new_data_formats$New Data Formats}. For the case of Skip-Gram
modeling, we've actually already done this for you as an example in
[tensorflow_models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py).
[models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py).

If your model is no longer I/O bound but you want still more performance, you
can take things further by writing your own TensorFlow Ops, as described in
@{$adding_an_op$Adding a New Op}. Again we've provided an
example of this for the Skip-Gram case
[tensorflow_models/tutorials/embedding/word2vec_optimized.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec_optimized.py).
[models/tutorials/embedding/word2vec_optimized.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec_optimized.py).
Feel free to benchmark these against each other to measure performance
improvements at each stage.
@@ -28,9 +28,9 @@ on API >= 14 devices.
   using Deep Neural Networks](https://arxiv.org/abs/1312.2249) to
   localize and track people in the camera preview in real-time.
3. [TF Stylize](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java):
   Uses a model based on [A Learned Representation For Artistic Style]
   (https://arxiv.org/abs/1610.07629) to restyle the camera preview image
   to that of a number of different artists.
   Uses a model based on [A Learned Representation For Artistic
   Style](https://arxiv.org/abs/1610.07629) to restyle the camera preview
   image to that of a number of different artists.

<img src="sample_images/classify1.jpg" width="30%"><img src="sample_images/stylize1.jpg" width="30%"><img src="sample_images/detect1.jpg" width="30%">
@@ -12,12 +12,32 @@ cc_binary(
    srcs = [
        "main.cc",
    ],
    linkopts = ["-lm"],
    deps = [
        "//tensorflow/cc:cc_ops",
        "//tensorflow/core:framework_internal",
        "//tensorflow/core:tensorflow",
    ],
    linkopts = select({
        "//tensorflow:android": [
            "-pie",
            "-landroid",
            "-ljnigraphics",
            "-llog",
            "-lm",
            "-z defs",
            "-s",
            "-Wl,--exclude-libs,ALL",
        ],
        "//conditions:default": ["-lm"],
    }),
    deps = select({
        "//tensorflow:android": [
            # cc:cc_ops is used to include image ops (for label_image)
            # Jpg, gif, and png related code won't be included
            "//tensorflow/cc:cc_ops",
            "//tensorflow/core:android_tensorflow_lib",
        ],
        "//conditions:default": [
            "//tensorflow/cc:cc_ops",
            "//tensorflow/core:framework_internal",
            "//tensorflow/core:tensorflow",
        ],
    }),
)

filegroup(
@@ -30,6 +30,9 @@ limitations under the License.
// the top of the main() function.
//
// The googlenet_graph.pb file included by default is created from Inception.
//
// Note that, for GIF inputs, to reuse existing code, only single-frame ones
// are supported.

#include <fstream>
#include <utility>
@@ -103,7 +106,12 @@ Status ReadTensorFromImageFile(const string& file_name, const int input_height,
    image_reader = DecodePng(root.WithOpName("png_reader"), file_reader,
                             DecodePng::Channels(wanted_channels));
  } else if (tensorflow::StringPiece(file_name).ends_with(".gif")) {
    image_reader = DecodeGif(root.WithOpName("gif_reader"), file_reader);
    // gif decoder returns 4-D tensor, remove the first dim
    image_reader =
        Squeeze(root.WithOpName("squeeze_first_dim"),
                DecodeGif(root.WithOpName("gif_reader"), file_reader));
  } else if (tensorflow::StringPiece(file_name).ends_with(".bmp")) {
    image_reader = DecodeBmp(root.WithOpName("bmp_reader"), file_reader);
  } else {
    // Assume if it's neither a PNG nor a GIF then it must be a JPEG.
    image_reader = DecodeJpeg(root.WithOpName("jpeg_reader"), file_reader,
@@ -20,7 +20,7 @@ package org.tensorflow;
 *
 * <p>Instances of a Graph are thread-safe.
 *
 * <p><b>WARNING:</b> Resources consumed by the Graph object msut be explicitly freed by invoking
 * <p><b>WARNING:</b> Resources consumed by the Graph object must be explicitly freed by invoking
 * the {@link #close()} method then the Graph object is no longer needed.
 */
public final class Graph implements AutoCloseable {
@@ -73,6 +73,29 @@ public final class OperationBuilder {
    return this;
  }

  /**
   * Ensure that the operation does not execute before the control operation does.
   *
   * <p>A control input is an Operation that must be executed before running the operation currently
   * being built.
   *
   * <p>For example, an Assert operation may be added as a control input for this operation. The
   * Assert now behaves as a pre-condition that will always verify itself before running the
   * operation.
   *
   * @param control operation that must be executed before running this operation.
   * @return the OperationBuilder instance for chaining.
   */
  public OperationBuilder addControlInput(Operation control) {
    Graph.Reference r = graph.ref();
    try {
      addControlInput(unsafeNativeHandle, control.getUnsafeNativeHandle());
    } finally {
      r.close();
    }
    return this;
  }
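At the Python level the same idea is usually expressed with `tf.control_dependencies`; a rough sketch of the Assert-as-precondition example from the Javadoc above (illustrative only, not part of this change):

```python
import tensorflow as tf

x = tf.placeholder(tf.float32, shape=[None])
# The Assert op acts as the control input / precondition.
check = tf.Assert(tf.reduce_all(x >= 0), [x], name="x_is_non_negative")

# `y` will not run until `check` has run (and passed).
with tf.control_dependencies([check]):
    y = tf.sqrt(x)
```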
  public OperationBuilder addInputList(Output[] inputs) {
    Graph.Reference r = graph.ref();
    try {
@@ -244,6 +267,8 @@ public final class OperationBuilder {

  private static native void addInputList(long handle, long[] opHandles, int[] indices);

  private static native void addControlInput(long handle, long opHandle);

  private static native void setDevice(long handle, String device);

  // The names of all the setAttr* family functions below correspond to the C library types, not the
@@ -115,6 +115,20 @@ JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_addInputList(
  TF_AddInputList(d, o.get(), n);
}

JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_addControlInput(
    JNIEnv* env, jclass clazz, jlong handle, jlong op_handle) {
  if (op_handle == 0) {
    throwException(env, kIllegalStateException,
                   "control input is not valid, "
                   "perhaps the Graph containing it has been closed()?");
    return;
  }
  TF_Operation* control = reinterpret_cast<TF_Operation*>(op_handle);
  TF_OperationDescription* d = requireHandle(env, handle);
  if (d == nullptr) return;
  TF_AddControlInput(d, control);
}

JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setDevice(
    JNIEnv* env, jclass clazz, jlong handle, jstring device) {
  TF_OperationDescription* d = requireHandle(env, handle);
@@ -55,6 +55,14 @@ JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_addInput(
JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_addInputList(
    JNIEnv *, jclass, jlong, jlongArray, jintArray);

/*
 * Class: org_tensorflow_OperationBuilder
 * Method: addControlInput
 * Signature: (JJ)V
 */
JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_addControlInput(
    JNIEnv *, jclass, jlong, jlong);

/*
 * Class: org_tensorflow_OperationBuilder
 * Method: setDevice
Some files were not shown because too many files have changed in this diff.