Merge changes from github.

PiperOrigin-RevId: 186674197
Yifei Feng 2018-02-22 14:24:57 -08:00 committed by TensorFlower Gardener
parent cb7e1963c6
commit dce9a49c19
148 changed files with 7985 additions and 710 deletions

View File

@ -21,7 +21,7 @@ newcomers.
* Other:
* Add `tf.contrib.distributions.Kumaraswamy`.
* `RetryingFileSystem::FlushCaches()` calls the base FileSystem's `FlushCaches()`.
* Add auto_correlation to distributions.
* Add `auto_correlation` to distributions.
* Add `tf.contrib.distributions.Autoregressive`.
* Add SeparableConv1D layer.
* Add convolutional Flipout layers.
@ -31,12 +31,12 @@ newcomers.
* Output variance over tree predictions for classification tasks.
* For `pt` and `eval` commands, allow writing tensor values to filesystem as numpy files.
* gRPC: Propagate truncated errors (instead of returning gRPC internal error).
* Augment parallel_interleave to support 2 kinds of prefetching.
* Augment `parallel_interleave` to support 2 kinds of prefetching.
* Improved XLA support for C64-related ops: log, pow, atan2, tanh.
* Add probabilistic convolutional layers.
## API Changes
* Introducing prepare_variance boolean with default setting to False for backward compatibility.
* Introducing `prepare_variance` boolean with default setting to False for backward compatibility.
* Move `layers_dense_variational_impl.py` to `layers_dense_variational.py`.
## Known Bugs
@ -96,27 +96,6 @@ Yoni Tsafir, yordun, Yuan (Terry) Tang, Yuxin Wu, zhengdi, Zhengsheng Wei, 田
* Starting from 1.6 release, our prebuilt binaries will use AVX instructions.
This may break TF on older CPUs.
## Known Bugs
* Using XLA:GPU with CUDA 9 and CUDA 9.1 results in garbage results and/or
`CUDA_ILLEGAL_ADDRESS` failures.
Google discovered in mid-December 2017 that the PTX-to-SASS compiler in CUDA 9
and CUDA 9.1 sometimes does not properly compute the carry bit when
decomposing 64-bit address calculations with large offsets (e.g. `load [x +
large_constant]`) into 32-bit arithmetic in SASS.
As a result, these versions of `ptxas` miscompile most XLA programs which use
more than 4GB of temp memory. This results in garbage results and/or
`CUDA_ERROR_ILLEGAL_ADDRESS` failures.
A fix in CUDA 9.1.121 is expected in late February 2018. We do not expect a
fix for CUDA 9.0.x. Until the fix is available, the only workaround is to
[downgrade](https://developer.nvidia.com/cuda-toolkit-archive) to CUDA 8.0.x
or disable XLA:GPU.
TensorFlow will print a warning if you use XLA:GPU with a known-bad version of
CUDA; see e00ba24c4038e7644da417ddc639169b6ea59122.
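For context, XLA:GPU is opt-in in TF 1.x, so "disable XLA:GPU" mostly means not turning it on. Below is a minimal sketch (not from this commit) of how the global XLA JIT is typically enabled through the session config; `tf.OptimizerOptions.ON_1` and the config fields shown are the standard TF 1.x names, and leaving that line out keeps XLA:GPU off on an affected CUDA 9.x toolchain.
```python
# Minimal sketch (not from this commit) of how XLA JIT is typically enabled in
# TF 1.x. Omitting the global_jit_level line (the default) keeps XLA:GPU off,
# which is the workaround suggested above for the affected CUDA 9.x toolchains.
import tensorflow as tf

config = tf.ConfigProto()
config.graph_options.optimizer_options.global_jit_level = (
    tf.OptimizerOptions.ON_1)  # opt in to XLA; skip this to stay on the safe path

with tf.Session(config=config) as sess:
    x = tf.random_normal([4, 4])
    print(sess.run(tf.matmul(x, x)))
```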
## Major Features And Improvements
* [Eager execution](https://github.com/tensorflow/tensorflow/tree/r1.5/tensorflow/contrib/eager)
preview version is now available.

View File

@ -445,7 +445,7 @@ def convert_version_to_int(version):
def check_bazel_version(min_version):
"""Check installed bezel version is at least min_version.
"""Check installed bazel version is at least min_version.
Args:
min_version: string for minimum bazel version.
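As a rough illustration of what `check_bazel_version` and `convert_version_to_int` are doing, here is a hypothetical sketch (the function names below are illustrative, not the actual configure.py code) of packing a dotted version string into an integer so versions compare numerically.
```python
# Hypothetical sketch of the idea behind convert_version_to_int() /
# check_bazel_version(): pack each dotted component into three digits so that
# version strings compare numerically. Not the actual configure.py code.
def convert_version_to_int_sketch(version):
    parts = (version.split('-')[0].split('.') + ['0', '0', '0'])[:3]
    return int(''.join(p.zfill(3) for p in parts))  # "0.11.1" -> 11001

def is_at_least(installed, min_version):
    return (convert_version_to_int_sketch(installed) >=
            convert_version_to_int_sketch(min_version))

assert is_at_least('0.11.1', '0.10.0')
assert not is_at_least('0.9.0', '0.10.0')
```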
@ -1078,12 +1078,22 @@ def set_tf_tensorrt_install_path(environ_cp):
break
# Reset and Retry
print('Invalid path to TensorRT. None of the following files can be found:')
print(trt_install_path)
print(os.path.join(trt_install_path, 'lib'))
print(os.path.join(trt_install_path, 'lib64'))
if search_result:
print(libnvinfer_path_from_ldconfig)
if possible_files:
print('TensorRT libraries found in one of the following directories',
'are not compatible with selected cuda and cudnn installations')
print(trt_install_path)
print(os.path.join(trt_install_path, 'lib'))
print(os.path.join(trt_install_path, 'lib64'))
if search_result:
print(libnvinfer_path_from_ldconfig)
else:
print(
'Invalid path to TensorRT. None of the following files can be found:')
print(trt_install_path)
print(os.path.join(trt_install_path, 'lib'))
print(os.path.join(trt_install_path, 'lib64'))
if search_result:
print(libnvinfer_path_from_ldconfig)
else:
raise UserInputError('Invalid TF_TENSORRT setting was provided %d '
@ -1481,7 +1491,6 @@ def main():
'more details.')
config_info_line('mkl', 'Build with MKL support.')
config_info_line('monolithic', 'Config for mostly static monolithic build.')
config_info_line('tensorrt', 'Build with TensorRT support.')
if __name__ == '__main__':
main()

View File

@ -2081,7 +2081,7 @@ TEST_F(CApiAttributesTest, Tensor) {
}
TEST_F(CApiAttributesTest, StringTensor) {
// Create the string-Tensor "atttribute" value.
// Create the string-Tensor "attribute" value.
char encoded[] = {
0, 0, 0, 0, 0, 0, 0, 0, // array[uint64] offsets
1, // varint encoded string length

View File

@ -234,7 +234,8 @@ Status Literal::CopySliceFromInternal(
int64 src_index = linear_index(src_literal.shape(), src_indexes);
int64 dest_index = linear_index(shape(), dest_indexes);
StridedCopy(data<NativeT>(), dest_index, stride_config.dest_stride,
// `this->` is needed to work around MSVC bug #16882
StridedCopy(this->data<NativeT>(), dest_index, stride_config.dest_stride,
src_literal.data<NativeT>(), src_index,
stride_config.source_stride, stride_config.minor_loop_size);
return true;

View File

@ -589,12 +589,9 @@ class HloInstruction {
if (opcode() != other.opcode()) {
return false;
}
auto eq_shapes = layout_sensitive
? [](const Shape& a,
const Shape& b) { return ShapeUtil::Equal(a, b); }
: [](const Shape& a, const Shape& b) {
return ShapeUtil::Compatible(a, b);
};
using EqShapeFuncType = bool (*)(const Shape&, const Shape&);
EqShapeFuncType eq_shapes =
layout_sensitive ? ShapeUtil::Equal : ShapeUtil::Compatible;
if (!eq_shapes(shape(), other.shape())) {
return false;
}

View File

@ -7,6 +7,7 @@ package(default_visibility = ["//tensorflow:__subpackages__"])
load("//third_party/mpi:mpi.bzl", "if_mpi")
load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt")
py_library(
name = "contrib_py",
@ -107,7 +108,9 @@ py_library(
"//tensorflow/contrib/training:training_py",
"//tensorflow/contrib/util:util_py",
"//tensorflow/python:util",
] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]),
] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]) + if_tensorrt([
"//tensorflow/contrib/tensorrt:init_py",
]),
)
cc_library(

View File

@ -81,6 +81,11 @@ For documentation on building a self-contained AAR file with cmake, see
[tensorflow/contrib/android/cmake](cmake).
### Makefile
For documentation on building native TF libraries with make, including a CUDA-enabled variant for devices like the Nvidia Shield TV, see [tensorflow/contrib/makefile/README.md](../makefile/README.md)
## AssetManagerFileSystem
This directory also contains a TensorFlow filesystem supporting the Android

View File

@ -78,7 +78,7 @@ def per_example_maxent_loss(labels, weights, logits, num_classes, eps=1e-15):
# Calculate softmax probabilities for each class.
unnormalized_probs = math_ops.exp(logits)
normalizers = math_ops.reduce_sum(unnormalized_probs, 1, keep_dims=True)
normalizers = math_ops.reduce_sum(unnormalized_probs, 1, keepdims=True)
softmax_predictions = math_ops.divide(unnormalized_probs,
math_ops.add(normalizers, eps))
@ -120,7 +120,7 @@ def per_example_squared_loss(labels, weights, predictions):
update_op: An update operation to update the loss's internal state.
"""
unweighted_loss = math_ops.reduce_sum(
math_ops.square(predictions - labels), 1, keep_dims=True)
math_ops.square(predictions - labels), 1, keepdims=True)
return unweighted_loss * weights, control_flow_ops.no_op()
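The `keep_dims` to `keepdims` change repeated throughout this commit is a keyword rename only; the behaviour is unchanged. A small illustration (not part of the change, assuming only stock `tf.reduce_sum`) of why these reductions keep the reduced axis: the `[batch, 1]` normalizer broadcasts cleanly against the `[batch, num_classes]` tensor.
```python
# Small illustration (not part of this change) of why the reductions above keep
# the reduced axis: the [batch, 1] normalizer broadcasts against [batch, classes].
import tensorflow as tf

logits = tf.constant([[1.0, 2.0, 3.0],
                      [0.5, 0.5, 0.5]])
unnormalized = tf.exp(logits)                                # [2, 3]
normalizers = tf.reduce_sum(unnormalized, 1, keepdims=True)  # [2, 1]
softmax = unnormalized / normalizers                         # broadcast to [2, 3]

with tf.Session() as sess:
    print(sess.run(softmax))  # each row sums to 1
```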

View File

@ -52,6 +52,7 @@ if (NOT WIN32)
# for targets that link ${CMAKE_THREAD_LIBS_INIT}.
find_package (Threads)
# Options for linking CUDA/CUDNN libraries
option(tensorflow_PATH_STATIC_LIB "Additional library search path for libcudnn_static.a, libnccl_static.a, libculibos.a" /usr/local/cuda/lib64/)
option(tensorflow_CUDNN_INCLUDE "cudnn.h header install path" /usr/include/)
if (NOT tensorflow_CUDNN_INCLUDE)
@ -73,6 +74,14 @@ if (NOT WIN32)
# option's default value is OFF. Fill it with real default values
set(tensorflow_CUDA_LIBRARY_PATH /usr/local/cuda/lib64)
endif (NOT tensorflow_CUDA_LIBRARY_PATH)
# Options for linking other libraries
option(systemlib_ZLIB "Use the system installed library as shared objects instead of downloading ZLIB and statically linking to it: ZLIB" OFF)
option(systemlib_ALL "Turn on every possible systemlib_* options" OFF)
if (systemlib_ALL)
set (systemlib_ZLIB ON)
endif (systemlib_ALL)
endif()
if (WIN32)
@ -188,8 +197,10 @@ if (tensorflow_BUILD_CC_TESTS)
include(googletest)
endif()
add_definitions(${ADD_CFLAGS})
link_directories(${ADD_LINK_DIRECTORY})
set(tensorflow_EXTERNAL_LIBRARIES
${zlib_STATIC_LIBRARIES}
${gif_STATIC_LIBRARIES}
${png_STATIC_LIBRARIES}
${jpeg_STATIC_LIBRARIES}
@ -203,6 +214,15 @@ set(tensorflow_EXTERNAL_LIBRARIES
${re2_STATIC_LIBRARIES}
${sqlite_STATIC_LIBRARIES}
)
if (systemlib_ZLIB)
set(tensorflow_EXTERNAL_LIBRARIES ${tensorflow_EXTERNAL_LIBRARIES}
${ZLIB_LIBRARIES})
else (systemlib_ZLIB)
set(tensorflow_EXTERNAL_LIBRARIES ${tensorflow_EXTERNAL_LIBRARIES}
${zlib_STATIC_LIBRARIES})
endif (systemlib_ZLIB)
set(tensorflow_EXTERNAL_DEPENDENCIES
zlib_copy_headers_to_destination
gif_copy_headers_to_destination

View File

@ -12,61 +12,75 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
include (ExternalProject)
if (systemlib_ZLIB)
find_package(PkgConfig)
pkg_search_module(ZLIB REQUIRED zlib)
set(zlib_INCLUDE_DIR ${ZLIB_INCLUDE_DIRS})
set(ADD_LINK_DIRECTORY ${ADD_LINK_DIRECTORY} ${ZLIB_LIBRARY_DIRS})
set(ADD_CFLAGS ${ADD_CFLAGS} ${ZLIB_CFLAGS_OTHER})
set(zlib_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/zlib_archive)
set(ZLIB_URL https://github.com/madler/zlib)
set(ZLIB_BUILD ${CMAKE_CURRENT_BINARY_DIR}/zlib/src/zlib)
set(ZLIB_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/zlib/install)
set(ZLIB_TAG 50893291621658f355bc5b4d450a8d06a563053d)
# To meet DEPENDS zlib from other projects.
# If we hit this line, zlib is already built and installed to the system.
add_custom_target(zlib)
add_custom_target(zlib_copy_headers_to_destination)
if(WIN32)
if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
set(zlib_STATIC_LIBRARIES
debug ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstaticd.lib
optimized ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
else()
if(CMAKE_BUILD_TYPE EQUAL Debug)
else (systemlib_ZLIB)
include (ExternalProject)
set(zlib_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/zlib_archive)
set(ZLIB_URL https://github.com/madler/zlib)
set(ZLIB_BUILD ${CMAKE_CURRENT_BINARY_DIR}/zlib/src/zlib)
set(ZLIB_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/zlib/install)
set(ZLIB_TAG 50893291621658f355bc5b4d450a8d06a563053d)
if(WIN32)
if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
set(zlib_STATIC_LIBRARIES
${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstaticd.lib)
debug ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstaticd.lib
optimized ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
else()
set(zlib_STATIC_LIBRARIES
${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
if(CMAKE_BUILD_TYPE EQUAL Debug)
set(zlib_STATIC_LIBRARIES
${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstaticd.lib)
else()
set(zlib_STATIC_LIBRARIES
${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
endif()
endif()
else()
set(zlib_STATIC_LIBRARIES
${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/libz.a)
endif()
else()
set(zlib_STATIC_LIBRARIES
${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/libz.a)
endif()
set(ZLIB_HEADERS
"${ZLIB_INSTALL}/include/zconf.h"
"${ZLIB_INSTALL}/include/zlib.h"
)
set(ZLIB_HEADERS
"${ZLIB_INSTALL}/include/zconf.h"
"${ZLIB_INSTALL}/include/zlib.h"
)
ExternalProject_Add(zlib
PREFIX zlib
GIT_REPOSITORY ${ZLIB_URL}
GIT_TAG ${ZLIB_TAG}
INSTALL_DIR ${ZLIB_INSTALL}
BUILD_IN_SOURCE 1
BUILD_BYPRODUCTS ${zlib_STATIC_LIBRARIES}
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
CMAKE_CACHE_ARGS
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_INSTALL_PREFIX:STRING=${ZLIB_INSTALL}
)
ExternalProject_Add(zlib
PREFIX zlib
GIT_REPOSITORY ${ZLIB_URL}
GIT_TAG ${ZLIB_TAG}
INSTALL_DIR ${ZLIB_INSTALL}
BUILD_IN_SOURCE 1
BUILD_BYPRODUCTS ${zlib_STATIC_LIBRARIES}
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
CMAKE_CACHE_ARGS
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_INSTALL_PREFIX:STRING=${ZLIB_INSTALL}
)
# put zlib includes in the directory where they are expected
add_custom_target(zlib_create_destination_dir
COMMAND ${CMAKE_COMMAND} -E make_directory ${zlib_INCLUDE_DIR}
DEPENDS zlib)
# put zlib includes in the directory where they are expected
add_custom_target(zlib_create_destination_dir
COMMAND ${CMAKE_COMMAND} -E make_directory ${zlib_INCLUDE_DIR}
DEPENDS zlib)
add_custom_target(zlib_copy_headers_to_destination
DEPENDS zlib_create_destination_dir)
add_custom_target(zlib_copy_headers_to_destination
DEPENDS zlib_create_destination_dir)
foreach(header_file ${ZLIB_HEADERS})
add_custom_command(TARGET zlib_copy_headers_to_destination PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${header_file} ${zlib_INCLUDE_DIR})
endforeach()
foreach(header_file ${ZLIB_HEADERS})
add_custom_command(TARGET zlib_copy_headers_to_destination PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${header_file} ${zlib_INCLUDE_DIR})
endforeach()
endif (systemlib_ZLIB)

View File

@ -413,6 +413,9 @@ tensorflow/contrib/tensorboard
tensorflow/contrib/tensorboard/plugins
tensorflow/contrib/tensorboard/plugins/projector
tensorflow/contrib/tensorboard/plugins/trace
# TODO(sami): Add cmake implementations.
# tensorflow/contrib/tensorrt/python
# tensorflow/contrib/tensorrt/python/ops
tensorflow/contrib/tensor_forest
tensorflow/contrib/tensor_forest/client
tensorflow/contrib/tensor_forest/hybrid

View File

@ -105,8 +105,8 @@ def crf_sequence_score(inputs, tag_indices, sequence_lengths,
return utils.smart_cond(
pred=math_ops.equal(inputs.shape[1].value or array_ops.shape(inputs)[1],
1),
fn1=_single_seq_fn,
fn2=_multi_seq_fn)
true_fn=_single_seq_fn,
false_fn=_multi_seq_fn)
def crf_log_norm(inputs, sequence_lengths, transition_params):
@ -511,7 +511,7 @@ def crf_decode(potentials, transition_params, sequence_length):
return decode_tags, best_score
return utils.smart_cond(
pred=math_ops.equal(
potentials.shape[1].value or array_ops.shape(potentials)[1], 1),
fn1=_single_seq_fn,
fn2=_multi_seq_fn)
pred=math_ops.equal(potentials.shape[1].value or
array_ops.shape(potentials)[1], 1),
true_fn=_single_seq_fn,
false_fn=_multi_seq_fn)
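The rename from `fn1`/`fn2` to `true_fn`/`false_fn` aligns `smart_cond` with `tf.cond`'s keyword names. A hedged sketch of the call shape follows (this commit also re-exports `smart_cond` from `tf.contrib.framework`); when the predicate is statically known, `smart_cond` calls the chosen function directly instead of building a `tf.cond`.
```python
# Hedged sketch of the renamed keyword arguments. smart_cond takes callables,
# mirrors tf.cond's true_fn/false_fn names, and short-circuits to a plain
# Python call when the predicate is statically known.
import tensorflow as tf
from tensorflow.contrib.framework import smart_cond  # re-exported by this commit

x = tf.placeholder(tf.float32, shape=[None, 3])

result = smart_cond(
    pred=tf.equal(tf.shape(x)[0], 1),  # dynamic predicate -> builds a tf.cond
    true_fn=lambda: x * 2.0,
    false_fn=lambda: x - 1.0)
```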

View File

@ -278,7 +278,7 @@ class ExpRelaxedOneHotCategorical(distribution.Distribution):
* math_ops.log(self.temperature))
# compute the unnormalized density
log_softmax = nn_ops.log_softmax(logits_2d - x_2d * self._temperature_2d)
log_unnorm_prob = math_ops.reduce_sum(log_softmax, [-1], keep_dims=False)
log_unnorm_prob = math_ops.reduce_sum(log_softmax, [-1], keepdims=False)
# combine unnormalized density with normalization constant
log_prob = log_norm_const + log_unnorm_prob
# Reshapes log_prob to be consistent with shape of user-supplied logits

View File

@ -22,11 +22,10 @@ to models defined without using eager execution.
Eager execution is included in TensorFlow versions 1.5 and above.
Installation instructions at https://www.tensorflow.org/install/
The contents of this guide are compatible with TensorFlow 1.5.
However, if you run into bugs that are fixed in source but not the
release, you may want to either either [building from
source](https://www.tensorflow.org/install/install_sources)
or the try latest nightly builds. The nightly builds are available as:
The contents of this guide are compatible with TensorFlow 1.5. However, if you
run into bugs that are fixed in source but not the release, you may want to
either [build from source](https://www.tensorflow.org/install/install_sources)
or try a nightly build. The nightly builds are available as:
- [`pip` packages](https://github.com/tensorflow/tensorflow/blob/master/README.md#installation) and

View File

@ -192,11 +192,11 @@ class KMeans(object):
# Computes Euclidean distance. Note the first and third terms are
# broadcast additions.
squared_distance = (
math_ops.reduce_sum(math_ops.square(inp), 1, keep_dims=True) -
math_ops.reduce_sum(math_ops.square(inp), 1, keepdims=True) -
2 * math_ops.matmul(inp, clusters, transpose_b=True) +
array_ops.transpose(
math_ops.reduce_sum(
math_ops.square(clusters), 1, keep_dims=True)))
math_ops.square(clusters), 1, keepdims=True)))
output.append(squared_distance)
return output
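The comment above relies on the identity ||x − c||² = ||x||² − 2·x·c + ||c||², with the two squared-norm terms broadcast across rows and columns. A quick numpy check of that expansion (illustrative only, not part of the change):
```python
# Quick numpy check (illustrative only) of the expansion used above:
# ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2, with the squared norms broadcast
# across rows/columns so no explicit pairwise loop is needed.
import numpy as np

inp = np.random.randn(5, 3)       # 5 points
clusters = np.random.randn(4, 3)  # 4 cluster centers

expanded = (np.sum(inp ** 2, axis=1, keepdims=True)            # [5, 1]
            - 2.0 * inp.dot(clusters.T)                        # [5, 4]
            + np.sum(clusters ** 2, axis=1, keepdims=True).T)  # [1, 4]

brute_force = np.array([[np.sum((p - c) ** 2) for c in clusters] for p in inp])
assert np.allclose(expanded, brute_force)
```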

View File

@ -85,6 +85,8 @@ See the @{$python/contrib.framework} guide.
@@py_func
@@sort
@@get_placeholders
@@CriticalSection
@@BoundedTensorSpec
@ -102,10 +104,10 @@ from tensorflow.contrib.framework.python.ops import *
from tensorflow.python.framework.ops import prepend_name_scope
from tensorflow.python.framework.ops import strip_name_scope
from tensorflow.python.framework.tensor_spec import BoundedTensorSpec
from tensorflow.python.framework.tensor_spec import TensorSpec
from tensorflow.python.ops.control_flow_ops import smart_cond
from tensorflow.python.ops.control_flow_ops import smart_constant_value
from tensorflow.python.util.all_util import remove_undocumented
_allowed_symbols = ['nest']

View File

@ -133,6 +133,18 @@ def fuse_op(graph_def, input_nodes, output_nodes, output_dtypes,
def get_placeholders(graph):
"""Get placeholders of a graph.
For example:
```python
a = tf.placeholder(dtype=tf.float32, shape=[2, 2], name='a')
a = tf.placeholder(dtype=tf.int32, shape=[3, 2], name='b')
tf.contrib.framework.get_placeholders(tf.get_default_graph())
# Returns:
# [<tf.Tensor 'a:0' shape=(2, 2) dtype=float32>,
# <tf.Tensor 'b:0' shape=(3, 2) dtype=int32>]
```
Args:
graph: A tf.Graph.
Returns:

View File

@ -85,7 +85,7 @@ class BipartiteMatchOp : public OpKernel {
context->allocate_output(1, TensorShape({num_input_columns}),
&column_to_row_match_indices));
typename TTypes<float, 2>::ConstTensor distance_mat =
TTypes<float, 2>::ConstTensor distance_mat =
input_distance_mat.shaped<float, 2>(
{num_input_rows, num_input_columns});

View File

@ -517,8 +517,8 @@ def batch_norm(inputs,
then the batch normalization uses weighted mean and
variance. (This can be used to correct for bias in training
example selection.)
fused: if `True`, use a faster, fused implementation if possible.
If `None`, use the system recommended implementation.
fused: if `None` or `True`, use a faster, fused implementation if possible.
If `False`, use the system recommended implementation.
data_format: A string. `NHWC` (default) and `NCHW` are supported.
zero_debias_moving_mean: Use zero_debias for moving_mean. It creates a new
pair of variables 'moving_mean/biased' and 'moving_mean/local_step'.
@ -778,7 +778,7 @@ def batch_norm(inputs,
else:
if data_format == DATA_FORMAT_NCHW:
mean, variance = nn.weighted_moments(
inputs, moments_axes, batch_weights, keep_dims=True)
inputs, moments_axes, batch_weights, keepdims=True)
mean = array_ops.reshape(mean, [-1])
variance = array_ops.reshape(variance, [-1])
else:
@ -2836,9 +2836,9 @@ def spatial_softmax(features,
softmax_attention = nn.softmax(features / temperature)
expected_x = math_ops.reduce_sum(
pos_x * softmax_attention, [1], keep_dims=True)
pos_x * softmax_attention, [1], keepdims=True)
expected_y = math_ops.reduce_sum(
pos_y * softmax_attention, [1], keep_dims=True)
pos_y * softmax_attention, [1], keepdims=True)
expected_xy = array_ops.concat([expected_x, expected_y], 1)
feature_keypoints = array_ops.reshape(expected_xy,
[-1, num_channels.value * 2])
@ -3018,7 +3018,7 @@ def poincare_normalize(x, axis=1, epsilon=1e-5, name=None):
"""
with ops.name_scope(name, 'poincare_normalize', [x]) as name:
x = ops.convert_to_tensor(x, name='x')
square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keep_dims=True)
square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keepdims=True)
x_inv_norm = math_ops.rsqrt(square_sum)
x_inv_norm = math_ops.minimum((1. - epsilon) * x_inv_norm, 1.)
return math_ops.multiply(x, x_inv_norm, name=name)
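As a plain restatement of `poincare_normalize` above (illustrative numpy only, not the TF implementation): vectors are rescaled so their norm along the chosen axis is at most `1 - epsilon`, i.e. projected into the open unit ball used by Poincaré embeddings, while vectors already inside the ball are left untouched.
```python
# Illustrative numpy restatement (not the TF implementation): rescale vectors so
# their norm along `axis` is at most (1 - epsilon), i.e. project them into the
# open unit ball used by Poincare embeddings; shorter vectors are left as-is.
import numpy as np

def poincare_normalize_np(x, axis=1, epsilon=1e-5):
    square_sum = np.sum(np.square(x), axis=axis, keepdims=True)
    x_inv_norm = 1.0 / np.sqrt(square_sum)
    x_inv_norm = np.minimum((1.0 - epsilon) * x_inv_norm, 1.0)
    return x * x_inv_norm

v = np.array([[3.0, 4.0], [0.1, 0.2]])
out = poincare_normalize_np(v)
# Row 0 (norm 5.0) is scaled down to norm ~0.99999; row 1 (norm ~0.22) is unchanged.
```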

View File

@ -6,7 +6,7 @@ TensorFlow Lite uses many techniques for achieving low latency like optimizing t
![image](g3doc/TFLite-Architecture.jpg)
# Getting Started with an Android Demo App
This section contains an example application using TensorFlow Lite for Android devices. The demo is a sample camera app that classifies images continuously using a quantized Mobilenet model. A device running Android 5.0 ( API 21) or higher is required to run the demo.
This section contains an example application using TensorFlow Lite for Android devices. The demo is a sample camera app that classifies images continuously using either a quantized Mobilenet model or a floating point Inception-v3 model. A device running Android 5.0 ( API 21) or higher is required to run the demo.
There are 3 ways to get the demo app to your device
- Download the prebuilt binary or
@ -29,9 +29,16 @@ The simplest way to compile the demo app, and try out changes to the project cod
- Make sure the Android SDK version is greater than 26 and NDK version is greater than 14 (in the Android Studio Settings).
- Import the `tensorflow/contrib/lite/java/demo` directory as a new Android Studio project.
- Click through installing all the Gradle extensions it requests.
- Download the quantized Mobilenet TensorFlow Lite model from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip)
- unzip and copy mobilenet_quant_v1_224.tflite to the assets directory:
`tensorflow/contrib/lite/java/demo/app/src/main/assets/`
- Either
- Download the quantized Mobilenet TensorFlow Lite model from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip)
- unzip and copy mobilenet_quant_v1_224.tflite to the assets directory:
`tensorflow/contrib/lite/java/demo/app/src/main/assets/`
- Or download the floating point Inception-v3 model from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_slim_2016_android_2017_11_10.zip)
- unzip and copy inceptionv3_non_slim_2015.tflite to the assets directory
- change the chosen classifier in [Camera2BasicFragment.java](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java) from
`classifier = new ImageClassifierQuantizedMobileNet(getActivity());`
to
`classifier = new ImageClassifierFloatInception(getActivity());`
- Build and run the demo app
## Building TensorFlow Lite and the demo app from source
@ -84,7 +91,7 @@ Currently, we only support building the Android demo app within a Python 2
environment (due to a Bazel bug).
### More about the demo
The demo is resizing each camera image frame to (224 width * 224 height) to match the quantized Mobilenet model being used. The resized image is converted into a ByteBuffer row by row of size 1 * 224 * 224 * 3 bytes, where 1 is the number of images in a batch 224 * 224 is the width and height of the image 3 bytes represents three colors of a pixel. This demo uses the TensorFlow Lite Java inference API for models which take a single input and provide a single output. This outputs a two-dimensional array, with the first dimension being the category index and the second dimension being the confidence of classification. The Mobilenet model has 1001 unique categories and the app sorts the probabilities of all the categories and displays the top three. The Mobilenet quantized model is bundled within the assets directory of the app.
The demo is resizing each camera image frame to (224 width * 224 height) to match the quantized Mobilenet model being used (299 * 299 for Inception-v3). The resized image is converted into a ByteBuffer row by row of size 1 * 224 * 224 * 3 bytes, where 1 is the number of images in a batch. 224 * 224 (299 * 299) is the width and height of the image. 3 bytes represent three colors of a pixel. This demo uses the TensorFlow Lite Java inference API for models which take a single input and provide a single output. This outputs a two-dimensional array, with the first dimension being the category index and the second dimension being the confidence of classification. Both models have 1001 unique categories and the app sorts the probabilities of all the categories and displays the top three. The model file must be downloaded and bundled within the assets directory of the app.
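A rough Python sketch of the sizes described above (the demo itself is Java, and the probability array here is a stand-in, not real model output): one batch of one 224x224 RGB image at one byte per channel for the quantized model, plus a top-3 selection over the 1001-category output.
```python
# Rough Python sketch (the demo itself is Java) of the sizes described above:
# one batch of one 224x224 RGB image at one byte per channel for the quantized
# model, plus a top-3 selection over the 1001-category output.
import numpy as np

batch, height, width, channels, bytes_per_channel = 1, 224, 224, 3, 1
buffer_size = batch * height * width * channels * bytes_per_channel
assert buffer_size == 150528  # bytes written into the input ByteBuffer

probs = np.random.rand(1001)
probs /= probs.sum()                  # stand-in for the model's output row
top3 = np.argsort(probs)[-3:][::-1]   # indices of the three most likely labels
```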
# iOS Demo App

File diff suppressed because it is too large

View File

@ -296,7 +296,8 @@ public class Camera2BasicFragment extends Fragment
public void onActivityCreated(Bundle savedInstanceState) {
super.onActivityCreated(savedInstanceState);
try {
classifier = new ImageClassifier(getActivity());
// create either a new ImageClassifierQuantizedMobileNet or an ImageClassifierFloatInception
classifier = new ImageClassifierQuantizedMobileNet(getActivity());
} catch (IOException e) {
Log.e(TAG, "Failed to initialize an image classifier.");
}
@ -658,8 +659,7 @@ public class Camera2BasicFragment extends Fragment
showToast("Uninitialized Classifier or invalid context.");
return;
}
Bitmap bitmap =
textureView.getBitmap(ImageClassifier.DIM_IMG_SIZE_X, ImageClassifier.DIM_IMG_SIZE_Y);
Bitmap bitmap = textureView.getBitmap(classifier.getImageSizeX(), classifier.getImageSizeY());
String textToShow = classifier.classifyFrame(bitmap);
bitmap.recycle();
showToast(textToShow);

View File

@ -37,17 +37,11 @@ import java.util.PriorityQueue;
import org.tensorflow.lite.Interpreter;
/** Classifies images with Tensorflow Lite. */
public class ImageClassifier {
public abstract class ImageClassifier {
/** Tag for the {@link Log}. */
private static final String TAG = "TfLiteCameraDemo";
/** Name of the model file stored in Assets. */
private static final String MODEL_PATH = "mobilenet_quant_v1_224.tflite";
/** Name of the label file stored in Assets. */
private static final String LABEL_PATH = "labels.txt";
/** Number of results to show in the UI. */
private static final int RESULTS_TO_SHOW = 3;
@ -56,23 +50,18 @@ public class ImageClassifier {
private static final int DIM_PIXEL_SIZE = 3;
static final int DIM_IMG_SIZE_X = 224;
static final int DIM_IMG_SIZE_Y = 224;
/* Preallocated buffers for storing image data in. */
private int[] intValues = new int[DIM_IMG_SIZE_X * DIM_IMG_SIZE_Y];
private int[] intValues = new int[getImageSizeX() * getImageSizeY()];
/** An instance of the driver class to run model inference with Tensorflow Lite. */
private Interpreter tflite;
protected Interpreter tflite;
/** Labels corresponding to the output of the vision model. */
private List<String> labelList;
/** A ByteBuffer to hold image data, to be fed into Tensorflow Lite as inputs. */
private ByteBuffer imgData = null;
protected ByteBuffer imgData = null;
/** An array to hold inference results, to be fed into Tensorflow Lite as outputs. */
private byte[][] labelProbArray = null;
/** multi-stage low pass filter * */
private float[][] filterLabelProbArray = null;
@ -95,10 +84,13 @@ public class ImageClassifier {
labelList = loadLabelList(activity);
imgData =
ByteBuffer.allocateDirect(
DIM_BATCH_SIZE * DIM_IMG_SIZE_X * DIM_IMG_SIZE_Y * DIM_PIXEL_SIZE);
DIM_BATCH_SIZE
* getImageSizeX()
* getImageSizeY()
* DIM_PIXEL_SIZE
* getNumBytesPerChannel());
imgData.order(ByteOrder.nativeOrder());
labelProbArray = new byte[1][labelList.size()];
filterLabelProbArray = new float[FILTER_STAGES][labelList.size()];
filterLabelProbArray = new float[FILTER_STAGES][getNumLabels()];
Log.d(TAG, "Created a Tensorflow Lite Image Classifier.");
}
@ -111,7 +103,7 @@ public class ImageClassifier {
convertBitmapToByteBuffer(bitmap);
// Here's where the magic happens!!!
long startTime = SystemClock.uptimeMillis();
tflite.run(imgData, labelProbArray);
runInference();
long endTime = SystemClock.uptimeMillis();
Log.d(TAG, "Timecost to run model inference: " + Long.toString(endTime - startTime));
@ -125,12 +117,12 @@ public class ImageClassifier {
}
void applyFilter() {
int numLabels = labelList.size();
int numLabels = getNumLabels();
// Low pass filter `labelProbArray` into the first stage of the filter.
for (int j = 0; j < numLabels; ++j) {
filterLabelProbArray[0][j] +=
FILTER_FACTOR * (labelProbArray[0][j] - filterLabelProbArray[0][j]);
FILTER_FACTOR * (getProbability(j) - filterLabelProbArray[0][j]);
}
// Low pass filter each stage into the next.
for (int i = 1; i < FILTER_STAGES; ++i) {
@ -142,7 +134,7 @@ public class ImageClassifier {
// Copy the last stage filter output back to `labelProbArray`.
for (int j = 0; j < numLabels; ++j) {
labelProbArray[0][j] = (byte)filterLabelProbArray[FILTER_STAGES - 1][j];
setProbability(j, filterLabelProbArray[FILTER_STAGES - 1][j]);
}
}
@ -156,7 +148,7 @@ public class ImageClassifier {
private List<String> loadLabelList(Activity activity) throws IOException {
List<String> labelList = new ArrayList<String>();
BufferedReader reader =
new BufferedReader(new InputStreamReader(activity.getAssets().open(LABEL_PATH)));
new BufferedReader(new InputStreamReader(activity.getAssets().open(getLabelPath())));
String line;
while ((line = reader.readLine()) != null) {
labelList.add(line);
@ -167,7 +159,7 @@ public class ImageClassifier {
/** Memory-map the model file in Assets. */
private MappedByteBuffer loadModelFile(Activity activity) throws IOException {
AssetFileDescriptor fileDescriptor = activity.getAssets().openFd(MODEL_PATH);
AssetFileDescriptor fileDescriptor = activity.getAssets().openFd(getModelPath());
FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
FileChannel fileChannel = inputStream.getChannel();
long startOffset = fileDescriptor.getStartOffset();
@ -185,12 +177,10 @@ public class ImageClassifier {
// Convert the image to floating point.
int pixel = 0;
long startTime = SystemClock.uptimeMillis();
for (int i = 0; i < DIM_IMG_SIZE_X; ++i) {
for (int j = 0; j < DIM_IMG_SIZE_Y; ++j) {
for (int i = 0; i < getImageSizeX(); ++i) {
for (int j = 0; j < getImageSizeY(); ++j) {
final int val = intValues[pixel++];
imgData.put((byte) ((val >> 16) & 0xFF));
imgData.put((byte) ((val >> 8) & 0xFF));
imgData.put((byte) (val & 0xFF));
addPixelValue(val);
}
}
long endTime = SystemClock.uptimeMillis();
@ -199,9 +189,9 @@ public class ImageClassifier {
/** Prints top-K labels, to be shown in UI as the results. */
private String printTopKLabels() {
for (int i = 0; i < labelList.size(); ++i) {
for (int i = 0; i < getNumLabels(); ++i) {
sortedLabels.add(
new AbstractMap.SimpleEntry<>(labelList.get(i), (labelProbArray[0][i] & 0xff) / 255.0f));
new AbstractMap.SimpleEntry<>(labelList.get(i), getNormalizedProbability(i)));
if (sortedLabels.size() > RESULTS_TO_SHOW) {
sortedLabels.poll();
}
@ -214,4 +204,89 @@ public class ImageClassifier {
}
return textToShow;
}
/**
* Get the name of the model file stored in Assets.
*
* @return
*/
protected abstract String getModelPath();
/**
* Get the name of the label file stored in Assets.
*
* @return
*/
protected abstract String getLabelPath();
/**
* Get the image size along the x axis.
*
* @return
*/
protected abstract int getImageSizeX();
/**
* Get the image size along the y axis.
*
* @return
*/
protected abstract int getImageSizeY();
/**
* Get the number of bytes that is used to store a single color channel value.
*
* @return
*/
protected abstract int getNumBytesPerChannel();
/**
* Add pixelValue to byteBuffer.
*
* @param pixelValue
*/
protected abstract void addPixelValue(int pixelValue);
/**
* Read the probability value for the specified label. This is either the original value as it was
* read from the net's output or the updated value after the filter was applied.
*
* @param labelIndex
* @return
*/
protected abstract float getProbability(int labelIndex);
/**
* Set the probability value for the specified label.
*
* @param labelIndex
* @param value
*/
protected abstract void setProbability(int labelIndex, Number value);
/**
* Get the normalized probability value for the specified label. This is the final value as it
* will be shown to the user.
*
* @return
*/
protected abstract float getNormalizedProbability(int labelIndex);
/**
* Run inference using the prepared input in {@link #imgData}. Afterwards, the result will be
* provided by getProbability().
*
* <p>This additional method is necessary, because we don't have a common base for different
* primitive data types.
*/
protected abstract void runInference();
/**
* Get the total number of labels.
*
* @return
*/
protected int getNumLabels() {
return labelList.size();
}
}

View File

@ -0,0 +1,103 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
package com.example.android.tflitecamerademo;
import android.app.Activity;
import java.io.IOException;
/**
* This classifier works with the Inception-v3 slim model. It applies floating point inference
* rather than using a quantized model.
*/
public class ImageClassifierFloatInception extends ImageClassifier {
/** The inception net requires additional normalization of the used input. */
private static final int IMAGE_MEAN = 128;
private static final float IMAGE_STD = 128.0f;
/**
* An array to hold inference results, to be fed into Tensorflow Lite as outputs. This isn't part
* of the super class, because we need a primitive array here.
*/
private float[][] labelProbArray = null;
/**
* Initializes an {@code ImageClassifier}.
*
* @param activity
*/
ImageClassifierFloatInception(Activity activity) throws IOException {
super(activity);
labelProbArray = new float[1][getNumLabels()];
}
@Override
protected String getModelPath() {
// you can download this file from
// https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_slim_2016_android_2017_11_10.zip
return "inceptionv3_slim_2016.tflite";
}
@Override
protected String getLabelPath() {
return "labels_imagenet_slim.txt";
}
@Override
protected int getImageSizeX() {
return 299;
}
@Override
protected int getImageSizeY() {
return 299;
}
@Override
protected int getNumBytesPerChannel() {
// a 32bit float value requires 4 bytes
return 4;
}
@Override
protected void addPixelValue(int pixelValue) {
imgData.putFloat((((pixelValue >> 16) & 0xFF) - IMAGE_MEAN) / IMAGE_STD);
imgData.putFloat((((pixelValue >> 8) & 0xFF) - IMAGE_MEAN) / IMAGE_STD);
imgData.putFloat(((pixelValue & 0xFF) - IMAGE_MEAN) / IMAGE_STD);
}
@Override
protected float getProbability(int labelIndex) {
return labelProbArray[0][labelIndex];
}
@Override
protected void setProbability(int labelIndex, Number value) {
labelProbArray[0][labelIndex] = value.floatValue();
}
@Override
protected float getNormalizedProbability(int labelIndex) {
// TODO the following value isn't in [0,1] yet, but may be greater. Why?
return getProbability(labelIndex);
}
@Override
protected void runInference() {
tflite.run(imgData, labelProbArray);
}
}

View File

@ -0,0 +1,94 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
package com.example.android.tflitecamerademo;
import android.app.Activity;
import java.io.IOException;
/** This classifier works with the quantized MobileNet model. */
public class ImageClassifierQuantizedMobileNet extends ImageClassifier {
/**
* An array to hold inference results, to be fed into Tensorflow Lite as outputs. This isn't part
* of the super class, because we need a primitive array here.
*/
private byte[][] labelProbArray = null;
/**
* Initializes an {@code ImageClassifier}.
*
* @param activity
*/
ImageClassifierQuantizedMobileNet(Activity activity) throws IOException {
super(activity);
labelProbArray = new byte[1][getNumLabels()];
}
@Override
protected String getModelPath() {
// you can download this file from
// https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip
return "mobilenet_quant_v1_224.tflite";
}
@Override
protected String getLabelPath() {
return "labels_mobilenet_quant_v1_224.txt";
}
@Override
protected int getImageSizeX() {
return 224;
}
@Override
protected int getImageSizeY() {
return 224;
}
@Override
protected int getNumBytesPerChannel() {
// the quantized model uses a single byte only
return 1;
}
@Override
protected void addPixelValue(int pixelValue) {
imgData.put((byte) ((pixelValue >> 16) & 0xFF));
imgData.put((byte) ((pixelValue >> 8) & 0xFF));
imgData.put((byte) (pixelValue & 0xFF));
}
@Override
protected float getProbability(int labelIndex) {
return labelProbArray[0][labelIndex];
}
@Override
protected void setProbability(int labelIndex, Number value) {
labelProbArray[0][labelIndex] = value.byteValue();
}
@Override
protected float getNormalizedProbability(int labelIndex) {
return (labelProbArray[0][labelIndex] & 0xff) / 255.0f;
}
@Override
protected void runInference() {
tflite.run(imgData, labelProbArray);
}
}

View File

@ -50,16 +50,12 @@ def pairwise_distance(feature, squared=False):
pairwise_distances: 2-D Tensor of size [number of data, number of data].
"""
pairwise_distances_squared = math_ops.add(
math_ops.reduce_sum(math_ops.square(feature), axis=[1], keepdims=True),
math_ops.reduce_sum(
math_ops.square(feature),
axis=[1],
keep_dims=True),
math_ops.reduce_sum(
math_ops.square(
array_ops.transpose(feature)),
math_ops.square(array_ops.transpose(feature)),
axis=[0],
keep_dims=True)) - 2.0 * math_ops.matmul(
feature, array_ops.transpose(feature))
keepdims=True)) - 2.0 * math_ops.matmul(feature,
array_ops.transpose(feature))
# Deal with numerical inaccuracies. Set small negatives to zero.
pairwise_distances_squared = math_ops.maximum(pairwise_distances_squared, 0.0)
@ -132,10 +128,10 @@ def masked_maximum(data, mask, dim=1):
masked_maximums: N-D `Tensor`.
The maximized dimension is of size 1 after the operation.
"""
axis_minimums = math_ops.reduce_min(data, dim, keep_dims=True)
axis_minimums = math_ops.reduce_min(data, dim, keepdims=True)
masked_maximums = math_ops.reduce_max(
math_ops.multiply(
data - axis_minimums, mask), dim, keep_dims=True) + axis_minimums
math_ops.multiply(data - axis_minimums, mask), dim,
keepdims=True) + axis_minimums
return masked_maximums
@ -151,10 +147,10 @@ def masked_minimum(data, mask, dim=1):
masked_minimums: N-D `Tensor`.
The minimized dimension is of size 1 after the operation.
"""
axis_maximums = math_ops.reduce_max(data, dim, keep_dims=True)
axis_maximums = math_ops.reduce_max(data, dim, keepdims=True)
masked_minimums = math_ops.reduce_min(
math_ops.multiply(
data - axis_maximums, mask), dim, keep_dims=True) + axis_maximums
math_ops.multiply(data - axis_maximums, mask), dim,
keepdims=True) + axis_maximums
return masked_minimums
@ -202,8 +198,7 @@ def triplet_semihard_loss(labels, embeddings, margin=1.0):
mask_final = array_ops.reshape(
math_ops.greater(
math_ops.reduce_sum(
math_ops.cast(
mask, dtype=dtypes.float32), 1, keep_dims=True),
math_ops.cast(mask, dtype=dtypes.float32), 1, keepdims=True),
0.0), [batch_size, batch_size])
mask_final = array_ops.transpose(mask_final)
@ -290,7 +285,7 @@ def npairs_loss(labels, embeddings_anchor, embeddings_positive,
labels_remapped = math_ops.to_float(
math_ops.equal(labels, array_ops.transpose(labels)))
labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keep_dims=True)
labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keepdims=True)
# Add the softmax loss.
xent_loss = nn.softmax_cross_entropy_with_logits(
@ -395,7 +390,7 @@ def npairs_loss_multilabel(sparse_labels, embeddings_anchor,
multilabel_adjacency_matrix = _build_multilabel_adjacency(sparse_labels)
labels_remapped = math_ops.to_float(multilabel_adjacency_matrix)
labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keep_dims=True)
labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keepdims=True)
# Add the softmax loss.
xent_loss = nn.softmax_cross_entropy_with_logits(
@ -448,10 +443,10 @@ def lifted_struct_loss(labels, embeddings, margin=1.0):
# Safe maximum: Temporarily shift negative distances
# above zero before taking max.
# this is to take the max only among negatives.
row_minimums = math_ops.reduce_min(diff, 1, keep_dims=True)
row_minimums = math_ops.reduce_min(diff, 1, keepdims=True)
row_negative_maximums = math_ops.reduce_max(
math_ops.multiply(
diff - row_minimums, mask), 1, keep_dims=True) + row_minimums
math_ops.multiply(diff - row_minimums, mask), 1,
keepdims=True) + row_minimums
# Compute the loss.
# Keep track of matrix of maximums where M_ij = max(m_i, m_j)
@ -467,10 +462,11 @@ def lifted_struct_loss(labels, embeddings, margin=1.0):
array_ops.transpose(max_elements), [-1, 1])
loss_exp_left = array_ops.reshape(
math_ops.reduce_sum(math_ops.multiply(
math_ops.exp(
diff_tiled - max_elements_vect),
mask_tiled), 1, keep_dims=True), [batch_size, batch_size])
math_ops.reduce_sum(
math_ops.multiply(
math_ops.exp(diff_tiled - max_elements_vect), mask_tiled),
1,
keepdims=True), [batch_size, batch_size])
loss_mat = max_elements + math_ops.log(
loss_exp_left + array_ops.transpose(loss_exp_left))
@ -686,7 +682,7 @@ def _find_loss_augmented_facility_idx(pairwise_distances, labels, chosen_ids,
array_ops.reshape(pairwise_distances_candidate, [1, -1])
], 0),
axis=0,
keep_dims=True), [num_candidates, -1]),
keepdims=True), [num_candidates, -1]),
axis=1)
nmi_scores = array_ops.zeros([num_candidates])

View File

@ -130,6 +130,105 @@ adb shell '/data/local/tmp/benchmark \
For more details, see the [benchmark documentation](../../tools/benchmark).
## CUDA support for Tegra devices running Android (Nvidia Shield TV, etc)
With the release of TF 1.6 and JetPack for Android 3.2 (currently pending), you can build a version of TensorFlow for compatible devices that receives the full benefits of GPU acceleration by following the instructions below.
#### Environment setup:
First, download and install JetPack for Android version 3.2 or greater from [Nvidia](https://developers.nvidia.com). Note that as of the TF 1.6 release the JetPack for Android 3.2 release is still pending, and regular JetPack for L4T will not work.
```bash
git clone https://github.com/tensorflow/tensorflow.git
cd tensorflow
JETPACK=$HOME/JetPack_Android_3.2
TEGRA_LIBS="$JETPACK/cuDNN/aarch64/cuda/lib64/libcudnn.so $JETPACK/cuda-9.0/extras/CUPTI/lib64/libcupti.so $JETPACK/cuda/targets/aarch64-linux-androideabi/lib64/libcufft.so"
```
#### Building all CUDA-enabled native binaries:
This will build CUDA-enabled versions of libtensorflow_inference.so and the benchmark binary. (libtensorflow_demo.so will also be built incidentally, but it does not support CUDA)
```bash
NDK_ROOT=$JETPACK/android-ndk-r13b
CC_PREFIX=ccache tensorflow/contrib/makefile/build_all_android.sh -s tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in -t "libtensorflow_inference.so libtensorflow_demo.so all" -a tegra
```
(add -T on subsequent builds to skip protobuf downloading/building)
#### Testing the CUDA-enabled benchmark via adb:
Build binaries first as above, then run:
```bash
adb shell mkdir -p /data/local/tmp/lib64
adb push $TEGRA_LIBS /data/local/tmp/lib64
adb push tensorflow/contrib/makefile/gen/bin/android_arm64-v8a/benchmark /data/local/tmp
wget https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk
unzip tensorflow_demo.apk -d /tmp/tensorflow_demo
adb push /tmp/tensorflow_demo/assets/*.pb /data/local/tmp
adb shell "LD_LIBRARY_PATH=/data/local/tmp/lib64 /data/local/tmp/benchmark --graph=/data/local/tmp/tensorflow_inception_graph.pb"
```
#### Building the CUDA-enabled TensorFlow AAR with Bazel:
Build the native binaries first as above. Then, build the aar and package the native libs by executing the following:
```bash
mkdir -p /tmp/tf/jni/arm64-v8a
cp tensorflow/contrib/makefile/gen/lib/android_tegra/libtensorflow_*.so /tmp/tf/jni/arm64-v8a/
cp $TEGRA_LIBS /tmp/tf/jni/arm64-v8a
bazel build //tensorflow/contrib/android:android_tensorflow_inference_java.aar
cp bazel-bin/tensorflow/contrib/android/android_tensorflow_inference_java.aar /tmp/tf/tensorflow.aar
cd /tmp/tf
chmod +w tensorflow.aar
zip -ur tensorflow.aar $(find jni -name *.so)
```
#### Building the CUDA-enabled TensorFlow Android demo with Bazel:
Build binaries first as above, then edit tensorflow/examples/android/BUILD and replace:
```
srcs = [
":libtensorflow_demo.so",
"//tensorflow/contrib/android:libtensorflow_inference.so",
],
```
with:
```
srcs = glob(["libs/arm64-v8a/*.so"]),
```
Then run:
```bash
# Create dir for native libs
mkdir -p tensorflow/examples/android/libs/arm64-v8a
# Copy JetPack libs
cp $TEGRA_LIBS tensorflow/examples/android/libs/arm64-v8a
# Copy native TensorFlow libraries
cp tensorflow/contrib/makefile/gen/lib/android_arm64-v8a/libtensorflow_*.so tensorflow/examples/android/libs/arm64-v8a/
# Build APK
bazel build -c opt --fat_apk_cpu=arm64-v8a tensorflow/examples/android:tensorflow_demo
# Install
adb install -r -f bazel-bin/tensorflow/examples/android/tensorflow_demo.apk
```
#### Building the CUDA-enabled Android demo with gradle/Android Studio:
Add tensorflow/examples/android as an Android project in Android Studio as normal.
Edit build.gradle and:
* set nativeBuildSystem = 'makefile'
* set cpuType = 'arm64-v8a'
* in "buildNativeMake", replace cpuType with 'tegra' (optional speedups like -T and ccache also work)
* set the environment "NDK_ROOT" var to $JETPACK/android-ndk-r13b
Click "build apk" to build.
Install:
```bash
adb install -r -f tensorflow/examples/android/gradleBuild/outputs/apk/debug/android-debug.apk
```
## iOS
_Note: To use this library in an iOS application, see related instructions in

View File

@ -36,7 +36,7 @@ while getopts "bc:Eps" opt_name; do
b) BUILD_ONLY="true";;
c) TEST_COUNT="${OPTARG}";;
E) ENABLE_EXPERIMENTAL_HEXNN_OPS="true";;
p) USE_PREBUILT_HEXAOGON_BINARIES="true";;
p) USE_PREBUILT_HEXAGON_BINARIES="true";;
s) SKIP_DOWNLOAD_IF_EXIST="true";;
*) usage;;
esac
@ -49,7 +49,7 @@ if [[ -z "${NDK_ROOT}" ]]; then
exit 1
fi
if [[ "${USE_PREBUILT_HEXAOGON_BINARIES}" != "true" &&
if [[ "${USE_PREBUILT_HEXAGON_BINARIES}" != "true" &&
-z "${QUALCOMM_SDK}" ]]; then
echo "QUALCOMM_SDK is empty" 1>&2
usage
@ -84,7 +84,7 @@ rm -rf "${GEN_DIR}"
mkdir -p "${GEN_LIBS_DIR}"
mkdir -p "${GEN_DOWNLOAD_DIR}"
if [[ "${USE_PREBUILT_HEXAOGON_BINARIES}" == "true" ]]; then
if [[ "${USE_PREBUILT_HEXAGON_BINARIES}" == "true" ]]; then
echo "Download prebuilt hexagon binaries"
if [[ "${BUILD_ONLY}" != "true" ]]; then
CONTROLLER_PUSH_DEST="/data/local/tmp"

View File

@ -572,9 +572,8 @@ class LSTMBlockWrapper(base_layer.Layer):
def _gather_states(self, data, indices, batch_size):
"""Produce `out`, s.t. out(i, j) = data(indices(i), i, j)."""
mod_indices = indices * batch_size + math_ops.range(batch_size)
return array_ops.gather(
array_ops.reshape(data, [-1, self.num_units]), mod_indices)
return array_ops.gather_nd(
data, array_ops.stack([indices, math_ops.range(batch_size)], axis=1))
class LSTMBlockFusedCell(LSTMBlockWrapper):
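The change above swaps a reshape-plus-linear-offset `gather` for a direct `gather_nd`. A hedged sketch of the indexing it performs, assuming only stock `tf.gather_nd`: stacking `[indices, range(batch)]` selects `out[i, :] = data[indices[i], i, :]`, matching the docstring.
```python
# Hedged illustration of the change above: gather_nd over stacked
# [indices, range(batch)] indices selects out[i, :] = data[indices[i], i, :]
# directly, instead of flattening `data` and computing linear offsets.
import tensorflow as tf

data = tf.reshape(tf.range(2 * 3 * 4, dtype=tf.float32), [2, 3, 4])  # [time, batch, units]
indices = tf.constant([1, 0, 1])  # per-example time step to gather
batch_size = 3

gathered = tf.gather_nd(
    data, tf.stack([indices, tf.range(batch_size)], axis=1))  # [batch, units]

with tf.Session() as sess:
    print(sess.run(gathered))  # rows: data[1, 0, :], data[0, 1, :], data[1, 2, :]
```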

View File

@ -424,8 +424,9 @@ class TimeFreqLSTMCell(rnn_cell_impl.RNNCell):
"W_O_diag", shape=[self._num_units], dtype=dtype)
# initialize the first freq state to be zero
m_prev_freq = array_ops.zeros([int(inputs.get_shape()[0]), self._num_units],
dtype)
m_prev_freq = array_ops.zeros(
[inputs.shape[0].value or inputs.get_shape()[0], self._num_units],
dtype)
for fq in range(len(freq_inputs)):
c_prev = array_ops.slice(state, [0, 2 * fq * self._num_units],
[-1, self._num_units])

View File

@ -80,12 +80,12 @@ class GatherTreeOp : public OpKernel {
max_sequence_lengths.shape().DebugString()));
Tensor* beams;
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, step_ids_shape, &beams));
typename TTypes<T, 3>::ConstTensor step_ids_t = step_ids.tensor<T, 3>();
typename TTypes<T, 3>::ConstTensor parent_ids_t = parent_ids.tensor<T, 3>();
typename TTypes<T, 3>::ConstTensor step_ids_t(step_ids.tensor<T, 3>());
typename TTypes<T, 3>::ConstTensor parent_ids_t(parent_ids.tensor<T, 3>());
typename TTypes<int32>::ConstVec max_seq_lens_t =
max_sequence_lengths.vec<int32>();
typename TTypes<T>::ConstScalar end_token_t = end_token.scalar<T>();
typename TTypes<T, 3>::Tensor beams_t = beams->tensor<T, 3>();
typename TTypes<T>::ConstScalar end_token_t(end_token.scalar<T>());
typename TTypes<T, 3>::Tensor beams_t(beams->tensor<T, 3>());
const T end_token_value = end_token_t();
functor::GatherTree<Device, T>()(ctx, device, step_ids_t, parent_ids_t,
max_seq_lens_t, end_token_value, beams_t);

View File

@ -144,7 +144,7 @@ def inverse_stft_window_fn(frame_step,
overlaps = -(-frame_length // frame_step) # Ceiling division.
denom = array_ops.pad(denom, [(0, overlaps * frame_step - frame_length)])
denom = array_ops.reshape(denom, [overlaps, frame_step])
denom = math_ops.reduce_sum(denom, 0, keep_dims=True)
denom = math_ops.reduce_sum(denom, 0, keepdims=True)
denom = array_ops.tile(denom, [overlaps, 1])
denom = array_ops.reshape(denom, [overlaps * frame_step])

View File

@ -29,6 +29,7 @@ from tensorflow.contrib.framework.python.ops import variables as variables_lib
from tensorflow.contrib.metrics.python.ops import metric_ops
from tensorflow.contrib.slim.python.slim import evaluation
from tensorflow.contrib.training.python.training import evaluation as evaluation_lib
from tensorflow.core.protobuf import saver_pb2
from tensorflow.python.debug.lib import debug_data
from tensorflow.python.debug.wrappers import hooks
from tensorflow.python.framework import constant_op
@ -235,7 +236,7 @@ class SingleEvaluationTest(test.TestCase):
def _prepareCheckpoint(self, checkpoint_path):
init_op = control_flow_ops.group(variables.global_variables_initializer(),
variables.local_variables_initializer())
saver = saver_lib.Saver()
saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V1)
with self.test_session() as sess:
sess.run(init_op)
saver.save(sess, checkpoint_path)

View File

@ -497,6 +497,7 @@ py_library(
":tensor_forest_v4_ops_py",
"//tensorflow/contrib/decision_trees/proto:generic_tree_model_py",
"//tensorflow/contrib/framework:framework_py",
"//tensorflow/contrib/tensor_forest/proto:fertile_stats_proto_py",
"//tensorflow/contrib/tensor_forest/proto:tensor_forest_params_proto_py",
"//tensorflow/python:array_ops",
"//tensorflow/python:control_flow_ops",

View File

@ -1,5 +1,6 @@
# Description:
# Wrap NVIDIA TensorRT (http://developer.nvidia.com/tensorrt) with tensorflow.
# Wrap NVIDIA TensorRT (http://developer.nvidia.com/tensorrt) with tensorflow
# and provide TensorRT operators and converter package.
# APIs are meant to change over time.
package(default_visibility = ["//tensorflow:__subpackages__"])
@ -8,7 +9,19 @@ licenses(["notice"]) # Apache 2.0
exports_files(["LICENSE"])
load(
"//tensorflow:tensorflow.bzl",
"tf_cc_test",
"tf_copts",
"tf_cuda_library",
"tf_custom_op_library",
"tf_custom_op_library_additional_deps",
"tf_gen_op_libs",
"tf_gen_op_wrapper_py",
)
load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
load(
"@local_config_tensorrt//:build_defs.bzl",
"if_tensorrt",
@ -32,6 +45,195 @@ tf_cuda_cc_test(
]),
)
tf_custom_op_library(
name = "python/ops/_trt_engine_op.so",
srcs = ["ops/trt_engine_op.cc"],
deps = [
":trt_engine_op_kernel",
":trt_shape_function",
"//tensorflow/core:lib_proto_parsing",
] + if_tensorrt([
"@local_config_tensorrt//:nv_infer",
]),
)
tf_cuda_library(
name = "trt_shape_function",
srcs = ["shape_fn/trt_shfn.cc"],
hdrs = ["shape_fn/trt_shfn.h"],
visibility = ["//visibility:public"],
deps = [
":trt_logging",
] + if_tensorrt([
"@local_config_tensorrt//:nv_infer",
]) + tf_custom_op_library_additional_deps(),
)
cc_library(
name = "trt_engine_op_kernel",
srcs = ["kernels/trt_engine_op.cc"],
hdrs = ["kernels/trt_engine_op.h"],
copts = tf_copts(),
deps = [
":trt_logging",
"//tensorflow/core:gpu_headers_lib",
"//tensorflow/core:lib_proto_parsing",
"//tensorflow/core:stream_executor_headers_lib",
] + if_tensorrt([
"@local_config_tensorrt//:nv_infer",
]) + tf_custom_op_library_additional_deps(),
# TODO(laigd)
alwayslink = 1, # buildozer: disable=alwayslink-with-hdrs
)
tf_gen_op_libs(
op_lib_names = ["trt_engine_op"],
deps = if_tensorrt([
"@local_config_tensorrt//:nv_infer",
]),
)
tf_cuda_library(
name = "trt_logging",
srcs = ["log/trt_logger.cc"],
hdrs = ["log/trt_logger.h"],
visibility = ["//visibility:public"],
deps = [
"//tensorflow/core:lib_proto_parsing",
] + if_tensorrt([
"@local_config_tensorrt//:nv_infer",
]),
)
tf_gen_op_wrapper_py(
name = "trt_engine_op",
deps = [
":trt_engine_op_op_lib",
":trt_logging",
":trt_shape_function",
],
)
tf_custom_op_py_library(
name = "trt_engine_op_loader",
srcs = ["python/ops/trt_engine_op.py"],
dso = [
":python/ops/_trt_engine_op.so",
] + if_tensorrt([
"@local_config_tensorrt//:nv_infer",
]),
srcs_version = "PY2AND3",
deps = [
"//tensorflow/python:framework_for_generated_wrappers",
"//tensorflow/python:resources",
],
)
py_library(
name = "init_py",
srcs = [
"__init__.py",
"python/__init__.py",
],
srcs_version = "PY2AND3",
deps = [
":trt_convert_py",
":trt_ops_py",
],
)
py_library(
name = "trt_ops_py",
srcs_version = "PY2AND3",
deps = [
":trt_engine_op",
":trt_engine_op_loader",
],
)
py_library(
name = "trt_convert_py",
srcs = ["python/trt_convert.py"],
srcs_version = "PY2AND3",
deps = [
":wrap_conversion",
],
)
tf_py_wrap_cc(
name = "wrap_conversion",
srcs = ["trt_conversion.i"],
copts = tf_copts(),
deps = [
":trt_conversion",
"//tensorflow/core:framework_lite",
"//util/python:python_headers",
],
)
# Library for the node-level conversion portion of TensorRT operation creation
tf_cuda_library(
name = "trt_conversion",
srcs = [
"convert/convert_graph.cc",
"convert/convert_nodes.cc",
],
hdrs = [
"convert/convert_graph.h",
"convert/convert_nodes.h",
],
deps = [
":segment",
":trt_logging",
"//tensorflow/core/grappler:grappler_item",
"//tensorflow/core/grappler:utils",
"//tensorflow/core:framework",
"//tensorflow/core:framework_lite",
"//tensorflow/core:graph",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core/grappler:devices",
"//tensorflow/core/grappler/clusters:virtual_cluster",
"//tensorflow/core/grappler/costs:graph_properties",
"//tensorflow/core/grappler/optimizers:constant_folding",
"//tensorflow/core/grappler/optimizers:layout_optimizer",
] + if_tensorrt([
"@local_config_tensorrt//:nv_infer",
]) + tf_custom_op_library_additional_deps(),
)
# Library for the segmenting portion of TensorRT operation creation
cc_library(
name = "segment",
srcs = ["segment/segment.cc"],
hdrs = [
"segment/segment.h",
"segment/union_find.h",
],
linkstatic = 1,
deps = [
"//tensorflow/core:graph",
"//tensorflow/core:lib_proto_parsing",
"//tensorflow/core:protos_all_cc",
"@protobuf_archive//:protobuf_headers",
],
)
tf_cc_test(
name = "segment_test",
size = "small",
srcs = ["segment/segment_test.cc"],
deps = [
":segment",
"//tensorflow/c:c_api",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
],
)
filegroup(
name = "all_files",
srcs = glob(

View File

@ -0,0 +1,40 @@
Using TensorRT in TensorFlow
============================
This module provides the necessary bindings and introduces the TRT_engine_op
operator, which wraps a subgraph in TensorRT.
Compilation
-----------
To compile the module, you need a local TensorRT installation (libnvinfer.so
and the respective include files). During the configuration step, TensorRT
should be enabled and its installation path should be set. If TensorRT was
installed through a package manager (deb, rpm), the configure script should
find the necessary components from the system automatically. If it was
installed from a tar package, the path to the install location has to be set
during configuration.
```
bazel build --config=cuda --config=opt //tensorflow/tools/pip_package:build_pip_package
bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/
```
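The second command above writes a pip wheel into `/tmp/`. As a minimal sketch
(the exact wheel filename depends on the TensorFlow version and the Python
environment), the freshly built package can then be installed with pip:
```
pip install /tmp/tensorflow-*.whl
```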
After the TensorFlow package is installed, the TensorRT transformation
will be available. An example use is shown below.
```python
import tensorflow as tf
import tensorflow.contrib.tensorrt as trt
# ... create and train or load model
gdef = sess.graph.as_graph_def()
trt_gdef = trt.create_inference_graph(
    gdef,                      # original graph_def
    ["output"],                # name of output node(s)
    max_batch_size,            # maximum batch size to run the inference
    max_workspace_size_bytes)  # max memory for TensorRT to use
tf.reset_default_graph()
tf.import_graph_def(graph_def=trt_gdef)
# ... run inference
```
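As a minimal sketch of running inference on the converted graph (the node
names `input` and `output`, the feed shape, and the zero-filled input are
assumptions for illustration only; importing `tensorflow.contrib.tensorrt`
is what loads the kernel for `TRTEngineOp`):
```python
import numpy as np
import tensorflow as tf
import tensorflow.contrib.tensorrt as trt  # noqa: F401, registers TRTEngineOp

# Assumes trt_gdef was produced by trt.create_inference_graph() as above.
with tf.Graph().as_default() as g:
  tf.import_graph_def(graph_def=trt_gdef, name="")
  input_t = g.get_tensor_by_name("input:0")    # hypothetical input node name
  output_t = g.get_tensor_by_name("output:0")  # output node passed to the converter
  with tf.Session(graph=g) as sess:
    result = sess.run(output_t,
                      feed_dict={input_t: np.zeros((1, 224, 224, 3), np.float32)})
```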

View File

@ -0,0 +1,23 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
"""Exposes the python wrapper for TensorRT graph transforms."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# pylint: disable=unused-import,wildcard-import
from tensorflow.contrib.tensorrt.python import *
# pylint: enable=unused-import,wildcard-import

View File

@ -0,0 +1,273 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
#include <map>
#include <set>
#include <unordered_map>
#include <utility>
#include <vector>
#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
#include "tensorflow/contrib/tensorrt/segment/segment.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/graph_constructor.h"
#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
#include "tensorflow/core/grappler/costs/graph_properties.h"
#include "tensorflow/core/grappler/devices.h"
#include "tensorflow/core/grappler/grappler_item.h"
#include "tensorflow/core/grappler/optimizers/constant_folding.h"
#include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
#include "tensorflow/core/grappler/utils.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/protobuf/device_properties.pb.h" // NOLINT
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
#include "tensorrt/include/NvInfer.h"
namespace tensorflow {
namespace tensorrt {
namespace convert {
namespace {
static bool IsTensorRTCandidate(const tensorflow::NodeDef& node_def) {
// LINT.IfChange
  // TODO(jie): Segmentation shouldn't be associated with the op name.
// Split it into a registration for each kernel.
static const std::set<string> candidate_ops = {
"Identity", "Const", "Conv2D", "MaxPool", "BiasAdd", "Relu",
"Add", "Mul", "Sub", "Rsqrt", "Pad" // "Placeholder" ,"Mean"
};
// LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.h)
return candidate_ops.count(node_def.op());
}
void GetSubGraphIncomingEdges(const tensorflow::Graph& graph,
const std::set<int>& subgraph_node_ids,
tensorflow::EdgeSet* incoming_edges) {
for (int node_id : subgraph_node_ids) {
const tensorflow::Node* node = graph.FindNodeId(node_id);
for (const tensorflow::Edge* edge : node->in_edges()) {
if (!subgraph_node_ids.count(edge->src()->id()) &&
!edge->src()->IsSource()) {
incoming_edges->insert(edge);
}
}
}
}
void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph,
const std::set<int>& subgraph_node_ids,
tensorflow::EdgeSet* outgoing_edges) {
for (int node_id : subgraph_node_ids) {
const tensorflow::Node* node = graph.FindNodeId(node_id);
for (const tensorflow::Edge* edge : node->out_edges()) {
if (!subgraph_node_ids.count(edge->dst()->id()) &&
!edge->dst()->IsSink()) {
outgoing_edges->insert(edge);
}
}
}
}
std::pair<string, int> ParseTensorName(string name, int default_idx = 0) {
int idx = default_idx;
size_t sep = name.find_last_of(':');
if (sep != string::npos) {
    // Parse the output index before truncating the node name.
    idx = std::stoi(name.substr(sep + 1));
    name = name.substr(0, sep);
}
return std::make_pair(name, idx);
}
std::unordered_map<string, std::vector<int>> BuildTensorNameMap(
const std::vector<string>& tensor_names) {
std::unordered_map<string, std::vector<int>> result;
for (string const& tensor_name : tensor_names) {
string node_name;
int index;
std::tie(node_name, index) = ParseTensorName(tensor_name);
result[node_name].push_back(index);
}
return result;
}
tensorflow::Status ConvertSubGraphToTensorRT(
const std::vector<string>& output_names,
const std::set<int>& subgraph_node_ids,
size_t max_batch_size, // Max batch size that engine will be created for
// Max amount of memory that engine will be allowed to consume, in bytes
size_t max_workspace_size_bytes,
const tensorflow::grappler::GraphProperties& graph_properties,
tensorflow::Graph* graph) {
tensorflow::EdgeSet subgraph_incoming_edges;
GetSubGraphIncomingEdges(*graph, subgraph_node_ids, &subgraph_incoming_edges);
std::vector<std::pair<int, int>> subgraph_inputs;
// Collect inputs by looking for incoming edges
for (const tensorflow::Edge* edge : subgraph_incoming_edges) {
subgraph_inputs.push_back({edge->src()->id(), edge->src_output()});
}
std::set<std::pair<int, int>> subgraph_outputs_set;
// Collect outputs referenced from output_names
auto output_name_to_index_map = BuildTensorNameMap(output_names);
for (int node_id : subgraph_node_ids) {
tensorflow::Node* node = graph->FindNodeId(node_id);
if (output_name_to_index_map.count(node->name())) {
for (int index : output_name_to_index_map.at(node->name())) {
subgraph_outputs_set.insert({node_id, index});
}
}
}
// Collect outputs referenced from outgoing edges
tensorflow::EdgeSet subgraph_outgoing_edges;
GetSubGraphOutgoingEdges(*graph, subgraph_node_ids, &subgraph_outgoing_edges);
for (const tensorflow::Edge* edge : subgraph_outgoing_edges) {
subgraph_outputs_set.insert({edge->src()->id(), edge->src_output()});
}
// Impose an ordering on the outputs
std::vector<std::pair<int, int>> subgraph_outputs(
subgraph_outputs_set.begin(), subgraph_outputs_set.end());
// Build TensorRT node and add it to the graph
tensorflow::NodeDef trt_node_def;
TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRTNodeDef(
*graph, subgraph_node_ids, subgraph_inputs, subgraph_outputs,
max_batch_size, max_workspace_size_bytes, graph_properties,
&trt_node_def));
tensorflow::Status status;
tensorflow::Node* trt_node = graph->AddNode(trt_node_def, &status);
TF_RETURN_IF_ERROR(status);
// Re-map outgoing edges to use the new TRT node instead of the orig subgraph
std::map<std::pair<int, int>, int> subgraph_edge_to_output_map;
for (size_t i = 0; i < subgraph_outputs.size(); ++i) {
subgraph_edge_to_output_map.insert({subgraph_outputs.at(i), i});
}
TF_RETURN_IF_ERROR(status);
for (const tensorflow::Edge* edge : subgraph_outgoing_edges) {
std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()};
int new_src_output = subgraph_edge_to_output_map.at(old_src);
TF_RETURN_IF_ERROR(graph->UpdateEdge(trt_node, new_src_output, edge->dst(),
edge->dst_input()));
}
// Remove the original subgraph
for (int node_id : subgraph_node_ids) {
tensorflow::Node* node = graph->FindNodeId(node_id);
// Don't remove the input placeholders
if (node->type_string() == "Placeholder") {
continue;
}
graph->RemoveNode(node);
}
return tensorflow::Status::OK();
}
tensorflow::Status BuildNodeMap(
const tensorflow::Graph& graph,
std::unordered_map<string, tensorflow::Node*>* node_map) {
for (auto* node : graph.op_nodes()) {
if (!node_map->insert({node->name(), node}).second) {
return tensorflow::errors::AlreadyExists(
"Node name is not unique in graph: " + node->name());
}
}
return tensorflow::Status::OK();
}
} // namespace
tensorflow::Status ConvertGraphDefToTensorRT(
const tensorflow::GraphDef& graph_def,
const std::vector<string>& output_names, size_t max_batch_size,
size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def) {
// Optimization pass
tensorflow::grappler::GrapplerItem item;
item.fetch = output_names;
tensorflow::GraphDef gdef;
// Layout optimization
item.graph = graph_def;
tensorflow::grappler::LayoutOptimizer optimizer;
tensorflow::grappler::Cluster* cluster;
// Virtual cluster
tensorflow::DeviceProperties device_properties;
device_properties.set_type("GPU");
device_properties.mutable_environment()->insert({"architecture", "6"});
cluster =
new tensorflow::grappler::VirtualCluster({{"/GPU:0", device_properties}});
TF_RETURN_IF_ERROR(optimizer.Optimize(cluster, item, &gdef));
// Constant folding
item.graph = gdef;
tensorflow::grappler::ConstantFolding fold(nullptr);
TF_RETURN_IF_ERROR(fold.Optimize(nullptr, item, &gdef));
  // Static shape inference through grappler GraphProperties.
tensorflow::grappler::GraphProperties static_graph_properties(item);
TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(false));
// Build full graph
tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
gdef.library());
tensorflow::Graph graph(flib);
TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
tensorflow::GraphConstructorOptions(), gdef, &graph));
// Segment the graph into subgraphs that can be converted to TensorRT
tensorflow::tensorrt::segment::SegmentOptions segment_options;
// TODO(ben,jie,sami): exclude output nodes (DISCUSS IT)
for (auto node : output_names) {
segment_options.exclude_node_list.insert(node);
}
// TODO(sami): this should be passed as a knob!!!!
segment_options.minimum_segment_size = 2;
tensorflow::tensorrt::segment::SegmentNodesVector segments;
TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph(
gdef, IsTensorRTCandidate, segment_options, &segments));
if (segments.size() > 1) {
VLOG(0) << "MULTIPLE tensorrt candidate conversion: " << segments.size();
}
std::unordered_map<string, tensorflow::Node*> node_map;
TF_RETURN_IF_ERROR(BuildNodeMap(graph, &node_map));
for (const std::set<string>& subgraph_node_names : segments) {
std::set<int> subgraph_node_ids;
for (const string& node_name : subgraph_node_names) {
subgraph_node_ids.insert(node_map.at(node_name)->id());
}
TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRT(
output_names, subgraph_node_ids, max_batch_size,
max_workspace_size_bytes, static_graph_properties, &graph));
}
graph.ToGraphDef(new_graph_def);
return tensorflow::Status::OK();
}
} // namespace convert
} // namespace tensorrt
} // namespace tensorflow
#endif // GOOGLE_TENSORRT
#endif // GOOGLE_CUDA

View File

@ -0,0 +1,47 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_
#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_
#include <vector>
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/types.h"
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
namespace tensorflow {
namespace tensorrt {
namespace convert {
// max_batch_size: the maximum batch size that can be used for inference; the
//   optimization targets an inference run with this batch size.
// max_workspace_size_bytes: the upper bound of memory allowance for engine
//   building.
tensorflow::Status ConvertGraphDefToTensorRT(
const tensorflow::GraphDef& graph_def,
const std::vector<string>& output_names, size_t max_batch_size,
size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def);
} // namespace convert
} // namespace tensorrt
} // namespace tensorflow
#endif // GOOGLE_TENSORRT
#endif // GOOGLE_CUDA
#endif // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_

File diff suppressed because it is too large

View File

@ -0,0 +1,52 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_
#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_
#include <set>
#include <utility>
#include <vector>
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/grappler/costs/graph_properties.h"
#include "tensorflow/core/lib/core/status.h"
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
namespace tensorflow {
namespace tensorrt {
namespace convert {
tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
const tensorflow::Graph& graph, const std::set<int>& subgraph_node_ids,
const std::vector<std::pair<int, int>>&
input_inds, // {node_id, output_idx}
const std::vector<std::pair<int, int>>&
output_inds, // {node_id, output_idx}
size_t max_batch_size, size_t max_workspace_size_bytes,
const tensorflow::grappler::GraphProperties& graph_prop,
tensorflow::NodeDef* trt_node);
} // namespace convert
} // namespace tensorrt
} // namespace tensorflow
#endif // GOOGLE_TENSORRT
#endif // GOOGLE_CUDA
#endif // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_

View File

@ -0,0 +1,140 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h"
#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
#include "cuda/include/cuda_runtime_api.h"
namespace tensorflow {
namespace tensorrt {
static ::tensorflow::tensorrt::Logger logger;
TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) {
// read serialized_engine
string serialized_engine;
OP_REQUIRES_OK(context,
context->GetAttr("serialized_engine", &serialized_engine));
  // Register the input and output node names of the TRT subgraph.
OP_REQUIRES_OK(context, context->GetAttr("input_nodes", &input_nodes_));
OP_REQUIRES_OK(context, context->GetAttr("output_nodes", &output_nodes_));
  // TODO(samikama): The runtime should also be obtained from a resource
  // manager; only the engine should live in the op, while the context and
  // runtime should come from the resource manager.
nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(logger);
trt_engine_ptr_.reset(infer->deserializeCudaEngine(
serialized_engine.c_str(), serialized_engine.size(), nullptr));
trt_execution_context_ptr_.reset(trt_engine_ptr_->createExecutionContext());
// Runtime is safe to delete after engine creation
infer->destroy();
}
void TRTEngineOp::Compute(OpKernelContext* context) {
int num_binding = context->num_inputs() + context->num_outputs();
std::vector<void*> buffers(num_binding);
  int binding_index;  // getBindingIndex() returns -1 if the name is not found.
int num_batch = 0;
bool valid = true;
for (int i = 0; i < context->num_inputs(); i++) {
// Grab the input tensor
binding_index = trt_engine_ptr_->getBindingIndex(input_nodes_[i].c_str());
const Tensor& input_tensor = context->input(i);
const TensorShape& input_shape = input_tensor.shape();
if (i == 0) {
num_batch = input_shape.dim_size(0);
} else if (num_batch != input_shape.dim_size(0)) {
valid = false;
break;
}
switch (trt_engine_ptr_->getBindingDataType(binding_index)) {
case nvinfer1::DataType::kFLOAT:
buffers[binding_index] = (void*)(input_tensor.flat<float>().data());
break;
case nvinfer1::DataType::kHALF:
LOG(FATAL) << "half size is not supported yet!";
break;
case nvinfer1::DataType::kINT8:
LOG(FATAL) << "int8 is not supported yet!";
break;
}
}
// Might want a different way to inform the user of batch size inconsistency
if (!valid) LOG(WARNING) << "input data inconsistent batch size";
for (int i = 0; i < static_cast<int>(output_nodes_.size()); i++) {
    // It is unfortunate that we have to reallocate the output buffer on every run.
// Create an output tensor
binding_index = trt_engine_ptr_->getBindingIndex(output_nodes_[i].c_str());
Tensor* output_tensor = nullptr;
TensorShape output_shape;
if (binding_index != -1) {
auto dims = trt_engine_ptr_->getBindingDimensions(binding_index);
std::vector<int> trt_shape(dims.nbDims + 1);
trt_shape[0] = num_batch;
for (int j = 0; j < dims.nbDims; j++) trt_shape[j + 1] = dims.d[j];
OP_REQUIRES_OK(context,
TensorShapeUtils::MakeShape(
trt_shape.data(), trt_shape.size(), &output_shape));
} else {
LOG(FATAL) << "output node not found, at " << output_nodes_[i];
break;
}
OP_REQUIRES_OK(context,
context->allocate_output(i, output_shape, &output_tensor));
switch (trt_engine_ptr_->getBindingDataType(binding_index)) {
case nvinfer1::DataType::kFLOAT:
buffers[binding_index] =
reinterpret_cast<void*>(output_tensor->flat<float>().data());
break;
case nvinfer1::DataType::kHALF:
LOG(FATAL) << "half size is not supported yet!";
break;
case nvinfer1::DataType::kINT8:
LOG(FATAL) << "int8 is not supported yet!";
break;
}
}
// copied from cuda_kernel_helper since it seems only valid in *.cu.cc files
const cudaStream_t* stream = CHECK_NOTNULL(
reinterpret_cast<const cudaStream_t*>(context->op_device_context()
->stream()
->implementation()
->CudaStreamMemberHack()));
// execution handled by TF since we are getting stream from TF.
// it is safe for CPU pointer array (buffers) to go out of scope after enqueue
trt_execution_context_ptr_->enqueue(num_batch, &buffers[0], *stream, nullptr);
}
REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp);
} // namespace tensorrt
} // namespace tensorflow
#endif // GOOGLE_TENSORRT
#endif // GOOGLE_CUDA

View File

@ -0,0 +1,62 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
#define TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
#include <memory>
#include <string>
#include <vector>
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
#include "cuda/include/cuda_runtime_api.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorrt/include/NvInfer.h"
namespace tensorflow {
namespace tensorrt {
class Logger;
class TRTEngineOp : public OpKernel {
public:
explicit TRTEngineOp(OpKernelConstruction* context);
void Compute(OpKernelContext* context) override;
private:
template <typename T>
struct Destroyer {
void operator()(T* d) { d->destroy(); }
};
template <typename T>
using destroyed_ptr = std::unique_ptr<T, Destroyer<T>>;
destroyed_ptr<nvinfer1::ICudaEngine> trt_engine_ptr_;
// TODO(samikama): context should go to a resource manager!
destroyed_ptr<nvinfer1::IExecutionContext> trt_execution_context_ptr_;
std::vector<string> input_nodes_;
std::vector<string> output_nodes_;
};
} // namespace tensorrt
} // namespace tensorflow
#endif // GOOGLE_TENSORRT
#endif // GOOGLE_CUDA
#endif // TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_

View File

@ -0,0 +1,57 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
#include "tensorflow/core/platform/logging.h"
namespace tensorflow {
namespace tensorrt {
// Use TF logging for TensorRT information.
void Logger::log(Severity severity, const char* msg) {
// Suppress info-level messages
switch (severity) {
case Severity::kINFO: { // Mark TRT info messages as debug!
VLOG(2) << msg;
break;
}
case Severity::kWARNING: {
LOG(WARNING) << msg;
break;
}
case Severity::kERROR: {
LOG(ERROR) << msg;
break;
}
case Severity::kINTERNAL_ERROR: {
LOG(FATAL) << msg;
break;
}
    // Unreachable for now, but will catch new severity levels if the enum ever
    // changes. It is always good to have a default case.
default: {
LOG(FATAL) << name_ << "Got unknown severity level from TRT " << msg;
break;
}
}
}
} // namespace tensorrt
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // GOOGLE_TENSORRT

View File

@ -0,0 +1,42 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
#define TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
#include "tensorflow/core/platform/types.h"
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
#include "tensorrt/include/NvInfer.h"
namespace tensorflow {
namespace tensorrt {
// Logger for GIE info/warning/errors
class Logger : public nvinfer1::ILogger {
private:
void log(nvinfer1::ILogger::Severity severity, const char* msg) override;
string name_;
};
} // namespace tensorrt
} // namespace tensorflow
#endif // GOOGLE_TENSORRT
#endif // GOOGLE_CUDA
#endif // TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_

View File

@ -0,0 +1,43 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/tensor_shape.h"
namespace tensorflow {
namespace shape_inference {
extern Status TRTEngineOpShapeInference(InferenceContext* c);
}
REGISTER_OP("TRTEngineOp")
.Attr("serialized_engine: string")
.Attr("input_nodes: list(string)")
.Attr("output_nodes: list(string)")
.Attr("InT: list({float32})")
.Attr("OutT: list({float32})")
.Input("in_tensor: InT")
.Output("out_tensor: OutT")
.SetShapeFn(shape_inference::TRTEngineOpShapeInference);
} // namespace tensorflow
#endif // GOOGLE_TENSORRT
#endif // GOOGLE_CUDA

View File

@ -0,0 +1,24 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
"""Exposes the python wrapper for TensorRT graph transforms."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# pylint: disable=unused-import,line-too-long
from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
from tensorflow.contrib.tensorrt.python.trt_convert import create_inference_graph
# pylint: enable=unused-import,line-too-long

View File

@ -0,0 +1,34 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
"""Exposes the Python wrapper of TRTEngineOp."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import platform
if platform.system() != "Windows":
# pylint: disable=wildcard-import,unused-import,g-import-not-at-top
from tensorflow.contrib.tensorrt.ops.gen_trt_engine_op import *
from tensorflow.contrib.util import loader
from tensorflow.python.platform import resource_loader
# pylint: enable=wildcard-import,unused-import,g-import-not-at-top
_trt_engine_op = loader.load_op_library(
resource_loader.get_path_to_datafile("_trt_engine_op.so"))
else:
raise RuntimeError("Windows platforms are not supported")

View File

@ -0,0 +1,103 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
"""Exposes the Python wrapper conversion to trt_graph."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# pylint: disable=unused-import,line-too-long
import six as _six
from tensorflow.contrib.tensorrt.wrap_conversion import trt_convert
from tensorflow.core.framework import graph_pb2
from tensorflow.python.framework import errors
from tensorflow.python.framework import errors_impl as _impl
from tensorflow.python.framework import ops
# TODO(skama): get outputs from session when implemented as c++
# optimization pass
def create_inference_graph(input_graph_def,
outputs,
max_batch_size=1,
max_workspace_size_bytes=2 << 20):
"""Python wrapper for the TRT transormation.
Args:
input_graph_def: GraphDef object containing a model to be transformed.
outputs: List of tensors or node names for the model outputs.
max_batch_size: max size for the input batch
max_workspace_size_bytes: parameter to control memory allocation (in Bytes)
Returns:
New GraphDef with TRTEngineOps placed in graph replacing subgraphs.
Raises:
RuntimeError: if the returned status message is malformed.
"""
def py2bytes(inp):
return inp
def py3bytes(inp):
return inp.encode("utf-8", errors="surrogateescape")
def py2string(inp):
return inp
def py3string(inp):
return inp.decode("utf-8")
if _six.PY2:
to_bytes = py2bytes
to_string = py2string
else:
to_bytes = py3bytes
to_string = py3string
out_names = []
for i in outputs:
if isinstance(i, ops.Tensor):
out_names.append(to_bytes(i.name))
else:
out_names.append(to_bytes(i))
input_graph_def_str = input_graph_def.SerializeToString()
# TODO(sami): Fix this when we can return status from C++ library
# There is a problem with the TF internal library setup that doesn't
# allow us to return a status object from C++. Thus we return a
# pair or strings where first one is encoded status and the second
# one is the transformed graphs protobuf string.
out = trt_convert(input_graph_def_str, out_names, max_batch_size,
max_workspace_size_bytes)
status = to_string(out[0])
output_graph_def_string = out[1]
del input_graph_def_str # Save some memory
if len(status) < 2:
raise _impl.UnknownError(None, None, status)
if status[:2] != "OK":
msg = status.split(";")
if len(msg) == 1:
raise RuntimeError("Status message is malformed {}".format(status))
# pylint: disable=protected-access
raise _impl._make_specific_exception(None, None, ";".join(msg[1:]),
int(msg[0]))
# pylint: enable=protected-access
output_graph_def = graph_pb2.GraphDef()
output_graph_def.ParseFromString(output_graph_def_string)
del output_graph_def_string # Save some memory
return output_graph_def

View File

@ -0,0 +1,253 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/tensorrt/segment/segment.h"
#include <set>
#include <unordered_map>
#include <vector>
#include "tensorflow/contrib/tensorrt/segment/union_find.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/graph_constructor.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
namespace tensorrt {
namespace segment {
namespace {
bool CanContractEdge(const tensorflow::Edge* edge,
const tensorflow::Graph& graph) {
const tensorflow::Node* src = edge->src();
const tensorflow::Node* dst = edge->dst();
// Can't contract edge if doing so would cause a cycle in the
// graph. So, if there is a directed path from 'src' to 'dst', other
// than 'edge' (or any other direct edge from 'src' to 'dst'), then
// combining 'src' and 'dst' will cause a cycle along that path.
//
// In practice, to avoid modifying the graph and to take advantage
// of existing graph functions, we perform an equivalent.
// 1. Get all nodes incoming to 'dst', excluding 'src'
// 2. Reverse DFS from those nodes
// 3. If reverse DFS reaches 'src' then we have a cycle
std::vector<tensorflow::Node*> dfs_start_nodes;
for (tensorflow::Node* node : dst->in_nodes()) {
if (node != src) {
dfs_start_nodes.push_back(node);
}
}
bool is_cycle = false;
if (!dfs_start_nodes.empty()) {
tensorflow::ReverseDFSFrom(graph, dfs_start_nodes, {},
[&is_cycle, src](tensorflow::Node* node) {
if (node == src) {
is_cycle = true;
}
});
}
return !is_cycle;
}
void ContractEdge(tensorflow::Edge* edge, tensorflow::Graph* graph,
std::vector<const tensorflow::Edge*>* remove_edges) {
// Transfer all inputs and outputs of 'dst' to 'src' except edges
// connecting the two.
tensorflow::Node* src = edge->src();
tensorflow::Node* dst = edge->dst();
// We can use '0' for input/output index because we don't need them
// to be accurate for the way we are using the graph.
std::vector<const tensorflow::Edge*> in_edges(dst->in_edges().begin(),
dst->in_edges().end());
for (const tensorflow::Edge* in_edge : in_edges) {
if (in_edge->src() != src) {
tensorflow::Edge* e = const_cast<tensorflow::Edge*>(in_edge);
if (e->src() == graph->source_node()) {
graph->AddEdge(e->src(), e->src_output(), src,
tensorflow::Graph::kControlSlot);
} else {
graph->AddEdge(e->src(), e->src_output(), src, 0 /* input index */);
}
}
}
std::vector<const tensorflow::Edge*> out_edges(dst->out_edges().begin(),
dst->out_edges().end());
for (const tensorflow::Edge* out_edge : out_edges) {
tensorflow::Edge* e = const_cast<tensorflow::Edge*>(out_edge);
if (e->dst() == graph->sink_node()) {
graph->AddEdge(src, tensorflow::Graph::kControlSlot, e->dst(),
e->dst_input());
} else {
graph->AddEdge(src, 0 /* output index */, e->dst(), e->dst_input());
}
}
// Return the edges that must be removed to disconnect 'dst' from
// the graph. We don't actually remove 'dst' since the caller holds
// references to all the nodes.
for (const auto& in_edge : dst->in_edges()) {
remove_edges->push_back(in_edge);
}
for (const auto& out_edge : dst->out_edges()) {
remove_edges->push_back(out_edge);
}
}
} // namespace
tensorflow::Status SegmentGraph(
const tensorflow::GraphDef& gdef,
const std::function<bool(const tensorflow::NodeDef&)>& candidate_fn,
const SegmentOptions& options, SegmentNodesVector* segments) {
// Create a Graph representation of the GraphDef.
tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
gdef.library());
tensorflow::Graph graph(flib);
TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
tensorflow::GraphConstructorOptions(), gdef, &graph));
// tensorflow::DumpGraph("Pre-Segment", &graph);
// Use a union-find to collect the nodes that belong to the same
// segment. A node value of nullptr indicates that the node is not a
// candidate for TRT.
std::vector<UnionFind<tensorflow::Node*>> node_segments;
for (int i = 0; i < graph.num_node_ids(); ++i) {
tensorflow::Node* node = graph.FindNodeId(i);
if (options.exclude_node_list.count(node->name()) != 0 ||
!candidate_fn(node->def())) {
node = nullptr;
}
node_segments.emplace_back(node);
}
// The segmentation algorithm below visits nodes in reverse
// topological order and attempts to merge nodes along output
// edges. That means that subgraphs grow from the output-side of the
// network towards the inputs. In general this is not guaranteed to
// produce a globally optimal segmentation. In the future if we have
// a measure of how beneficial it is to include a given node in a
// TRT subgraph then we can revisit this algorithm to take advantage
// of that information.
std::vector<tensorflow::Node*> order;
tensorflow::GetPostOrder(graph, &order);
for (const tensorflow::Node* node : order) {
// All output nodes of 'node' have been visited...
VLOG(2) << "Trying node " << node->name();
// 'node' must be a TRT candidate...
if (node_segments[node->id()].Value() == nullptr) {
VLOG(2) << "... not a TRT candidate";
continue;
}
// Contract output edges to combine 'node' with output
// nodes. Iterate since combining two nodes may unblock other
// combining.
while (true) {
std::set<const tensorflow::Edge*> contract_edges;
for (const tensorflow::Edge* out_edge : node->out_edges()) {
VLOG(2) << "... out node " << out_edge->dst()->name();
// Out node must be TRT candidate...
if (node_segments[out_edge->dst()->id()].Value() == nullptr) {
VLOG(2) << "... ... not a TRT candidate";
continue;
}
if (CanContractEdge(out_edge, graph)) {
VLOG(2) << "... ... can contract";
contract_edges.insert(out_edge);
} else {
VLOG(2) << "... ... cannot contract, would form cycle";
}
}
if (contract_edges.empty()) {
break;
}
// Contract edges and collect the adjacent nodes into the same
// segment/subgraph.
while (!contract_edges.empty()) {
const tensorflow::Edge* contract_edge = *contract_edges.begin();
const tensorflow::Node* src = contract_edge->src();
const tensorflow::Node* dst = contract_edge->dst();
VLOG(2) << "Merge " << src->name() << " <- " << dst->name();
node_segments[src->id()].Merge(&node_segments[dst->id()]);
// Contracting the edge leaves disconnected graph edges.
// Remove these from the graph and from 'contract_edges' so we
// don't visit them again.
tensorflow::Edge* e = const_cast<tensorflow::Edge*>(contract_edge);
std::vector<const tensorflow::Edge*> remove_edges;
ContractEdge(e, &graph, &remove_edges);
for (const tensorflow::Edge* r : remove_edges) {
contract_edges.erase(r);
graph.RemoveEdge(r);
}
}
}
}
// Collect the segments/subgraphs. Each subgraph is represented by a
// set of the names of the nodes in that subgraph.
std::unordered_map<string, std::set<string>> sg_map;
for (auto& u : node_segments) {
if ((u.Value() != nullptr) && (u.ParentValue() != nullptr)) {
sg_map[u.ParentValue()->name()].insert(u.Value()->name());
}
}
// Convert the segments into the expected return format
for (const auto& itr : sg_map) {
const auto& segment_node_names = itr.second;
if (VLOG_IS_ON(1)) {
string s;
for (const auto& name : segment_node_names) {
s += " " + name;
}
VLOG(1) << "Segment " << segments->size() << ":" << s;
}
// Don't use small segments.
if (static_cast<int>(segment_node_names.size()) <
options.minimum_segment_size) {
VLOG(1) << "Segment " << segments->size() << " has only "
<< segment_node_names.size() << " nodes, dropping";
continue;
}
segments->emplace_back(segment_node_names);
}
return tensorflow::Status::OK();
}
} // namespace segment
} // namespace tensorrt
} // namespace tensorflow

View File

@ -0,0 +1,56 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_SEGMENT_H_
#define TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_SEGMENT_H_
#include <set>
#include <vector>
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
namespace tensorrt {
namespace segment {
using SegmentNodesVector = std::vector<std::set<string>>;
struct SegmentOptions {
// Segment must contain at least this many nodes.
int minimum_segment_size = 2;
std::set<string> exclude_node_list;
};
// Get the subgraphs of a graph that can be handled by TensorRT.
//
// @param gdef The GraphDef describing the network
// @param candidate_fn A function that returns true for a NodeDef if
// that node can be handled by TensorRT.
// @param segments Returns the TensorRT segments/subgraphs. Each entry
// in the vector describes a subgraph by giving a set of the names of
// all the NodeDefs in that subgraph.
// @return the status.
tensorflow::Status SegmentGraph(
const tensorflow::GraphDef& gdef,
const std::function<bool(const tensorflow::NodeDef&)>& candidate_fn,
const SegmentOptions& options, SegmentNodesVector* segments);
} // namespace segment
} // namespace tensorrt
} // namespace tensorflow
#endif // TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_SEGMENT_H_

View File

@ -0,0 +1,367 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/tensorrt/segment/segment.h"
#include "tensorflow/c/c_api.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
namespace tensorrt {
namespace segment {
namespace test {
class SegmentTest : public ::testing::Test {
public:
bool GetGraphDef(TF_Graph* graph, tensorflow::GraphDef* graph_def);
TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s, const char* name);
TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
TF_Status* s, const char* name);
std::function<bool(const NodeDef&)> MakeCandidateFn(
const std::set<string>& node_names);
protected:
void PlaceholderHelper(TF_Graph* graph, TF_Status* s, const char* name,
TF_Operation** op);
void AddHelper(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
TF_Status* s, const char* name, TF_Operation** op, bool check);
SegmentOptions default_options_;
};
bool SegmentTest::GetGraphDef(TF_Graph* graph,
tensorflow::GraphDef* graph_def) {
TF_Status* s = TF_NewStatus();
TF_Buffer* buffer = TF_NewBuffer();
TF_GraphToGraphDef(graph, buffer, s);
bool ret = TF_GetCode(s) == TF_OK;
EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
if (ret) ret = graph_def->ParseFromArray(buffer->data, buffer->length);
TF_DeleteBuffer(buffer);
TF_DeleteStatus(s);
return ret;
}
std::function<bool(const NodeDef&)> SegmentTest::MakeCandidateFn(
const std::set<string>& node_names) {
return [node_names](const NodeDef& node) -> bool {
return node_names.find(node.name()) != node_names.end();
};
}
void SegmentTest::PlaceholderHelper(TF_Graph* graph, TF_Status* s,
const char* name, TF_Operation** op) {
TF_OperationDescription* desc = TF_NewOperation(graph, "Placeholder", name);
TF_SetAttrType(desc, "dtype", TF_INT32);
*op = TF_FinishOperation(desc, s);
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
ASSERT_NE(*op, nullptr);
}
TF_Operation* SegmentTest::Placeholder(TF_Graph* graph, TF_Status* s,
const char* name) {
TF_Operation* op;
PlaceholderHelper(graph, s, name, &op);
return op;
}
void SegmentTest::AddHelper(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
TF_Status* s, const char* name, TF_Operation** op,
bool check) {
TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name);
TF_Output add_inputs[2] = {{l, 0}, {r, 0}};
TF_AddInputList(desc, add_inputs, 2);
*op = TF_FinishOperation(desc, s);
if (check) {
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
ASSERT_NE(*op, nullptr);
}
}
TF_Operation* SegmentTest::Add(TF_Operation* l, TF_Operation* r,
TF_Graph* graph, TF_Status* s,
const char* name) {
TF_Operation* op;
AddHelper(l, r, graph, s, name, &op, true);
return op;
}
TEST_F(SegmentTest, Empty) {
TF_Graph* graph = TF_NewGraph();
GraphDef graph_def;
ASSERT_TRUE(GetGraphDef(graph, &graph_def));
SegmentNodesVector segments;
ASSERT_EQ(
SegmentGraph(graph_def, MakeCandidateFn({}), default_options_, &segments),
tensorflow::Status::OK());
// Expect no segments/subgraphs.
EXPECT_TRUE(segments.empty());
TF_DeleteGraph(graph);
}
TEST_F(SegmentTest, Simple) {
TF_Status* s = TF_NewStatus();
TF_Graph* graph = TF_NewGraph();
// feed
// // ||
// add0 add1
// | | /
// | add2
// | / ||
// add3 add4
// | /
// <sink>
//
TF_Operation* feed = Placeholder(graph, s, "feed");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("feed"), string(TF_OperationName(feed)));
TF_Operation* add0 = Add(feed, feed, graph, s, "add0");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add1 = Add(feed, feed, graph, s, "add1");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add2 = Add(add0, add1, graph, s, "add2");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add3 = Add(add0, add2, graph, s, "add3");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("add3"), string(TF_OperationName(add3)));
TF_Operation* add4 = Add(add2, add2, graph, s, "add4");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("add4"), string(TF_OperationName(add4)));
GraphDef graph_def;
ASSERT_TRUE(GetGraphDef(graph, &graph_def));
SegmentNodesVector segments;
ASSERT_EQ(
SegmentGraph(graph_def,
MakeCandidateFn({"add0", "add1", "add2", "add3", "add4"}),
default_options_, &segments),
tensorflow::Status::OK());
// Expect all Add operations to be collapsed into a single segment
ASSERT_EQ(segments.size(), 1);
std::vector<string> expected{"add0", "add1", "add2", "add3", "add4"};
for (const auto& ex : expected) {
EXPECT_TRUE(segments[0].find(ex) != segments[0].end())
<< "Missing expected node " << ex;
}
TF_DeleteGraph(graph);
TF_DeleteStatus(s);
}
TEST_F(SegmentTest, AvoidCycle) {
TF_Status* s = TF_NewStatus();
TF_Graph* graph = TF_NewGraph();
// add2 is not a TRT candidate so add0/add3 cannot be formed as a
// subgraph
//
// feed
// // ||
// add0 add1
// | | /
// | add2
// | / ||
// add3 add4
// | /
// <sink>
//
TF_Operation* feed = Placeholder(graph, s, "feed");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("feed"), string(TF_OperationName(feed)));
TF_Operation* add0 = Add(feed, feed, graph, s, "add0");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add1 = Add(feed, feed, graph, s, "add1");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add2 = Add(add0, add1, graph, s, "add2");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add3 = Add(add0, add2, graph, s, "add3");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("add3"), string(TF_OperationName(add3)));
TF_Operation* add4 = Add(add2, add2, graph, s, "add4");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("add4"), string(TF_OperationName(add4)));
GraphDef graph_def;
ASSERT_TRUE(GetGraphDef(graph, &graph_def));
SegmentNodesVector segments;
ASSERT_EQ(
SegmentGraph(graph_def, MakeCandidateFn({"add0", "add1", "add3", "add4"}),
default_options_, &segments),
tensorflow::Status::OK());
// Expect no subgraphs
EXPECT_EQ(segments.size(), 0);
TF_DeleteGraph(graph);
TF_DeleteStatus(s);
}
TEST_F(SegmentTest, Multiple) {
TF_Status* s = TF_NewStatus();
TF_Graph* graph = TF_NewGraph();
// add5 is not a TRT candidate so two subgraphs should be formed
//
// feed
// // || ||
// add0 add1 add7
// | | / / ||
// | add2-----add5 add8
// | / | | | |
// add3 add4 add6
// | | /
// <sink>
//
TF_Operation* feed = Placeholder(graph, s, "feed");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("feed"), string(TF_OperationName(feed)));
TF_Operation* add0 = Add(feed, feed, graph, s, "add0");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add1 = Add(feed, feed, graph, s, "add1");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add7 = Add(feed, feed, graph, s, "add7");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add2 = Add(add0, add1, graph, s, "add2");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add5 = Add(add2, add7, graph, s, "add5");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add8 = Add(add7, add7, graph, s, "add8");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add3 = Add(add0, add2, graph, s, "add3");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("add3"), string(TF_OperationName(add3)));
TF_Operation* add4 = Add(add2, add5, graph, s, "add4");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("add4"), string(TF_OperationName(add4)));
TF_Operation* add6 = Add(add5, add8, graph, s, "add6");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("add6"), string(TF_OperationName(add6)));
GraphDef graph_def;
ASSERT_TRUE(GetGraphDef(graph, &graph_def));
SegmentNodesVector segments;
ASSERT_EQ(SegmentGraph(graph_def,
MakeCandidateFn({"add0", "add1", "add2", "add3",
"add4", "add6", "add7", "add8"}),
default_options_, &segments),
tensorflow::Status::OK());
// Expect two subgraphs
EXPECT_EQ(segments.size(), 2);
std::vector<string> expected0{"add0", "add1", "add2", "add3"};
for (const auto& ex : expected0) {
EXPECT_TRUE(segments[0].find(ex) != segments[0].end())
<< "Missing expected node " << ex;
}
std::vector<string> expected1{"add6", "add8"};
for (const auto& ex : expected1) {
EXPECT_TRUE(segments[1].find(ex) != segments[1].end())
<< "Missing expected node " << ex;
}
TF_DeleteGraph(graph);
TF_DeleteStatus(s);
}
TEST_F(SegmentTest, BigIfElse) {
TF_Status* s = TF_NewStatus();
TF_Graph* graph = TF_NewGraph();
// add2 is not a TRT candidate
//
// feed
// ||
// add0
// // ||
// add1 add4
// || ||
// add2 add5
// || ||
// add3 add6
// || //
// add7
// ||
// <sink>
//
TF_Operation* feed = Placeholder(graph, s, "feed");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("feed"), string(TF_OperationName(feed)));
TF_Operation* add0 = Add(feed, feed, graph, s, "add0");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add1 = Add(add0, add0, graph, s, "add1");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add2 = Add(add1, add1, graph, s, "add2");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add3 = Add(add2, add2, graph, s, "add3");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add4 = Add(add0, add0, graph, s, "add4");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add5 = Add(add4, add4, graph, s, "add5");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add6 = Add(add5, add5, graph, s, "add6");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add7 = Add(add3, add6, graph, s, "add7");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("add7"), string(TF_OperationName(add7)));
GraphDef graph_def;
ASSERT_TRUE(GetGraphDef(graph, &graph_def));
SegmentNodesVector segments;
ASSERT_EQ(SegmentGraph(graph_def,
MakeCandidateFn({"add0", "add1", "add3", "add4",
"add5", "add6", "add7"}),
default_options_, &segments),
tensorflow::Status::OK());
// Expect 2 subgraphs
EXPECT_EQ(segments.size(), 2);
std::vector<string> expected0{"add3", "add4", "add5", "add6", "add7"};
for (const auto& ex : expected0) {
EXPECT_TRUE(segments[0].find(ex) != segments[0].end())
<< "Missing expected node " << ex;
}
std::vector<string> expected1{"add0", "add1"};
for (const auto& ex : expected1) {
EXPECT_TRUE(segments[1].find(ex) != segments[1].end())
<< "Missing expected node " << ex;
}
TF_DeleteGraph(graph);
TF_DeleteStatus(s);
}
} // namespace test
} // namespace segment
} // namespace tensorrt
} // namespace tensorflow

View File

@ -0,0 +1,79 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_UNION_FIND_H_
#define TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_UNION_FIND_H_
namespace tensorflow {
namespace tensorrt {
namespace segment {
// Union-Find data structure.
// Each cluster has an associated value; when merging clusters we can control
// which value becomes the representative of the merged clusters. Values must be
// copyable.
template <typename T>
class UnionFind {
public:
UnionFind() : size_(1), parent_(nullptr) {}
explicit UnionFind(const T& v) : size_(1), parent_(nullptr), value_(v) {}
// Returns the number of elements in a cluster.
int Size() { return FindRoot()->size_; }
// Merges this cluster with 'other'. This cluster's value becomes
// the value of the merged cluster; the value of 'other' is ignored.
void Merge(UnionFind* other);
// Each cluster has an associated value. Retrieves the value associated
// with this cluster.
T& ParentValue() { return FindRoot()->value_; }
// Get the original value of this node.
T& Value() { return value_; }
private:
// Finds the root element of the cluster. Performs path compression.
UnionFind* FindRoot();
int size_;
UnionFind* parent_;
T value_;
};
template <typename T>
void UnionFind<T>::Merge(UnionFind* other) {
UnionFind<T>* a = FindRoot();
UnionFind<T>* b = other->FindRoot();
if (a == b) return;
b->parent_ = a;
a->size_ += b->size_;
}
template <typename T>
UnionFind<T>* UnionFind<T>::FindRoot() {
if (!parent_) return this;
// Path compression: update intermediate nodes to point to the root of the
// equivalence class.
parent_ = parent_->FindRoot();
return parent_;
}
} // namespace segment
} // namespace tensorrt
} // namespace tensorflow
#endif // TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_UNION_FIND_H_

View File

@ -0,0 +1,89 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h"
#include <string>
#include <vector>
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorrt/include/NvInfer.h"
namespace tensorflow {
namespace shape_inference {
tensorflow::Status TRTEngineOpShapeInference(InferenceContext* context) {
tensorflow::tensorrt::Logger logger;
string serialized_engine;
TF_RETURN_IF_ERROR(context->GetAttr("serialized_engine", &serialized_engine));
nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(logger);
nvinfer1::ICudaEngine* trt_engine = infer->deserializeCudaEngine(
serialized_engine.c_str(), serialized_engine.size(), nullptr);
int num_batch = -1;
std::vector<::tensorflow::DataType> input_type;
TF_RETURN_IF_ERROR(context->GetAttr("InT", &input_type));
for (size_t i = 0; i < context->num_inputs(); i++) {
// Check if input shape is legit
auto input_shape = context->input(i);
for (int j = 0; j < context->Rank(input_shape); j++) {
auto dim_handler = context->Dim(input_shape, j);
if (j == 0) {
if (i == 0) {
num_batch = context->Value(dim_handler);
} else if (num_batch != context->Value(dim_handler)) {
// TODO(jie): TensorRT engine requires a consistent batch size across input
// tensors. The segmenter should be aware of this.
LOG(FATAL) << "TensorRT engine requires consistent batch size";
}
}
}
}
// Arrange input here
std::vector<string> input_nodes;
TF_RETURN_IF_ERROR(context->GetAttr("input_nodes", &input_nodes));
// Arrange output here
std::vector<string> output_nodes;
TF_RETURN_IF_ERROR(context->GetAttr("output_nodes", &output_nodes));
for (size_t i = 0; i < output_nodes.size(); i++) {
int binding_index = trt_engine->getBindingIndex(output_nodes[i].c_str());
ShapeHandle output_shape;
std::vector<DimensionHandle> dim_vec;
dim_vec.emplace_back(context->MakeDim(num_batch));
if (binding_index != -1) {
auto dims = trt_engine->getBindingDimensions(binding_index);
for (int j = 0; j < dims.nbDims; j++) {
dim_vec.emplace_back(context->MakeDim(dims.d[j]));
}
} else {
LOG(FATAL) << "TensorRT engine cannot find binding: " << output_nodes[i];
}
output_shape = context->MakeShape(dim_vec);
context->set_output(i, output_shape);
}
return Status::OK();
}
} // namespace shape_inference
} // namespace tensorflow
#endif // GOOGLE_TENSORRT
#endif // GOOGLE_CUDA

View File

@ -0,0 +1,33 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_TENSORRT_SHAPE_FN_TRT_SHFN_H_
#define TENSORFLOW_CONTRIB_TENSORRT_SHAPE_FN_TRT_SHFN_H_
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/lib/core/status.h"
namespace tensorflow {
namespace shape_inference {
Status TRTEngineOpShapeInference(InferenceContext* c);
} // namespace shape_inference
} // namespace tensorflow
#endif // GOOGLE_TENSORRT
#endif // GOOGLE_CUDA
#endif // TENSORFLOW_CONTRIB_TENSORRT_SHAPE_FN_TRT_SHFN_H_

View File

@ -0,0 +1,88 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Script to test TF-TensorRT integration."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
# Normally we would do `import tensorflow as tf` and then use tf.placeholder,
# tf.constant, tf.nn.conv2d, etc., but internal builds don't like that, so we
# import every module individually instead.
from tensorflow.contrib import tensorrt as trt
from tensorflow.core.protobuf import config_pb2 as cpb2
from tensorflow.python.client import session as csess
from tensorflow.python.framework import constant_op as cop
from tensorflow.python.framework import dtypes as dtypes
from tensorflow.python.framework import importer as importer
from tensorflow.python.framework import ops as ops
from tensorflow.python.ops import array_ops as aops
from tensorflow.python.ops import nn as nn
from tensorflow.python.ops import nn_ops as nn_ops
def get_simple_graph_def():
"""Create a simple graph and return its graph_def."""
g = ops.Graph()
with g.as_default():
a = aops.placeholder(
dtype=dtypes.float32, shape=(None, 24, 24, 2), name="input")
e = cop.constant(
[[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
name="weights",
dtype=dtypes.float32)
conv = nn.conv2d(
input=a, filter=e, strides=[1, 2, 2, 1], padding="SAME", name="conv")
b = cop.constant(
[4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtypes.float32)
t = nn.bias_add(conv, b, name="biasAdd")
relu = nn.relu(t, "relu")
idty = aops.identity(relu, "ID")
v = nn_ops.max_pool(
idty, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
aops.squeeze(v, name="output")
return g.as_graph_def()
def run_graph(gdef, dumm_inp):
gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
ops.reset_default_graph()
g = ops.Graph()
with g.as_default():
inp, out = importer.import_graph_def(
graph_def=gdef, return_elements=["input", "output"])
inp = inp.outputs[0]
out = out.outputs[0]
with csess.Session(
config=cpb2.ConfigProto(gpu_options=gpu_options), graph=g) as sess:
val = sess.run(out, {inp: dumm_inp})
return val
if "__main__" in __name__:
inp_dims = (100, 24, 24, 2)
dummy_input = np.random.random_sample(inp_dims)
gdef = get_simple_graph_def()
# Get optimized graph
trt_graph = trt.create_inference_graph(gdef, ["output"], inp_dims[0])
o1 = run_graph(gdef, dummy_input)
o2 = run_graph(trt_graph, dummy_input)
o3 = run_graph(trt_graph, dummy_input)
assert np.array_equal(o1, o2)
assert np.array_equal(o3, o2) # sanity check
print("Pass")

View File

@ -0,0 +1,131 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
/* Wrap trt_conversion */
%{
#define SWIG_FILE_WITH_INIT
%}
%include "std_pair.i"
%include "tensorflow/python/platform/base.i"
%{
PyObject* pair_helper(std::pair<string, string>* in) {
PyObject *first(nullptr), *second(nullptr), *tuple(nullptr);
first = PyBytes_FromStringAndSize(in->first.data(), in->first.length());
if (!first) {
if (!PyErr_Occurred()) {
PyErr_SetString(PyExc_TypeError, "Pair conversion first argument failed");
}
return NULL;
}
second = PyBytes_FromStringAndSize(in->second.data(), in->second.length());
if (!second) {
if (!PyErr_Occurred()) {
PyErr_SetString(PyExc_TypeError,
"Pair conversion second argument failed");
}
return NULL;
}
tuple = Py_BuildValue("(OO)", first, second);
if (!tuple) {
if (!PyErr_Occurred()) {
PyErr_SetString(PyExc_TypeError,
"Tuple creation from pair<string,string> failed!");
}
return NULL;
}
return tuple;
}
%}
%typemap(out) std::pair<string, string> {
PyObject *tuple = pair_helper(&$1);
if (!tuple) SWIG_fail;
$result = tuple;
}
%{
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/util/stat_summarizer.h"
#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
%}
%ignoreall
%unignore tensorflow;
%unignore trt_convert;
%{
std::pair<string, string> trt_convert(
string graph_def_string, // The serialized GraphDef string.
std::vector<string> output_names,
size_t max_batch_size,
size_t max_workspace_size_bytes
// Unfortunately we can't use TF_Status here since it
// is in c/c_api and brings in a lot of other libraries
// which in turn declare ops. These ops are included
// statically in our library and cause an abort when the
// module is loaded due to double registration. Until
// TensorFlow properly exposes these headers, we work
// around this by returning a string and converting it
// to an exception on the Python side.
//,TF_Status* out_status) {
) {
#if GOOGLE_CUDA && GOOGLE_TENSORRT
string out_status;
tensorflow::GraphDef graph_def;
if (!graph_def.ParseFromString(graph_def_string)) {
out_status = "InvalidArgument;Couldn't interpret input as a GraphDef";
return std::pair<string, string>{out_status, ""};
}
if (!output_names.size()) {
out_status = "InvalidArgument;Size of the output_names vector is 0";
return std::pair<string, string>{out_status, ""};
// return "";
}
tensorflow::GraphDef outGraph;
tensorflow::Status conversion_status =
tensorflow::tensorrt::convert::ConvertGraphDefToTensorRT(
graph_def, output_names, max_batch_size, max_workspace_size_bytes,
&outGraph);
if (!conversion_status.ok()) {
auto retCode = (int)conversion_status.code();
char buff[2000];
snprintf(buff, 2000, "%d;%s", retCode,
conversion_status.error_message().c_str());
out_status = buff;
return std::pair<string, string>{out_status, ""};
}
string result;
if (!outGraph.SerializeToString(&result)) {
out_status = "InvalidArgument;Couldn't serialize output as a GraphDef";
return std::pair<string, string>{out_status, ""};
}
out_status = "OK;All good!";
return std::pair<string, string>{out_status, result};
#else
// Returns FAILED_PRECONDITION.
return std::pair<string, string>{"9;TensorRT is not enabled!", ""};
#endif // GOOGLE_CUDA && GOOGLE_TENSORRT
}
%}
std::pair<string, string> trt_convert(string graph_def_string,
std::vector<string> output_names,
size_t max_batch_size,
size_t max_workspace_size_bytes);
%unignoreall
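As the comment above notes, the serialized "code;message" status is meant to be converted back into an exception on the Python side. A minimal sketch of that handling, assuming the SWIG-generated module is passed in as `trt_wrapper` (the module name and helper function are assumptions; only the (status, graph) pair and the "code;message" format come from the wrapper above):
```python
# Hypothetical Python-side handling of trt_convert's (status, graph) pair.
def convert_or_raise(trt_wrapper, graph_def_bytes, output_names,
                     max_batch_size, max_workspace_size_bytes):
  status, trt_graph = trt_wrapper.trt_convert(
      graph_def_bytes, output_names, max_batch_size, max_workspace_size_bytes)
  if isinstance(status, bytes):
    status = status.decode("utf-8")
  code, _, message = status.partition(";")
  if code != "OK":
    # Re-raise the serialized status as a Python exception.
    raise RuntimeError("TRT conversion failed (%s): %s" % (code, message))
  return trt_graph
```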

View File

@ -20,7 +20,7 @@ from __future__ import print_function
from setuptools import setup
_VERSION = '1.6.0-rc0'
_VERSION = '1.6.0-rc1'
CONSOLE_SCRIPTS = [
'capture_tpu_profile=cloud_tpu_profiler.main:run_main',

View File

@ -40,7 +40,7 @@ namespace tensorflow {
// a BaseGPUDevice. Note that the configuration allows us to create multiple
// BaseGPUDevice per GPU hardware in order to use multi CUDA streams on the
// hardware, so the mapping between TF GPU id and CUDA GPU id is not a 1:1
// mappping, see the example below.
// mapping, see the example below.
//
// For example, assuming that in the machine we have GPU device with index 0, 1,
// 2 and 3 (physical GPU id). Setting "CUDA_VISIBLE_DEVICES=1,2,3" will create

View File

@ -21,7 +21,6 @@ limitations under the License.
#ifdef INTEL_MKL
#include <unistd.h>
#include <cstdlib>
#include <string>
#include "tensorflow/core/common_runtime/bfc_allocator.h"

View File

@ -191,9 +191,6 @@ class TensorShapeBase : public TensorShapeRep {
/// Appends all the dimensions from `shape`.
void AppendShape(const TensorShapeBase& shape);
// Maximum number of dimensions in a tensor.
static constexpr int MaxDimensions() { return 254; }
/// \brief Insert a dimension somewhere in the `TensorShape`.
/// REQUIRES: `0 <= d <= dims()`
/// REQUIRES: `size >= 0`

View File

@ -222,7 +222,7 @@ Status MklToTfConversionPass::InsertInputConversionNode(
BaseType(n->input_type(0)));
// Check ordering of edges
for (uint i = 0; i < 4; i++) {
for (uint32 i = 0; i < 4; i++) {
CHECK_EQ((edges[i]->dst_input() == i), true);
}

View File

@ -71,7 +71,7 @@ class RGBToHSVOp : public OpKernel {
TensorShape({input_data.dimension(0)}),
&trange));
typename TTypes<T, 1>::Tensor range = trange.tensor<T, 1>();
typename TTypes<T, 1>::Tensor range(trange.tensor<T, 1>());
functor::RGBToHSV<Device, T>()(context->eigen_device<Device>(), input_data,
range, output_data);

View File

@ -24,12 +24,12 @@ limitations under the License.
#include "tensorflow/core/util/cuda_kernel_helper.h"
#include "tensorflow/core/util/tensor_format.h"
#if !defined(_MSC_VER)
#define UNROLL _Pragma("unroll")
#define NOUNROLL _Pragma("nounroll")
#else
#if defined(_MSC_VER) && !defined(__clang__)
#define UNROLL
#define NOUNROLL
#else
#define UNROLL _Pragma("unroll")
#define NOUNROLL _Pragma("nounroll")
#endif
namespace tensorflow {

View File

@ -29,7 +29,6 @@ limitations under the License.
#include <vector>
#include "mkl_cblas.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/numeric_types.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
@ -41,9 +40,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#define MKL_Complex8 tensorflow::complex64
#define MKL_Complex16 tensorflow::complex128
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
@ -180,16 +176,16 @@ class BatchMatMulMkl : public OpKernel {
void MklCblasGemmBatch(const CBLAS_LAYOUT Layout, const bool TransA,
const bool TransB, const MKL_INT *M_Array,
const MKL_INT *N_Array, const MKL_INT *K_Array,
const MKL_Complex8 **A_Array, const MKL_INT *lda_Array,
const MKL_Complex8 **B_Array, const MKL_INT *ldb_Array,
MKL_Complex8 **C_Array, const MKL_INT *ldc_Array,
const complex64 **A_Array, const MKL_INT *lda_Array,
const complex64 **B_Array, const MKL_INT *ldb_Array,
complex64 **C_Array, const MKL_INT *ldc_Array,
const MKL_INT group_count, const MKL_INT *group_size) {
std::vector<CBLAS_TRANSPOSE> TransA_array(
group_size[0], TransA ? CblasConjTrans : CblasNoTrans);
std::vector<CBLAS_TRANSPOSE> TransB_array(
group_size[0], TransB ? CblasConjTrans : CblasNoTrans);
std::vector<MKL_Complex8> alpha_Array(group_size[0], {1.0f, 0.0f});
std::vector<MKL_Complex8> beta_Array(group_size[0], {0.0f, 0.0f});
std::vector<complex64> alpha_Array(group_size[0], {1.0f, 0.0f});
std::vector<complex64> beta_Array(group_size[0], {0.0f, 0.0f});
cblas_cgemm_batch(
Layout, &TransA_array[0], &TransB_array[0], M_Array, N_Array, K_Array,
static_cast<const void *>(&alpha_Array[0]),
@ -202,18 +198,16 @@ class BatchMatMulMkl : public OpKernel {
void MklCblasGemmBatch(const CBLAS_LAYOUT Layout, const bool TransA,
const bool TransB, const MKL_INT *M_Array,
const MKL_INT *N_Array, const MKL_INT *K_Array,
const MKL_Complex16 **A_Array,
const MKL_INT *lda_Array,
const MKL_Complex16 **B_Array,
const MKL_INT *ldb_Array, MKL_Complex16 **C_Array,
const MKL_INT *ldc_Array, const MKL_INT group_count,
const MKL_INT *group_size) {
const complex128 **A_Array, const MKL_INT *lda_Array,
const complex128 **B_Array, const MKL_INT *ldb_Array,
complex128 **C_Array, const MKL_INT *ldc_Array,
const MKL_INT group_count, const MKL_INT *group_size) {
std::vector<CBLAS_TRANSPOSE> TransA_array(
group_size[0], TransA ? CblasConjTrans : CblasNoTrans);
std::vector<CBLAS_TRANSPOSE> TransB_array(
group_size[0], TransB ? CblasConjTrans : CblasNoTrans);
std::vector<MKL_Complex16> alpha_Array(group_size[0], {1.0f, 0.0f});
std::vector<MKL_Complex16> beta_Array(group_size[0], {0.0f, 0.0f});
std::vector<complex128> alpha_Array(group_size[0], {1.0f, 0.0f});
std::vector<complex128> beta_Array(group_size[0], {0.0f, 0.0f});
cblas_zgemm_batch(
Layout, &TransA_array[0], &TransB_array[0], M_Array, N_Array, K_Array,
static_cast<const void *>(&alpha_Array[0]),

View File

@ -145,8 +145,8 @@ class MklInputConversionOp : public OpKernel {
const MklShape* mkl_shape;
const Tensor* tf_tensor;
MklShape* tf_mkl_shape;
uint mkl_tensor_index;
uint tf_tensor_index;
uint32 mkl_tensor_index;
uint32 tf_tensor_index;
if (input_shape_0.IsMklTensor() && !input_shape_1.IsMklTensor()) {
mkl_tensor = &input_tensor_0;
mkl_shape = &input_shape_0;

View File

@ -170,32 +170,32 @@ class MklMatMulOp : public OpKernel {
// Matrix-Matrix Multiplication with Complex64 (std::complex<float>) tensors.
// For detailed info about parameters, look at FP32 function description.
void MklBlasGemm(bool transa, bool transb, const int m, const int n,
const int k, const std::complex<float>* a, const int lda,
const std::complex<float>* b, const int ldb,
std::complex<float>* c, int const ldc) {
const int k, const complex64* a, const int lda,
const complex64* b, const int ldb, complex64* c,
int const ldc) {
const MKL_Complex8 alpha = {1.0f, 0.0f};
const MKL_Complex8 beta = {0.0f, 0.0f};
cblas_cgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
transb ? CblasTrans : CblasNoTrans, m, n, k,
static_cast<const void*>(&alpha), static_cast<const void*>(a),
lda, static_cast<const void*>(b), ldb,
static_cast<const void*>(&beta), static_cast<void*>(c), ldc);
transb ? CblasTrans : CblasNoTrans, m, n, k, &alpha,
reinterpret_cast<const MKL_Complex8*>(a), lda,
reinterpret_cast<const MKL_Complex8*>(b), ldb, &beta,
reinterpret_cast<MKL_Complex8*>(c), ldc);
}
// Matrix-Matrix Multiplication with Complex128 (std::complex<double>)
// tensors. For detailed info about parameters, look at FP32 function
// description.
void MklBlasGemm(bool transa, bool transb, const int m, const int n,
const int k, const std::complex<double>* a, const int lda,
const std::complex<double>* b, const int ldb,
std::complex<double>* c, const int ldc) {
const int k, const complex128* a, const int lda,
const complex128* b, const int ldb, complex128* c,
const int ldc) {
const MKL_Complex16 alpha = {1.0, 0.0};
const MKL_Complex16 beta = {0.0, 0.0};
cblas_zgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
transb ? CblasTrans : CblasNoTrans, m, n, k,
static_cast<const void*>(&alpha), static_cast<const void*>(a),
lda, static_cast<const void*>(b), ldb,
static_cast<const void*>(&beta), static_cast<void*>(c), ldc);
transb ? CblasTrans : CblasNoTrans, m, n, k, &alpha,
reinterpret_cast<const MKL_Complex16*>(a), lda,
reinterpret_cast<const MKL_Complex16*>(b), ldb, &beta,
reinterpret_cast<MKL_Complex16*>(c), ldc);
}
};

View File

@ -128,7 +128,7 @@ class MklToTfOp : public OpKernel {
#else
static void ConvertMklToTf(OpKernel* op_kernel, OpKernelContext* context,
string data_format_str, DataType op_data_type,
bool has_avx512f, uint input_number) {
bool has_avx512f, uint32 input_number) {
// Check that input tensor is in MKL format.
const Tensor& input_tensor = MklGetInput(context, input_number);
MklShape input_shape;

View File

@ -18,9 +18,6 @@ limitations under the License.
#ifdef INTEL_MKL
#define EIGEN_USE_THREADS
#include "tensorflow/core/framework/numeric_types.h"
#define MKL_Complex8 tensorflow::complex64
#define MKL_Complex16 tensorflow::complex128
#include "mkl_trans.h"
#include "tensorflow/core/kernels/transpose_functor.h"
#include "tensorflow/core/kernels/transpose_op.h"
@ -62,10 +59,37 @@ Status MKLTranspose2D(const char trans, const Tensor& in, Tensor* out);
INSTANTIATE(float, s)
INSTANTIATE(double, d)
INSTANTIATE(complex64, c)
INSTANTIATE(complex128, z)
#undef INSTANTIATE
template <>
Status MKLTranspose2D<complex64>(const char trans, const Tensor& in,
Tensor* out) {
const MKL_Complex8 alpha = {1.0f, 0.0f};
mkl_comatcopy(
'R', trans, in.dim_size(0), in.dim_size(1), alpha,
reinterpret_cast<const MKL_Complex8*>(in.flat<complex64>().data()),
in.dim_size(1),
reinterpret_cast<MKL_Complex8*>(
const_cast<complex64*>(out->flat<complex64>().data())),
in.dim_size(0));
return Status::OK();
}
template <>
Status MKLTranspose2D<complex128>(const char trans, const Tensor& in,
Tensor* out) {
const MKL_Complex16 alpha = {1.0, 0.0};
mkl_zomatcopy(
'R', trans, in.dim_size(0), in.dim_size(1), alpha,
reinterpret_cast<const MKL_Complex16*>(in.flat<complex128>().data()),
in.dim_size(1),
reinterpret_cast<MKL_Complex16*>(
const_cast<complex128*>(out->flat<complex128>().data())),
in.dim_size(0));
return Status::OK();
}
static const char kMKLTranspose = 'T';
static const char kMKLConjugateTranspose = 'C';

View File

@ -105,7 +105,7 @@ void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& boxes,
}
const int output_size = std::min(max_output_size.scalar<int>()(), num_boxes);
typename TTypes<float, 2>::ConstTensor boxes_data = boxes.tensor<float, 2>();
TTypes<float, 2>::ConstTensor boxes_data = boxes.tensor<float, 2>();
std::vector<float> scores_data(num_boxes);
std::copy_n(scores.flat<float>().data(), num_boxes, scores_data.begin());
@ -138,8 +138,7 @@ void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& boxes,
Tensor* output = nullptr;
TensorShape output_shape({static_cast<int>(selected.size())});
OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
typename TTypes<int, 1>::Tensor selected_indices_data =
output->tensor<int, 1>();
TTypes<int, 1>::Tensor selected_indices_data = output->tensor<int, 1>();
std::copy_n(selected.begin(), selected.size(), selected_indices_data.data());
}

View File

@ -29,7 +29,7 @@ limitations under the License.
#include "tensorflow/core/lib/random/random_distributions.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"
#ifdef COMPILER_MSVC
#if defined(_MSC_VER) && !defined(__clang__)
// msvc does not support unroll. One could try the loop pragma but we need to
// take a closer look if this generates better code in this case. For now let
// the compiler take care of it.

View File

@ -697,8 +697,8 @@ class QuantizedResizeBilinearOp : public OpKernel {
// Return if the output is empty.
if (st.output->NumElements() == 0) return;
typename TTypes<T, 4>::ConstTensor image_data = input.tensor<T, 4>();
typename TTypes<T, 4>::Tensor output_data = st.output->tensor<T, 4>();
typename TTypes<T, 4>::ConstTensor image_data(input.tensor<T, 4>());
typename TTypes<T, 4>::Tensor output_data(st.output->tensor<T, 4>());
ResizeBilinear<T>(image_data, st.height_scale, st.width_scale, in_min,
in_max, &output_data);

View File

@ -92,8 +92,8 @@ class RandomCropOp : public OpKernel {
// TODO(shlens): Do this more efficiently with memcpy once padding is
// available for smaller images.
typename TTypes<T, 3>::ConstTensor input_data = input.tensor<T, 3>();
typename TTypes<T, 3>::Tensor output_data = output->tensor<T, 3>();
typename TTypes<T, 3>::ConstTensor input_data(input.tensor<T, 3>());
typename TTypes<T, 3>::Tensor output_data(output->tensor<T, 3>());
for (int y = 0; y < target_height; ++y) {
for (int x = 0; x < target_width; ++x) {

View File

@ -149,7 +149,7 @@ class ResizeAreaOp : public OpKernel {
if (!context->status().ok()) return;
typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
typename TTypes<T, 4>::ConstTensor input_data(input.tensor<T, 4>());
// Precompute values used when iterating over x coordinates within a row.
// Note that it may be useful to cache x_interps for a given
@ -190,8 +190,7 @@ class ResizeAreaOp : public OpKernel {
void ComputeLoop(const ImageResizerState& st,
const std::vector<CachedInterpolation>& x_interps,
typename TTypes<T, 4>::ConstTensor input_data) {
typename TTypes<float, 4>::Tensor output_data =
st.output->tensor<float, 4>();
TTypes<float, 4>::Tensor output_data = st.output->tensor<float, 4>();
// When using this algorithm for downsizing, the target pixel value is the
// weighted average of all the source pixels. The weight is determined by

View File

@ -480,9 +480,8 @@ class ResizeBicubicOp : public OpKernel {
if (!context->status().ok()) return;
typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
typename TTypes<float, 4>::Tensor output_data =
st.output->tensor<float, 4>();
typename TTypes<T, 4>::ConstTensor input_data(input.tensor<T, 4>());
TTypes<float, 4>::Tensor output_data = st.output->tensor<float, 4>();
interpolate_with_caching<T>(input_data, st, output_data);
}
@ -510,9 +509,8 @@ class ResizeBicubicOpGrad : public OpKernel {
if (!context->status().ok()) return;
typename TTypes<float, 4>::ConstTensor input_grad =
input.tensor<float, 4>();
typename TTypes<T, 4>::Tensor output_grad = st.output->tensor<T, 4>();
TTypes<float, 4>::ConstTensor input_grad = input.tensor<float, 4>();
typename TTypes<T, 4>::Tensor output_grad(st.output->tensor<T, 4>());
ResizeBicubicGrad<T>(input_grad, st, output_grad);
}

View File

@ -51,9 +51,8 @@ class ResizeBilinearOp : public OpKernel {
// Return if the output is empty.
if (st.output->NumElements() == 0) return;
typename TTypes<T, 4>::ConstTensor image_data = input.tensor<T, 4>();
typename TTypes<float, 4>::Tensor output_data =
st.output->tensor<float, 4>();
typename TTypes<T, 4>::ConstTensor image_data(input.tensor<T, 4>());
TTypes<float, 4>::Tensor output_data = st.output->tensor<float, 4>();
functor::ResizeBilinear<Device, T>()(context->eigen_device<Device>(),
image_data, st.height_scale,
@ -258,9 +257,8 @@ class ResizeBilinearOpGrad : public OpKernel {
if (!context->status().ok()) return;
typename TTypes<float, 4>::ConstTensor input_grad =
input.tensor<float, 4>();
typename TTypes<T, 4>::Tensor output_grad = st.output->tensor<T, 4>();
TTypes<float, 4>::ConstTensor input_grad = input.tensor<float, 4>();
typename TTypes<T, 4>::Tensor output_grad(st.output->tensor<T, 4>());
functor::ResizeBilinearGrad<Device, T>()(context->eigen_device<Device>(),
input_grad, st.height_scale,

View File

@ -56,8 +56,8 @@ class ResizeNearestNeighborOp : public OpKernel {
// Return if the output is empty.
if (st.output->NumElements() == 0) return;
typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
typename TTypes<T, 4>::Tensor output_data = st.output->tensor<T, 4>();
typename TTypes<T, 4>::ConstTensor input_data(input.tensor<T, 4>());
typename TTypes<T, 4>::Tensor output_data(st.output->tensor<T, 4>());
bool status;
if (align_corners_) {
@ -162,8 +162,8 @@ class ResizeNearestNeighborOpGrad : public OpKernel {
// Return if the output is empty.
if (output->NumElements() == 0) return;
typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
typename TTypes<T, 4>::Tensor output_data = output->tensor<T, 4>();
typename TTypes<T, 4>::ConstTensor input_data(input.tensor<T, 4>());
typename TTypes<T, 4>::Tensor output_data(output->tensor<T, 4>());
const float height_scale =
CalculateResizeScale(out_height, in_height, align_corners_);

View File

@ -387,9 +387,9 @@ class SampleDistortedBoundingBoxV2Op : public OpKernel {
OP_REQUIRES_OK(
context, context->allocate_output(2, TensorShape({1, 1, 4}), &bboxes));
typename TTypes<T, 1>::Tensor begin_data = begin->tensor<T, 1>();
typename TTypes<T, 1>::Tensor size_data = size->tensor<T, 1>();
typename TTypes<float, 3>::Tensor bboxes_data = bboxes->tensor<float, 3>();
typename TTypes<T, 1>::Tensor begin_data(begin->tensor<T, 1>());
typename TTypes<T, 1>::Tensor size_data(size->tensor<T, 1>());
TTypes<float, 3>::Tensor bboxes_data = bboxes->tensor<float, 3>();
begin_data(0) = T(offset_height);
size_data(0) = T(target_height);

View File

@ -358,11 +358,11 @@ class MklSliceOp : public OpKernel {
/* data format = NCHW */
#pragma omp parallel for
for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
for (ssize_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
T* ip = in_buf + (d0 * in_strides[0]);
T* op = op_buf + ((d0 - begin[0]) * out_strides[0]);
#pragma omp parallel for
for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
for (ssize_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
T* ip1 = ip + (d1 * in_strides[1]);
T* op1 = op + ((d1 - begin[1]) * out_strides[1]);
// For NCHW, H and W will be contiguous. So we can copy
@ -376,15 +376,15 @@ class MklSliceOp : public OpKernel {
/* data_format = NHWC */
#pragma omp parallel for
for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
for (ssize_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
T* ip = in_buf + (d0 * in_strides[0]);
T* op = op_buf + ((d0 - begin[0]) * out_strides[0]);
#pragma omp parallel for
for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
for (ssize_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
T* ip1 = ip + (d1 * in_strides[1]);
T* op1 = op + ((d1 - begin[1]) * out_strides[1]);
#pragma omp parallel for
for (size_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) {
for (ssize_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) {
T* ip2 = ip1 + (d2 * in_strides[2]);
T* ip3 = ip2 + begin[3];
T* op2 = op1 + ((d2 - begin[2]) * out_strides[2]);

View File

@ -115,7 +115,7 @@ class SubstrOp : public OpKernel {
Tensor input_buffer;
OP_REQUIRES_OK(context, context->allocate_temp(
DT_STRING, output_shape, &input_buffer));
typename TTypes<string, 1>::Tensor input_bcast =
TTypes<string, 1>::Tensor input_bcast =
input_buffer.shaped<string, 1>(bcast.result_shape());
input_bcast =
input.broadcast(BCast::ToIndexArray<1>(bcast.x_bcast()));
@ -125,8 +125,8 @@ class SubstrOp : public OpKernel {
OP_REQUIRES_OK(context,
context->allocate_temp(DataTypeToEnum<T>::v(),
output_shape, &pos_buffer));
typename TTypes<T, 1>::Tensor pos_bcast =
pos_buffer.shaped<T, 1>(bcast.result_shape());
typename TTypes<T, 1>::Tensor pos_bcast(
pos_buffer.shaped<T, 1>(bcast.result_shape()));
pos_bcast =
pos_shaped.broadcast(BCast::ToIndexArray<1>(bcast.y_bcast()));
@ -135,8 +135,8 @@ class SubstrOp : public OpKernel {
OP_REQUIRES_OK(context,
context->allocate_temp(DataTypeToEnum<T>::v(),
output_shape, &len_buffer));
typename TTypes<T, 1>::Tensor len_bcast =
len_buffer.shaped<T, 1>(bcast.result_shape());
typename TTypes<T, 1>::Tensor len_bcast(
len_buffer.shaped<T, 1>(bcast.result_shape()));
len_bcast =
len_shaped.broadcast(BCast::ToIndexArray<1>(bcast.y_bcast()));
@ -164,7 +164,7 @@ class SubstrOp : public OpKernel {
Tensor input_buffer;
OP_REQUIRES_OK(context, context->allocate_temp(
DT_STRING, output_shape, &input_buffer));
typename TTypes<string, 2>::Tensor input_bcast =
TTypes<string, 2>::Tensor input_bcast =
input_buffer.shaped<string, 2>(bcast.result_shape());
input_bcast =
input.broadcast(BCast::ToIndexArray<2>(bcast.x_bcast()));
@ -174,8 +174,8 @@ class SubstrOp : public OpKernel {
OP_REQUIRES_OK(context,
context->allocate_temp(DataTypeToEnum<T>::v(),
output_shape, &pos_buffer));
typename TTypes<T, 2>::Tensor pos_bcast =
pos_buffer.shaped<T, 2>(bcast.result_shape());
typename TTypes<T, 2>::Tensor pos_bcast(
pos_buffer.shaped<T, 2>(bcast.result_shape()));
pos_bcast =
pos_shaped.broadcast(BCast::ToIndexArray<2>(bcast.y_bcast()));
@ -184,8 +184,8 @@ class SubstrOp : public OpKernel {
OP_REQUIRES_OK(context,
context->allocate_temp(DataTypeToEnum<T>::v(),
output_shape, &len_buffer));
typename TTypes<T, 2>::Tensor len_bcast =
len_buffer.shaped<T, 2>(bcast.result_shape());
typename TTypes<T, 2>::Tensor len_bcast(
len_buffer.shaped<T, 2>(bcast.result_shape()));
len_bcast =
len_shaped.broadcast(BCast::ToIndexArray<2>(bcast.y_bcast()));

View File

@ -27,9 +27,6 @@ void dummy_xsmm_conv2d_ensure_file_is_not_empty();
#include <stdlib.h>
#include <cstring>
#if 0
#include <omp.h>
#endif
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/lib/core/blocking_counter.h"
@ -360,7 +357,6 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
l_tick6 = libxsmm_timer_tick();
#endif
#if 1
BlockingCounter counter(num_threads);
for (int i = 0; i < num_threads; ++i) {
@ -371,14 +367,6 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
});
}
counter.Wait();
#else
#pragma omp parallel
{
chk_libxsmm_err(
libxsmm_dnn_execute_st(libxsmm_handle, kind, 0, omp_get_thread_num()),
"Worker");
}
#endif
#if defined(LIBXSMM_DETAILED_TIMING)
l_tick7 = libxsmm_timer_tick();

View File

@ -49,7 +49,7 @@ RecordWriterOptions RecordWriterOptions::CreateRecordWriterOptions(
#endif // IS_SLIM_BUILD
} else if (compression_type != compression::kNone) {
LOG(ERROR) << "Unsupported compression_type:" << compression_type
<< ". No comprression will be used.";
<< ". No compression will be used.";
}
return options;
}

View File

@ -619,6 +619,10 @@ REGISTER_OP("NonMaxSuppression")
TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size));
// The boxes is a 2-D float Tensor of shape [num_boxes, 4].
DimensionHandle unused;
// The boxes[0] and scores[0] are both num_boxes.
TF_RETURN_IF_ERROR(
c->Merge(c->Dim(boxes, 0), c->Dim(scores, 0), &unused));
// The boxes[1] is 4.
TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused));
c->set_output(0, c->Vector(c->UnknownDim()));
@ -643,6 +647,10 @@ REGISTER_OP("NonMaxSuppressionV2")
TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &iou_threshold));
// The boxes is a 2-D float Tensor of shape [num_boxes, 4].
DimensionHandle unused;
// The boxes[0] and scores[0] are both num_boxes.
TF_RETURN_IF_ERROR(
c->Merge(c->Dim(boxes, 0), c->Dim(scores, 0), &unused));
// The boxes[1] is 4.
TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused));
c->set_output(0, c->Vector(c->UnknownDim()));

View File

@ -43,10 +43,11 @@ limitations under the License.
#elif defined(__arm__)
#define PLATFORM_POSIX
// Require an outside macro to tell us if we're building for Raspberry Pi.
#if !defined(RASPBERRY_PI)
// Require an outside macro to tell us if we're building for Raspberry Pi or
// another ARM device that's not a mobile platform.
#if !defined(RASPBERRY_PI) && !defined(ARM_NON_MOBILE)
#define IS_MOBILE_PLATFORM
#endif // !defined(RASPBERRY_PI)
#endif // !defined(RASPBERRY_PI) && !defined(ARM_NON_MOBILE)
#else
// If no platform specified, use:

View File

@ -387,7 +387,7 @@ message RunOptions {
// EXPERIMENTAL. Options used to initialize DebuggerState, if enabled.
DebugOptions debug_options = 6;
// When enabled, causes tensor alllocation information to be included in
// When enabled, causes tensor allocation information to be included in
// the error message when the Run() call fails because the allocator ran
// out of memory (OOM).
//

View File

@ -24,7 +24,7 @@ limitations under the License.
// TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
// "-beta", "-rc", "-rc.1")
#define TF_VERSION_SUFFIX "-rc0"
#define TF_VERSION_SUFFIX "-rc1"
#define TF_STR_HELPER(x) #x
#define TF_STR(x) TF_STR_HELPER(x)

View File

@ -1112,9 +1112,11 @@ inline void ForwardMklTensorInToOutWithMklShape(OpKernelContext* context,
// Forward the MKL shape ONLY (used in elementwise and other ops where
// we call the eigen implementation and MKL shape is not used)
inline void ForwardMklMetaDataInToOut(OpKernelContext* context,
uint idx_data_in, uint idx_data_out) {
uint idx_meta_in = GetTensorMetaDataIndex(idx_data_in, context->num_inputs());
uint idx_meta_out =
uint32 idx_data_in,
uint32_t idx_data_out) {
uint32 idx_meta_in =
GetTensorMetaDataIndex(idx_data_in, context->num_inputs());
uint32 idx_meta_out =
GetTensorMetaDataIndex(idx_data_out, context->num_outputs());
if (IsRefType(context->input_dtype(idx_data_in))) {
@ -1126,7 +1128,7 @@ inline void ForwardMklMetaDataInToOut(OpKernelContext* context,
// Set a dummy MKL shape (called when the output is in TF format)
inline void SetDummyMklShapeOutput(OpKernelContext* context,
uint idx_data_out) {
uint32 idx_data_out) {
MklShape mkl_shape_output;
mkl_shape_output.SetMklTensor(false);
AllocateOutputSetMklShape(context, idx_data_out, mkl_shape_output);

View File

@ -1,37 +1,86 @@
# Roadmap
**Last updated: January 23, 2017**
**Last updated: Feb 15, 2018**
TensorFlow is a fast moving project. In order for the community to better
understand what the near future will bring, this document shares what we are
working on internally. Many of these features were requested by the community,
and we welcome
[contributions](https://github.com/tensorflow/tensorflow/labels/stat%3Acontributions%20welcome).
TensorFlow is a rapidly moving, community supported project. This document is intended
to provide guidance about priorities and focus areas of the core set of TensorFlow
developers and about functionality that can be expected in the upcoming releases of
TensorFlow. Many of these areas are driven by community use cases, and we welcome
further
[contributions](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md)
to TensorFlow.
The features on this list are targeted for the next few months. At this point,
we do not have timelines for these features.
The features below do not have concrete release dates. However, the majority can be
expected in the next one to two releases.
### Improve non-Python language support
### APIs
#### High Level APIs:
* Easy multi-GPU utilization with Estimators
* Easy-to-use high-level pre-made estimators for Gradient Boosted Trees, Time Series, and other models
* Support for adding gradient computation for graphs constructed in other
languages (C++, Java, Go etc.)
#### Eager Execution:
* Efficient utilization of multiple GPUs
* Distributed training (multi-machine)
* Performance improvements
* Simpler export to a GraphDef/SavedModel
### Making TensorFlow easier to use
* High-level APIs
* Well-maintained models showing best practices
#### Keras API:
* Better integration with tf.data (ability to call `model.fit` with data tensors)
* Full support for Eager Execution (both Eager support for the regular Keras API, and the ability
to create Keras models Eager-style via Model subclassing)
* Better distribution/multi-GPU support and TPU support (including a smoother model-to-estimator workflow)
### Performance
* Speed and memory benchmarks
* Distributed full model benchmarks
* Performance and memory usage improvements
#### Official Models:
* A set of
[reference models](https://github.com/tensorflow/models/tree/master/official)
across image recognition, speech, object detection, and
translation that demonstrate best practices and serve as a starting point for
high-performance model development.
### Core Features
* Automatic op placement ([#2126](https://github.com/tensorflow/tensorflow/issues/2126))
* Support for graph-level functions
#### Contrib:
* Deprecation notices added to parts of tf.contrib where preferred implementations exist outside of tf.contrib.
* As much as possible, large projects inside tf.contrib moved to separate repositories.
* The tf.contrib module will eventually be discontinued in its current form; experimental development will in the future happen in other repositories.
#### Probabilistic Reasoning and Statistical Analysis:
* Rich set of tools for probabilistic and statistical analysis in tf.distributions
and tf.probability. These include new samplers, layers, optimizers, losses, and structured models
* Statistical tools for hypothesis testing, convergence diagnostics, and sample statistics
* Edward 2.0: High-level API for probabilistic programming
### Platforms
* OpenCL support ([#22](https://github.com/tensorflow/tensorflow/issues/22))
#### TensorFlow Lite:
* Increased coverage of supported ops in TensorFlow Lite
* Easier conversion of a trained TensorFlow graph for use on TensorFlow Lite
* Support for GPU acceleration in TensorFlow Lite (iOS and Android)
* Support for hardware accelerators via Android NeuralNets API
* Improved CPU performance by quantization and other network optimizations (e.g. pruning, distillation)
* Increased support for devices beyond Android and iOS (e.g. RPi, Cortex-M)
### Community
* More educational resources
* Better integration of TensorFlow into the opensource big data ecosystem (e.g.
[#2655](https://github.com/tensorflow/tensorflow/issues/2655))
### Performance
#### Distributed TensorFlow:
* Multi-GPU support optimized for a variety of GPU topologies
* Improved mechanisms for distributing computations on several machines
#### Optimizations:
* Mixed precision training support with initial example model and guide
* Native TensorRT support
* Int8 support for SkyLake via MKL
* Dynamic loading of SIMD-optimized kernels
### Documentation and Usability:
* Updated documentation, tutorials and Getting Started guides
* Process to enable external contributions to tutorials, documentation, and blogs showcasing best practice use-cases of TensorFlow and high-impact applications
### Community and Partner Engagement
#### Special Interest Groups:
* Mobilizing the community to work together in focused domains
* [tf-distribute](https://groups.google.com/a/tensorflow.org/forum/#!forum/tf-distribute): build and packaging of TensorFlow
* More to be identified and launched
#### Community:
* Incorporate public feedback on significant design decisions via a Request-for-Comment (RFC) process
* Formalize process for external contributions to land in TensorFlow and associated projects
* Grow global TensorFlow communities and user groups
* Collaborate with partners to co-develop and publish research papers

View File

@ -22,6 +22,14 @@ This section describes some of the current uses of the TensorFlow system.
> TensorFlow, or even better, send us a pull request to add an entry to this
> file.
* **Deep Speech**
<ul>
<li>**Organization**: Mozilla</li>
<li> **Domain**: Speech Recognition</li>
<li> **Description**: A TensorFlow implementation motivated by Baidu's Deep Speech architecture.</li>
<li> **More info**: [GitHub Repo](https://github.com/mozilla/deepspeech)</li>
</ul>
* **RankBrain**
<ul>
<li>**Organization**: Google</li>

View File

@ -7,6 +7,8 @@ the following documents:
a cluster of TensorFlow servers.
* @{$hadoop$How to run TensorFlow on Hadoop}, which has a highly
self-explanatory title.
* @{$s3$How to run TensorFlow with the S3 filesystem}, which explains how
to run TensorFlow with the S3 file system.
* The entire document set for [TensorFlow serving](/serving), an open-source,
flexible, high-performance serving system for machine-learned models
designed for production environments. TensorFlow Serving provides

View File

@ -1,3 +1,4 @@
index.md
distributed.md
hadoop.md
s3.md

View File

@ -0,0 +1,40 @@
# How to run TensorFlow on S3
This document describes how to run TensorFlow on the S3 file system.
## S3
We assume that you are familiar with @{$reading_data$reading data}.
To use S3 with TensorFlow, change the file paths you use to read and write
data to an S3 path. For example:
```python
filenames = ["s3://bucketname/path/to/file1.tfrecord",
"s3://bucketname/path/to/file2.tfrecord"]
dataset = tf.data.TFRecordDataset(filenames)
```
When reading or writing data on S3 with your TensorFlow program, the behavior
can be controlled by several environment variables (a short sketch of setting
them follows this list):
* **AWS_REGION**: By default, the regional endpoint is used for S3, with the
region controlled by `AWS_REGION`. If `AWS_REGION` is not specified, then
`us-east-1` is used.
* **S3_ENDPOINT**: The endpoint can be overridden explicitly by setting
`S3_ENDPOINT`.
* **S3_USE_HTTPS**: HTTPS is used to access S3 by default, unless
`S3_USE_HTTPS=0` is set.
* **S3_VERIFY_SSL**: If HTTPS is used, SSL verification can be disabled
with `S3_VERIFY_SSL=0`.
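A minimal sketch of setting these variables from Python before any S3 path is accessed; the region and endpoint values below are placeholders:
```python
import os

# Placeholder region/endpoint; set these before the first s3:// access.
os.environ["AWS_REGION"] = "us-west-2"
os.environ["S3_ENDPOINT"] = "s3.us-west-2.amazonaws.com"
os.environ["S3_USE_HTTPS"] = "1"   # default: use HTTPS
os.environ["S3_VERIFY_SSL"] = "1"  # default: verify SSL certificates

import tensorflow as tf

filenames = ["s3://bucketname/path/to/file1.tfrecord"]
dataset = tf.data.TFRecordDataset(filenames)
```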
To read or write objects in a bucket that is not publicly accessible,
AWS credentials must be provided through one of the following methods:
* Set credentials in the AWS credentials profile file on the local system,
located at: `~/.aws/credentials` on Linux, macOS, or Unix, or
`C:\Users\USERNAME\.aws\credentials` on Windows.
* Set the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment
variables.
* If TensorFlow is deployed on an EC2 instance, specify an IAM role and then
give the EC2 instance access to that role.

View File

@ -81,6 +81,8 @@ filesystem implementations call their existing libraries. Examples include:
plugin](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/hadoop/hadoop_file_system.h)
* [GCS
plugin](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/cloud/gcs_file_system.h)
* [S3
plugin](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/s3/s3_file_system.h)
#### The File interfaces

View File

@ -38,7 +38,7 @@ enable TensorFlow for C:
OS="linux" # Change to "darwin" for macOS
TARGET_DIRECTORY="/usr/local"
curl -L \
"https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.6.0-rc0.tar.gz" |
"https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.6.0-rc1.tar.gz" |
sudo tar -C $TARGET_DIRECTORY -xz
The `tar` command extracts the TensorFlow C library into the `lib`

View File

@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
TF_TYPE="cpu" # Change to "gpu" for GPU support
TARGET_DIRECTORY='/usr/local'
curl -L \
"https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.6.0-rc0.tar.gz" |
"https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.6.0-rc1.tar.gz" |
sudo tar -C $TARGET_DIRECTORY -xz
The `tar` command extracts the TensorFlow C library into the `lib`

View File

@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
<dependency>
<groupId>org.tensorflow</groupId>
<artifactId>tensorflow</artifactId>
<version>1.6.0-rc0</version>
<version>1.6.0-rc1</version>
</dependency>
```
@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
<dependency>
<groupId>org.tensorflow</groupId>
<artifactId>tensorflow</artifactId>
<version>1.6.0-rc0</version>
<version>1.6.0-rc1</version>
</dependency>
</dependencies>
</project>
@ -123,12 +123,12 @@ instead:
<dependency>
<groupId>org.tensorflow</groupId>
<artifactId>libtensorflow</artifactId>
<version>1.6.0-rc0</version>
<version>1.6.0-rc1</version>
</dependency>
<dependency>
<groupId>org.tensorflow</groupId>
<artifactId>libtensorflow_jni_gpu</artifactId>
<version>1.6.0-rc0</version>
<version>1.6.0-rc1</version>
</dependency>
```
@ -147,7 +147,7 @@ refer to the simpler instructions above instead.
Take the following steps to install TensorFlow for Java on Linux or macOS:
1. Download
[libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0-rc0.jar),
[libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0-rc1.jar),
which is the TensorFlow Java Archive (JAR).
2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@ -166,7 +166,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
OS=$(uname -s | tr '[:upper:]' '[:lower:]')
mkdir -p ./jni
curl -L \
"https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.6.0-rc0.tar.gz" |
"https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.6.0-rc1.tar.gz" |
tar -xz -C ./jni
### Install on Windows
@ -174,10 +174,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
Take the following steps to install TensorFlow for Java on Windows:
1. Download
[libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0-rc0.jar),
[libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0-rc1.jar),
which is the TensorFlow Java Archive (JAR).
2. Download the following Java Native Interface (JNI) file appropriate for
[TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.6.0-rc0.zip).
[TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.6.0-rc1.zip).
3. Extract this .zip file.
@ -225,7 +225,7 @@ must be part of your `classpath`. For example, you can include the
downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
as follows:
<pre><b>javac -cp libtensorflow-1.6.0-rc0.jar HelloTF.java</b></pre>
<pre><b>javac -cp libtensorflow-1.6.0-rc1.jar HelloTF.java</b></pre>
### Running
@ -239,11 +239,11 @@ two files are available to the JVM:
For example, the following command line executes the `HelloTF` program on Linux
and macOS X:
<pre><b>java -cp libtensorflow-1.6.0-rc0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
<pre><b>java -cp libtensorflow-1.6.0-rc1.jar:. -Djava.library.path=./jni HelloTF</b></pre>
And the following command line executes the `HelloTF` program on Windows:
<pre><b>java -cp libtensorflow-1.6.0-rc0.jar;. -Djava.library.path=jni HelloTF</b></pre>
<pre><b>java -cp libtensorflow-1.6.0-rc1.jar;. -Djava.library.path=jni HelloTF</b></pre>
If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
installed TensorFlow for Java and are ready to use the API. If the program

View File

@ -188,7 +188,7 @@ Take the following steps to install TensorFlow with Virtualenv:
Virtualenv environment:
<pre>(tensorflow)$ <b>pip3 install --upgrade \
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
If you encounter installation problems, see
[Common Installation Problems](#common_installation_problems).
@ -293,7 +293,7 @@ take the following steps:
<pre>
$ <b>sudo pip3 install --upgrade \
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b>
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc1-cp34-cp34m-linux_x86_64.whl</b>
</pre>
If this step fails, see
@ -480,8 +480,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
<pre>
(tensorflow)$ <b>pip install --ignore-installed --upgrade \
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
<a name="ValidateYourInstallation"></a>
## Validate your installation
@ -648,14 +647,14 @@ This section documents the relevant values for Linux installations.
CPU only:
<pre>
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp27-none-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc1-cp27-none-linux_x86_64.whl
</pre>
GPU support:
<pre>
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp27-none-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc1-cp27-none-linux_x86_64.whl
</pre>
Note that GPU support requires the NVIDIA hardware and software described in
@ -667,14 +666,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
CPU only:
<pre>
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc1-cp34-cp34m-linux_x86_64.whl
</pre>
GPU support:
<pre>
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp34-cp34m-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc1-cp34-cp34m-linux_x86_64.whl
</pre>
Note that GPU support requires the NVIDIA hardware and software described in
@ -686,14 +685,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
CPU only:
<pre>
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp35-cp35m-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc1-cp35-cp35m-linux_x86_64.whl
</pre>
GPU support:
<pre>
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp35-cp35m-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc1-cp35-cp35m-linux_x86_64.whl
</pre>
@ -705,14 +704,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
CPU only:
<pre>
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp36-cp36m-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc1-cp36-cp36m-linux_x86_64.whl
</pre>
GPU support:
<pre>
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp36-cp36m-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc1-cp36-cp36m-linux_x86_64.whl
</pre>

View File

@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv:
TensorFlow in the active Virtualenv is as follows:
<pre> $ <b>pip3 install --upgrade \
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl</b></pre>
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc1-py3-none-any.whl</b></pre>
If you encounter installation problems, see
[Common Installation Problems](#common-installation-problems).
@ -242,7 +242,7 @@ take the following steps:
issue the following command:
<pre> $ <b>sudo pip3 install --upgrade \
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl</b> </pre>
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc1-py3-none-any.whl</b> </pre>
If the preceding command fails, see
[installation problems](#common-installation-problems).
@ -351,7 +351,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
TensorFlow for Python 2.7:
<pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py2-none-any.whl</b></pre>
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc1-py2-none-any.whl</b></pre>
<a name="ValidateYourInstallation"></a>
@ -524,7 +524,7 @@ This section documents the relevant values for Mac OS installations.
<pre>
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py2-none-any.whl
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc1-py2-none-any.whl
</pre>
@ -532,5 +532,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py2-none-a
<pre>
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc1-py3-none-any.whl
</pre>

View File

@ -359,10 +359,10 @@ Invoke `pip install` to install that pip package.
The filename of the `.whl` file depends on your platform.
For example, the following command will install the pip package
for TensorFlow 1.6.0rc0 on Linux:
for TensorFlow 1.6.0rc1 on Linux:
<pre>
$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.6.0rc0-py2-none-any.whl</b>
$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.6.0rc1-py2-none-any.whl</b>
</pre>
## Validate your installation
@ -393,7 +393,7 @@ TensorFlow programs:
<pre>Hello, TensorFlow!</pre>
If you are new to TensorFlow, see @{$get_started$Getting Started with
If you are new to TensorFlow, see @{$get_started/premade_estimators$Getting Started with
TensorFlow}.
If the system outputs an error message instead of a greeting, see [Common
@ -460,8 +460,8 @@ Stack Overflow and specify the `tensorflow` tag.
**Linux**
<table>
<tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow_gpu-1.6.0rc0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
<tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow_gpu-1.6.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
<tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow_gpu-1.5.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>7</td><td>9</td></tr>
<tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
@ -479,7 +479,7 @@ Stack Overflow and specify the `tensorflow` tag.
**Mac**
<table>
<tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
@ -493,8 +493,8 @@ Stack Overflow and specify the `tensorflow` tag.
**Windows**
<table>
<tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow_gpu-1.6.0rc0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
<tr><td>tensorflow-1.6.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow_gpu-1.6.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
<tr><td>tensorflow-1.5.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow_gpu-1.5.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
<tr><td>tensorflow-1.4.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>

Some files were not shown because too many files have changed in this diff.