Merge changes from github.

Change: 151046259
Martin Wicke 2017-03-23 12:31:16 -08:00 committed by TensorFlower Gardener
parent 8ca0714565
commit bc456e361d
141 changed files with 4407 additions and 602 deletions

.gitignore

@ -1,6 +1,7 @@
.DS_Store
.ipynb_checkpoints
node_modules
/.bazelrc
/bazel-*
/third_party/py/numpy/numpy_include
/tools/bazel.rc


@ -1,3 +1,10 @@
# Release 1.0.1
## Bug Fixes and Other Changes
* Change GraphConstructor to not increase the version when importing, but instead take the min of all versions.
* Google Cloud Storage fixes.
* Removed `tf.core` and `tf.python` modules from the API. These were never intended to be exposed. Please use the same objects through the top-level `tf` module instead.
# Release 1.0.0
## Major Features and Improvements
@ -88,6 +95,8 @@ To help you upgrade your existing TensorFlow Python code to match the API change
from the tensorflow::ops namespace to tensorflow.
* Change arg order for `{softmax,sparse_softmax,sigmoid}_cross_entropy_with_logits` to be (labels, predictions), and force use of named args.
* tf.nn.rnn_cell.* and most functions in tf.nn.rnn.* (with the exception of dynamic_rnn and raw_rnn) are temporarily in tf.contrib.rnn. They will be moved back into core for TF 1.1.
* `tf.nn.sampled_softmax_loss` and `tf.nn.nce_loss` have both changed their API such that you need to swap the `inputs` and `labels` parameters (the order is now `labels, inputs`).
* The `shape` keyword argument of the `SparseTensor` constructor was renamed to `dense_shape` between TensorFlow 0.12 and TensorFlow 1.0 (see the migration sketch below).
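For illustration, a minimal migration sketch assuming the TF 1.0-era Python API; the tensors below are arbitrary examples:
```python
import tensorflow as tf

labels = tf.constant([[0.0, 1.0]])
logits = tf.constant([[2.0, 0.5]])

# 0.12: tf.nn.softmax_cross_entropy_with_logits(logits, labels)
# 1.0:  labels come first and named arguments are required.
loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)

# 0.12: tf.SparseTensor(indices, values, shape=[3, 4])
# 1.0:  the keyword argument is now dense_shape.
sp = tf.SparseTensor(indices=[[0, 0], [1, 2]], values=[1.0, 2.0],
                     dense_shape=[3, 4])

# RNN cells temporarily live in tf.contrib.rnn rather than tf.nn.rnn_cell.
cell = tf.contrib.rnn.BasicLSTMCell(num_units=8)
```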
## Bug Fixes and Other Changes
* Numerous C++ API updates.


@ -14,12 +14,7 @@ load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories")
closure_repositories()
load("//tensorflow:workspace.bzl", "check_version", "tf_workspace")
# We must check the bazel version before trying to parse any other BUILD files,
# in case the parsing of those build files depends on the bazel version we
# require here.
check_version("0.4.2")
load("//tensorflow:workspace.bzl", "tf_workspace")
# Uncomment and update the paths in these entries to build the Android demo.
#android_sdk_repository(

configure

@ -8,6 +8,9 @@ pushd `dirname $0` > /dev/null
SOURCE_BASE_DIR=`pwd -P`
popd > /dev/null
# This file contains customized config settings.
touch .bazelrc
PLATFORM="$(uname -s | tr 'A-Z' 'a-z')"
function is_linux() {
@ -36,15 +39,11 @@ function is_windows() {
}
function bazel_clean_and_fetch() {
# bazel clean --expunge currently doesn't work on Windows
# TODO(pcloudy): Re-enable it after bazel clean --expunge is fixed.
if ! is_windows; then
bazel clean --expunge
fi
if [ -z "$TF_BAZEL_TARGETS" ]; then
TF_BAZEL_TARGETS="//tensorflow/... -//tensorflow/contrib/nccl/... -//tensorflow/examples/android/..."
bazel fetch "//tensorflow/... -//tensorflow/contrib/nccl/... -//tensorflow/examples/android/..."
else
bazel fetch $TF_BAZEL_TARGETS
fi
bazel fetch "$TF_BAZEL_TARGETS"
}
function sed_hyphen_i() {
@ -102,8 +101,8 @@ if false; then # Disable building with MKL for now
if [ "$TF_NEED_MKL" == "1" ]; then # TF_NEED_MKL
DST=`dirname $0`
ARCHIVE_BASENAME=mklml_lnx_2017.0.2.20170110.tgz
GITHUB_RELEASE_TAG=v0.3
ARCHIVE_BASENAME=mklml_lnx_2017.0.2.20170209.tgz
GITHUB_RELEASE_TAG=v0.5
MKLURL="https://github.com/01org/mkl-dnn/releases/download/$GITHUB_RELEASE_TAG/$ARCHIVE_BASENAME"
if ! [ -e "$DST/third_party/mkl/$ARCHIVE_BASENAME" ]; then
wget --no-check-certificate -P $DST/third_party/mkl/ $MKLURL
@ -182,13 +181,12 @@ else
TF_NEED_JEMALLOC=0
fi
if [ "$TF_NEED_JEMALLOC" == "1" ]; then
sed_hyphen_i -e "s/WITH_JEMALLOC = False/WITH_JEMALLOC = True/" tensorflow/core/platform/default/build_config.bzl
else
sed_hyphen_i -e "s/WITH_JEMALLOC = True/WITH_JEMALLOC = False/" tensorflow/core/platform/default/build_config.bzl
sed_hyphen_i -e "/with_jemalloc/d" .bazelrc
if [[ "$TF_NEED_JEMALLOC" == "1" ]]; then
echo 'build --define with_jemalloc=true' >>.bazelrc
fi
while [ "$TF_NEED_GCP" == "" ]; do
while [[ "$TF_NEED_GCP" == "" ]]; do
read -p "Do you wish to build TensorFlow with "\
"Google Cloud Platform support? [y/N] " INPUT
case $INPUT in
@ -202,23 +200,12 @@ while [ "$TF_NEED_GCP" == "" ]; do
esac
done
if [ "$TF_NEED_GCP" == "1" ]; then
## Verify that libcurl header files are available.
# Only check Linux, since on MacOS the header files are installed with XCode.
if is_linux && [[ ! -f "/usr/include/curl/curl.h" ]]; then
echo "ERROR: It appears that the development version of libcurl is not "\
"available. Please install the libcurl3-dev package."
exit 1
fi
# Update Bazel build configuration.
sed_hyphen_i -e "s/WITH_GCP_SUPPORT = False/WITH_GCP_SUPPORT = True/" tensorflow/core/platform/default/build_config.bzl
else
# Update Bazel build configuration.
sed_hyphen_i -e "s/WITH_GCP_SUPPORT = True/WITH_GCP_SUPPORT = False/" tensorflow/core/platform/default/build_config.bzl
sed_hyphen_i -e "/with_gcp_support/d" .bazelrc
if [[ "$TF_NEED_GCP" == "1" ]]; then
echo 'build --define with_gcp_support=true' >>.bazelrc
fi
while [ "$TF_NEED_HDFS" == "" ]; do
while [[ "$TF_NEED_HDFS" == "" ]]; do
read -p "Do you wish to build TensorFlow with "\
"Hadoop File System support? [y/N] " INPUT
case $INPUT in
@ -232,16 +219,13 @@ while [ "$TF_NEED_HDFS" == "" ]; do
esac
done
if [ "$TF_NEED_HDFS" == "1" ]; then
# Update Bazel build configuration.
sed_hyphen_i -e "s/WITH_HDFS_SUPPORT = False/WITH_HDFS_SUPPORT = True/" tensorflow/core/platform/default/build_config.bzl
else
# Update Bazel build configuration.
sed_hyphen_i -e "s/WITH_HDFS_SUPPORT = True/WITH_HDFS_SUPPORT = False/" tensorflow/core/platform/default/build_config.bzl
sed_hyphen_i -e "/with_hdfs_support/d" .bazelrc
if [[ "$TF_NEED_HDFS" == "1" ]]; then
echo 'build --define with_hdfs_support=true' >>.bazelrc
fi
## Enable XLA.
while [ "$TF_ENABLE_XLA" == "" ]; do
while [[ "$TF_ENABLE_XLA" == "" ]]; do
read -p "Do you wish to build TensorFlow with the XLA just-in-time compiler (experimental)? [y/N] " INPUT
case $INPUT in
[Yy]* ) echo "XLA JIT support will be enabled for TensorFlow"; TF_ENABLE_XLA=1;;
@ -251,12 +235,9 @@ while [ "$TF_ENABLE_XLA" == "" ]; do
esac
done
if [ "$TF_ENABLE_XLA" == "1" ]; then
# Update Bazel build configuration.
sed_hyphen_i -e "s/^WITH_XLA_SUPPORT = [FT].*/WITH_XLA_SUPPORT = True/" tensorflow/core/platform/default/build_config_root.bzl
else
# Update Bazel build configuration.
sed_hyphen_i -e "s/^WITH_XLA_SUPPORT = [FT].*/WITH_XLA_SUPPORT = False/" tensorflow/core/platform/default/build_config_root.bzl
sed_hyphen_i -e "/with_xla_support/d" .bazelrc
if [[ "$TF_ENABLE_XLA" == "1" ]]; then
echo 'build --define with_xla_support=true' >>.bazelrc
fi


@ -110,6 +110,34 @@ config_setting(
visibility = ["//visibility:public"],
)
# TODO(jhseu): Enable on other platforms other than Linux.
config_setting(
name = "with_jemalloc",
values = {
"cpu": "k8",
"define": "with_jemalloc=true",
},
visibility = ["//visibility:public"],
)
config_setting(
name = "with_gcp_support",
values = {"define": "with_gcp_support=true"},
visibility = ["//visibility:public"],
)
config_setting(
name = "with_hdfs_support",
values = {"define": "with_hdfs_support=true"},
visibility = ["//visibility:public"],
)
config_setting(
name = "with_xla_support",
values = {"define": "with_xla_support=true"},
visibility = ["//visibility:public"],
)
package_group(
name = "internal",
packages = ["//tensorflow/..."],
@ -321,6 +349,8 @@ cc_binary(
deps = [
"//tensorflow/c:c_api",
"//tensorflow/cc:cc_ops",
"//tensorflow/cc:client_session",
"//tensorflow/cc:scope",
"//tensorflow/core:tensorflow",
],
)


@ -138,7 +138,8 @@ tensorflow::Status AllocationTracker::DeallocateShape(
TF_RET_CHECK(ShapeUtil::TupleElementCount(shape) == elements.size())
<< "tuple has unexpected number of elements: " << elements.size()
<< " != " << ShapeUtil::TupleElementCount(shape);
for (int i = 0; i < elements.size(); ++i) {
for (std::vector<se::DeviceMemoryBase>::size_type i = 0;
i < elements.size(); ++i) {
VLOG(2) << "recursing onto the tuple elements";
TF_RETURN_IF_ERROR(DeallocateShape(backend, device_ordinal, &elements[i],
shape.tuple_shapes(i),


@ -118,10 +118,10 @@ GenericTransferManager::ShallowCopyTupleFromDevice(
// Create a DeviceMemoryBase from each void* pointer.
std::vector<se::DeviceMemoryBase> destination;
for (int i = 0; i < element_pointers.size(); ++i) {
for (std::vector<void*>::size_type i = 0; i < element_pointers.size(); ++i) {
if (element_pointers[i] == nullptr &&
!ShapeUtil::HasZeroElements(shape.tuple_shapes(i))) {
return FailedPrecondition("tuple contains nullptr at element %d", i);
return FailedPrecondition("tuple contains nullptr at element %lu", i);
}
int64 buffer_size = ShapeUtil::ByteSizeOf(shape.tuple_shapes(i),
/*pointer_size=*/sizeof(void*));


@ -256,7 +256,8 @@ StatusOr<std::vector<const Allocation*>> Service::ResolveAndValidateArguments(
tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
const Backend* backend, int device_ordinal) {
std::vector<const Allocation*> allocations;
for (int i = 0; i < arguments.size(); ++i) {
for (tensorflow::gtl::ArraySlice<const GlobalDataHandle*>::size_type i = 0;
i < arguments.size(); ++i) {
auto allocation_status = allocation_tracker_.Resolve(*arguments[i]);
if (!allocation_status.ok()) {
return Status(allocation_status.status().code(),
@ -269,7 +270,7 @@ StatusOr<std::vector<const Allocation*>> Service::ResolveAndValidateArguments(
if (allocation->backend() != backend ||
allocation->device_ordinal() != device_ordinal) {
return InvalidArgument(
"argument %d is on device %s but computation will be executed "
"argument %lu is on device %s but computation will be executed "
"on device %s",
i,
allocation->backend()
@ -295,13 +296,14 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
program_shape.parameters_size(), arguments.size());
}
for (int i = 0; i < arguments.size(); ++i) {
for (tensorflow::gtl::ArraySlice<const Allocation*>::size_type i = 0;
i < arguments.size(); ++i) {
// Verify that shape of arguments matches the shape of the arguments in the
// ProgramShape.
if (!ShapeUtil::Compatible(arguments[i]->shape(),
program_shape.parameters(i))) {
return InvalidArgument(
"computation expects parameter %d to have shape %s, given shape %s",
"computation expects parameter %lu to have shape %s, given shape %s",
i, ShapeUtil::HumanString(program_shape.parameters(i)).c_str(),
ShapeUtil::HumanString(arguments[i]->shape()).c_str());
}
@ -383,7 +385,8 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
hlo_dumper, std::move(executors)));
if (!other_directory_path.empty()) {
for (int64 i = 0; i < versioned_handles.size(); ++i) {
for (std::vector<VersionedComputationHandle>::size_type i = 0;
i < versioned_handles.size(); ++i) {
executables[i]->set_session_module(std::move(session_modules[i]));
}
}
@ -523,7 +526,8 @@ Service::ExecuteParallelAndRegisterResult(
// Asynchronously launch all executables.
std::vector<GlobalDataHandle> result_handles;
for (int64 i = 0; i < executables.size(); i++) {
for (tensorflow::gtl::ArraySlice<Executable*>::size_type i = 0;
i < executables.size(); i++) {
TF_ASSIGN_OR_RETURN(
perftools::gputools::DeviceMemoryBase result,
executables[i]->ExecuteAsyncOnStream(&run_options[i], arguments[i]));


@ -72,13 +72,17 @@ LINKER_SCRIPT = "//tensorflow/contrib/android:jni/version_script.lds"
cc_binary(
name = "libtensorflow_inference.so",
srcs = [],
copts = tf_copts(),
copts = tf_copts() + [
"-ffunction-sections",
"-fdata-sections",
],
linkopts = if_android([
"-landroid",
"-llog",
"-lm",
"-z defs",
"-s",
"-Wl,--gc-sections",
"-Wl,--version-script", # This line must be directly followed by LINKER_SCRIPT.
LINKER_SCRIPT,
]),


@ -56,9 +56,10 @@ mark_as_advanced(DOWNLOAD_LOCATION)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_definitions(-DEIGEN_AVOID_STL_ARRAY)
if(WIN32)
add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC -D__VERSION__=\"MSVC\")
add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC)
add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64 -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS)
add_definitions(-DTENSORFLOW_USE_EIGEN_THREADPOOL -DEIGEN_HAS_C99_MATH -D_ITERATOR_DEBUG_LEVEL=0)
add_definitions(-DTF_COMPILE_LIBRARY)
add_definitions(-DNDEBUG /O2) # Equivalent of -c opt in Bazel.
add_definitions(/bigobj /nologo /EHsc /GF /FC /MP /Gm-)
# Suppress warnings to reduce build log size.
@ -190,6 +191,7 @@ if (tensorflow_ENABLE_GPU)
${CUDA_TOOLKIT_TARGET_DIR}/include/cuda.h ${CUDA_TOOLKIT_TARGET_DIR}/include/cuComplex.h
${CUDA_TOOLKIT_TARGET_DIR}/include/cublas_v2.h ${CUDNN_HOME}/include/cudnn.h
${CUDA_TOOLKIT_TARGET_DIR}/include/cufft.h ${CUDA_TOOLKIT_TARGET_DIR}/include/curand.h
${CUDA_TOOLKIT_TARGET_DIR}/include/cuda_runtime_api.h
DESTINATION ${tensorflow_source_dir}/third_party/gpus/cuda/include
)
include_directories(${tensorflow_source_dir}/third_party/gpus)


@ -13,7 +13,7 @@ Linux.
Current Status
--------------
CMake can be used to build TensorFlow on Windows. See the [getting started documentation](https://www.tensorflow.org/get_started/os_setup.html#pip-installation-on-windows)
CMake can be used to build TensorFlow on Windows. See the [getting started documentation](https://www.tensorflow.org/install/install_windows)
for instructions on how to install a pre-built TensorFlow package on Windows.
### Current known limitations


@ -120,3 +120,43 @@ list(REMOVE_ITEM tf_cc_srcs ${tf_cc_test_srcs})
add_library(tf_cc OBJECT ${tf_cc_srcs})
add_dependencies(tf_cc tf_cc_framework tf_cc_ops)
set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib")
add_custom_target(tf_extension_ops)
function(AddUserOps)
cmake_parse_arguments(_AT "" "" "TARGET;SOURCES;GPUSOURCES;DEPENDS;DISTCOPY" ${ARGN})
if (tensorflow_ENABLE_GPU AND _AT_GPUSOURCES)
# if gpu build is enabled and we have gpu specific code,
# hint to cmake that this needs to go to nvcc
set (gpu_source ${_AT_GPUSOURCES})
set (gpu_lib "${_AT_TARGET}_gpu")
set_source_files_properties(${gpu_source} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
cuda_compile(gpu_lib ${gpu_source})
endif()
# create shared library from source and cuda obj
add_library(${_AT_TARGET} SHARED ${_AT_SOURCES} ${gpu_lib})
target_link_libraries(${_AT_TARGET} ${pywrap_tensorflow_lib})
if(WIN32)
if (tensorflow_ENABLE_GPU AND _AT_GPUSOURCES)
# some ops call out to cuda directly; need to link libs for the cuda dlls
target_link_libraries(${_AT_TARGET} ${CUDA_LIBRARIES})
endif()
if (_AT_DISTCOPY)
add_custom_command(TARGET ${_AT_TARGET} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:${_AT_TARGET}> ${_AT_DISTCOPY}/)
endif()
endif()
if (_AT_DEPENDS)
add_dependencies(${_AT_TARGET} ${_AT_DEPENDS})
endif()
# make sure TF_COMPILE_LIBRARY is not defined for this target
get_target_property(target_compile_flags ${_AT_TARGET} COMPILE_FLAGS)
if(target_compile_flags STREQUAL "target_compile_flags-NOTFOUND")
set(target_compile_flags "/UTF_COMPILE_LIBRARY")
else()
set(target_compile_flags "${target_compile_flags} /UTF_COMPILE_LIBRARY")
endif()
set_target_properties(${_AT_TARGET} PROPERTIES COMPILE_FLAGS ${target_compile_flags})
add_dependencies(tf_extension_ops ${_AT_TARGET})
endfunction(AddUserOps)


@ -199,7 +199,6 @@ add_custom_command(OUTPUT
COMMAND ${PYTHON_EXECUTABLE} ${tensorflow_source_dir}/tensorflow/tools/git/gen_git_source.py
--raw_generate ${VERSION_INFO_CC}
DEPENDS __force_rebuild)
set(tf_version_srcs ${tensorflow_source_dir}/tensorflow/core/util/version_info.cc)
########################################################
@ -238,3 +237,9 @@ add_dependencies(tf_core_framework
tf_core_lib
proto_text
)
if(WIN32)
# CMake > 3.6 will quote this as -D"__VERSION__=\"MSVC\"", which nvcc fails on.
# Instead of defining this globally, limit it to tf_core_framework where it's used.
target_compile_definitions(tf_core_framework PRIVATE __VERSION__="MSVC")
endif()


@ -93,6 +93,12 @@ if(WIN32)
"${tensorflow_source_dir}/tensorflow/core/kernels/meta_support.*"
"${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.h"
"${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.cc"
# not in tensorflow.dll - comes from .so
"${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.cc"
"${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops.cc"
"${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops.cc"
"${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/gru_ops.cc"
"${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/lstm_ops.cc"
)
list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_windows_exclude_srcs})
endif(WIN32)

tensorflow/contrib/cmake/tf_python.cmake Normal file → Executable file

@ -623,12 +623,7 @@ add_custom_command(
COMMENT "Running SWIG to generate Python wrappers"
VERBATIM )
# pywrap_tensorflow_internal is a shared library containing all of the
# TensorFlow runtime and the standard ops and kernels. These are installed into
# tf_python/tensorflow/python/.
# TODO(mrry): Refactor this to expose a framework library that
# facilitates `tf.load_op_library()`.
add_library(pywrap_tensorflow_internal SHARED
set (pywrap_tensorflow_internal_src
"${tensorflow_source_dir}/tensorflow/python/client/tf_session_helper.h"
"${tensorflow_source_dir}/tensorflow/python/client/tf_session_helper.cc"
"${tensorflow_source_dir}/tensorflow/python/framework/cpp_shape_inference.h"
@ -652,6 +647,55 @@ add_library(pywrap_tensorflow_internal SHARED
"${tensorflow_source_dir}/tensorflow/c/tf_status_helper.cc"
"${tensorflow_source_dir}/tensorflow/c/tf_status_helper.h"
"${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.cc"
)
if(WIN32)
# Windows: build a static library with the same objects as tensorflow.dll.
# This can be used to build a standalone exe and also helps us to
# find all symbols that need to be exported from the dll, which is needed
# to provide the tensorflow c/c++ api in tensorflow.dll.
# From the static library we create the def file with all symbols that need to
# be exported from tensorflow.dll. Because there is a limit of 64K symbols
# that can be exported, we filter the symbols with a python script down to the
# namespaces we need.
#
add_library(pywrap_tensorflow_internal_static STATIC
${pywrap_tensorflow_internal_src}
$<TARGET_OBJECTS:tf_core_lib>
$<TARGET_OBJECTS:tf_core_cpu>
$<TARGET_OBJECTS:tf_core_framework>
$<TARGET_OBJECTS:tf_core_ops>
$<TARGET_OBJECTS:tf_core_direct_session>
$<TARGET_OBJECTS:tf_tools_transform_graph_lib>
$<$<BOOL:${tensorflow_ENABLE_GRPC_SUPPORT}>:$<TARGET_OBJECTS:tf_core_distributed_runtime>>
$<TARGET_OBJECTS:tf_core_kernels>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
)
target_include_directories(pywrap_tensorflow_internal_static PUBLIC
${PYTHON_INCLUDE_DIR}
${NUMPY_INCLUDE_DIR}
)
target_link_libraries(pywrap_tensorflow_internal_static
tf_protos_cc
tf_python_protos_cc
)
set(pywrap_tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow.def")
set_source_files_properties(${pywrap_tensorflow_deffile} PROPERTIES GENERATED TRUE)
add_custom_command(TARGET pywrap_tensorflow_internal_static POST_BUILD
COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/tools/create_def_file.py
--input $<TARGET_FILE:pywrap_tensorflow_internal_static>
--output ${pywrap_tensorflow_deffile}
)
endif(WIN32)
# pywrap_tensorflow_internal is a shared library containing all of the
# TensorFlow runtime and the standard ops and kernels. These are installed into
# tf_python/tensorflow/python/.
add_library(pywrap_tensorflow_internal SHARED
${pywrap_tensorflow_internal_src}
$<TARGET_OBJECTS:tf_core_lib>
$<TARGET_OBJECTS:tf_core_cpu>
$<TARGET_OBJECTS:tf_core_framework>
@ -662,7 +706,13 @@ add_library(pywrap_tensorflow_internal SHARED
$<TARGET_OBJECTS:tf_core_kernels>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
${pywrap_tensorflow_deffile}
)
if(WIN32)
add_dependencies(pywrap_tensorflow_internal pywrap_tensorflow_internal_static)
endif(WIN32)
target_include_directories(pywrap_tensorflow_internal PUBLIC
${PYTHON_INCLUDE_DIR}
${NUMPY_INCLUDE_DIR}
@ -675,6 +725,44 @@ target_link_libraries(pywrap_tensorflow_internal
${PYTHON_LIBRARIES}
)
if(WIN32)
# include contrib/rnn as .so
#
set(tf_gru_srcs
"${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.cc"
"${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.h"
"${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops.cc"
"${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops.h"
"${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/gru_ops.cc"
)
set(tf_gru_gpu_srcs
"${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops_gpu.cu.cc"
)
set(tf_lstm_srcs
"${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.cc"
"${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.h"
"${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops.cc"
"${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops.h"
"${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/lstm_ops.cc"
)
set(tf_lstm_gpu_srcs
"${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc"
)
AddUserOps(TARGET _gru_ops
SOURCES "${tf_gru_srcs}"
GPUSOURCES ${tf_gru_gpu_srcs}
DEPENDS pywrap_tensorflow_internal tf_python_ops
DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rnn/python/ops/)
AddUserOps(TARGET _lstm_ops
SOURCES "${tf_lstm_srcs}"
GPUSOURCES ${tf_lstm_gpu_srcs}
DEPENDS pywrap_tensorflow_internal tf_python_ops
DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rnn/python/ops/)
endif(WIN32)
############################################################
# Build a PIP package containing the TensorFlow runtime.
############################################################
@ -684,14 +772,17 @@ add_dependencies(tf_python_build_pip_package
tensorboard_copy_dependencies
tf_python_copy_scripts_to_destination
tf_python_touchup_modules
tf_python_ops)
tf_python_ops
tf_extension_ops)
add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/setup.py
${CMAKE_CURRENT_BINARY_DIR}/tf_python/)
if(WIN32)
add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.dll
${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd)
${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib
${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
else()
add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so


@ -115,7 +115,14 @@ if (tensorflow_BUILD_PYTHON_TESTS)
#
# include all test
if (WIN32)
file(GLOB_RECURSE tf_test_rnn_src_py
"${tensorflow_source_dir}/tensorflow/contrib/rnn/python/kernel_tests/*_test.py"
)
endif()
file(GLOB_RECURSE tf_test_src_py
${tf_test_rnn_src_py}
"${tensorflow_source_dir}/tensorflow/python/debug/cli/*_test.py"
"${tensorflow_source_dir}/tensorflow/python/debug/lib/*_test.py"
"${tensorflow_source_dir}/tensorflow/python/debug/wrappers/*_test.py"


@ -106,3 +106,22 @@ target_link_libraries(${compare_graphs} PUBLIC
${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
)
set(benchmark_model "benchmark_model")
add_executable(${benchmark_model}
"${tensorflow_source_dir}/tensorflow/tools/benchmark/benchmark_model.cc"
"${tensorflow_source_dir}/tensorflow/tools/benchmark/benchmark_model_main.cc"
$<TARGET_OBJECTS:tf_core_lib>
$<TARGET_OBJECTS:tf_core_cpu>
$<TARGET_OBJECTS:tf_core_framework>
$<TARGET_OBJECTS:tf_core_ops>
$<TARGET_OBJECTS:tf_core_direct_session>
$<TARGET_OBJECTS:tf_core_kernels>
)
target_link_libraries(${benchmark_model} PUBLIC
tf_protos_cc
${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
)


@ -0,0 +1,134 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
create_def_file.py - tool to create a windows def file to export
symbols from tensorflow.dll to enable tf.load_library().
Because the linker allows only 64K symbols to be exported per dll
we filter the symbols down to the essentials. The regular expressions
we use for this are specific to tensorflow.
TODO: this works fine but there is an issue with exporting
'const char * const' and importing it from a user_ops. The problem is
on the importing end and using __declspec(dllimport) works around it.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import io
import os
import re
import sys
import tempfile
from subprocess import Popen, PIPE
# External tools we use that come with visual studio sdk and
# we assume that the caller has the correct PATH to the sdk
UNDNAME = "undname.exe"
DUMPBIN = "dumpbin.exe"
# Exclude if matched
EXCLUDE_RE = re.compile(r"deleting destructor|::internal::")
# Include if matched before exclude
INCLUDEPRE_RE = re.compile(r"tensorflow::internal::LogMessage|" +
r"tensorflow::internal::CheckOpMessageBuilder")
# Include if matched after exclude
INCLUDE_RE = re.compile(r"^(TF_\w*)$|" +
r"tensorflow::|" +
r"functor::|" +
r"perftools::gputools")
def get_args():
"""Parse command line."""
parser = argparse.ArgumentParser()
parser.add_argument("--input", help="input library", required=True)
parser.add_argument("--output", help="output deffile", required=True)
args = parser.parse_args()
return args
def main():
"""main."""
args = get_args()
# Pipe dumpbin to extract all linkable symbols from a lib.
# Good symbols are collected in candidates and also written to
# a temp file.
candidates = []
tmpfile = tempfile.NamedTemporaryFile(mode="w", delete=False)
proc = Popen([DUMPBIN, "/nologo", "/linkermember:1", args.input], stdout=PIPE)
for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"):
cols = line.split()
if len(cols) < 2:
continue
sym = cols[1]
tmpfile.file.write(sym + "\n")
candidates.append(sym)
tmpfile.file.close()
exit_code = proc.wait()
if exit_code != 0:
print("{} failed, exit={}".format(DUMPBIN, exit_code))
return exit_code
# Run the symbols through undname to get their undecorated name
# so we can filter on something readable.
with open(args.output, "w") as def_fp:
# track dupes
taken = set()
# Header for the def file. Since the tensorflow.dll is actually called
# _pywrap_tensorflow.pyd in the python wheel, hint that in the def file.
def_fp.write("LIBRARY _pywrap_tensorflow_internal.pyd\n")
def_fp.write("EXPORTS\n")
def_fp.write("\t ??1OpDef@tensorflow@@UEAA@XZ\n")
# Each symbol returned by undname matches the same position in candidates.
# We compare on undname but use the decorated name from candidates.
dupes = 0
proc = Popen([UNDNAME, tmpfile.name], stdout=PIPE)
for idx, line in enumerate(io.TextIOWrapper(proc.stdout, encoding="utf-8")):
decorated = candidates[idx]
if decorated in taken:
# Symbol is already in output, done.
dupes += 1
continue
if not INCLUDEPRE_RE.search(line):
if EXCLUDE_RE.search(line):
continue
if not INCLUDE_RE.search(line):
continue
def_fp.write("\t" + decorated + "\n")
taken.add(decorated)
exit_code = proc.wait()
if exit_code != 0:
print("{} failed, exit={}".format(UNDNAME, exit_code))
return exit_code
os.unlink(tmpfile.name)
print("symbols={}, taken={}, dupes={}"
.format(len(candidates), len(taken), dupes))
return 0
if __name__ == "__main__":
sys.exit(main())


@ -280,10 +280,11 @@ def init_from_checkpoint(checkpoint_dir, assignment_map):
for var_name in scope_variables:
# Lookup name with specified prefix and suffix from current variable.
# If tensor_name given is '/' (root), don't use it for full name.
full_tensor_name = var_name[len(scopes):]
if current_var_or_name != "/":
full_tensor_name = full_tensor_name[1:]
if tensor_name_in_ckpt != "/":
full_tensor_name = tensor_name_in_ckpt + var_name[len(scopes) + 1:]
else:
full_tensor_name = var_name[len(scopes) + 1:]
full_tensor_name = tensor_name_in_ckpt + full_tensor_name
if full_tensor_name not in variable_map:
raise ValueError(
"Tensor %s (%s in %s) is not found in %s checkpoint" % (


@ -168,6 +168,29 @@ class CheckpointsTest(test.TestCase):
self.assertAllEqual(my3.eval(session), v3)
self.assertAllEqual(my4.eval(session), v4)
def testInitToRootCheckpoint(self):
checkpoint_dir = self.get_temp_dir()
with self.test_session() as session:
v1, v2, v3, v4 = _create_checkpoints(session, checkpoint_dir)
# New graph and session.
with ops.Graph().as_default() as g:
with self.test_session(graph=g) as session:
my1 = variable_scope.get_variable("var1", [1, 10])
my2 = variable_scope.get_variable("var2", [10, 10])
my3 = variable_scope.get_variable("var3", [100, 100])
with variable_scope.variable_scope("useful_scope"):
my4 = variable_scope.get_variable("var4", [9, 9])
checkpoint_utils.init_from_checkpoint(checkpoint_dir,
{"/": "/",})
session.run(variables.global_variables_initializer())
self.assertAllEqual(my1.eval(session), v1)
self.assertAllEqual(my2.eval(session), v2)
self.assertAllEqual(my3.eval(session), v3)
self.assertAllEqual(my4.eval(session), v4)
def testInitFromPartitionVar(self):
checkpoint_dir = self.get_temp_dir()
with self.test_session() as session:


@ -30,11 +30,15 @@
net = layers.conv2d(inputs, 64, [11, 11], 4, padding='VALID', scope='conv1')
net = layers.conv2d(net, 256, [5, 5], scope='conv2')
```
The first call to conv2d will use predefined args:
layers.conv2d(inputs, 64, [11, 11], 4, padding='VALID', ..., scope='conv1')
The first call to conv2d will behave as follows:
layers.conv2d(inputs, 64, [11, 11], 4, padding='VALID',
initializer=layers.variance_scaling_initializer(),
regularizer=layers.l2_regularizer(0.05), scope='conv1')
The second call to conv2d will overwrite padding:
layers.conv2d(inputs, 256, [5, 5], padding='SAME', ..., scope='conv2')
The second call to conv2d will also use the arg_scope's default for padding:
layers.conv2d(inputs, 256, [5, 5], padding='SAME',
initializer=layers.variance_scaling_initializer(),
regularizer=layers.l2_regularizer(0.05), scope='conv2')
Example of how to reuse an arg_scope:
@ -49,7 +53,7 @@
net = layers.conv2d(net, 256, [5, 5], scope='conv2')
```
Example of how to use tf.contrib.framework.add_arg_scope:
Example of how to use tf.contrib.framework.add_arg_scope to enable your function to be called within an arg_scope later:
@tf.contrib.framework.add_arg_scope
def conv2d(*args, **kwargs)
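A hedged, self-contained version of the `add_arg_scope` pattern shown above, assuming the `tf.contrib.framework` and `tf.contrib.layers` APIs; `my_conv2d` is an illustrative user-defined function:
```python
import tensorflow as tf
from tensorflow.contrib import framework as contrib_framework
from tensorflow.contrib import layers


@contrib_framework.add_arg_scope
def my_conv2d(inputs, num_outputs, kernel_size, padding='VALID', scope=None):
  # Decorating with add_arg_scope lets arg_scope supply defaults for it.
  return layers.conv2d(inputs, num_outputs, kernel_size,
                       padding=padding, scope=scope)


inputs = tf.placeholder(tf.float32, [None, 32, 32, 3])
with contrib_framework.arg_scope([my_conv2d], padding='SAME'):
  net = my_conv2d(inputs, 64, [3, 3], scope='conv1')                # padding='SAME'
  net = my_conv2d(net, 64, [3, 3], padding='VALID', scope='conv2')  # explicit override
```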


@ -40,6 +40,7 @@ See the @{$python/contrib.layers} guide.
@@softmax
@@stack
@@unit_norm
@@bow_encoder
@@embed_sequence
@@apply_regularization


@ -160,9 +160,8 @@ def _fused_batch_norm(
they need to be added as a dependency to the `train_op`, example:
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
if update_ops:
updates = tf.group(*update_ops)
total_loss = control_flow_ops.with_dependencies([updates], total_loss)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(loss)
One can set updates_collections=None to force the updates in place, but that
can have speed penalty, especially in distributed settings.
@ -393,9 +392,8 @@ def batch_norm(inputs,
they need to be added as a dependency to the `train_op`, example:
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
if update_ops:
updates = tf.group(*update_ops)
total_loss = control_flow_ops.with_dependencies([updates], total_loss)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(loss)
One can set updates_collections=None to force the updates in place, but that
can have speed penalty, especially in distributed settings.
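A hedged, self-contained version of the training pattern recommended in the docstring above; the model, loss, and optimizer are placeholders:
```python
import tensorflow as tf

layers = tf.contrib.layers

inputs = tf.placeholder(tf.float32, [None, 10])
labels = tf.placeholder(tf.float32, [None, 1])

net = layers.fully_connected(inputs, 32)
net = layers.batch_norm(net, is_training=True)  # adds moving-average updates to UPDATE_OPS
predictions = layers.fully_connected(net, 1, activation_fn=None)

loss = tf.losses.mean_squared_error(labels, predictions)
optimizer = tf.train.GradientDescentOptimizer(0.01)

# The moving mean/variance updates only run if they are a dependency of the
# train op, exactly as the docstring recommends.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
  train_op = optimizer.minimize(loss)
```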


@ -33,6 +33,7 @@ See the @{$python/contrib.learn} guide.
@@DNNLinearCombinedRegressor
@@DNNLinearCombinedEstimator
@@DNNLinearCombinedClassifier
@@DynamicRnnEstimator
@@LinearClassifier
@@LinearEstimator
@@LinearRegressor


@ -20,18 +20,17 @@ Optionally you can install [scikit-learn](http://scikit-learn.org/stable/) and [
### Tutorials
- [TF Learn Quickstart](../../../../g3doc/tutorials/tflearn/index.md). Build,
- [TF Learn Quickstart](https://www.tensorflow.org/get_started/tflearn). Build,
train, and evaluate a neural network with just a few lines of code.
- [Input Functions](../../../../g3doc/tutorials/input_fn/index.md). Learn how
- [Input Functions](https://www.tensorflow.org/get_started/input_fn). Learn how
to create input functions to feed data into your models.
- [Linear Model](../../../../g3doc/tutorials/wide/index.md). Learn the basics
- [Linear Model](https://www.tensorflow.org/tutorials/wide). Learn the basics
of building linear models.
- [Wide and Deep
Learning](../../../../g3doc/tutorials/wide_and_deep/index.md). Jointly train
a linear model and a deep neural network.
- [Logging and Monitoring](../../../../g3doc/tutorials/monitors/index.md). Use
the Monitor API to audit training of a neural network.
- [Custom Estimators](../../../../g3doc/tutorials/estimators/index.md). Learn
- [Wide and Deep Learning](https://www.tensorflow.org/tutorials/wide_and_deep).
Jointly train a linear model and a deep neural network.
- [Logging and Monitoring](https://www.tensorflow.org/get_started/monitors).
Use the Monitor API to audit training of a neural network.
- [Custom Estimators](https://www.tensorflow.org/extend/estimators). Learn
how to create a custom estimator.
- More coming soon.


@ -1108,7 +1108,7 @@ class Estimator(BaseEstimator):
if isinstance(model_fn_results, model_fn_lib.ModelFnOps):
return model_fn_results
# Here model_fn_ops should be a tuple with 3 elements.
# Here model_fn_results should be a tuple with 3 elements.
if len(model_fn_results) != 3:
raise ValueError('Unrecognized value returned by model_fn, '
'please return ModelFnOps.')
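For context, a hedged sketch of a `model_fn` that satisfies this check by returning `ModelFnOps` directly; the model and optimizer settings are arbitrary:
```python
import tensorflow as tf
from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib


def my_model_fn(features, labels, mode):
  # Tiny linear model; returning ModelFnOps avoids the 3-tuple code path above.
  predictions = tf.contrib.layers.fully_connected(features, 1, activation_fn=None)
  loss = tf.losses.mean_squared_error(labels, predictions)
  train_op = tf.contrib.layers.optimize_loss(
      loss, tf.contrib.framework.get_global_step(),
      learning_rate=0.01, optimizer='SGD')
  return model_fn_lib.ModelFnOps(mode=mode, predictions=predictions,
                                 loss=loss, train_op=train_op)
```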


@ -149,21 +149,16 @@ def _linear_model_fn(features, labels, mode, params, config=None):
values=tuple(six.itervalues(features)),
partitioner=partitioner) as scope:
if joint_weights:
logits, _, _ = (
layers.joint_weighted_sum_from_feature_columns(
columns_to_tensors=features,
feature_columns=feature_columns,
num_outputs=head.logits_dimension,
weight_collections=[parent_scope],
scope=scope))
layer_fn = layers.joint_weighted_sum_from_feature_columns
else:
logits, _, _ = (
layers.weighted_sum_from_feature_columns(
columns_to_tensors=features,
feature_columns=feature_columns,
num_outputs=head.logits_dimension,
weight_collections=[parent_scope],
scope=scope))
layer_fn = layers.weighted_sum_from_feature_columns
logits, _, _ = layer_fn(
columns_to_tensors=features,
feature_columns=feature_columns,
num_outputs=head.logits_dimension,
weight_collections=[parent_scope],
scope=scope)
def _train_op_fn(loss):
global_step = contrib_variables.get_global_step()


@ -63,57 +63,54 @@ def _assert_df_equals_dict(expected_df, actual_dict):
actual_dict[col]))
def _make_test_csv():
f = tempfile.NamedTemporaryFile(
dir=test.get_temp_dir(), delete=False, mode="w")
w = csv.writer(f)
w.writerow(["int", "float", "bool", "string"])
for _ in range(100):
intvalue = np.random.randint(-10, 10)
floatvalue = np.random.rand()
boolvalue = int(np.random.rand() > 0.3)
stringvalue = "S: %.4f" % np.random.rand()
row = [intvalue, floatvalue, boolvalue, stringvalue]
w.writerow(row)
f.close()
return f.name
def _make_test_csv_sparse():
f = tempfile.NamedTemporaryFile(
dir=test.get_temp_dir(), delete=False, mode="w")
w = csv.writer(f)
w.writerow(["int", "float", "bool", "string"])
for _ in range(100):
# leave columns empty; these will be read as default value (e.g. 0 or NaN)
intvalue = np.random.randint(-10, 10) if np.random.rand() > 0.5 else ""
floatvalue = np.random.rand() if np.random.rand() > 0.5 else ""
boolvalue = int(np.random.rand() > 0.3) if np.random.rand() > 0.5 else ""
stringvalue = (("S: %.4f" % np.random.rand()) if np.random.rand() > 0.5 else
"")
row = [intvalue, floatvalue, boolvalue, stringvalue]
w.writerow(row)
f.close()
return f.name
def _make_test_tfrecord():
f = tempfile.NamedTemporaryFile(dir=test.get_temp_dir(), delete=False)
w = tf_record.TFRecordWriter(f.name)
for i in range(100):
ex = example_pb2.Example()
ex.features.feature["var_len_int"].int64_list.value.extend(range((i % 3)))
ex.features.feature["fixed_len_float"].float_list.value.extend(
[float(i), 2 * float(i)])
w.write(ex.SerializeToString())
return f.name
class TensorFlowDataFrameTestCase(test.TestCase):
"""Tests for `TensorFlowDataFrame`."""
def _make_test_csv(self):
f = tempfile.NamedTemporaryFile(
dir=self.get_temp_dir(), delete=False, mode="w")
w = csv.writer(f)
w.writerow(["int", "float", "bool", "string"])
for _ in range(100):
intvalue = np.random.randint(-10, 10)
floatvalue = np.random.rand()
boolvalue = int(np.random.rand() > 0.3)
stringvalue = "S: %.4f" % np.random.rand()
row = [intvalue, floatvalue, boolvalue, stringvalue]
w.writerow(row)
f.close()
return f.name
def _make_test_csv_sparse(self):
f = tempfile.NamedTemporaryFile(
dir=self.get_temp_dir(), delete=False, mode="w")
w = csv.writer(f)
w.writerow(["int", "float", "bool", "string"])
for _ in range(100):
# leave columns empty; these will be read as default value (e.g. 0 or NaN)
intvalue = np.random.randint(-10, 10) if np.random.rand() > 0.5 else ""
floatvalue = np.random.rand() if np.random.rand() > 0.5 else ""
boolvalue = int(np.random.rand() > 0.3) if np.random.rand() > 0.5 else ""
stringvalue = (("S: %.4f" % np.random.rand()) if np.random.rand() > 0.5 else
"")
row = [intvalue, floatvalue, boolvalue, stringvalue]
w.writerow(row)
f.close()
return f.name
def _make_test_tfrecord(self):
f = tempfile.NamedTemporaryFile(dir=self.get_temp_dir(), delete=False)
w = tf_record.TFRecordWriter(f.name)
for i in range(100):
ex = example_pb2.Example()
ex.features.feature["var_len_int"].int64_list.value.extend(range((i % 3)))
ex.features.feature["fixed_len_float"].float_list.value.extend(
[float(i), 2 * float(i)])
w.write(ex.SerializeToString())
return f.name
def _assert_pandas_equals_tensorflow(self, pandas_df, tensorflow_df,
num_batches, batch_size):
self.assertItemsEqual(
@ -190,7 +187,7 @@ class TensorFlowDataFrameTestCase(test.TestCase):
batch_size = 8
enqueue_size = 7
data_path = _make_test_csv()
data_path = self._make_test_csv()
default_values = [0, 0.0, 0, ""]
pandas_df = pd.read_csv(data_path)
@ -211,7 +208,7 @@ class TensorFlowDataFrameTestCase(test.TestCase):
num_epochs = 17
expected_num_batches = (num_epochs * 100) // batch_size
data_path = _make_test_csv()
data_path = self._make_test_csv()
default_values = [0, 0.0, 0, ""]
tensorflow_df = df.TensorFlowDataFrame.from_csv(
@ -234,7 +231,7 @@ class TensorFlowDataFrameTestCase(test.TestCase):
num_batches = 100
batch_size = 8
data_path = _make_test_csv_sparse()
data_path = self._make_test_csv_sparse()
feature_spec = {
"int": parsing_ops.FixedLenFeature(None, dtypes.int16, np.nan),
"float": parsing_ops.VarLenFeature(dtypes.float16),
@ -270,7 +267,7 @@ class TensorFlowDataFrameTestCase(test.TestCase):
enqueue_size = 11
batch_size = 13
data_path = _make_test_tfrecord()
data_path = self._make_test_tfrecord()
features = {
"fixed_len_float":
parsing_ops.FixedLenFeature(
@ -318,7 +315,7 @@ class TensorFlowDataFrameTestCase(test.TestCase):
num_epochs = 17
expected_num_batches = (num_epochs * 100) // batch_size
data_path = _make_test_csv()
data_path = self._make_test_csv()
default_values = [0, 0.0, 0, ""]
tensorflow_df = df.TensorFlowDataFrame.from_csv(


@ -261,7 +261,7 @@ def streaming_false_negatives(predictions, labels, weights=None,
metrics_collections=None,
updates_collections=None,
name=None):
"""Computes the total number of false positives.
"""Computes the total number of false negatives.
If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
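A hedged usage sketch of the value/update-op pattern these streaming metrics follow; the inputs are illustrative:
```python
import tensorflow as tf

predictions = tf.constant([True, False, False, True])
labels = tf.constant([True, True, False, True])

# Returns (value_tensor, update_op); each run of update_op accumulates counts.
fn, fn_update = tf.contrib.metrics.streaming_false_negatives(predictions, labels)

with tf.Session() as sess:
  sess.run(tf.local_variables_initializer())
  sess.run(fn_update)
  print(sess.run(fn))  # 1.0: one positive label was predicted as negative
```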


@ -13,7 +13,14 @@
# limitations under the License.
# ==============================================================================
"""Module implementing RNN Cells."""
"""Module implementing RNN Cells.
This module provides a number of basic commonly used RNN cells, such as LSTM
(Long Short Term Memory) or GRU (Gated Recurrent Unit), and a number of
operators that allow adding dropouts, projections, or embeddings for inputs.
Constructing multi-layer cells is supported by the class `MultiRNNCell`, or by
calling the `rnn` ops several times.
"""
from __future__ import absolute_import
from __future__ import division
@ -146,12 +153,12 @@ class GRUCell(RNNCell):
with _checked_scope(self, scope or "gru_cell", reuse=self._reuse):
with vs.variable_scope("gates"): # Reset gate and update gate.
# We start with bias of 1.0 to not reset and not update.
value = sigmoid(_linear(
[inputs, state], 2 * self._num_units, True, 1.0))
r, u = array_ops.split(
value=_linear(
[inputs, state], 2 * self._num_units, True, 1.0),
value=value,
num_or_size_splits=2,
axis=1)
r, u = sigmoid(r), sigmoid(u)
with vs.variable_scope("candidate"):
c = self._activation(_linear([inputs, r * state],
self._num_units, True))
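The module docstring added above mentions stacking cells with `MultiRNNCell` and adding dropout; a hedged sketch of that usage, assuming the `tf.contrib.rnn` API with illustrative sizes:
```python
import tensorflow as tf

batch_size, max_time, depth, num_units = 4, 10, 8, 16
inputs = tf.placeholder(tf.float32, [batch_size, max_time, depth])

def make_cell():
  # One GRU layer wrapped with output dropout.
  cell = tf.contrib.rnn.GRUCell(num_units)
  return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=0.9)

# Two such layers stacked into a multi-layer cell, unrolled with dynamic_rnn.
multi_cell = tf.contrib.rnn.MultiRNNCell([make_cell() for _ in range(2)])
outputs, state = tf.nn.dynamic_rnn(multi_cell, inputs, dtype=tf.float32)
```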


@ -70,7 +70,7 @@ def _lstm_block_cell(x,
cs = ci .* i + cs_prev .* f
cs = clip(cs, cell_clip)
o = sigmoid(cs * wco + f)
o = sigmoid(cs * wco + o)
co = tanh(cs)
h = co .* o
```


@ -486,7 +486,7 @@ class GreedyEmbeddingHelper(Helper):
# Outputs are logits, use argmax to get the most probable id
if not isinstance(outputs, ops.Tensor):
raise TypeError("Expected outputs to be a single Tensor, got: %s" %
outputs)
type(outputs))
sample_ids = math_ops.cast(
math_ops.argmax(outputs, axis=-1), dtypes.int32)
return sample_ids


@ -44,8 +44,7 @@ def sequence_loss(logits, targets, weights,
sequence. When using weights as masking set all valid timesteps to 1 and
all padded timesteps to 0.
average_across_timesteps: If set, sum the cost across the sequence
dimension and divide by the cost by the total label weight across
timesteps.
dimension and divide the cost by the total label weight across timesteps.
average_across_batch: If set, sum the cost across the batch dimension and
divide the returned cost by the batch size.
softmax_loss_function: Function (labels-batch, inputs-batch) -> loss-batch
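A hedged usage sketch for the arguments described above; shapes are illustrative and padding is masked through `weights`:
```python
import tensorflow as tf

batch_size, seq_len, vocab = 2, 5, 7
logits = tf.random_normal([batch_size, seq_len, vocab])
targets = tf.zeros([batch_size, seq_len], dtype=tf.int32)
# 1.0 for real timesteps, 0.0 for padded ones.
weights = tf.constant([[1., 1., 1., 0., 0.],
                       [1., 1., 1., 1., 1.]])

loss = tf.contrib.seq2seq.sequence_loss(
    logits, targets, weights,
    average_across_timesteps=True, average_across_batch=True)
```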


@ -229,7 +229,7 @@ net = ...
net = slim.conv2d(net, 256, [3, 3], scope='conv3_1')
net = slim.conv2d(net, 256, [3, 3], scope='conv3_2')
net = slim.conv2d(net, 256, [3, 3], scope='conv3_3')
net = slim.max_pool2d(net, [2, 2], scope='pool3')
net = slim.max_pool2d(net, [2, 2], scope='pool2')
```
One way to reduce this code duplication would be via a `for` loop:
@ -238,14 +238,14 @@ One way to reduce this code duplication would be via a `for` loop:
net = ...
for i in range(3):
net = slim.conv2d(net, 256, [3, 3], scope='conv3_' % (i+1))
net = slim.max_pool2d(net, [2, 2], scope='pool3')
net = slim.max_pool2d(net, [2, 2], scope='pool2')
```
This can be made even cleaner by using TF-Slim's `repeat` operation:
```python
net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
net = slim.max_pool(net, [2, 2], scope='pool2')
net = slim.max_pool2d(net, [2, 2], scope='pool2')
```
Notice that the `slim.repeat` not only applies the same argument in-line, it


@ -21,6 +21,7 @@ from __future__ import division
from __future__ import print_function
import os
import re
from tensorflow.python.framework import load_library
from tensorflow.python.platform import resource_loader
@ -29,9 +30,9 @@ from tensorflow.python.platform import resource_loader
def load_op_library(path):
"""Loads a contrib op library from the given path.
NOTE(mrry): On Windows, we currently assume that contrib op
NOTE(mrry): On Windows, we currently assume that some contrib op
libraries are statically linked into the main TensorFlow Python
extension DLL.
extension DLL - use dynamically linked ops if the .so is present.
Args:
path: An absolute path to a shared object file.
@ -40,11 +41,17 @@ def load_op_library(path):
A Python module containing the Python wrappers for Ops defined in the
plugin.
"""
if os.name != 'nt':
path = resource_loader.get_path_to_datafile(path)
ret = load_library.load_op_library(path)
assert ret, 'Could not load %s' % path
return ret
else:
# NOTE(mrry):
return None
if os.name == 'nt':
# To avoid making every user_ops aware of windows, re-write
# the file extension from .so to .dll.
path = re.sub('\.so$', '.dll', path)
# TODO: currently we have only some user_ops as .dll's on windows - don't try
# to load them if the dll is not found. Once we have all of them
# this check should be removed.
if not os.path.exists(path):
return None
path = resource_loader.get_path_to_datafile(path)
ret = load_library.load_op_library(path)
assert ret, 'Could not load %s' % path
return ret
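A hedged sketch of how a caller might tolerate the `None` return described above on Windows; the module path and library name are assumptions, not shown in this diff:
```python
from tensorflow.contrib.util import loader  # assumed location of load_op_library

_lib = loader.load_op_library("_my_ops.so")  # hypothetical custom-op library
if _lib is None:
  # On Windows the matching .dll is not shipped yet; fall back to the kernels
  # statically linked into the main TensorFlow extension DLL.
  pass
else:
  my_op = _lib.my_op  # hypothetical generated op wrapper
```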


@ -339,6 +339,7 @@ tf_cuda_library(
hdrs = [
"example/feature_util.h",
"framework/allocator.h",
"framework/allocator_registry.h",
"framework/attr_value_util.h",
"framework/bfloat16.h",
"framework/cancellation.h",
@ -408,7 +409,9 @@ tf_cuda_library(
"util/memmapped_file_system.h",
"util/memmapped_file_system_writer.h",
],
}),
}) + if_mkl([
"util/mkl_util.h",
]),
visibility = ["//visibility:public"],
deps = [":framework_internal"],
)
@ -707,7 +710,9 @@ cc_library(
"//tensorflow/core/kernels:math_not_windows",
"//tensorflow/core/kernels:quantized_ops",
]) + if_mkl([
"//tensorflow/core/kernels:mkl_ops",
"//tensorflow/core/kernels:mkl_conv_op",
"//tensorflow/core/kernels:mkl_matmul_op",
"//tensorflow/core/kernels:mkl_tfconv_op",
]),
)
@ -772,7 +777,7 @@ cc_library(
"//tensorflow/core/kernels:constant_op",
"//tensorflow/core/kernels:ops_testutil",
"//tensorflow/core/kernels:ops_util",
"//tensorflow/core/platform/default/build_config:gtest", # + if_sycl([":sycl_runtime"]),
"//tensorflow/core/platform/default/build_config:gtest", # + if_sycl([":sycl_runtime"])
],
)
@ -1393,7 +1398,7 @@ tf_cuda_library(
":version_lib",
"//tensorflow/core/kernels:bounds_check",
"//third_party/eigen3",
],
] + if_mkl(["//third_party/mkl:intel_binary_blob"]),
alwayslink = 1,
)
@ -1482,20 +1487,21 @@ tf_cuda_library(
),
copts = tf_copts(),
deps = [
":framework",
":framework_internal",
":function_ops_op_lib",
":functional_grad",
":functional_ops_op_lib",
":lib",
":lib_internal",
":proto_text",
":protos_all_cc",
"//tensorflow/core/grappler:grappler_item",
"//tensorflow/core/grappler/optimizers:meta_optimizer",
"//third_party/eigen3",
"//tensorflow/core/kernels:required",
] + tf_additional_core_deps(),
":framework",
":framework_internal",
":function_ops_op_lib",
":functional_grad",
":functional_ops_op_lib",
":lib",
":lib_internal",
":proto_text",
":protos_all_cc",
"//tensorflow/core/grappler:grappler_item",
"//tensorflow/core/grappler/optimizers:meta_optimizer",
"//third_party/eigen3",
"//tensorflow/core/kernels:required",
] + if_mkl(["//third_party/mkl:intel_binary_blob"]) +
tf_additional_core_deps(),
alwayslink = 1,
)
@ -2037,33 +2043,38 @@ tf_cc_tests(
],
)
if_mkl(
tf_cc_test_mkl(
name = "mkl_related_tests",
size = "small",
srcs = ["graph/mkl_optimizer_merge_test.cc"],
linkstatic = tf_kernel_tests_linkstatic(),
deps = [
":core",
":core_cpu",
":core_cpu_internal",
":direct_session_internal",
":framework",
":framework_internal",
":lib",
":lib_internal",
":ops",
":protos_all_cc", # under if_mkl
":test",
":test_main",
":testlib",
"//tensorflow/cc:cc_ops",
"//tensorflow/cc:scope",
"//tensorflow/cc:sendrecv_ops",
"//tensorflow/core/kernels:ops_util",
"//third_party/eigen3",
],
),
tf_cc_test_mkl(
name = "mkl_related_tests",
size = "small",
srcs = [
"graph/mkl_layout_pass_test.cc",
"graph/mkl_optimizer_merge_test.cc",
"graph/mkl_tfconversion_pass_test.cc",
],
linkstatic = tf_kernel_tests_linkstatic(),
deps = [
":core",
":core_cpu",
":core_cpu_internal",
":direct_session_internal",
":framework",
":framework_internal",
":lib",
":lib_internal",
":ops",
":protos_all_cc",
":test",
":test_main",
":testlib",
"//tensorflow/cc:cc_ops",
"//tensorflow/cc:scope",
"//tensorflow/cc:sendrecv_ops",
"//tensorflow/core/kernels:mkl_conv_op",
"//tensorflow/core/kernels:mkl_matmul_op",
"//tensorflow/core/kernels:mkl_tfconv_op",
"//tensorflow/core/kernels:ops_util",
"//third_party/eigen3",
],
)
tf_cc_tests_gpu(


@ -0,0 +1,120 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// A simple CPU allocator that intercepts malloc/free calls from the MKL library
// and redirects them to the TensorFlow allocator
#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_
#define TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_
#ifdef INTEL_MKL
#include <string>
#include "tensorflow/core/common_runtime/bfc_allocator.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/platform/mem.h"
#include "third_party/mkl/include/i_malloc.h"
namespace tensorflow {
class MklSubAllocator : public SubAllocator {
public:
~MklSubAllocator() override {}
void* Alloc(size_t alignment, size_t num_bytes) override {
return port::AlignedMalloc(num_bytes, alignment);
}
void Free(void* ptr, size_t num_bytes) override { port::AlignedFree(ptr); }
};
/// CPU allocator for MKL that wraps BFC allocator and intercepts
/// and redirects memory allocation calls from MKL.
class MklCPUAllocator : public Allocator {
public:
// Constructor and other standard functions
MklCPUAllocator() {
VLOG(2) << "MklCPUAllocator: In MklCPUAllocator";
allocator_ =
new BFCAllocator(new MklSubAllocator, kMaxMemSize, kAllowGrowth, kName);
// For redirecting all allocations from MKL to this allocator
// From: http://software.intel.com/en-us/node/528565
i_malloc = MallocHook;
i_calloc = CallocHook;
i_realloc = ReallocHook;
i_free = FreeHook;
}
~MklCPUAllocator() override { delete allocator_; }
inline string Name() override { return kName; }
inline void* AllocateRaw(size_t alignment, size_t num_bytes) override {
return allocator_->AllocateRaw(alignment, num_bytes);
}
inline void DeallocateRaw(void* ptr) override {
allocator_->DeallocateRaw(ptr);
}
private:
// Hooks provided by this allocator for memory allocation routines from MKL
static inline void* MallocHook(size_t size) {
VLOG(2) << "MklCPUAllocator: In MallocHook";
return cpu_allocator()->AllocateRaw(kAlignment, size);
}
static inline void FreeHook(void* ptr) {
VLOG(2) << "MklCPUAllocator: In FreeHook";
cpu_allocator()->DeallocateRaw(ptr);
}
static inline void* CallocHook(size_t num, size_t size) {
Status s = Status(error::Code::UNIMPLEMENTED,
"Unimplemented case for hooking MKL function.");
TF_CHECK_OK(s); // way to assert with an error message
}
static inline void* ReallocHook(void* ptr, size_t size) {
Status s = Status(error::Code::UNIMPLEMENTED,
"Unimplemented case for hooking MKL function.");
TF_CHECK_OK(s); // way to assert with an error message
}
// TODO(jbobba): We should ideally move this into CPUOptions in config.proto.
/// Memory limit - 64GB
static const size_t kMaxMemSize =
static_cast<size_t>(64) * 1024 * 1024 * 1024;
/// Do we allow growth in BFC Allocator
static const bool kAllowGrowth = true;
/// Name
static constexpr const char* kName = "mklcpu";
/// The alignment that we need for the allocations
static const size_t kAlignment = 64;
Allocator* allocator_; // owned by this class
};
} // namespace tensorflow
#endif // INTEL_MKL
#endif // TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_


@ -17,6 +17,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/local_device.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/allocator_registry.h"
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.pb_text.h"
@ -27,6 +28,10 @@ limitations under the License.
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/public/session_options.h"
#ifdef INTEL_MKL
#include "tensorflow/core/common_runtime/mkl_cpu_allocator.h"
#endif
namespace tensorflow {
ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options,
@ -70,4 +75,8 @@ Status ThreadPoolDevice::MakeTensorFromProto(
ProtoDebugString(tensor_proto));
}
#ifdef INTEL_MKL
REGISTER_MEM_ALLOCATOR("MklCPUAllocator", 200, MklCPUAllocator);
#endif
} // namespace tensorflow


@ -15,6 +15,7 @@ limitations under the License.
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/allocator_registry.h"
#include "tensorflow/core/framework/log_memory.h"
#include "tensorflow/core/framework/tracking_allocator.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
@ -119,11 +120,13 @@ Allocator* MakeCpuAllocator() {
} // namespace
Allocator* cpu_allocator() {
static Allocator* cpu_alloc = MakeCpuAllocator();
static Allocator* cpu_alloc = AllocatorRegistry::Global()->GetAllocator();
if (cpu_allocator_collect_full_stats && !cpu_alloc->TracksAllocationSizes()) {
cpu_alloc = new TrackingAllocator(cpu_alloc, true);
}
return cpu_alloc;
}
REGISTER_MEM_ALLOCATOR("DefaultCPUAllocator", 100, CPUAllocator);
} // namespace tensorflow


@ -0,0 +1,66 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <string>
#include "tensorflow/core/framework/allocator_registry.h"
#include "tensorflow/core/platform/logging.h"
namespace tensorflow {
// static
AllocatorRegistry* AllocatorRegistry::Global() {
static AllocatorRegistry* global_allocator_registry = new AllocatorRegistry;
return global_allocator_registry;
}
bool AllocatorRegistry::CheckForDuplicates(const string& name, int priority) {
for (auto entry : allocators_) {
if (!name.compare(entry.name) && priority == entry.priority) {
return true;
}
}
return false;
}
void AllocatorRegistry::Register(const string& name, int priority,
Allocator* allocator) {
CHECK(!name.empty()) << "Need a valid name for Allocator";
CHECK_GE(priority, 0) << "Priority needs to be non-negative";
CHECK(!CheckForDuplicates(name, priority)) << "Allocator with name: [" << name
<< "] and priority: [" << priority
<< "] already registered";
AllocatorRegistryEntry tmp_entry;
tmp_entry.name = name;
tmp_entry.priority = priority;
tmp_entry.allocator = allocator;
allocators_.push_back(tmp_entry);
int high_pri = -1;
for (auto entry : allocators_) {
if (high_pri < entry.priority) {
m_curr_allocator_ = entry.allocator;
high_pri = entry.priority;
}
}
}
Allocator* AllocatorRegistry::GetAllocator() {
return CHECK_NOTNULL(m_curr_allocator_);
}
} // namespace tensorflow


@ -0,0 +1,77 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Classes to maintain a static registry of memory allocators
#ifndef TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_REGISTRY_H_
#define TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_REGISTRY_H_
#include <string>
#include <vector>
#include "tensorflow/core/framework/allocator.h"
namespace tensorflow {
// A global AllocatorRegistry is used to hold allocators for CPU backends
class AllocatorRegistry {
public:
// Add an allocator to the registry.
void Register(const string& name, int priority, Allocator* allocator);
// Returns the allocator with the highest priority.
// If multiple allocators share the same highest priority, one of them is returned.
Allocator* GetAllocator();
// Returns the global registry of allocators.
static AllocatorRegistry* Global();
private:
typedef struct {
string name;
int priority;
Allocator* allocator; // not owned
} AllocatorRegistryEntry;
bool CheckForDuplicates(const string& name, int priority);
std::vector<AllocatorRegistryEntry> allocators_;
Allocator* m_curr_allocator_; // not owned
};
namespace allocator_registration {
class AllocatorRegistration {
public:
AllocatorRegistration(const string& name, int priority,
Allocator* allocator) {
AllocatorRegistry::Global()->Register(name, priority, allocator);
}
};
} // namespace allocator_registration
#define REGISTER_MEM_ALLOCATOR(name, priority, allocator) \
REGISTER_MEM_ALLOCATOR_UNIQ_HELPER(__COUNTER__, name, priority, allocator)
#define REGISTER_MEM_ALLOCATOR_UNIQ_HELPER(ctr, name, priority, allocator) \
REGISTER_MEM_ALLOCATOR_UNIQ(ctr, name, priority, allocator)
#define REGISTER_MEM_ALLOCATOR_UNIQ(ctr, name, priority, allocator) \
static allocator_registration::AllocatorRegistration \
register_allocator_##ctr(name, priority, new allocator)
} // namespace tensorflow
#endif // TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_REGISTRY_H_
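A minimal usage sketch of this registry (illustrative only; `MyAllocator` is a hypothetical Allocator subclass and not part of this change):

  // Inside namespace tensorflow:
  // Register an instance of MyAllocator under a name and priority.
  // Priority 150 would rank above DefaultCPUAllocator (100) but below
  // MklCPUAllocator (200).
  REGISTER_MEM_ALLOCATOR("MyAllocator", 150, MyAllocator);

  // cpu_allocator() then resolves through the registry; the allocator
  // registered with the highest priority wins.
  Allocator* a = AllocatorRegistry::Global()->GetAllocator();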

View File

@ -17,7 +17,7 @@ limitations under the License.
#define TENSORFLOW_FRAMEWORK_TYPE_INDEX_H_
#include <string>
#ifdef __GXX_RTTI
#if defined(__GXX_RTTI) || defined(_CPPRTTI)
#include <typeindex>
#include <typeinfo>
#endif // __GXX_RTTI
@ -30,7 +30,7 @@ namespace tensorflow {
// binary sizes. The following #ifdef section provides a non-RTTI
// replacement for std::type_index (with a minimal set of functions needed by
// the TensorFlow framework, and more can be added if necessary).
#ifndef __GXX_RTTI
#if !defined(__GXX_RTTI) && !defined(_CPPRTTI)
// A thin TypeIndex class that mimics std::type_index but does not use RTTI. As
// a result, it does not provide the actual name of the type, and only returns a

View File

@ -68,9 +68,9 @@ class DeviceType {
std::ostream& operator<<(std::ostream& os, const DeviceType& d);
// Convenient constants that can be passed to a DeviceType constructor
extern const char* const DEVICE_CPU; // "CPU"
extern const char* const DEVICE_GPU; // "GPU"
extern const char* const DEVICE_SYCL; // "SYCL"
TF_EXPORT extern const char* const DEVICE_CPU; // "CPU"
TF_EXPORT extern const char* const DEVICE_GPU; // "GPU"
TF_EXPORT extern const char* const DEVICE_SYCL; // "SYCL"
typedef gtl::InlinedVector<MemoryType, 4> MemoryTypeVector;
typedef gtl::ArraySlice<MemoryType> MemoryTypeSlice;

View File

@ -0,0 +1,548 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifdef INTEL_MKL
#include <vector>
#include <utility>
#include <string>
#include <memory>
#include <unordered_set>
#include <functional>
#include "tensorflow/core/framework/node_def_util.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/node_builder.h"
#include "tensorflow/core/lib/gtl/map_util.h"
#include "tensorflow/core/lib/hash/hash.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/common_runtime/function.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/common_runtime/optimization_registry.h"
#include "tensorflow/core/graph/mkl_layout_pass.h"
#include "tensorflow/core/util/mkl_util.h"
namespace tensorflow {
// This pass implements rewriting of the graph to propagate the Mkl
// layout as an additional output tensor (we will loosely call a
// tensor that carries the Mkl layout an "Mkl tensor" henceforth)
// from every Mkl-supported NN layer.
//
// As an example, consider the Relu layer. The current definition of the Relu
// layer looks like:
//
// O = Relu(A)
//
// Relu has 1 input (A), and 1 output (O).
//
// This rewrite pass will generate a new graph node for Relu
// (new node is called MklRelu) as:
//
// O, O_m = MklRelu(A, A_m)
//
// MklRelu has 2 inputs (A and A_m) and 2 outputs (O and O_m).
// Here the A input is the same as the A input of Relu, and the O output
// is the same as the O output of Relu. O_m is the additional output
// tensor that will be set by MklRelu; it represents the Mkl tensor
// corresponding to O -- in other words, O_m is metadata for O. Likewise,
// A_m is the additional input of MklRelu, and it represents metadata
// for A. MklRelu receives this metadata from the previous layer in
// the graph.
//
// When the previous layer in the graph is an Mkl layer, A_m will
// represent a valid Mkl tensor. But when the previous layer is not an
// Mkl layer, A_m represents a dummy Mkl tensor.
//
// Rewriting rules:
// - Selection of an op for rewriting happens by registering
// an op with this pass. If an op is not registered, then
// it is not rewritten.
// - Number of inputs after rewriting:
// Since for every input Tensorflow tensor, the rewritten
// layer gets Mkl tensor, rewritten op gets 2*N inputs,
// where N is the number of inputs for original op.
// - Number of outputs after rewriting:
// Since for every output Tensorflow tensor, the rewritten
// layer generates Mkl tensor, rewritten op generates 2*N
// outputs, where N is the number of outputs of original op.
// - Ordering of Tensorflow tensors and Mkl tensors:
// Since every op generates twice the number of inputs and
// outputs, one could imagine different ordering among
// Tensorflow tensors and Mkl tensors. E.g., let's assume
// an op 'Conv2D' takes (A, B) as input, then new op
// 'MklConv2D' can take (A, A_m, B, B_m) as input or it
// can also take (A, B, A_m, B_m) as input. Among N inputs
// one can get N! permutations.
//
// So the question is: which one do we follow? Currently,
// we follow an intuitive order where Mkl tensor follows a
// corresponding Tensorflow tensor immediately. In the
// context of above example, it will be: (A, A_m, B, B_m).
// We follow same ordering rule for output tensors.
//
// NOTE: Current rewriting approach rewrites an op to Mkl op without
// any conditions. But in the future, it may be possible to
// consider conditions such as input shapes and sizes to rewrite
// an op.
//
// Graph rewrite algorithm:
// Algorithm: Graph Rewrite
// Input: Graph G, Names of nodes to rewrite and their new nodes
// Output: Modified Graph G' if nodes are modified, G otherwise.
// Start:
// N = Topological_Sort(G) // N is set of nodes in toposort order.
// foreach node n in N
// do
// if (Is_MKL_Layer(n)) // Can this layer accept Mkl layout as input.
// then
// E = set of <incoming edge and its src_output slot> of n
// E' = {} // new set of edges for rewritten node
// foreach <e,s> in E
// do
// E' U {<e,s>} // First copy edge which generates Tensorflow
// // tensor as it is
// m = Source node of edge e
// if Is_Rewritten(m) // Did we rewrite this node in this pass?
// then
// E' U {<m,s+1>} // If yes, then m will generate Mkl tensor
// // as output.
// else
// d = Generate_Dummy_Mkl_Tensor() // If not, generate dummy
// // Mkl tensor.
// E' U {<d,0>} // Dummy Mkl tensor has only 1 output slot.
// fi
// done
// n' = Build_New_Node(G,new_name,E')
// Mark_Rewritten(n') // Mark new node as being rewritten.
// fi
// done
//
// Explanation:
// For graph rewrite, we visit nodes of the graph in the topological
// sort order. With this ordering, we visit nodes in top-to-bottom
// fashion. We need this order because while visiting a node we want
// all of its input nodes (parents) visited (and rewritten if
// applicable). This is because if we need to rewrite a current node
// then all of its input nodes need to be fixed (in other words they
// cannot be removed later.)
//
// While visiting each node, we first check if it is an Mkl layer. If
// it is, then we rewrite that node after constructing new inputs to
// the node. If it is not an Mkl layer, then we do not rewrite the node.
//
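// For instance, given C = Conv2D(A, B) where neither A nor B is produced
// by an Mkl node, this pass creates two dummy Mkl constants (DMT/_0 and
// DMT/_1) and rewrites C as MklConv2D(A, DMT/_0, B, DMT/_1); see the
// Conv2D_Basic unit test below for exactly this case.
//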
class MklLayoutRewritePass : public GraphOptimizationPass {
public:
MklLayoutRewritePass() {
csinfo_.conv2d = "Conv2D";
ninfo_.push_back({csinfo_.conv2d, GetMklOpName(csinfo_.conv2d),
2, CopyAttrsConv2D});
}
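// To rewrite additional ops, more entries would be added in the same way.
// A sketch for Relu (hypothetical here; only Conv2D is registered so far,
// and CopyAttrsRelu would be an attribute-copying helper analogous to
// CopyAttrsConv2D):
//
//   csinfo_.relu = "Relu";
//   ninfo_.push_back({csinfo_.relu, GetMklOpName(csinfo_.relu),
//                     1, CopyAttrsRelu});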
// Standard interface to run pass
Status Run(const GraphOptimizationPassOptions& options);
// Helper function that does most of the heavy lifting for rewriting
// Mkl nodes to propagate Mkl tensor as additional output
//
// Extracts common functionality between Run public interface and
// test interface.
//
// @return true, if and only if graph is mutated; false otherwise.
bool RunPass(std::unique_ptr<Graph>* g);
private:
/// Structure to specify name of original op, its new name after rewrite,
/// the number of inputs to the original op, and the function to be used
/// to copy attributes for the op
typedef struct {
string name; // Original name of the op in the graph
string newname; // New name of op in the graph
int numins; // Number of inputs to the original op
std::function<void(Node*, NodeBuilder*)> copyattrs; // Function handler
// to copy attributes from old node to new node.
} NodesInfo;
/// Structure to store all constant strings
struct {
string relu;
string relugrad;
string conv2d;
} csinfo_;
/// Maintain info about nodes to rewrite
std::vector<NodesInfo> ninfo_;
/// Hash table to maintain nodes visited in the graph.
std::unordered_set<const Node*> visited_nodes_;
private:
// Predicate to check if we rewrote node 'n'
//
// If we rewrote the node, then the rewritten node will produce
// Mkl tensor as output. If we did not rewrite the node, then
// we need to insert dummy Mkl node on the input side.
//
// Returns true if node is rewritten, false otherwise.
inline bool IsRewrittenNode(Node* n) const {
return visited_nodes_.find(n) != visited_nodes_.end();
}
// Mark the node as rewritten
inline void MarkRewrittenNode(Node* n) {
visited_nodes_.insert(n);
}
// Get the name of Mkl op from original TensorFlow op
// We prefix 'Mkl' to the original op to get Mkl op.
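// For example, GetMklOpName("Conv2D") returns "MklConv2D".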
// TODO(nhasabni) We should move this to mkl_util.h.
inline string GetMklOpName(const string& name) const {
// Prefix that we add to Tensorflow op name to construct Mkl op name.
const char* const kMklOpPrefix = "Mkl";
return string(kMklOpPrefix) + name;
}
// Set up new inputs using old inputs 'inputs' for the rewritten node in 'nb'
// in graph 'g'. The original node is passed in 'orign'.
//
// For details, refer to 'Number of inputs after rewriting' section in the
// documentation above.
//
// Returns Status::OK() if setting up inputs is successful, otherwise
// returns appropriate status code.
Status SetUpInputs(std::unique_ptr<Graph>* g,
const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
NodeBuilder* nb, Node* orign);
// Rewrite Node 'n' in graph 'g' with rewrite information specified in 'ni'
// Returns Status::OK() if node rewrite is successful, otherwise returns
// appropriate error status
Status RewriteNode(std::unique_ptr<Graph>* g, Node* n, const NodesInfo& ni);
// Functions specific to operators to copy attributes
// We need an operator-specific function to copy attributes because the
// framework does not provide any generic function for it.
static void CopyAttrsConv2D(Node* orign, NodeBuilder* nb);
// Generate a graph node in graph 'g' representing a dummy Mkl tensor node,
// using node for original node 'orign' and return it in '*out'.
// TODO(nhasabni) We should move this to mkl_util.h
void GetDummyMklTensorNode(std::unique_ptr<Graph>* g, Node** out,
Node* orign);
};
// We register Mkl rewrite pass for phase 1 in pre-placement group.
// Do not change the ordering of the Mkl passes.
REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 1,
MklLayoutRewritePass);
static void FillInputs(const Node* n,
gtl::InlinedVector<Node*, 4>* control_edges,
gtl::InlinedVector<std::pair<Node*, int>, 4>* in) {
DCHECK_EQ(in->size(), n->num_inputs());
control_edges->clear();
for (const Edge* e : n->in_edges()) {
if (e->IsControlEdge()) {
control_edges->push_back(e->src());
} else {
(*in)[e->dst_input()] = std::make_pair(e->src(), e->src_output());
}
}
std::sort(control_edges->begin(), control_edges->end());
if (n->op_def().is_commutative()) {
// For commutative inputs, we sort the input by the input Node*
// to get a canonical ordering (so that add(a,b) and add(b, a) will
// hash to the same value if is_commutative is true for 'add').
std::sort(in->begin(), in->end());
}
}
//////////////////////////////////////////////////////////////////////////
// Macros to build a new node with different numbers of inputs.
// We do it this way because all the inputs need to be specified when
// building a node. See the comment at core/graph/node_builder.h, lines 85-86.
#define SETUP_INPUTS1(nb, op1) do { \
nb->Input(op1.node, op1.index); \
}while(0)
#define SETUP_INPUTS2(nb, op1, op2) do { \
nb->Input(op1.node, op1.index); \
nb->Input(op2.node, op2.index); \
}while(0)
#define SETUP_INPUTS3(nb, op1, op2, op3) do { \
nb->Input(op1.node, op1.index); \
nb->Input(op2.node, op2.index); \
nb->Input(op3.node, op3.index); \
}while(0)
#define SETUP_INPUTS4(nb, op1, op2, op3, op4) do { \
nb->Input(op1.node, op1.index); \
nb->Input(op2.node, op2.index); \
nb->Input(op3.node, op3.index); \
nb->Input(op4.node, op4.index); \
}while(0)
#define SETUP_INPUTS5(nb, op1, op2, op3, op4, op5) do {\
nb->Input(op1.node, op1.index); \
nb->Input(op2.node, op2.index); \
nb->Input(op3.node, op3.index); \
nb->Input(op4.node, op4.index); \
nb->Input(op5.node, op5.index); \
}while(0)
// TODO(nhasabni) We should move this to mkl_util.h.
void MklLayoutRewritePass::GetDummyMklTensorNode(
std::unique_ptr<Graph>* g, Node** out, Node* orign) {
// We use a tensor of shape {8} and value 0,0,0,0,0,0,0,0 to represent
// dummy Mkl tensor. 8 = 2*size_t.
const DataType dt = DataTypeToEnum<uint8>::v();
TensorProto proto;
proto.set_dtype(dt);
uint8 zero[8] = {0, 0, 0, 0, 0, 0, 0, 0};
proto.set_tensor_content(const_cast<const void*>(
static_cast<void*>(&zero)), 8);
TensorShape dummy_shape({8});
dummy_shape.AsProto(proto.mutable_tensor_shape());
TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
.Attr("value", proto)
.Attr("dtype", dt)
.Device(orign->def().device()) // We place this node on same
// device as device of original
// node.
.Finalize(&**g, out));
}
Status MklLayoutRewritePass::SetUpInputs(std::unique_ptr<Graph>* g,
const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
NodeBuilder* nb, Node* orign) {
std::vector<NodeBuilder::NodeOut> new_inputs;
// 1. Let's setup inputs for the new node.
for (int i = 0; i < inputs.size(); i++) {
Node* n = inputs[i].first;
// First let's copy original TF tensor input as it is.
new_inputs.push_back(NodeBuilder::NodeOut(n, inputs[i].second));
// Second, let's add an edge to propagate Mkl tensors from input Mkl layers,
// or generate a dummy Mkl tensor representing the not-an-mkl-tensor case.
if (IsRewrittenNode(n)) {
// If we have visited this node and rewritten it, then it will generate
// an edge that will receive Mkl tensor from a node.
// First, let's assert that this op is Mkl layer.
DataType T;
TF_CHECK_OK(GetNodeAttr(n->def(), "T", &T));
// If this op has been rewritten, then its name must be the same as the
// corresponding Mkl op name.
CHECK_EQ(mkl_layer_registry::IsMklLayer(n->type_string()), true);
// src slot number for Mkl tensor would be the one next to TF tensor
// slot number.
new_inputs.push_back(NodeBuilder::NodeOut(n, inputs[i].second+1));
} else {
// If we have not visited the node and rewritten it, then we need
// to create a dummy node that will feed a non-Mkl tensor to this node.
// DummyMklTensor node has no input and generates only 1 output
// (dummy Mkl tensor) as output slot number 0.
Node* dmt = nullptr;
GetDummyMklTensorNode(g, &dmt, orign);
CHECK_NOTNULL(dmt);
new_inputs.push_back(NodeBuilder::NodeOut(dmt, 0));
}
}
// The total number of inputs to the new node _must_ be 2 times the number
// of inputs to the original node: N original Tensorflow tensors and
// N Mkl tensors, one corresponding to each Tensorflow tensor.
CHECK_EQ(new_inputs.size(), inputs.size() * 2);
// 2. Let's build the node with new inputs.
switch (new_inputs.size()) {
case 0: // We don't need to do anything for no input as we have
// already built node.
break;
case 1: SETUP_INPUTS1(nb, new_inputs[0]); break;
case 2: SETUP_INPUTS2(nb, new_inputs[0],
new_inputs[1]); break;
case 3: SETUP_INPUTS3(nb, new_inputs[0],
new_inputs[1],
new_inputs[2]); break;
case 4: SETUP_INPUTS4(nb, new_inputs[0],
new_inputs[1],
new_inputs[2],
new_inputs[3]); break;
case 5: SETUP_INPUTS5(nb, new_inputs[0],
new_inputs[1],
new_inputs[2],
new_inputs[3],
new_inputs[4]); break;
default: {
return Status(error::Code::UNIMPLEMENTED,
"Could not create node with given number of inputs");
}
}
return Status::OK();
}
void MklLayoutRewritePass::CopyAttrsConv2D(Node* orign, NodeBuilder* nb) {
DataType T;
string data_format;
string padding;
std::vector<int32> strides;
bool use_cudnn_on_gpu;
// Get all attributes from old node.
TF_CHECK_OK(GetNodeAttr(orign->def(), "T", &T));
TF_CHECK_OK(GetNodeAttr(orign->def(), "strides", &strides));
TF_CHECK_OK(GetNodeAttr(orign->def(), "padding", &padding));
TF_CHECK_OK(GetNodeAttr(orign->def(), "data_format", &data_format));
TF_CHECK_OK(GetNodeAttr(orign->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
// Add attributes to new node.
nb->Attr("T", T);
nb->Attr("strides", strides);
nb->Attr("padding", padding);
nb->Attr("data_format", data_format);
nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
}
Status MklLayoutRewritePass::RewriteNode(
std::unique_ptr<Graph>* g, Node* orign, const NodesInfo& ni) {
VLOG(1) << "MKLLayoutRewritePass: Original node:" << orign->DebugString();
// Get all inputs.
const int num = orign->num_inputs();
CHECK_EQ(num, ni.numins);
gtl::InlinedVector<Node*, 4> control_edges;
gtl::InlinedVector<std::pair<Node*, int>, 4> inputs(num);
FillInputs(orign, &control_edges, &inputs);
// Build new node. We use same name as original node, but change the op name.
NodeBuilder nb(orign->name().c_str(), ni.newname.c_str());
// Copy user-specified device assigned to original node to new node.
nb.Device(orign->def().device());
// Set up new inputs to the rewritten node.
Status s = SetUpInputs(g, inputs, &nb, orign);
if (s != Status::OK()) {
return s;
}
// Copy attributes from original node to new node.
ni.copyattrs(orign, &nb);
// Set the Mkl layer label for this op.
nb.Attr("_kernel", mkl_layer_registry::kMklLayerLabel);
Node* newn = nullptr;
// Finalize graph and get new node.
TF_CHECK_OK(nb.Finalize(&**g, &newn));
CHECK_NOTNULL(newn);
// Incoming edges from 'orign' node to new 'newn' node are already copied
// in BuildNode. Copy outgoing edges from 'orign' node to new 'newn' node.
for (const Edge* e : orign->out_edges()) {
(*g)->AddEdge(newn, e->src_output(), e->dst(), e->dst_input());
}
// Copy the runtime device assigned to the original node to the new node.
newn->set_assigned_device_name(orign->assigned_device_name());
// Delete original node and mark new node as rewritten.
(*g)->RemoveNode(orign);
MarkRewrittenNode(newn);
VLOG(1) << "MKLLayoutRewritePass: New node:" << newn->DebugString();
return Status::OK();
}
bool MklLayoutRewritePass::RunPass(
std::unique_ptr<Graph>* g) {
bool result = false;
CHECK_NOTNULL(g);
DumpGraph("Before running MklLayoutRewritePass", &**g);
std::vector<Node*> order;
GetReversePostOrder(**g, &order); // This will give us topological sort.
for (Node* n : order) {
if (!n->IsOp()) {
continue;
}
for (const NodesInfo& ni : ninfo_) {
DataType dtype = DT_INVALID;
// An op needs to have data type (T) attribute and its corresponding
// Mkl op name must be supported.
if (GetNodeAttr(n->def(), "T", &dtype) == Status::OK() &&
mkl_layer_registry::IsMklLayer(GetMklOpName(n->type_string())) &&
n->type_string().compare(ni.name) == 0) {
string node_name = n->name();
string op_name = n->type_string();
VLOG(1) << "MKLLayoutRewritePass: Scheduled node " << node_name
<< " with op " << op_name << " for rewrite using"
<< " layout optimization.";
if (RewriteNode(g, n, ni) == Status::OK()) {
VLOG(1) << "MKLLayoutRewritePass: Successfully rewrote node "
<< node_name << " with op " << op_name
<< " for Mkl layout optimization.";
result = true;
break; // We found matching nodesinfo so no need to search next.
}
}
}
}
DumpGraph("After running MklLayoutRewritePass", &**g);
return result;
}
///////////////////////////////////////////////////////////////////////////////
// Run function for the pass
///////////////////////////////////////////////////////////////////////////////
bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g) {
return MklLayoutRewritePass().RunPass(g);
}
Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) {
if (options.graph == nullptr) {
return Status::OK();
}
// Get the ownership of graph
std::unique_ptr<Graph>* g = std::move(options.graph);
RunPass(g);
// Return the ownership of graph back
options.graph->reset(g->release());
return Status::OK();
}
} // namespace tensorflow
#endif

View File

@ -0,0 +1,36 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// A graph pass that rewrites graph for propagating MKL layout as a tensor
#ifndef TENSORFLOW_GRAPH_MKL_LAYOUT_PASS_H_
#define TENSORFLOW_GRAPH_MKL_LAYOUT_PASS_H_
#ifdef INTEL_MKL
#include <sys/types.h>
#include <memory>
#include "tensorflow/core/graph/graph.h"
namespace tensorflow {
// Interface to invoke the pass for unit test
//
// Returns true if and only if 'g' is mutated.
extern bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g);
} // namespace tensorflow
#endif
#endif // TENSORFLOW_GRAPH_MKL_LAYOUT_PASS_H_

View File

@ -0,0 +1,199 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifdef INTEL_MKL
#include "tensorflow/core/graph/mkl_layout_pass.h"
#include "tensorflow/core/util/mkl_util.h"
#include <vector>
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/graph_constructor.h"
#include "tensorflow/core/graph/testlib.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/random/simple_philox.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"
namespace tensorflow {
namespace {
static void InitGraph(const string& s, Graph* graph) {
GraphDef graph_def;
auto parser = protobuf::TextFormat::Parser();
// parser.AllowRelaxedWhitespace(true);
CHECK(parser.MergeFromString(s, &graph_def)) << s;
GraphConstructorOptions opts;
TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, graph));
}
class MklLayoutPassTest : public ::testing::Test {
public:
MklLayoutPassTest() : graph_(OpRegistry::Global()) {}
void InitGraph(const string& s) {
::tensorflow::InitGraph(s, &graph_);
original_ = CanonicalGraphString(&graph_);
}
static bool IncludeNode(const Node* n) { return n->IsOp(); }
static string EdgeId(const Node* n, int index) {
if (index == 0) {
return n->name();
} else if (index == Graph::kControlSlot) {
return strings::StrCat(n->name(), ":control");
} else {
return strings::StrCat(n->name(), ":", index);
}
}
string CanonicalGraphString(Graph* g) {
std::vector<string> nodes;
std::vector<string> edges;
for (const Node* n : g->nodes()) {
if (IncludeNode(n)) {
nodes.push_back(strings::StrCat(n->name(), "(", n->type_string(), ")"));
}
}
for (const Edge* e : g->edges()) {
if (IncludeNode(e->src()) && IncludeNode(e->dst())) {
edges.push_back(strings::StrCat(EdgeId(e->src(), e->src_output()), "->",
EdgeId(e->dst(), e->dst_input())));
}
}
// Canonicalize
std::sort(nodes.begin(), nodes.end());
std::sort(edges.begin(), edges.end());
return strings::StrCat(str_util::Join(nodes, ";"), "|",
str_util::Join(edges, ";"));
}
string DoMklLayoutOptimizationPass() {
string before = CanonicalGraphString(&graph_);
LOG(ERROR) << "Before MKL layout rewrite pass: " << before;
std::unique_ptr<Graph>* ug = new std::unique_ptr<Graph>(&graph_);
RunMklLayoutRewritePass(ug);
string result = CanonicalGraphString(&graph_);
LOG(ERROR) << "After MKL layout rewrite pass: " << result;
return result;
}
const string& OriginalGraph() const { return original_; }
Graph graph_;
string original_;
};
REGISTER_OP("Input").Output("o: float").SetIsStateful();
// Single Conv2D Op; No Mkl layer on the input and on the output.
// We will generate dummy Mkl tensor as 2nd input of Conv2D.
TEST_F(MklLayoutPassTest, Conv2D_Basic) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'C' op: 'Conv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['B', 'C'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(MklConv2D);D(Mul);DMT/_0(Const);DMT/_1(Const)|"
"A->C;B->C:2;B->D;C->D:1;DMT/_0->C:1;DMT/_1->C:3");
}
// 2 Conv2D Ops in sequence. Both should get transformed and 1st Conv2D will
// have 2 outputs, both of which will be inputs to next Conv2D.
TEST_F(MklLayoutPassTest, Conv2D_Positive1) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'C' op: 'Conv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Conv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'C']}"
"node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'D'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(MklConv2D);D(MklConv2D);DMT/_0(Const);"
"DMT/_1(Const);DMT/_2(Const);E(Mul)|A->C;A->D;B->C:2;C->D:2;C->E;"
"C:1->D:3;D->E:1;DMT/_0->C:1;DMT/_1->C:3;DMT/_2->D:1");
}
static void BM_MklLayoutRewritePass(int iters, int op_nodes) {
testing::StopTiming();
string s;
for (int in = 0; in < 10; in++) {
s += strings::Printf("node { name: 'in%04d' op: 'Input'}", in);
}
random::PhiloxRandom philox(301, 17);
random::SimplePhilox rnd(&philox);
for (int op = 0; op < op_nodes; op++) {
s += strings::Printf(
"node { name: 'op%04d' op: 'Mul' attr { key: 'T' value { "
"type: DT_FLOAT } } input: ['in%04d', 'in%04d' ] }",
op, rnd.Uniform(10), rnd.Uniform(10));
}
bool first = true;
while (iters > 0) {
Graph* graph = new Graph(OpRegistry::Global());
InitGraph(s, graph);
int N = graph->num_node_ids();
if (first) {
testing::SetLabel(strings::StrCat("Per graph node. Nodes: ", N));
first = false;
}
{
testing::StartTiming();
std::unique_ptr<Graph> ug(graph);
RunMklLayoutRewritePass(&ug);
testing::StopTiming();
}
iters -= N; // Our benchmark units are individual graph nodes,
// not whole graphs
// delete graph;
}
}
BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
} // namespace
} // namespace tensorflow
#endif /* INTEL_MKL */

View File

@ -22,6 +22,8 @@ limitations under the License.
#include <vector>
#include <queue>
#include <utility>
#include <string>
#include <memory>
#include "tensorflow/core/graph/mkl_optimizer_merge.h"
@ -33,6 +35,8 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/common_runtime/function.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/common_runtime/optimization_registry.h"
namespace tensorflow {
@ -58,8 +62,8 @@ static size_t kNodeMergeContextMaxDepth = 10;
class NodeMergeRewritePass : public GraphOptimizationPass {
public:
NodeMergeRewritePass() {
csinfo_.conv2d = "Conv2D";
csinfo_.conv2dwithbias = "Conv2DWithBias";
csinfo_.conv2d = "MklConv2D";
csinfo_.conv2dwithbias = "MklConv2DWithBias";
csinfo_.conv2dwithbiasbackpropbias = "Conv2DWithBiasBackpropBias";
csinfo_.biasadd = "BiasAdd";
csinfo_.matmul = "MatMul";
@ -72,6 +76,9 @@ class NodeMergeRewritePass : public GraphOptimizationPass {
// maxhops in backward data-flow graph. Since input of forward nodes
// (Conv2D) directly goes to backward nodes, we do not expect the
// hop-distance to be more than a few nodes.
// TODO(nhasabni) Temporarily disabling rewrite of BiasAddGrad.
// Will enable it once we support Conv2DWithBiasBackpropBias op.
#if 0
rinfo_.push_back({csinfo_.biasaddgrad, csinfo_.conv2dwithbiasbackpropbias,
{csinfo_.conv2dwithbias, kNodeMergeContextMaxDepth}});
rinfo_.push_back({csinfo_.biasaddgrad, csinfo_.conv2dwithbiasbackpropbias,
@ -80,6 +87,7 @@ class NodeMergeRewritePass : public GraphOptimizationPass {
// because we do not have a separate Op for MatMulwithBias.
rinfo_.push_back({csinfo_.biasaddgrad, csinfo_.biasaddgrad,
{csinfo_.matmul, kNodeMergeContextMaxDepth}});
#endif
}
// Standard interface to run optimization pass
@ -182,10 +190,16 @@ class NodeMergeRewritePass : public GraphOptimizationPass {
// @return Matching rewriteinfo in case a match is found; null otherwise.
const RewriteInfo* FindMatchingRewriteInfo(const Node* n,
const Node** fwdn) const;
// Generate a graph node in graph 'g' representing a dummy Mkl tensor node,
// and return it in '*out'.
// TODO(nhasabni) We should move this to mkl_util.h
void GetDummyMklTensorNode(std::unique_ptr<Graph>* g, Node** out);
};
/// We register merge optimizer for phase 1 and MKLToTF insertion for phase 2.
REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 1,
// We register merge optimizer for phase 2 in pre-placement group.
// Do not change the ordering of the Mkl passes.
REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 2,
NodeMergeRewritePass);
static void FillInputs(const Node* n,
@ -219,8 +233,6 @@ Node* NodeMergeRewritePass::FindNodeForMerge(const Node* a) const {
}
}
VLOG(1) << "FindNodeForMerge: " << a->type_string();
for (const MergeInfo* mi : matching_mi) {
const int N_in = a->num_inputs();
if (mi->op >= N_in) {
@ -240,8 +252,6 @@ Node* NodeMergeRewritePass::FindNodeForMerge(const Node* a) const {
continue;
}
VLOG(1) << " FindNode: " << b->type_string();
gtl::InlinedVector<Node*, 4> b_control_edges;
gtl::InlinedVector<std::pair<Node*, int>, 4> b_in(N_in);
FillInputs(b, &b_control_edges, &b_in);
@ -258,6 +268,22 @@ Node* NodeMergeRewritePass::FindNodeForMerge(const Node* a) const {
return nullptr;
}
void NodeMergeRewritePass::GetDummyMklTensorNode(
std::unique_ptr<Graph>* g, Node** out) {
const DataType dt = DataTypeToEnum<uint8>::v();
TensorProto proto;
proto.set_dtype(dt);
uint8 zero[8] = {0, 0, 0, 0, 0, 0, 0, 0};
proto.set_tensor_content(const_cast<const void*>(
static_cast<void*>(&zero)), 8);
TensorShape dummy_shape({8});
dummy_shape.AsProto(proto.mutable_tensor_shape());
TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
.Attr("value", proto)
.Attr("dtype", dt)
.Finalize(&**g, out));
}
Status NodeMergeRewritePass::MergeNode(std::unique_ptr<Graph>* g,
Node* succ, Node* pred) {
CHECK_NOTNULL(succ);
@ -271,7 +297,6 @@ Status NodeMergeRewritePass::MergeNode(std::unique_ptr<Graph>* g,
std::vector<int32> strides;
string data_format_pred, data_format_succ;
bool use_cudnn_on_gnu;
int groups = 1;
TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
TF_CHECK_OK(GetNodeAttr(pred->def(), "padding", &padding));
@ -280,25 +305,28 @@ Status NodeMergeRewritePass::MergeNode(std::unique_ptr<Graph>* g,
TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
TF_CHECK_OK(GetNodeAttr(pred->def(), "use_cudnn_on_gpu",
&use_cudnn_on_gnu));
// Groups attribute may not be there on the input node. So we do not
// check for error in GetNodeAttr call.
GetNodeAttr(pred->def(), "groups", &groups);
// We check to ensure that the data formats of both succ and pred are the same.
// We expect them to be the same, so we could enforce this with an assert.
// But an assert can be too strict, so we enforce this as a check.
// If the check fails, then we do not merge the two nodes.
// We also do the same check for devices.
if (data_format_pred != data_format_succ ||
T_pred != T_succ) {
T_pred != T_succ ||
pred->assigned_device_name() != succ->assigned_device_name() ||
pred->def().device() != succ->def().device()) {
return Status(error::Code::INVALID_ARGUMENT,
"data_format or T attribute of Conv2D and BiasAdd"
"do not match. Will skip node merge optimization");
"data_format or T attribute or devices of Conv2D and "
"BiasAdd do not match. Will skip node merge optimization");
}
// 2. Get inputs from both the nodes.
// Find the 2 inputs from the conv and the bias from the add Bias.
Node* oper1 = nullptr;
Node* oper1_mkl = nullptr; // Mkl tensor corresponding to oper1
Node* oper2 = nullptr;
Node* oper2_mkl = nullptr; // Mkl tensor corresponding to oper2
Node* oper3 = nullptr;
Node* oper3_mkl = nullptr; // Mkl tensor corresponding to oper3
const int succ_num = succ->num_inputs();
gtl::InlinedVector<Node*, 4> succ_control_edges;
@ -326,24 +354,35 @@ Status NodeMergeRewritePass::MergeNode(std::unique_ptr<Graph>* g,
}
}
// Get operand 0, 1 of conv2D
oper1 = pred_in[0].first;
oper2 = pred_in[1].first;
// Get operand 0, 1 of conv2D and their Mkl tensors.
CHECK_EQ(pred->in_edges().size(), 4); // MklConv2D must have 4 inputs.
oper1 = pred_in[0].first;
oper1_mkl = pred_in[1].first;
oper2 = pred_in[2].first;
oper2_mkl = pred_in[3].first;
// Get operand 1 of add_bias
oper3 = succ_in[1].first;
// BiasAdd must have 2 inputs: Conv, bias
CHECK_EQ(succ->in_edges().size(), 2);
oper3 = succ_in[1].first;
GetDummyMklTensorNode(g, &oper3_mkl); // Get dummy Mkl tensor node
// as BiasAdd does not have Mkl tensor as input.
CHECK_NOTNULL(oper3_mkl);
Node* ret;
// We will use the node name of BiasAdd as the name of new node
TF_CHECK_OK(NodeBuilder(succ->name(), csinfo_.conv2dwithbias)
.Input(oper1)
.Input(oper1_mkl)
.Input(oper2)
.Input(oper2_mkl)
.Input(oper3)
.Input(oper3_mkl)
.Attr("T", T_pred)
.Attr("strides", strides)
.Attr("padding", padding)
.Attr("data_format", data_format_pred)
.Attr("use_cudnn_on_gpu", use_cudnn_on_gnu)
.Attr("groups", groups)
.Device(succ->def().device())
.Finalize(&**g, &ret));
CHECK_NOTNULL(ret);
@ -352,6 +391,15 @@ Status NodeMergeRewritePass::MergeNode(std::unique_ptr<Graph>* g,
(*g)->AddEdge(ret, e->src_output(), e->dst(), e->dst_input());
}
// Copy device assigned to old node to new node.
// It's ok to use pred or succ as we have enforced a check that
// both have same device assigned.
ret->set_assigned_device_name(pred->assigned_device_name());
VLOG(1) << "NodeMergeRewritePass: Merged old node:" << pred->DebugString()
<< ", and node: " << succ->DebugString() << ", into node:"
<< ret->DebugString();
(*g)->RemoveNode(succ);
(*g)->RemoveNode(pred);
@ -369,13 +417,14 @@ Status NodeMergeRewritePass::RewriteNode(std::unique_ptr<Graph>* g, Node *n) {
const Node* fwdn = nullptr;
const RewriteInfo* ri = FindMatchingRewriteInfo(n, &fwdn);
if (ri == nullptr || fwdn == nullptr) {
VLOG(1) << "Rewriteinfo not found for: " << n->type_string();
VLOG(2) << "NodeMergeRewritePass: Rewriteinfo not found for: "
<< n->type_string();
return Status(error::Code::INVALID_ARGUMENT,
"Rewrite info not found for the node."
"Will skip node rewrite optimization");
}
VLOG(1) << "Rewrite called for: " << n->type_string();
VLOG(1) << "NodeMergeRewritePass: Rewrite called for: " << n->type_string();
if (n->type_string() == csinfo_.biasaddgrad &&
ri->node == csinfo_.biasaddgrad &&
@ -407,6 +456,7 @@ Status NodeMergeRewritePass::RewriteNode(std::unique_ptr<Graph>* g, Node *n) {
.Attr("T", T)
.Attr("data_format", data_format)
.Attr("strides", strides)
.Device(n->def().device())
.Finalize(&**g, &ret));
} else {
CHECK_EQ(ri->rewrite, csinfo_.biasaddgrad);
@ -414,6 +464,7 @@ Status NodeMergeRewritePass::RewriteNode(std::unique_ptr<Graph>* g, Node *n) {
.Input(op)
.Attr("T", T)
.Attr("data_format", data_format)
.Device(n->def().device())
.Finalize(&**g, &ret));
}
@ -424,7 +475,11 @@ Status NodeMergeRewritePass::RewriteNode(std::unique_ptr<Graph>* g, Node *n) {
(*g)->AddEdge(ret, e->src_output(), e->dst(), e->dst_input());
}
VLOG(1) << "Rewrite node: " << n->type_string() << " successful";
// Copy device assigned to old node to new node.
ret->set_assigned_device_name(n->assigned_device_name());
VLOG(1) << "MKLOptimizerMergePass: Rewrote old node:" << n->DebugString()
<< ", into node:" << ret->DebugString();
(*g)->RemoveNode(n);
return Status::OK();
@ -450,7 +505,8 @@ NodeMergeRewritePass::FindMatchingRewriteInfo(const Node* n,
}
}
VLOG(1) << "Searching graph for: " << n->type_string() << " in backwards.";
VLOG(1) << "NodeMergeRewritePass: Searching graph for: "
<< n->type_string() << " in backwards.";
// Now we will check for forward op name for rewrite info in data
// flow graph. Get the max hops we should search for the fwd node
@ -473,7 +529,8 @@ NodeMergeRewritePass::FindMatchingRewriteInfo(const Node* n,
curr_depth = curr_pair.second;
CHECK_NOTNULL(curr_node);
VLOG(1) << "Visiting node: " << curr_node->type_string()
VLOG(1) << "NodeMergeRewritePass: Visiting node: "
<< curr_node->type_string()
<< " at depth: " << curr_depth
<< " for node: " << n->type_string();
@ -528,17 +585,16 @@ bool NodeMergeRewritePass::RunPass(std::unique_ptr<Graph>* g) {
std::vector<std::pair<Node*, Node*>> nodes_to_be_merged;
std::vector<Node*> nodes_to_be_rewritten;
VLOG(1) << "Running NodeMerge Optimization";
for (Node* n : order) {
if (!n->IsOp()) continue;
Node* n1 = nullptr;
if ((n1 = FindNodeForMerge(n)) != nullptr) {
VLOG(1) << "Scheduled nodes " << n->name() << " and "
<< n1->name() << " for merging";
VLOG(1) << "NodeMergeRewritePass: Scheduled nodes "
<< n->name() << " and " << n1->name() << " for merging";
nodes_to_be_merged.push_back(std::make_pair(n, n1));
} else if (IsApplicableRewriteNode(n)) {
VLOG(1) << "Scheduled node " << n->name() << " for rewrite";
VLOG(1) << "NodeMergeRewritePass: Scheduled node " << n->name()
<< " for rewrite";
nodes_to_be_rewritten.push_back(n);
}
}
@ -549,7 +605,8 @@ bool NodeMergeRewritePass::RunPass(std::unique_ptr<Graph>* g) {
string n1_name = i.first->name();
string n2_name = i.second->name();
if (MergeNode(g, i.first, i.second) == Status::OK()) {
VLOG(1) << "Merged nodes " << n1_name << " and " << n2_name;
VLOG(1) << "NodeMergeRewritePass: Merged nodes " << n1_name
<< " and " << n2_name;
result = true;
}
}
@ -559,7 +616,8 @@ bool NodeMergeRewritePass::RunPass(std::unique_ptr<Graph>* g) {
for (Node* i : nodes_to_be_rewritten) {
string name = i->name();
if (RewriteNode(g, i) == Status::OK()) {
VLOG(1) << "Rewrite node: " << name << " successful.";
VLOG(1) << "NodeMergeRewritePass: Rewrite node: "
<< name << " successful.";
result = true;
}
}
@ -574,8 +632,6 @@ bool OptimizeNodeMerge(std::unique_ptr<Graph>* g) {
}
Status NodeMergeRewritePass::Run(const GraphOptimizationPassOptions& options) {
// Currently checking only for two cases - Conv2D+Bias and Matmul+Bias.
// It is possible to extend it to other operators in future.
if (options.graph == nullptr) {
return Status::OK();
}

View File

@ -21,20 +21,14 @@ limitations under the License.
#ifdef INTEL_MKL
#include <sys/types.h>
#include <vector>
#include <string>
#include <memory>
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/common_runtime/optimization_registry.h"
namespace tensorflow {
// Interface to invoke the pass for unit test
//
// Returns true if and only if 'g' is mutated.
extern bool OptimizeNodeMerge(std::unique_ptr<Graph>* g);
} // namespace tensorflow
#endif // INTEL_MKL

View File

@ -105,6 +105,7 @@ class OptimizerMergeTest : public ::testing::Test {
};
REGISTER_OP("Input").Output("o: float").SetIsStateful();
REGISTER_OP("MklInput").Output("o: uint8").SetIsStateful();
TEST_F(OptimizerMergeTest, Basic) {
InitGraph(
@ -121,8 +122,38 @@ TEST_F(OptimizerMergeTest, Basic) {
// Test set 1: Conv2D + AddBias
// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y)
// C=MklConv2D(A,M,B,N); E=BiasAdd(C,D); Z=Sub(E,Y)
TEST_F(OptimizerMergeTest, Conv2DWithBias_Positive) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'M' op: 'MklInput'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'N' op: 'MklInput'}"
"node { name: 'C' op: 'MklConv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'M', 'B', 'N']}"
"node { name: 'D' op: 'Input'}"
"node { name: 'E' op: 'BiasAdd'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['C', 'D'] }"
"node { name: 'Y' op: 'Input'}"
"node { name: 'Z' op: 'Sub'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['E', 'Y']}");
EXPECT_EQ(DoNodeMerge(),
"A(Input);B(Input);D(Input);DMT/_0(Const);E(MklConv2DWithBias);"
"M(MklInput);N(MklInput);Y(Input);Z(Sub)|A->E;B->E:2;D->E:4;"
"DMT/_0->E:5;E->Z;M->E:1;N->E:3;Y->Z:1");
}
// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y);
// We do not merge in this case as op is Conv2D and not MklConv2D.
TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_NoMklConv2D) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
@ -143,63 +174,69 @@ TEST_F(OptimizerMergeTest, Conv2DWithBias_Positive) {
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['E', 'Y']}");
EXPECT_EQ(DoNodeMerge(),
"A(Input);B(Input);D(Input);E(Conv2DWithBias);Y(Input);Z(Sub)|"
"A->E;B->E:1;D->E:2;E->Z;Y->Z:1");
"A(Input);B(Input);C(Conv2D);D(Input);E(BiasAdd);Y(Input);Z(Sub)|"
"A->C;B->C:1;C->E;D->E:1;E->Z;Y->Z:1");
}
// Graph contains only Conv2D, no AddBias.
// Graph contains only MklConv2D, no AddBias.
TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_NoAddBias) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'M' op: 'MklInput'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'C' op: 'Conv2D'"
"node { name: 'N' op: 'MklInput'}"
"node { name: 'C' op: 'MklConv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'B']}");
" input: ['A', 'M', 'B', 'N']}");
EXPECT_EQ(DoNodeMerge(),
"A(Input);B(Input);C(Conv2D)|"
"A->C;B->C:1");
"A(Input);B(Input);C(MklConv2D);M(MklInput);N(MklInput)|"
"A->C;B->C:2;M->C:1;N->C:3");
}
// Conv2D output does not go to BiasAdd.
// MklConv2D output does not go to BiasAdd.
TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_Dataflow1) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'M' op: 'MklInput'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'C' op: 'Conv2D'"
"node { name: 'N' op: 'MklInput'}"
"node { name: 'C' op: 'MklConv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'B']}"
" input: ['A', 'M', 'B', 'N']}"
"node { name: 'D' op: 'Input'}"
"node { name: 'E' op: 'Input'}"
"node { name: 'F' op: 'BiasAdd'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['D', 'E'] }"); // Output of Conv2D does not go to BiasAdd.
" input: ['D', 'E'] }"); // Output of MklConv2D does not go to BiasAdd.
EXPECT_EQ(DoNodeMerge(),
"A(Input);B(Input);C(Conv2D);D(Input);E(Input);F(BiasAdd)|"
"A->C;B->C:1;D->F;E->F:1");
"A(Input);B(Input);C(MklConv2D);D(Input);E(Input);F(BiasAdd);"
"M(MklInput);N(MklInput)|A->C;B->C:2;D->F;E->F:1;M->C:1;N->C:3");
}
// Conv2D has two outgoing edges: BiasAdd and some other dummy node (Add).
// MklConv2D has two outgoing edges: BiasAdd and some other dummy node (Add).
// Merge should not be done in such case.
TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_Dataflow2) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'M' op: 'MklInput'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'C' op: 'Conv2D'"
"node { name: 'N' op: 'MklInput'}"
"node { name: 'C' op: 'MklConv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'B']}"
" input: ['A', 'M', 'B', 'N']}"
"node { name: 'D' op: 'Input'}"
"node { name: 'E' op: 'Input'}"
"node { name: 'F' op: 'BiasAdd'"
@ -211,8 +248,9 @@ TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_Dataflow2) {
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'E'] }");
EXPECT_EQ(DoNodeMerge(),
"A(Input);B(Input);C(Conv2D);D(Input);E(Input);F(BiasAdd);G(Add)|"
"A->C;B->C:1;C->G;D->F;E->F:1;E->G:1");
"A(Input);B(Input);C(MklConv2D);D(Input);E(Input);F(BiasAdd);"
"G(Add);M(MklInput);N(MklInput)|A->C;B->C:2;C->G;D->F;"
"E->F:1;E->G:1;M->C:1;N->C:3");
}
// data_format attribute value mismatch. Merge should not be done
@ -220,28 +258,63 @@ TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_Dataflow2) {
TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_AttrMismatch) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'M' op: 'MklInput'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'C' op: 'Conv2D'"
"node { name: 'N' op: 'MklInput'}"
"node { name: 'C' op: 'MklConv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'B']}"
" input: ['A', 'M', 'B', 'N']}"
"node { name: 'D' op: 'Input'}"
"node { name: 'E' op: 'BiasAdd'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NHCW' } }"
" input: ['C', 'D'] }");
EXPECT_EQ(DoNodeMerge(),
"A(Input);B(Input);C(Conv2D);D(Input);E(BiasAdd)|"
"A->C;B->C:1;C->E;D->E:1");
"A(Input);B(Input);C(MklConv2D);D(Input);E(BiasAdd);M(MklInput);"
"N(MklInput)|A->C;B->C:2;C->E;D->E:1;M->C:1;N->C:3");
}
// Test set 2: Conv2D..BiasAddGrad -> Conv2DWithBiasBackpropBias rewrite tests
#if 0
// This test set is disabled temporarily as we do not enable node rewrite.
// This test set will be enabled when we support Mkl-specific kernels for
// backward bias.
//
// Test set 2: MklConv2D..BiasAddGrad -> Conv2DWithBiasBackpropBias
// rewrite tests
// C=Conv2D(A,B); D=Sub(C,A); F=BiasAddGrad(D)
// C=MklConv2D(A,M,B,N); D=Sub(C,A); E=BiasAddGrad(D)
TEST_F(OptimizerMergeTest, Conv2DBackprop_Positive) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'M' op: 'MklInput'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'N' op: 'MklInput'}"
"node { name: 'C' op: 'MklConv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'M', 'B', 'N']}"
"node { name: 'D' op: 'Sub'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'A']}"
"node { name: 'E' op: 'BiasAddGrad'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['D'] }");
EXPECT_EQ(DoNodeMerge(),
"A(Input);B(Input);C(MklConv2D);D(Sub);E(Conv2DWithBiasBackpropBias);"
"M(MklInput);N(MklInput)|A->C;A->D:1;B->C:2;C->D;D->E;M->C:1;N->C:3");
}
// No MklConv2D in context, but Conv2D in context. No rewrite should happen.
// C=Conv2D(A,B); D=Sub(C,A); E=BiasAddGrad(D)
TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoMklConv2D) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
@ -260,12 +333,12 @@ TEST_F(OptimizerMergeTest, Conv2DBackprop_Positive) {
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['D'] }");
EXPECT_EQ(DoNodeMerge(),
"A(Input);B(Input);C(Conv2D);D(Sub);E(Conv2DWithBiasBackpropBias)|"
"A(Input);B(Input);C(Conv2D);D(Sub);E(BiasAddGrad)|"
"A->C;A->D:1;B->C:1;C->D;D->E");
}
// No Conv2D in the context for BiasAddGrad. No rewrite should happen.
// C=Add(A,B); D=Sub(C,A); F=BiasAddGrad(D,E)
// C=Add(A,B); D=Sub(C,A); E=BiasAddGrad(D)
TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoConv2D) {
InitGraph(
"node { name: 'A' op: 'Input'}"
@ -287,7 +360,7 @@ TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoConv2D) {
// No Conv2D in the context for BiasAddGrad, but MatMul in context.
// Rewrite should happen, but name of BiasAddGrad does not change.
// C=MatMul(A,B); D=Sub(C,A); F=BiasAddGrad(D,E)
// C=MatMul(A,B); D=Sub(C,A); E=BiasAddGrad(D)
TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoConv2D_MatMul) {
InitGraph(
"node { name: 'A' op: 'Input'}"
@ -310,7 +383,7 @@ TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoConv2D_MatMul) {
}
// Test set 3: MatMul..BiasAddGrad -> BiasAddGrad rewrite tests
// C=MatMul(A,B); D=Sub(C,A); F=BiasAddGrad(D,E)
// C=MatMul(A,B); D=Sub(C,A); E=BiasAddGrad(D)
TEST_F(OptimizerMergeTest, MatMulBiasAddGrad_Positive) {
InitGraph(
"node { name: 'A' op: 'Input'}"
@ -333,7 +406,7 @@ TEST_F(OptimizerMergeTest, MatMulBiasAddGrad_Positive) {
}
// No MatMul in the context for BiasAddGrad. No rewrite should happen.
// C=Add(A,B); D=Sub(C,A); F=BiasAddGrad(D,E)
// C=Add(A,B); D=Sub(C,A); E=BiasAddGrad(D)
TEST_F(OptimizerMergeTest, MatMulBiasAddGrad_Negative_NoMatMul) {
InitGraph(
"node { name: 'A' op: 'Input'}"
@ -352,7 +425,7 @@ TEST_F(OptimizerMergeTest, MatMulBiasAddGrad_Negative_NoMatMul) {
"A(Input);B(Input);C(Add);D(Sub);E(BiasAddGrad)|"
"A->C;A->D:1;B->C:1;C->D;D->E");
}
#endif
static void BM_NodeMerge(int iters, int op_nodes) {
testing::StopTiming();

View File

@ -0,0 +1,271 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifdef INTEL_MKL
#include <set>
#include <vector>
#include <queue>
#include <utility>
#include <string>
#include <memory>
#include "tensorflow/core/framework/node_def_util.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/node_builder.h"
#include "tensorflow/core/lib/gtl/map_util.h"
#include "tensorflow/core/lib/hash/hash.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/common_runtime/function.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/common_runtime/optimization_registry.h"
#include "tensorflow/core/graph/mkl_tfconversion_pass.h"
#include "tensorflow/core/util/mkl_util.h"
namespace tensorflow {
// This pass inserts Mkl to Tf tensor conversion nodes (represented by C)
// in the graph in between A and B, where A and B match any one
// of the following
// cases:
// 1) A = layer/Op that generates output in Mkl format and,
// B = layer/Op that does not accept input in Mkl format and,
// A -> B (there is a direct edge between A and B), then
// We will insert C such that A->C->B.
//
// 2) A = layer/Op that generates output in Mkl format and,
// B = NULL (in other words, A is the last layer in the graph), then
// We will insert C such that A->C. (C will be the last layer.)
//
// Note that case 1 applies to all outputs of A that are input to B.
// In other words, the conversions will be required for every output
// of A that is input to B. For example, let us say the output of A
// is A1, A2, A3, of which A1 and A2 are in Mkl format, but A3 is not
// in Mkl format, and all of them are input to B. In such case, we will
// do the conversion for A1 and A2 only. We do not need to do any conversion
// for A3.
//
// This pass relies on layers registering themselves as Mkl compliant.
// An Mkl-compliant layer can accept inputs in Mkl format and produce outputs
// in Mkl format. A non-compliant layer accepts inputs and outputs in
// TensorFlow format.
//
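// As an illustration: if a rewritten MklConv2D (which produces an Mkl-format
// output plus an Mkl metadata tensor) feeds a plain Mul op that does not
// understand the Mkl format, this pass inserts a conversion node so that
// MklConv2D -> Mul becomes MklConv2D -> MklToTf -> Mul, with MklToTf
// consuming both the data tensor and the Mkl tensor that follows it.
//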
class MklToTfConversionPass : public GraphOptimizationPass {
public:
MklToTfConversionPass() {}
Status Run(const GraphOptimizationPassOptions& options);
// Insert layout conversion node in the graph pointed by g.
// Function scans the graph for candidate edges where we
// need to insert conversion nodes.
//
// @return true if even a single conversion node is inserted;
// false otherwise.
bool RunPass(std::unique_ptr<Graph>* g);
private:
// Is the input Op supported by Mkl-specific layout?
//
// @input op_name string of the op
// @return true if op is Mkl supported; false, otherwise.
inline bool IsMklSupportedOp(const string& op_name) const {
return mkl_layer_registry::IsMklLayer(op_name);
}
// Insert layout conversion node on the edge pointed by 'e' from graph 'g'.
//
// The edge will be deleted once a call to this function is successful.
// Any attempt to use the edge after this call
// will lead to undefined behavior.
//
// @return Success:OK() if insertion is successful, otherwise returns
// appropriate error status code.
Status InsertConversionNodeOnEdge(std::unique_ptr<Graph>* g, Edge*);
};
// We register MklToTf insertion for phase 1 in post-partition grouping.
// We register this pass after partitioning so that we get a complete
// picture of inputs and outputs of the nodes in the graphs.
const OptimizationPassRegistry::Grouping kMklTfConvPassGroup =
OptimizationPassRegistry::POST_PARTITIONING;
REGISTER_OPTIMIZATION(kMklTfConvPassGroup, 1, MklToTfConversionPass);
Status MklToTfConversionPass::InsertConversionNodeOnEdge(
std::unique_ptr<Graph>* g, Edge *e) {
CHECK_NOTNULL(e);
Node* src = e->src();
Node* dst = e->dst();
CHECK_NOTNULL(src);
CHECK_NOTNULL(dst);
Node* conversion_node = nullptr;
DataType src_datatype = DT_INVALID;
DataType dst_datatype = DT_INVALID;
string data_format;
TF_CHECK_OK(GetNodeAttr(src->def(), "T", &src_datatype));
TF_CHECK_OK(GetNodeAttr(dst->def(), "T", &dst_datatype));
if (src_datatype != dst_datatype) {
string err_msg = "T attribute of " + src->name() + " and " +
dst->name() + " do not match. Will not insert" +
" MklToTf node in such case.";
return Status(error::Code::INVALID_ARGUMENT, err_msg.c_str());
}
// Let's build the conversion node and specify src as input.
TF_CHECK_OK(NodeBuilder((*g)->NewName("Mkl2Tf"), "MklToTf")
.Input(src, e->src_output())
.Input(src, e->src_output()+1) // Mkl tensor immediately
// follows Tf tensor.
.Device(src->def().device()) // We want to get conversion node
// on same device as source node.
.Attr("T", src_datatype)
.Finalize(&**g, &conversion_node));
CHECK_NOTNULL(conversion_node);
if (GetNodeAttr(src->def(), "data_format", &data_format) == Status::OK()) {
conversion_node->AddAttr("data_format", data_format);
}
// Get assigned device from source node and apply it to conversion node.
// We want conversion node to be on the same device as the source node.
conversion_node->set_assigned_device_name(src->assigned_device_name());
// Set the Mkl layer label for this op.
conversion_node->AddAttr("_kernel", mkl_layer_registry::kMklLayerLabel);
// Now that we have added edge from src->conversion_node, let's add edge from
// output of conversion_node to the dest node. Since conversion_node
// has only 1 output, the src_output of conversion_node is 0.
CHECK_NOTNULL((*g)->AddEdge(conversion_node, 0, dst, e->dst_input()));
VLOG(1) << "MklToTfConversionPass: Inserting Conversion node on: "
<< src->type_string() << " and " << dst->type_string()
<< " successful.";
// Remove src->dst edge now.
(*g)->RemoveEdge(e);
return Status::OK();
}
bool MklToTfConversionPass::RunPass(std::unique_ptr<Graph>* g) {
bool result = false;
CHECK_NOTNULL(g);
DumpGraph("Before MklToTfConversionPass", &**g);
// Since we are looking for mkl-supported op node immediately
// followed by non-mkl op node, we will just iterate over edge
// set of the graph.
// Vector to maintain candidate edges whose source and destination
// are candidates for inserting a conversion node.
std::vector<Edge*> candidate_edges;
for (const Edge *e : (*g)->edges()) {
Node* src = e->src();
Node* dst = e->dst();
// We skip control edges.
if (e->IsControlEdge()) {
continue;
}
VLOG(1) << "MklToTfConversionPass: InsertConversionNodes: "
<< src->type_string() << " and " << dst->type_string();
// Let's get source and destination data type.
DataType src_datatype = DT_INVALID;
if (GetNodeAttr(src->def(), "T", &src_datatype) != Status::OK()) {
continue;
}
// We cannot check datatype on destination node because destination node
// may not be Mkl node.
DataType dst_datatype = DT_INVALID;
GetNodeAttr(dst->def(), "T", &dst_datatype);
// Check if src is Mkl-compliant while dst is not.
if (IsMklSupportedOp(src->type_string()) &&
!IsMklSupportedOp(dst->type_string())) {
VLOG(1) << "MklToTfConversionPass: Scheduled nodes " << src->name()
<< " and " << dst->name() << " for inserting conversion nodes";
candidate_edges.push_back(const_cast<Edge*>(e));
}
}
// Process all candidate edges and insert conversion nodes on them.
for (Edge* e : candidate_edges) {
// Even if we insert conversion node on a single edge, we
// need to return true.
string src_name = e->src()->name();
string dst_name = e->dst()->name();
if (InsertConversionNodeOnEdge(g, e) == Status::OK()) {
VLOG(1) << "MklToTfConversionPass: Inserted conversion "
<< "node on edge between " << src_name << " and " << dst_name;
result = true;
}
}
DumpGraph("After MklToTfConversionPass", &**g);
// We need to return true even if we insert one conversion node
// anywhere in the graph.
return result;
}
//////////////////////////////////////////////////////////////////////////////
// Run function for the pass
//////////////////////////////////////////////////////////////////////////////
bool InsertMklToTfConversionNodes(std::unique_ptr<Graph>* g) {
return MklToTfConversionPass().RunPass(g);
}
Status MklToTfConversionPass::Run(
const GraphOptimizationPassOptions& options) {
if (options.graph == nullptr && options.partition_graphs == nullptr) {
return Status::OK();
}
auto process_graph = [&](std::unique_ptr<Graph>* g) {
// Get the ownership of graph
std::unique_ptr<Graph>* ng = std::move(g);
RunPass(ng);
// Return the ownership of graph back
g->reset(ng->release());
};
if (kMklTfConvPassGroup != OptimizationPassRegistry::POST_PARTITIONING) {
// For any pre-partitioning phase, graph is stored in options.graph.
process_graph(options.graph);
} else {
// For post partitioning phase, graphs are stored in
// options.partition_graphs.
for (auto& pg : *options.partition_graphs) {
process_graph(&pg.second);
}
}
return Status::OK();
}
} // namespace tensorflow
#endif

View File

@ -0,0 +1,36 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// An optimization pass that inserts MklToTf conversion nodes in the graph
#ifndef TENSORFLOW_CORE_GRAPH_MKL_TFCONVERSION_PASS_H_
#define TENSORFLOW_CORE_GRAPH_MKL_TFCONVERSION_PASS_H_
#ifdef INTEL_MKL
#include <sys/types.h>
#include <memory>
#include "tensorflow/core/graph/graph.h"
namespace tensorflow {
// Interface to invoke the pass for unit test
//
// Returns true if and only if 'g' is mutated.
extern bool InsertMklToTfConversionNodes(std::unique_ptr<Graph>* g);
} // namespace tensorflow
#endif
#endif // TENSORFLOW_CORE_GRAPH_MKL_TFCONVERSION_PASS_H_

View File

@ -0,0 +1,243 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifdef INTEL_MKL
#include "tensorflow/core/graph/mkl_tfconversion_pass.h"
#include <vector>
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/graph_constructor.h"
#include "tensorflow/core/graph/testlib.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/random/simple_philox.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"
namespace tensorflow {
namespace {
class MklToTfConversionPass : public ::testing::Test {
public:
MklToTfConversionPass() : graph_(OpRegistry::Global()) {}
static void InitGraph(const string& s, Graph* graph) {
GraphDef graph_def;
auto parser = protobuf::TextFormat::Parser();
CHECK(parser.MergeFromString(s, &graph_def)) << s;
GraphConstructorOptions opts;
TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, graph));
}
void InitGraph(const string& s) {
InitGraph(s, &graph_);
original_ = CanonicalGraphString(&graph_);
}
static bool IncludeNode(const Node* n) { return n->IsOp(); }
static string EdgeId(const Node* n, int index) {
if (index == 0) {
return n->name();
} else if (index == Graph::kControlSlot) {
return strings::StrCat(n->name(), ":control");
} else {
return strings::StrCat(n->name(), ":", index);
}
}
string CanonicalGraphString(Graph* g) {
std::vector<string> nodes;
std::vector<string> edges;
for (const Node* n : g->nodes()) {
if (IncludeNode(n)) {
nodes.push_back(strings::StrCat(n->name(), "(", n->type_string(), ")"));
}
}
for (const Edge* e : g->edges()) {
if (IncludeNode(e->src()) && IncludeNode(e->dst())) {
edges.push_back(strings::StrCat(EdgeId(e->src(), e->src_output()), "->",
EdgeId(e->dst(), e->dst_input())));
}
}
// Canonicalize
std::sort(nodes.begin(), nodes.end());
std::sort(edges.begin(), edges.end());
return strings::StrCat(str_util::Join(nodes, ";"), "|",
str_util::Join(edges, ";"));
}
string DoRunMklToTfConversionPass() {
string before = CanonicalGraphString(&graph_);
LOG(ERROR) << "Before MklToTf conversion pass: " << before;
std::unique_ptr<Graph>* ug = new std::unique_ptr<Graph>(&graph_);
InsertMklToTfConversionNodes(ug);
string result = CanonicalGraphString(&graph_);
LOG(ERROR) << "After MklToTf conversion pass: " << result;
return result;
}
const string& OriginalGraph() const { return original_; }
Graph graph_;
string original_;
};
REGISTER_OP("Input").Output("o: float").SetIsStateful();
REGISTER_OP("HalfInput").Output("o: half").SetIsStateful();
REGISTER_OP("MklInput").Output("o: uint8").SetIsStateful();
TEST_F(MklToTfConversionPass, Basic) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }"
"node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B'] }");
EXPECT_EQ(DoRunMklToTfConversionPass(),
"A(Input);B(Input);C(Mul);D(Mul)|"
"A->C;A->D;B->C:1;B->D:1");
}
// MklConv2D followed by Non-Mkl layer
// C=MklConv2D(A,M,B,N); E=Sub(C,D)
TEST_F(MklToTfConversionPass, Positive) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'M' op: 'MklInput'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'N' op: 'MklInput'}"
"node { name: 'C' op: 'MklConv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'M', 'B', 'N']}"
"node { name: 'D' op: 'Input'}"
"node { name: 'E' op: 'Sub'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'D']}");
EXPECT_EQ(DoRunMklToTfConversionPass(),
"A(Input);B(Input);C(MklConv2D);D(Input);E(Sub);M(MklInput);"
"Mkl2Tf/_0(MklToTf);N(MklInput)|A->C;B->C:2;C->Mkl2Tf/_0;"
"C:1->Mkl2Tf/_0:1;D->E:1;M->C:1;Mkl2Tf/_0->E;N->C:3");
}
// MklConv2D followed by Non-Mkl layer, and MklConv2D uses half type
// C=MklConv2D(A,M,B,N); E=Sub(C,D)
// MklToTf node should be inserted.
TEST_F(MklToTfConversionPass, Positive_Type) {
InitGraph(
"node { name: 'A' op: 'HalfInput'}"
"node { name: 'M' op: 'MklInput'}"
"node { name: 'B' op: 'HalfInput'}"
"node { name: 'N' op: 'MklInput'}"
"node { name: 'C' op: 'MklConv2D'"
" attr { key: 'T' value { type: DT_HALF } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'M', 'B', 'N']}"
"node { name: 'D' op: 'HalfInput'}"
"node { name: 'E' op: 'Sub'"
" attr {key: 'T' value { type: DT_HALF } }"
" input: ['C', 'D']}");
EXPECT_EQ(DoRunMklToTfConversionPass(),
"A(HalfInput);B(HalfInput);C(MklConv2D);D(HalfInput);"
"E(Sub);M(MklInput);Mkl2Tf/_0(MklToTf);N(MklInput)|"
"A->C;B->C:2;C->Mkl2Tf/_0;C:1->Mkl2Tf/_0:1;D->E:1;"
"M->C:1;Mkl2Tf/_0->E;N->C:3");
}
// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y);
// There is no Mkl layer so no conversion op should be inserted.
TEST_F(MklToTfConversionPass, Negative_NoMklLayer) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'C' op: 'Conv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Input'}"
"node { name: 'E' op: 'BiasAdd'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['C', 'D'] }"
"node { name: 'Y' op: 'Input'}"
"node { name: 'Z' op: 'Sub'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['E', 'Y']}");
EXPECT_EQ(DoRunMklToTfConversionPass(),
"A(Input);B(Input);C(Conv2D);D(Input);E(BiasAdd);Y(Input);Z(Sub)|"
"A->C;B->C:1;C->E;D->E:1;E->Z;Y->Z:1");
}
static void BM_RunMklToTfConversionPass(int iters, int op_nodes) {
testing::StopTiming();
string s;
for (int in = 0; in < 10; in++) {
s += strings::Printf("node { name: 'in%04d' op: 'Input'}", in);
}
random::PhiloxRandom philox(301, 17);
random::SimplePhilox rnd(&philox);
for (int op = 0; op < op_nodes; op++) {
s += strings::Printf(
"node { name: 'op%04d' op: 'Mul' attr { key: 'T' value { "
"type: DT_FLOAT } } input: ['in%04d', 'in%04d' ] }",
op, rnd.Uniform(10), rnd.Uniform(10));
}
bool first = true;
while (iters > 0) {
Graph* graph = new Graph(OpRegistry::Global());
MklToTfConversionPass::InitGraph(s, graph);
int N = graph->num_node_ids();
if (first) {
testing::SetLabel(strings::StrCat("Per graph node. Nodes: ", N));
first = false;
}
{
testing::StartTiming();
std::unique_ptr<Graph> ug(graph);
InsertMklToTfConversionNodes(&ug);
testing::StopTiming();
}
iters -= N; // Our benchmark units are individual graph nodes,
// not whole graphs
// delete graph;
}
}
BENCHMARK(BM_RunMklToTfConversionPass)->Arg(1000)->Arg(10000);
} // namespace
} // namespace tensorflow
#endif /* INTEL_MKL */

View File

@ -688,8 +688,15 @@ tf_kernel_library(
tf_kernel_library(
name = "transpose_op",
prefix = "transpose_op",
deps = ARRAY_DEPS,
srcs = [
"transpose_op.cc",
] + if_mkl([
"mkl_transpose_op.cc",
]),
hdrs = ["transpose_op.h"],
deps = ARRAY_DEPS + if_mkl([
"//third_party/mkl:intel_binary_blob",
]),
)
tf_kernel_library(
@ -1735,6 +1742,22 @@ tf_cuda_cc_test(
],
)
tf_cuda_cc_test(
name = "resize_benchmark_test",
srcs = ["resize_op_benchmark_test.cc"],
deps = [
":image",
":ops_testutil",
":ops_util",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"//tensorflow/core:testlib",
],
)
cc_library(
name = "io",
deps = [
@ -4376,7 +4399,7 @@ tf_cc_test(
if_mkl(
tf_kernel_library(
name = "mkl_ops",
name = "mkl_matmul_op",
prefix = "mkl_matmul",
deps = [
":math",
@ -4385,6 +4408,40 @@ if_mkl(
),
)
if_mkl(
tf_kernel_library(
name = "mkl_conv_op",
prefix = "mkl_conv",
deps = [
":bounds_check",
":ops_util",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core:nn_ops_op_lib",
"//third_party/mkl:intel_binary_blob",
],
),
)
if_mkl(
tf_kernel_library(
name = "mkl_tfconv_op",
prefix = "mkl_tfconv",
deps = [
":bounds_check",
":ops_util",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core:nn_ops_op_lib",
"//third_party/mkl:intel_binary_blob",
],
),
)
# -----------------------------------------------------------------------------
# Google-internal targets. These must be at the end for syncrepo.

View File

@ -1,5 +1,4 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
@ -12,16 +11,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#define EIGEN_USE_THREADS
#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#endif
#include <memory>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/adjust_hue_op.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/util/work_sharder.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
namespace tensorflow {
@ -77,6 +84,7 @@ template <class Device>
class AdjustHueOp;
namespace internal {
// Helper function to convert a RGB color to H-and-V-range. H is in the range
// of [0, 6] instead of the normal [0, 1]
static void rgb_to_hv_range(float r, float g, float b, float* h, float* v_min,
@ -185,6 +193,7 @@ static void hv_range_to_rgb(float h, float v_min, float v_max, float* r,
}
} // namespace internal
template <>
class AdjustHueOp<CPUDevice> : public AdjustHueOpBase {
public:
@ -237,4 +246,34 @@ class AdjustHueOp<CPUDevice> : public AdjustHueOpBase {
REGISTER_KERNEL_BUILDER(Name("AdjustHue").Device(DEVICE_CPU),
AdjustHueOp<CPUDevice>);
#if GOOGLE_CUDA
template <>
class AdjustHueOp<GPUDevice> : public AdjustHueOpBase {
public:
explicit AdjustHueOp(OpKernelConstruction* context)
: AdjustHueOpBase(context) {}
void DoCompute(OpKernelContext* context, const ComputeOptions& options) override {
const Tensor* input = options.input;
const Tensor* delta = options.delta;
Tensor* output = options.output;
const int64 number_of_elements = input->NumElements();
GPUDevice device = context->eigen_gpu_device();
const auto stream = device.stream();
OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
if (number_of_elements > 0) {
const float* input_data = input->flat<float>().data();
const float* delta_h = delta->flat<float>().data();
float* const output_data = output->flat<float>().data();
functor::AdjustHueGPU()(&device, number_of_elements, input_data, delta_h,
output_data);
}
}
};
REGISTER_KERNEL_BUILDER(Name("AdjustHue").Device(DEVICE_GPU), AdjustHueOp<GPUDevice>);
#endif
//} // namespace functor
} // namespace tensorflow

View File

@ -0,0 +1,42 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef _TENSORFLOW_CORE_KERNELS_ADJUST_HUE_OP_H
#define _TENSORFLOW_CORE_KERNELS_ADJUST_HUE_OP_H
#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
namespace tensorflow {
typedef Eigen::GpuDevice GPUDevice;
namespace functor {
struct AdjustHueGPU {
void operator()(
GPUDevice* device,
const int64 number_of_elements,
const float* const input,
const float* const delta,
float* const output
);
};
} // namespace functor
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // _TENSORFLOW_CORE_KERNELS_ADJUST_HUE_OP_H

View File

@ -0,0 +1,141 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/adjust_hue_op.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"
namespace tensorflow {
namespace internal {
namespace {
typedef struct RgbTuple {
float r;
float g;
float b;
} RgbTuple;
typedef struct HsvTuple {
float h;
float s;
float v;
} HsvTuple;
} // anon namespace
__device__ HsvTuple rgb2hsv_cuda(const float r, const float g, const float b)
{
HsvTuple tuple;
const float M = fmaxf(r, fmaxf(g, b));
const float m = fminf(r, fminf(g, b));
const float chroma = M - m;
float h = 0.0f, s = 0.0f;
// hue
if (chroma > 0.0f) {
if (M == r) {
const float num = (g - b) / chroma;
const float sign = copysignf(1.0f, num);
h = ((sign < 0.0f) * 6.0f + sign * fmodf(sign * num, 6.0f)) / 6.0f;
} else if (M == g) {
h = ((b - r) / chroma + 2.0f) / 6.0f;
} else {
h = ((r - g) / chroma + 4.0f) / 6.0f;
}
} else {
h = 0.0f;
}
// saturation
if (M > 0.0) {
s = chroma / M;
} else {
s = 0.0f;
}
tuple.h = h;
tuple.s = s;
tuple.v = M;
return tuple;
}
__device__ RgbTuple hsv2rgb_cuda(const float h, const float s, const float v)
{
RgbTuple tuple;
const float new_h = h * 6.0f;
const float chroma = v * s;
const float x = chroma * (1.0f - fabsf(fmodf(new_h, 2.0f) - 1.0f));
const float new_m = v - chroma;
const bool between_0_and_1 = new_h >= 0.0f && new_h < 1.0f;
const bool between_1_and_2 = new_h >= 1.0f && new_h < 2.0f;
const bool between_2_and_3 = new_h >= 2.0f && new_h < 3.0f;
const bool between_3_and_4 = new_h >= 3.0f && new_h < 4.0f;
const bool between_4_and_5 = new_h >= 4.0f && new_h < 5.0f;
const bool between_5_and_6 = new_h >= 5.0f && new_h < 6.0f;
tuple.r = chroma * (between_0_and_1 || between_5_and_6) +
x * (between_1_and_2 || between_4_and_5) + new_m;
tuple.g = chroma * (between_1_and_2 || between_2_and_3) +
x * (between_0_and_1 || between_3_and_4) + new_m;
tuple.b = chroma * (between_3_and_4 || between_4_and_5) +
x * (between_2_and_3 || between_5_and_6) + new_m;
return tuple;
}
__global__ void adjust_hue_nhwc(const int64 number_elements,
const float * const __restrict__ input,
float * const output,
const float * const hue_delta)
{
// Multiply by 3 since we're dealing with contiguous RGB channels for each pixel (NHWC).
const int64 idx = (blockDim.x * blockIdx.x + threadIdx.x) * 3;
// bounds check
if (idx > number_elements - 1) {
return;
}
const float delta = hue_delta[0];
const HsvTuple hsv = rgb2hsv_cuda(input[idx], input[idx + 1], input[idx + 2]);
// hue adjustment
float new_h = fmodf(hsv.h + delta, 1.0f);
if (new_h < 0.0f) {
new_h = fmodf(1.0f + new_h, 1.0f);
}
const RgbTuple rgb = hsv2rgb_cuda(new_h, hsv.s, hsv.v);
output[idx] = rgb.r;
output[idx + 1] = rgb.g;
output[idx + 2] = rgb.b;
}
} // namespace internal
namespace functor {
void AdjustHueGPU::operator()(
GPUDevice* device,
const int64 number_of_elements,
const float* const input,
const float* const delta,
float* const output
) {
const auto stream = device->stream();
const CudaLaunchConfig config = GetCudaLaunchConfig(number_of_elements, *device);
const int threads_per_block = config.thread_per_block;
const int block_count = (number_of_elements + threads_per_block - 1) / threads_per_block;
internal::adjust_hue_nhwc<<<block_count, threads_per_block, 0, stream>>>(
number_of_elements, input, output, delta
);
}
} // namespace functor
} // namespace tensorflow
#endif // GOOGLE_CUDA
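
The kernel above rotates hue per pixel: RGB -> HSV with h normalized to [0, 1), add the delta with wrap-around, then HSV -> RGB. Below is a minimal host-side C++ sketch of the same per-pixel math, handy for sanity-checking the CUDA path; the file and helper names are hypothetical and this is not part of TensorFlow.

// adjust_hue_reference.cc -- standalone sketch, not part of TensorFlow.
#include <algorithm>
#include <cmath>
#include <cstdio>

namespace {

struct Rgb { float r, g, b; };

// Plain CPU version of the per-pixel hue rotation performed by the kernel:
// RGB -> HSV (h in [0,1)), h += delta (wrapped), HSV -> RGB.
Rgb AdjustHuePixel(Rgb in, float delta) {
  const float M = std::max(in.r, std::max(in.g, in.b));
  const float m = std::min(in.r, std::min(in.g, in.b));
  const float chroma = M - m;
  float h = 0.f;
  if (chroma > 0.f) {
    if (M == in.r)      h = std::fmod((in.g - in.b) / chroma + 6.f, 6.f) / 6.f;
    else if (M == in.g) h = ((in.b - in.r) / chroma + 2.f) / 6.f;
    else                h = ((in.r - in.g) / chroma + 4.f) / 6.f;
  }
  const float s = (M > 0.f) ? chroma / M : 0.f;
  const float v = M;
  // Rotate hue and wrap back into [0, 1).
  h = std::fmod(h + delta, 1.f);
  if (h < 0.f) h += 1.f;
  // HSV -> RGB.
  const float hh = h * 6.f;
  const float c = v * s;
  const float x = c * (1.f - std::fabs(std::fmod(hh, 2.f) - 1.f));
  const float mm = v - c;
  Rgb out{mm, mm, mm};
  if (hh < 1.f)      { out.r += c; out.g += x; }
  else if (hh < 2.f) { out.r += x; out.g += c; }
  else if (hh < 3.f) { out.g += c; out.b += x; }
  else if (hh < 4.f) { out.g += x; out.b += c; }
  else if (hh < 5.f) { out.r += x; out.b += c; }
  else               { out.r += c; out.b += x; }
  return out;
}

}  // namespace

int main() {
  // Pure red rotated by one third of the hue circle should come out green.
  Rgb g = AdjustHuePixel({1.f, 0.f, 0.f}, 1.f / 3.f);
  std::printf("%.3f %.3f %.3f\n", g.r, g.g, g.b);  // ~0 1 0
  return 0;
}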

View File

@ -338,6 +338,7 @@ struct AvgPoolMeanReducer {
// In the case below, 0xd8 implies (false_mask) ? (b) : (a)
// For details, refer to the vpternlogd instruction table at
// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-2c-manual.pdf
#define psel(a, b, false_mask) \
_mm512_castsi512_ps(_mm512_ternarylogic_epi32( \
_mm512_castps_si512(a), _mm512_castps_si512(b), \

View File

@ -40,8 +40,8 @@ class FixedLengthRecordReader : public ReaderBase {
// On success:
// * input_buffer_ != nullptr,
// * input_buffer_->Tell() == footer_bytes_
// * file_pos_limit_ == file size - header_bytes_
// * input_buffer_->Tell() == header_bytes_
// * file_pos_limit_ == file size - footer_bytes_
Status OnWorkStartedLocked() override {
record_number_ = 0;
uint64 file_size = 0;

View File

@ -0,0 +1,457 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// See docs in ../ops/nn_ops.cc.
#ifdef INTEL_MKL
#include <string.h>
#include <map>
#include <vector>
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/bounds_check.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/mkl_util.h"
#include "third_party/mkl/include/mkl_dnn.h"
#include "third_party/mkl/include/mkl_dnn_types.h"
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
template <typename Device, typename T, bool biasEnabled>
class MklConv2DOp : public OpKernel {
public:
~MklConv2DOp() {}
explicit MklConv2DOp(OpKernelConstruction* context) : OpKernel(context) {
OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
string data_format;
OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
errors::InvalidArgument("Invalid data format"));
OP_REQUIRES(context, strides_.size() == 4,
errors::InvalidArgument("Sliding window strides field must "
"specify 4 dimensions"));
const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');
OP_REQUIRES(
context, stride_n == 1 && stride_c == 1,
errors::InvalidArgument("Current implementation does not yet support "
"strides in the batch and depth dimensions."));
OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
}
void Compute(OpKernelContext* context) override {
const Tensor& input = MklGetInput(context, 0);
GetMklShape(context, 0, &(mkl_params_.input_shape));
bool input_in_mkl_format = mkl_params_.input_shape.IsMklTensor();
const Tensor& filter = MklGetInput(context, 1);
MklShape mkl_filter_shape;
GetMklShape(context, 1, &mkl_filter_shape);
CHECK(!mkl_filter_shape.IsMklTensor())
<< "Conv filter should not be in MKL Layout";
if (biasEnabled) {
const Tensor& bias = MklGetInput(context, 2);
OP_REQUIRES(context, bias.dims() == 1,
errors::InvalidArgument("bias must be 1-dimensional: ",
bias.shape().DebugString()));
}
if (!input_in_mkl_format) {
OP_REQUIRES(context, input.dims() == 4,
errors::InvalidArgument("input must be 4-dimensional",
input.shape().DebugString()));
}
OP_REQUIRES(context, filter.dims() == 4,
errors::InvalidArgument("filter must be 4-dimensional: ",
filter.shape().DebugString()));
for (int i = 0; i < 3; i++) {
OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i),
std::numeric_limits<int>::max()),
errors::InvalidArgument("filter too large"));
}
const int64 input_depth = input_in_mkl_format
? mkl_params_.input_shape.GetSizes()[2]
: GetTensorDim(input, data_format_, 'C');
OP_REQUIRES(
context, input_depth == filter.dim_size(2),
errors::InvalidArgument("input and filter must have the same depth: ",
input_depth, " vs ", filter.dim_size(2)));
// The last dimension for filter is out_depth.
const int out_depth = static_cast<int>(filter.dim_size(3));
// The second dimension for input is rows/height.
// The first dimension for filter is rows/height.
const int64 input_rows_raw = input_in_mkl_format
? mkl_params_.input_shape.GetSizes()[1]
: GetTensorDim(input, data_format_, 'H');
OP_REQUIRES(context, FastBoundsCheck(input_rows_raw,
std::numeric_limits<int>::max()),
errors::InvalidArgument("Input rows too large"));
const int input_rows = static_cast<int>(input_rows_raw);
const int filter_rows = static_cast<int>(filter.dim_size(0));
// The third dimension for input is columns/width.
// The second dimension for filter is columns/width.
const int64 input_cols_raw = input_in_mkl_format
? mkl_params_.input_shape.GetSizes()[0]
: GetTensorDim(input, data_format_, 'W');
OP_REQUIRES(context, FastBoundsCheck(input_cols_raw,
std::numeric_limits<int>::max()),
errors::InvalidArgument("Input cols too large"));
const int input_cols = static_cast<int>(input_cols_raw);
const int filter_cols = static_cast<int>(filter.dim_size(1));
// The first dimension for input is batch.
const int64 input_batch_raw = input_in_mkl_format
? mkl_params_.input_shape.GetSizes()[3]
: GetTensorDim(input, data_format_, 'N');
OP_REQUIRES(context, FastBoundsCheck(input_batch_raw,
std::numeric_limits<int>::max()),
errors::InvalidArgument("batch is too large"));
const int batch = static_cast<int>(input_batch_raw);
// For now we take the stride from the second and third dimensions only (we
// do not support striding on the batch or depth dimension).
const int stride_rows = GetTensorDim(strides_, data_format_, 'H');
const int stride_cols = GetTensorDim(strides_, data_format_, 'W');
int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
OP_REQUIRES_OK(context,
GetWindowedOutputSize(input_rows, filter_rows, stride_rows,
padding_, &out_rows, &pad_rows));
OP_REQUIRES_OK(context,
GetWindowedOutputSize(input_cols, filter_cols, stride_cols,
padding_, &out_cols, &pad_cols));
TensorShape out_shape =
ShapeFromFormat(data_format_, batch, out_rows, out_cols, out_depth);
// Output tensor is of the following dimensions:
// [ in_batch, out_rows, out_cols, out_depth ]
Tensor* output = nullptr;
// If there is nothing to compute, return.
if (out_shape.num_elements() == 0) {
// TODO(jbobba): Verify correctness here
// Need semantics for Null MKL tensor
return;
}
if (batch == 0) {
// Nothing to do, allocate output tensor and return
MklShape mkl_output_mkl_shape;
mkl_output_mkl_shape.SetMklTensor(false);
AllocateOutputSetMklshape(context, 0, &output, input.shape(),
mkl_output_mkl_shape);
return;
}
// Create MKL convolution primitives
mkl_params_.in_dims = input_in_mkl_format
? mkl_params_.input_shape.GetDimension()
: input.dims();
mkl_params_.filter_dims = filter.dims();
mkl_params_.in_sizes[0] = static_cast<size_t>(input_cols);
mkl_params_.in_sizes[1] = static_cast<size_t>(input_rows);
mkl_params_.in_sizes[2] = static_cast<size_t>(input_depth);
mkl_params_.in_sizes[3] = static_cast<size_t>(batch);
mkl_params_.out_sizes[0] = static_cast<size_t>(out_cols);
mkl_params_.out_sizes[1] = static_cast<size_t>(out_rows);
mkl_params_.out_sizes[2] = static_cast<size_t>(out_depth);
mkl_params_.out_sizes[3] = static_cast<size_t>(batch);
mkl_params_.input_offset[0] = static_cast<int>(-pad_cols);
mkl_params_.input_offset[1] = static_cast<int>(-pad_rows);
mkl_params_.conv_stride[0] = static_cast<size_t>(stride_cols);
mkl_params_.conv_stride[1] = static_cast<size_t>(stride_rows);
GetStridesFromSizes(data_format_, mkl_params_.out_strides,
mkl_params_.out_sizes);
GetStridesFromSizes(data_format_, mkl_params_.in_strides,
mkl_params_.in_sizes);
// TF filter dimension order (out_depth, in_depth, cols, rows) ->
// MKL filter dimension order (out_depth, in_depth, rows, cols)
mkl_params_.filter_sizes[0] = filter.dim_size(1); // cols
mkl_params_.filter_sizes[1] = filter.dim_size(0); // rows
mkl_params_.filter_sizes[2] = filter.dim_size(2); // in_depth
mkl_params_.filter_sizes[3] = filter.dim_size(3); // out_depth
// TF filter layout - (rows, cols, in_depth, out_depth)
mkl_params_.filter_strides[0] =
filter.dim_size(2) * filter.dim_size(3); // cols
mkl_params_.filter_strides[1] =
filter.dim_size(1) * filter.dim_size(2) * filter.dim_size(3); // rows
mkl_params_.filter_strides[2] = filter.dim_size(3); // in_depth
mkl_params_.filter_strides[3] = 1; // out_depth
if (biasEnabled) {
const Tensor& bias = MklGetInput(context, 2);
mkl_params_.bias_sizes[0] = {static_cast<size_t>(bias.dim_size(0))};
mkl_params_.bias_strides[0] = {1};
}
// Create Convolution Primitive
if (biasEnabled) {
CHECK_EQ(dnnConvolutionCreateForwardBias_F32(
&mkl_prim_convolution_fwd_, nullptr,
dnnAlgorithmConvolutionDirect, mkl_params_.in_dims,
mkl_params_.in_sizes, mkl_params_.out_sizes,
mkl_params_.filter_sizes, mkl_params_.conv_stride,
mkl_params_.input_offset, dnnBorderZeros),
E_SUCCESS);
} else {
CHECK_EQ(dnnConvolutionCreateForward_F32(
&mkl_prim_convolution_fwd_, nullptr,
dnnAlgorithmConvolutionDirect, mkl_params_.in_dims,
mkl_params_.in_sizes, mkl_params_.out_sizes,
mkl_params_.filter_sizes, mkl_params_.conv_stride,
mkl_params_.input_offset, dnnBorderZeros),
E_SUCCESS);
}
TensorShape mkl_output_tf_shape;
MklShape mkl_output_mkl_shape;
mkl_output_mkl_shape.SetMklTensor(true);
mkl_output_mkl_shape.SetMklLayout(mkl_prim_convolution_fwd_,
dnnResourceDst);
mkl_output_mkl_shape.SetTfLayout(mkl_params_.in_dims, mkl_params_.out_sizes,
mkl_params_.out_strides);
mkl_output_tf_shape.AddDim(
dnnLayoutGetMemorySize_F32(
static_cast<dnnLayout_t>(mkl_output_mkl_shape.GetMklLayout())) /
sizeof(T));
AllocateOutputSetMklshape(context, 0, &output, mkl_output_tf_shape,
mkl_output_mkl_shape);
mkl_conv_res_[dnnResourceDst] =
static_cast<void*>(output->flat<T>().data());
MklCreateInputLayouts(context);
Tensor mkl_tmp_input_buf_tensor, mkl_tmp_filter_buf_tensor,
mkl_tmp_bias_buf_tensor; // Temp tensor used to allocate tmp
// buffers
MklPrepareConvolutionInputs(context, &mkl_tmp_input_buf_tensor,
&mkl_tmp_filter_buf_tensor,
&mkl_tmp_bias_buf_tensor);
// Execute convolution
CHECK_EQ(dnnExecute_F32(mkl_prim_convolution_fwd_, mkl_conv_res_),
E_SUCCESS);
MklCleanup();
}
private:
typedef struct {
int in_dims;
size_t in_sizes[4];
size_t in_strides[4];
size_t out_sizes[4];
size_t out_strides[4];
int filter_dims;
size_t filter_sizes[4];
size_t filter_strides[4];
size_t bias_sizes[1];
size_t bias_strides[1];
int input_offset[2];
size_t conv_stride[2];
MklShape input_shape;
} MklConv2DOpParams;
// Create MKL dnnLayout_t objects for tensors coming into the layer
void MklCreateInputLayouts(OpKernelContext* context) {
bool input_in_mkl_format = mkl_params_.input_shape.IsMklTensor();
if (input_in_mkl_format) {
mkl_lt_input_ =
static_cast<dnnLayout_t>(mkl_params_.input_shape.GetCurLayout());
} else {
CHECK_EQ(
dnnLayoutCreate_F32(&mkl_lt_input_, mkl_params_.in_dims,
mkl_params_.in_sizes, mkl_params_.in_strides),
E_SUCCESS);
}
CHECK_EQ(dnnLayoutCreate_F32(&mkl_lt_filter_, mkl_params_.filter_dims,
mkl_params_.filter_sizes,
mkl_params_.filter_strides),
E_SUCCESS);
if (biasEnabled) {
CHECK_EQ(dnnLayoutCreate_F32(&mkl_lt_bias_, 1, mkl_params_.bias_sizes,
mkl_params_.bias_strides),
E_SUCCESS);
}
}
// Compare incoming tensor layouts with MKL preferred layouts and convert
// data to the preferred layout if necessary
void MklPrepareConvolutionInputs(OpKernelContext* context,
Tensor* mkl_tmp_input_buf_tensor,
Tensor* mkl_tmp_filter_buf_tensor,
Tensor* mkl_tmp_bias_buf_tensor) {
bool mkl_convert_input, mkl_convert_filter, mkl_convert_bias;
dnnPrimitive_t mkl_prim_convert_filter, mkl_prim_convert_bias,
mkl_prim_convert_input;
dnnLayout_t mkl_lt_internal_filter, mkl_lt_internal_bias,
mkl_lt_internal_input;
void *mkl_buf_convert_input, *mkl_buf_convert_filter,
*mkl_buf_convert_bias;
mkl_prim_convert_filter = nullptr;
mkl_prim_convert_bias = nullptr;
mkl_prim_convert_input = nullptr;
mkl_lt_internal_filter = nullptr;
mkl_lt_internal_bias = nullptr;
mkl_lt_internal_input = nullptr;
mkl_buf_convert_input = nullptr;
mkl_buf_convert_filter = nullptr;
mkl_buf_convert_bias = nullptr;
// Compare with internal layouts and convert if needed
const Tensor& input = MklGetInput(context, 0);
void* mkl_buf_input =
const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
CHECK_EQ(
dnnLayoutCreateFromPrimitive_F32(
&mkl_lt_internal_input, mkl_prim_convolution_fwd_, dnnResourceSrc),
E_SUCCESS);
mkl_convert_input =
!dnnLayoutCompare_F32(mkl_lt_internal_input, mkl_lt_input_);
if (mkl_convert_input) {
CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, mkl_lt_input_,
mkl_lt_internal_input),
E_SUCCESS);
AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
&mkl_buf_convert_input);
CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
mkl_buf_convert_input),
E_SUCCESS);
dnnDelete_F32(mkl_prim_convert_input);
}
dnnLayoutDelete_F32(mkl_lt_internal_input);
mkl_conv_res_[dnnResourceSrc] =
(mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
const Tensor& filter = MklGetInput(context, 1);
void* mkl_buf_filter =
const_cast<void*>(static_cast<const void*>(filter.flat<T>().data()));
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_filter,
mkl_prim_convolution_fwd_,
dnnResourceFilter),
E_SUCCESS);
mkl_convert_filter =
!dnnLayoutCompare_F32(mkl_lt_internal_filter, mkl_lt_filter_);
if (mkl_convert_filter) {
CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_filter, mkl_lt_filter_,
mkl_lt_internal_filter),
E_SUCCESS);
AllocTmpBuffer(context, mkl_tmp_filter_buf_tensor, mkl_lt_internal_filter,
&mkl_buf_convert_filter);
CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_filter, mkl_buf_filter,
mkl_buf_convert_filter),
E_SUCCESS);
dnnDelete_F32(mkl_prim_convert_filter);
}
dnnLayoutDelete_F32(mkl_lt_internal_filter);
mkl_conv_res_[dnnResourceFilter] =
(mkl_convert_filter) ? mkl_buf_convert_filter : mkl_buf_filter;
if (biasEnabled) {
const Tensor& bias = MklGetInput(context, 2);
void* mkl_buf_bias =
const_cast<void*>(static_cast<const void*>(bias.flat<T>().data()));
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_bias,
mkl_prim_convolution_fwd_,
dnnResourceBias),
E_SUCCESS);
mkl_convert_bias =
!dnnLayoutCompare_F32(mkl_lt_internal_bias, mkl_lt_bias_);
if (mkl_convert_bias) {
CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_bias, mkl_lt_bias_,
mkl_lt_internal_bias),
E_SUCCESS);
AllocTmpBuffer(context, mkl_tmp_bias_buf_tensor, mkl_lt_internal_bias,
&mkl_buf_convert_bias);
CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_bias, mkl_buf_bias,
mkl_buf_convert_bias),
E_SUCCESS);
dnnDelete_F32(mkl_prim_convert_bias);
}
dnnLayoutDelete_F32(mkl_lt_internal_bias);
mkl_conv_res_[dnnResourceBias] =
(mkl_convert_bias) ? mkl_buf_convert_bias : mkl_buf_bias;
}
}
void MklCleanup() {
bool input_in_mkl_format = mkl_params_.input_shape.IsMklTensor();
dnnDelete_F32(mkl_prim_convolution_fwd_);
if (!input_in_mkl_format) dnnLayoutDelete_F32(mkl_lt_input_);
dnnLayoutDelete_F32(mkl_lt_filter_);
if (biasEnabled) dnnLayoutDelete_F32(mkl_lt_bias_);
}
std::vector<int32> strides_;
Padding padding_;
TensorFormat data_format_;
MklConv2DOpParams mkl_params_;
dnnPrimitive_t mkl_prim_convolution_fwd_ = nullptr;
void* mkl_conv_res_[dnnResourceNumber];
dnnLayout_t mkl_lt_filter_ = nullptr, mkl_lt_bias_ = nullptr,
mkl_lt_input_ = nullptr;
};
#define REGISTER_MKL_CPU(T) \
REGISTER_KERNEL_BUILDER( \
Name("MklConv2D").Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.Label(mkl_layer_registry::kMklLayerLabel), \
MklConv2DOp<CPUDevice, T, false>); \
REGISTER_KERNEL_BUILDER( \
Name("MklConv2DWithBias").Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.Label(mkl_layer_registry::kMklLayerLabel), \
MklConv2DOp<CPUDevice, T, true>);
TF_CALL_float(REGISTER_MKL_CPU);
} // namespace tensorflow
#endif // INTEL_MKL
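
MklConv2DOp::Compute above derives the output spatial dimensions and padding via GetWindowedOutputSize before sizing the MKL primitives. The following standalone sketch reproduces that SAME/VALID arithmetic, assuming the standard ceil-based formulas; the names and enum here are illustrative, not TensorFlow APIs.

// conv_output_size_sketch.cc -- standalone sketch, not part of TensorFlow.
#include <algorithm>
#include <cstdio>

enum class Padding { kValid, kSame };

// Computes the output extent and the "before" padding for one spatial dim.
void WindowedOutputSize(int in, int filter, int stride, Padding padding,
                        int* out, int* pad_before) {
  if (padding == Padding::kValid) {
    *out = (in - filter + stride) / stride;  // == ceil((in - filter + 1) / stride)
    *pad_before = 0;
  } else {  // SAME
    *out = (in + stride - 1) / stride;       // == ceil(in / stride)
    const int pad_total = std::max((*out - 1) * stride + filter - in, 0);
    *pad_before = pad_total / 2;
  }
}

int main() {
  int out = 0, pad = 0;
  // e.g. a 10-wide input, 3-wide filter, stride 2.
  WindowedOutputSize(10, 3, 2, Padding::kValid, &out, &pad);
  std::printf("VALID: out=%d pad=%d\n", out, pad);  // out=4 pad=0
  WindowedOutputSize(10, 3, 2, Padding::kSame, &out, &pad);
  std::printf("SAME : out=%d pad=%d\n", out, pad);  // out=5 pad=0
  return 0;
}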

View File

@ -0,0 +1,135 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifdef INTEL_MKL
#include <vector>
#include <algorithm>
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/platform/macros.h"
#include "third_party/mkl/include/mkl_dnn_types.h"
#include "third_party/mkl/include/mkl_dnn.h"
#include "tensorflow/core/util/mkl_util.h"
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
///////////////////////////////////////////////////////////
// Op kernel
///////////////////////////////////////////////////////////
template <typename Device, typename T>
class MklToTfOp : public OpKernel {
public:
explicit MklToTfOp(OpKernelConstruction* context) : OpKernel(context) {
OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
OP_REQUIRES_OK(context, context->GetAttr("T", &op_data_type));
}
void Compute(OpKernelContext* context) override {
// 1. Check that input tensor is in MKL format.
const Tensor& input_tensor = MklGetInput(context, 0);
MklShape input_shape;
GetMklShape(context, 0, &input_shape);
// if input is already in Tf format, then just copy input tensor to output.
if (!input_shape.IsMklTensor()) {
context->set_output(0, input_tensor);
VLOG(1) << "MKLToTFConversion: No conversion needed, "
<< "copying input to output";
return;
}
// Check that input data type is same as operator data type and that it is
// same as output data type.
DataType input_data_type = input_type(0);
DataType output_data_type = output_type(0);
CHECK_EQ(op_data_type, input_data_type);
CHECK_EQ(op_data_type, output_data_type);
// We need to recreate Tf tensor shape based on sizes and strides.
// Ideally, we should know what the data_format is, but that attribute
// to this op is not reliable. So below, we rely on sorting logic where
// we sort by strides first and then by sizes.
TensorShape output_shape;
std::vector<std::pair<int, int>> shape_size;
for (size_t i = 0; i < input_shape.GetDimension(); i++) {
VLOG(1) << "Size: " << input_shape.GetSizes()[i]
<< ", Strides: " << input_shape.GetStrides()[i];
shape_size.push_back(std::make_pair(input_shape.GetSizes()[i],
input_shape.GetStrides()[i]));
}
std::sort(shape_size.begin(), shape_size.end(), [](
std::pair<int, int > a, std::pair<int, int> b) {
return (a.second > b.second) ||
(a.second == b.second && a.first > b.first);
});
for (std::pair<int, int> s_s : shape_size) {
VLOG(1) << "Added dimension: " << s_s.first;
output_shape.AddDim(s_s.first);
}
// Allocate output tensor.
Tensor* output_tensor = NULL;
OP_REQUIRES_OK(context,
context->allocate_output(0, output_shape, &output_tensor));
// 3. Get input and output layout pointers.
dnnLayout_t output_layout = static_cast<dnnLayout_t>(
input_shape.GetTfLayout());
// 4. Execute DNNConversion.
void *input_buffer = static_cast<void*>(const_cast<T*>(
input_tensor.flat<T>().data()));
void *output_buffer = static_cast<void*>(const_cast<T*>(
output_tensor->flat<T>().data()));
input_shape.GetConvertedFlatData(output_layout, input_buffer,
output_buffer);
VLOG(1) << "MKLToTFConversion complete successfully.";
}
private:
/// Data format of the operation
string data_format_str;
/// Data type of the operation
DataType op_data_type;
};
///////////////////////////////////////////////////////////
// Register kernel
///////////////////////////////////////////////////////////
#define REGISTER_CPU(T) \
REGISTER_KERNEL_BUILDER( \
Name("MklToTf").Device(DEVICE_CPU).TypeConstraint<T>("T") \
.Label(mkl_layer_registry::kMklLayerLabel), \
MklToTfOp<CPUDevice, T>);
TF_CALL_float(REGISTER_CPU);
#undef REGISTER_CPU
} // namespace tensorflow
#endif /* INTEL_MKL */
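
As noted in MklToTfOp::Compute above, the TF shape is reconstructed by sorting the (size, stride) pairs by decreasing stride rather than trusting the data_format attribute. A minimal sketch of that ordering trick with hypothetical sizes and strides (standalone, not TensorFlow code):

// shape_from_strides_sketch.cc -- standalone sketch, not part of TensorFlow.
#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // Hypothetical 4-D tensor reported as (size, stride) pairs in MKL order.
  // Logical NHWC shape: 2 x 5 x 7 x 3.
  std::vector<std::pair<int, int>> shape_size = {
      {7, 3},    // W: stride 3
      {5, 21},   // H: stride 7*3
      {3, 1},    // C: innermost, stride 1
      {2, 105},  // N: outermost, stride 5*7*3
  };
  // Same comparator as in MklToTfOp::Compute: larger stride first.
  std::sort(shape_size.begin(), shape_size.end(),
            [](std::pair<int, int> a, std::pair<int, int> b) {
              return (a.second > b.second) ||
                     (a.second == b.second && a.first > b.first);
            });
  for (const auto& s : shape_size) std::printf("%d ", s.first);
  std::printf("\n");  // prints: 2 5 7 3
  return 0;
}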

View File

@ -0,0 +1,67 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// See docs in ../ops/array_ops.cc.
#ifdef INTEL_MKL
#define EIGEN_USE_THREADS
#include "tensorflow/core/kernels/transpose_op.h"
#include "tensorflow/core/kernels/transpose_functor.h"
#include "third_party/mkl/include/mkl_trans.h"
namespace tensorflow {
// output = TransposeOp(T<any> input, T<int32> perm) takes a tensor
// of type T and rank N, and a permutation of 0, 1, ..., N-1. It
// shuffles the dimensions of the input tensor according to permutation.
//
// Specifically, the returned tensor output meets the following condition:
// 1) output.dims() == input.dims();
// 2) output.dim_size(i) == input.dim_size(perm[i]);
// 3) output.tensor<T, N>(i_0, i_1, ..., i_N-1) ==
// input.tensor<T, N>(j_0, j_1, ..., j_N-1),
// where i_s == j_{perm[s]}
//
// REQUIRES: perm is a vector of int32.
// REQUIRES: input.dims() == perm.size().
// REQUIRES: perm is a permutation.
Status MklTransposeCpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
gtl::ArraySlice<int32> perm,
Tensor* out) {
if (in.dims() == 2 && in.dtype() == DT_FLOAT) {
float* user_o = out->flat<float>().data();
const float* user_i = in.flat<float>().data();
// Documentation here: https://software.intel.com/en-us/node/520863
// Parameters: (ordering:row-major, operation:transpose, num_rows, num_cols,
// alpha (for scaling), array, dist_bet_adjacent_cols/rows
// (source), array, dist_bet_adjacent_cols/rows (dest))
mkl_somatcopy('R', 'T', in.dim_size(0), in.dim_size(1), 1,
user_i, in.dim_size(1),
user_o, in.dim_size(0));
return Status::OK();
}
// Fallback to eigen if transpose parameters not supported by MKL
typedef Eigen::ThreadPoolDevice CPUDevice;
return ::tensorflow::DoTranspose(ctx->eigen_device<CPUDevice>(), in, perm,
out);
} // MklTransposeCpuOp::DoTranspose
} // namespace tensorflow
#endif // INTEL_MKL
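
For reference, the mkl_somatcopy('R', 'T', ...) call above performs an out-of-place transpose of a row-major matrix, with leading dimension in.dim_size(1) for the source and in.dim_size(0) for the destination. A plain C++ sketch of the equivalent operation (standalone, not TensorFlow code):

// transpose_reference_sketch.cc -- standalone sketch, not part of TensorFlow.
#include <cstdio>

// Transposes a row-major rows x cols matrix into a cols x rows matrix.
void TransposeRowMajor(const float* in, int rows, int cols, float* out) {
  for (int r = 0; r < rows; ++r)
    for (int c = 0; c < cols; ++c)
      out[c * rows + r] = in[r * cols + c];
}

int main() {
  const float in[2 * 3] = {1, 2, 3,
                           4, 5, 6};
  float out[3 * 2] = {};
  TransposeRowMajor(in, 2, 3, out);
  for (int i = 0; i < 6; ++i) std::printf("%.0f ", out[i]);  // 1 4 2 5 3 6
  std::printf("\n");
  return 0;
}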

View File

@ -64,6 +64,8 @@ PoolParameters::PoolParameters(OpKernelContext* context,
OP_REQUIRES_OK(
context, GetWindowedOutputSize(tensor_in_cols, window_cols, col_stride,
padding, &out_width, &pad_cols));
pad_depth = 0;
out_depth = depth;
} else {
// Our current version of depthwise max pooling does not support
// any padding, and expects the depth_window to equal the

View File

@ -66,9 +66,7 @@ class ResizeNearestNeighborOp : public OpKernel {
const int64 in_x =
std::min(static_cast<int64>(floorf(x * st.width_scale)),
(st.in_width - 1));
for (int c = 0; c < st.channels; ++c) {
output_data(b, y, x, c) = input_data(b, in_y, in_x, c);
}
std::copy_n(&input_data(b, in_y, in_x, 0), st.channels, &output_data(b, y, x, 0));
}
}
}

View File

@ -21,7 +21,8 @@ limitations under the License.
namespace tensorflow {
static Graph* BM_ResizeNearestNeighbor(int batches, int width, int height) {
static Graph* BM_Resize(const char* algorithm,
int batches, int width, int height) {
Graph* g = new Graph(OpRegistry::Global());
Tensor in(DT_FLOAT, TensorShape({batches, width, height, 3}));
in.flat<float>().setRandom();
@ -32,21 +33,26 @@ static Graph* BM_ResizeNearestNeighbor(int batches, int width, int height) {
out_size_flat(1) = height * 2;
Node* ret;
NodeBuilder(g->NewName("n"), "ResizeNearestNeighbor")
.Input(test::graph::Constant(g, in))
.Input(test::graph::Constant(g, out_size))
.Finalize(g, &ret);
Status s = NodeBuilder(g->NewName("n"), algorithm)
.Input(test::graph::Constant(g, in))
.Input(test::graph::Constant(g, out_size))
.Finalize(g, &ret);
assert(s.ok());
return g;
}
#define BM_ResizeNearestNeighborDev(DEVICE, B, W, H) \
static void BM_ResizeNearestNeighbor_##DEVICE##_##B##_##W##_##H(int iters) { \
#define BM_ResizeDev(DEVICE, ALGORITHM, B, W, H) \
static void BM_Resize_##ALGORITHM##_##DEVICE##_##B##_##W##_##H(int iters) { \
testing::ItemsProcessed(iters* B* W* H * 3); \
test::Benchmark(#DEVICE, BM_ResizeNearestNeighbor(B, W, H)).Run(iters); \
test::Benchmark(#DEVICE, BM_Resize(#ALGORITHM, B, W, H)).Run(iters); \
} \
BENCHMARK(BM_ResizeNearestNeighbor_##DEVICE##_##B##_##W##_##H)
BENCHMARK(BM_Resize_##ALGORITHM##_##DEVICE##_##B##_##W##_##H)
BM_ResizeNearestNeighborDev(cpu, 1, 499, 499);
BM_ResizeNearestNeighborDev(gpu, 1, 499, 499);
BM_ResizeDev(cpu, ResizeNearestNeighbor, 10, 499, 499);
BM_ResizeDev(gpu, ResizeNearestNeighbor, 10, 499, 499);
BM_ResizeDev(cpu, ResizeBilinear, 10, 499, 499);
BM_ResizeDev(gpu, ResizeBilinear, 10, 499, 499);
} // namespace tensorflow

View File

@ -180,6 +180,20 @@ Status TransposeCpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
out);
}
#ifdef INTEL_MKL
#define REGISTER(T) \
REGISTER_KERNEL_BUILDER(Name("Transpose") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.TypeConstraint<int32>("Tperm") \
.HostMemory("perm"), \
MklTransposeCpuOp);
TF_CALL_ALL_TYPES(REGISTER);
REGISTER(bfloat16);
#undef REGISTER
#else // INTEL_MKL
#define REGISTER(T) \
REGISTER_KERNEL_BUILDER(Name("Transpose") \
.Device(DEVICE_CPU) \
@ -190,6 +204,7 @@ Status TransposeCpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
TF_CALL_ALL_TYPES(REGISTER)
REGISTER(bfloat16);
#undef REGISTER
#endif // INTEL_MKL
#if GOOGLE_CUDA
Status TransposeGpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,

View File

@ -41,6 +41,17 @@ class TransposeCpuOp : public TransposeOp {
gtl::ArraySlice<int32> perm, Tensor* out) override;
};
#ifdef INTEL_MKL
class MklTransposeCpuOp : public TransposeOp {
public:
explicit MklTransposeCpuOp(OpKernelConstruction* ctx) : TransposeOp(ctx) {}
protected:
Status DoTranspose(OpKernelContext* ctx, const Tensor& in,
gtl::ArraySlice<int32> perm, Tensor* out) override;
};
#endif // INTEL_MKL
class TransposeGpuOp : public TransposeOp {
public:
explicit TransposeGpuOp(OpKernelConstruction* ctx) : TransposeOp(ctx) {}

View File

@ -2502,4 +2502,45 @@ scale_after_normalization: A bool indicating whether the resulted tensor
needs to be multiplied with gamma.
)doc");
#ifdef INTEL_MKL
REGISTER_OP("MklConv2D")
.Input("input: T")
.Input("mkl_input: uint8")
.Input("filter: T")
.Input("mkl_filter: uint8")
.Output("output: T")
.Output("mkl_output: uint8")
.Attr("T: {half, float, double}")
.Attr("strides: list(int)")
.Attr("use_cudnn_on_gpu: bool = true")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString())
.SetShapeFn(shape_inference::Conv2DShape)
.Doc(R"doc(
MKL version of Conv2D
)doc");
REGISTER_OP("MklConv2DWithBias")
.Input("input: T")
.Input("mkl_input: uint8")
.Input("filter: T")
.Input("mkl_filter: uint8")
.Input("bias: T")
.Input("mkl_bias: uint8")
.Output("output: T")
.Output("mkl_output: uint8")
.Attr("T: {half, float, double}")
.Attr("strides: list(int)")
.Attr("use_cudnn_on_gpu: bool = true")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString());
REGISTER_OP("MklToTf")
.Input("input: T")
.Input("mkl_input: uint8")
.Output("output: T")
.Attr("T: {half, float, double}")
.Attr(GetConvnetDataFormatAttrString());
#endif // INTEL_MKL
} // namespace tensorflow

View File

@ -25758,6 +25758,59 @@ op {
summary: "Computes the sum along segments of a tensor."
description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n`(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such\nthat `segment_ids[j...] == i`. Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\nrange of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/UnsortedSegmentSum.png\" alt>\n</div>"
}
op {
name: "UnsortedSegmentSum"
input_arg {
name: "data"
type_attr: "T"
}
input_arg {
name: "segment_ids"
description: "A tensor whose shape is a prefix of `data.shape`."
type_attr: "Tindices"
}
input_arg {
name: "num_segments"
type: DT_INT32
}
output_arg {
name: "output"
description: "Has same shape as data, except for the first `segment_ids.rank`\ndimensions, which are replaced with a single dimension which has size\n`num_segments`."
type_attr: "T"
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
type: DT_INT64
type: DT_INT32
type: DT_UINT8
type: DT_UINT16
type: DT_INT16
type: DT_INT8
type: DT_QINT8
type: DT_QUINT8
type: DT_QINT32
type: DT_HALF
}
}
}
attr {
name: "Tindices"
type: "type"
allowed_values {
list {
type: DT_INT32
type: DT_INT64
}
}
}
summary: "Computes the max along segments of a tensor."
description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n\\\\(output_i = \\sum_j data_j\\\\) where sum is over `j` such\nthat `segment_ids[j] == i`. Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\n range of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/UnsortedSegmentSum.png\" alt>\n</div>"
}
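# The entry above describes output[i] as the sum of data[j] over all j with
# segment_ids[j] == i, with segment ids that never occur producing 0. A minimal
# standalone C++ sketch of that semantics for 1-D data follows (illustrative
# only, not TensorFlow code):
#
#   std::vector<float> UnsortedSegmentSum(const std::vector<float>& data,
#                                         const std::vector<int>& segment_ids,
#                                         int num_segments) {
#     std::vector<float> output(num_segments, 0.0f);
#     for (size_t j = 0; j < data.size(); ++j) output[segment_ids[j]] += data[j];
#     return output;
#   }
#
#   // UnsortedSegmentSum({1, 2, 3, 4}, {3, 0, 3, 0}, /*num_segments=*/4)
#   // yields {6, 0, 0, 4}: ids need not be sorted, and id 1 and 2 stay 0.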
op {
name: "Unstage"
output_arg {

View File

@ -4,11 +4,6 @@ load("@protobuf//:protobuf.bzl", "cc_proto_library")
load("@protobuf//:protobuf.bzl", "py_proto_library")
load("//tensorflow:tensorflow.bzl", "if_not_mobile")
# configure may change the following lines
WITH_GCP_SUPPORT = False
WITH_HDFS_SUPPORT = False
WITH_JEMALLOC = True
# Appends a suffix to a list of deps.
def tf_deps(deps, suffix):
tf_deps = []
@ -196,61 +191,54 @@ def tf_additional_test_srcs():
def tf_kernel_tests_linkstatic():
return 0
# jemalloc only enabled on Linux for now.
# TODO(jhseu): Enable on other platforms.
def tf_additional_lib_defines():
defines = []
if WITH_JEMALLOC:
defines += select({
"//tensorflow:linux_x86_64": [
"TENSORFLOW_USE_JEMALLOC"
],
"//conditions:default": [],
})
return defines
return select({
"//tensorflow:with_jemalloc": ["TENSORFLOW_USE_JEMALLOC"],
"//conditions:default": [],
})
def tf_additional_lib_deps():
deps = []
if WITH_JEMALLOC:
deps += select({
"//tensorflow:linux_x86_64": ["@jemalloc"],
"//conditions:default": [],
})
return deps
return select({
"//tensorflow:with_jemalloc": ["@jemalloc"],
"//conditions:default": [],
})
def tf_additional_core_deps():
deps = []
if WITH_GCP_SUPPORT:
deps.append("//tensorflow/core/platform/cloud:gcs_file_system")
if WITH_HDFS_SUPPORT:
deps.append("//tensorflow/core/platform/hadoop:hadoop_file_system")
return deps
return select({
"//tensorflow:with_gcp_support": [
"//tensorflow/core/platform/cloud:gcs_file_system",
],
"//conditions:default": [],
}) + select({
"//tensorflow:with_hdfs_support": [
"//tensorflow/core/platform/hadoop:hadoop_file_system",
],
"//conditions:default": [],
})
# TODO(jart, jhseu): Delete when GCP is default on.
def tf_additional_cloud_op_deps():
deps = []
if WITH_GCP_SUPPORT:
deps = select({
return select({
"//tensorflow:windows": [],
"//tensorflow:android": [],
"//tensorflow:ios": [],
"//conditions:default":
["//tensorflow/contrib/cloud:bigquery_reader_ops_op_lib"],
})
return deps
"//tensorflow:with_gcp_support": [
"//tensorflow/contrib/cloud:bigquery_reader_ops_op_lib",
],
"//conditions:default": [],
})
# TODO(jart, jhseu): Delete when GCP is default on.
def tf_additional_cloud_kernel_deps():
deps = []
if WITH_GCP_SUPPORT:
deps = select({
return select({
"//tensorflow:windows": [],
"//tensorflow:android": [],
"//tensorflow:ios": [],
"//conditions:default":
["//tensorflow/contrib/cloud/kernels:bigquery_reader_ops"],
})
return deps
"//tensorflow:with_gcp_support": [
"//tensorflow/contrib/cloud/kernels:bigquery_reader_ops",
],
"//conditions:default": [],
})
def tf_lib_proto_parsing_deps():
return [

View File

@ -2,8 +2,6 @@
# The functions in this file might be referred by tensorflow.bzl. They have to
# be separate to avoid cyclic references.
WITH_XLA_SUPPORT = False
def tf_cuda_tests_tags():
return ["local"]
@ -11,16 +9,16 @@ def tf_sycl_tests_tags():
return ["local"]
def tf_additional_plugin_deps():
deps = []
if WITH_XLA_SUPPORT:
deps.append("//tensorflow/compiler/jit")
return deps
return select({
"//tensorflow:with_xla_support": ["//tensorflow/compiler/jit"],
"//conditions:default": [],
})
def tf_additional_xla_deps_py():
return []
def tf_additional_license_deps():
licenses = []
if WITH_XLA_SUPPORT:
licenses.append("@llvm//:LICENSE.TXT")
return licenses
return select({
"//tensorflow:with_xla_support": ["@llvm//:LICENSE.TXT"],
"//conditions:default": [],
})

View File

@ -58,6 +58,7 @@ class LibHDFS {
std::function<hdfsFS(hdfsBuilder*)> hdfsBuilderConnect;
std::function<hdfsBuilder*()> hdfsNewBuilder;
std::function<void(hdfsBuilder*, const char*)> hdfsBuilderSetNameNode;
std::function<int(const char*, char**)> hdfsConfGetStr;
std::function<void(hdfsBuilder*, const char* kerbTicketCachePath)>
hdfsBuilderSetKerbTicketCachePath;
std::function<int(hdfsFS, hdfsFile)> hdfsCloseFile;
@ -85,6 +86,7 @@ class LibHDFS {
BIND_HDFS_FUNC(hdfsBuilderConnect);
BIND_HDFS_FUNC(hdfsNewBuilder);
BIND_HDFS_FUNC(hdfsBuilderSetNameNode);
BIND_HDFS_FUNC(hdfsConfGetStr);
BIND_HDFS_FUNC(hdfsBuilderSetKerbTicketCachePath);
BIND_HDFS_FUNC(hdfsCloseFile);
BIND_HDFS_FUNC(hdfsPread);
@ -147,6 +149,18 @@ Status HadoopFileSystem::Connect(StringPiece fname, hdfsFS* fs) {
hdfsBuilder* builder = hdfs_->hdfsNewBuilder();
if (scheme == "file") {
hdfs_->hdfsBuilderSetNameNode(builder, nullptr);
} else if (scheme == "viewfs") {
char *defaultFS = NULL;
hdfs_->hdfsConfGetStr("fs.defaultFS", &defaultFS);
StringPiece defaultScheme, defaultCluster, defaultPath;
io::ParseURI(defaultFS, &defaultScheme, &defaultCluster, &defaultPath);
if (scheme != defaultScheme || namenode != defaultCluster) {
return errors::Unimplemented("viewfs is only supported as a fs.defaultFS.");
}
// The default NameNode configuration will be used (from the XML configuration files). See:
// https://github.com/tensorflow/tensorflow/blob/v1.0.0/third_party/hadoop/hdfs.h#L259
hdfs_->hdfsBuilderSetNameNode(builder, "default");
} else {
hdfs_->hdfsBuilderSetNameNode(builder, nn.c_str());
}
@ -478,5 +492,6 @@ Status HadoopFileSystem::Stat(const string& fname, FileStatistics* stats) {
}
REGISTER_FILE_SYSTEM("hdfs", HadoopFileSystem);
REGISTER_FILE_SYSTEM("viewfs", HadoopFileSystem);
} // namespace tensorflow

View File

@ -53,6 +53,17 @@ limitations under the License.
#define TF_SCANF_ATTRIBUTE(string_index, first_to_check)
#endif
// Control visibility outside .so
#if defined(COMPILER_MSVC)
# ifdef TF_COMPILE_LIBRARY
# define TF_EXPORT __declspec(dllexport)
# else
# define TF_EXPORT __declspec(dllimport)
# endif // TF_COMPILE_LIBRARY
#else
# define TF_EXPORT __attribute__((visibility("default")))
#endif // COMPILER_MSVC
// GCC can be told that a certain branch is not likely to be taken (for
// instance, a CHECK failure), and use that information in static analysis.
// Giving it this information can help it optimize for the common case in

View File

@ -16,6 +16,9 @@ limitations under the License.
#ifndef TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_
#define TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_
// included so __cpuidex function is available for GETCPUID on Windows
#include <intrin.h>
// Byte order defines provided by gcc. MSVC doesn't define those so
// we define them here.
// We assume that all Windows platforms out there are little endian.

View File

@ -24,6 +24,9 @@ limitations under the License.
#include "tensorflow/core/platform/types.h"
#define _mm_load_pd1 _mm_load1_pd
// only define these intrinsics if immintrin.h doesn't have them (VS2015 and earlier)
#if _MSC_VER < 1910
static inline int
_mm256_extract_epi32(__m256i a, const int i)
{
@ -39,3 +42,4 @@ _mm256_insert_epi32(__m256i a, int b, const int i)
}
#endif
#endif
#endif

View File

@ -230,11 +230,9 @@ Status WindowsFileSystem::NewRandomAccessFile(
result->reset();
// Open the file for read-only random access
// Random access is to disable read-ahead as the system reads too much data
// Open in async mode which makes Windows allow more parallelism even
// if we need to do sync I/O on top of it.
DWORD file_flags = FILE_ATTRIBUTE_READONLY | FILE_FLAG_RANDOM_ACCESS |
FILE_FLAG_OVERLAPPED;
DWORD file_flags = FILE_ATTRIBUTE_READONLY | FILE_FLAG_OVERLAPPED;
// Shared access is necessary for tests to pass
// almost all tests would work with a possible exception of fault_injection.
DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
@ -306,8 +304,8 @@ Status WindowsFileSystem::NewReadOnlyMemoryRegionFromFile(
result->reset();
Status s = Status::OK();
// Open the file for read-only random access
DWORD file_flags = FILE_ATTRIBUTE_READONLY | FILE_FLAG_RANDOM_ACCESS;
// Open the file for read-only
DWORD file_flags = FILE_ATTRIBUTE_READONLY;
// Open in async mode which makes Windows allow more parallelism even
// if we need to do sync I/O on top of it.

View File

@ -0,0 +1,296 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
#define TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
#ifdef INTEL_MKL
#include "third_party/mkl/include/mkl_dnn.h"
#include "third_party/mkl/include/mkl_dnn_types.h"
#include "third_party/mkl/include/mkl_service.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
// The file contains a number of utility classes and functions used by MKL
// enabled kernels
namespace tensorflow {
// This class encapsulates all the meta data that is associated with an MKL
// tensor. A tensor is an MKL tensor if it was created as the result of an
// MKL operation, and did not go through a conversion to a standard
// Tensorflow tensor.
class MklShape {
public:
MklShape() {}
TF_DISALLOW_COPY_AND_ASSIGN(MklShape); // Cannot copy
~MklShape() {
if (sizes_) delete[] sizes_;
if (strides_) delete[] strides_;
if (mklLayout_) CHECK_EQ(dnnLayoutDelete_F32(mklLayout_), E_SUCCESS);
if (tfLayout_) CHECK_EQ(dnnLayoutDelete_F32(tfLayout_), E_SUCCESS);
}
const bool IsMklTensor() const { return isMklTensor_; }
void SetMklTensor(const bool isMklTensor) { isMklTensor_ = isMklTensor; }
void SetMklLayout(const void* primitive, size_t resourceType) {
CHECK_EQ(
dnnLayoutCreateFromPrimitive_F32(&mklLayout_, (dnnPrimitive_t)primitive,
(dnnResourceType_t)resourceType),
E_SUCCESS);
}
void SetTfLayout(const size_t dimension, const size_t* sizes,
const size_t* strides) {
dimension_ = dimension;
if (dimension > 0) { // MKL doesn't support dimension 0
sizes_ = new size_t[dimension];
strides_ = new size_t[dimension];
for (int ii = 0; ii < dimension; ii++) {
sizes_[ii] = sizes[ii];
strides_[ii] = strides[ii];
}
CHECK_EQ(dnnLayoutCreate_F32(&tfLayout_, dimension, sizes, strides),
E_SUCCESS);
}
}
const dnnLayout_t GetMklLayout() const { return mklLayout_; }
const dnnLayout_t GetTfLayout() const { return tfLayout_; }
const dnnLayout_t GetCurLayout() const {
return isMklTensor_ ? mklLayout_ : tfLayout_;
}
size_t GetDimension() const { return dimension_; }
const size_t* GetSizes() const { return sizes_; }
const size_t* GetStrides() const { return strides_; }
void GetConvertedFlatData(dnnLayout_t targetLayout, void* input,
void* output) const {
dnnLayout_t curLayout;
if (isMklTensor_)
curLayout = mklLayout_;
else
curLayout = tfLayout_;
dnnPrimitive_t convert;
CHECK_EQ(dnnConversionCreate_F32(&convert, curLayout, targetLayout),
E_SUCCESS);
CHECK_EQ(dnnConversionExecute_F32(convert, input, output), E_SUCCESS);
CHECK_EQ(dnnDelete_F32(convert), E_SUCCESS);
}
// The following methods are used for serializing and de-serializing the
// contents of the mklshape object.
// The data is serialized in this order
// isMklTensor_
// dimension_
// sizes
// strides
// mklLayout_
// tfLayout_
#define SIZE_OF_MKL_DNN_BUF \
(dnnLayoutSerializationBufferSize_F32()) // Size of buffer needed to
// serialize dnn_layout pointer
// Size of buffer to hold the serialized object, the size is computed as follows
// sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes) + sizeof(strides)
// + sizeof(mklLayout_ buffer) + sizeof(tfLayout_ buffer)
#define SIZE_OF_MKL_SERIAL_DATA(dims) \
(2 * sizeof(size_t) + 2 * dims * sizeof(size_t) + 2 * SIZE_OF_MKL_DNN_BUF)
// First we need to define some macros for offsets into the serial buffer where
// different elements of MklShape are written/read from
#define IS_MKL_TENSOR_OFFSET 0
// Location from start of buffer where isMklTensor_ is serialized
#define DIMS_OFFSET \
(IS_MKL_TENSOR_OFFSET + sizeof(size_t)) // Location of dimension_
#define SIZES_OFFSET(dims) \
(DIMS_OFFSET + \
sizeof(size_t)) // Location of sizes. Note dim is not used here, left here
// to make macros consistent.
#define STRIDES_OFFSET(dims) \
(SIZES_OFFSET(dims) + dims * sizeof(size_t)) // Location of strides
#define MKL_LAYOUT_OFFSET(dims) \
(STRIDES_OFFSET(dims) + dims * sizeof(size_t)) // Location of mklLayout_
#define TF_LAYOUT_OFFSET(dims) \
(MKL_LAYOUT_OFFSET(dims) + SIZE_OF_MKL_DNN_BUF) // Location of tfLayout_
// TODO(agramesh1) make sure to create a const to share with rewrite pass
// for min size of MKL metadata tensor.
void DeSerializeMklShape(const unsigned char* buf, size_t buf_size) {
CHECK(buf_size >= sizeof(size_t)) << "Bufsize too small in DeSerialize";
// Make sure buffer holds at least isMklTensor_
isMklTensor_ =
*reinterpret_cast<const size_t*>(buf + IS_MKL_TENSOR_OFFSET) != 0;
if (isMklTensor_) { // If it is an MKL Tensor then read the rest
dimension_ = *(reinterpret_cast<const size_t*>(buf + DIMS_OFFSET));
CHECK(buf_size >= SIZE_OF_MKL_SERIAL_DATA(dimension_))
<< "Bufsize too small in DeSerialize";
sizes_ = new size_t[dimension_];
strides_ = new size_t[dimension_];
for (int i = 0; i < dimension_; i++) {
sizes_[i] =
reinterpret_cast<const size_t*>(buf + SIZES_OFFSET(dimension_))[i];
strides_[i] = reinterpret_cast<const size_t*>(
buf + STRIDES_OFFSET(dimension_))[i];
}
CHECK_EQ(dnnLayoutDeserialize_F32(&mklLayout_,
buf + MKL_LAYOUT_OFFSET(dimension_)),
E_SUCCESS);
CHECK_EQ(dnnLayoutDeserialize_F32(&tfLayout_,
buf + TF_LAYOUT_OFFSET(dimension_)),
E_SUCCESS);
}
}
void SerializeMklShape(unsigned char* buf, size_t buf_size) const {
CHECK(buf_size >= SIZE_OF_MKL_SERIAL_DATA(dimension_))
<< "Bufsize too small to Serialize";
*reinterpret_cast<size_t*>(buf + IS_MKL_TENSOR_OFFSET) =
isMklTensor_ ? 1 : 0;
if (isMklTensor_) {
*(reinterpret_cast<size_t*>(buf + DIMS_OFFSET)) = dimension_;
for (int i = 0; i < dimension_; i++) {
reinterpret_cast<size_t*>(buf + SIZES_OFFSET(dimension_))[i] =
sizes_[i];
reinterpret_cast<size_t*>(buf + STRIDES_OFFSET(dimension_))[i] =
strides_[i];
}
CHECK_EQ(dnnLayoutSerialize_F32(mklLayout_,
buf + MKL_LAYOUT_OFFSET(dimension_)),
E_SUCCESS);
CHECK_EQ(
dnnLayoutSerialize_F32(tfLayout_, buf + TF_LAYOUT_OFFSET(dimension_)),
E_SUCCESS);
}
}
private:
bool isMklTensor_ =
false; // Flag to indicate if the tensor is an MKL tensor or not
dnnLayout_t mklLayout_ = nullptr; // Pointer to the MKL layout
dnnLayout_t tfLayout_ = nullptr; // Pointer to layout of corresponding
// TensorFlow tensor, used when converting from MKL to a standard tensor
size_t dimension_ = 0;
size_t* sizes_ = nullptr; // Required by MKL for conversions
size_t* strides_ = nullptr; // Required by MKL for conversions
};
int inline GetTensorDataIndex(int n) {
return 2 * n; // index corresponding to nth input/output tensor
}
int inline GetTensorMetaDataIndex(int n) {
// index corresponding to meta data of nth input/output tensor
return 2 * n + 1;
}
// Get the MKL shape from the second string tensor
inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) {
mklshape->DeSerializeMklShape(
ctext->input(GetTensorMetaDataIndex(n)).flat<uint8>().data(),
ctext->input(GetTensorMetaDataIndex(n)).flat<uint8>().size() *
sizeof(uint8));
}
// Gets the actual input
inline const Tensor& MklGetInput(OpKernelContext* ctext, int n) {
return ctext->input(GetTensorDataIndex(n));
}
// Allocate the output tensor, create a second output tensor that will contain
// the MKL shape serialized
inline void AllocateOutputSetMklshape(OpKernelContext* ctext, int n,
Tensor** output,
const TensorShape& tfshape,
const MklShape& mklshape) {
Tensor* second_tensor = nullptr;
TensorShape second_shape;
second_shape.AddDim(SIZE_OF_MKL_SERIAL_DATA(mklshape.GetDimension()));
OP_REQUIRES_OK(
ctext, ctext->allocate_output(GetTensorDataIndex(n), tfshape, output));
OP_REQUIRES_OK(ctext, ctext->allocate_output(GetTensorMetaDataIndex(n),
second_shape, &second_tensor));
mklshape.SerializeMklShape(
second_tensor->flat<uint8>().data(),
second_tensor->flat<uint8>().size() * sizeof(uint8));
}
// Allocates a temp tensor and returns the data buffer for temporary storage.
// Currently we only support F32; this will need to be templatized if other
// types are added
inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out,
dnnLayout_t lt_buff, void** buf_out) {
TensorShape tf_shape;
tf_shape.AddDim(
dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(lt_buff)) /
sizeof(float) +
1);
OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<float>::v(),
tf_shape, tensor_out));
*buf_out = static_cast<void*>(tensor_out->flat<float>().data());
}
inline void GetStridesFromSizes(TensorFormat data_format, size_t* strides,
const size_t* sizes) {
// MKL requires strides in NCHW
if (data_format == FORMAT_NHWC) {
strides[0] = sizes[2];
strides[1] = sizes[0] * sizes[2];
strides[2] = 1;
strides[3] = sizes[0] * sizes[1] * sizes[2];
} else {
strides[0] = 1;
strides[1] = sizes[0];
strides[2] = sizes[0] * sizes[1];
strides[3] = sizes[0] * sizes[1] * sizes[2];
}
}
namespace mkl_layer_registry {
static const char* kMklLayerLabel = "MklLayer";
static const string kMklLayerLabelPattern = "label='MklLayer'";
// Check whether opname is registered as MKL-compliant in the registry.
//
// @input: name of the op
// @return: true if opname is registered as Mkl layer op
static inline bool IsMklLayer(const std::string& op_name) {
string kernel = KernelsRegisteredForOp(op_name);
return kernel.find(kMklLayerLabelPattern) != string::npos;
}
} // namespace mkl_layer_registry
} // namespace tensorflow
#endif // INTEL_MKL
#endif // TENSORFLOW_CORE_UTIL_MKL_UTIL_H_

View File

@ -1160,7 +1160,9 @@ for ZeroOut:
```
`c->set_output(0, c->input(0));` declares that the first output's shape should
be set to the first input's shape. There are a number of common shape functions
be set to the first input's shape. If the output is selected by its index, as in the above example, the second parameter of `set_output` should be a `ShapeHandle` object. You can create an empty `ShapeHandle` object with its default constructor; the `ShapeHandle` object for the input with index `idx` can be obtained by `c->input(idx)`.
There are a number of common shape functions
that apply to many ops, such as `shape_inference::UnchangedShape` which can be
found in [common_shape_fns.h](https://www.tensorflow.org/code/tensorflow/core/framework/common_shape_fns.h) and used as follows:
@ -1220,7 +1222,15 @@ particular dimension has a very specific value using `InferenceContext::Dim` and
`InferenceContext::WithValue`; you can specify that an output dimension is the
sum / product of two input dimensions using `InferenceContext::Add` and
`InferenceContext::Multiply`. See the `InferenceContext` class for
all of the various shape manipulations you can specify.
all of the various shape manipulations you can specify. The following example sets the
shape of the first output to (n, 3), where the first input has shape (n, ...):
```c++
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
c->set_output(0, c->Matrix(c->Dim(c->input(0), 0), 3));
return Status::OK();
});
```
If you have a complicated shape function, you should consider adding a test for
validating that various input shape combinations produce the expected output

View File

@ -374,7 +374,7 @@ estimator.fit(input_fn=input_fn, steps=1000)
# Here we evaluate how well our model did. In a real example, we would want
# to use a separate validation and testing data set to avoid overfitting.
estimator.evaluate(input_fn=input_fn)
print(estimator.evaluate(input_fn=input_fn))
```
When run, it produces
```

View File

@ -351,7 +351,7 @@ training.
```python
if step % 100 == 0:
print 'Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration)
print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
```
#### Visualize the Status
@ -421,19 +421,19 @@ the training and test datasets. The `do_eval()` function is called thrice, for
the training, validation, and test datasets.
```python
print 'Training Data Eval:'
print('Training Data Eval:')
do_eval(sess,
eval_correct,
images_placeholder,
labels_placeholder,
data_sets.train)
print 'Validation Data Eval:'
print('Validation Data Eval:')
do_eval(sess,
eval_correct,
images_placeholder,
labels_placeholder,
data_sets.validation)
print 'Test Data Eval:'
print('Test Data Eval:')
do_eval(sess,
eval_correct,
images_placeholder,

View File

@ -92,12 +92,12 @@ two following snippets of code are equivalent:
# Using `Session.run()`.
sess = tf.Session()
c = tf.constant(5.0)
print sess.run(c)
print(sess.run(c))
# Using `Tensor.eval()`.
c = tf.constant(5.0)
with tf.Session():
print c.eval()
print(c.eval())
```
In the second example, the session acts as a

View File

@ -144,6 +144,11 @@ specified list, of the variables in the graph. The saver object provides
methods to run these ops, specifying paths for the checkpoint files to write to
or read from.
Note that to restore a model checkpoint without a graph one must first import
the graph from the meta graph file (typical extension is `.meta`). This is
done with @{tf.train.import_meta_graph}, which in turn returns a `Saver` from
which one can then perform a `restore`.
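For example, a minimal sketch of this pattern (the checkpoint paths are hypothetical):

```python
import tensorflow as tf

with tf.Session() as sess:
  # Rebuild the graph from the exported meta graph, then restore the variables.
  saver = tf.train.import_meta_graph('/tmp/model.ckpt.meta')
  saver.restore(sess, '/tmp/model.ckpt')
```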
### Checkpoint Files
Variables are saved in binary files that, roughly, contain a map from variable

View File

@ -217,7 +217,7 @@ results = e.evaluate(input_fn=input_fn_test, steps=1)
# Print the stats for the evaluation.
for key in sorted(results):
print "%s: %s" % (key, results[key])
print("%s: %s" % (key, results[key]))
```
### Wide and deep learning

View File

@ -28,7 +28,7 @@ c = tf.matmul(a, b)
# Creates a session with log_device_placement set to True.
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
# Runs the op.
print sess.run(c)
print(sess.run(c))
```
You should see the following output:
@ -61,7 +61,7 @@ with tf.device('/cpu:0'):
# Creates a session with log_device_placement set to True.
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
# Runs the op.
print sess.run(c)
print(sess.run(c))
```
You will see that now `a` and `b` are assigned to `cpu:0`.
@ -131,7 +131,7 @@ with tf.device('/gpu:2'):
# Creates a session with log_device_placement set to True.
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
# Runs the op.
print sess.run(c)
print(sess.run(c))
```
If the device you have specified does not exist, you will get
@ -160,7 +160,7 @@ with tf.device('/gpu:2'):
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True, log_device_placement=True))
# Runs the op.
print sess.run(c)
print(sess.run(c))
```
## Using multiple GPUs
@ -182,7 +182,7 @@ with tf.device('/cpu:0'):
# Creates a session with log_device_placement set to True.
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
# Runs the op.
print sess.run(sum)
print(sess.run(sum))
```
You will see the following output.

View File

@ -188,7 +188,7 @@ def input_fn(df):
categorical_cols = {k: tf.SparseTensor(
indices=[[i, 0] for i in range(df[k].size)],
values=df[k].values,
shape=[df[k].size, 1])
dense_shape=[df[k].size, 1])
for k in CATEGORICAL_COLUMNS}
# Merges the two dictionaries into one.
feature_cols = dict(continuous_cols.items() + categorical_cols.items())
@ -261,6 +261,8 @@ learned through the model training process we'll go through later.
We'll do the similar trick to define the other categorical features:
```python
race = tf.contrib.layers.sparse_column_with_hash_bucket("race", hash_bucket_size=100)
marital_status = tf.contrib.layers.sparse_column_with_hash_bucket("marital_status", hash_bucket_size=100)
relationship = tf.contrib.layers.sparse_column_with_hash_bucket("relationship", hash_bucket_size=100)
workclass = tf.contrib.layers.sparse_column_with_hash_bucket("workclass", hash_bucket_size=100)
occupation = tf.contrib.layers.sparse_column_with_hash_bucket("occupation", hash_bucket_size=1000)
@ -377,7 +379,7 @@ the labels of the holdout data:
```python
results = m.evaluate(input_fn=eval_input_fn, steps=1)
for key in sorted(results):
print "%s: %s" % (key, results[key])
print("%s: %s" % (key, results[key]))
```
The first line of the output should be something like `accuracy: 0.83557522`,

View File

@ -255,7 +255,7 @@ After reading in the data, you can train and evaluate the model:
m.fit(input_fn=train_input_fn, steps=200)
results = m.evaluate(input_fn=eval_input_fn, steps=1)
for key in sorted(results):
print "%s: %s" % (key, results[key])
print("%s: %s" % (key, results[key]))
```
The first line of the output should be something like `accuracy: 0.84429705`. We

View File

@ -432,7 +432,7 @@ public class StylizeActivity extends CameraActivity implements OnImageAvailableL
// Everything else is 0, so just pick a suitable slider to push up when the
// selected one goes down.
if (adapter.items[lastOtherStyle] == slider) {
lastOtherStyle = lastOtherStyle + 1 % NUM_STYLES;
lastOtherStyle = (lastOtherStyle + 1) % NUM_STYLES;
}
adapter.items[lastOtherStyle].setValue(1.0f - value);
}

View File

@ -1,7 +1,7 @@
# TF Learn Examples
Learn is a high-level API for TensorFlow that allows you to create,
train, and use deep learning models easily. See the [Quickstart tutorial](../../g3doc/tutorials/tflearn/index.md)
train, and use deep learning models easily. See the [Quickstart tutorial](https://www.tensorflow.org/get_started/tflearn)
for an introduction to the API.
To run most of these examples, you need to install the `scikit learn` library (`sudo pip install sklearn`).

View File

@ -16,19 +16,22 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from sklearn import cross_validation
from sklearn import datasets
from sklearn import model_selection
from sklearn import metrics
from sklearn import preprocessing
import tensorflow as tf
def main(unused_argv):
# Load dataset
boston = tf.contrib.learn.datasets.load_dataset('boston')
boston = datasets.load_boston()
x, y = boston.data, boston.target
# Split dataset into train / test
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
x_train, x_test, y_train, y_test = model_selection.train_test_split(
x, y, test_size=0.2, random_state=42)
# Scale data (training set) to 0 mean and unit standard deviation.

View File

@ -17,7 +17,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from sklearn import datasets
from sklearn import metrics
from sklearn import model_selection
@ -26,7 +26,7 @@ import tensorflow as tf
def main(unused_argv):
# Load dataset.
iris = tf.contrib.learn.datasets.load_dataset('iris')
iris = datasets.load_iris()
x_train, x_test, y_train, y_test = model_selection.train_test_split(
iris.data, iris.target, test_size=0.2, random_state=42)

View File

@ -24,6 +24,7 @@ import numpy as np
import pandas
from sklearn import metrics
import tensorflow as tf
from tensorflow.contrib.layers.python.layers import encoders
learn = tf.contrib.learn
@ -37,7 +38,7 @@ n_words = 0
def bag_of_words_model(features, target):
"""A bag-of-words model. Note it disregards the word order in the text."""
target = tf.one_hot(target, 15, 1, 0)
features = tf.contrib.layers.bow_encoder(
features = encoders.bow_encoder(
features, vocab_size=n_words, embed_dim=EMBEDDING_SIZE)
logits = tf.contrib.layers.fully_connected(features, 15, activation_fn=None)
loss = tf.contrib.losses.softmax_cross_entropy(logits, target)

View File

@ -278,7 +278,7 @@
" tensor = n.attr['value'].tensor\n",
" size = len(tensor.tensor_content)\n",
" if size > max_const_size:\n",
" tensor.tensor_content = bytes(\"<stripped %d bytes>\"%size, 'utf-8')\n",
" tensor.tensor_content = bytes(\"<stripped %d bytes>\"%size)\n",
" return strip_def\n",
" \n",
"def rename_nodes(graph_def, rename_func):\n",

View File

@ -62,7 +62,7 @@ print('Data size', len(words))
vocabulary_size = 50000
def build_dataset(words):
def build_dataset(words, vocabulary_size):
count = [['UNK', -1]]
count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
dictionary = dict()
@ -81,7 +81,7 @@ def build_dataset(words):
reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
return data, count, dictionary, reverse_dictionary
data, count, dictionary, reverse_dictionary = build_dataset(words)
data, count, dictionary, reverse_dictionary = build_dataset(words, vocabulary_size)
del words # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
@ -181,7 +181,7 @@ with graph.as_default():
valid_embeddings, normalized_embeddings, transpose_b=True)
# Add variable initializer.
init = tf.initialize_all_variables()
init = tf.global_variables_initializer()
# Step 5: Begin training.
num_steps = 100001

View File

@ -20,11 +20,17 @@ go get github.com/golang/protobuf/proto
go get github.com/golang/protobuf/protoc-gen-go
cd $(dirname $0)
TF_DIR=${GOPATH}/src/github.com/tensorflow/tensorflow
PROTOC="${TF_DIR}/bazel-out/host/bin/external/protobuf/protoc"
for g in $(echo $GOPATH | sed "s/:/ /g"); do
TF_DIR="${g}/src/github.com/tensorflow/tensorflow"
PROTOC="${TF_DIR}/bazel-out/host/bin/external/protobuf/protoc"
if [ -x "${PROTOC}" ]; then
break
fi
done
if [ ! -x "${PROTOC}" ]
then
set +e
PATH_PROTOC=$(which protoc)
if [ ! -x "${PATH_PROTOC}" ]
then
@ -34,6 +40,7 @@ then
exit 1
fi
PROTOC=$PATH_PROTOC
set -e
fi
# Ensure that protoc-gen-go is available in $PATH

View File

@ -110,7 +110,7 @@ libraries will need to be built from source.
brew install swig
```
3. [Configure](https://www.tensorflow.org/get_started/os_setup#configure_the_installation)
3. [Configure](https://www.tensorflow.org/install/install_sources#configure_the_installation)
(e.g., enable GPU support) and build:
```sh
@ -120,8 +120,8 @@ libraries will need to be built from source.
//tensorflow/java:libtensorflow_jni
```
The JAR (`libtensorflow.jar`) and native library (`libtensorflow_jni.so`) will
be in `bazel-bin/tensorflow/java`.
The JAR (`libtensorflow.jar`) and native library (`libtensorflow_jni.so` on Linux or `libtensorflow_jni.dylib` on OS X) will
be in `bazel-bin/tensorflow/java`. Using these artifacts, follow steps 3 and 4 in the [quickstart](#quickstart) section to get your application up and running.
### Maven

View File

@ -27,7 +27,8 @@ package org.tensorflow;
public class SavedModelBundle implements AutoCloseable {
/**
* Load a saved model from an export directory.
* Load a saved model from an export directory. The model that is being loaded should have been created using
* the <a href="https://www.tensorflow.org/api_docs/python/tf/saved_model">Saved Model API</a>.
*
* @param exportDir the directory path containing a saved model.
* @param tags the tags identifying the specific metagraphdef to load.
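A rough Python-side sketch of producing such an export directory with the TF 1.0 `tf.saved_model.builder` API (the path and the dummy variable are illustrative, not part of this change):

```python
import tensorflow as tf

export_dir = '/tmp/my_saved_model'  # hypothetical export directory

with tf.Session(graph=tf.Graph()) as sess:
  w = tf.Variable(1.0, name='w')  # something to save
  sess.run(tf.global_variables_initializer())
  builder = tf.saved_model.builder.SavedModelBuilder(export_dir)
  # Tag the graph so it can be located at load time,
  # e.g. SavedModelBundle.load(exportDir, "serve") on the Java side.
  builder.add_meta_graph_and_variables(sess, [tf.saved_model.tag_constants.SERVING])
  builder.save()
```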

View File

@ -172,8 +172,7 @@ public final class Tensor implements AutoCloseable {
*
* <p>Creates a Tensor with the provided shape of any type where the tensor's data has been
* encoded into {@code data} as per the specification of the TensorFlow <a
* href="https://www.tensorflow.org/code/tensorflow/c/c_api.h">C
* API</a>.
* href="https://www.tensorflow.org/code/tensorflow/c/c_api.h">C API</a>.
*
* @param dataType the tensor datatype.
* @param shape the tensor shape.

View File

@ -19,8 +19,8 @@ limitations under the License.
* <p><b>WARNING</b>: The API is currently experimental and is not covered by TensorFlow <a
* href="https://www.tensorflow.org/programmers_guide/version_semantics">API stability
* guarantees</a>. See <a
* href="https://www.tensorflow.org/code/tensorflow/java/README.md">README.md</a>
* for installation instructions.
* href="https://www.tensorflow.org/code/tensorflow/java/README.md">README.md</a> for installation
* instructions.
*
* <p>The <a
* href="https://www.tensorflow.org/code/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java">LabelImage</a>

View File

@ -711,14 +711,14 @@ class BaseSession(SessionInterface):
# v is the numpy array [10, 20]
# 'fetches' can be a list.
v = session.run([a, b])
# v a Python list with 2 numpy arrays: the numpy array [10, 20] and the
# v is a Python list with 2 numpy arrays: the 1-D array [10, 20] and the
# 1-D array [1.0, 2.0]
# 'fetches' can be arbitrary lists, tuples, namedtuple, dicts:
MyData = collections.namedtuple('MyData', ['a', 'b'])
v = session.run({'k1': MyData(a, b), 'k2': [b, a]})
# v is a dict with
# v['k1'] is a MyData namedtuple with 'a' the numpy array [10, 20] and
# 'b' the numpy array [1.0, 2.0]
# v['k1'] is a MyData namedtuple with 'a' (the numpy array [10, 20]) and
# 'b' (the numpy array [1.0, 2.0])
# v['k2'] is a list with the numpy array [1.0, 2.0] and the numpy array
# [10, 20].
```

View File

@ -15,6 +15,7 @@ exports_files(["LICENSE"])
load("//tensorflow:tensorflow.bzl", "cuda_py_test")
load("//tensorflow:tensorflow.bzl", "py_test")
load("//tensorflow:tensorflow.bzl", "if_not_windows")
py_library(
name = "debug_py",
@ -33,11 +34,12 @@ py_library(
py_library(
name = "debug_pip",
deps = [
":debug_examples",
":debug_py",
":offline_analyzer",
":session_debug_testlib",
],
] + if_not_windows([
":debug_examples",
]),
)
py_library(

View File

@ -84,9 +84,7 @@ class TensordotTest(test_lib.TestCase):
b_ph: b,
axes_ph: axes_value})
def test_no_partial_shape_inference(self):
# If one of the shapes is only partially defined, the output shape is
# unknown.
def test_partial_shape_inference(self):
a = array_ops.placeholder(dtypes.float32)
b = array_ops.placeholder(dtypes.float32)
axes = ([1], [0])
@ -95,13 +93,21 @@ class TensordotTest(test_lib.TestCase):
a.set_shape([None, 2])
b.set_shape([2, 3])
output = math_ops.tensordot(a, b, axes)
self.assertEqual(output.get_shape().ndims, None)
output_shape = output.get_shape()
self.assertEqual(output_shape.ndims, 2)
output_shape = output_shape.as_list()
self.assertEqual(output_shape[0], None)
self.assertEqual(output_shape[1], 3)
a = array_ops.placeholder(dtypes.float32)
b = array_ops.placeholder(dtypes.float32)
a.set_shape([2, 2])
b.set_shape([2, None])
output = math_ops.tensordot(a, b, axes)
self.assertEqual(output.get_shape().ndims, None)
output_shape = output.get_shape()
self.assertEqual(output_shape.ndims, 2)
output_shape = output_shape.as_list()
self.assertEqual(output_shape[0], 2)
self.assertEqual(output_shape[1], None)
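As a rough illustration of the behavior these tests now expect (a sketch, not part of the test file):

```python
import tensorflow as tf

a = tf.placeholder(tf.float32, shape=[None, 2])
b = tf.placeholder(tf.float32, shape=[2, 3])
c = tf.tensordot(a, b, axes=([1], [0]))
print(c.get_shape())  # expected: (?, 3) once partial shapes are propagated
```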
def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_):

Some files were not shown because too many files have changed in this diff.