diff --git a/configure b/configure index 1eeaffaf74c..c6df6992d9e 100755 --- a/configure +++ b/configure @@ -3,816 +3,12 @@ set -e set -o pipefail -MIN_BAZEL_VERSION=0.4.5 - -# Find out the absolute path to where ./configure resides -pushd `dirname $0` > /dev/null -SOURCE_BASE_DIR=`pwd -P` -popd > /dev/null - -PLATFORM="$(uname -s | tr 'A-Z' 'a-z')" - -function is_linux() { - [[ "${PLATFORM}" == "linux" ]] -} - -function is_macos() { - [[ "${PLATFORM}" == "darwin" ]] -} - -function is_windows() { - # On windows, the shell script is actually running in msys - [[ "${PLATFORM}" =~ msys_nt*|mingw*|cygwin*|uwin* ]] -} - -function is_ppc64le() { - [[ "$(uname -m)" == "ppc64le" ]] -} - -function sed_in_place() { - sed -e $1 $2 > "$2.bak" - mv "$2.bak" $2 -} - -function write_to_bazelrc() { - echo "$1" >> .tf_configure.bazelrc -} - -function write_action_env_to_bazelrc() { - write_to_bazelrc "build --action_env $1=\"$2\"" -} - -function python_path { - "$PYTHON_BIN_PATH" - <&2 - if [ -z "$fromuser" ]; then - exit 1 - fi - PYTHON_BIN_PATH="" - # Retry - done - - if [ -z "$PYTHON_LIB_PATH" ]; then - # Split python_path into an array of paths, this allows path containing spaces - IFS=',' read -r -a python_lib_path <<< "$(python_path)" - - if [ 1 = "$USE_DEFAULT_PYTHON_LIB_PATH" ]; then - PYTHON_LIB_PATH=${python_lib_path[0]} - echo "Using python library path: $PYTHON_LIB_PATH" - - else - echo "Found possible Python library paths:" - for x in "${python_lib_path[@]}"; do - echo " $x" - done - set -- "${python_lib_path[@]}" - echo "Please input the desired Python library path to use. Default is [$1]" - read b || true - if [ "$b" == "" ]; then - PYTHON_LIB_PATH=${python_lib_path[0]} - echo "Using python library path: $PYTHON_LIB_PATH" - else - PYTHON_LIB_PATH="$b" - fi - fi - fi - - if [ ! -x "$PYTHON_BIN_PATH" ] || [ -d "$PYTHON_BIN_PATH" ]; then - echo "PYTHON_BIN_PATH is not executable. Is it the python binary?" - exit 1 - fi - - local python_major_version - python_major_version=$("${PYTHON_BIN_PATH}" -c 'from __future__ import print_function; import sys; print(sys.version_info[0]);' | head -c1) - if [ -z "$python_major_version" ]; then - echo -e "\n\nERROR: Problem getting python version. Is $PYTHON_BIN_PATH the correct python binary?" 
- exit 1 - fi - - # Convert python path to Windows style before writing into bazel.rc - if is_windows; then - PYTHON_BIN_PATH="$(cygpath -m "$PYTHON_BIN_PATH")" - PYTHON_LIB_PATH="$(cygpath -m "$PYTHON_LIB_PATH")" - fi - - # Set-up env variables used by python_configure.bzl - write_action_env_to_bazelrc "PYTHON_BIN_PATH" "$PYTHON_BIN_PATH" - write_action_env_to_bazelrc "PYTHON_LIB_PATH" "$PYTHON_LIB_PATH" - write_to_bazelrc "build --define PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\"" - write_to_bazelrc "build --define PYTHON_LIB_PATH=\"$PYTHON_LIB_PATH\"" - write_to_bazelrc "build --force_python=py$python_major_version" - write_to_bazelrc "build --host_force_python=py$python_major_version" - write_to_bazelrc "build --python${python_major_version}_path=\"$PYTHON_BIN_PATH\"" - write_to_bazelrc "test --force_python=py$python_major_version" - write_to_bazelrc "test --host_force_python=py$python_major_version" - write_to_bazelrc "test --define PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\"" - write_to_bazelrc "test --define PYTHON_LIB_PATH=\"$PYTHON_LIB_PATH\"" - write_to_bazelrc "run --define PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\"" - write_to_bazelrc "run --define PYTHON_LIB_PATH=\"$PYTHON_LIB_PATH\"" - - # Write tools/python_bin_path.sh - echo "export PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\"" > tools/python_bin_path.sh -} - -function version { - echo "$@" | awk -F. '{ printf("%03d%03d%03d\n", $1,$2,$3); }'; -} - - -bazel version > bazel.version -set +e -curr_bazel_version=$(grep -m 1 'Build label:' bazel.version | cut -d ' ' -f3) -set -e -rm -f bazel.version - - -echo "You have bazel $curr_bazel_version installed." -if [ -z "$curr_bazel_version" ]; then - echo "WARNING: current bazel installation is not a release version." - echo "Make sure you are running at least bazel $MIN_BAZEL_VERSION." -elif [ "$(version "$MIN_BAZEL_VERSION")" -gt "$(version "$curr_bazel_version")" ]; then - echo "Please upgrade your bazel installation to version $MIN_BAZEL_VERSION or higher to build TensorFlow!" - echo "Exiting..." - exit 1 +if [ -z "$PYTHON_BIN_PATH" ]; then + PYTHON_BIN_PATH=$(which python || which python3 || true) fi -# This file contains customized config settings. -rm -f .tf_configure.bazelrc -touch .tf_configure.bazelrc -if [[ ! -e .bazelrc ]]; then - if [[ -e "${HOME}/.bazelrc" ]]; then - echo "import ${HOME}/.bazelrc" >.bazelrc - else - touch .bazelrc - fi -fi -sed_in_place "/tf_configure/d" .bazelrc -echo "import %workspace%/.tf_configure.bazelrc" >> .bazelrc +# Set all env variables +$PYTHON_BIN_PATH configure.py -# Delete any leftover BUILD files from the Makefile build, which would interfere -# with Bazel parsing. -MAKEFILE_DOWNLOAD_DIR=tensorflow/contrib/makefile/downloads -if [ -d "${MAKEFILE_DOWNLOAD_DIR}" ]; then - find ${MAKEFILE_DOWNLOAD_DIR} -type f -name '*BUILD' -delete -fi -setup_python - -## Set up MKL related environment settings -write_to_bazelrc 'build:mkl --define with_mkl_support=true' -write_to_bazelrc 'build:mkl --define using_mkl=true' -write_to_bazelrc 'build:mkl -c opt' -write_to_bazelrc 'build:mkl --copt="-DEIGEN_USE_VML"' -echo "" -echo "Add \"--config=mkl\" to your bazel command to build with MKL support." -echo "Please note that MKL on MacOS or windows is still not supported." -echo "If you would like to use a local MKL instead of downloading, please " -echo " set the environment variable \"TF_MKL_ROOT\" every time before build." -echo "" -## End MKL setup - -## Set up architecture-dependent optimization flags. 
-if [ -z "$CC_OPT_FLAGS" ]; then - if is_ppc64le; then - # gcc on ppc64le does not support -march, use mcpu instead - default_cc_opt_flags="-mcpu=native" - else - default_cc_opt_flags="-march=native" - fi - read -p "Please specify optimization flags to use during compilation when bazel option "\ -"\"--config=opt\" is specified [Default is $default_cc_opt_flags]: " CC_OPT_FLAGS - if [ -z "$CC_OPT_FLAGS" ]; then - CC_OPT_FLAGS=$default_cc_opt_flags - fi -fi - -if is_windows; then - TF_NEED_GCP=0 - TF_NEED_HDFS=0 - TF_NEED_JEMALLOC=0 - TF_NEED_OPENCL=0 - TF_CUDA_CLANG=0 -fi - -if is_linux; then - while [ "$TF_NEED_JEMALLOC" == "" ]; do - read -p "Do you wish to use jemalloc as the malloc implementation? [Y/n] "\ - INPUT - case $INPUT in - [Yy]* ) echo "jemalloc enabled"; TF_NEED_JEMALLOC=1;; - [Nn]* ) echo "jemalloc disabled"; TF_NEED_JEMALLOC=0;; - "" ) echo "jemalloc enabled"; TF_NEED_JEMALLOC=1;; - * ) echo "Invalid selection: " $INPUT;; - esac - done -else - TF_NEED_JEMALLOC=0 -fi - -if [[ "$TF_NEED_JEMALLOC" == "1" ]]; then - write_to_bazelrc 'build --define with_jemalloc=true' -fi - -while [[ "$TF_NEED_GCP" == "" ]]; do - read -p "Do you wish to build TensorFlow with "\ -"Google Cloud Platform support? [y/N] " INPUT - case $INPUT in - [Yy]* ) echo "Google Cloud Platform support will be enabled for "\ -"TensorFlow"; TF_NEED_GCP=1;; - [Nn]* ) echo "No Google Cloud Platform support will be enabled for "\ -"TensorFlow"; TF_NEED_GCP=0;; - "" ) echo "No Google Cloud Platform support will be enabled for "\ -"TensorFlow"; TF_NEED_GCP=0;; - * ) echo "Invalid selection: " $INPUT;; - esac -done - -if [[ "$TF_NEED_GCP" == "1" ]]; then - write_to_bazelrc 'build --define with_gcp_support=true' -fi - -while [[ "$TF_NEED_HDFS" == "" ]]; do - read -p "Do you wish to build TensorFlow with "\ -"Hadoop File System support? [y/N] " INPUT - case $INPUT in - [Yy]* ) echo "Hadoop File System support will be enabled for "\ -"TensorFlow"; TF_NEED_HDFS=1;; - [Nn]* ) echo "No Hadoop File System support will be enabled for "\ -"TensorFlow"; TF_NEED_HDFS=0;; - "" ) echo "No Hadoop File System support will be enabled for "\ -"TensorFlow"; TF_NEED_HDFS=0;; - * ) echo "Invalid selection: " $INPUT;; - esac -done - -if [[ "$TF_NEED_HDFS" == "1" ]]; then - write_to_bazelrc 'build --define with_hdfs_support=true' -fi - -## Enable XLA. -while [[ "$TF_ENABLE_XLA" == "" ]]; do - read -p "Do you wish to build TensorFlow with the XLA just-in-time compiler (experimental)? [y/N] " INPUT - case $INPUT in - [Yy]* ) echo "XLA JIT support will be enabled for TensorFlow"; TF_ENABLE_XLA=1;; - [Nn]* ) echo "No XLA JIT support will be enabled for TensorFlow"; TF_ENABLE_XLA=0;; - "" ) echo "No XLA support will be enabled for TensorFlow"; TF_ENABLE_XLA=0;; - * ) echo "Invalid selection: " $INPUT;; - esac -done - -if [[ "$TF_ENABLE_XLA" == "1" ]]; then - write_to_bazelrc 'build --define with_xla_support=true' -fi - -# Verbs configuration -while [ "$TF_NEED_VERBS" == "" ]; do - read -p "Do you wish to build TensorFlow with "\ -"VERBS support? 
[y/N] " INPUT - case $INPUT in - [Yy]* ) echo "VERBS support will be enabled for "\ -"TensorFlow"; TF_NEED_VERBS=1;; - [Nn]* ) echo "No VERBS support will be enabled for "\ -"TensorFlow"; TF_NEED_VERBS=0;; - "" ) echo "No VERBS support will be enabled for "\ -"TensorFlow"; TF_NEED_VERBS=0;; - * ) echo "Invalid selection: " $INPUT;; - esac -done - -if [[ "$TF_NEED_VERBS" == "1" ]]; then - write_to_bazelrc 'build --define with_verbs_support=true' -fi - -# Append CC optimization flags to bazel.rc -for opt in $CC_OPT_FLAGS; do - write_to_bazelrc "build:opt --cxxopt=$opt --copt=$opt" -done - -# Run the gen_git_source to create links where bazel can track dependencies for -# git hash propagation -GEN_GIT_SOURCE=tensorflow/tools/git/gen_git_source.py -chmod a+x ${GEN_GIT_SOURCE} -"${PYTHON_BIN_PATH}" ${GEN_GIT_SOURCE} --configure "${SOURCE_BASE_DIR}" - -## Set up SYCL-related environment settings -while [ "$TF_NEED_OPENCL" == "" ]; do - read -p "Do you wish to build TensorFlow with OpenCL support? [y/N] " INPUT - case $INPUT in - [Yy]* ) echo "OpenCL support will be enabled for TensorFlow"; TF_NEED_OPENCL=1;; - [Nn]* ) echo "No OpenCL support will be enabled for TensorFlow"; TF_NEED_OPENCL=0;; - "" ) echo "No OpenCL support will be enabled for TensorFlow"; TF_NEED_OPENCL=0;; - * ) echo "Invalid selection: " $INPUT;; - esac -done - -## Set up Cuda-related environment settings - -while [ "$TF_NEED_CUDA" == "" ]; do - read -p "Do you wish to build TensorFlow with CUDA support? [y/N] " INPUT - case $INPUT in - [Yy]* ) echo "CUDA support will be enabled for TensorFlow"; TF_NEED_CUDA=1;; - [Nn]* ) echo "No CUDA support will be enabled for TensorFlow"; TF_NEED_CUDA=0;; - "" ) echo "No CUDA support will be enabled for TensorFlow"; TF_NEED_CUDA=0;; - * ) echo "Invalid selection: " $INPUT;; - esac -done - -export TF_NEED_CUDA -write_action_env_to_bazelrc "TF_NEED_CUDA" "$TF_NEED_CUDA" - -export TF_NEED_OPENCL -write_action_env_to_bazelrc "TF_NEED_OPENCL" "$TF_NEED_OPENCL" - -if [ "$TF_NEED_CUDA" == "1" ]; then -while [[ "$TF_CUDA_CLANG" == "" ]]; do - read -p "Do you want to use clang as CUDA compiler? [y/N] " INPUT - case $INPUT in - [Yy]* ) echo "Clang will be used as CUDA compiler"; TF_CUDA_CLANG=1;; - [Nn]* ) echo "nvcc will be used as CUDA compiler"; TF_CUDA_CLANG=0;; - "" ) echo "nvcc will be used as CUDA compiler"; TF_CUDA_CLANG=0;; - * ) echo "Invalid selection: " $INPUT;; - esac -done - -export TF_CUDA_CLANG -write_action_env_to_bazelrc "TF_CUDA_CLANG" "$TF_CUDA_CLANG" - -# Set up which clang we should use as the cuda / host compiler. -while [[ "$TF_CUDA_CLANG" == "1" ]] && true; do - fromuser="" - if [ -z "$CLANG_CUDA_COMPILER_PATH" ]; then - default_clang_host_compiler_path=$(which clang || true) - read -p "Please specify which clang should be used as device and host compiler. [Default is $default_clang_host_compiler_path]: " CLANG_CUDA_COMPILER_PATH - fromuser="1" - if [ -z "$CLANG_CUDA_COMPILER_PATH" ]; then - CLANG_CUDA_COMPILER_PATH="$default_clang_host_compiler_path" - fi - fi - if [ -e "$CLANG_CUDA_COMPILER_PATH" ]; then - export CLANG_CUDA_COMPILER_PATH - write_action_env_to_bazelrc "CLANG_CUDA_COMPILER_PATH" "$CLANG_CUDA_COMPILER_PATH" - break - fi - echo "Invalid clang path. ${CLANG_CUDA_COMPILER_PATH} cannot be found" 1>&2 - if [ -z "$fromuser" ]; then - exit 1 - fi - CLANG_CUDA_COMPILER_PATH="" - # Retry -done - -# Find out where the CUDA toolkit is installed -while true; do - # Configure the Cuda SDK version to use. 
- if [ -z "$TF_CUDA_VERSION" ]; then - read -p "Please specify the CUDA SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 8.0]: " TF_CUDA_VERSION - fi - # Set default CUDA version if not set - TF_CUDA_VERSION=${TF_CUDA_VERSION:-8.0} - - fromuser="" - if [ -z "$CUDA_TOOLKIT_PATH" ]; then - default_cuda_path=/usr/local/cuda - if is_windows; then - if [ -z "$CUDA_PATH" ]; then - default_cuda_path="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0" - else - default_cuda_path="$(cygpath -m "$CUDA_PATH")" - fi - elif is_linux; then - # If the default doesn't exist, try an alternative default. - if [ ! -d $default_cuda_path ] && [ -d /opt/cuda ]; then - default_cuda_path=/opt/cuda - fi - fi - read -p "Please specify the location where CUDA $TF_CUDA_VERSION toolkit is installed. Refer to README.md for more details. [Default is $default_cuda_path]: " CUDA_TOOLKIT_PATH - fromuser="1" - if [ -z "$CUDA_TOOLKIT_PATH" ]; then - CUDA_TOOLKIT_PATH="$default_cuda_path" - fi - fi - - if [[ -z "$TF_CUDA_VERSION" ]]; then - TF_CUDA_EXT="" - else - TF_CUDA_EXT=".$TF_CUDA_VERSION" - fi - - if is_windows; then - CUDA_RT_LIB_PATH="lib/x64/cudart.lib" - elif is_linux; then - CUDA_RT_LIB_PATH="lib64/libcudart.so${TF_CUDA_EXT}" - elif is_macos; then - CUDA_RT_LIB_PATH="lib/libcudart${TF_CUDA_EXT}.dylib" - fi - - if [ -e "${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH}" ]; then - export CUDA_TOOLKIT_PATH - write_action_env_to_bazelrc "CUDA_TOOLKIT_PATH" "$CUDA_TOOLKIT_PATH" - export TF_CUDA_VERSION - break - fi - echo "Invalid path to CUDA $TF_CUDA_VERSION toolkit. ${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH} cannot be found" - - if [ -z "$fromuser" ]; then - exit 1 - fi - # Retry - TF_CUDA_VERSION="" - CUDA_TOOLKIT_PATH="" -done - -export TF_CUDA_VERSION -write_action_env_to_bazelrc "TF_CUDA_VERSION" "$TF_CUDA_VERSION" - -# Set up which gcc nvcc should use as the host compiler -# No need to set this on Windows -while [[ "$TF_CUDA_CLANG" != "1" ]] && ! is_windows && true; do - fromuser="" - if [ -z "$GCC_HOST_COMPILER_PATH" ]; then - default_gcc_host_compiler_path=$(which gcc || true) - cuda_bin_symlink="$CUDA_TOOLKIT_PATH/bin/gcc" - if [ -L "$cuda_bin_symlink" ]; then - default_gcc_host_compiler_path=$(readlink $cuda_bin_symlink) - fi - read -p "Please specify which gcc should be used by nvcc as the host compiler. [Default is $default_gcc_host_compiler_path]: " GCC_HOST_COMPILER_PATH - fromuser="1" - if [ -z "$GCC_HOST_COMPILER_PATH" ]; then - GCC_HOST_COMPILER_PATH="$default_gcc_host_compiler_path" - fi - fi - if [ -e "$GCC_HOST_COMPILER_PATH" ]; then - export GCC_HOST_COMPILER_PATH - write_action_env_to_bazelrc "GCC_HOST_COMPILER_PATH" "$GCC_HOST_COMPILER_PATH" - break - fi - echo "Invalid gcc path. ${GCC_HOST_COMPILER_PATH} cannot be found" 1>&2 - if [ -z "$fromuser" ]; then - exit 1 - fi - GCC_HOST_COMPILER_PATH="" - # Retry -done - -# Find out where the cuDNN library is installed -while true; do - # Configure the cuDNN version to use. - if [ -z "$TF_CUDNN_VERSION" ]; then - read -p "Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 6.0]: " TF_CUDNN_VERSION - fi - # Set default CUDNN version if not set - TF_CUDNN_VERSION=${TF_CUDNN_VERSION:-6} - - fromuser="" - if [ -z "$CUDNN_INSTALL_PATH" ]; then - default_cudnn_path=${CUDA_TOOLKIT_PATH} - read -p "Please specify the location where cuDNN $TF_CUDNN_VERSION library is installed. Refer to README.md for more details. 
[Default is $default_cudnn_path]: " CUDNN_INSTALL_PATH - fromuser="1" - if [ -z "$CUDNN_INSTALL_PATH" ]; then - CUDNN_INSTALL_PATH=$default_cudnn_path - fi - # Result returned from "read" will be used unexpanded. That make "~" unusable. - # Going through one more level of expansion to handle that. - CUDNN_INSTALL_PATH=`"${PYTHON_BIN_PATH}" -c "import os; print(os.path.realpath(os.path.expanduser('${CUDNN_INSTALL_PATH}')))"` - if is_windows; then - CUDNN_INSTALL_PATH="$(cygpath -m "$CUDNN_INSTALL_PATH")" - fi - fi - - if [[ -z "$TF_CUDNN_VERSION" ]]; then - TF_CUDNN_EXT="" - else - TF_CUDNN_EXT=".$TF_CUDNN_VERSION" - fi - - if is_windows; then - CUDA_DNN_LIB_PATH="lib/x64/cudnn.lib" - CUDA_DNN_LIB_ALT_PATH="lib/x64/cudnn.lib" - elif is_linux; then - CUDA_DNN_LIB_PATH="lib64/libcudnn.so${TF_CUDNN_EXT}" - CUDA_DNN_LIB_ALT_PATH="libcudnn.so${TF_CUDNN_EXT}" - elif is_macos; then - CUDA_DNN_LIB_PATH="lib/libcudnn${TF_CUDNN_EXT}.dylib" - CUDA_DNN_LIB_ALT_PATH="libcudnn${TF_CUDNN_EXT}.dylib" - fi - - if [ -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_ALT_PATH}" ] || [ -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_PATH}" ]; then - export TF_CUDNN_VERSION - write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION" - export CUDNN_INSTALL_PATH - write_action_env_to_bazelrc "CUDNN_INSTALL_PATH" "$CUDNN_INSTALL_PATH" - break - fi - - if is_linux; then - if ! type ldconfig > /dev/null 2>&1; then - LDCONFIG_BIN=/sbin/ldconfig - else - LDCONFIG_BIN=ldconfig - fi - CUDNN_PATH_FROM_LDCONFIG="$($LDCONFIG_BIN -p | sed -n 's/.*libcudnn.so .* => \(.*\)/\1/p')" - if [ -e "${CUDNN_PATH_FROM_LDCONFIG}${TF_CUDNN_EXT}" ]; then - export TF_CUDNN_VERSION - export CUDNN_INSTALL_PATH - CUDNN_INSTALL_PATH="$(dirname ${CUDNN_PATH_FROM_LDCONFIG})" - write_action_env_to_bazelrc "CUDNN_INSTALL_PATH" "$CUDNN_INSTALL_PATH" - break - fi - fi - echo "Invalid path to cuDNN ${CUDNN_VERSION} toolkit. Neither of the following two files can be found:" - echo "${CUDNN_INSTALL_PATH}/${CUDA_DNN_LIB_PATH}" - echo "${CUDNN_INSTALL_PATH}/${CUDA_DNN_LIB_ALT_PATH}" - if is_linux; then - echo "${CUDNN_PATH_FROM_LDCONFIG}${TF_CUDNN_EXT}" - fi - - if [ -z "$fromuser" ]; then - exit 1 - fi - # Retry - TF_CUDNN_VERSION="" - CUDNN_INSTALL_PATH="" -done - -export TF_CUDNN_VERSION -write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION" - -# Configure the compute capabilities that TensorFlow builds for. -# Since Cuda toolkit is not backward-compatible, this is not guaranteed to work. -function get_native_cuda_compute_capabilities { - device_query_bin="$CUDA_TOOLKIT_PATH/extras/demo_suite/deviceQuery" # Also works on Windows without .exe - "$device_query_bin" | grep 'Capability' | grep -o '[0-9]*\.[0-9]*' | sed ':a;{N;s/\n/,/};ba' - exit 0 # ensure that this function always exit success even if device detection fails, to prevent the whole configure from aborting -} -while true; do - fromuser="" - native_cuda_compute_capabilities=$(get_native_cuda_compute_capabilities) - default_cuda_compute_capabilities=${native_cuda_compute_capabilities:-"3.5,5.2"} - if [ -z "$TF_CUDA_COMPUTE_CAPABILITIES" ]; then -cat << EOF -Please specify a list of comma-separated Cuda compute capabilities you want to build with. -You can find the compute capability of your device at: https://developer.nvidia.com/cuda-gpus. -Please note that each additional compute capability significantly increases your build time and binary size. 
-EOF - read -p "[Default is: \"$default_cuda_compute_capabilities\"]: " TF_CUDA_COMPUTE_CAPABILITIES - fromuser=1 - fi - if [ -z "$TF_CUDA_COMPUTE_CAPABILITIES" ]; then - TF_CUDA_COMPUTE_CAPABILITIES=$default_cuda_compute_capabilities - fi - # Check whether all capabilities from the input is valid - COMPUTE_CAPABILITIES=${TF_CUDA_COMPUTE_CAPABILITIES//,/ } - ALL_VALID=1 - for CAPABILITY in $COMPUTE_CAPABILITIES; do - if [[ ! "$CAPABILITY" =~ [0-9]+.[0-9]+ ]]; then - echo "Invalid compute capability: " $CAPABILITY - ALL_VALID=0 - break - fi - done - if [ "$ALL_VALID" == "0" ]; then - if [ -z "$fromuser" ]; then - exit 1 - fi - else - export TF_CUDA_COMPUTE_CAPABILITIES - write_action_env_to_bazelrc "TF_CUDA_COMPUTE_CAPABILITIES" "$TF_CUDA_COMPUTE_CAPABILITIES" - break - fi - TF_CUDA_COMPUTE_CAPABILITIES="" -done - -if is_windows; then - # The following three variables are needed for MSVC toolchain configuration in Bazel - export CUDA_PATH="$CUDA_TOOLKIT_PATH" - export CUDA_COMPUTE_CAPABILITIES="$TF_CUDA_COMPUTE_CAPABILITIES" - export NO_WHOLE_ARCHIVE_OPTION=1 - write_action_env_to_bazelrc "CUDA_PATH" "$CUDA_PATH" - write_action_env_to_bazelrc "CUDA_COMPUTE_CAPABILITIES" "$CUDA_COMPUTE_CAPABILITIES" - write_action_env_to_bazelrc "NO_WHOLE_ARCHIVE_OPTION" "1" - write_to_bazelrc "build --config=win-cuda" - write_to_bazelrc "test --config=win-cuda" -else - # If CUDA is enabled, always use GPU during build and test. - if [ "$TF_CUDA_CLANG" == "1" ]; then - write_to_bazelrc "build --config=cuda_clang" - write_to_bazelrc "test --config=cuda_clang" - else - write_to_bazelrc "build --config=cuda" - write_to_bazelrc "test --config=cuda" - fi -fi - -# end of if "$TF_NEED_CUDA" == "1" -fi - -# OpenCL configuration - -if [ "$TF_NEED_OPENCL" == "1" ]; then - -# Determine which C++ compiler should be used as the host compiler -while true; do - fromuser="" - if [ -z "$HOST_CXX_COMPILER" ]; then - default_cxx_host_compiler=$(which g++ || true) - read -p "Please specify which C++ compiler should be used as the host C++ compiler. [Default is $default_cxx_host_compiler]: " HOST_CXX_COMPILER - fromuser="1" - if [ -z "$HOST_CXX_COMPILER" ]; then - HOST_CXX_COMPILER=$default_cxx_host_compiler - fi - fi - if [ -e "$HOST_CXX_COMPILER" ]; then - export HOST_CXX_COMPILER - write_action_env_to_bazelrc "HOST_CXX_COMPILER" "$HOST_CXX_COMPILER" - break - fi - echo "Invalid C++ compiler path. ${HOST_CXX_COMPILER} cannot be found" 1>&2 - if [ -z "$fromuser" ]; then - exit 1 - fi - HOST_CXX_COMPILER="" - # Retry -done - -# Determine which C compiler should be used as the host compiler -while true; do - fromuser="" - if [ -z "$HOST_C_COMPILER" ]; then - default_c_host_compiler=$(which gcc || true) - read -p "Please specify which C compiler should be used as the host C compiler. [Default is $default_c_host_compiler]: " HOST_C_COMPILER - fromuser="1" - if [ -z "$HOST_C_COMPILER" ]; then - HOST_C_COMPILER=$default_c_host_compiler - fi - fi - if [ -e "$HOST_C_COMPILER" ]; then - export HOST_C_COMPILER - write_action_env_to_bazelrc "HOST_C_COMPILER" "$HOST_C_COMPILER" - break - fi - echo "Invalid C compiler path. ${HOST_C_COMPILER} cannot be found" 1>&2 - if [ -z "$fromuser" ]; then - exit 1 - fi - HOST_C_COMPILER="" - # Retry -done - -while true; do - # Configure the OPENCL version to use. 
- TF_OPENCL_VERSION="1.2" - - # Point to ComputeCpp root - if [ -z "$COMPUTECPP_TOOLKIT_PATH" ]; then - default_computecpp_toolkit_path=/usr/local/computecpp - read -p "Please specify the location where ComputeCpp for SYCL $TF_OPENCL_VERSION is installed. [Default is $default_computecpp_toolkit_path]: " COMPUTECPP_TOOLKIT_PATH - fromuser="1" - if [ -z "$COMPUTECPP_TOOLKIT_PATH" ]; then - COMPUTECPP_TOOLKIT_PATH=$default_computecpp_toolkit_path - fi - fi - - if is_linux; then - SYCL_RT_LIB_PATH="lib/libComputeCpp.so" - fi - - if [ -e "${COMPUTECPP_TOOLKIT_PATH}/${SYCL_RT_LIB_PATH}" ]; then - export COMPUTECPP_TOOLKIT_PATH - write_action_env_to_bazelrc "COMPUTECPP_TOOLKIT_PATH" "$COMPUTECPP_TOOLKIT_PATH" - break - fi - echo "Invalid SYCL $TF_OPENCL_VERSION library path. ${COMPUTECPP_TOOLKIT_PATH}/${SYCL_RT_LIB_PATH} cannot be found" - - if [ -z "$fromuser" ]; then - exit 1 - fi - # Retry - TF_OPENCL_VERSION="" - COMPUTECPP_TOOLKIT_PATH="" -done - -# end of if "$TF_NEED_OPENCL" == "1" -fi - - -while [ "$TF_NEED_MPI" == "" ]; do - read -p "Do you wish to build TensorFlow with "\ -"MPI support? [y/N] " INPUT - case $INPUT in - [Yy]* ) echo "MPI support will be enabled for "\ -"TensorFlow"; TF_NEED_MPI=1;; - [Nn]* ) echo "MPI support will not be enabled for "\ -"TensorFlow"; TF_NEED_MPI=0;; - "" ) echo "MPI support will not be enabled for "\ -"TensorFlow"; TF_NEED_MPI=0;; - * ) echo "Invalid selection: " $INPUT;; - esac -done - -# Find out where the MPI toolkit is installed -while true; do - if [ "$TF_NEED_MPI" == "0" ]; then - break; - fi - - fromuser="" - if [ -z "$MPI_HOME" ]; then - #Get the base folder by removing the bin path - default_mpi_path=$(dirname $(dirname $(which mpirun)) || dirname $(dirname $(which mpiexec)) || true) - read -p "Please specify the MPI toolkit folder. [Default is $default_mpi_path]: " MPI_HOME - fromuser="1" - if [ -z "$MPI_HOME" ]; then - MPI_HOME=$default_mpi_path - fi - fi - - #Check that the include and library folders are where we expect them to be - if [ -e "$MPI_HOME/include" ] && [ -e "$MPI_HOME/lib" ]; then - break - fi - - echo "Invalid path to the MPI Toolkit. ${MPI_HOME}/include or ${MPI_HOME}/lib cannot be found." - if [ -z "$fromuser" ]; then - exit 1 - fi - - # Retry - MPI_HOME="" -done - - -if [ "$TF_NEED_MPI" == "1" ]; then - write_to_bazelrc 'build --define with_mpi_support=true' - - #Link the MPI header files - ln -sf "${MPI_HOME}/include/mpi.h" third_party/mpi/mpi.h - - - #Determine if we use OpenMPI or MVAPICH, these require different header files - #to be included here to make bazel dependency checker happy - - if [ -e "${MPI_HOME}/include/mpi_portable_platform.h" ]; then - #OpenMPI - ln -sf "${MPI_HOME}/include/mpi_portable_platform.h" third_party/mpi/ - sed -i -e "s/MPI_LIB_IS_OPENMPI=False/MPI_LIB_IS_OPENMPI=True/" third_party/mpi/mpi.bzl - else - #MVAPICH / MPICH - ln -sf "${MPI_HOME}/include/mpio.h" third_party/mpi/ - ln -sf "${MPI_HOME}/include/mpicxx.h" third_party/mpi/ - sed -i -e "s/MPI_LIB_IS_OPENMPI=True/MPI_LIB_IS_OPENMPI=False/" third_party/mpi/mpi.bzl - fi - - - if [ -e "${MPI_HOME}/lib/libmpi.so" ]; then - ln -sf "${MPI_HOME}/lib/libmpi.so" third_party/mpi/ - else - echo "Cannot find the MPI library file in ${MPI_HOME}/lib " - exit 1 - fi -fi - - -echo "Configuration finished" +echo "Configuration finished" \ No newline at end of file diff --git a/configure.py b/configure.py new file mode 100644 index 00000000000..fac00d1b74b --- /dev/null +++ b/configure.py @@ -0,0 +1,950 @@ +# Copyright 2017 The TensorFlow Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""configure script to get build parameters from user.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import errno +import os +import platform +import re +import site +import subprocess +import sys + +_TF_BAZELRC = '.tf_configure.bazelrc' +_DEFAULT_CUDA_VERSION = '8.0' +_DEFAULT_CUDNN_VERSION = '6' +_DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,5.2' +_DEFAULT_CUDA_PATH = '/usr/local/cuda' +_DEFAULT_CUDA_PATH_LINUX = '/opt/cuda' +_DEFAULT_CUDA_PATH_WIN = ('C:/Program Files/NVIDIA GPU Computing ' + 'Toolkit/CUDA/v%s' % _DEFAULT_CUDA_VERSION) +_TF_OPENCL_VERSION = '1.2' +_DEFAULT_COMPUTECPP_TOOLKIT_PATH = '/usr/local/computecpp' + + +def is_windows(): + return platform.system() == 'Windows' + + +def is_linux(): + return platform.system() == 'Linux' + + +def is_macos(): + return platform.system() == 'Darwin' + + +def is_ppc64le(): + return platform.machine() == 'ppc64le' + + +def get_input(question): + try: + try: + answer = raw_input(question) + except NameError: + answer = input(question) # pylint: disable=bad-builtin + except EOFError: + answer = '' + return answer + + +def symlink_force(target, link_name): + """Force symlink, equivalent of 'ln -sf'. + + Args: + target: items to link to. + link_name: name of the link. + """ + try: + os.symlink(target, link_name) + except OSError as e: + if e.errno == errno.EEXIST: + os.remove(link_name) + os.symlink(target, link_name) + else: + raise e + + +def sed_in_place(filename, old, new): + """Replace old string with new string in file. + + Args: + filename: string for filename. + old: string to replace. + new: new string to replace to. + """ + with open(filename, 'r') as f: + filedata = f.read() + newdata = filedata.replace(old, new) + with open(filename, 'w') as f: + f.write(newdata) + + +def remove_line_with(filename, token): + """Remove lines that contain token from file. + + Args: + filename: string for filename. + token: string token to check if to remove a line from file or not. 
+ """ + with open(filename, 'r') as f: + filedata = f.read() + + with open(filename, 'w') as f: + for line in filedata.strip().split('\n'): + if token not in line: + f.write(line + '\n') + + +def write_to_bazelrc(line): + with open(_TF_BAZELRC, 'a') as f: + f.write(line + '\n') + + +def write_action_env_to_bazelrc(var_name, var): + write_to_bazelrc('build --action_env %s="%s"' % (var_name, str(var))) + + +def run_shell(cmd): + return subprocess.check_output(cmd, shell=True).decode('UTF-8').strip() + + +def cygpath(path): + """Convert path from posix to windows.""" + return run_shell('cygpath -m "%s"' % path) + + +def get_python_path(environ_cp): + """Get the python site package paths.""" + python_paths = [] + if environ_cp.get('PYTHONPATH'): + python_paths = environ_cp.get('PYTHONPATH').split(':') + try: + library_paths = site.getsitepackages() + except AttributeError: + from distutils.sysconfig import get_python_lib # pylint: disable=g-import-not-at-top + library_paths = [get_python_lib()] + all_paths = set(python_paths + library_paths) + + paths = [] + for path in all_paths: + if os.path.isdir(path): + paths.append(path) + return paths + + +def setup_python(environ_cp): + """Setup python related env variables.""" + # Get PYTHON_BIN_PATH, default is the current running python. + default_python_bin_path = sys.executable + ask_python_bin_path = ('Please specify the location of python. [Default is ' + '%s]: ') % default_python_bin_path + while True: + python_bin_path = get_from_env_or_user_or_default( + environ_cp, 'PYTHON_BIN_PATH', ask_python_bin_path, + default_python_bin_path) + # Check if the path is valid + if (os.path.isfile(python_bin_path) and os.access( + python_bin_path, os.X_OK)) or (os.path.isdir(python_bin_path)): + break + elif not os.path.exists(python_bin_path): + print('Invalid python path: %s cannot be found.' % python_bin_path) + else: + print('%s is not executable. Is it the python binary?' % python_bin_path) + environ_cp['PYTHON_BIN_PATH'] = '' + + # Get PYTHON_LIB_PATH + python_lib_path = environ_cp.get('PYTHON_LIB_PATH') + if not python_lib_path: + python_lib_paths = get_python_path(environ_cp) + if environ_cp.get('USE_DEFAULT_PYTHON_LIB_PATH') == '1': + environ_cp['PYTHON_LIB_PATH'] = python_lib_paths[0] + else: + print('Found possible Python library paths:\n%s' % + '\n'.join(python_lib_paths)) + default_python_lib_path = python_lib_paths[0] + python_lib_path = get_input( + 'Please input the desired Python library path to use. 
Default is %s' + % python_lib_paths[0]) + if not python_lib_path: + python_lib_path = default_python_lib_path + environ_cp['PYTHON_LIB_PATH'] = python_lib_path + + python_major_version = sys.version_info[0] + # Convert python path to Windows style before writing into bazel.rc + if is_windows(): + python_bin_path = cygpath(python_bin_path) + python_lib_path = cygpath(python_lib_path) + + # Set-up env variables used by python_configure.bzl + write_action_env_to_bazelrc('PYTHON_BIN_PATH', python_bin_path) + write_action_env_to_bazelrc('PYTHON_LIB_PATH', python_lib_path) + write_to_bazelrc('build --define PYTHON_BIN_PATH="%s"' % python_bin_path) + write_to_bazelrc('build --define PYTHON_LIB_PATH="%s"' % python_lib_path) + write_to_bazelrc('build --force_python=py%s' % python_major_version) + write_to_bazelrc('build --host_force_python=py%s' % python_major_version) + write_to_bazelrc('build --python%s_path=\"%s"' % (python_major_version, + python_bin_path)) + write_to_bazelrc('test --force_python=py%s' % python_major_version) + write_to_bazelrc('test --host_force_python=py%s' % python_major_version) + write_to_bazelrc('test --define PYTHON_BIN_PATH="%s"' % python_bin_path) + write_to_bazelrc('test --define PYTHON_LIB_PATH="%s"' % python_lib_path) + write_to_bazelrc('run --define PYTHON_BIN_PATH="%s"' % python_bin_path) + write_to_bazelrc('run --define PYTHON_LIB_PATH="%s"' % python_lib_path) + environ_cp['PYTHON_BIN_PATH'] = python_bin_path + + # Write tools/python_bin_path.sh + with open('tools/python_bin_path.sh', 'w') as f: + f.write('export PYTHON_BIN_PATH="%s"' % python_bin_path) + + +def reset_tf_configure_bazelrc(): + """Reset file that contains customized config settings.""" + open(_TF_BAZELRC, 'w').close() + + home = os.path.expanduser('~') + if not os.path.exists('.bazelrc'): + if os.path.exists(os.path.join(home, '.bazelrc')): + with open('.bazelrc', 'a') as f: + f.write('import %s/.bazelrc\n' % home) + else: + open('.bazelrc', 'w').close() + + remove_line_with('.bazelrc', 'tf_configure') + with open('.bazelrc', 'a') as f: + f.write('import %workspace%/.tf_configure.bazelrc\n') + + +def run_gen_git_source(environ_cp): + """Run the gen_git_source to create links. + + The links are for bazel to track dependencies for git hash propagation. + + Args: + environ_cp: copy of the os.environ. + """ + cmd = '%s tensorflow/tools/git/gen_git_source.py --configure %s' % ( + environ_cp.get('PYTHON_BIN_PATH'), os.getcwd()) + os.system(cmd) + + +def cleanup_makefile(): + """Delete any leftover BUILD files from the Makefile build. + + These files could interfere with Bazel parsing. + """ + makefile_download_dir = 'tensorflow/contrib/makefile/downloads' + if os.path.isdir(makefile_download_dir): + for root, _, filenames in os.walk(makefile_download_dir): + for f in filenames: + if f.endswith('BUILD'): + os.remove(os.path.join(root, f)) + + +def get_var(environ_cp, + var_name, + query_item, + enabled_by_default, + question=None, + yes_reply=None, + no_reply=None): + """Get boolean input from user. + + If var_name is not set in env, ask user to enable query_item or not. If the + response is empty, use the default. + + Args: + environ_cp: copy of the os.environ. + var_name: string for name of environment variable, e.g. "TF_NEED_HDFS". + query_item: string for feature related to the variable, e.g. "Hadoop File + System". + enabled_by_default: boolean for default behavior. + question: optional string for how to ask for user input. + yes_reply: optionanl string for reply when feature is enabled. 
+    no_reply: optional string for reply when feature is disabled.
+
+  Returns:
+    boolean value of the variable.
+  """
+  if not question:
+    question = 'Do you wish to build TensorFlow with %s support?' % query_item
+  if not yes_reply:
+    yes_reply = '%s support will be enabled for TensorFlow.' % query_item
+  if not no_reply:
+    no_reply = 'No %s' % yes_reply
+
+  yes_reply += '\n'
+  no_reply += '\n'
+
+  if enabled_by_default:
+    question += ' [Y/n]: '
+  else:
+    question += ' [y/N]: '
+
+  var = environ_cp.get(var_name)
+  while var is None:
+    user_input_origin = get_input(question)
+    user_input = user_input_origin.strip().lower()
+    if user_input == 'y':
+      print(yes_reply)
+      var = True
+    elif user_input == 'n':
+      print(no_reply)
+      var = False
+    elif not user_input:
+      if enabled_by_default:
+        print(yes_reply)
+        var = True
+      else:
+        print(no_reply)
+        var = False
+    else:
+      print('Invalid selection: %s' % user_input_origin)
+  return var
+
+
+def set_build_var(environ_cp, var_name, query_item, option_name,
+                  enabled_by_default):
+  """Set if query_item will be enabled for the build.
+
+  Ask user if query_item will be enabled. Default is used if no input is given.
+  Set subprocess environment variable and write to .bazelrc if enabled.
+
+  Args:
+    environ_cp: copy of the os.environ.
+    var_name: string for name of environment variable, e.g. "TF_NEED_HDFS".
+    query_item: string for feature related to the variable, e.g. "Hadoop File
+      System".
+    option_name: string for option to define in .bazelrc.
+    enabled_by_default: boolean for default behavior.
+  """
+
+  var = str(int(get_var(environ_cp, var_name, query_item, enabled_by_default)))
+  environ_cp[var_name] = var
+  if var == '1':
+    write_to_bazelrc('build --define %s=true' % option_name)
+
+
+def set_action_env_var(environ_cp,
+                       var_name,
+                       query_item,
+                       enabled_by_default,
+                       question=None,
+                       yes_reply=None,
+                       no_reply=None):
+  """Set boolean action_env variable.
+
+  Ask user if query_item will be enabled. Default is used if no input is given.
+  Set environment variable and write to .bazelrc.
+
+  Args:
+    environ_cp: copy of the os.environ.
+    var_name: string for name of environment variable, e.g. "TF_NEED_HDFS".
+    query_item: string for feature related to the variable, e.g. "Hadoop File
+      System".
+    enabled_by_default: boolean for default behavior.
+    question: optional string for how to ask for user input.
+    yes_reply: optional string for reply when feature is enabled.
+    no_reply: optional string for reply when feature is disabled.
+  """
+  var = int(
+      get_var(environ_cp, var_name, query_item, enabled_by_default, question,
+              yes_reply, no_reply))
+
+  write_action_env_to_bazelrc(var_name, var)
+  environ_cp[var_name] = str(var)
+
+
+def check_bazel_version(min_version):
+  """Check installed bazel version is at least min_version.
+
+  Args:
+    min_version: string for minimum bazel version.
+  """
+  try:
+    curr_version = run_shell('bazel version')
+  except subprocess.CalledProcessError:
+    print('Cannot find bazel. Please install bazel.')
+    sys.exit(0)
+
+  for line in curr_version.split('\n'):
+    if 'Build label: ' in line:
+      curr_version = line.split('Build label: ')[1]
+      break
+
+  min_version_segments = min_version.split('.')
+  curr_version_segments = curr_version.split('.')
+
+  # Check if current bazel version can be detected properly.
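+  # If any segment below is not numeric, this is treated as a development
+  # build and the minimum-version check is skipped; otherwise each dotted
+  # segment is zero-padded to three digits (e.g. '0.4.5' -> '000004005') so
+  # the joined version strings compare numerically.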
+ for seg in curr_version_segments: + if not seg.isdigit(): + print('WARNING: current bazel installation is not a release version.') + print('Make sure you are running at least bazel %s' % min_version) + return + + min_version_str = ''.join(['%03d' % int(seg) for seg in min_version_segments]) + curr_version_str = ''.join( + ['%03d' % int(seg) for seg in curr_version_segments]) + if int(curr_version_str) < int(min_version_str): + print('Please upgrade your bazel installation to version %s or higher to ' + 'build TensorFlow!' % min_version) + sys.exit(0) + + +def set_cc_opt_flags(environ_cp): + """Set up architecture-dependent optimization flags. + + Also append CC optimization flags to bazel.rc.. + + Args: + environ_cp: copy of the os.environ. + """ + if is_ppc64le(): + # gcc on ppc64le does not support -march, use mcpu instead + default_cc_opt_flags = '-mcpu=native' + else: + default_cc_opt_flags = '-march=native' + question = ('Please specify optimization flags to use during compilation when' + ' bazel option "--config=opt" is specified [Default is %s]: ' + ) % default_cc_opt_flags + cc_opt_flags = get_from_env_or_user_or_default(environ_cp, 'CC_OPT_FLAGS', + question, default_cc_opt_flags) + for opt in cc_opt_flags.split(): + write_to_bazelrc('build:opt --cxxopt=%s --copt=%s' % (opt, opt)) + + +def set_tf_cuda_clang(environ_cp): + """set TF_CUDA_CLANG action_env. + + Args: + environ_cp: copy of the os.environ. + """ + question = 'Do you want to use clang as CUDA compiler?' + yes_reply = 'Clang will be used as CUDA compiler.' + no_reply = 'nvcc will be used as CUDA compiler.' + set_action_env_var( + environ_cp, + 'TF_CUDA_CLANG', + None, + False, + question=question, + yes_reply=yes_reply, + no_reply=no_reply) + + +def get_from_env_or_user_or_default(environ_cp, var_name, ask_for_var, + var_default): + """Get var_name either from env, or user or default. + + If var_name has been set as environment variable, use the preset value, else + ask for user input. If no input is provided, the default is used. + + Args: + environ_cp: copy of the os.environ. + var_name: string for name of environment variable, e.g. "TF_NEED_HDFS". + ask_for_var: string for how to ask for user input. + var_default: default value string. + + Returns: + string value for var_name + """ + var = environ_cp.get(var_name) + if not var: + var = get_input(ask_for_var) + if not var: + var = var_default + return var + + +def set_clang_cuda_compiler_path(environ_cp): + """Set CLANG_CUDA_COMPILER_PATH.""" + default_clang_path = run_shell('which clang || true') + ask_clang_path = ('Please specify which clang should be used as device and ' + 'host compiler. [Default is %s]: ') % default_clang_path + + while True: + clang_cuda_compiler_path = get_from_env_or_user_or_default( + environ_cp, 'CLANG_CUDA_COMPILER_PATH', ask_clang_path, + default_clang_path) + if os.path.exists(clang_cuda_compiler_path): + break + + # Reset and retry + print('Invalid clang path: %s cannot be found.' 
% clang_cuda_compiler_path) + environ_cp['CLANG_CUDA_COMPILER_PATH'] = '' + + # Set CLANG_CUDA_COMPILER_PATH + environ_cp['CLANG_CUDA_COMPILER_PATH'] = clang_cuda_compiler_path + write_action_env_to_bazelrc('CLANG_CUDA_COMPILER_PATH', + clang_cuda_compiler_path) + + +def set_gcc_host_compiler_path(environ_cp): + """Set GCC_HOST_COMPILER_PATH.""" + default_gcc_host_compiler_path = run_shell('which gcc || true') + cuda_bin_symlink = '%s/bin/gcc' % environ_cp.get('CUDA_TOOLKIT_PATH') + + if os.path.islink(cuda_bin_symlink): + # os.readlink is only available in linux + default_gcc_host_compiler_path = run_shell('readlink %s' % cuda_bin_symlink) + + ask_gcc_path = ( + 'Please specify which gcc should be used by nvcc as the ' + 'host compiler. [Default is %s]: ') % default_gcc_host_compiler_path + while True: + gcc_host_compiler_path = get_from_env_or_user_or_default( + environ_cp, 'GCC_HOST_COMPILER_PATH', ask_gcc_path, + default_gcc_host_compiler_path) + + if os.path.exists(gcc_host_compiler_path): + break + + # Reset and retry + print('Invalid gcc path. %s cannot be found' % gcc_host_compiler_path) + environ_cp['GCC_HOST_COMPILER_PATH'] = '' + + # Set GCC_HOST_COMPILER_PATH + environ_cp['GCC_HOST_COMPILER_PATH'] = gcc_host_compiler_path + write_action_env_to_bazelrc('GCC_HOST_COMPILER_PATH', gcc_host_compiler_path) + + +def set_tf_cuda_version(environ_cp): + """Set CUDA_TOOLKIT_PATH and TF_CUDA_VERSION.""" + ask_cuda_version = ( + 'Please specify the CUDA SDK version you want to use, ' + 'e.g. 7.0. [Leave empty to default to CUDA %s]: ') % _DEFAULT_CUDA_VERSION + + while True: + # Configure the Cuda SDK version to use. + tf_cuda_version = get_from_env_or_user_or_default( + environ_cp, 'TF_CUDA_VERSION', ask_cuda_version, _DEFAULT_CUDA_VERSION) + + # Find out where the CUDA toolkit is installed + default_cuda_path = _DEFAULT_CUDA_PATH + if is_windows(): + default_cuda_path = cygpath( + environ_cp.get('CUDA_PATH', _DEFAULT_CUDA_PATH_WIN)) + elif is_linux(): + # If the default doesn't exist, try an alternative default. + if (not os.path.exists(default_cuda_path) + ) and os.path.exists(_DEFAULT_CUDA_PATH_LINUX): + default_cuda_path = _DEFAULT_CUDA_PATH_LINUX + ask_cuda_path = ('Please specify the location where CUDA %s toolkit is' + ' installed. Refer to README.md for more details. ' + '[Default is %s]: ') % (tf_cuda_version, default_cuda_path) + cuda_toolkit_path = get_from_env_or_user_or_default( + environ_cp, 'CUDA_TOOLKIT_PATH', ask_cuda_path, default_cuda_path) + + if is_windows(): + cuda_rt_lib_path = 'lib/x64/cudart.lib' + elif is_linux(): + cuda_rt_lib_path = 'lib64/libcudart.so.%s' % tf_cuda_version + elif is_macos(): + cuda_rt_lib_path = 'lib/libcudart.%s.dylib' % tf_cuda_version + + cuda_toolkit_path_full = os.path.join(cuda_toolkit_path, cuda_rt_lib_path) + if os.path.exists(cuda_toolkit_path_full): + break + + # Reset and retry + print('Invalid path to CUDA %s toolkit. %s cannot be found' % + (tf_cuda_version, cuda_toolkit_path_full)) + environ_cp['TF_CUDA_VERSION'] = '' + environ_cp['CUDA_TOOLKIT_PATH'] = '' + + # Set CUDA_TOOLKIT_PATH and TF_CUDA_VERSION + environ_cp['CUDA_TOOLKIT_PATH'] = cuda_toolkit_path + write_action_env_to_bazelrc('CUDA_TOOLKIT_PATH', cuda_toolkit_path) + environ_cp['TF_CUDA_VERSION'] = tf_cuda_version + write_action_env_to_bazelrc('TF_CUDA_VERSION', tf_cuda_version) + + +def set_tf_cunn_version(environ_cp): + """Set CUDNN_INSTALL_PATH and TF_CUDNN_VERSION.""" + ask_cudnn_version = ( + '"Please specify the cuDNN version you want to use. 
' + '[Leave empty to default to cuDNN %s.0]: ') % _DEFAULT_CUDNN_VERSION + + while True: + tf_cudnn_version = get_from_env_or_user_or_default( + environ_cp, 'TF_CUDNN_VERSION', ask_cudnn_version, + _DEFAULT_CUDNN_VERSION) + + default_cudnn_path = environ_cp.get('CUDA_TOOLKIT_PATH') + ask_cudnn_path = (r'Please specify the location where cuDNN %s library is ' + 'installed. Refer to README.md for more details. [Default' + ' is %s]:') % (tf_cudnn_version, default_cudnn_path) + cudnn_install_path = get_from_env_or_user_or_default( + environ_cp, 'CUDNN_INSTALL_PATH', ask_cudnn_path, default_cudnn_path) + + # Result returned from "read" will be used unexpanded. That make "~" + # unusable. Going through one more level of expansion to handle that. + cudnn_install_path = os.path.realpath( + os.path.expanduser(cudnn_install_path)) + if is_windows(): + cudnn_install_path = cygpath(cudnn_install_path) + + if is_windows(): + cuda_dnn_lib_path = 'lib/x64/cudnn.lib' + cuda_dnn_lib_alt_path = 'lib/x64/cudnn.lib' + elif is_linux(): + cuda_dnn_lib_path = 'lib64/libcudnn.so.%s' % tf_cudnn_version + cuda_dnn_lib_alt_path = 'libcudnn.so.%s' % tf_cudnn_version + elif is_macos(): + cuda_dnn_lib_path = 'lib/libcudnn.%s.dylib' % tf_cudnn_version + cuda_dnn_lib_alt_path = 'libcudnn.%s.dylib' % tf_cudnn_version + + cuda_dnn_lib_path_full = os.path.join(cudnn_install_path, cuda_dnn_lib_path) + cuda_dnn_lib_alt_path_full = os.path.join(cudnn_install_path, + cuda_dnn_lib_alt_path) + if os.path.exists(cuda_dnn_lib_path_full) or os.path.exists( + cuda_dnn_lib_alt_path_full): + break + + # Try another alternative for Linux + if is_linux(): + if subprocess.call(['which', 'ldconfig']): + ldconfig_bin = '/sbin/ldconfig' + else: + ldconfig_bin = 'ldconfig' + cudnn_path_from_ldconfig = run_shell( + r'%s -p | sed -n "s/.*libcudnn.so .* => \(.*\)/\\1/p"' % ldconfig_bin) + if os.path.exists('%s.%s' % (cudnn_path_from_ldconfig, tf_cudnn_version)): + cudnn_install_path = os.path.dirname(cudnn_path_from_ldconfig) + break + + # Reset and Retry + print( + 'Invalid path to cuDNN %s toolkit. None of the following files can be ' + 'found:' % tf_cudnn_version) + print(cuda_dnn_lib_path_full) + print(cuda_dnn_lib_alt_path_full) + if is_linux(): + print('%s.%s' % (cudnn_path_from_ldconfig, tf_cudnn_version)) + + environ_cp['TF_CUDNN_VERSION'] = '' + + # Set CUDNN_INSTALL_PATH and TF_CUDNN_VERSION + environ_cp['CUDNN_INSTALL_PATH'] = cudnn_install_path + write_action_env_to_bazelrc('CUDNN_INSTALL_PATH', cudnn_install_path) + environ_cp['TF_CUDNN_VERSION'] = tf_cudnn_version + write_action_env_to_bazelrc('TF_CUDNN_VERSION', tf_cudnn_version) + + +def get_native_cuda_compute_capabilities(environ_cp): + """Get native cuda compute capabilities. + + Args: + environ_cp: copy of the os.environ. + Returns: + string of native cuda compute capabilities, separated by comma. 
+ """ + device_query_bin = os.path.join( + environ_cp.get('CUDA_TOOLKIT_PATH'), 'extras/demo_suite/deviceQuery') + cmd = (r'"%s" | grep "Capability" | grep -o "[0-9]*\.[0-9]*" | sed ' + '":a;{N;s/\\n/,/};ba"') % device_query_bin + try: + output = run_shell(cmd) + except subprocess.CalledProcessError: + output = '' + return output + + +def set_tf_cuda_compute_capabilities(environ_cp): + """Set TF_CUDA_COMPUTE_CAPABILITIES.""" + while True: + native_cuda_compute_capabilities = get_native_cuda_compute_capabilities( + environ_cp) + if not native_cuda_compute_capabilities: + default_cuda_compute_capabilities = _DEFAULT_CUDA_COMPUTE_CAPABILITIES + else: + default_cuda_compute_capabilities = native_cuda_compute_capabilities + + ask_cuda_compute_capabilities = ( + 'Please specify a list of comma-separated ' + 'Cuda compute capabilities you want to ' + 'build with.\nYou can find the compute ' + 'capability of your device at: ' + 'https://developer.nvidia.com/cuda-gpus.\nPlease' + ' note that each additional compute ' + 'capability significantly increases your ' + 'build time and binary size. [Default is: %s]' % + default_cuda_compute_capabilities) + tf_cuda_compute_capabilities = get_from_env_or_user_or_default( + environ_cp, 'TF_CUDA_COMPUTE_CAPABILITIES', + ask_cuda_compute_capabilities, default_cuda_compute_capabilities) + # Check whether all capabilities from the input is valid + all_valid = True + for compute_capability in tf_cuda_compute_capabilities.split(','): + if not re.match('[0-9]+.[0-9]+', compute_capability): + print('Invalid compute capability: ' % compute_capability) + all_valid = False + + if all_valid: + break + + # Reset and Retry + environ_cp['TF_CUDA_COMPUTE_CAPABILITIES'] = '' + + # Set TF_CUDA_COMPUTE_CAPABILITIES + environ_cp['TF_CUDA_COMPUTE_CAPABILITIES'] = tf_cuda_compute_capabilities + write_action_env_to_bazelrc('TF_CUDA_COMPUTE_CAPABILITIES', + tf_cuda_compute_capabilities) + + +def set_other_cuda_vars(environ_cp): + """Set other CUDA related variables.""" + if is_windows(): + # The following three variables are needed for MSVC toolchain configuration + # in Bazel + environ_cp['CUDA_PATH'] = environ_cp.get('CUDA_TOOLKIT_PATH') + environ_cp['CUDA_COMPUTE_CAPABILITIES'] = environ_cp.get( + 'TF_CUDA_COMPUTE_CAPABILITIES') + environ_cp['NO_WHOLE_ARCHIVE_OPTION'] = 1 + write_action_env_to_bazelrc('CUDA_PATH', environ_cp.get('CUDA_PATH')) + write_action_env_to_bazelrc('CUDA_COMPUTE_CAPABILITIE', + environ_cp.get('CUDA_COMPUTE_CAPABILITIE')) + write_action_env_to_bazelrc('NO_WHOLE_ARCHIVE_OPTION', + environ_cp.get('NO_WHOLE_ARCHIVE_OPTION')) + write_to_bazelrc('build --config=win-cuda') + write_to_bazelrc('test --config=win-cuda') + else: + # If CUDA is enabled, always use GPU during build and test. + if environ_cp.get('TF_CUDA_CLANG') == '1': + write_to_bazelrc('build --config=cuda_clang') + write_to_bazelrc('test --config=cuda_clang') + else: + write_to_bazelrc('build --config=cuda') + write_to_bazelrc('test --config=cuda') + + +def set_host_cxx_compiler(environ_cp): + """Set HOST_CXX_COMPILER.""" + default_cxx_host_compiler = run_shell('which g++ || true') + ask_cxx_host_compiler = ( + 'Please specify which C++ compiler should be used as' + ' the host C++ compiler. [Default is %s]: ') % default_cxx_host_compiler + + while True: + host_cxx_compiler = get_from_env_or_user_or_default( + environ_cp, 'HOST_CXX_COMPILER', ask_cxx_host_compiler, + default_cxx_host_compiler) + if os.path.exists(host_cxx_compiler): + break + + # Reset and retry + print('Invalid C++ compiler path. 
%s cannot be found' % host_cxx_compiler) + environ_cp['HOST_CXX_COMPILER'] = '' + + # Set HOST_CXX_COMPILER + environ_cp['HOST_CXX_COMPILER'] = host_cxx_compiler + write_action_env_to_bazelrc('HOST_CXX_COMPILER', host_cxx_compiler) + + +def set_host_c_compiler(environ_cp): + """Set HOST_C_COMPILER.""" + default_c_host_compiler = run_shell('which gcc || true') + ask_c_host_compiler = ( + 'Please specify which C compiler should be used as the' + ' host C compiler. [Default is %s]: ') % default_c_host_compiler + + while True: + host_c_compiler = get_from_env_or_user_or_default( + environ_cp, 'HOST_C_COMPILER', ask_c_host_compiler, + default_c_host_compiler) + if os.path.exists(host_c_compiler): + break + + # Reset and retry + print('Invalid C compiler path. %s cannot be found' % host_c_compiler) + environ_cp['HOST_C_COMPILER'] = '' + + # Set HOST_C_COMPILER + environ_cp['HOST_C_COMPILER'] = host_c_compiler + write_action_env_to_bazelrc('HOST_C_COMPILER', host_c_compiler) + + +def set_computecpp_toolkit_path(environ_cp): + """Set COMPUTECPP_TOOLKIT_PATH.""" + ask_computecpp_toolkit_path = ('Please specify the location where ComputeCpp ' + 'for SYCL %s is installed. [Default is %s]: ' + ) % (_TF_OPENCL_VERSION, + _DEFAULT_COMPUTECPP_TOOLKIT_PATH) + + while True: + computecpp_toolkit_path = get_from_env_or_user_or_default( + environ_cp, 'COMPUTECPP_TOOLKIT_PATH', ask_computecpp_toolkit_path, + _DEFAULT_COMPUTECPP_TOOLKIT_PATH) + if is_linux(): + sycl_rt_lib_path = 'lib/libComputeCpp.so' + else: + sycl_rt_lib_path = '' + + sycl_rt_lib_path_full = os.path.join(computecpp_toolkit_path, + sycl_rt_lib_path) + if os.path.exists(sycl_rt_lib_path_full): + break + + print('Invalid SYCL %s library path. %s cannot be found' % + (_TF_OPENCL_VERSION, sycl_rt_lib_path_full)) + environ_cp['COMPUTECPP_TOOLKIT_PATH'] = '' + + # Set COMPUTECPP_TOOLKIT_PATH + environ_cp['COMPUTECPP_TOOLKIT_PATH'] = computecpp_toolkit_path + write_action_env_to_bazelrc('COMPUTECPP_TOOLKIT_PATH', + computecpp_toolkit_path) + + +def set_mpi_home(environ_cp): + """Set MPI_HOME.""" + cmd = ('dirname $(dirname $(which mpirun)) || dirname $(dirname $(which ' + 'mpiexec)) || true') + default_mpi_home = run_shell(cmd) + ask_mpi_home = ('Please specify the MPI toolkit folder. [Default is %s]: ' + ) % default_mpi_home + while True: + mpi_home = get_from_env_or_user_or_default(environ_cp, 'MPI_HOME', + ask_mpi_home, default_mpi_home) + + if os.path.exists(os.path.join(mpi_home, 'include')) and os.path.exists( + os.path.join(mpi_home, 'lib')): + break + + print('Invalid path to the MPI Toolkit. 
%s or %s cannot be found' %
+          (os.path.join(mpi_home, 'include'), os.path.join(mpi_home, 'lib')))
+    environ_cp['MPI_HOME'] = ''
+
+  # Set MPI_HOME
+  environ_cp['MPI_HOME'] = str(mpi_home)
+
+
+def set_other_mpi_vars(environ_cp):
+  """Set other MPI related variables."""
+  # Link the MPI header files
+  mpi_home = environ_cp.get('MPI_HOME')
+  symlink_force('%s/include/mpi.h' % mpi_home, 'third_party/mpi/mpi.h')
+
+  # Determine if we use OpenMPI or MVAPICH; these require different header files
+  # to be included here to make bazel dependency checker happy
+  if os.path.exists(os.path.join(mpi_home, 'include/mpi_portable_platform.h')):
+    symlink_force(
+        os.path.join(mpi_home, 'include/mpi_portable_platform.h'),
+        'third_party/mpi/mpi_portable_platform.h')
+    # TODO(gunan): avoid editing files in configure
+    sed_in_place('third_party/mpi/mpi.bzl', 'MPI_LIB_IS_OPENMPI=False',
+                 'MPI_LIB_IS_OPENMPI=True')
+  else:
+    # MVAPICH / MPICH
+    symlink_force(
+        os.path.join(mpi_home, 'include/mpio.h'), 'third_party/mpi/mpio.h')
+    symlink_force(
+        os.path.join(mpi_home, 'include/mpicxx.h'), 'third_party/mpi/mpicxx.h')
+    # TODO(gunan): avoid editing files in configure
+    sed_in_place('third_party/mpi/mpi.bzl', 'MPI_LIB_IS_OPENMPI=True',
+                 'MPI_LIB_IS_OPENMPI=False')
+
+  if os.path.exists(os.path.join(mpi_home, 'lib/libmpi.so')):
+    symlink_force(
+        os.path.join(mpi_home, 'lib/libmpi.so'), 'third_party/mpi/libmpi.so')
+  else:
+    raise ValueError('Cannot find the MPI library file in %s/lib' % mpi_home)
+
+
+def set_mkl():
+  write_to_bazelrc('build:mkl --define with_mkl_support=true')
+  write_to_bazelrc('build:mkl --define using_mkl=true')
+  write_to_bazelrc('build:mkl -c opt')
+  write_to_bazelrc('build:mkl --copt="-DEIGEN_USE_VML"')
+  print(
+      'Add "--config=mkl" to your bazel command to build with MKL '
+      'support.\nPlease note that MKL on MacOS or windows is still not '
+      'supported.\nIf you would like to use a local MKL instead of '
+      'downloading, please set the environment variable \"TF_MKL_ROOT\" every '
+      'time before build.')
+
+
+def main():
+  # Make a copy of os.environ so it is clear which environment variables the
+  # functions below get and set.
+ environ_cp = dict(os.environ) + + check_bazel_version('0.4.5') + + reset_tf_configure_bazelrc() + cleanup_makefile() + setup_python(environ_cp) + run_gen_git_source(environ_cp) + + if is_windows(): + environ_cp['TF_NEED_GCP'] = '0' + environ_cp['TF_NEED_HDFS'] = '0' + environ_cp['TF_NEED_JEMALLOC'] = '0' + environ_cp['TF_NEED_OPENCL'] = '0' + environ_cp['TF_CUDA_CLANG'] = '0' + + if is_macos(): + environ_cp['TF_NEED_JEMALLOC'] = '0' + + set_build_var(environ_cp, 'TF_NEED_JEMALLOC', 'jemalloc as malloc', + 'with_jemalloc', True) + set_build_var(environ_cp, 'TF_NEED_GCP', 'Google Cloud Platform', + 'with_gcp_support', False) + set_build_var(environ_cp, 'TF_NEED_HDFS', 'Hadoop File System', + 'with_hdfs_support', False) + set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support', + False) + set_build_var(environ_cp, 'TF_NEED_VERBS', 'VERBS', 'with_verbs_support', + False) + + set_action_env_var(environ_cp, 'TF_NEED_OPENCL', 'OpenCL', False) + if environ_cp.get('TF_NEED_OPENCL') == '1': + set_host_cxx_compiler(environ_cp) + set_host_c_compiler(environ_cp) + set_computecpp_toolkit_path(environ_cp) + + set_action_env_var(environ_cp, 'TF_NEED_CUDA', 'CUDA', False) + if environ_cp.get('TF_NEED_CUDA') == '1': + set_tf_cuda_version(environ_cp) + set_tf_cunn_version(environ_cp) + set_tf_cuda_compute_capabilities(environ_cp) + + set_tf_cuda_clang(environ_cp) + if environ_cp.get('TF_CUDA_CLANG') == '1': + # Set up which clang we should use as the cuda / host compiler. + set_clang_cuda_compiler_path(environ_cp) + else: + # Set up which gcc nvcc should use as the host compiler + # No need to set this on Windows + if not is_windows(): + set_gcc_host_compiler_path(environ_cp) + set_other_cuda_vars(environ_cp) + + set_build_var(environ_cp, 'TF_NEED_MPI', 'MPI', 'with_mpi_support', False) + if environ_cp.get('TF_NEED_MPI') == '1': + set_mpi_home(environ_cp) + set_other_mpi_vars(environ_cp) + + set_cc_opt_flags(environ_cp) + set_mkl() + + +if __name__ == '__main__': + main() diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 3e69134c50a..371264ef6c2 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -56,21 +56,16 @@ limitations under the License. // The implementation below is at the top level instead of the // brain namespace because we are defining 'extern "C"' functions. 
-using tensorflow::error::Code; -using tensorflow::errors::InvalidArgument; -using tensorflow::gtl::ArraySlice; -using tensorflow::strings::StrCat; using tensorflow::AllocationDescription; using tensorflow::DataType; using tensorflow::Graph; using tensorflow::GraphDef; -using tensorflow::mutex_lock; using tensorflow::NameRangeMap; using tensorflow::NameRangesForNode; using tensorflow::NewSession; using tensorflow::Node; -using tensorflow::NodeDef; using tensorflow::NodeBuilder; +using tensorflow::NodeDef; using tensorflow::OpDef; using tensorflow::OpRegistry; using tensorflow::PartialTensorShape; @@ -83,6 +78,11 @@ using tensorflow::TensorBuffer; using tensorflow::TensorId; using tensorflow::TensorShape; using tensorflow::TensorShapeProto; +using tensorflow::error::Code; +using tensorflow::errors::InvalidArgument; +using tensorflow::gtl::ArraySlice; +using tensorflow::mutex_lock; +using tensorflow::strings::StrCat; extern "C" { @@ -258,24 +258,27 @@ size_t TF_StringEncode(const char* src, size_t src_len, char* dst, return sz; } -size_t TF_StringDecode(const char* src, size_t src_len, const char** dst, - size_t* dst_len, TF_Status* status) { +static Status TF_StringDecode_Impl(const char* src, size_t src_len, + const char** dst, size_t* dst_len) { tensorflow::uint64 len64 = 0; const char* p = tensorflow::core::GetVarint64Ptr(src, src + src_len, &len64); if (p == nullptr) { - status->status = - InvalidArgument("invalid string encoding or truncated src buffer"); - return 0; + return InvalidArgument("invalid string encoding or truncated src buffer"); } if (len64 > std::numeric_limits::max()) { - status->status = - InvalidArgument("encoded string is ", len64, - "-bytes, which is too large for this architecture"); - return 0; + return InvalidArgument("encoded string is ", len64, + "-bytes, which is too large for this architecture"); } *dst = p; *dst_len = static_cast(len64); - return static_cast(p - src) + *dst_len; + return Status::OK(); +} + +size_t TF_StringDecode(const char* src, size_t src_len, const char** dst, + size_t* dst_len, TF_Status* status) { + status->status = TF_StringDecode_Impl(src, src_len, dst, dst_len); + if (!status->status.ok()) return 0; + return static_cast(*dst - src) + *dst_len; } size_t TF_StringEncodedSize(size_t len) { @@ -391,16 +394,20 @@ void TF_Reset(const TF_SessionOptions* opt, const char** containers, namespace tensorflow { -// Non-static for testing. -bool TF_Tensor_DecodeStrings(TF_Tensor* src, Tensor* dst, TF_Status* status) { +Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst) { + if (src->dtype != TF_STRING) { + *dst = TensorCApi::MakeTensor(src->dtype, src->shape, src->buffer); + return Status::OK(); + } + // TF_STRING tensors require copying since Tensor class expects a sequence of + // string objects. 
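+  // For illustration (layout inferred from the decode logic below and the
+  // StringTensor test in c_api_test.cc): the flat TF_STRING encoding is one
+  // uint64 offset per element, followed by each element stored as a varint
+  // length plus its bytes. A scalar tensor holding "A" is therefore the bytes
+  // {0, 0, 0, 0, 0, 0, 0, 0, 1, 'A'}.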
const tensorflow::int64 num_elements = src->shape.num_elements(); const char* input = reinterpret_cast(TF_TensorData(src)); const size_t src_size = TF_TensorByteSize(src); if (static_cast(src_size / sizeof(tensorflow::uint64)) < num_elements) { - status->status = InvalidArgument( + return InvalidArgument( "Malformed TF_STRING tensor; too short to hold number of elements"); - return false; } const char* data_start = input + sizeof(tensorflow::uint64) * num_elements; const char* limit = input + src_size; @@ -411,24 +418,30 @@ bool TF_Tensor_DecodeStrings(TF_Tensor* src, Tensor* dst, TF_Status* status) { tensorflow::uint64 offset = reinterpret_cast(input)[i]; if (static_cast(offset) >= (limit - data_start)) { - status->status = InvalidArgument("Malformed TF_STRING tensor; element ", - i, " out of range"); - return false; + return InvalidArgument("Malformed TF_STRING tensor; element ", i, + " out of range"); } size_t len; const char* p; const char* srcp = data_start + offset; - TF_StringDecode(srcp, limit - srcp, &p, &len, status); - if (!status->status.ok()) { - return false; - } + Status status = TF_StringDecode_Impl(srcp, limit - srcp, &p, &len); + if (!status.ok()) return status; dstarray(i).assign(p, len); } - return true; + return Status::OK(); } // Non-static for testing. -TF_Tensor* TF_Tensor_EncodeStrings(const Tensor& src) { +TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src) { + if (src.dtype() != DT_STRING) { + TensorBuffer* buf = TensorCApi::Buffer(src); + buf->Ref(); + return new TF_Tensor{static_cast(src.dtype()), src.shape(), + buf}; + } + // DT_STRING tensors require a copying since TF_Tensor.buffer expects a flatly + // encoded sequence of strings. + // Compute bytes needed for encoding. size_t size = 0; const auto& srcarray = src.flat(); @@ -507,16 +520,8 @@ static bool TF_Run_Inputs( TF_Status* status) { const int ninputs = input_pairs->size(); for (int i = 0; i < ninputs; ++i) { - TF_Tensor* src = c_inputs[i]; - if (c_inputs[i]->dtype != TF_STRING) { - (*input_pairs)[i].second = tensorflow::TensorCApi::MakeTensor( - src->dtype, src->shape, src->buffer); - } else if (!tensorflow::TF_Tensor_DecodeStrings( - src, &(*input_pairs)[i].second, status)) { - // TF_STRING tensors require copying since Tensor class expects - // a sequence of string objects. - return false; - } + status->status = TF_TensorToTensor(c_inputs[i], &(*input_pairs)[i].second); + if (!status->status.ok()) return false; } return true; } @@ -574,15 +579,7 @@ static void TF_Run_Helper( static_cast(src.dtype()), src.shape()); continue; } - if (src.dtype() != tensorflow::DT_STRING) { - // Share the underlying buffer. - TensorBuffer* buf = tensorflow::TensorCApi::Buffer(src); - buf->Ref(); - c_outputs[i] = new TF_Tensor{static_cast(src.dtype()), - src.shape(), buf}; - } else { - c_outputs[i] = tensorflow::TF_Tensor_EncodeStrings(src); - } + c_outputs[i] = TF_TensorFromTensor(src); } } @@ -1062,20 +1059,9 @@ void TF_SetAttrTensorShapeProtoList(TF_OperationDescription* desc, void TF_SetAttrTensor(TF_OperationDescription* desc, const char* attr_name, TF_Tensor* value, TF_Status* status) { - status->status = Status::OK(); Tensor t; - bool ok = true; - - if (value->dtype != TF_STRING) { - t = tensorflow::TensorCApi::MakeTensor(value->dtype, value->shape, - value->buffer); - } else { - // TF_STRING tensors require copying since Tensor class expects - // a sequence of string objects. 
- ok = tensorflow::TF_Tensor_DecodeStrings(value, &t, status); - } - - if (ok) desc->node_builder.Attr(attr_name, t); + status->status = TF_TensorToTensor(value, &t); + if (status->status.ok()) desc->node_builder.Attr(attr_name, t); } void TF_SetAttrTensorList(TF_OperationDescription* desc, const char* attr_name, @@ -1084,21 +1070,14 @@ void TF_SetAttrTensorList(TF_OperationDescription* desc, const char* attr_name, status->status = Status::OK(); std::vector t; t.reserve(num_values); - bool ok = true; - for (int i = 0; i < num_values && ok; ++i) { - if (values[i]->dtype != TF_STRING) { - t.emplace_back(tensorflow::TensorCApi::MakeTensor( - values[i]->dtype, values[i]->shape, values[i]->buffer)); - } else { - t.emplace_back(::tensorflow::DT_STRING); - // TF_STRING tensors require copying since Tensor class expects - // a sequence of string objects. - ok = tensorflow::TF_Tensor_DecodeStrings(values[i], &t.back(), status); - } + for (int i = 0; i < num_values && status->status.ok(); ++i) { + Tensor v; + status->status = TF_TensorToTensor(values[i], &v); + t.emplace_back(v); } - if (ok) desc->node_builder.Attr(attr_name, t); + if (status->status.ok()) desc->node_builder.Attr(attr_name, t); } void TF_SetAttrValueProto(TF_OperationDescription* desc, const char* attr_name, @@ -1555,9 +1534,7 @@ void TF_OperationGetAttrTensor(TF_Operation* oper, const char* attr_name, Tensor t; status->status = tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &t); if (!status->status.ok()) return; - *value = new TF_Tensor{static_cast(t.dtype()), t.shape(), - tensorflow::TensorCApi::Buffer(t)}; - (*value)->buffer->Ref(); + *value = TF_TensorFromTensor(t); } void TF_OperationGetAttrTensorList(TF_Operation* oper, const char* attr_name, @@ -1568,10 +1545,7 @@ void TF_OperationGetAttrTensorList(TF_Operation* oper, const char* attr_name, if (!status->status.ok()) return; const auto len = std::min(max_values, static_cast(ts.size())); for (int i = 0; i < len; ++i) { - const Tensor& t = ts[i]; - values[i] = new TF_Tensor{static_cast(t.dtype()), t.shape(), - tensorflow::TensorCApi::Buffer(t)}; - values[i]->buffer->Ref(); + values[i] = TF_TensorFromTensor(ts[i]); } } diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc index d6debe3b994..25b6cbd8e7a 100644 --- a/tensorflow/c/c_api_test.cc +++ b/tensorflow/c/c_api_test.cc @@ -45,9 +45,8 @@ limitations under the License. #include "tensorflow/core/util/equal_graph_def.h" namespace tensorflow { - -bool TF_Tensor_DecodeStrings(TF_Tensor* src, Tensor* dst, TF_Status* status); -TF_Tensor* TF_Tensor_EncodeStrings(const Tensor& src); +TF_Tensor* TF_TensorFromTensor(const Tensor& src); +Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst); namespace { @@ -146,19 +145,16 @@ void TestEncodeDecode(int line, const std::vector& data) { for (tensorflow::int64 i = 0; i < src.NumElements(); ++i) { src.flat()(i) = data[i]; } - TF_Tensor* dst = TF_Tensor_EncodeStrings(src); + TF_Tensor* dst = TF_TensorFromTensor(src); // Convert back to a C++ Tensor and ensure we get expected output. 
-  TF_Status* status = TF_NewStatus();
   Tensor output;
-  ASSERT_TRUE(TF_Tensor_DecodeStrings(dst, &output, status)) << line;
-  ASSERT_EQ(TF_OK, TF_GetCode(status)) << line;
+  ASSERT_EQ(Status::OK(), TF_TensorToTensor(dst, &output)) << line;
   ASSERT_EQ(src.NumElements(), output.NumElements()) << line;
   for (tensorflow::int64 i = 0; i < src.NumElements(); ++i) {
     ASSERT_EQ(data[i], output.flat<string>()(i)) << line;
   }
-  TF_DeleteStatus(status);
   TF_DeleteTensor(dst);
 }
}
@@ -918,7 +914,7 @@ TEST(CAPI, SavedModel) {
   TF_Operation* input_op =
       TF_GraphOperationByName(graph, input_op_name.c_str());
   ASSERT_TRUE(input_op != nullptr);
-  csession.SetInputs({{input_op, TF_Tensor_EncodeStrings(input)}});
+  csession.SetInputs({{input_op, TF_TensorFromTensor(input)}});
   const tensorflow::string output_op_name =
       tensorflow::ParseTensorName(output_name).first.ToString();
@@ -1636,6 +1632,39 @@ TEST_F(CApiAttributesTest, Tensor) {
   TF_DeleteTensor(value);
 }
+TEST_F(CApiAttributesTest, StringTensor) {
+  // Create the string-Tensor "attribute" value.
+  char encoded[] = {
+      0, 0, 0, 0, 0, 0, 0, 0,  // array[uint64] offsets
+      1,                       // varint encoded string length
+      'A',
+  };
+  auto deallocator = [](void* data, size_t len, void* arg) {};
+  unique_tensor_ptr t_in(TF_NewTensor(TF_STRING, nullptr, 0, &encoded[0],
+                                      sizeof(encoded), deallocator, nullptr),
+                         TF_DeleteTensor);
+
+  // Create a TF_Operation with the attribute t_in
+  auto desc = init("tensor");
+  TF_SetAttrTensor(desc, "v", t_in.get(), s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+  auto oper = TF_FinishOperation(desc, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+  // Fetch the attribute back.
+  EXPECT_TF_META("v", -1, TF_ATTR_TENSOR, -1);
+  TF_Tensor* t_out = nullptr;
+  TF_OperationGetAttrTensor(oper, "v", &t_out, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  EXPECT_EQ(TF_STRING, TF_TensorType(t_out));
+  EXPECT_EQ(0, TF_NumDims(t_out));
+  ASSERT_EQ(TF_TensorByteSize(t_in.get()), TF_TensorByteSize(t_out));
+  EXPECT_EQ(0, memcmp(TF_TensorData(t_in.get()), TF_TensorData(t_out),
+                      TF_TensorByteSize(t_out)));
+  TF_DeleteTensor(t_out);
+}
+
 TEST_F(CApiAttributesTest, TensorList) {
   const char tensor1[] = {5, 7};
   const int64_t dims1[] = {1, 2};
@@ -1647,7 +1676,8 @@
   auto desc = init("list(tensor)");
   TF_Tensor* tmp[] = {
-      Int8Tensor(dims1, ndims1, tensor1), Int8Tensor(dims2, ndims2, tensor2),
+      Int8Tensor(dims1, ndims1, tensor1),
+      Int8Tensor(dims2, ndims2, tensor2),
   };
   TF_SetAttrTensorList(desc, "v", tmp, TF_ARRAYSIZE(tmp), s_);
   for (int i = 0; i < TF_ARRAYSIZE(tmp); ++i) {
diff --git a/tensorflow/cc/framework/gradients.cc b/tensorflow/cc/framework/gradients.cc
index cec3ebc0ad6..66a943410e2 100644
--- a/tensorflow/cc/framework/gradients.cc
+++ b/tensorflow/cc/framework/gradients.cc
@@ -356,7 +356,7 @@ Status SymbolicGradientBuilder::AddGradients() {
   // Check if any input nodes still have pending gradients and have not been
   // processed yet. This happens if not all outputs of a node are in 'inputs_'.
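  // For instance (illustrative): a node with two outputs of which only one is
  // listed in 'inputs_' still has a pending count of 1 here, and exactly one
  // requested gradient is recorded for it in 'requested_grads' below.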
std::unordered_map requested_grads; - for (Output nout : inputs_) { + for (const Output& nout : inputs_) { if (pending_[nout.node()->id()] > 0) { DCHECK_GT(nout.node()->num_outputs(), 1); int idx = input_nodes_[nout]; @@ -365,7 +365,7 @@ Status SymbolicGradientBuilder::AddGradients() { ++requested_grads[nout.node()]; } } - for (auto& p : requested_grads) { + for (const auto& p : requested_grads) { int num_requested_inputs = p.first->num_outputs() - pending_[p.first->id()]; CHECK_EQ(num_requested_inputs, p.second); } diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 7eab7bb28f0..77b45aa11e2 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -257,6 +257,11 @@ Status MarkForCompilationPass::Run( ®istration)) { return false; } + + // Don't compile control trigger nodes. We won't preserve their deadness + // semantics correctly, so it's safest not to compile them. + if (node->IsControlTrigger()) return false; + // If this device requires a JIT, we must say yes. if (registration->requires_compilation) return true; diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 4f0137e8d96..c693f58f8bd 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -353,6 +353,20 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "segment_reduction_ops_test", + size = "small", + srcs = ["segment_reduction_ops_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:math_ops", + "//tensorflow/python:math_ops_gen", + "//tensorflow/python:platform_test", + ], +) + tf_xla_py_test( name = "spacetobatch_op_test", size = "medium", diff --git a/tensorflow/compiler/tests/segment_reduction_ops_test.py b/tensorflow/compiler/tests/segment_reduction_ops_test.py new file mode 100644 index 00000000000..260a04421b6 --- /dev/null +++ b/tensorflow/compiler/tests/segment_reduction_ops_test.py @@ -0,0 +1,139 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Test cases for segment reduction ops.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import numpy as np + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import googletest + + +class SegmentReductionOpsTest(XLATestCase): + """Test cases for segment reduction ops.""" + + def UnsortedSegmentSum(self, data, indices, num_segments): + with self.test_session() as sess, self.test_scope(): + d = array_ops.placeholder(data.dtype, shape=data.shape) + if isinstance(indices, int): + i = array_ops.placeholder(np.int32, shape=[]) + else: + i = array_ops.placeholder(indices.dtype, shape=indices.shape) + return sess.run( + math_ops.unsorted_segment_sum(d, i, num_segments), + {d: data, + i: indices}) + + def testUnsortedSegmentSum0DIndices1DData(self): + for dtype in self.numeric_types: + self.assertAllClose( + np.array( + [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 1, 2, 3, 4, 5], + [0, 0, 0, 0, 0, 0]], + dtype=dtype), + self.UnsortedSegmentSum( + np.array([0, 1, 2, 3, 4, 5], dtype=dtype), 2, 4)) + + def testUnsortedSegmentSum1DIndices1DData(self): + for dtype in self.numeric_types: + self.assertAllClose( + np.array([1, 3, 2, 9], dtype=dtype), + self.UnsortedSegmentSum( + np.array([0, 1, 2, 3, 4, 5], dtype=dtype), + np.array([3, 0, 2, 1, 3, 3], dtype=np.int32), 4)) + + def testUnsortedSegmentSum1DIndices2DDataDisjoint(self): + for dtype in self.numeric_types: + data = np.array( + [[0, 1, 2, 3], [20, 21, 22, 23], [30, 31, 32, 33], [40, 41, 42, 43], + [50, 51, 52, 53]], + dtype=dtype) + indices = np.array([8, 1, 0, 3, 7], dtype=np.int32) + num_segments = 10 + y = self.UnsortedSegmentSum(data, indices, num_segments) + self.assertAllClose( + np.array( + [[30, 31, 32, 33], [20, 21, 22, 23], [0, 0, 0, 0], + [40, 41, 42, 43], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], + [50, 51, 52, 53], [0, 1, 2, 3], [0, 0, 0, 0]], + dtype=dtype), y) + + def testUnsortedSegmentSum1DIndices2DDataNonDisjoint(self): + for dtype in self.numeric_types: + data = np.array( + [[0, 1, 2, 3], [20, 21, 22, 23], [30, 31, 32, 33], [40, 41, 42, 43], + [50, 51, 52, 53]], + dtype=dtype) + indices = np.array([0, 1, 2, 0, 1], dtype=np.int32) + num_segments = 4 + y = self.UnsortedSegmentSum(data, indices, num_segments) + self.assertAllClose( + np.array( + [[40, 42, 44, 46], [70, 72, 74, 76], [30, 31, 32, 33], + [0, 0, 0, 0]], + dtype=dtype), y) + + def testUnsortedSegmentSum2DIndices3DData(self): + for dtype in self.numeric_types: + data = np.array( + [[[0, 1, 2], [10, 11, 12]], [[100, 101, 102], [110, 111, 112]], + [[200, 201, 202], [210, 211, 212]], [[300, 301, 302], + [310, 311, 312]]], + dtype=dtype) + indices = np.array([[3, 5], [3, 1], [5, 0], [6, 2]], dtype=np.int32) + num_segments = 8 + y = self.UnsortedSegmentSum(data, indices, num_segments) + self.assertAllClose( + np.array( + [[210, 211, 212], [110, 111, 112], [310, 311, 312], + [100, 102, 104], [0, 0, 0.], [210, 212, 214], [300, 301, + 302], [0, 0, 0]], + dtype=dtype), y) + + def testUnsortedSegmentSum1DIndices3DData(self): + for dtype in self.numeric_types: + data = np.array( + [[[0, 1, 2], [10, 11, 12]], [[100, 101, 102], [110, 111, 112]], + [[200, 201, 202], [210, 211, 212]], [[300, 301, 302], + [310, 311, 312]]], + dtype=dtype) + indices = np.array([3, 0, 2, 5], 
dtype=np.int32) + num_segments = 6 + y = self.UnsortedSegmentSum(data, indices, num_segments) + self.assertAllClose( + np.array( + [[[100, 101, 102.], [110, 111, 112]], [[0, 0, 0], [0, 0, 0]], + [[200, 201, 202], [210, 211, 212]], [[0, 1, 2.], [10, 11, 12]], + [[0, 0, 0], [0, 0, 0]], [[300, 301, 302], [310, 311, 312]]], + dtype=dtype), y) + + def testUnsortedSegmentSumShapeError(self): + for dtype in self.numeric_types: + data = np.ones((4, 8, 7), dtype=dtype) + indices = np.ones((3, 2), dtype=np.int32) + num_segments = 4 + self.assertRaises(ValueError, + functools.partial(self.UnsortedSegmentSum, data, + indices, num_segments)) + + +if __name__ == '__main__': + googletest.main() diff --git a/tensorflow/compiler/tests/tensor_array_ops_test.py b/tensorflow/compiler/tests/tensor_array_ops_test.py index f2773143524..ac039e01623 100644 --- a/tensorflow/compiler/tests/tensor_array_ops_test.py +++ b/tensorflow/compiler/tests/tensor_array_ops_test.py @@ -57,11 +57,13 @@ class TensorArrayTest(xla_test.XLATestCase): r0 = w2.read(0) r1 = w2.read(1) r2 = w2.read(2) + flow = w2.flow - d0, d1, d2 = session.run([r0, r1, r2]) + d0, d1, d2, flow_val = session.run([r0, r1, r2, flow]) self.assertAllEqual([[4.0, 5.0]], d0) self.assertAllEqual([[1.0, 3.0]], d1) self.assertAllEqual([[7.0, -8.5]], d2) + self.assertAllEqual([], flow_val.shape) def _testTensorArrayWritePack(self, tf_dtype): with self.test_session(), self.test_scope(): diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc index faa88ecfe2e..1c7a2046aa5 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc @@ -323,12 +323,26 @@ Status FunctionalizeLoop(Graph* graph, Frame* frame, for (Arg& arg : frame->args) { if (!arg.is_loop_invariant) { // Follow the edge from the Enter to Merge. - if (arg.enter->out_edges().size() != 1) { - return errors::Internal("Enter node for loop-varying argument ", - arg.enter->name(), - " does not have exactly one successor"); + const Edge* enter_merge = nullptr; + for (const Edge* e : arg.enter->out_edges()) { + // Ignore control-edges to the sink node. These are allowed by the + // graph invariants, although probably they should have been stripped + // off earlier. + if (e->IsControlEdge() && e->dst()->IsSink()) { + continue; + } + if (enter_merge != nullptr) { + return errors::Internal( + "Enter node for loop-varying argument ", arg.enter->name(), + " has multiple successors: ", enter_merge->dst()->name(), " and ", + e->dst()->name()); + } + enter_merge = e; + } + if (enter_merge == nullptr) { + return errors::Internal("Enter node for loop-varying argument ", + arg.enter->name(), " has zero successors"); } - const Edge* enter_merge = *arg.enter->out_edges().begin(); arg.merge = enter_merge->dst(); if (!IsMerge(arg.merge)) { return errors::InvalidArgument( diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc index 2fb1cc04543..914c8999a6f 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc @@ -96,6 +96,14 @@ TEST(FunctionalizeControlFlow, OneLoopVar) { TF_EXPECT_OK(scope.ToGraph(&graph)); } + // Regression test: control edges from an Enter node to the graph sink should + // be ignored. 
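+  // The while loop built above yields an Enter node named "while/Enter";
+  // adding a control edge from it to the sink exercises the new logic in
+  // FunctionalizeLoop, which now skips such edges instead of reporting that
+  // the Enter node has multiple successors.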
+ for (Node* n : graph.nodes()) { + if (n->name() == "while/Enter") { + graph.AddControlEdge(n, graph.sink_node()); + } + } + FunctionLibraryDefinition library(OpRegistry::Global(), {}); TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library)); diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index 35bc6b5a24e..546e9be8647 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -47,6 +47,7 @@ tf_kernel_library( "reshape_op.cc", "retval_op.cc", "reverse_op.cc", + "segment_reduction_ops.cc", "select_op.cc", "sequence_ops.cc", "shape_op.cc", diff --git a/tensorflow/compiler/tf2xla/kernels/no_op.cc b/tensorflow/compiler/tf2xla/kernels/no_op.cc index b8f0c0b9fe6..8c8a9bbe787 100644 --- a/tensorflow/compiler/tf2xla/kernels/no_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/no_op.cc @@ -23,4 +23,9 @@ namespace tensorflow { // dummy operator using CompilationOnly(). REGISTER_XLA_OP(Name("NoOp").CompilationOnly(), NoOp); +// We register ControlTrigger as a no-op. This is correct since nodes seen +// by the XLA compiler are never dead. This may need rethinking when we add +// support for conditionals to XLA. +REGISTER_XLA_OP(Name("ControlTrigger"), NoOp); + } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc new file mode 100644 index 00000000000..6a0ce775dc6 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc @@ -0,0 +1,155 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h" +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/types.h" + +namespace tensorflow { +namespace { + +class UnsortedSegmentSum : public XlaOpKernel { + public: + explicit UnsortedSegmentSum(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + // output = unsorted_segment_sum(data, indices, num_segments) + // Compute a tensor such that: + // output[i] = sum over {j where indices[j] == i} of data[j] + // output[i] == 0 if i does not appear in indices + // + // Contrast with segment_sum(), which assumes indices are sorted and that + // max(indices)+1 is the desired size of the output. + // + // The returned output tensor has the same type as data, and the same shape + // as data with the first indices.rank dimensions are replaced + // by a single dimension with size num_segments. 
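+    // For example (illustrative values only):
+    //   data = [1, 2, 3, 4], indices = [0, 1, 0, 1], num_segments = 2
+    //   => output = [1 + 3, 2 + 4] = [4, 6].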
+ + xla::ComputationBuilder* builder = ctx->builder(); + + auto data = ctx->Input(0); + auto data_shape = ctx->InputShape(0); + + auto indices = ctx->Input(1); + auto indices_shape = ctx->InputShape(1); + + OP_REQUIRES(ctx, data_shape.dims() >= indices_shape.dims(), + errors::InvalidArgument( + "UnsortedSegmentSum requires that indices' rank be" + " less than or equal to data's rank.")); + // Validate that indices.shape is a prefix of data.shape. + for (int d = 0; d < indices_shape.dims(); ++d) { + OP_REQUIRES(ctx, (data_shape.dim_size(d) == indices_shape.dim_size(d)), + errors::InvalidArgument( + "UnsortedSegmentSum requires indices shape to be prefix" + " of data_shape, but dimension ", + d, " differs ", data_shape.dim_size(d), " vs. ", + indices_shape.dim_size(d))); + } + + int64 num_segments; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(2, &num_segments)); + + // Flatten the indices into 1-D. + auto indices_1d = builder->Reshape(indices, {indices_shape.num_elements()}); + + // flatten data for dynamic indexing. + int64 out_tensor_dims = data_shape.dims() - indices_shape.dims(); + std::vector flat_shape(1 + out_tensor_dims); + flat_shape[0] = indices_shape.num_elements(); + for (int64 k = 0; k < out_tensor_dims; ++k) { + flat_shape[1 + k] = data_shape.dim_size(indices_shape.dims() + k); + } + auto data_flat = builder->Reshape(data, flat_shape); + + // output shape; same as data_shape, but dimension 0 is num_segments. + std::vector out_shape(flat_shape); + out_shape[0] = num_segments; + + // Pad the output array dims to rank >= 3 to work around lowering issues. + // TODO(b/37575001) This is awkward, and could be improved. + int64 extra_dims = 0; + if (out_shape.size() < 3) { + extra_dims = 3u - out_shape.size(); + } + std::vector rshape(extra_dims + out_shape.size(), 1); + for (unsigned k = 0; k < out_shape.size(); ++k) { + rshape[extra_dims + k] = out_shape[k]; + } + auto output = builder->Broadcast(XlaHelpers::Zero(builder, dtype_), rshape); + + auto zero = builder->ConstantR1({0}); + + for (int64 i = 0; i < indices_shape.num_elements(); ++i) { + // output[indices[i]] += data[i] + + std::vector data_start_indices(flat_shape.size()); + data_start_indices[0] = i; + for (unsigned d = 1; d < flat_shape.size(); ++d) { + data_start_indices[d] = 0; + } + std::vector data_limit_indices(flat_shape); + data_limit_indices[0] = i + 1; + std::vector stride(flat_shape.size(), 1); + + auto data_slice = builder->Slice(data_flat, data_start_indices, + data_limit_indices, stride); + + // Reshape the sliced data into the R3+ shape to match output array. + std::vector rdata_shape(extra_dims + flat_shape.size()); + for (int64 k = 0; k <= extra_dims; ++k) { + rdata_shape[k] = 1; + } + for (unsigned k = 1; k < data_limit_indices.size(); ++k) { + rdata_shape[extra_dims + k] = data_limit_indices[k]; + } + auto rdata_slice = builder->Reshape(data_slice, rdata_shape); + + auto index = builder->Slice(indices_1d, {i}, {i + 1}, {1}); + + // Construct the index into the R3+ output array 0, ..., , 0, ... 
+ std::vector out_start_index_parts( + extra_dims + flat_shape.size(), zero); + out_start_index_parts[extra_dims] = builder->Reshape(index, {1}); + auto out_start_indices = builder->ConcatInDim(out_start_index_parts, 0); + + std::vector slice_size(rshape); + slice_size[extra_dims] = 1; + + auto out_slice = + builder->DynamicSlice(output, out_start_indices, slice_size); + auto sumval = builder->Add(out_slice, rdata_slice); + output = builder->DynamicUpdateSlice(output, sumval, out_start_indices); + } + auto reshaped_output = builder->Reshape(output, out_shape); + ctx->SetOutput(0, reshaped_output); + } + + private: + DataType dtype_; +}; + +REGISTER_XLA_OP(Name("UnsortedSegmentSum"), UnsortedSegmentSum); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc index bdd52b7f8e5..34cc8b23159 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc @@ -182,7 +182,10 @@ class TensorArrayOp : public XlaOpKernel { dtype_, value, &var)); var->tensor_array_size = size; ctx->SetResourceOutput(0, var); - ctx->SetConstantOutput(1, Tensor(DT_FLOAT)); + + Tensor flow(DT_FLOAT, TensorShape({})); + flow.scalar()() = 0.0f; + ctx->SetConstantOutput(1, flow); } private: @@ -216,6 +219,7 @@ class TensorArrayWriteOp : public XlaOpKernel { xla::ComputationDataHandle ta = resource->value; xla::ComputationDataHandle index = ctx->Input(1); xla::ComputationDataHandle value = ctx->Input(2); + xla::ComputationDataHandle flow = ctx->Input(3); // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0]. auto start_indices = XlaHelpers::PadWithZeros(b, index, elem_shape.dims()); @@ -228,7 +232,7 @@ class TensorArrayWriteOp : public XlaOpKernel { DynamicAddSlice(b, ta, update, slice_shape.dim_sizes(), start_indices); resource->value = written; - ctx->SetConstantOutput(0, Tensor(DT_FLOAT)); + ctx->SetOutput(0, flow); } private: @@ -369,6 +373,7 @@ class TensorArrayScatterOp : public XlaOpKernel { xla::ComputationDataHandle ta = resource->value; const xla::ComputationDataHandle value = ctx->Input(2); + const xla::ComputationDataHandle flow = ctx->Input(3); auto slice_dims = value_shape.dim_sizes(); slice_dims[0] = 1LL; @@ -394,7 +399,7 @@ class TensorArrayScatterOp : public XlaOpKernel { } resource->value = ta; - ctx->SetConstantOutput(0, Tensor(DT_FLOAT)); + ctx->SetOutput(0, flow); } private: @@ -489,6 +494,7 @@ class TensorArraySplitOp : public XlaOpKernel { lengths.size(), " vs. 
", resource->tensor_array_size, ")")); const xla::ComputationDataHandle value = ctx->Input(1); + const xla::ComputationDataHandle flow = ctx->Input(3); OP_REQUIRES(ctx, value_shape.num_elements() == ta_shape.num_elements(), errors::InvalidArgument("mismatched element count ", @@ -497,7 +503,7 @@ class TensorArraySplitOp : public XlaOpKernel { resource->value = b->Add(ta, b->Reshape(value, ta_shape.dim_sizes())); - ctx->SetConstantOutput(0, Tensor(DT_FLOAT)); + ctx->SetOutput(0, flow); } private: diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index 5eef45b11d4..e0a03a78f1d 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -132,7 +132,10 @@ cc_library( cc_library( name = "statusor", srcs = ["statusor.cc"], - hdrs = ["statusor.h"], + hdrs = [ + "statusor.h", + "statusor_internals.h", + ], visibility = ["//visibility:public"], deps = [ ":status", diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 4c2705b0072..bd040c166d9 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -1943,6 +1943,7 @@ cc_library( ":buffer_liveness", ":hlo", ":hlo_pass", + "//tensorflow/compiler/xla:shape_util", "//tensorflow/core:lib", ], ) diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index b351861425d..4837402c15b 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -1488,9 +1488,9 @@ Status AlgebraicSimplifierVisitor::HandleConvolution( // We cannot insert bitcasts if the layouts will not be compatible. // TODO(b/33178038): Consider inserting a transpose if a bitcast would be // invalid. - if (!valid_bitcast_callback_(lhs->shape(), input_shape) || - !valid_bitcast_callback_(rhs->shape(), new_filter_shape) || - !valid_bitcast_callback_(dot_output_shape, convolution_shape)) { + if (!valid_bitcast_callback_(input_shape, lhs->shape()) || + !valid_bitcast_callback_(new_filter_shape, rhs->shape()) || + !valid_bitcast_callback_(convolution_shape, dot_output_shape)) { return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h index f8919f0caad..4295a3227a8 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.h +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h @@ -26,12 +26,13 @@ namespace xla { // A pass which performs AlgebraicSimplications. class AlgebraicSimplifier : public HloPassInterface { public: - // Given two shapes, determines if it is valid to bitcast between them after - // considering platform dependent effects on layout like alignment - // restrictions. - // Precondition: the two shapes have layouts, the same number of - // elements and ShapeUtil::ReshapeIsBitcast returns true. - using ValidBitcastCallback = std::function; + // Given shapes 'from_shape' and 'to_shape', determines if it is valid to + // bitcast from 'from_shape' to 'to_shape' after considering platform + // dependent effects on layout like alignment restrictions. Precondition: the + // two shapes have layouts, the same number of elements and + // ShapeUtil::ReshapeIsBitcast returns true. + using ValidBitcastCallback = + std::function; // If is_layout_sensitive is true, then the simplifier preserves layout during // transformation. Otherwise, layout is ignored. 
If valid_bitcast_callback diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 7248cb5f4c0..2ca4af67cd5 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -72,6 +72,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_subcomputation_unification", "//tensorflow/compiler/xla/service:hlo_verifier", "//tensorflow/compiler/xla/service:inliner", + "//tensorflow/compiler/xla/service:reduce_precision_insertion", "//tensorflow/compiler/xla/service:reshape_mover", "//tensorflow/compiler/xla/service:transpose_folding", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", # fixdeps: keep diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index 6d819355c4c..b86342d0b3e 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -74,6 +74,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/service/inliner.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h" #include "tensorflow/compiler/xla/service/reshape_mover.h" #include "tensorflow/compiler/xla/service/transpose_folding.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -253,6 +254,14 @@ Status CpuCompiler::RunHloPasses(HloModule* module) { HloPassPipeline pipeline("CPU"); pipeline.AddInvariantChecker(); + for (const auto& reduce_precision_options : + module->config().debug_options().hlo_reduce_precision_options()) { + if (reduce_precision_options.pass_timing() == + HloReducePrecisionOptions::BEFORE_OP_FUSION) { + pipeline.AddPass(reduce_precision_options); + } + } + // TODO(b/35786417): Re-enable inliner pass after fixing the bug and deciding // where we will take this pass in future. 
// pipeline.AddPass(); @@ -278,6 +287,15 @@ Status CpuCompiler::RunHloPasses(HloModule* module) { TransposeFolding::NeverFoldTranspose); pipeline.AddPass(/*is_layout_sensitive=*/false); pipeline.AddPass(); + + for (const auto& reduce_precision_options : + module->config().debug_options().hlo_reduce_precision_options()) { + if (reduce_precision_options.pass_timing() == + HloReducePrecisionOptions::AFTER_OP_FUSION) { + pipeline.AddPass(reduce_precision_options); + } + } + pipeline.AddPass( module->mutable_entry_computation_layout()); // The LayoutAssignment pass may leave behind kCopy instructions which are diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index fa95e234992..cdd7c8187c9 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -432,6 +432,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_proto_util", "//tensorflow/compiler/xla/service:hlo_subcomputation_unification", "//tensorflow/compiler/xla/service:hlo_verifier", + "//tensorflow/compiler/xla/service:reduce_precision_insertion", "//tensorflow/compiler/xla/service:reshape_mover", "//tensorflow/compiler/xla/service:transpose_folding", "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend", diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index d60c45a5c3a..2acf95084a9 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -56,6 +56,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h" #include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h" #include "tensorflow/compiler/xla/service/reshape_mover.h" #include "tensorflow/compiler/xla/service/transpose_folding.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -123,6 +124,15 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module, { HloPassPipeline pipeline("optimization"); pipeline.AddInvariantChecker(); + + for (const auto& reduce_precision_options : + hlo_module->config().debug_options().hlo_reduce_precision_options()) { + if (reduce_precision_options.pass_timing() == + HloReducePrecisionOptions::BEFORE_OP_FUSION) { + pipeline.AddPass(reduce_precision_options); + } + } + { auto& pass = pipeline.AddPass>("simplification"); @@ -149,8 +159,27 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module, fusion.AddPass(/*may_duplicate=*/false); fusion.AddPass(/*may_duplicate=*/true); fusion.AddPass(); - return fusion.Run(hlo_module).status(); + TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status()); + + HloPassPipeline reduce_pipeline("reduce-precision"); + for (const auto& reduce_precision_options : + hlo_module->config().debug_options().hlo_reduce_precision_options()) { + if (reduce_precision_options.pass_timing() == + HloReducePrecisionOptions::AFTER_OP_FUSION) { + reduce_pipeline.AddPass( + reduce_precision_options); + } + } + StatusOr reduce_result = reduce_pipeline.Run(hlo_module); + TF_RETURN_IF_ERROR(reduce_result.status()); + + if (reduce_result.ValueOrDie()) { + // Do another fusion pass, with the expectation that we may be able to + // fuse the new ReducePrecision operations. 
+ TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status()); + } } + return tensorflow::Status::OK(); } // Modifies the given HLO module so that it will be accepted by IrEmitter. diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index acd26c4e31c..c6202548f1c 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -48,36 +48,36 @@ using ::tensorflow::Env; using ::tensorflow::gtl::nullopt; using ::tensorflow::gtl::optional; using ::tensorflow::io::JoinPath; -using ::tensorflow::strings::Appendf; -using ::tensorflow::strings::Printf; using ::tensorflow::strings::StrAppend; using ::tensorflow::strings::StrCat; using ::tensorflow::str_util::Join; +using ::tensorflow::str_util::StringReplace; using ::tensorflow::WriteStringToFile; namespace xla { namespace hlo_graph_dumper { namespace { -// Node color schemes, used by NodeColorAttributes. -enum ColorScheme { - kBlue, - kBrown, - kDarkBlue, - kDarkGreen, - kDarkRed, - kGray, - kGreen, - kOrange, - kPurple, - kRed, - kWhite, - kYellow, - - // Causes the node's border to be a dashed line, and its content to be gray - // text on a white background, suggesting that this is an "unimportant" node. - kDashedBorder, +// Helpers for Printf and Appendf. +template +struct PrintfConvert { + const T& operator()(const T& t) const { return t; } }; +template <> +struct PrintfConvert { + const char* operator()(const string& s) const { return s.c_str(); } +}; + +// Like tensorflow::strings::Printf/Appendf, but you don't need to call c_str() +// on strings. +template +string Printf(const char* fmt, const Ts&... ts) { + return tensorflow::strings::Printf(fmt, PrintfConvert()(ts)...); +} +template +void Appendf(string* s, const char* fmt, const Ts&... ts) { + tensorflow::strings::Appendf(s, fmt, PrintfConvert()(ts)...); +} // Used to indicate how we should treat a given HLOInstruction in the graph. // should we treat it like normal, hide it, and so on? @@ -92,6 +92,9 @@ enum NodeFilterResult { // Style the node the same as kSomeOperandsOmitted, but also don't connect it // to its operands, even if they're present in the graph. kOmitNodeOperands, + // Same style as kSomeOperandsOmitted, but used to indicate that some of the + // node's *users* have been omitted. + kSomeUsersOmitted, }; // NodeFilter is essentially a map from HloInstruction*s to NodeFilterResult. @@ -118,11 +121,41 @@ class NodeFilter { auto result = filter_(instr); return result == kOmitNodeOperands || result == kSomeOperandsOmitted; } + bool Deemphasized(const HloInstruction* instr) const { + auto result = filter_(instr); + return result == kOmitNodeOperands || result == kSomeOperandsOmitted || + result == kSomeUsersOmitted; + } + + bool ShowFusionSubcomputation(const HloInstruction* instr) const { + CHECK_EQ(instr->opcode(), HloOpcode::kFusion); + return Show(instr) && !SomeOrAllOperandsOmitted(instr); + } private: std::function filter_; }; +// Node color schemes, used by NodeColorAttributes. +enum ColorScheme { + kBlue, + kBrown, + kDarkBlue, + kDarkGreen, + kDarkRed, + kGray, + kGreen, + kOrange, + kPurple, + kRed, + kWhite, + kYellow, + + // Causes the node's border to be a dashed line, and its content to be gray + // text on a white background, suggesting that this is an "unimportant" node. + kDashedBorder, +}; + // Given a ColorScheme, returns an attribute string for a node of that color. // Sets the node's style and fill/stroke/text colors. 
// @@ -170,19 +203,8 @@ string NodeColorAttributes(ColorScheme color) { // Replaces <> with <>, so that this string is safe(er) for use in a // graphviz HTML-like string. string HtmlLikeStringSanitize(tensorflow::StringPiece s) { - return tensorflow::str_util::StringReplace( - tensorflow::str_util::StringReplace(s, "<", "<", /*replace_all=*/true), - ">", ">", /*replace_all=*/true); -} - -// Returns the dot graph identifier for the given instruction. -string InstructionId(const HloInstruction* instruction) { - return Printf("%lld", reinterpret_cast(instruction)); -} - -// Returns the dot graph identifier for the given computation. -string ComputationId(const HloComputation* computation) { - return Printf("%lld", reinterpret_cast(computation)); + return StringReplace(StringReplace(s, "<", "<", /*replace_all=*/true), ">", + ">", /*replace_all=*/true); } // Tries to generates a human-readable one-word description of the given @@ -194,9 +216,15 @@ string ComputationId(const HloComputation* computation) { // "return param0 * param1;" --> "multiply" // "return min(param0, param1);" --> "min" // "return max(param0, param1);" --> "max" +// "return param0 <= param1;" --> "less-or-equal" +// "return param0 >= param1;" --> "greater-or-equal" +// "return param0 > param1;" --> "greater-than" +// "return param0 < param1;" --> "less-than" +// "return param0 == param1;" --> "equal-to" +// "return param0 != param1;" --> "not-equal-to" // -// where param0 and param1 are effective scalars. Since all of the ops above -// are commutative, we also support them with param0 and param1 swapped. +// where param0 and param1 are effective scalars. For the ops that are +// commutative, we also support them with param0 and param1 swapped. // // This is useful primarily for reduce and map nodes. These take a // subcomputation which is almost always one of the four above, and pattern @@ -219,6 +247,7 @@ optional MatchTrivialComputation(const HloComputation* computation) { operand1->opcode() != HloOpcode::kParameter) { return nullopt; } + // Check that the two operands of root are param0 and param1. All of the // opcodes we recognize are commutative, so we're OK with either order. auto n0 = operand0->parameter_number(); @@ -227,6 +256,20 @@ optional MatchTrivialComputation(const HloComputation* computation) { return nullopt; } + // If the params are reversed, check that the operation being performed is + // commutative. + if (n0 == 1) { + switch (root->opcode()) { + case HloOpcode::kLe: + case HloOpcode::kGe: + case HloOpcode::kGt: + case HloOpcode::kLt: + return nullopt; + default: + break; + } + } + // Check that the root and params are all effective scalars. if (!ShapeUtil::IsEffectiveScalar(root->shape()) || !ShapeUtil::IsEffectiveScalar(operand0->shape()) || @@ -244,444 +287,542 @@ optional MatchTrivialComputation(const HloComputation* computation) { return "min"; case HloOpcode::kMaximum: return "max"; + case HloOpcode::kLe: + return "less-or-equal"; + case HloOpcode::kGe: + return "greater-or-equal"; + case HloOpcode::kGt: + return "greater-than"; + case HloOpcode::kLt: + return "less-than"; + case HloOpcode::kEq: + return "equal-to"; + case HloOpcode::kNe: + return "not-equal-to"; default: return nullopt; } } -// Returns the dot graph edges and nodes for the given instruction sequence. -// Edges which extend between computations are added to the vector -// intercomputation_edges. 
This is necessary because graphviz does not render -// the graph properly unless these inter-computation edges appear after all -// subgraph statements. -string InstructionSequenceGraph( - const std::list>& instructions, - bool show_addresses, bool show_layouts, - std::vector* intercomputation_edges, - const HloExecutionProfile* hlo_execution_profile, - const NodeFilter& filter) { - string graph_body; +class HloDotDumper { + public: + HloDotDumper(const HloComputation* computation, tensorflow::StringPiece label, + bool show_addresses, bool show_layouts, + const HloExecutionProfile* profile, NodeFilter filter) + : computation_(computation), + label_(label.ToString()), + show_addresses_(show_addresses), + show_layouts_(show_layouts), + profile_(profile), + filter_(std::move(filter)) {} - for (auto& instruction : instructions) { - if (!filter.Show(instruction.get())) { - continue; - } + string Dump(); - // We don't display constants as separate nodes; they're merged into their - // users. - if (instruction->opcode() == HloOpcode::kConstant) { - continue; - } - - ColorScheme color = kYellow; - string shape = "box"; - - // Build the first line or two of the node, containing its name and opcode - // (if the opcode isn't redundant with the name). - string name; - if (instruction->opcode() == HloOpcode::kParameter) { - // If we have a parameter, put the param number in the name. - name = StrCat("Parameter ", instruction->parameter_number(), - "
", HtmlLikeStringSanitize(instruction->name())); - } else if (tensorflow::StringPiece(instruction->name()) - .starts_with( - StrCat("%", instruction->ExtendedOpcodeStr()))) { - // The HLO instruction name contains usually the opcode, e.g. "%add.42" is - // an add instruction. In this case we render just the name. - name = StrCat("", HtmlLikeStringSanitize(instruction->name()), ""); - } else if (instruction->opcode() == HloOpcode::kFusion && - tensorflow::StringPiece(instruction->name()) - .starts_with( - StrCat("%", HloOpcodeString(instruction->opcode())))) { - // Fusion nodes are usually named e.g. "%fusion.5". We render these as - // e.g. "%fusion.5
input fusion". - name = StrCat("", HtmlLikeStringSanitize(instruction->name()), - "
", - HtmlLikeStringSanitize(instruction->ToCategory())); - } else { - // If the name does not contain the opcode, render both. - name = StrCat("", - HtmlLikeStringSanitize(instruction->ExtendedOpcodeStr()), - "
", HtmlLikeStringSanitize(instruction->name())); - } - - if (HloOpcode::kConvolution == instruction->opcode()) { - StrAppend( - &name, "
", - HtmlLikeStringSanitize( - instruction->ConvolutionDimensionNumbersToString()), - "
", - HtmlLikeStringSanitize(window_util::ToString(instruction->window()))); - } - - if (!instruction->metadata().op_name().empty()) { - StrAppend(&name, "
", - HtmlLikeStringSanitize(instruction->metadata().op_name())); - } - if (!instruction->metadata().source_file().empty() && - instruction->metadata().source_line() != 0) { - StrAppend(&name, "
", instruction->metadata().source_file(), ":", - instruction->metadata().source_line()); - } - - // Pick different colors or shapes for instructions which are particularly - // expensive (eg, dot) and those which are unusual in some way or unique - // (eg, parameter). - switch (instruction->opcode()) { - // "Normal" instructions. Mostly cheap and elementwise. No call to - // embedded computations. In this case, use default color, shape and - // label. - case HloOpcode::kAbs: - case HloOpcode::kAdd: - case HloOpcode::kCeil: - case HloOpcode::kClamp: - case HloOpcode::kConvert: - case HloOpcode::kCos: - case HloOpcode::kDivide: - case HloOpcode::kEq: - case HloOpcode::kExp: - case HloOpcode::kFloor: - case HloOpcode::kGe: - case HloOpcode::kGt: - case HloOpcode::kIndex: - case HloOpcode::kIsFinite: - case HloOpcode::kLe: - case HloOpcode::kLog: - case HloOpcode::kLogicalAnd: - case HloOpcode::kLogicalNot: - case HloOpcode::kLogicalOr: - case HloOpcode::kLt: - case HloOpcode::kMaximum: - case HloOpcode::kMinimum: - case HloOpcode::kMultiply: - case HloOpcode::kNe: - case HloOpcode::kNegate: - case HloOpcode::kPower: - case HloOpcode::kRemainder: - case HloOpcode::kSelect: - case HloOpcode::kSign: - case HloOpcode::kSin: - case HloOpcode::kSlice: - case HloOpcode::kSort: - case HloOpcode::kSubtract: - case HloOpcode::kTanh: - break; - case HloOpcode::kRng: - StrAppend(&name, "
", - RandomDistribution_Name(instruction->random_distribution())); - break; - case HloOpcode::kBroadcast: - case HloOpcode::kTranspose: - StrAppend(&name, "
", "dims={", - Join(instruction->dimensions(), ","), "}"); - break; - case HloOpcode::kBitcast: - case HloOpcode::kTuple: - case HloOpcode::kTrace: - color = kWhite; - break; - case HloOpcode::kGetTupleElement: - color = kWhite; - StrAppend(&name, "
index=", instruction->tuple_index()); - break; - case HloOpcode::kConcatenate: - case HloOpcode::kCopy: - case HloOpcode::kDynamicSlice: - case HloOpcode::kDynamicUpdateSlice: - case HloOpcode::kPad: - case HloOpcode::kReshape: - case HloOpcode::kReverse: - case HloOpcode::kUpdate: - color = kGreen; - break; - case HloOpcode::kConvolution: - case HloOpcode::kDot: - color = kDarkBlue; - break; - case HloOpcode::kParameter: - color = kOrange; - break; - case HloOpcode::kBatchNormTraining: - StrAppend(&name, " feature_index=", instruction->feature_index()); - color = kPurple; - break; - case HloOpcode::kBatchNormGrad: - StrAppend(&name, " feature_index=", instruction->feature_index()); - color = kPurple; - break; - case HloOpcode::kReduce: - StrAppend(&name, " dims=", Join(instruction->dimensions(), ",")); - color = kPurple; - break; - case HloOpcode::kSelectAndScatter: - case HloOpcode::kReduceWindow: - color = kPurple; - break; - case HloOpcode::kWhile: - shape = "ellipse"; - color = kDarkGreen; - break; - case HloOpcode::kMap: - case HloOpcode::kFusion: - color = kGray; - break; - case HloOpcode::kSend: - case HloOpcode::kRecv: - case HloOpcode::kInfeed: - case HloOpcode::kOutfeed: - case HloOpcode::kCrossReplicaSum: - color = kBrown; - break; - case HloOpcode::kCall: - color = kDarkGreen; - break; - case HloOpcode::kCustomCall: - color = kDarkGreen; - StrAppend(&name, "
", - "custom_call_target=", instruction->custom_call_target()); - break; - case HloOpcode::kReducePrecision: - // Make ReducePrecision ops a bit more visible, since typically they - // will be inserted as modifications to an existing graph. - color = kRed; - break; - case HloOpcode::kConstant: - LOG(FATAL) << "Constants don't get their own nodes in the graph."; - } - - // Create instruction node with appropriate label, shape, and color. - // label is interpreted as an HTML-like string, so newlines must be - // delimited with
, rather than \n. - string label = - StrCat(name, "
", ShapeUtil::HumanString(instruction->shape())); - - if (show_addresses) { - Appendf(&label, "
[%p]", instruction.get()); - } - if (show_layouts && LayoutUtil::HasLayout(instruction->shape())) { - string layout_string; - if (ShapeUtil::IsTuple(instruction->shape())) { - // For tuples, emit the full shape because the layout of a tuple is not - // represented in a single Layout field. - layout_string = ShapeUtil::HumanStringWithLayout(instruction->shape()); - } else { - layout_string = - Join(instruction->shape().layout().minor_to_major(), ","); - } - StrAppend(&label, "
layout={", layout_string, "}"); - } - if (hlo_execution_profile != nullptr) { - auto hlo_cycles_executed = - hlo_execution_profile->GetProfileResult(*instruction); - auto total_cycles_executed = - hlo_execution_profile->total_cycles_executed(*instruction->parent()); - if (hlo_cycles_executed > 0 && total_cycles_executed > 0) { - Appendf(&label, "
%% of cycles executed=%.2f", - (static_cast(hlo_cycles_executed) / - static_cast(total_cycles_executed)) * - 100); - } - } - - // If this node's operands are omitted, style it accordingly. - if (filter.SomeOrAllOperandsOmitted(instruction.get())) { - color = kDashedBorder; - } - - // If this node is highlighted, override its formatting. - if (filter.Highlight(instruction.get())) { - shape = "diamond"; - color = kDarkRed; - } - - // Create edges from the instruction's operands to the instruction. - if (!filter.OmitOperands(instruction.get())) { - int64 operand_number = 0; - for (auto* operand : instruction->operands()) { - if (!filter.Show(operand) || - operand->opcode() == HloOpcode::kConstant) { - ++operand_number; - continue; - } - Appendf(&graph_body, "%s -> %s", InstructionId(operand).c_str(), - InstructionId(instruction.get()).c_str()); - if (instruction->operand_count() > 1) { - Appendf(&graph_body, " [headlabel=\"%lld\",labeldistance=2]", - operand_number); - } - StrAppend(&graph_body, ";\n"); - ++operand_number; - } - - // Fusion nodes are handled specially because they contain nested - // expressions. - if (instruction->opcode() == HloOpcode::kFusion) { - string cluster_name = - StrCat("cluster_", InstructionId(instruction.get())); - StrAppend(&graph_body, "subgraph ", cluster_name, " {\n"); - StrAppend(&graph_body, "label=", - HtmlLikeStringSanitize(instruction->name()), - ">;\nstyle=\"rounded,filled\";\n" - "color=lightgrey;\n"); - StrAppend(&graph_body, - InstructionSequenceGraph(instruction->fused_instructions(), - show_addresses, show_layouts, - intercomputation_edges, - hlo_execution_profile, NodeFilter()), - "}\n"); - string fusion_edge = StrCat( - InstructionId(instruction->fused_expression_root()), " -> ", - InstructionId(instruction.get()), - " [ style = \"dotted\", arrowsize=0.0, ltail=", cluster_name, - " ];\n"); - intercomputation_edges->push_back(fusion_edge); - } else { - // If instruction has just one computation and it's trivial (e.g. - // "return param0 + param1"), put the trivial computation type (e.g. - // "add") into instruction's label. Otherwise, add a dotted edge - // between the instruction and its subcomputations. - const auto& subcomputations = instruction->called_computations(); - - bool trivial_subcomputation = false; - if (subcomputations.size() == 1) { - optional computation_type = - MatchTrivialComputation(subcomputations.front()); - if (computation_type) { - trivial_subcomputation = true; - StrAppend(&label, "
Subcomputation: ", *computation_type, - ""); - } - } - - if (!trivial_subcomputation) { - for (const HloComputation* computation : - instruction->called_computations()) { - string cluster_name = - StrCat("cluster_", ComputationId(computation)); - string call_edge = Printf( - "%s -> %s [ style=dashed; ltail=%s ];\n", - InstructionId(computation->root_instruction()).c_str(), - InstructionId(instruction.get()).c_str(), cluster_name.c_str()); - intercomputation_edges->push_back(call_edge); - } - } - } - } - - // Inline constant operands into the node. - for (int64 i = 0; i < instruction->operand_count(); ++i) { - const HloInstruction* operand = instruction->operand(i); - if (operand->opcode() != HloOpcode::kConstant) { - continue; - } - - StrAppend(&label, "
operand ", i, " = "); - if (ShapeUtil::IsEffectiveScalar(operand->shape())) { - auto elem_idx = IndexUtil::LinearIndexToMultidimensionalIndex( - operand->shape(), /*linear_index=*/0); - StrAppend(&label, ShapeUtil::HumanString(operand->shape()), "{", - operand->literal().GetAsString(elem_idx), "}"); - } else { - if (tensorflow::StringPiece(operand->name()).starts_with("%constant")) { - StrAppend(&label, operand->name()); - } else { - StrAppend(&label, "constant ", operand->name()); - } - } - } - - Appendf(&graph_body, "%s [label=<%s>, shape=%s, %s];\n", - InstructionId(instruction.get()).c_str(), label.c_str(), - shape.c_str(), NodeColorAttributes(color).c_str()); + private: + // Returns the dot graph identifier for the given instruction. + string InstructionId(const HloInstruction* instruction) { + return StrCat(reinterpret_cast(instruction)); } - return graph_body; + + // Returns the dot graph identifier for the given computation. + string SubcomputationId(const HloComputation* computation) { + return StrCat("cluster_", reinterpret_cast(computation)); + } + + string Header(); + string Footer(); + + // Maps HloComputations we should dump to their parent instruction in the + // outer computation. + std::unordered_map + SubcomputationsToDump(); + + string DumpSubcomputation(const HloComputation* subcomp, + const HloInstruction* parent_instr); + string DumpComputation(const HloComputation* comp); + string DumpInstruction(const HloInstruction* instr); + ColorScheme GetInstructionColor(const HloInstruction* instr); + string GetInstructionNodeShape(const HloInstruction* instr); + string GetInstructionNodeLabel(const HloInstruction* instr); + string GetInstructionNodeExtraInfo(const HloInstruction* instr); + string GetInstructionNodeInlinedConstants(const HloInstruction* instr); + void AddInstructionIncomingEdges(const HloInstruction* instr); + + // If instr has just one computation and it's trivial (e.g. "return param0 + + // param1"), returns a string you can put into the node's body that names the + // subcomputation, e.g. "Subcomputation: add". + string GetInstructionTrivialComputationStr(const HloInstruction* instr); + + const HloComputation* computation_; // never null + const string label_; // overall name for the graph + const bool show_addresses_; + const bool show_layouts_; + const HloExecutionProfile* profile_; // may be null + const NodeFilter filter_; + + // Edges to print from Footer(). Edges come at the end because graphviz is + // unhappy if an edge from a subcomputation to a node in the outer computation + // appears before both the inner computation and the destination node are + // defined. + std::vector edges_; +}; + +string HloDotDumper::Dump() { + string g = Header(); + for (const auto& kv : SubcomputationsToDump()) { + const HloComputation* subcomp = kv.first; + const HloInstruction* parent = kv.second; + StrAppend(&g, DumpSubcomputation(subcomp, parent)); + } + StrAppend(&g, DumpComputation(computation_)); + StrAppend(&g, Footer()); + return g; } -// DOT graphs accept a stylesheet as a URL. So naturally, an inline stylesheet -// is a data URI! -// -// We don't perform any escaping on this string, so be careful not to use double -// quotes inside. -static const char* dot_stylesheet = R"( -data:text/css, -@import url(https://fonts.googleapis.com/css?family=Roboto:400,700); -svg text { - font-family: 'Roboto'; - font-size: 12px; -} +string HloDotDumper::Header() { + // DOT graphs accept a stylesheet as a URI. So naturally, an inline + // stylesheet is a data URI! 
+ const char* fmt = R"(digraph G { +rankdir = TB; +compound = true; +label = <%s>; +labelloc = t; +stylesheet=" + data:text/css, + @import url(https://fonts.googleapis.com/css?family=Roboto:400,700); + svg text { + font-family: 'Roboto'; + font-size: 12px; + } +" + )"; -string ComputationToDotGraph(const HloComputation& computation, - const string& label, bool show_addresses, - bool show_layouts, - const HloExecutionProfile* hlo_execution_profile, - const NodeFilter& filter) { - string graph_label = StrCat(label, "
", computation.name()); - if (hlo_execution_profile != nullptr) { - auto cycles = hlo_execution_profile->total_cycles_executed(computation); + string graph_label = StrCat(label_, "
", computation_->name()); + if (profile_ != nullptr) { + auto cycles = profile_->total_cycles_executed(*computation_); Appendf(&graph_label, "
total cycles = %lld (%s)", cycles, - tensorflow::strings::HumanReadableNum(cycles).c_str()); + tensorflow::strings::HumanReadableNum(cycles)); } - string graph = Printf( - R"(digraph G { -rankdir=TB; -compound=true; -label=<%s>; -labelloc=t; -stylesheet="%s" -)", - graph_label.c_str(), dot_stylesheet); + return Printf(fmt, graph_label); +} +string HloDotDumper::Footer() { return StrCat(Join(edges_, "\n"), "\n}"); } + +std::unordered_map +HloDotDumper::SubcomputationsToDump() { // Dump the subcomputations of each instruction that's shown and doesn't have // its operands omitted. If an instruction has just one subcomputation and // it's trivial, omit it: We'll display that subcomputation inlined into the // instruction's node when we draw it. - std::unordered_set computations_to_dump; - for (const auto& instr : computation.instructions()) { - if (!filter.Show(instr.get()) || filter.OmitOperands(instr.get())) { + std::unordered_map to_dump; + for (const auto& instr : computation_->instructions()) { + if (!filter_.Show(instr.get()) || + filter_.SomeOrAllOperandsOmitted(instr.get())) { continue; } if (instr->opcode() == HloOpcode::kFusion) { - computations_to_dump.insert(instr->fused_instructions_computation()); + to_dump[instr->fused_instructions_computation()] = instr.get(); } - const auto& subcomputations = instr->called_computations(); - if (subcomputations.size() != 1 || - !MatchTrivialComputation(subcomputations.front())) { - for (const HloComputation* computation : instr->called_computations()) { - computations_to_dump.insert(computation); + for (const HloComputation* comp : instr->called_computations()) { + if (!MatchTrivialComputation(comp)) { + to_dump[comp] = instr.get(); } } } + return to_dump; +} - // Emit embedded computations as subgraph clusters. - std::vector intercomputation_edges; - for (const HloComputation* embedded : - computation.MakeEmbeddedComputationsList()) { - if (!computations_to_dump.count(embedded)) { +string HloDotDumper::DumpSubcomputation(const HloComputation* subcomp, + const HloInstruction* parent_instr) { + const char* computation_fmt = R"(subgraph %s { +%s; +label = <%s>; +labelloc = t; +%s +} // %s + +)"; + + string id = SubcomputationId(subcomp); + + string subcomp_label, style; + if (parent_instr->opcode() == HloOpcode::kFusion) { + subcomp_label = Printf("Fused expression for %s
%s", + HtmlLikeStringSanitize(parent_instr->name()), + HtmlLikeStringSanitize(parent_instr->ToCategory())); + + // Subcomputation's fill/stroke color is light/dark red/gray, depending on + // whether or not the subcomputation's fusion node is highlighted. + bool highlight = filter_.Highlight(parent_instr); + const char* fillcolor = highlight ? "#ffcdd2" : "#f5f5f5"; + const char* strokecolor = highlight ? "#b71c1c" : "#c2c2c2"; + style = Printf(R"(style="rounded,filled,bold"; fillcolor="%s"; color="%s")", + fillcolor, strokecolor); + } else { + subcomp_label = Printf("Subcomputation for %s
%s", + HtmlLikeStringSanitize(parent_instr->name()), + HtmlLikeStringSanitize(subcomp->name())); + style = "style=rounded; color=black;"; + } + + string comp_body = DumpComputation(subcomp); + string computation = + Printf(computation_fmt, id, style, subcomp_label, comp_body, id); + + // Add an edge from the subcomputation to its parent node. If subcomp + // belongs to a fusion node, it's drawn in place of the fusion instruction, so + // there's no need to link those. + if (parent_instr->opcode() != HloOpcode::kFusion) { + const char* edge_fmt = R"(%s -> %s [ltail="%s", style="dashed"];)"; + edges_.push_back( + Printf(edge_fmt, InstructionId(subcomp->root_instruction()), + InstructionId(parent_instr), SubcomputationId(subcomp))); + } + + return computation; +} + +string HloDotDumper::DumpComputation(const HloComputation* comp) { + string g; + for (const auto& instr : comp->instructions()) { + if (!filter_.Show(instr.get())) { continue; } - // Don't pass our filter down into the subcomputation -- always render the - // whole thing. - string graph_body = InstructionSequenceGraph( - embedded->instructions(), show_addresses, show_layouts, - &intercomputation_edges, hlo_execution_profile, NodeFilter()); - Appendf(&graph, - "subgraph cluster_%s " - "{\nstyle=rounded;label=<%s>;labelloc=t;\n%s}\n", - ComputationId(embedded).c_str(), embedded->name().c_str(), - graph_body.c_str()); + StrAppend(&g, DumpInstruction(instr.get())); } - StrAppend(&graph, - InstructionSequenceGraph(computation.instructions(), show_addresses, - show_layouts, &intercomputation_edges, - hlo_execution_profile, filter)); + return g; +} - // Edges between computations (subgraph clusters) must be emitted last for the - // graph to be rendered properly for some reason. - StrAppend(&graph, Join(intercomputation_edges, "\n"), "}\n"); +string HloDotDumper::DumpInstruction(const HloInstruction* instr) { + // We don't display constants as separate nodes; they're merged into their + // users. + if (instr->opcode() == HloOpcode::kConstant) { + return ""; + } + // Omit the fusion node if its subcomputation is drawn, since the + // subcomputation will be drawn inline. + if (instr->opcode() == HloOpcode::kFusion && + filter_.ShowFusionSubcomputation(instr)) { + return ""; + } - return graph; + ColorScheme color = GetInstructionColor(instr); + string node_shape = GetInstructionNodeShape(instr); + string node_label = GetInstructionNodeLabel(instr); + string extra_info = GetInstructionNodeExtraInfo(instr); + string inlined_constants = GetInstructionNodeInlinedConstants(instr); + string trivial_subcomputation = GetInstructionTrivialComputationStr(instr); + AddInstructionIncomingEdges(instr); + + // Override the node's styling if it should be (de-)emphasized. + if (filter_.Deemphasized(instr)) { + color = kDashedBorder; + } + if (filter_.Highlight(instr)) { + node_shape = "diamond"; + color = kDarkRed; + } + + // Build the text that will be displayed inside the node. + string node_body = node_label; + for (const string& s : + {trivial_subcomputation, extra_info, inlined_constants}) { + if (!s.empty()) { + StrAppend(&node_body, "
", s); + } + } + + return Printf("%s [label=<%s>, shape=%s, %s];\n", InstructionId(instr), + node_body, node_shape, NodeColorAttributes(color)); +} + +string HloDotDumper::GetInstructionNodeInlinedConstants( + const HloInstruction* instr) { + auto stringify_constant = [](const HloInstruction* constant) { + if (ShapeUtil::IsEffectiveScalar(constant->shape())) { + auto elem_idx = IndexUtil::LinearIndexToMultidimensionalIndex( + constant->shape(), /*linear_index=*/0); + return Printf("%s{%s}", ShapeUtil::HumanString(constant->shape()), + constant->literal().GetAsString(elem_idx)); + } + if (tensorflow::StringPiece(constant->name()).starts_with("%constant")) { + return constant->name(); + } + return StrCat("constant ", constant->name()); + }; + + // Special case: If instr is a parameter to a fusion node, check whether the + // corresponding operand to the fusion node is a constant. + if (instr->opcode() == HloOpcode::kParameter && instr->IsFused()) { + const HloInstruction* fusion = instr->fusion_instruction(); + const HloInstruction* operand = fusion->operand(instr->parameter_number()); + if (operand->opcode() != HloOpcode::kConstant) { + return ""; + } + return stringify_constant(operand); + } + + std::vector lines; + for (int64 i = 0; i < instr->operand_count(); ++i) { + const HloInstruction* operand = instr->operand(i); + if (operand->opcode() != HloOpcode::kConstant) { + continue; + } + lines.push_back( + Printf("operand %lld = %s", i, stringify_constant(operand))); + } + return Join(lines, "
"); +} + +ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { + // Pick different colors or shapes for instructions which are particularly + // expensive (eg, dot) and those which are unusual in some way or unique + // (eg, parameter). + switch (instr->opcode()) { + case HloOpcode::kAbs: + case HloOpcode::kAdd: + case HloOpcode::kCeil: + case HloOpcode::kClamp: + case HloOpcode::kConvert: + case HloOpcode::kCos: + case HloOpcode::kDivide: + case HloOpcode::kEq: + case HloOpcode::kExp: + case HloOpcode::kFloor: + case HloOpcode::kGe: + case HloOpcode::kGt: + case HloOpcode::kIndex: + case HloOpcode::kIsFinite: + case HloOpcode::kLe: + case HloOpcode::kLog: + case HloOpcode::kLogicalAnd: + case HloOpcode::kLogicalNot: + case HloOpcode::kLogicalOr: + case HloOpcode::kLt: + case HloOpcode::kMaximum: + case HloOpcode::kMinimum: + case HloOpcode::kMultiply: + case HloOpcode::kNe: + case HloOpcode::kNegate: + case HloOpcode::kPower: + case HloOpcode::kRemainder: + case HloOpcode::kSelect: + case HloOpcode::kSign: + case HloOpcode::kSin: + case HloOpcode::kSlice: + case HloOpcode::kSort: + case HloOpcode::kSubtract: + case HloOpcode::kTanh: + case HloOpcode::kRng: + case HloOpcode::kBroadcast: + case HloOpcode::kTranspose: + return kYellow; + case HloOpcode::kBitcast: + case HloOpcode::kTuple: + case HloOpcode::kTrace: + case HloOpcode::kGetTupleElement: + return kWhite; + case HloOpcode::kConcatenate: + case HloOpcode::kCopy: + case HloOpcode::kDynamicSlice: + case HloOpcode::kDynamicUpdateSlice: + case HloOpcode::kPad: + case HloOpcode::kReshape: + case HloOpcode::kReverse: + case HloOpcode::kUpdate: + return kGreen; + case HloOpcode::kConvolution: + case HloOpcode::kDot: + return kDarkBlue; + case HloOpcode::kReducePrecision: + return kRed; + case HloOpcode::kParameter: + return kOrange; + case HloOpcode::kBatchNormTraining: + case HloOpcode::kBatchNormGrad: + case HloOpcode::kReduce: + case HloOpcode::kSelectAndScatter: + case HloOpcode::kReduceWindow: + return kPurple; + case HloOpcode::kMap: + case HloOpcode::kFusion: + return kGray; + case HloOpcode::kSend: + case HloOpcode::kRecv: + case HloOpcode::kInfeed: + case HloOpcode::kOutfeed: + case HloOpcode::kCrossReplicaSum: + return kBrown; + case HloOpcode::kCustomCall: + case HloOpcode::kWhile: + case HloOpcode::kCall: + return kDarkGreen; + case HloOpcode::kConstant: + LOG(FATAL) << "Constants don't get their own nodes in the graph."; + } +} + +string HloDotDumper::GetInstructionNodeShape(const HloInstruction* instr) { + // Give while loops a different shape so they're easier to pick out. + switch (instr->opcode()) { + case HloOpcode::kWhile: + return "ellipse"; + default: + return "rect"; + } +} + +string HloDotDumper::GetInstructionNodeLabel(const HloInstruction* instr) { + // If we have a parameter, put the param number in the name. + if (instr->opcode() == HloOpcode::kParameter) { + return Printf("Parameter %lld", instr->parameter_number()); + } + + // The HLO instruction name contains usually the opcode, e.g. "%add.42" is + // an add instruction. In this case we render just the name. + if (tensorflow::StringPiece(instr->name()) + .starts_with(StrCat("%", HloOpcodeString(instr->opcode())))) { + return Printf("%s", HtmlLikeStringSanitize(instr->name())); + } + + // If the name does not contain the opcode, render both. + return Printf("%s
%s", + HtmlLikeStringSanitize(instr->ExtendedOpcodeStr()), + HtmlLikeStringSanitize(instr->name())); +} + +string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) { + string opcode_specific_info = [&]() -> string { + switch (instr->opcode()) { + case HloOpcode::kRng: + return RandomDistribution_Name(instr->random_distribution()); + case HloOpcode::kConvolution: + return StrCat( + HtmlLikeStringSanitize( + instr->ConvolutionDimensionNumbersToString()), + "
", + HtmlLikeStringSanitize(window_util::ToString(instr->window()))); + case HloOpcode::kBroadcast: + case HloOpcode::kTranspose: + case HloOpcode::kReduce: + return Printf("dims={%s}", Join(instr->dimensions(), ",")); + case HloOpcode::kGetTupleElement: + return Printf("index=%lld", instr->tuple_index()); + case HloOpcode::kBatchNormTraining: + case HloOpcode::kBatchNormGrad: + return Printf("feature_index=%lld", instr->feature_index()); + case HloOpcode::kCustomCall: + return Printf("custom_call_target=%s", instr->custom_call_target()); + default: + return ""; + } + }(); + + std::vector lines; + if (!opcode_specific_info.empty()) { + lines.push_back(opcode_specific_info); + } + + // Some instructions have giant tuples as their shapes, so truncate the HLO's + // shape to kMaxShapeLen characters. + constexpr int kMaxShapeLen = 64; + string instr_shape = ShapeUtil::HumanString(instr->shape()); + if (instr_shape.length() > kMaxShapeLen) { + instr_shape = + StrCat(tensorflow::StringPiece(instr_shape).substr(0, kMaxShapeLen - 3), + "..."); + } + lines.push_back(instr_shape); + + if (show_addresses_) { + lines.push_back(Printf("[%p]", instr)); + } + if (show_layouts_ && LayoutUtil::HasLayout(instr->shape())) { + string layout_str; + if (ShapeUtil::IsTuple(instr->shape())) { + // For tuples, emit the full shape because the layout of a tuple is not + // represented in a single Layout field. + layout_str = ShapeUtil::HumanStringWithLayout(instr->shape()); + } else { + layout_str = Join(instr->shape().layout().minor_to_major(), ","); + } + lines.push_back(Printf("layout={%s}", layout_str)); + } + if (profile_ != nullptr) { + double hlo_cycles_executed = profile_->GetProfileResult(*instr); + double total_cycles_executed = + profile_->total_cycles_executed(*instr->parent()); + if (hlo_cycles_executed > 0 && total_cycles_executed > 0) { + lines.push_back( + Printf("%% of cycles executed=%.2f", + 100 * hlo_cycles_executed / total_cycles_executed)); + } + } + return Join(lines, "
"); +} + +void HloDotDumper::AddInstructionIncomingEdges(const HloInstruction* instr) { + auto add_edge = [&](const HloInstruction* from, const HloInstruction* to, + int64 operand_num) { + // Fusion nodes' subcomputations are displayed inline, so if 'from' is a + // fusion node and the node's subcomputation is shown, we draw our edge + // starting at the fusion node's root instead of at the fusion node itself. + if (from->opcode() == HloOpcode::kFusion && + filter_.ShowFusionSubcomputation(from)) { + from = from->fused_expression_root(); + } + if (!filter_.Show(from) || from->opcode() == HloOpcode::kConstant) { + return; + } + string edge = Printf("%s -> %s", InstructionId(from), InstructionId(to)); + if (instr->operand_count() > 1) { + Appendf(&edge, R"( [headlabel="%lld",labeldistance=2])", operand_num); + } + StrAppend(&edge, ";"); + edges_.push_back(edge); + }; + + // Add edges from instr's operands to instr. Parameters within fusion + // expressions are handled specially -- we draw an edge from the corresponding + // operand on the fusion node itself to the parameter. + if (instr->opcode() == HloOpcode::kParameter && instr->IsFused()) { + const HloInstruction* fusion = instr->fusion_instruction(); + add_edge(fusion->operand(instr->parameter_number()), instr, + /*operand_num=*/0); + } else { + for (int64 i = 0; i < instr->operand_count(); ++i) { + add_edge(instr->operand(i), instr, i); + } + } +} + +string HloDotDumper::GetInstructionTrivialComputationStr( + const HloInstruction* instr) { + // called_computations() on a fusion node "inherits" any called computations + // of the fused root, which isn't what we want. Just ignore fusion nodes + // here; they're handled separately. + if (instr->opcode() == HloOpcode::kFusion) { + return ""; + } + + std::vector lines; + for (int64 i = 0; i < instr->called_computations().size(); ++i) { + optional computation_type = + MatchTrivialComputation(instr->called_computations()[i]); + if (!computation_type) { + continue; + } + if (instr->called_computations().size() == 1) { + lines.push_back(Printf("Subcomputation: %s", + HtmlLikeStringSanitize(*computation_type))); + } else { + lines.push_back(Printf("Subcomputation %lld: %s", i, + HtmlLikeStringSanitize(*computation_type))); + } + } + return Join(lines, "
"); } tensorflow::mutex& RendererMutex() { @@ -750,14 +891,6 @@ class FileGraphRenderer : public GraphRendererInterface { // Gets a NodeFilter that includes roughly all instructions whose distance from // root is <= radius. -// -// It's confusing to draw a node and include only some of its operands. So if -// some but not all of a node's operands are <= radius units away from the root, -// we include the other operands (unless there are a lot of them, as often in a -// tuple node). These additional operands may have as inputs other nodes -// already present in the graph, but we don't draw those edges unless *all* of -// the inputs are present. (Otherwise we'd have the same problem we were trying -// to solve in the first place!) NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) { // First, find the neighborhood of nodes with distance from root <= radius. // These nodes are our initial set of "normal" nodes. @@ -788,14 +921,25 @@ NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) { } } - // If you're looking at node X, it's probably not interesting that node Y - // also happens to use the same constant, so we don't traverse into - // constants' users. - if (instr->opcode() != HloOpcode::kConstant) { - for (const HloInstruction* user : instr->users()) { - if (!nodes.count(user)) { - worklist.push_back({user, depth + 1}); - } + // Traverse into instr's users, unless: + // + // - there are a ton of them, in which case they're probably not + // interesting (and anyway, rendering them all would make the graph + // unreadable), or + // - instr is a constant, in which case its users are probably not + // interesting. + if (instr->opcode() == HloOpcode::kConstant) { + continue; + } + constexpr int kMaxUsersToRender = 16; + if (instr->user_count() > kMaxUsersToRender) { + // If we're going to skip this node's users, style it as such. + nodes[instr] = kSomeUsersOmitted; + continue; + } + for (const HloInstruction* user : instr->users()) { + if (!nodes.count(user)) { + worklist.push_back({user, depth + 1}); } } } @@ -804,43 +948,27 @@ NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) { return nodes.count(instr) > 0; }; - // If a node has some but not all of its operands omitted, add the operands to - // the map with type kOmitNodeOperands. Unless the node has a lot of - // operands, in which case just mark the node as "some operands omitted". - std::vector extra_operands; + // Mark nodes which don't have all of their operands present as "some operands + // omitted". for (auto& kv : nodes) { const HloInstruction* instr = kv.first; NodeFilterResult& filter_result = kv.second; const auto& operands = instr->operands(); - // Mark nodes with many operands and some omitted as "some operands omitted" - // and carry on -- don't add their omitted operands to extra_operands. - if (operands.size() > 4) { - if (std::any_of(operands.begin(), operands.end(), is_displayed) && - !std::all_of(operands.begin(), operands.end(), is_displayed)) { - filter_result = kSomeOperandsOmitted; - } - continue; - } - - if (std::any_of(operands.begin(), operands.end(), is_displayed)) { - for (const HloInstruction* operand : operands) { - if (!is_displayed(operand)) { - extra_operands.push_back(operand); - } - } + // Mark nodes with some omitted as "some operands omitted". 
+ if (std::any_of(operands.begin(), operands.end(), is_displayed) && + !std::all_of(operands.begin(), operands.end(), is_displayed)) { + filter_result = kSomeOperandsOmitted; } } - for (const HloInstruction* instr : extra_operands) { - nodes[instr] = kOmitNodeOperands; - } - // Some of the nodes in extra_operands may now have all of their inputs - // present in nodes. We can promote these to normal nodes. - for (const HloInstruction* instr : extra_operands) { - const auto& operands = instr->operands(); - if (std::all_of(operands.begin(), operands.end(), is_displayed)) { - nodes[instr] = kNormalNode; + // Promote nodes with type kSomeUsersOmitted to kNormalNode if all of their + // users made it into the graph by other means. + for (auto& kv : nodes) { + const auto& users = kv.first->users(); + if (kv.second == kSomeUsersOmitted && + std::all_of(users.begin(), users.end(), is_displayed)) { + kv.second = kNormalNode; } } @@ -862,6 +990,10 @@ NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) { if (it != nodes.end()) { return it->second; } + // Show all nodes in subcomputations. + if (instr->parent() != root->parent()) { + return kNormalNode; + } return kHideNode; }); } @@ -886,10 +1018,12 @@ string DumpGraph(const HloComputation& computation, const string& label, graph_url = FileGraphRenderer().RenderGraph( graph, GraphRendererInterface::TF_GRAPHDEF, debug_options); } else { - graph = ComputationToDotGraph(computation, label, - debug_options.xla_hlo_graph_addresses(), - debug_options.xla_hlo_graph_layout(), - hlo_execution_profile, NodeFilter()); + graph = + HloDotDumper(&computation, label, + /*show_addresses=*/debug_options.xla_hlo_graph_addresses(), + /*show_layouts=*/debug_options.xla_hlo_graph_layout(), + hlo_execution_profile, NodeFilter()) + .Dump(); graph_url = GetGraphRenderer()->RenderGraph( graph, GraphRendererInterface::DOT_GRAPH, debug_options); } @@ -903,11 +1037,12 @@ string DumpNeighborhoodAround(const HloInstruction& node, int radius) { string label = StrCat("Neighborhood of ", radius, " nodes around ", node.name()); NodeFilter filter = MakeNodeFilter(&node, radius); - string graph = ComputationToDotGraph( - *node.parent(), label, - /*show_addresses=*/debug_options.xla_hlo_graph_addresses(), - /*show_layouts=*/debug_options.xla_hlo_graph_layout(), - /*hlo_execution_profile=*/nullptr, filter); + string graph = + HloDotDumper(node.parent(), label, + /*show_addresses=*/debug_options.xla_hlo_graph_addresses(), + /*show_layouts=*/debug_options.xla_hlo_graph_layout(), + /*profile=*/nullptr, filter) + .Dump(); return GetGraphRenderer()->RenderGraph( graph, GraphRendererInterface::DOT_GRAPH, debug_options); } diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h index 358e611d57f..8a6376b2d1c 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.h +++ b/tensorflow/compiler/xla/service/hlo_opcode.h @@ -112,6 +112,11 @@ bool HloOpcodeIsComparison(HloOpcode opcode); // Returns true iff the given opcode has variadic operands. bool HloOpcodeIsVariadic(HloOpcode opcode); +// Returns the number of HloOpcode values. 
+inline const uint32_t HloOpcodeCount() { + return static_cast(HloOpcode::kWhile) + 1; +} + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_OPCODE_H_ diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc index dafefdc4910..e083226b14d 100644 --- a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc +++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h" #include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/core/platform/logging.h" namespace xla { @@ -30,14 +31,15 @@ StatusOr ReducePrecisionInsertion::Run(HloModule* module) { for (auto& instruction : computation->instructions()) { VLOG(3) << "Visited instruction: " << instruction->ToString(); - // For now, ReducePrecision is only implemented for F32 data, so this + // For now, ReducePrecision is only implemented for F32 arrays, so this // ignore instructions that produce other data. In particular, this // currently ignores instructions producing tuples, even if those tuples - // contain F32 data inside them. The assumption is that in most cases + // contain F32 arrays inside them. The assumption is that in most cases // equivalent behavior can be obtained by adding ReducePrecision - // instructions after the instructions that pull the F32 data out of the - // tuples. + // instructions after the instructions that pull the F32 arrays out of + // the tuples. if (instruction->shape().element_type() == PrimitiveType::F32 && + !ShapeUtil::IsScalar(instruction->shape()) && should_reduce_output_precision_(instruction->opcode())) { instructions_to_suffix.push_back(instruction.get()); } @@ -58,4 +60,33 @@ StatusOr ReducePrecisionInsertion::Run(HloModule* module) { return changed; } +ReducePrecisionInsertion::OpcodeFilterFunction +ReducePrecisionInsertion::make_filter_function( + const HloReducePrecisionOptions& reduce_precision_options) { + // Implement the filter function with a lookup table. 
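  // (The table is indexed by the opcode's underlying integer value and sized
  //  by HloOpcodeCount() so it covers every opcode; the returned lambda then
  //  reduces each query to a single vector lookup.)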
+ std::vector filter(HloOpcodeCount(), false); + for (const auto& opcode : reduce_precision_options.opcodes_to_suffix()) { + filter[opcode] = true; + } + return [filter](const HloOpcode opcode) { + return filter[static_cast(opcode)]; + }; +} + +HloReducePrecisionOptions ReducePrecisionInsertion::make_options_proto( + const HloReducePrecisionOptions::PassTiming pass_timing, + const int exponent_bits, const int mantissa_bits, + const OpcodeFilterFunction& should_reduce_output_precision) { + HloReducePrecisionOptions options; + options.set_pass_timing(pass_timing); + options.set_exponent_bits(exponent_bits); + options.set_mantissa_bits(mantissa_bits); + for (uint32_t opcode = 0; opcode < HloOpcodeCount(); opcode++) { + if (should_reduce_output_precision(static_cast(opcode))) { + options.add_opcodes_to_suffix(opcode); + } + } + return options; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.h b/tensorflow/compiler/xla/service/reduce_precision_insertion.h index e9c8bba0313..34b865b9ced 100644 --- a/tensorflow/compiler/xla/service/reduce_precision_insertion.h +++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.h @@ -42,6 +42,17 @@ class ReducePrecisionInsertion : public HloPassInterface { : exponent_bits_(exponent_bits), mantissa_bits_(mantissa_bits), should_reduce_output_precision_(should_reduce_output_precision) {} + + // Version of the constructor that takes an HloReducePrecisionOptions proto + // rather than explicitly-enumerated parameters, for convenience when + // creating passes based on DebugOptions. + explicit ReducePrecisionInsertion( + const HloReducePrecisionOptions& reduce_precision_options) + : exponent_bits_(reduce_precision_options.exponent_bits()), + mantissa_bits_(reduce_precision_options.mantissa_bits()), + should_reduce_output_precision_( + make_filter_function(reduce_precision_options)) {} + ~ReducePrecisionInsertion() override{}; tensorflow::StringPiece name() const override { @@ -52,6 +63,15 @@ class ReducePrecisionInsertion : public HloPassInterface { // (reduce-precision instructions were inserted). StatusOr Run(HloModule* module) override; + // Convert between the (inconvenient) xla.proto HloReducePrecisionOptions + // representation and OpcodeFilterFunction functions. + static OpcodeFilterFunction make_filter_function( + const HloReducePrecisionOptions& reduce_precision_options); + static HloReducePrecisionOptions make_options_proto( + const HloReducePrecisionOptions::PassTiming pass_timing, + const int exponent_bits, const int mantissa_bits, + const OpcodeFilterFunction& should_reduce_output_precision); + private: // Parameters for the precision reduction to be added. const int exponent_bits_; @@ -59,7 +79,7 @@ class ReducePrecisionInsertion : public HloPassInterface { // Function to determine (from the opcode) whether a given instruction should // have a reduce-precision instruction inserted in its output stream. - const OpcodeFilterFunction& should_reduce_output_precision_; + const OpcodeFilterFunction should_reduce_output_precision_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/statusor.cc b/tensorflow/compiler/xla/statusor.cc index 36f08fc99f4..72ab67ff810 100644 --- a/tensorflow/compiler/xla/statusor.cc +++ b/tensorflow/compiler/xla/statusor.cc @@ -19,28 +19,20 @@ limitations under the License. 
#include "tensorflow/core/platform/logging.h" namespace xla { -namespace internal { +namespace internal_statusor { -Status StatusOrHelper::HandleInvalidStatusCtorArg() { +void Helper::HandleInvalidStatusCtorArg(Status* status) { const char* kMessage = - "Status::OK is not a valid constructor argument to StatusOr"; + "An OK status is not a valid constructor argument to StatusOr"; LOG(ERROR) << kMessage; - // In optimized builds, we will fall back to tensorflow::error::INTERNAL. - return Status(tensorflow::error::INTERNAL, kMessage); + // Fall back to tensorflow::error::INTERNAL. + *status = ::tensorflow::errors::Internal(kMessage); } -Status StatusOrHelper::HandleNullObjectCtorArg() { - const char* kMessage = - "NULL is not a valid constructor argument to StatusOr"; - LOG(ERROR) << kMessage; - // In optimized builds, we will fall back to tensorflow::error::INTERNAL. - return Status(tensorflow::error::INTERNAL, kMessage); -} - -void StatusOrHelper::Crash(const Status& status) { +void Helper::Crash(const Status& status) { LOG(FATAL) << "Attempting to fetch value instead of handling error " << status; } -} // namespace internal +} // namespace internal_statusor } // namespace xla diff --git a/tensorflow/compiler/xla/statusor.h b/tensorflow/compiler/xla/statusor.h index d8cd736238c..92bcfa0f44d 100644 --- a/tensorflow/compiler/xla/statusor.h +++ b/tensorflow/compiler/xla/statusor.h @@ -72,216 +72,233 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_STATUSOR_H_ #include "tensorflow/compiler/xla/status.h" +#include "tensorflow/compiler/xla/statusor_internals.h" #include "tensorflow/core/platform/macros.h" namespace xla { #if defined(__clang__) // Only clang supports warn_unused_result as a type annotation. -template +template class TF_MUST_USE_RESULT StatusOr; #endif -template ::value> -class StatusOr { - template +template +class StatusOr : private internal_statusor::StatusOrData, + private internal_statusor::TraitsBase< + std::is_copy_constructible::value, + std::is_move_constructible::value> { + template friend class StatusOr; + typedef internal_statusor::StatusOrData Base; + public: typedef T element_type; - // Construct a new StatusOr with Status::UNKNOWN status - StatusOr(); + // Constructs a new StatusOr with Status::UNKNOWN status. This is marked + // 'explicit' to try to catch cases like 'return {};', where people think + // StatusOr> will be initialized with an empty vector, + // instead of a Status::UNKNOWN status. + explicit StatusOr(); - // Construct a new StatusOr with the given non-ok status. After calling + // StatusOr will be copy constructuble/assignable if T is copy + // constructible. + StatusOr(const StatusOr&) = default; + StatusOr& operator=(const StatusOr&) = default; + + // StatusOr will be move constructuble/assignable if T is move + // constructible. + StatusOr(StatusOr&&) = default; + StatusOr& operator=(StatusOr&&) = default; + + // Conversion copy/move constructor, T must be convertible from U. + // TODO(b/62186717): These should not participate in overload resolution if U + // is not convertible to T. + template + StatusOr(const StatusOr& other); + template + StatusOr(StatusOr&& other); + + // Conversion copy/move assignment operator, T must be convertible from U. + template + StatusOr& operator=(const StatusOr& other); + template + StatusOr& operator=(StatusOr&& other); + + // Constructs a new StatusOr with the given value. After calling this + // constructor, calls to ValueOrDie() will succeed, and calls to status() will + // return OK. 
+ // + // NOTE: Not explicit - we want to use StatusOr as a return type + // so it is convenient and sensible to be able to do 'return T()' + // when the return type is StatusOr. + // + // REQUIRES: T is copy constructible. + StatusOr(const T& value); + + // Constructs a new StatusOr with the given non-ok status. After calling // this constructor, calls to ValueOrDie() will CHECK-fail. // // NOTE: Not explicit - we want to use StatusOr as a return // value, so it is convenient and sensible to be able to do 'return // Status()' when the return type is StatusOr. // - // REQUIRES: status != Status::OK. This requirement is DCHECKed. - // In optimized builds, passing Status::OK here will have the effect + // REQUIRES: !status.ok(). This requirement is DCHECKed. + // In optimized builds, passing Status::OK() here will have the effect // of passing tensorflow::error::INTERNAL as a fallback. - StatusOr(Status status); // NOLINT + StatusOr(const Status& status); + StatusOr& operator=(const Status& status); - // Construct a new StatusOr with the given value. If T is a plain pointer, - // value must not be NULL. After calling this constructor, calls to - // ValueOrDie() will succeed, and calls to status() will return OK. + // TODO(b/62186997): Add operator=(T) overloads. + + // Similar to the `const T&` overload. // - // NOTE: Not explicit - we want to use StatusOr as a return type - // so it is convenient and sensible to be able to do 'return T()' - // when the return type is StatusOr. - // - // REQUIRES: if T is a plain pointer, value != NULL. This requirement is - // DCHECKed. In optimized builds, passing a NULL pointer here will have - // the effect of passing tensorflow::error::INTERNAL as a fallback. - StatusOr(const T& value); // NOLINT + // REQUIRES: T is move constructible. + StatusOr(T&& value); - // Copy constructor. - StatusOr(const StatusOr& other) = default; - - // Conversion copy constructor, T must be copy constructible from U - template - StatusOr(const StatusOr& other); - - // Assignment operator. - StatusOr& operator=(const StatusOr& other) = default; - - // Conversion assignment operator, T must be assignable from U - template - StatusOr& operator=(const StatusOr& other); - - // Move constructor and move-assignment operator. - StatusOr(StatusOr&& other) = default; - StatusOr& operator=(StatusOr&& other) = default; - - // Rvalue-reference overloads of the other constructors and assignment - // operators, to support move-only types and avoid unnecessary copying. - // - // Implementation note: we could avoid all these rvalue-reference overloads - // if the existing lvalue-reference overloads took their arguments by value - // instead. I think this would also let us omit the conversion assignment - // operator altogether, since we'd get the same functionality for free - // from the implicit conversion constructor and ordinary assignment. - // However, this could result in extra copy operations unless we use - // std::move to avoid them, and we can't use std::move because this code - // needs to be portable to C++03. - StatusOr(T&& value); // NOLINT - template - StatusOr(StatusOr&& other); - - // Returns a reference to our status. If this contains a T, then - // returns Status::OK. - const Status& status() const { return status_; } + // RValue versions of the operations declared above. 
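  // A minimal usage sketch (illustrative only; ParseInt is a hypothetical
  // caller-defined helper, not part of this class):
  //
  //   StatusOr<int> ParseInt(const string& s);
  //
  //   StatusOr<int> result = ParseInt("42");
  //   if (!result.ok()) {
  //     return result.status();
  //   }
  //   int value = result.ValueOrDie();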
+ StatusOr(Status&& status); + StatusOr& operator=(Status&& status); // Returns this->status().ok() - bool ok() const { return status_.ok(); } + bool ok() const { return this->status_.ok(); } + + // Returns a reference to our status. If this contains a T, then + // returns Status::OK(). + const Status& status() const &; + Status status() &&; // Returns a reference to our current value, or CHECK-fails if !this->ok(). - const T& ValueOrDie() const; - T& ValueOrDie(); + // + // Note: for value types that are cheap to copy, prefer simple code: + // + // T value = statusor.ValueOrDie(); + // + // Otherwise, if the value type is expensive to copy, but can be left + // in the StatusOr, simply assign to a reference: + // + // T& value = statusor.ValueOrDie(); // or `const T&` + // + // Otherwise, if the value type supports an efficient move, it can be + // used as follows: + // + // T value = std::move(statusor).ValueOrDie(); + // + // The std::move on statusor instead of on the whole expression enables + // warnings about possible uses of the statusor object after the move. + // C++ style guide waiver for ref-qualified overloads granted in cl/143176389 + // See go/ref-qualifiers for more details on such overloads. + const T& ValueOrDie() const &; + T& ValueOrDie() &; + const T&& ValueOrDie() const &&; + T&& ValueOrDie() &&; - // Moves our current value out of this object and returns it, or CHECK-fails - // if !this->ok(). - // Use of this method is discouraged; prefer std::move(statusor.ValueOrDie()) - // instead. T ConsumeValueOrDie() { return std::move(ValueOrDie()); } - private: - Status status_; - T value_; -}; - -// Partial specialization for when T is not copy-constructible. This uses all -// methods from the core implementation, but removes copy assignment and copy -// construction. -template -class StatusOr : public StatusOr { - public: - // Remove copies. - StatusOr(const StatusOr& other) = delete; - StatusOr& operator=(const StatusOr& other) = delete; - template - StatusOr(const StatusOr& other) = delete; - StatusOr(const T& value) = delete; - - // Use the superclass version for other constructors and operators. - StatusOr() = default; - StatusOr(StatusOr&& other) = default; - StatusOr& operator=(StatusOr&& other) = default; - StatusOr(T&& value) // NOLINT - : StatusOr::StatusOr(std::move(value)) {} - StatusOr(Status status) // NOLINT - : StatusOr::StatusOr(std::move(status)) {} - template - StatusOr(StatusOr&& other) // NOLINT - : StatusOr::StatusOr(std::move(other)) {} + // Ignores any errors. This method does nothing except potentially suppress + // complaints from any tools that are checking that errors are not dropped on + // the floor. + void IgnoreError() const; }; //////////////////////////////////////////////////////////////////////////////// // Implementation details for StatusOr -namespace internal { - -class StatusOrHelper { - public: - // Move type-agnostic error handling to the .cc. - static Status HandleInvalidStatusCtorArg(); - static Status HandleNullObjectCtorArg(); - static void Crash(const Status& status); - - // Customized behavior for StatusOr vs. StatusOr - template - struct Specialize; -}; +template +StatusOr::StatusOr() : Base(Status(tensorflow::error::UNKNOWN, "")) {} template -struct StatusOrHelper::Specialize { - // For non-pointer T, a reference can never be NULL. 
- static inline bool IsValueNull(const T& t) { return false; } -}; +StatusOr::StatusOr(const T& value) : Base(value) {} template -struct StatusOrHelper::Specialize { - static inline bool IsValueNull(const T* t) { return t == NULL; } -}; +StatusOr::StatusOr(const Status& status) : Base(status) {} -} // namespace internal - -template -inline StatusOr::StatusOr() - : status_(tensorflow::error::UNKNOWN, "") {} - -template -inline StatusOr::StatusOr(Status status) - : status_(std::move(status)) { - if (status_.ok()) { - status_ = internal::StatusOrHelper::HandleInvalidStatusCtorArg(); - } +template +StatusOr& StatusOr::operator=(const Status& status) { + this->Assign(status); + return *this; } -template -inline StatusOr::StatusOr(const T& value) - : value_(value) { - if (internal::StatusOrHelper::Specialize::IsValueNull(value)) { - status_ = internal::StatusOrHelper::HandleNullObjectCtorArg(); - } +template +StatusOr::StatusOr(T&& value) : Base(std::move(value)) {} + +template +StatusOr::StatusOr(Status&& status) : Base(std::move(status)) {} + +template +StatusOr& StatusOr::operator=(Status&& status) { + this->Assign(std::move(status)); + return *this; } -template +template template -inline StatusOr::StatusOr(const StatusOr& other) - : status_(other.status_), value_(other.value_) {} +inline StatusOr::StatusOr(const StatusOr& other) + : Base(static_cast::Base&>(other)) {} -template -inline StatusOr::StatusOr(T&& value) - : value_(std::move(value)) { - if (internal::StatusOrHelper::Specialize::IsValueNull(value_)) { - status_ = internal::StatusOrHelper::HandleNullObjectCtorArg(); - } -} - -template +template template -inline StatusOr::StatusOr(StatusOr&& other) - : status_(std::move(other.status_)), value_(std::move(other.value_)) {} - -template -inline const T& StatusOr::ValueOrDie() const { - if (!ok()) { - internal::StatusOrHelper::Crash(status()); - } - return value_; +inline StatusOr& StatusOr::operator=(const StatusOr& other) { + if (other.ok()) + this->Assign(other.ValueOrDie()); + else + this->Assign(other.status()); + return *this; } -template -inline T& StatusOr::ValueOrDie() { - if (!status_.ok()) { - internal::StatusOrHelper::Crash(status()); +template +template +inline StatusOr::StatusOr(StatusOr&& other) + : Base(static_cast::Base&&>(other)) {} + +template +template +inline StatusOr& StatusOr::operator=(StatusOr&& other) { + if (other.ok()) { + this->Assign(std::move(other).ValueOrDie()); + } else { + this->Assign(std::move(other).status()); } - return value_; + return *this; +} + +template +const Status& StatusOr::status() const & { + return this->status_; +} +template +Status StatusOr::status() && { + return ok() ? Status::OK() : std::move(this->status_); +} + +template +const T& StatusOr::ValueOrDie() const & { + this->EnsureOk(); + return this->data_; +} + +template +T& StatusOr::ValueOrDie() & { + this->EnsureOk(); + return this->data_; +} + +template +const T&& StatusOr::ValueOrDie() const && { + this->EnsureOk(); + return std::move(this->data_); +} + +template +T&& StatusOr::ValueOrDie() && { + this->EnsureOk(); + return std::move(this->data_); +} + +template +void StatusOr::IgnoreError() const { + // no-op } } // namespace xla diff --git a/tensorflow/compiler/xla/statusor_internals.h b/tensorflow/compiler/xla/statusor_internals.h new file mode 100644 index 00000000000..a2fda5bb3c6 --- /dev/null +++ b/tensorflow/compiler/xla/statusor_internals.h @@ -0,0 +1,245 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_STATUSOR_INTERNALS_H_ +#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_STATUSOR_INTERNALS_H_ + +#include "tensorflow/compiler/xla/status.h" +#include "tensorflow/core/platform/macros.h" + +namespace xla { +namespace internal_statusor { + +class Helper { + public: + // Move type-agnostic error handling to the .cc. + static void HandleInvalidStatusCtorArg(Status*); + TF_ATTRIBUTE_NORETURN static void Crash(const Status& status); +}; + +// Construct an instance of T in `p` through placement new, passing Args... to +// the constructor. +// This abstraction is here mostly for the gcc performance fix. +template +void PlacementNew(void* p, Args&&... args) { +#if defined(__GNUC__) && !defined(__clang__) + // Teach gcc that 'p' cannot be null, fixing code size issues. + if (p == nullptr) __builtin_unreachable(); +#endif + new (p) T(std::forward(args)...); +} + +// Helper base class to hold the data and all operations. +// We move all this to a base class to allow mixing with the appropriate +// TraitsBase specialization. +template +class StatusOrData { + template + friend class StatusOrData; + + public: + StatusOrData() = delete; + + StatusOrData(const StatusOrData& other) { + if (other.ok()) { + MakeValue(other.data_); + MakeStatus(); + } else { + MakeStatus(other.status_); + } + } + + StatusOrData(StatusOrData&& other) noexcept { + if (other.ok()) { + MakeValue(std::move(other.data_)); + MakeStatus(); + } else { + MakeStatus(std::move(other.status_)); + } + } + + template + StatusOrData(const StatusOrData& other) { + if (other.ok()) { + MakeValue(other.data_); + MakeStatus(); + } else { + MakeStatus(other.status_); + } + } + + template + StatusOrData(StatusOrData&& other) { + if (other.ok()) { + MakeValue(std::move(other.data_)); + MakeStatus(); + } else { + MakeStatus(std::move(other.status_)); + } + } + + explicit StatusOrData(const T& value) : data_(value) { MakeStatus(); } + explicit StatusOrData(T&& value) : data_(std::move(value)) { MakeStatus(); } + + explicit StatusOrData(const Status& status) : status_(status) { + EnsureNotOk(); + } + explicit StatusOrData(Status&& status) : status_(std::move(status)) { + EnsureNotOk(); + } + + StatusOrData& operator=(const StatusOrData& other) { + if (this == &other) return *this; + if (other.ok()) + Assign(other.data_); + else + Assign(other.status_); + return *this; + } + + StatusOrData& operator=(StatusOrData&& other) { + if (this == &other) return *this; + if (other.ok()) + Assign(std::move(other.data_)); + else + Assign(std::move(other.status_)); + return *this; + } + + ~StatusOrData() { + if (ok()) { + status_.~Status(); + data_.~T(); + } else { + status_.~Status(); + } + } + + void Assign(const T& value) { + if (ok()) { + data_.~T(); + MakeValue(value); + } else { + MakeValue(value); + status_ = Status::OK(); + } + } + + void Assign(T&& value) { + if (ok()) { + data_.~T(); + 
MakeValue(std::move(value)); + } else { + MakeValue(std::move(value)); + status_ = Status::OK(); + } + } + + void Assign(const Status& status) { + Clear(); + status_ = status; + EnsureNotOk(); + } + + void Assign(Status&& status) { + Clear(); + status_ = std::move(status); + EnsureNotOk(); + } + + bool ok() const { return status_.ok(); } + + protected: + // status_ will always be active after the constructor. + // We make it a union to be able to initialize exactly how we need without + // waste. + // Eg. in the copy constructor we use the default constructor of Status in + // the ok() path to avoid an extra Ref call. + union { + Status status_; + }; + + // data_ is active iff status_.ok()==true + struct Dummy {}; + union { + // When T is const, we need some non-const object we can cast to void* for + // the placement new. dummy_ is that object. + Dummy dummy_; + T data_; + }; + + void Clear() { + if (ok()) data_.~T(); + } + + void EnsureOk() const { + if (!ok()) Helper::Crash(status_); + } + + void EnsureNotOk() { + if (ok()) Helper::HandleInvalidStatusCtorArg(&status_); + } + + // Construct the value (ie. data_) through placement new with the passed + // argument. + template + void MakeValue(Arg&& arg) { + internal_statusor::PlacementNew(&dummy_, std::forward(arg)); + } + + // Construct the status (ie. status_) through placement new with the passed + // argument. + template + void MakeStatus(Args&&... args) { + internal_statusor::PlacementNew(&status_, + std::forward(args)...); + } +}; + +// Helper base class to allow implicitly deleted constructors and assignment +// operations in StatusOr. +// TraitsBase will explicitly delete what it can't support and StatusOr will +// inherit that behavior implicitly. +template +struct TraitsBase { + TraitsBase() = default; + TraitsBase(const TraitsBase&) = default; + TraitsBase(TraitsBase&&) = default; + TraitsBase& operator=(const TraitsBase&) = default; + TraitsBase& operator=(TraitsBase&&) = default; +}; + +template <> +struct TraitsBase { + TraitsBase() = default; + TraitsBase(const TraitsBase&) = delete; + TraitsBase(TraitsBase&&) = default; + TraitsBase& operator=(const TraitsBase&) = delete; + TraitsBase& operator=(TraitsBase&&) = default; +}; + +template <> +struct TraitsBase { + TraitsBase() = default; + TraitsBase(const TraitsBase&) = delete; + TraitsBase(TraitsBase&&) = delete; + TraitsBase& operator=(const TraitsBase&) = delete; + TraitsBase& operator=(TraitsBase&&) = delete; +}; + +} // namespace internal_statusor +} // namespace xla + +#endif // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_STATUSOR_INTERNALS_H_ diff --git a/tensorflow/compiler/xla/statusor_test.cc b/tensorflow/compiler/xla/statusor_test.cc index f8555113f81..5fa2211ac66 100644 --- a/tensorflow/compiler/xla/statusor_test.cc +++ b/tensorflow/compiler/xla/statusor_test.cc @@ -29,8 +29,6 @@ limitations under the License. 
namespace xla { namespace { -using tensorflow::Status; - class Base1 { public: virtual ~Base1() {} @@ -59,6 +57,14 @@ class CopyNoAssign { const CopyNoAssign& operator=(const CopyNoAssign&); }; +class NoDefaultConstructor { + public: + explicit NoDefaultConstructor(int foo); +}; + +static_assert(!std::is_default_constructible(), + "Should not be default-constructible."); + StatusOr> ReturnUniquePtr() { // Uses implicit constructor from T&& return std::unique_ptr(new int(0)); @@ -69,6 +75,18 @@ TEST(StatusOr, ElementType) { static_assert(std::is_same::element_type, char>(), ""); } +TEST(StatusOr, TestNoDefaultConstructorInitialization) { + // Explicitly initialize it with an error code. + StatusOr statusor(tensorflow::errors::Cancelled("")); + EXPECT_FALSE(statusor.ok()); + EXPECT_EQ(statusor.status().code(), tensorflow::error::CANCELLED); + + // Default construction of StatusOr initializes it with an UNKNOWN error code. + StatusOr statusor2; + EXPECT_FALSE(statusor2.ok()); + EXPECT_EQ(statusor2.status().code(), tensorflow::error::UNKNOWN); +} + TEST(StatusOr, TestMoveOnlyInitialization) { StatusOr> thing(ReturnUniquePtr()); ASSERT_TRUE(thing.ok()); diff --git a/tensorflow/compiler/xla/tests/batch_normalization_test.cc b/tensorflow/compiler/xla/tests/batch_normalization_test.cc index 074e28cec77..d692a810325 100644 --- a/tensorflow/compiler/xla/tests/batch_normalization_test.cc +++ b/tensorflow/compiler/xla/tests/batch_normalization_test.cc @@ -308,6 +308,137 @@ XLA_TEST_P(BatchNormTest, DISABLED_ON_GPU(RandomizedTests)) { ErrorSpec(0.01, 1)); } +// TODO(b/62764704): Implement on GPU. Disabled on 2017-06-20. +XLA_TEST_P(BatchNormTest, DISABLED_ON_CPU_PARALLEL(DISABLED_ON_CPU( + DISABLED_ON_GPU(RandomizedGradTests)))) { + float epsilon = 0.001; + ComputationBuilder builder(client_, TestName()); + const std::vector& bounds = GetParam().bounds; + Array4D input_array(bounds[0], bounds[1], bounds[2], bounds[3]); + input_array.FillRandom(GetParam().random_value_var, + GetParam().random_value_mean); + + Array4D grad_output_array(bounds[0], bounds[1], bounds[2], bounds[3]); + grad_output_array.FillRandom(GetParam().random_value_var, + GetParam().random_value_mean); + + const int64 feature_index = GetParam().feature_index; + const int64 num_elements_per_feature = + Product(bounds) / bounds[feature_index]; + const int64 feature_bound = bounds[feature_index]; + std::vector scale(feature_bound, 2); + + auto input_squared = + ReferenceUtil::MapArray4D(input_array, [](float a) { return a * a; }); + std::vector reduce_dims; + for (int64 i = 0; i < bounds.size(); ++i) { + if (i != feature_index) { + reduce_dims.push_back(i); + } + } + + auto sum = + ReferenceUtil::Reduce4DTo1D(input_array, /*init=*/0.0f, reduce_dims, + [](float a, float b) { return a + b; }); + + auto sum_squared = + ReferenceUtil::Reduce4DTo1D(*input_squared, /*init=*/0.0f, reduce_dims, + [](float a, float b) { return a + b; }); + + std::vector mean(feature_bound); + + for (int64 i = 0; i < feature_bound; ++i) { + mean[i] = sum[i] / num_elements_per_feature; + } + + std::vector mean_square(feature_bound); + for (int64 i = 0; i < feature_bound; ++i) { + mean_square[i] = mean[i] * mean[i]; + } + + std::vector square_mean(feature_bound); + for (int64 i = 0; i < feature_bound; ++i) { + square_mean[i] = sum_squared[i] / num_elements_per_feature; + } + + std::vector var(feature_bound); + for (int64 i = 0; i < feature_bound; ++i) { + var[i] = square_mean[i] - mean_square[i]; + } + + Array4D mean_4D = + *ReferenceUtil::Broadcast1DTo4D(mean, 
bounds, feature_index); + auto var_4D = *ReferenceUtil::Broadcast1DTo4D(var, bounds, feature_index); + auto scale_4D = *ReferenceUtil::Broadcast1DTo4D(scale, bounds, feature_index); + + auto var_add_epsilon = *ReferenceUtil::MapArray4D( + var_4D, [epsilon](float a) { return std::sqrt(a + epsilon); }); + + auto grad_output_times_var = + *ReferenceUtil::MapArray4D(grad_output_array, var_add_epsilon, + [](float a, float b) { return a * b; }); + + auto grad_activation = *ReferenceUtil::MapArray4D( + grad_output_times_var, scale_4D, [](float a, float b) { return a * b; }); + + auto activation_shifted = *ReferenceUtil::MapArray4D( + input_array, mean_4D, [](float a, float b) { return a - b; }); + + auto grad_scale_before_reduction = + *ReferenceUtil::MapArray4D(grad_output_times_var, activation_shifted, + [](float a, float b) { return a * b; }); + + auto grad_scale = ReferenceUtil::Reduce4DTo1D( + grad_scale_before_reduction, /*init=*/0.0f, reduce_dims, + [](float a, float b) { return a + b; }); + + auto grad_offset = + ReferenceUtil::Reduce4DTo1D(grad_output_array, /*init=*/0.0f, reduce_dims, + [](float a, float b) { return a + b; }); + + auto expected_grad_activation = + Literal::CreateR4FromArray4D(grad_activation); + + auto input_literal = Literal::CreateR4FromArray4D(input_array); + auto scale_literal = Literal::CreateR1(scale); + auto mean_literal = Literal::CreateR1(mean); + auto var_literal = Literal::CreateR1(var); + auto grad_output_literal = + Literal::CreateR4FromArray4D(grad_output_array); + + auto input_parameter = builder.Parameter(0, input_literal->shape(), "input"); + auto scale_parameter = builder.Parameter(1, scale_literal->shape(), "scale"); + auto mean_parameter = builder.Parameter(2, mean_literal->shape(), "mean"); + auto var_parameter = builder.Parameter(3, var_literal->shape(), "variance"); + auto grad_output_parameter = + builder.Parameter(4, grad_output_literal->shape(), "grad_output"); + + std::unique_ptr input_data = + client_->TransferToServer(*input_literal).ConsumeValueOrDie(); + std::unique_ptr scale_data = + client_->TransferToServer(*scale_literal).ConsumeValueOrDie(); + std::unique_ptr mean_data = + client_->TransferToServer(*mean_literal).ConsumeValueOrDie(); + std::unique_ptr var_data = + client_->TransferToServer(*var_literal).ConsumeValueOrDie(); + std::unique_ptr grad_output_data = + client_->TransferToServer(*grad_output_literal).ConsumeValueOrDie(); + + auto t = builder.BatchNormGrad(input_parameter, scale_parameter, + mean_parameter, var_parameter, + grad_output_parameter, epsilon, feature_index); + + auto expected = + *Literal::MakeTuple({expected_grad_activation.get(), + Literal::CreateR1(grad_scale).get(), + Literal::CreateR1(grad_offset).get()}); + + ComputeAndCompareTuple(&builder, expected, + {input_data.get(), scale_data.get(), mean_data.get(), + var_data.get(), grad_output_data.get()}, + ErrorSpec(0.01, 1)); +} + INSTANTIATE_TEST_CASE_P( BatchNormTest_Instantiation, BatchNormTest, ::testing::Values(BatchNormTestParam{{2, 2, 2, 2}, 0, 100.2f, 200.0f}, @@ -319,6 +450,7 @@ INSTANTIATE_TEST_CASE_P( BatchNormTestParam{{10, 10, 10, 10}, 1, -666.6f, 777.7f}, BatchNormTestParam{{10, 10, 10, 10}, 2, 0.f, 777.7f}, BatchNormTestParam{{1, 1, 10, 130}, 2, 0.f, 777.7f}, + BatchNormTestParam{{1, 1, 130, 11}, 2, 0.f, 777.7f}, BatchNormTestParam{{1, 1, 10, 1}, 3, 888.8f, 9.9f}, BatchNormTestParam{{24, 129, 1, 2}, 2, 10000, 10000}, @@ -446,6 +578,37 @@ XLA_TEST_F(BatchNormTest, DISABLED_ON_GPU(LargeEpsilonTest)) { ErrorSpec(0.1)); } +// TODO(b/62764704): 
Implement on CPU and GPU. Disabled on 2017-07-11. +XLA_TEST_F(BatchNormTest, DISABLED_ON_CPU_PARALLEL(DISABLED_ON_CPU( + DISABLED_ON_GPU(BatchNormGradBasic)))) { + const int kFeatureIndex = 2; + ComputationBuilder builder(client_, TestName()); + + auto operand = + builder.ConstantR4FromArray4D(Array4D(2, 2, 2, 1, 0.0f)); + + auto scale = builder.ConstantR1({1.0f, 1.0f}); + + auto mean = builder.ConstantR1({0.0f, 0.0f}); + + auto var = builder.ConstantR1({1.0f, 1.0f}); + + auto grad_output = builder.ConstantR4FromArray4D( + {{{{1.f}, {2.f}}, {{3.f}, {4.f}}}, {{{5.f}, {6.f}}, {{7.f}, {8.f}}}}); + + builder.BatchNormGrad(operand, scale, mean, var, grad_output, + /*epsilon=*/0.0, kFeatureIndex); + + auto expected = *Literal::MakeTuple( + {Literal::CreateR4( + {{{{1.f}, {2.f}}, {{3.f}, {4.f}}}, {{{5.f}, {6.f}}, {{7.f}, {8.f}}}}) + .get(), + Literal::CreateR1({0, 0}).get(), + Literal::CreateR1({16, 20}).get()}); + + ComputeAndCompareTuple(&builder, expected, {}, ErrorSpec(0.1)); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc index 576c1c703df..9e85e357070 100644 --- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc +++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc @@ -44,295 +44,310 @@ namespace { class DynamicSliceTest : public ClientLibraryTestBase { protected: - template + template void TestR1() { // Slice at dimension start. - RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {0}, {5}, - {0.0, 1.0, 2.0, 3.0, 4.0}); + RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {0}, {5}, {0, 1, 2, 3, 4}); // Slice in the middle. - RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {2}, {3}, - {2.0, 3.0, 4.0}); + RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {2}, {3}, {2, 3, 4}); // Slice at dimension boundaries. - RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {5}, {3}, - {5.0, 6.0, 7.0}); + RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {5}, {3}, {5, 6, 7}); // Slice at dimension boundaries, but with sizes that cause indices to wrap. - RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {6}, {4}, - {6.0, 7.0, 0.0, 1.0}); + RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {6}, {4}, {6, 7, 0, 1}); // Zero element slice. - RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {2}, {0}, {}); + RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {2}, {0}, {}); } - template + template void TestR2() { // Slice at dimension start. - RunR2({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, - {0, 0}, {2, 2}, {{1.0f, 2.0f}, {4.0f, 5.0f}}); + RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {0, 0}, {2, 2}, + {{1, 2}, {4, 5}}); // Slice in the middle. - RunR2({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, - {1, 1}, {2, 1}, {{5.0f}, {8.0f}}); + RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {1, 1}, {2, 1}, + {{5}, {8}}); // Slice at dimension boundaries. - RunR2({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, - {1, 1}, {2, 1}, {{5.0f}, {8.0f}}); + RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {1, 1}, {2, 1}, + {{5}, {8}}); // Slice at dimension boundaries, but with sizes that cause indices to wrap. - RunR2({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, - {1, 1}, {3, 3}, - {{5.0f, 6.0f, 4.0f}, {8.0f, 9.0f, 7.0f}, {2.0f, 3.0f, 1.0f}}); + RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {1, 1}, {3, 3}, + {{5, 6, 4}, {8, 9, 7}, {2, 3, 1}}); // Zero element slice: 2x0. - RunR2({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, - {0, 0}, {2, 0}, {{}, {}}); + RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {0, 0}, {2, 0}, + {{}, {}}); // Zero element slice: 0x2. 
- RunR2({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, - {0, 0}, {0, 2}, Array2D(0, 2)); + RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {0, 0}, {0, 2}, + Array2D(0, 2)); } - template + template void TestR3() { // R3 Shape: [2, 3, 2] // clang-format off // Slice at dimension start. - RunR3( - {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}, - {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}}, - {0, 0, 0}, {2, 1, 2}, - {{{1.0f, 2.0f}}, {{7.0f, 8.0f}}}); + RunR3( + {{{1, 2}, {3, 4}, {5, 6}}, + {{7, 8}, {9, 10}, {11, 12}}}, + {0, 0, 0}, {2, 1, 2}, + {{{1, 2}}, {{7, 8}}}); // Slice in the middle. - RunR3( - {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}, - {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}}, - {0, 1, 1}, {2, 2, 1}, - {{{4.0f}, {6.0f}}, {{10.0f}, {12.0f}}}); + RunR3( + {{{1, 2}, {3, 4}, {5, 6}}, + {{7, 8}, {9, 10}, {11, 12}}}, + {0, 1, 1}, {2, 2, 1}, + {{{4}, {6}}, {{10}, {12}}}); // Slice at dimension boundaries, but with sizes that cause indices to wrap. - RunR3( - {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}, - {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}}, - {0, 2, 1}, {2, 1, 2}, - {{{6.0f, 5.0f}}, {{12.0f, 11.0f}}}); + RunR3( + {{{1, 2}, {3, 4}, {5, 6}}, + {{7, 8}, {9, 10}, {11, 12}}}, + {0, 2, 1}, {2, 1, 2}, + {{{6, 5}}, {{12, 11}}}); // clang-format on } - template - void RunR1(const std::vector& input_values, + template + void RunR1(tensorflow::gtl::ArraySlice input_values, const std::vector slice_starts, const std::vector& slice_sizes, - const std::vector& expected_values) { + tensorflow::gtl::ArraySlice expected_values) { ComputationBuilder builder(client_, TestName()); // Initialize and transfer dynamic slice start indices parameter. ComputationDataHandle starts; std::unique_ptr start_data = CreateR1Parameter( slice_starts, 0, "slice_starts", &builder, &starts); // Build dynamic slice computation. - auto input = builder.ConstantR1(input_values); + auto input = builder.ConstantR1(input_values); builder.DynamicSlice(input, starts, slice_sizes); // Run computation and compare against expected values. - ComputeAndCompareR1(&builder, expected_values, {start_data.get()}, - ErrorSpec(0.000001)); + ComputeAndCompareR1(&builder, expected_values, {start_data.get()}); } - template - void RunR2(const Array2D& input_values, + template + void RunR2(const Array2D& input_values, const std::vector slice_starts, const std::vector& slice_sizes, - const Array2D& expected_values) { + const Array2D& expected_values) { ComputationBuilder builder(client_, TestName()); // Initialize and transfer dynamic slice start indices parameter. ComputationDataHandle starts; std::unique_ptr start_data = CreateR1Parameter( slice_starts, 0, "slice_starts", &builder, &starts); // Build dynamic slice computation. - auto input = builder.ConstantR2FromArray2D(input_values); + auto input = builder.ConstantR2FromArray2D(input_values); builder.DynamicSlice(input, starts, slice_sizes); // Run computation and compare against expected values. - ComputeAndCompareR2(&builder, expected_values, {start_data.get()}, - ErrorSpec(0.000001)); + ComputeAndCompareR2(&builder, expected_values, {start_data.get()}); } - template - void RunR3(const Array3D& input_values, + template + void RunR3(const Array3D& input_values, const std::vector slice_starts, const std::vector& slice_sizes, - const Array3D& expected_values) { + const Array3D& expected_values) { ComputationBuilder builder(client_, TestName()); // Initialize and transfer dynamic slice start indices parameter. 
ComputationDataHandle starts; std::unique_ptr start_data = CreateR1Parameter( slice_starts, 0, "slice_starts", &builder, &starts); // Build dynamic slice computation. - auto input = builder.ConstantR3FromArray3D(input_values); + auto input = builder.ConstantR3FromArray3D(input_values); builder.DynamicSlice(input, starts, slice_sizes); // Run computation and compare against expected values. - ComputeAndCompareR3(&builder, expected_values, {start_data.get()}, - ErrorSpec(0.000001)); + ComputeAndCompareR3(&builder, expected_values, {start_data.get()}); } }; -XLA_TEST_F(DynamicSliceTest, Int32R1) { TestR1(); } +XLA_TEST_F(DynamicSliceTest, Int32R1) { TestR1(); } -XLA_TEST_F(DynamicSliceTest, Int64R1) { TestR1(); } +XLA_TEST_F(DynamicSliceTest, Int64R1) { TestR1(); } -XLA_TEST_F(DynamicSliceTest, UInt64R1) { TestR1(); } +XLA_TEST_F(DynamicSliceTest, UInt64R1) { TestR1(); } -XLA_TEST_F(DynamicSliceTest, Int32R2) { TestR2(); } +XLA_TEST_F(DynamicSliceTest, Int32R2) { TestR2(); } -XLA_TEST_F(DynamicSliceTest, Int64R2) { TestR2(); } +XLA_TEST_F(DynamicSliceTest, Int64R2) { TestR2(); } -XLA_TEST_F(DynamicSliceTest, UInt64R2) { TestR2(); } +XLA_TEST_F(DynamicSliceTest, UInt64R2) { TestR2(); } -XLA_TEST_F(DynamicSliceTest, Int32R3) { TestR3(); } +XLA_TEST_F(DynamicSliceTest, Int32R3) { TestR3(); } -XLA_TEST_F(DynamicSliceTest, Int64R3) { TestR3(); } +XLA_TEST_F(DynamicSliceTest, Int64R3) { TestR3(); } -XLA_TEST_F(DynamicSliceTest, UInt64R3) { TestR3(); } +XLA_TEST_F(DynamicSliceTest, UInt64R3) { TestR3(); } + +XLA_TEST_F(DynamicSliceTest, Int32R1Pred) { + // Slice at dimension start. + RunR1({true, false, false, true, false, true, true, false}, {0}, + {5}, {true, false, false, true, false}); + // Slice in the middle. + RunR1({true, false, false, true, false, true, true, false}, {2}, + {3}, {false, true, false}); + // Slice at dimension boundaries. + RunR1({true, false, false, true, false, true, true, false}, {5}, + {3}, {true, true, false}); + // Zero element slice. + RunR1({true, false, false, true, false, true, true, false}, {2}, + {0}, {}); +} + +XLA_TEST_F(DynamicSliceTest, Int32R2Pred) { + // Slice at dimension start. + RunR2( + {{true, false, true}, {false, false, true}, {true, true, false}}, {0, 0}, + {2, 2}, {{true, false}, {false, false}}); + // Slice in the middle. + RunR2( + {{true, false, true}, {false, false, true}, {true, true, false}}, {1, 1}, + {2, 1}, {{false}, {true}}); + // Slice at dimension boundaries. + RunR2( + {{true, false, true}, {false, false, true}, {true, true, false}}, {1, 1}, + {2, 1}, {{false}, {true}}); + // Zero element slice: 2x0. + RunR2( + {{true, false, true}, {false, false, true}, {true, true, false}}, {0, 0}, + {2, 0}, {{}, {}}); + // Zero element slice: 0x2. + RunR2( + {{true, false, true}, {false, false, true}, {true, true, false}}, {0, 0}, + {0, 2}, Array2D(0, 2)); +} + +XLA_TEST_F(DynamicSliceTest, Int32R3Pred) { + // R3 Shape: [2, 3, 2] + // clang-format off + + // Slice at dimension start. + RunR3( + {{{true, false}, {false, true}, {true, true}}, + {{false, true}, {true, false}, {false, false}}}, + {0, 0, 0}, {2, 1, 2}, + {{{true, false}}, {{false, true}}}); + + // Slice in the middle. + RunR3( + {{{true, false}, {false, true}, {true, true}}, + {{false, true}, {true, false}, {false, false}}}, + {0, 1, 1}, {2, 2, 1}, + {{{true}, {true}}, {{false}, {false}}}); + + // clang-format on +} class DynamicUpdateSliceTest : public ClientLibraryTestBase { protected: - template + template void TestR1() { - // clang-format off // Slice at dimension start. 
- RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, - {8.0, 9.0, 10.0}, {0}, - {8.0, 9.0, 10.0, 3.0, 4.0, 5.0, 6.0, 7.0}); + RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {8, 9, 10}, {0}, + {8, 9, 10, 3, 4, 5, 6, 7}); // Slice in the middle. - RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, - {8.0, 9.0, 10.0}, {2}, - {0.0, 1.0, 8.0, 9.0, 10.0, 5.0, 6.0, 7.0}); + RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {8, 9, 10}, {2}, + {0, 1, 8, 9, 10, 5, 6, 7}); // Slice at dimension boundaries. - RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, - {8.0, 9.0, 10.0}, {5}, - {0.0, 1.0, 2.0, 3.0, 4.0, 8.0, 9.0, 10.0}); + RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {8, 9, 10}, {5}, + {0, 1, 2, 3, 4, 8, 9, 10}); // Slice at dimension boundaries, but with sizes that cause indices to wrap. - RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, - {8.0, 9.0, 10.0}, {6}, - {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 8.0, 9.0}); + RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {8, 9, 10}, {6}, + {0, 1, 2, 3, 4, 5, 8, 9}); // Zero-sized update. - RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, - {}, {2}, - {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}); - // clang-format on + RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {}, {2}, + {0, 1, 2, 3, 4, 5, 6, 7}); } - template + template void TestR2() { - // clang-format off // Slice at dimension start. - RunR2( - {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, - {{10.0f, 11.0f}}, {0, 0}, - {{10.0f, 11.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}); + RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {{10, 11}}, {0, 0}, + {{10, 11, 3}, {4, 5, 6}, {7, 8, 9}}); // Slice in the middle. - RunR2( - {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, - {{10.0f, 11.0f}}, {1, 1}, - {{1.0f, 2.0f, 3.0f}, {4.0f, 10.0f, 11.0f}, {7.0f, 8.0f, 9.0f}}); + RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {{10, 11}}, {1, 1}, + {{1, 2, 3}, {4, 10, 11}, {7, 8, 9}}); // Slice at dimension boundaries. - RunR2( - {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, - {{10.0f, 11.0f}}, {2, 1}, - {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 10.0f, 11.0f}}); + RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {{10, 11}}, {2, 1}, + {{1, 2, 3}, {4, 5, 6}, {7, 10, 11}}); // Slice at dimension boundaries, but with sizes that cause indices to wrap. - RunR2( - {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, - {{10.0f, 11.0f}}, {2, 2}, - {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 10.0f}}); + RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {{10, 11}}, {2, 2}, + {{1, 2, 3}, {4, 5, 6}, {7, 8, 10}}); // Zero-sized update. - RunR2( - {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, - {{}}, {2, 1}, - {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}); - // clang-format on + RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {{}}, {2, 1}, + {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); } - template + template void TestR3() { // R3 Shape: [2, 3, 2] - // clang-format off // Slice at dimension start. - RunR3( - {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}, - {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}}, - {{{13.0f, 14.0f}, {15.0f, 16.0f}}, - {{17.0f, 18.0f}, {19.0f, 20.0f}}}, - {0, 0, 0}, - {{{13.0f, 14.0f}, {15.0f, 16.0f}, {5.0f, 6.0f}}, - {{17.0f, 18.0f}, {19.0f, 20.0f}, {11.0f, 12.0f}}}); + RunR3( + {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}, + {{{13, 14}, {15, 16}}, {{17, 18}, {19, 20}}}, {0, 0, 0}, + {{{13, 14}, {15, 16}, {5, 6}}, {{17, 18}, {19, 20}, {11, 12}}}); // Slice in the middle. 
- RunR3( - {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}, - {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}}, - {{{13.0f}, {15.0f}}}, - {1, 1, 1}, - {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}, - {{7.0f, 8.0f}, {9.0f, 13.0f}, {11.0f, 15.0f}}}); + RunR3( + {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}, {{{13}, {15}}}, + {1, 1, 1}, {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 13}, {11, 15}}}); // Slice at dimension boundaries, but with sizes that cause indices to wrap. - RunR3( - {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}, - {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}}, - {{{13.0f}, {15.0f}}}, - {1, 2, 1}, - {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}, - {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 13.0f}}}); - // clang-format on + RunR3( + {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}, {{{13}, {15}}}, + {1, 2, 1}, {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 13}}}); } - template - void RunR1(const std::vector& input_values, - const std::vector& update_values, + template + void RunR1(tensorflow::gtl::ArraySlice input_values, + tensorflow::gtl::ArraySlice update_values, const std::vector slice_starts, - const std::vector& expected_values) { + tensorflow::gtl::ArraySlice expected_values) { ComputationBuilder builder(client_, TestName()); // Initialize and transfer dynamic slice start indices parameter. ComputationDataHandle starts; std::unique_ptr start_data = CreateR1Parameter( slice_starts, 0, "slice_starts", &builder, &starts); // Build dynamic slice computation. - auto input = builder.ConstantR1(input_values); - auto update = builder.ConstantR1(update_values); + auto input = builder.ConstantR1(input_values); + auto update = builder.ConstantR1(update_values); builder.DynamicUpdateSlice(input, update, starts); // Run computation and compare against expected values. - ComputeAndCompareR1(&builder, expected_values, {start_data.get()}, - ErrorSpec(0.000001)); + ComputeAndCompareR1(&builder, expected_values, {start_data.get()}); } - template - void RunR2(const Array2D& input_values, - const Array2D& update_values, + template + void RunR2(const Array2D& input_values, + const Array2D& update_values, const std::vector slice_starts, - const Array2D& expected_values) { + const Array2D& expected_values) { ComputationBuilder builder(client_, TestName()); // Initialize and transfer dynamic slice start indices parameter. ComputationDataHandle starts; std::unique_ptr start_data = CreateR1Parameter( slice_starts, 0, "slice_starts", &builder, &starts); // Build dynamic slice computation. - auto input = builder.ConstantR2FromArray2D(input_values); - auto update = builder.ConstantR2FromArray2D(update_values); + auto input = builder.ConstantR2FromArray2D(input_values); + auto update = builder.ConstantR2FromArray2D(update_values); builder.DynamicUpdateSlice(input, update, starts); // Run computation and compare against expected values. - ComputeAndCompareR2(&builder, expected_values, {start_data.get()}, - ErrorSpec(0.000001)); + ComputeAndCompareR2(&builder, expected_values, {start_data.get()}); } - template - void RunR3(const Array3D& input_values, - const Array3D& update_values, + template + void RunR3(const Array3D& input_values, + const Array3D& update_values, const std::vector slice_starts, - const Array3D& expected_values) { + const Array3D& expected_values) { ComputationBuilder builder(client_, TestName()); // Initialize and transfer dynamic slice start indices parameter. 
ComputationDataHandle starts; std::unique_ptr start_data = CreateR1Parameter( slice_starts, 0, "slice_starts", &builder, &starts); // Build dynamic slice computation. - auto input = builder.ConstantR3FromArray3D(input_values); - auto update = builder.ConstantR3FromArray3D(update_values); + auto input = builder.ConstantR3FromArray3D(input_values); + auto update = builder.ConstantR3FromArray3D(update_values); builder.DynamicUpdateSlice(input, update, starts); // Run computation and compare against expected values. - ComputeAndCompareR3(&builder, expected_values, {start_data.get()}, - ErrorSpec(0.000001)); + ComputeAndCompareR3(&builder, expected_values, {start_data.get()}); } void RunR3Contiguous(std::vector operand_shape, int32 index, @@ -393,23 +408,81 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase { } }; -XLA_TEST_F(DynamicUpdateSliceTest, Int32R1) { TestR1(); } +XLA_TEST_F(DynamicUpdateSliceTest, Int32R1) { TestR1(); } -XLA_TEST_F(DynamicUpdateSliceTest, Int64R1) { TestR1(); } +XLA_TEST_F(DynamicUpdateSliceTest, Int64R1) { TestR1(); } -XLA_TEST_F(DynamicUpdateSliceTest, UInt64R1) { TestR1(); } +XLA_TEST_F(DynamicUpdateSliceTest, UInt64R1) { TestR1(); } -XLA_TEST_F(DynamicUpdateSliceTest, Int32R2) { TestR2(); } +XLA_TEST_F(DynamicUpdateSliceTest, Int32R2) { TestR2(); } -XLA_TEST_F(DynamicUpdateSliceTest, Int64R2) { TestR2(); } +XLA_TEST_F(DynamicUpdateSliceTest, Int64R2) { TestR2(); } -XLA_TEST_F(DynamicUpdateSliceTest, UInt64R2) { TestR2(); } +XLA_TEST_F(DynamicUpdateSliceTest, UInt64R2) { TestR2(); } -XLA_TEST_F(DynamicUpdateSliceTest, Int32R3) { TestR3(); } +XLA_TEST_F(DynamicUpdateSliceTest, Int32R3) { TestR3(); } -XLA_TEST_F(DynamicUpdateSliceTest, Int64R3) { TestR3(); } +XLA_TEST_F(DynamicUpdateSliceTest, Int64R3) { TestR3(); } -XLA_TEST_F(DynamicUpdateSliceTest, UInt64R3) { TestR3(); } +XLA_TEST_F(DynamicUpdateSliceTest, UInt64R3) { TestR3(); } + +XLA_TEST_F(DynamicUpdateSliceTest, Int32R1Pred) { + // Slice at dimension start. + RunR1({false, false, true, true, false, true, true, false}, + {true, true, false}, {0}, + {true, true, false, true, false, true, true, false}); + // Slice in the middle. + RunR1({false, false, true, true, false, true, true, false}, + {false, true, true}, {2}, + {false, false, false, true, true, true, true, false}); + // Slice at dimension boundaries. + RunR1({false, false, true, true, false, true, true, false}, + {false, true, true}, {5}, + {false, false, true, true, false, false, true, true}); + // Zero-sized update. + RunR1({false, false, true, true, false, true, true, false}, {}, + {2}, {false, false, true, true, false, true, true, false}); +} + +XLA_TEST_F(DynamicUpdateSliceTest, Int32R2Pred) { + // Slice at dimension start. + RunR2( + {{false, true, false}, {true, false, true}, {false, true, true}}, + {{true, false}}, {0, 0}, + {{true, false, false}, {true, false, true}, {false, true, true}}); + // Slice in the middle. + RunR2( + {{false, true, false}, {true, false, true}, {false, true, true}}, + {{true, false}}, {1, 1}, + {{false, true, false}, {true, true, false}, {false, true, true}}); + // Slice at dimension boundaries. + RunR2( + {{false, true, false}, {true, false, true}, {false, true, true}}, + {{true, false}}, {2, 1}, + {{false, true, false}, {true, false, true}, {false, true, false}}); + // Zero-sized update. 
+ RunR2( + {{false, true, false}, {true, false, true}, {false, true, true}}, {{}}, + {2, 1}, {{false, true, false}, {true, false, true}, {false, true, true}}); +} + +XLA_TEST_F(DynamicUpdateSliceTest, Int32R3Pred) { + // R3 Shape: [2, 3, 2] + // Slice at dimension start. + RunR3( + {{{true, false}, {false, true}, {true, true}}, + {{false, false}, {false, true}, {true, false}}}, + {{{false, true}, {true, false}}, {{true, true}, {false, true}}}, + {0, 0, 0}, + {{{false, true}, {true, false}, {true, true}}, + {{true, true}, {false, true}, {true, false}}}); + // Slice in the middle. + RunR3({{{true, false}, {false, true}, {true, true}}, + {{false, false}, {false, true}, {true, false}}}, + {{{false}, {true}}}, {1, 1, 1}, + {{{true, false}, {false, true}, {true, true}}, + {{false, false}, {false, false}, {true, true}}}); +} // Tests for simple R3 case where the update is contiguous (i.e. the minor // two dimensions are not sliced). diff --git a/tensorflow/compiler/xla/tests/reduce_precision_test.cc b/tensorflow/compiler/xla/tests/reduce_precision_test.cc index 48212dc7d18..527205bbb0d 100644 --- a/tensorflow/compiler/xla/tests/reduce_precision_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_precision_test.cc @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -39,8 +40,11 @@ limitations under the License. namespace xla { namespace { -class ReducePrecisionTest : public ClientLibraryTestBase, - public ::testing::WithParamInterface {}; +// Tests to confirm that the ReducePrecision operation produces the expected +// numerical values. +class ReducePrecisionAccuracyTest : public ClientLibraryTestBase, + public ::testing::WithParamInterface { +}; // For reduction to IEEE-f16, we want to test the following cases, in both // positive and negative variants. (Note: IEEE-f16 is 5 exponent bits and 10 @@ -201,7 +205,7 @@ static const uint32_t test_values[][4] = { FPVAL(11111111, 1111111111, 1111111111111) // NaN }}; -XLA_TEST_P(ReducePrecisionTest, ReducePrecisionF32) { +XLA_TEST_P(ReducePrecisionAccuracyTest, ReducePrecisionF32) { int index = GetParam(); int exponent_bits = exponent_sizes[index]; int mantissa_bits = mantissa_sizes[index]; @@ -238,9 +242,87 @@ XLA_TEST_P(ReducePrecisionTest, ReducePrecisionF32) { ComputeAndCompareR1(&builder, expected_values, {a_data.get()}); } -INSTANTIATE_TEST_CASE_P(ReducePrecisionTest, ReducePrecisionTest, +INSTANTIATE_TEST_CASE_P(ReducePrecisionAccuracyTest, + ReducePrecisionAccuracyTest, ::testing::Values(0, 1, 2, 3), TestDataToString); +// Tests to confirm that the compiler optimization functions add the expected +// ReducePrecisionInsertion passes. +class ReducePrecisionInsertionTest : public ClientLibraryTestBase {}; + +XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionBeforeFusion) { + ComputationBuilder builder(client_, TestName()); + + std::unique_ptr a_literal = Literal::CreateR1({1.00001}); + std::unique_ptr a_data = + client_->TransferToServer(*a_literal).ConsumeValueOrDie(); + auto a = builder.Parameter(0, a_literal->shape(), "a"); + + // Abs doesn't affect resolution. 
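  // Why the expected result below is exactly 0.0f: with 10 mantissa bits the
  // representable values near 1.0 are spaced 2^-10 apart, so any input within
  // 2^-11 (~4.9e-4) of 1.0 -- including 1.00001f -- rounds to exactly 1.0f
  // once the reduce-precision op runs, and Log(1.0f) is 0.0f.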
+ auto abs = builder.Abs(a); + + // Near 1.0, Log(x) approximates x - 1; this lets us confirm that the + // reduce-precision operation showed up in the correct place in the + // graph. + auto log = builder.Log(abs); + + // Insert precision-reduction after the Abs(x) operation, rounding that + // result to exactly 1.0f. + auto reduce_precision_pass = execution_options_.mutable_debug_options() + ->add_hlo_reduce_precision_options(); + *reduce_precision_pass = ReducePrecisionInsertion::make_options_proto( + HloReducePrecisionOptions::BEFORE_OP_FUSION, 5, 10, + [](const HloOpcode opcode) { return opcode == HloOpcode::kAbs; }); + + ComputeAndCompareR1(&builder, {0.0f}, {a_data.get()}); +} + +XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedAfterFusion) { + ComputationBuilder builder(client_, TestName()); + + std::unique_ptr a_literal = Literal::CreateR1({1.00001}); + std::unique_ptr a_data = + client_->TransferToServer(*a_literal).ConsumeValueOrDie(); + auto a = builder.Parameter(0, a_literal->shape(), "a"); + + // These two operations should be fused by any reasonable backend. + auto abs = builder.Abs(a); + auto neg = builder.Neg(abs); + + // Add a pass after operation fusion, suffixing kAbs operations. This + // should not see into the fusion nodes and thus should not affect the + // result. + auto reduce_precision_pass = execution_options_.mutable_debug_options() + ->add_hlo_reduce_precision_options(); + *reduce_precision_pass = ReducePrecisionInsertion::make_options_proto( + HloReducePrecisionOptions::AFTER_OP_FUSION, 5, 10, + [](const HloOpcode opcode) { return opcode == HloOpcode::kAbs; }); + + ComputeAndCompareR1(&builder, {-1.00001f}, {a_data.get()}); +} + +XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionAddedAfterFusion) { + ComputationBuilder builder(client_, TestName()); + + std::unique_ptr a_literal = Literal::CreateR1({1.00001}); + std::unique_ptr a_data = + client_->TransferToServer(*a_literal).ConsumeValueOrDie(); + auto a = builder.Parameter(0, a_literal->shape(), "a"); + + // These two operations should be fused by any reasonable backend. + auto abs = builder.Abs(a); + auto neg = builder.Neg(abs); + + // Add a pass after operation fusion, suffixing kFusion operations. + auto reduce_precision_pass = execution_options_.mutable_debug_options() + ->add_hlo_reduce_precision_options(); + *reduce_precision_pass = ReducePrecisionInsertion::make_options_proto( + HloReducePrecisionOptions::AFTER_OP_FUSION, 5, 10, + [](const HloOpcode opcode) { return opcode == HloOpcode::kFusion; }); + + ComputeAndCompareR1(&builder, {-1.0f}, {a_data.get()}); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index 00fb7f12b85..be4e00f63cc 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -20,6 +20,24 @@ import "tensorflow/compiler/xla/service/session.proto"; package xla; +// Options for the HLO insert-reduce-precision-operations pass. +message HloReducePrecisionOptions { + // When to run the pass. + enum PassTiming { + BEFORE_OP_FUSION = 0; + AFTER_OP_FUSION = 1; + } + PassTiming pass_timing = 1; + + // Exponent and mantissa bit counts for the reduced precision. + uint32 exponent_bits = 2; + uint32 mantissa_bits = 3; + + // Opcodes for operations that should be suffixed with reduced-precision + // operations. + repeated uint32 opcodes_to_suffix = 4; +} + // Debugging options for XLA. 
These options may change at any time - there are // no guarantees about backward or forward compatibility for these fields. message DebugOptions { @@ -112,6 +130,11 @@ message DebugOptions { // the generated IR. bool xla_llvm_enable_invariant_load_metadata = 72; + // Options for inserting reduce-precision operations for numerical + // experimentation. This is a repeated field, as we may want to have + // multiple passes with different parameters. + repeated HloReducePrecisionOptions hlo_reduce_precision_options = 80; + // This is used by ClientLibraryTestBase::ComputeAndCompare*. If true, the // computation will run n! times with all permunations of layouts for the // output shape in rank n. For example, with a 3D shape, all permutations of diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt index d3cc61ce290..f1f6144acd4 100644 --- a/tensorflow/contrib/makefile/tf_op_files.txt +++ b/tensorflow/contrib/makefile/tf_op_files.txt @@ -199,6 +199,7 @@ tensorflow/core/kernels/aggregate_ops.cc tensorflow/core/kernels/depthwise_conv_op.cc tensorflow/core/kernels/dequantize_op.cc tensorflow/core/kernels/meta_support.cc +tensorflow/core/kernels/population_count_op.cc tensorflow/core/kernels/quantization_utils.cc tensorflow/core/kernels/quantize_down_and_shrink_range.cc tensorflow/core/kernels/quantize_op.cc diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py index 06954f51d8e..c14463bdad2 100644 --- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py +++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py @@ -210,7 +210,7 @@ class RNNCellTest(test.TestCase): sess.run([variables_lib.global_variables_initializer()]) sess.run([g, out_m], {x.name: 1 * np.ones([batch_size, input_size]), - m.name: 0.1 * np.ones([batch_size - 1, state_size])}) + m.name: 0.1 * np.ones([batch_size - 1, state_size])}) def testBasicLSTMCellStateSizeError(self): """Tests that state_size must be num_units * 2.""" @@ -218,7 +218,7 @@ class RNNCellTest(test.TestCase): with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): num_units = 2 - state_size = num_units * 3 # state_size must be num_units * 2 + state_size = num_units * 3 # state_size must be num_units * 2 batch_size = 3 input_size = 4 x = array_ops.zeros([batch_size, input_size]) @@ -406,6 +406,31 @@ class RNNCellTest(test.TestCase): # States are left untouched self.assertAllClose(res[2], res[3]) + def testResidualWrapperWithSlice(self): + with self.test_session() as sess: + with variable_scope.variable_scope( + "root", initializer=init_ops.constant_initializer(0.5)): + x = array_ops.zeros([1, 5]) + m = array_ops.zeros([1, 3]) + base_cell = rnn_cell_impl.GRUCell(3) + g, m_new = base_cell(x, m) + variable_scope.get_variable_scope().reuse_variables() + def residual_with_slice_fn(inp, out): + inp_sliced = array_ops.slice(inp, [0, 0], [-1, 3]) + return inp_sliced + out + g_res, m_new_res = rnn_cell_impl.ResidualWrapper( + base_cell, residual_with_slice_fn)(x, m) + sess.run([variables_lib.global_variables_initializer()]) + res_g, res_g_res, res_m_new, res_m_new_res = sess.run( + [g, g_res, m_new, m_new_res], { + x: np.array([[1., 1., 1., 1., 1.]]), + m: np.array([[0.1, 0.1, 0.1]]) + }) + # Residual connections + self.assertAllClose(res_g_res, res_g + [1., 1., 1.]) + # States are left untouched + self.assertAllClose(res_m_new, res_m_new_res) + def testDeviceWrapper(self): 
with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): diff --git a/tensorflow/contrib/signal/python/kernel_tests/spectral_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/spectral_ops_test.py index be28184ae64..61b7107a172 100644 --- a/tensorflow/contrib/signal/python/kernel_tests/spectral_ops_test.py +++ b/tensorflow/contrib/signal/python/kernel_tests/spectral_ops_test.py @@ -220,15 +220,14 @@ class SpectralOpsTest(test.TestCase): # stft_bound, inverse_stft_bound). # TODO(rjryan): Investigate why STFT gradient error is so high. test_configs = [ - (512, 64, 32, 64, 2e-3, 3e-5), - (512, 64, 64, 64, 2e-3, 3e-5), - (512, 64, 25, 64, 2e-3, 3e-5), - (512, 25, 15, 36, 2e-3, 3e-5), - (123, 23, 5, 42, 2e-3, 4e-5), + (64, 16, 8, 16), + (64, 16, 16, 16), + (64, 16, 7, 16), + (64, 7, 4, 9), + (29, 5, 1, 10), ] - for (signal_length, frame_length, frame_step, fft_length, - stft_bound, inverse_stft_bound) in test_configs: + for (signal_length, frame_length, frame_step, fft_length) in test_configs: signal_shape = [signal_length] signal = random_ops.random_uniform(signal_shape) stft_shape = [max(0, 1 + (signal_length - frame_length) // frame_step), @@ -242,8 +241,8 @@ class SpectralOpsTest(test.TestCase): stft, stft_shape) inverse_stft_error = test.compute_gradient_error( stft, stft_shape, inverse_stft, inverse_stft_shape) - self.assertLess(stft_error, stft_bound) - self.assertLess(inverse_stft_error, inverse_stft_bound) + self.assertLess(stft_error, 2e-3) + self.assertLess(inverse_stft_error, 4e-5) if __name__ == "__main__": diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py index 712871cc04a..b6fa185709d 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py @@ -54,9 +54,12 @@ def _tpu_job(run_config): return None if run_config.master in ['', 'local'] else 'tpu_worker' -def _per_shard_batch_size(global_batch_size, run_config): +def _per_shard_batch_size(global_batch_size, run_config, use_tpu): """Returns the batch size for each shard.""" - return global_batch_size // run_config.tpu_config.num_shards + if use_tpu: + return global_batch_size // run_config.tpu_config.num_shards + else: + return global_batch_size class _SIGNAL(object): @@ -470,7 +473,7 @@ class _ModelFnWrapper(object): self._train_batch_size = train_batch_size def call_without_tpu(self, features, labels): - return self._call_model_fn(features, labels) + return self._call_model_fn(features, labels, False) def convert_to_single_tpu_train_step(self, dequeue_fn): """Converts the `model_fn` as a single train step on TPU.""" @@ -481,8 +484,8 @@ class _ModelFnWrapper(object): features, labels = dequeue_fn() # Makes deep copy with `config` and params` in case user mutates them. 
- estimator_spec = self._verify_estimator_spec(self._call_model_fn( - features, labels, add_batch_size_in_params=True)) + estimator_spec = self._verify_estimator_spec( + self._call_model_fn(features, labels, True)) loss, train_op = estimator_spec.loss, estimator_spec.train_op with ops.control_dependencies([train_op]): return array_ops.identity(loss) @@ -492,7 +495,7 @@ class _ModelFnWrapper(object): def config(self): return self._config - def _call_model_fn(self, features, labels, add_batch_size_in_params=False): + def _call_model_fn(self, features, labels, use_tpu): """Calls the model_fn with required parameters.""" model_fn_args = util.fn_args(self._model_fn) kwargs = {} @@ -513,16 +516,15 @@ class _ModelFnWrapper(object): if 'params' in model_fn_args: kwargs['params'] = params - if add_batch_size_in_params: - if 'params' not in model_fn_args: - raise ValueError( - 'model_fn ({}) does not include params argument, ' - 'required by TPUEstimator to pass batch size as ' - 'params[\'batch_size\']'.format(self._model_fn)) - if self._mode == model_fn_lib.ModeKeys.TRAIN: - # For TPU training. `params` is never `None`. - params[_BATCH_SIZE_KEY] = _per_shard_batch_size(self._train_batch_size, - config) + if 'params' not in model_fn_args: + raise ValueError( + 'model_fn ({}) does not include params argument, ' + 'required by TPUEstimator to pass batch size as ' + 'params[\'batch_size\']'.format(self._model_fn)) + if self._mode == model_fn_lib.ModeKeys.TRAIN: + # For TPU training. `params` is never `None`. + params[_BATCH_SIZE_KEY] = _per_shard_batch_size( + self._train_batch_size, config, use_tpu) return self._model_fn(features=features, **kwargs) @@ -609,16 +611,12 @@ class TPUEstimator(estimator_lib.Estimator): 'batch size {} must be divisible by number of shards {}' .format(train_batch_size, config.tpu_config.num_shards)) - if use_tpu: - # Verifies the model_fn signature according to Estimator framework. - estimator_lib._verify_model_fn_args(model_fn, params) # pylint: disable=protected-access - # We cannot store config and params in this constructor as parent - # constructor might change them, such as assigning a temp dir for - # config.model_dir. - model_function = augment_model_fn_with_tpu_support( - model_fn, train_batch_size) - else: - model_function = model_fn + # Verifies the model_fn signature according to Estimator framework. + estimator_lib._verify_model_fn_args(model_fn, params) # pylint: disable=protected-access + # We cannot store config and params in this constructor as parent + # constructor might change them, such as assigning a temp dir for + # config.model_dir. + model_function = _augment_model_fn(model_fn, train_batch_size, use_tpu) super(TPUEstimator, self).__init__( model_fn=model_function, @@ -670,9 +668,6 @@ class TPUEstimator(estimator_lib.Estimator): Raises: ValueError: if input_fn takes invalid arguments or does not have `params`. """ - if not self._use_tpu or mode != model_fn_lib.ModeKeys.TRAIN: - return super(TPUEstimator, self)._call_input_fn(input_fn, mode) - input_fn_args = util.fn_args(input_fn) config = self.config # a deep copy. kwargs = {} @@ -686,8 +681,13 @@ class TPUEstimator(estimator_lib.Estimator): kwargs['config'] = config # Now for TPU training. 
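    # _per_shard_batch_size (defined above) divides the global train_batch_size
    # by config.tpu_config.num_shards when use_tpu is set and passes it through
    # unchanged otherwise, so params['batch_size'] always holds the size a
    # single shard (or the single non-TPU worker) should read per step.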
- per_shard_batch_size = _per_shard_batch_size(self._train_batch_size, config) - kwargs['params'][_BATCH_SIZE_KEY] = per_shard_batch_size + if mode == model_fn_lib.ModeKeys.TRAIN: + kwargs['params'][_BATCH_SIZE_KEY] = ( + _per_shard_batch_size(self._train_batch_size, config, self._use_tpu)) + + if not self._use_tpu or mode != model_fn_lib.ModeKeys.TRAIN: + with ops.device('/cpu:0'): + return input_fn(**kwargs) job = _tpu_job(config) def placement_function(index): @@ -746,7 +746,7 @@ def _create_infeed_enqueue_ops_and_dequeue_fn(inputs_holder): return (dequeue_fn, enqueue_fn) -def augment_model_fn_with_tpu_support(model_fn, train_batch_size): +def _augment_model_fn(model_fn, train_batch_size, use_tpu): """Returns a new model_fn, which wraps the TPU support.""" def _model_fn(features, labels, mode, config, params): @@ -755,7 +755,7 @@ def augment_model_fn_with_tpu_support(model_fn, train_batch_size): train_batch_size) # TODO(jhseu): Move to EVAL and PREDICT to TPU. - if mode != model_fn_lib.ModeKeys.TRAIN: + if not use_tpu or mode != model_fn_lib.ModeKeys.TRAIN: return model_fn_wrapper.call_without_tpu(features, labels) inputs = _InputsHolder(sharded_features=features, sharded_labels=labels) diff --git a/tensorflow/core/framework/attr_value_util.cc b/tensorflow/core/framework/attr_value_util.cc index 9fdb3da6a0d..95cafa24b19 100644 --- a/tensorflow/core/framework/attr_value_util.cc +++ b/tensorflow/core/framework/attr_value_util.cc @@ -15,7 +15,9 @@ limitations under the License. #include "tensorflow/core/framework/attr_value_util.h" +#include #include + #include "tensorflow/core/framework/attr_value.pb_text.h" #include "tensorflow/core/framework/tensor.pb_text.h" #include "tensorflow/core/framework/tensor_shape.pb.h" @@ -27,7 +29,6 @@ limitations under the License. #include "tensorflow/core/platform/protobuf.h" namespace tensorflow { - namespace { string SummarizeString(const string& str) { @@ -460,7 +461,8 @@ bool HasPlaceHolder(const AttrValue& val) { return false; } -bool SubstitutePlaceholders(SubstituteFunc substitute, AttrValue* value) { +bool SubstitutePlaceholders(const SubstituteFunc& substitute, + AttrValue* value) { switch (value->value_case()) { case AttrValue::kList: { for (NameAttrList& func : *value->mutable_list()->mutable_func()) { diff --git a/tensorflow/core/framework/attr_value_util.h b/tensorflow/core/framework/attr_value_util.h index 08cc3b7158e..08d813bb6f9 100644 --- a/tensorflow/core/framework/attr_value_util.h +++ b/tensorflow/core/framework/attr_value_util.h @@ -16,8 +16,10 @@ limitations under the License. #ifndef TENSORFLOW_FRAMEWORK_ATTR_VALUE_UTIL_H_ #define TENSORFLOW_FRAMEWORK_ATTR_VALUE_UTIL_H_ +#include #include #include + #include "tensorflow/core/framework/attr_value.pb.h" // TODO(62899350): Remove #include "tensorflow/core/framework/partial_tensor_shape.h" #include "tensorflow/core/framework/tensor.h" @@ -100,8 +102,8 @@ bool HasPlaceHolder(const AttrValue& val); // SubstituteFunc is given a placeholder string. If the placeholder is // unknown, SubstituteFunc returns false. Otherwise, overwrites the // attr value and returns true. 
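// For example, a callback that resolves the placeholder "T" to DT_FLOAT
// (a sketch; per the description above, the callback receives the placeholder
// string and an AttrValue* to overwrite, i.e. bool(const string&, AttrValue*)):
//
//   SubstituteFunc substitute = [](const string& placeholder, AttrValue* out) {
//     if (placeholder != "T") return false;  // unknown placeholder
//     out->set_type(DT_FLOAT);               // overwrite the attr value
//     return true;
//   };
//   AttrValue val;
//   val.set_placeholder("T");
//   SubstitutePlaceholders(substitute, &val);  // val now carries DT_FLOAT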
-typedef std::function SubstituteFunc; -bool SubstitutePlaceholders(SubstituteFunc substitute, AttrValue* value); +using SubstituteFunc = std::function; +bool SubstitutePlaceholders(const SubstituteFunc& substitute, AttrValue* value); } // namespace tensorflow diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc index db279ae67f3..7f845bb9e2c 100644 --- a/tensorflow/core/grappler/optimizers/constant_folding.cc +++ b/tensorflow/core/grappler/optimizers/constant_folding.cc @@ -104,7 +104,8 @@ ConstantFolding::ConstantFolding() { ops_to_preserve_ = std::regex( "Placeholder.*|Const|.*Save.*|.*Restore.*|.*Reader|" "Enter|RefEnter|Exit|RefExit|NextIteration|RefNextIteration|" - ".*Quantized.*"); + ".*Quantized.*", + std::regex_constants::optimize); } string ConstantFolding::AddControlDependency(const string& input_name) { @@ -240,13 +241,18 @@ Status ConstantFolding::MaterializeShapes(const GrapplerItem& item, } bool ConstantFolding::IsFoldable(const NodeDef& node) const { + // Folding not applicable to ops with no inputs. + if (node.input().empty()) { + return false; + } + // Skips nodes that must be preserved, and op_types that don't benefit from // folding if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) { return false; } - std::cmatch match; - if (std::regex_match(node.op().c_str(), match, ops_to_preserve_)) { + if (std::regex_match(node.op().c_str(), ops_to_preserve_, + std::regex_constants::match_any)) { return false; } @@ -264,23 +270,6 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const { return false; } - DeviceTypeVector device_types; - status = SupportedDeviceTypesForNode({DeviceType(DEVICE_CPU)}, node, - &device_types); - if (!status.ok()) { - return false; - } - // Only fold ops with a CPU implementation available. - if (device_types.empty()) { - return false; - } - DCHECK_EQ(DeviceType(DEVICE_CPU), device_types[0]); - - // Folding not applicable to ops with no inputs. - if (node.input().empty()) { - return false; - } - // No need to (and don't) fold nodes that have no outgoing edges. Such nodes // could be introduced by an earlier constant folding pass and are preserved // in case users want to fetch their values; re-processing them would @@ -391,12 +380,15 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node, // Control dependency break; } - // There should be a single output since the input node should be a constant - // node. 
- TensorVector output; - TF_RETURN_IF_ERROR( - EvaluateNode(*node_map_->GetNode(input), TensorVector(), &output)); - inputs.push_back(output[position]); + const NodeDef* input_node = node_map_->GetNode(input); + if (!IsConstant(*input_node)) { + return Status(error::INVALID_ARGUMENT, + strings::StrCat("Can't fold ", node.name(), ", its ", input, + " isn't constant")); + } + Tensor* value = new Tensor(input_node->attr().at("dtype").type()); + CHECK(value->FromProto(input_node->attr().at("value").tensor())); + inputs.emplace_back(value); } TensorVector output_tensors; @@ -583,24 +575,31 @@ Status ConstantFolding::FoldNode(const NodeDef& node, GraphDef* output) { Status ConstantFolding::FoldGraph(GraphDef* output) { std::unordered_set processed_nodes; - int previously_processed = 0; - do { - previously_processed = processed_nodes.size(); - for (const auto& node : graph_.node()) { - if (IsFoldable(node) && - processed_nodes.find(node.name()) == processed_nodes.end()) { - Status s = FoldNode(node, output); - if (!s.ok()) { - VLOG(1) << "Failed to fold node " << node.name() << ": " << s; + std::deque queue; + for (const auto& node : graph_.node()) { + if (IsFoldable(node)) { + queue.push_back(&node); + } + } + while (!queue.empty()) { + const NodeDef* node = queue.front(); + queue.pop_front(); + if (processed_nodes.count(node->name())) { + continue; + } + Status s = FoldNode(*node, output); + processed_nodes.insert(node->name()); + if (!s.ok()) { + VLOG(1) << "Failed to fold node " << node->name() << ": " << s; + } else { + auto outputs = node_map_->GetOutputs(node->name()); + for (auto& output : outputs) { + if (IsFoldable(*output)) { + queue.push_back(output); } - processed_nodes.insert(node.name()); } } - // Try again as long as we find new constants. In most cases, this loop will - // only run once since the graph is already in topological order. - VLOG(1) << "Folded " << processed_nodes.size() - previously_processed - << " nodes in this pass"; - } while (previously_processed != processed_nodes.size()); + } // Build the graph after constant folding. Note that we keep all processed // nodes in the graph in case users need to fetch their values. @@ -740,7 +739,6 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output, Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item, GraphDef* output) { graph_ = item.graph; - LOG(INFO) << "Initial graph size: " << item.graph.node_size(); node_map_.reset(new NodeMap(&graph_)); for (const auto& node : item.fetch) { nodes_to_preserve_.insert(NodeName(node)); @@ -761,7 +759,6 @@ Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item, TF_RETURN_IF_ERROR(FoldGraph(output)); TF_RETURN_IF_ERROR(SimplifyGraph(output, properties)); - LOG(INFO) << "Optimized graph size: " << output->node_size(); *output->mutable_library() = item.graph.library(); *output->mutable_versions() = item.graph.versions(); diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index fffcb980db2..a493452777f 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -701,6 +701,39 @@ tf_kernel_library( deps = ARRAY_DEPS, ) +tf_kernel_library( + name = "compare_and_bitpack_op", + srcs = ["compare_and_bitpack_op.cc"], + hdrs = ["compare_and_bitpack_op.h"], + gpu_srcs = [ + "compare_and_bitpack_op.h", + "compare_and_bitpack_op_gpu.cu.cc", + ], + deps = ARRAY_DEPS, +) + +# TODO(ebrevdo): Add benchmarks once the op is in the autogen array namespace. 
+# tf_cuda_cc_test( +# name = "compare_and_bitpack_op_test", +# srcs = ["compare_and_bitpack_op_test.cc"], +# deps = [ +# ":array", +# ":ops_testutil", +# ":ops_util", +# "//third_party/eigen3", +# "//tensorflow/cc:cc_ops", +# "//tensorflow/cc:cc_ops_internal", +# "//tensorflow/core:core_cpu", +# "//tensorflow/core:core_cpu_internal", +# "//tensorflow/core:framework", +# "//tensorflow/core:lib", +# "//tensorflow/core:protos_all_cc", +# "//tensorflow/core:test", +# "//tensorflow/core:test_main", +# "//tensorflow/core:testlib", +# ], +# ) + tf_kernel_library( name = "reshape_op", prefix = "reshape_op", @@ -2344,10 +2377,12 @@ cc_library( ":bucketize_op", ":cast_op", ":check_numerics_op", + ":compare_and_bitpack_op", ":cross_op", ":cwise_op", ":fft_ops", ":matmul_op", + ":population_count_op", ":reduction_ops", ":scan_ops", ":segment_reduction_ops", @@ -2409,6 +2444,12 @@ tf_kernel_library( deps = MATH_DEPS + ["//tensorflow/core:bitwise_ops_op_lib"], ) +tf_kernel_library( + name = "population_count_op", + prefix = "population_count_op", + deps = MATH_DEPS, +) + tf_kernel_library( name = "fft_ops", prefix = "fft_ops", @@ -4292,6 +4333,8 @@ filegroup( "fake_quant_ops.cc", "fifo_queue.cc", "fused_batch_norm_op.cc", + "population_count_op.cc", + "population_count_op.h", "winograd_transform.h", ":android_extended_ops_headers", ] + select({ diff --git a/tensorflow/core/kernels/compare_and_bitpack_op.cc b/tensorflow/core/kernels/compare_and_bitpack_op.cc new file mode 100644 index 00000000000..9f626a274a4 --- /dev/null +++ b/tensorflow/core/kernels/compare_and_bitpack_op.cc @@ -0,0 +1,185 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// See docs in ../ops/math_ops.cc + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/compare_and_bitpack_op.h" + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template +class CompareAndBitpackOp : public OpKernel { + public: + explicit CompareAndBitpackOp(OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(OpKernelContext* c) override { + const Tensor& input_t = c->input(0); + const Tensor& threshold_t = c->input(1); + OP_REQUIRES( + c, TensorShapeUtils::IsScalar(threshold_t.shape()), + errors::InvalidArgument("Compare must be a scalar, but saw shape: ", + threshold_t.shape().DebugString())); + const TensorShape& input_shape = input_t.shape(); + OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(input_shape), + errors::InvalidArgument( + "Input should be at least a vector, but saw a scalar.")); + OP_REQUIRES(c, input_shape.dim_size(input_shape.dims() - 1) % 8 == 0, + errors::InvalidArgument( + "Inner dimension of input should be " + "divisible by ", + 8, ", but saw shape: ", input_shape.DebugString())); + + TensorShape output_shape = input_shape; + int rank = input_shape.dims(); + output_shape.set_dim(rank - 1, input_shape.dim_size(rank - 1) / 8); + + Tensor* output_t; + OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output_t)); + + auto input = input_t.flat_inner_dims(); + auto threshold = threshold_t.scalar(); + auto output = output_t->flat_inner_dims(); + + functor::CompareAndBitpack func; + func(c, input, threshold, output); + } +}; + +#define REGISTER_COMPARE_AND_BITPACK(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("CompareAndBitpack").Device(DEVICE_CPU).TypeConstraint("T"), \ + CompareAndBitpackOp); + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_COMPARE_AND_BITPACK); +TF_CALL_bool(REGISTER_COMPARE_AND_BITPACK); + +#undef REGISTER_COMPARE_AND_BITPACK + +namespace functor { + +template +struct ComputeShard { + static EIGEN_STRONG_INLINE void Compute(typename TTypes::ConstMatrix input, + typename TTypes::Matrix output, + const T& thresh, int64 start, + int64 limit) { + for (int64 i = start; i < limit; ++i) { + uint8* out = output.data() + i; + const T* block = input.data() + 8 * i; + *out = ((((block[0] > thresh) << 7)) | (((block[1] > thresh) << 6)) | + (((block[2] > thresh) << 5)) | (((block[3] > thresh) << 4)) | + (((block[4] > thresh) << 3)) | (((block[5] > thresh) << 2)) | + (((block[6] > thresh) << 1)) | (((block[7] > thresh)))); + } + } +}; + +// Specialization for bool on systems where sizeof(bool) == 1. +template +struct ComputeShard::value>::type, + typename std::enable_if::type> { + static EIGEN_STRONG_INLINE void Compute( + typename TTypes::ConstMatrix input, + typename TTypes::Matrix output, bool /*thresh*/, int64 start, + int64 limit) { + // NOTE(ebrevdo): This assumes memory is little-endian. 
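    // Packing order, worked through once (it matches the generic shard above):
    // the eight inputs block[0..7] are read as the low..high bytes of a single
    // little-endian int64 and land in output bits 7..0 respectively, so a
    // block of {1, 0, 0, 1, 0, 1, 1, 0} packs to 0b10010110 (0x96).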
+ for (int64 i = start; i < limit; ++i) { + uint8* out = output.data() + i; + const int64 block = *reinterpret_cast(input.data() + 8 * i); + *out = + ((((block & (1LL << (7 * 8))) >> (7 * 8 - 0))) | + (((block & (1LL << (6 * 8))) >> (6 * 8 - 1))) | + (((block & (1LL << (5 * 8))) >> (5 * 8 - 2))) | + (((block & (1LL << (4 * 8))) >> (4 * 8 - 3))) | + (((block & (1LL << (3 * 8))) >> (3 * 8 - 4))) | + (((block & (1LL << (2 * 8))) >> (2 * 8 - 5))) | + (((block & (1LL << 8)) >> (1 * 8 - 6))) | (((block & (1LL)) << 7))); + } + } +}; + +template +struct CompareAndBitpack { + void operator()(OpKernelContext* c, typename TTypes::ConstMatrix input, + typename TTypes::ConstScalar threshold, + TTypes::Matrix output) { + const T thresh = threshold(); + auto shard = [&, thresh](int64 start, int64 limit) { + ComputeShard::Compute(input, output, thresh, start, limit); + }; + int64 total_shards = output.size(); // Approximate cmp as an add and + // bitwise-or + shift as an add. + const double total_cost = 8 * (Eigen::TensorOpCost::AddCost() + + Eigen::TensorOpCost::AddCost()); + const int64 shard_cost = (total_cost >= static_cast(kint64max)) + ? kint64max + : static_cast(total_cost); + + auto worker_threads = *(c->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, total_shards, + shard_cost, shard); + } +}; + +} // namespace functor + +#if GOOGLE_CUDA + +#define REGISTER_COMPARE_AND_BITPACK(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("CompareAndBitpack").Device(DEVICE_GPU).TypeConstraint("T"), \ + CompareAndBitpackOp); + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_COMPARE_AND_BITPACK); +TF_CALL_bool(REGISTER_COMPARE_AND_BITPACK); + +#undef REGISTER_COMPARE_AND_BITPACK + +namespace functor { + +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void CompareAndBitpack::operator()( \ + OpKernelContext* c, typename TTypes::ConstMatrix input, \ + typename TTypes::ConstScalar threshold, \ + TTypes::Matrix output); \ + extern template struct CompareAndBitpack; + +TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC) +TF_CALL_bool(DECLARE_GPU_SPEC) + +#undef DECLARE_GPU_SPEC + +} // namespace functor + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/compare_and_bitpack_op.h b/tensorflow/core/kernels/compare_and_bitpack_op.h new file mode 100644 index 00000000000..8e020249c10 --- /dev/null +++ b/tensorflow/core/kernels/compare_and_bitpack_op.h @@ -0,0 +1,42 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_COMPARE_AND_BITPACK_OP_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_COMPARE_AND_BITPACK_OP_H_ + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +namespace functor { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template +struct CompareAndBitpack { + void operator()(OpKernelContext* c, typename TTypes::ConstMatrix input, + typename TTypes::ConstScalar threshold, + TTypes::Matrix output); +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_COMPARE_AND_BITPACK_OP_H_ diff --git a/tensorflow/core/kernels/compare_and_bitpack_op_gpu.cu.cc b/tensorflow/core/kernels/compare_and_bitpack_op_gpu.cu.cc new file mode 100644 index 00000000000..345405e3fe6 --- /dev/null +++ b/tensorflow/core/kernels/compare_and_bitpack_op_gpu.cu.cc @@ -0,0 +1,141 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/compare_and_bitpack_op.h" + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { + +template +__global__ void CompareAndBitpackKernel(const int size, const T* threshold, + const T* input, uint8* output) { + // TODO(ebrevdo): Erich said: to get a better memory access pattern + // you could have 8 threads load this data and do a comparison, then + // use the ballot instruction to combine the values from each thread + // in the warp in one instruction (so each thread will have the + // result for 4 blocks) followed by an appropriate shift and mask to + // get the 8-bits of interest. 
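  // Each iteration of the grid-stride loop below produces one output byte: it
  // loads the eight corresponding inputs through ldg() and packs the
  // comparison results with block[0] in the most-significant bit, mirroring
  // the CPU ComputeShard functor in compare_and_bitpack_op.cc.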
+ const T thresh = ldg(threshold); + CUDA_1D_KERNEL_LOOP(i, size) { + const T* block = input + 8 * i; + output[i] = + ((((ldg(block) > thresh) << 7)) | (((ldg(block + 1) > thresh) << 6)) | + (((ldg(block + 2) > thresh) << 5)) | + (((ldg(block + 3) > thresh) << 4)) | + (((ldg(block + 4) > thresh) << 3)) | + (((ldg(block + 5) > thresh) << 2)) | + (((ldg(block + 6) > thresh) << 1)) | (((ldg(block + 7) > thresh)))); + } +} + +template <> +__global__ void CompareAndBitpackKernel(const int size, + const bool* threshold, + const bool* input, + uint8* output) { + // TODO(ebrevdo): Erich said: I think you could again have multiple + // threads work on one block and use the ballot instruction to the + // bit packing in one instruction. + CUDA_1D_KERNEL_LOOP(i, size) { + const int64 block = ldg(reinterpret_cast(input + 8 * i)); + // NOTE(ebrevdo): This assumes memory is little-endian. + output[i] = + ((((block & (1LL << (7 * 8))) >> (7 * 8 - 0))) | + (((block & (1LL << (6 * 8))) >> (6 * 8 - 1))) | + (((block & (1LL << (5 * 8))) >> (5 * 8 - 2))) | + (((block & (1LL << (4 * 8))) >> (4 * 8 - 3))) | + (((block & (1LL << (3 * 8))) >> (3 * 8 - 4))) | + (((block & (1LL << (2 * 8))) >> (2 * 8 - 5))) | + (((block & (1LL << 8)) >> (1 * 8 - 6))) | (((block & (1LL)) << 7))); + } +} + +template <> +__global__ void CompareAndBitpackKernel(const int size, + const float* threshold, + const float* input, + uint8* output) { + const float thresh = ldg(threshold); + CUDA_1D_KERNEL_LOOP(i, size) { + const float4 block0 = ldg(reinterpret_cast(input + 8 * i)); + const float4 block1 = + ldg(reinterpret_cast(input + 8 * i + 4)); + output[i] = ((((block0.x > thresh) << 7)) | (((block0.y > thresh) << 6)) | + (((block0.z > thresh) << 5)) | (((block0.w > thresh) << 4)) | + (((block1.x > thresh) << 3)) | (((block1.y > thresh) << 2)) | + (((block1.z > thresh) << 1)) | (((block1.w > thresh)))); + } +} + +template <> +__global__ void CompareAndBitpackKernel(const int size, + const double* threshold, + const double* input, + uint8* output) { + const double thresh = ldg(threshold); + CUDA_1D_KERNEL_LOOP(i, size) { + const double2 block0 = ldg(reinterpret_cast(input + 8 * i)); + const double2 block1 = + ldg(reinterpret_cast(input + 8 * i + 2)); + const double2 block2 = + ldg(reinterpret_cast(input + 8 * i + 4)); + const double2 block3 = + ldg(reinterpret_cast(input + 8 * i + 6)); + output[i] = ((((block0.x > thresh) << 7)) | (((block0.y > thresh) << 6)) | + (((block1.x > thresh) << 5)) | (((block1.y > thresh) << 4)) | + (((block2.x > thresh) << 3)) | (((block2.y > thresh) << 2)) | + (((block3.x > thresh) << 1)) | (((block3.y > thresh)))); + } +} + +#define DEFINE_GPU_SPECS(T) \ + template <> \ + void CompareAndBitpack::operator()( \ + OpKernelContext* c, typename TTypes::ConstMatrix input, \ + typename TTypes::ConstScalar threshold, \ + TTypes::Matrix output) { \ + const GPUDevice& d = c->eigen_device(); \ + int64 total_count = output.size(); \ + CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); \ + \ + CompareAndBitpackKernel \ + <<>>( \ + total_count, threshold.data(), input.data(), output.data()); \ + } + +TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS) +TF_CALL_bool(DEFINE_GPU_SPECS) + +#undef DECLARE_GPU_SPECS + +} // namespace functor + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/population_count_op.cc b/tensorflow/core/kernels/population_count_op.cc new file mode 100644 index 00000000000..12ff6b69f87 --- /dev/null +++ b/tensorflow/core/kernels/population_count_op.cc @@ -0,0 
+1,163 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/math_ops.cc + +#define EIGEN_USE_THREADS + +#include + +#include "tensorflow/core/kernels/population_count_op.h" + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template +class PopulationCountOp : public OpKernel { + public: + explicit PopulationCountOp(OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(OpKernelContext* c) override { + const Tensor& input_t = c->input(0); + Tensor* output_t; + OP_REQUIRES_OK(c, c->allocate_output(0, input_t.shape(), &output_t)); + + auto input = input_t.flat(); + auto output = output_t->flat(); + + functor::PopulationCount popcnt; + popcnt(c, input, output); + } +}; + +#define REGISTER_POPULATION_COUNT(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("PopulationCount").Device(DEVICE_CPU).TypeConstraint("T"), \ + PopulationCountOp); + +TF_CALL_uint8(REGISTER_POPULATION_COUNT); +TF_CALL_int8(REGISTER_POPULATION_COUNT); +TF_CALL_uint16(REGISTER_POPULATION_COUNT); +TF_CALL_int16(REGISTER_POPULATION_COUNT); +TF_CALL_int32(REGISTER_POPULATION_COUNT); +TF_CALL_int64(REGISTER_POPULATION_COUNT); + +#undef REGISTER_POPULATION_COUNT + +namespace functor { + +namespace { + +template +inline uint8 PopCnt(const T v); + +#define POPCNT(T, N) \ + template <> \ + uint8 PopCnt(const T v) { \ + return std::bitset(v).count(); \ + } + +POPCNT(int8, 8); +POPCNT(uint8, 8); +POPCNT(int16, 16); +POPCNT(uint16, 16); +POPCNT(int32, 32); +POPCNT(int64, 64); + +#undef POPCNT + +} // namespace + +template +struct PopulationCount { + void operator()(OpKernelContext* c, typename TTypes::ConstFlat input, + TTypes::Flat output) { + const T* input_ptr = input.data(); + uint8* output_ptr = output.data(); + auto shard = [input_ptr, output_ptr](int64 start, int64 limit) { + for (int64 i = start; i < limit; ++i) { + output_ptr[i] = PopCnt(input_ptr[i]); + } + }; + int64 total_shards = input.size(); + // Approximating cost of popcnt: convert T to int64 + // (std::bitset constructor) and convert int64 to uint8 + // (bitset.count() -> output). The .count() itself is relatively cheap. + const double total_cost = (Eigen::TensorOpCost::CastCost() + + Eigen::TensorOpCost::CastCost()); + const int64 shard_cost = (total_cost >= static_cast(kint64max)) + ? 
kint64max + : static_cast(total_cost); + + auto worker_threads = *(c->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, total_shards, + shard_cost, shard); + } +}; + +} // namespace functor + +#if GOOGLE_CUDA + +#define REGISTER_POPULATION_COUNT(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("PopulationCount").Device(DEVICE_GPU).TypeConstraint("T"), \ + PopulationCountOp) + +TF_CALL_uint8(REGISTER_POPULATION_COUNT); +TF_CALL_int8(REGISTER_POPULATION_COUNT); +TF_CALL_uint16(REGISTER_POPULATION_COUNT); +TF_CALL_int16(REGISTER_POPULATION_COUNT); +TF_CALL_int32(REGISTER_POPULATION_COUNT); +TF_CALL_int64(REGISTER_POPULATION_COUNT); + +#undef REGISTER_POPULATION_COUNT + +namespace functor { + +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void PopulationCount::operator()( \ + OpKernelContext* c, typename TTypes::ConstFlat input, \ + TTypes::Flat output); \ + extern template struct PopulationCount + +TF_CALL_uint8(DECLARE_GPU_SPEC); +TF_CALL_int8(DECLARE_GPU_SPEC); +TF_CALL_uint16(DECLARE_GPU_SPEC); +TF_CALL_int16(DECLARE_GPU_SPEC); +TF_CALL_int32(DECLARE_GPU_SPEC); +TF_CALL_int64(DECLARE_GPU_SPEC); + +#undef DECLARE_GPU_SPEC + +} // namespace functor + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/population_count_op.h b/tensorflow/core/kernels/population_count_op.h new file mode 100644 index 00000000000..de89582e139 --- /dev/null +++ b/tensorflow/core/kernels/population_count_op.h @@ -0,0 +1,38 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +namespace functor { + +template +struct PopulationCount { + void operator()(OpKernelContext* c, typename TTypes::ConstFlat input, + TTypes::Flat output); +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_ diff --git a/tensorflow/core/kernels/population_count_op_gpu.cu.cc b/tensorflow/core/kernels/population_count_op_gpu.cu.cc new file mode 100644 index 00000000000..27a687ba409 --- /dev/null +++ b/tensorflow/core/kernels/population_count_op_gpu.cu.cc @@ -0,0 +1,92 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/population_count_op.h" + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { + +template +__global__ void PopulationCountKernel(const int size, const T* input, + uint8* output) { + CUDA_1D_KERNEL_LOOP(i, size) { output[i] = __popc(ldg(input + i)); } +} + +template <> +__global__ void PopulationCountKernel(const int size, const int8* input, + uint8* output) { + // For some reason, __popc on a negative int8 gets confused. + CUDA_1D_KERNEL_LOOP(i, size) { + output[i] = __popc(ldg(reinterpret_cast(input + i))); + } +} + +template <> +__global__ void PopulationCountKernel(const int size, const int16* input, + uint8* output) { + // For some reason, __popc on a negative int16 gets confused. + CUDA_1D_KERNEL_LOOP(i, size) { + output[i] = __popc(ldg(reinterpret_cast(input + i))); + } +} + +template <> +__global__ void PopulationCountKernel(const int size, const int64* input, + uint8* output) { + CUDA_1D_KERNEL_LOOP(i, size) { output[i] = __popcll(ldg(input + i)); } +} + +#define DEFINE_GPU_SPECS(T) \ + template <> \ + void PopulationCount::operator()( \ + OpKernelContext* c, typename TTypes::ConstFlat input, \ + TTypes::Flat output) { \ + const GPUDevice& d = c->eigen_device(); \ + int64 total_count = input.size(); \ + CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); \ + PopulationCountKernel \ + <<>>( \ + total_count, input.data(), output.data()); \ + } + +TF_CALL_uint8(DEFINE_GPU_SPECS); +TF_CALL_int8(DEFINE_GPU_SPECS); +TF_CALL_uint16(DEFINE_GPU_SPECS); +TF_CALL_int16(DEFINE_GPU_SPECS); +TF_CALL_int32(DEFINE_GPU_SPECS); +TF_CALL_int64(DEFINE_GPU_SPECS); + +#undef DEFINE_GPU_SPECS + +} // namespace functor + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/ops/bitwise_ops.cc b/tensorflow/core/ops/bitwise_ops.cc index 2005d5e1028..3ffc4ab74af 100644 --- a/tensorflow/core/ops/bitwise_ops.cc +++ b/tensorflow/core/ops/bitwise_ops.cc @@ -40,6 +40,22 @@ computation is performed on the underlying representation of x. .Attr("T: {int8, int16, int32, int64, uint8, uint16}") \ .SetShapeFn(shape_inference::UnchangedShape) +REGISTER_OP("PopulationCount") + .Input("x: T") + .Output("y: uint8") + .Attr("T: {int8, int16, int32, int64, uint8, uint16}") + .SetShapeFn(shape_inference::UnchangedShape) + .Doc(R"doc( +Computes element-wise population count (a.k.a. popcount, bitsum, bitcount). + +For each entry in `x`, calculates the number of `1` (on) bits in the binary +representation of that entry. 
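For intuition, this per-element behaviour can be modelled directly in NumPy. The sketch below is illustrative only; it assumes nothing beyond standard NumPy and mirrors the bitset-counting CPU kernel above rather than reusing any TensorFlow API:

```python
import numpy as np


def population_count(x):
  """NumPy model of PopulationCount: per-element count of set bits.

  Each element is viewed as its raw bytes (two's complement for signed
  types), so e.g. int16(-1) -> 16, matching the bitset-based CPU kernel.
  """
  x = np.asarray(x)
  # Bit counts for every possible byte value, used as a lookup table.
  table = np.array([bin(i).count("1") for i in range(256)], dtype=np.uint8)
  as_bytes = x.reshape(-1, 1).view(np.uint8)  # one row of bytes per element
  return table[as_bytes].sum(axis=1).astype(np.uint8).reshape(x.shape)


print(population_count(np.array([0, 1, -1, 255], dtype=np.int16)))  # -> [ 0  1 16  8]
```

The `testPopulationCountOp` case added to `bitwise_ops_test.py` further down in this diff cross-checks the op in the same spirit, counting bits over each element's bytes with `six.iterbytes`.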
+ +**NOTE**: It is more efficient to first `tf.bitcast` your tensors into +`int32` or `int64` and perform the bitcount on the result, than to feed in +8- or 16-bit inputs and then aggregate the resulting counts. +)doc"); + REGISTER_OP("BitwiseAnd").BINARY_BITWISE().Doc(R"doc( Elementwise computes the bitwise AND of `x` and `y`. diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt index 94224f22b9c..b82035bfc32 100644 --- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt @@ -4597,6 +4597,37 @@ op { } } } +op { + name: "CompareAndBitpack" + input_arg { + name: "input" + type_attr: "T" + } + input_arg { + name: "threshold" + type_attr: "T" + } + output_arg { + name: "output" + type: DT_UINT8 + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BOOL + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + } + } + } +} op { name: "Complex" input_arg { @@ -16267,6 +16298,31 @@ op { } } } +op { + name: "PopulationCount" + input_arg { + name: "x" + type_attr: "T" + } + output_arg { + name: "y" + type: DT_UINT8 + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_UINT8 + type: DT_UINT16 + } + } + } +} op { name: "Pow" input_arg { diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index f4e0625c66d..36f999ff607 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -2458,6 +2458,64 @@ out_type: The type of the output. Should be a lower bit depth than Tinput. )doc"); +REGISTER_OP("CompareAndBitpack") + .Input("input: T") + .Input("threshold: T") + .Output("output: uint8") + .Attr("T: {bool, float16, float32, float64, int8, int16, int32, int64}") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle input; + TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &input)); + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + ShapeHandle output = input; + if (c->RankKnown(input)) { + int rank = c->Rank(input); + auto inner_dim = c->Dim(input, rank - 1); + DimensionHandle inferred_dim; + TF_RETURN_IF_ERROR(c->Divide(inner_dim, 8, + /* evenly_divisible */ true, + &inferred_dim)); + TF_RETURN_IF_ERROR( + c->ReplaceDim(output, rank - 1, inferred_dim, &output)); + } + c->set_output(0, output); + + return Status::OK(); + }) + .Doc(R"doc( +Compare values of `input` to `threshold` and pack resulting bits into a `uint8`. + +Each comparison returns a boolean `true` (if `input_value > threshold`) +or `false` otherwise. + +This operation is useful for Locality-Sensitive-Hashing (LSH) and other +algorithms that use hashing approximations of cosine and `L2` distances; +codes can be generated from an input via: + +```python +codebook_size = 50 +codebook_bits = codebook_size * 32 +codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits], + dtype=x.dtype, + initializer=tf.orthogonal_initializer()) +codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.) +codes = tf.bitcast(codes, tf.int32) # go from uint8 to int32 +# now codes has shape x.shape[:-1] + [codebook_size] +``` + +**NOTE**: Currently, the innermost dimension of the tensor must be divisible +by 8. + +Given an `input` shaped `[s0, s1, ..., s_n]`, the output is +a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
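To make the packing order concrete (bit 7 of each output byte corresponds to the first element of its 8-element block, which is why the new unit test can compare directly against `np.packbits`), here is a small NumPy model of the op. It is a sketch for illustration only, assuming standard NumPy; the helper name is made up and is not part of any API:

```python
import numpy as np


def compare_and_bitpack(x, threshold):
  """NumPy model of CompareAndBitpack: compare, then pack 8 booleans per byte.

  Bit 7 of each output byte corresponds to the first element of its block,
  so the result matches np.packbits applied to the flattened comparisons.
  """
  x = np.asarray(x)
  assert x.shape[-1] % 8 == 0, "innermost dimension must be divisible by 8"
  bits = (x > threshold).astype(np.uint8)
  blocks = bits.reshape(x.shape[:-1] + (x.shape[-1] // 8, 8))
  weights = np.array([128, 64, 32, 16, 8, 4, 2, 1], dtype=np.uint8)  # MSB first
  return (blocks * weights).sum(axis=-1).astype(np.uint8)


x = np.arange(16, dtype=np.float32).reshape(2, 8)
print(compare_and_bitpack(x, 3.0))         # -> [[15], [255]] as a 2x1 uint8 array
print(np.packbits(x > 3.0).reshape(2, 1))  # -> the same 2x1 array
```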
+ +input: Values to compare against `threshold` and bitpack. +threshold: Threshold to compare against. +T: The type of the input and threshold. +output: The bitpacked comparisons. +)doc"); + REGISTER_OP("RequantizationRange") .Input("input: Tinput") .Input("input_min: float") diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 1822fc1133d..468434bd283 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -499,7 +499,7 @@ op { } input_arg { name: "reduction_indices" - description: "The dimensions to reduce." + description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`." type_attr: "Tidx" } output_arg { @@ -601,7 +601,7 @@ op { } input_arg { name: "reduction_indices" - description: "The dimensions to reduce." + description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`." type_attr: "Tidx" } output_arg { @@ -1691,7 +1691,7 @@ op { } input_arg { name: "dimension" - description: "int32 or int64, 0 <= dimension < rank(input). Describes\nwhich dimension of the input Tensor to reduce across. For vectors,\nuse dimension = 0." + description: "int32 or int64, must be in the range `[-rank(input), rank(input))`.\nDescribes which dimension of the input Tensor to reduce across. For vectors,\nuse dimension = 0." type_attr: "Tidx" } output_arg { @@ -1757,7 +1757,7 @@ op { } input_arg { name: "dimension" - description: "int32 or int64, 0 <= dimension < rank(input). Describes\nwhich dimension of the input Tensor to reduce across. For vectors,\nuse dimension = 0." + description: "int32 or int64, must be in the range `[-rank(input), rank(input))`.\nDescribes which dimension of the input Tensor to reduce across. For vectors,\nuse dimension = 0." type_attr: "Tidx" } output_arg { @@ -4406,6 +4406,43 @@ op { summary: "Computes the reverse mode backpropagated gradient of the Cholesky algorithm." description: "For an explanation see \"Differentiation of the Cholesky algorithm\" by\nIain Murray http://arxiv.org/abs/1602.07527." } +op { + name: "CompareAndBitpack" + input_arg { + name: "input" + description: "Values to compare against `threshold` and bitpack." + type_attr: "T" + } + input_arg { + name: "threshold" + description: "Threshold to compare against." + type_attr: "T" + } + output_arg { + name: "output" + description: "The bitpacked comparisons." + type: DT_UINT8 + } + attr { + name: "T" + type: "type" + description: "The type of the input and threshold." + allowed_values { + list { + type: DT_BOOL + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + } + } + } + summary: "Compare values of `input` to `threshold` and pack resulting bits into a `uint8`." 
+ description: "Each comparison returns a boolean `true` (if `input_value > threshold`)\nor and `false` otherwise.\n\nThis operation is useful for Locality-Sensitive-Hashing (LSH) and other\nalgorithms that use hashing approximations of cosine and `L2` distances;\ncodes can be generated from an input via:\n\n```python\ncodebook_size = 50\ncodebook_bits = codebook_size * 32\ncodebook = tf.get_variable(\'codebook\', [x.shape[-1].value, codebook_bits],\n dtype=x.dtype,\n initializer=tf.orthogonal_initializer())\ncodes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)\ncodes = tf.bitcast(codes, tf.int32) # go from uint8 to int32\n# now codes has shape x.shape[:-1] + [codebook_size]\n```\n\n**NOTE**: Currently, the innermost dimension of the tensor must be divisible\nby 8.\n\nGiven an `input` shaped `[s0, s1, ..., s_n]`, the output is\na `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`." +} op { name: "Complex" input_arg { @@ -5656,10 +5693,12 @@ op { name: "Cumprod" input_arg { name: "x" + description: "A `Tensor`. Must be one of the following types: `float32`, `float64`,\n`int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,\n`complex128`, `qint8`, `quint8`, `qint32`, `half`." type_attr: "T" } input_arg { name: "axis" + description: "A `Tensor` of type `int32` (default: 0). Must be in the range\n`[-rank(x), rank(x))`." type_attr: "Tidx" } output_arg { @@ -5672,6 +5711,7 @@ op { default_value { b: false } + description: "If `True`, perform exclusive cumprod." } attr { name: "reverse" @@ -5679,6 +5719,7 @@ op { default_value { b: false } + description: "A `bool` (default: False)." } attr { name: "T" @@ -5722,10 +5763,12 @@ op { name: "Cumsum" input_arg { name: "x" + description: "A `Tensor`. Must be one of the following types: `float32`, `float64`,\n`int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,\n`complex128`, `qint8`, `quint8`, `qint32`, `half`." type_attr: "T" } input_arg { name: "axis" + description: "A `Tensor` of type `int32` (default: 0). Must be in the range\n`[-rank(x), rank(x))`." type_attr: "Tidx" } output_arg { @@ -5738,6 +5781,7 @@ op { default_value { b: false } + description: "If `True`, perform exclusive cumsum." } attr { name: "reverse" @@ -5745,6 +5789,7 @@ op { default_value { b: false } + description: "A `bool` (default: False)." } attr { name: "T" @@ -7615,7 +7660,7 @@ op { } input_arg { name: "dim" - description: "0-D (scalar). Specifies the dimension index at which to\nexpand the shape of `input`." + description: "0-D (scalar). Specifies the dimension index at which to\nexpand the shape of `input`. Must be in the range\n`[-rank(input) - 1, rank(input)]`." type_attr: "Tdim" } output_arg { @@ -12325,7 +12370,7 @@ op { } input_arg { name: "reduction_indices" - description: "The dimensions to reduce." + description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`." type_attr: "Tidx" } output_arg { @@ -13102,7 +13147,7 @@ op { } input_arg { name: "reduction_indices" - description: "The dimensions to reduce." + description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`." type_attr: "Tidx" } output_arg { @@ -13293,7 +13338,7 @@ op { } input_arg { name: "reduction_indices" - description: "The dimensions to reduce." + description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`." type_attr: "Tidx" } output_arg { @@ -15436,6 +15481,33 @@ op { summary: "Compute the polygamma function \\\\(\\psi^{(n)}(x)\\\\)." 
description: "The polygamma function is defined as:\n\n\n\\\\(\\psi^{(n)}(x) = \\frac{d^n}{dx^n} \\psi(x)\\\\)\n\nwhere \\\\(\\psi(x)\\\\) is the digamma function." } +op { + name: "PopulationCount" + input_arg { + name: "x" + type_attr: "T" + } + output_arg { + name: "y" + type: DT_UINT8 + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_UINT8 + type: DT_UINT16 + } + } + } + summary: "Computes element-wise population count (a.k.a. popcount, bitsum, bitcount)." + description: "For each entry in `x`, calculates the number of `1` (on) bits in the binary\nrepresentation of that entry.\n\n**NOTE**: It is more efficient to first `tf.bitcast` your tensors into\n`int32` or `int64` and perform the bitcount on the result, than to feed in\n8- or 16-bit inputs and then aggregate the resulting counts." +} op { name: "Pow" input_arg { @@ -15662,7 +15734,7 @@ op { } input_arg { name: "reduction_indices" - description: "The dimensions to reduce." + description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`." type_attr: "Tidx" } output_arg { @@ -21708,7 +21780,7 @@ op { } input_arg { name: "axis" - description: "1-D. The indices of the dimensions to reverse." + description: "1-D. The indices of the dimensions to reverse. Must be in the range\n`[-rank(tensor), rank(tensor))`." type_attr: "Tidx" } output_arg { @@ -27260,7 +27332,7 @@ op { list { } } - description: "If specified, only squeezes the dimensions listed. The dimension\nindex starts at 0. It is an error to squeeze a dimension that is not 1." + description: "If specified, only squeezes the dimensions listed. The dimension\nindex starts at 0. It is an error to squeeze a dimension that is not 1. Must\nbe in the range `[-rank(input), rank(input))`." has_minimum: true } summary: "Removes dimensions of size 1 from the shape of a tensor." @@ -28250,7 +28322,7 @@ op { } input_arg { name: "reduction_indices" - description: "The dimensions to reduce." + description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`." type_attr: "Tidx" } output_arg { diff --git a/tensorflow/core/platform/default/build_config_root.bzl b/tensorflow/core/platform/default/build_config_root.bzl index e1f123a0a01..04bf2aeca65 100644 --- a/tensorflow/core/platform/default/build_config_root.bzl +++ b/tensorflow/core/platform/default/build_config_root.bzl @@ -28,7 +28,7 @@ def tf_additional_verbs_deps(): "//tensorflow:with_verbs_support": [ "//tensorflow/contrib/verbs:verbs_server_lib", "//tensorflow/contrib/verbs:grpc_verbs_client", - ], + ], "//conditions:default": [], }) diff --git a/tensorflow/core/platform/macros.h b/tensorflow/core/platform/macros.h index eaf0171e72d..47523c7d2b0 100644 --- a/tensorflow/core/platform/macros.h +++ b/tensorflow/core/platform/macros.h @@ -20,6 +20,7 @@ limitations under the License. #if (defined(__GNUC__) || defined(__APPLE__)) && !defined(SWIG) // Compiler supports GCC-style attributes #define TF_ATTRIBUTE_NORETURN __attribute__((noreturn)) +#define TF_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline)) #define TF_ATTRIBUTE_NOINLINE __attribute__((noinline)) #define TF_ATTRIBUTE_UNUSED __attribute__((unused)) #define TF_ATTRIBUTE_COLD __attribute__((cold)) @@ -33,6 +34,7 @@ limitations under the License. 
#elif defined(COMPILER_MSVC) // Non-GCC equivalents #define TF_ATTRIBUTE_NORETURN __declspec(noreturn) +#define TF_ATTRIBUTE_ALWAYS_INLINE #define TF_ATTRIBUTE_NOINLINE #define TF_ATTRIBUTE_UNUSED #define TF_ATTRIBUTE_COLD @@ -43,6 +45,7 @@ limitations under the License. #else // Non-GCC equivalents #define TF_ATTRIBUTE_NORETURN +#define TF_ATTRIBUTE_ALWAYS_INLINE #define TF_ATTRIBUTE_NOINLINE #define TF_ATTRIBUTE_UNUSED #define TF_ATTRIBUTE_COLD diff --git a/tensorflow/core/platform/posix/error.cc b/tensorflow/core/platform/posix/error.cc index df5c8008792..e9baad54226 100644 --- a/tensorflow/core/platform/posix/error.cc +++ b/tensorflow/core/platform/posix/error.cc @@ -171,11 +171,7 @@ error::Code ErrnoToCode(int err_number) { Status IOError(const string& context, int err_number) { auto code = ErrnoToCode(err_number); - if (code == error::UNKNOWN) { - return Status(code, strings::StrCat(context, "; ", strerror(err_number))); - } else { - return Status(code, context); - } + return Status(code, strings::StrCat(context, "; ", strerror(err_number))); } } // namespace tensorflow diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 10388509a9a..095cbbe637b 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -1130,7 +1130,8 @@ type SqueezeAttr func(optionalAttr) // SqueezeSqueezeDims sets the optional squeeze_dims attribute to value. // // value: If specified, only squeezes the dimensions listed. The dimension -// index starts at 0. It is an error to squeeze a dimension that is not 1. +// index starts at 0. It is an error to squeeze a dimension that is not 1. Must +// be in the range `[-rank(input), rank(input))`. // If not specified, defaults to <> // // REQUIRES: len(value) >= 0 @@ -7069,6 +7070,61 @@ func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_ha return op.Output(0) } +// TextLineReaderV2Attr is an optional argument to TextLineReaderV2. +type TextLineReaderV2Attr func(optionalAttr) + +// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value. +// +// value: Number of lines to skip from the beginning of every file. +// If not specified, defaults to 0 +func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr { + return func(m optionalAttr) { + m["skip_header_lines"] = value + } +} + +// TextLineReaderV2Container sets the optional container attribute to value. +// +// value: If non-empty, this reader is placed in the given container. +// Otherwise, a default container is used. +// If not specified, defaults to "" +func TextLineReaderV2Container(value string) TextLineReaderV2Attr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// TextLineReaderV2SharedName sets the optional shared_name attribute to value. +// +// value: If non-empty, this reader is named in the given bucket +// with this shared_name. Otherwise, the node name is used instead. +// If not specified, defaults to "" +func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// A Reader that outputs the lines of a file delimited by '\n'. +// +// Returns The handle to reference the Reader. 
+func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "TextLineReaderV2", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // Computes rectified linear 6: `min(max(features, 0), 6)`. func Relu6(scope *Scope, features tf.Output) (activations tf.Output) { if scope.Err() != nil { @@ -12819,7 +12875,8 @@ func ReciprocalGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { // // Arguments: // tensor: Up to 8-D. -// axis: 1-D. The indices of the dimensions to reverse. +// axis: 1-D. The indices of the dimensions to reverse. Must be in the range +// `[-rank(tensor), rank(tensor))`. // // Returns The same shape as `tensor`. func ReverseV2(scope *Scope, tensor tf.Output, axis tf.Output) (output tf.Output) { @@ -14493,61 +14550,6 @@ func Tanh(scope *Scope, x tf.Output) (y tf.Output) { return op.Output(0) } -// TextLineReaderV2Attr is an optional argument to TextLineReaderV2. -type TextLineReaderV2Attr func(optionalAttr) - -// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value. -// -// value: Number of lines to skip from the beginning of every file. -// If not specified, defaults to 0 -func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr { - return func(m optionalAttr) { - m["skip_header_lines"] = value - } -} - -// TextLineReaderV2Container sets the optional container attribute to value. -// -// value: If non-empty, this reader is placed in the given container. -// Otherwise, a default container is used. -// If not specified, defaults to "" -func TextLineReaderV2Container(value string) TextLineReaderV2Attr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// TextLineReaderV2SharedName sets the optional shared_name attribute to value. -// -// value: If non-empty, this reader is named in the given bucket -// with this shared_name. Otherwise, the node name is used instead. -// If not specified, defaults to "" -func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// A Reader that outputs the lines of a file delimited by '\n'. -// -// Returns The handle to reference the Reader. -func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "TextLineReaderV2", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // Component-wise multiplies a SparseTensor by a dense Tensor. // // The output locations corresponding to the implicitly zero elements in the sparse @@ -16147,6 +16149,8 @@ func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf type CumprodAttr func(optionalAttr) // CumprodExclusive sets the optional exclusive attribute to value. +// +// value: If `True`, perform exclusive cumprod. // If not specified, defaults to false func CumprodExclusive(value bool) CumprodAttr { return func(m optionalAttr) { @@ -16155,6 +16159,8 @@ func CumprodExclusive(value bool) CumprodAttr { } // CumprodReverse sets the optional reverse attribute to value. +// +// value: A `bool` (default: False). 
// If not specified, defaults to false func CumprodReverse(value bool) CumprodAttr { return func(m optionalAttr) { @@ -16192,6 +16198,13 @@ func CumprodReverse(value bool) CumprodAttr { // ```python // tf.cumprod([a, b, c], exclusive=True, reverse=True) # => [b * c, c, 1] // ``` +// +// Arguments: +// x: A `Tensor`. Must be one of the following types: `float32`, `float64`, +// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, +// `complex128`, `qint8`, `quint8`, `qint32`, `half`. +// axis: A `Tensor` of type `int32` (default: 0). Must be in the range +// `[-rank(x), rank(x))`. func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) { if scope.Err() != nil { return @@ -16420,6 +16433,8 @@ func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, ma type CumsumAttr func(optionalAttr) // CumsumExclusive sets the optional exclusive attribute to value. +// +// value: If `True`, perform exclusive cumsum. // If not specified, defaults to false func CumsumExclusive(value bool) CumsumAttr { return func(m optionalAttr) { @@ -16428,6 +16443,8 @@ func CumsumExclusive(value bool) CumsumAttr { } // CumsumReverse sets the optional reverse attribute to value. +// +// value: A `bool` (default: False). // If not specified, defaults to false func CumsumReverse(value bool) CumsumAttr { return func(m optionalAttr) { @@ -16465,6 +16482,13 @@ func CumsumReverse(value bool) CumsumAttr { // ```python // tf.cumsum([a, b, c], exclusive=True, reverse=True) # => [b + c, c, 0] // ``` +// +// Arguments: +// x: A `Tensor`. Must be one of the following types: `float32`, `float64`, +// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, +// `complex128`, `qint8`, `quint8`, `qint32`, `half`. +// axis: A `Tensor` of type `int32` (default: 0). Must be in the range +// `[-rank(x), rank(x))`. func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) { if scope.Err() != nil { return @@ -17894,6 +17918,28 @@ func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf. return op.Output(0), op.Output(1), op.Output(2) } +// Computes element-wise population count (a.k.a. popcount, bitsum, bitcount). +// +// For each entry in `x`, calculates the number of `1` (on) bits in the binary +// representation of that entry. +// +// **NOTE**: It is more efficient to first `tf.bitcast` your tensors into +// `int32` or `int64` and perform the bitcount on the result, than to feed in +// 8- or 16-bit inputs and then aggregate the resulting counts. +func PopulationCount(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "PopulationCount", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // AssertAttr is an optional argument to Assert. type AssertAttr func(optionalAttr) @@ -18063,7 +18109,8 @@ func AnyKeepDims(value bool) AnyAttr { // // Arguments: // input: The tensor to reduce. -// reduction_indices: The dimensions to reduce. +// reduction_indices: The dimensions to reduce. Must be in the range +// `[-rank(input), rank(input))`. // // Returns The reduced tensor. func Any(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...AnyAttr) (output tf.Output) { @@ -19213,7 +19260,8 @@ func ProdKeepDims(value bool) ProdAttr { // // Arguments: // input: The tensor to reduce. -// reduction_indices: The dimensions to reduce. +// reduction_indices: The dimensions to reduce. 
Must be in the range +// `[-rank(input), rank(input))`. // // Returns The reduced tensor. func Prod(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...ProdAttr) (output tf.Output) { @@ -20258,7 +20306,8 @@ func MaxKeepDims(value bool) MaxAttr { // // Arguments: // input: The tensor to reduce. -// reduction_indices: The dimensions to reduce. +// reduction_indices: The dimensions to reduce. Must be in the range +// `[-rank(input), rank(input))`. // // Returns The reduced tensor. func Max(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MaxAttr) (output tf.Output) { @@ -20583,7 +20632,8 @@ func Sqrt(scope *Scope, x tf.Output) (y tf.Output) { // Arguments: // // dim: 0-D (scalar). Specifies the dimension index at which to -// expand the shape of `input`. +// expand the shape of `input`. Must be in the range +// `[-rank(input) - 1, rank(input)]`. // // Returns Contains the same data as `input`, but its shape has an additional // dimension of size 1 added. @@ -20623,7 +20673,8 @@ func AllKeepDims(value bool) AllAttr { // // Arguments: // input: The tensor to reduce. -// reduction_indices: The dimensions to reduce. +// reduction_indices: The dimensions to reduce. Must be in the range +// `[-rank(input), rank(input))`. // // Returns The reduced tensor. func All(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...AllAttr) (output tf.Output) { @@ -21665,8 +21716,8 @@ func ArgMinOutputType(value tf.DataType) ArgMinAttr { // // Arguments: // -// dimension: int32 or int64, 0 <= dimension < rank(input). Describes -// which dimension of the input Tensor to reduce across. For vectors, +// dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`. +// Describes which dimension of the input Tensor to reduce across. For vectors, // use dimension = 0. func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) { if scope.Err() != nil { @@ -22716,7 +22767,8 @@ func MeanKeepDims(value bool) MeanAttr { // // Arguments: // input: The tensor to reduce. -// reduction_indices: The dimensions to reduce. +// reduction_indices: The dimensions to reduce. Must be in the range +// `[-rank(input), rank(input))`. // // Returns The reduced tensor. func Mean(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MeanAttr) (output tf.Output) { @@ -22856,7 +22908,8 @@ func MinKeepDims(value bool) MinAttr { // // Arguments: // input: The tensor to reduce. -// reduction_indices: The dimensions to reduce. +// reduction_indices: The dimensions to reduce. Must be in the range +// `[-rank(input), rank(input))`. // // Returns The reduced tensor. func Min(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MinAttr) (output tf.Output) { @@ -22914,8 +22967,8 @@ func ArgMaxOutputType(value tf.DataType) ArgMaxAttr { // // Arguments: // -// dimension: int32 or int64, 0 <= dimension < rank(input). Describes -// which dimension of the input Tensor to reduce across. For vectors, +// dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`. +// Describes which dimension of the input Tensor to reduce across. For vectors, // use dimension = 0. 
func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) { if scope.Err() != nil { @@ -23888,6 +23941,51 @@ func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Outp return op.Output(0), op.Output(1), op.Output(2) } +// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`. +// +// Each comparison returns a boolean `true` (if `input_value > threshold`) +// or and `false` otherwise. +// +// This operation is useful for Locality-Sensitive-Hashing (LSH) and other +// algorithms that use hashing approximations of cosine and `L2` distances; +// codes can be generated from an input via: +// +// ```python +// codebook_size = 50 +// codebook_bits = codebook_size * 32 +// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits], +// dtype=x.dtype, +// initializer=tf.orthogonal_initializer()) +// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.) +// codes = tf.bitcast(codes, tf.int32) # go from uint8 to int32 +// # now codes has shape x.shape[:-1] + [codebook_size] +// ``` +// +// **NOTE**: Currently, the innermost dimension of the tensor must be divisible +// by 8. +// +// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is +// a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`. +// +// Arguments: +// input: Values to compare against `threshold` and bitpack. +// threshold: Threshold to compare against. +// +// Returns The bitpacked comparisons. +func CompareAndBitpack(scope *Scope, input tf.Output, threshold tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "CompareAndBitpack", + Input: []tf.Input{ + input, threshold, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // Outputs a `Summary` protocol buffer with a tensor and per-plugin data. // // Arguments: @@ -24724,7 +24822,8 @@ func SumKeepDims(value bool) SumAttr { // // Arguments: // input: The tensor to reduce. -// reduction_indices: The dimensions to reduce. +// reduction_indices: The dimensions to reduce. Must be in the range +// `[-rank(input), rank(input))`. // // Returns The reduced tensor. func Sum(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...SumAttr) (output tf.Output) { diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index cac05c372a3..896d466c259 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -1671,6 +1671,18 @@ cuda_py_test( ], ) +cuda_py_test( + name = "compare_and_bitpack_op_test", + size = "small", + srcs = ["compare_and_bitpack_op_test.py"], + additional_deps = [ + "//third_party/py/numpy", + "//tensorflow/python:math_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_for_generated_wrappers", + ], +) + cuda_py_test( name = "scalar_test", size = "small", diff --git a/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py b/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py new file mode 100644 index 00000000000..56ddd6e4282 --- /dev/null +++ b/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py @@ -0,0 +1,83 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for tensorflow.ops.compare_and_bitpack_op.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import test + + +class CompareAndBitpackTest(test.TestCase): + + def _testCompareAndBitpack(self, + x, threshold, + truth, + expected_err_re=None): + with self.test_session(use_gpu=True): + ans = math_ops.compare_and_bitpack(x, threshold) + if expected_err_re is None: + tf_ans = ans.eval() + self.assertShapeEqual(truth, ans) + self.assertAllEqual(tf_ans, truth) + else: + with self.assertRaisesOpError(expected_err_re): + ans.eval() + + def _testBasic(self, dtype): + rows = 371 + cols = 294 + x = np.random.randn(rows, cols * 8) + if dtype == np.bool: + x = x > 0 + else: + x = x.astype(dtype) + threshold = dtype(0) + # np.packbits flattens the tensor, so we reshape it back to the + # expected dimensions. + truth = np.packbits(x > threshold).reshape(rows, cols) + self._testCompareAndBitpack(x, threshold, truth) + + def testBasicFloat32(self): + self._testBasic(np.float32) + + def testBasicFloat64(self): + self._testBasic(np.float64) + + def testBasicFloat16(self): + self._testBasic(np.float16) + + def testBasicBool(self): + self._testBasic(np.bool) + + def testBasicInt8(self): + self._testBasic(np.int8) + + def testBasicInt16(self): + self._testBasic(np.int16) + + def testBasicInt32(self): + self._testBasic(np.int32) + + def testBasicInt64(self): + self._testBasic(np.int64) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py index 04d65b88a1c..9a946925693 100644 --- a/tensorflow/python/kernel_tests/gather_op_test.py +++ b/tensorflow/python/kernel_tests/gather_op_test.py @@ -88,58 +88,58 @@ class GatherTest(test.TestCase): def testHigherRank(self): # We check that scalar and empty indices shapes work as well - for shape in (4, 3, 2), (2, 1, 3, 2): - for indices_shape in (), (0,), (3, 0), (3, 5), (5, 2, 3): - for dtype in _TEST_TYPES: - for axis in range(len(shape)): - params = self._buildParams(np.random.randn(*shape), dtype) - indices = np.random.randint(shape[axis], size=indices_shape) - with self.test_session(use_gpu=True) as sess: - tf_params = constant_op.constant(params) - tf_indices = constant_op.constant(indices) - # Check that both positive and negative indices for axis work. 
- tf_axis = constant_op.constant(axis) - tf_negative_axis = constant_op.constant(-len(shape) + axis) - gather = array_ops.gather(tf_params, tf_indices, axis=tf_axis) - gather_negative_axis = array_ops.gather( - tf_params, tf_indices, axis=tf_negative_axis) - gather_value, gather_negative_axis_value = sess.run( - [gather, gather_negative_axis]) - gather_np = np.take(params, indices, axis) - self.assertAllEqual(gather_np, gather_value) - self.assertAllEqual(gather_np, gather_negative_axis_value) - expected_shape = (params.shape[:axis] + indices.shape + - params.shape[axis + 1:]) - self.assertEqual(expected_shape, gather.shape) - self.assertEqual(expected_shape, gather_negative_axis.shape) + shape = (2, 1, 3, 2) + for indices_shape in (), (0,), (2, 0), (2, 3): + for dtype in _TEST_TYPES: + for axis in range(len(shape)): + params = self._buildParams(np.random.randn(*shape), dtype) + indices = np.random.randint(shape[axis], size=indices_shape) + with self.test_session(use_gpu=True) as sess: + tf_params = constant_op.constant(params) + tf_indices = constant_op.constant(indices) + # Check that both positive and negative indices for axis work. + tf_axis = constant_op.constant(axis) + tf_negative_axis = constant_op.constant(-len(shape) + axis) + gather = array_ops.gather(tf_params, tf_indices, axis=tf_axis) + gather_negative_axis = array_ops.gather( + tf_params, tf_indices, axis=tf_negative_axis) + gather_value, gather_negative_axis_value = sess.run( + [gather, gather_negative_axis]) + gather_np = np.take(params, indices, axis) + self.assertAllEqual(gather_np, gather_value) + self.assertAllEqual(gather_np, gather_negative_axis_value) + expected_shape = (params.shape[:axis] + indices.shape + + params.shape[axis + 1:]) + self.assertEqual(expected_shape, gather.shape) + self.assertEqual(expected_shape, gather_negative_axis.shape) - # Test gradients - gather_grad = np.random.randn( - *gather.get_shape().as_list()).astype(dtype.as_numpy_dtype) - if dtype.is_complex: - gather_grad -= 1j * gather_grad - params_grad, indices_grad, axis_grad = gradients_impl.gradients( - gather, [tf_params, tf_indices, tf_axis], gather_grad) - self.assertEqual(indices_grad, None) - self.assertEqual(axis_grad, None) - # For axis 0, we are able to create an efficient IndexedSlices for - # the gradient. - if axis == 0: - self.assertEqual(type(params_grad), ops.IndexedSlices) - params_grad = ops.convert_to_tensor(params_grad) - correct_params_grad = np.zeros(shape).astype(dtype.as_numpy_dtype) - outer_dims = axis - inner_dims = len(shape) - axis - 1 - gather_grad = gather_grad.reshape( - shape[:axis] + (indices.size,) + shape[axis + 1:]) - for source_index, dest_index in enumerate(indices.flat): - dest_slice = ((slice(None),) * outer_dims + (dest_index,) + + # Test gradients + gather_grad = np.random.randn( + *gather.get_shape().as_list()).astype(dtype.as_numpy_dtype) + if dtype.is_complex: + gather_grad -= 1j * gather_grad + params_grad, indices_grad, axis_grad = gradients_impl.gradients( + gather, [tf_params, tf_indices, tf_axis], gather_grad) + self.assertEqual(indices_grad, None) + self.assertEqual(axis_grad, None) + # For axis 0, we are able to create an efficient IndexedSlices for + # the gradient. 
+ if axis == 0: + self.assertEqual(type(params_grad), ops.IndexedSlices) + params_grad = ops.convert_to_tensor(params_grad) + correct_params_grad = np.zeros(shape).astype(dtype.as_numpy_dtype) + outer_dims = axis + inner_dims = len(shape) - axis - 1 + gather_grad = gather_grad.reshape( + shape[:axis] + (indices.size,) + shape[axis + 1:]) + for source_index, dest_index in enumerate(indices.flat): + dest_slice = ((slice(None),) * outer_dims + (dest_index,) + + (slice(None),) * inner_dims) + source_slice = ((slice(None),) * outer_dims + (source_index,) + (slice(None),) * inner_dims) - source_slice = ((slice(None),) * outer_dims + (source_index,) + - (slice(None),) * inner_dims) - correct_params_grad[dest_slice] += gather_grad[source_slice] - self.assertAllClose(correct_params_grad, params_grad.eval(), - atol=2e-6, rtol=2e-6) + correct_params_grad[dest_slice] += gather_grad[source_slice] + self.assertAllClose(correct_params_grad, params_grad.eval(), + atol=2e-6, rtol=2e-6) def testString(self): params = np.array([[b"asdf", b"zxcv"], [b"qwer", b"uiop"]]) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index bb86640eaba..f64c89ac5d2 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -330,7 +330,7 @@ def rank(input, name=None): # pylint: disable=redefined-builtin """Returns the rank of a tensor. - This operation returns an integer representing the rank of `input`. + Returns a 0-D `int32` `Tensor` representing the rank of `input`. For example: diff --git a/tensorflow/python/ops/bitwise_ops.py b/tensorflow/python/ops/bitwise_ops.py index cbabc3ed9ba..44daf135370 100644 --- a/tensorflow/python/ops/bitwise_ops.py +++ b/tensorflow/python/ops/bitwise_ops.py @@ -36,5 +36,6 @@ ops.NotDifferentiable("BitwiseAnd") ops.NotDifferentiable("BitwiseOr") ops.NotDifferentiable("BitwiseXor") ops.NotDifferentiable("Invert") +ops.NotDifferentiable("PopulationCount") remove_undocumented(__name__) diff --git a/tensorflow/python/ops/bitwise_ops_test.py b/tensorflow/python/ops/bitwise_ops_test.py index 904cf99a5ab..1d08c8f82dc 100644 --- a/tensorflow/python/ops/bitwise_ops_test.py +++ b/tensorflow/python/ops/bitwise_ops_test.py @@ -18,10 +18,14 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import numpy as np +import six + from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import test_util from tensorflow.python.ops import bitwise_ops +from tensorflow.python.ops import gen_bitwise_ops from tensorflow.python.platform import googletest @@ -46,6 +50,25 @@ class BitwiseOpTest(test_util.TensorFlowTestCase): self.assertAllEqual(or_result, [5, 5, 7, 15]) self.assertAllEqual(xor_result, [5, 5, 4, 5]) + def testPopulationCountOp(self): + dtype_list = [dtypes.int8, dtypes.int16, + dtypes.int32, dtypes.int64, + dtypes.uint8, dtypes.uint16] + raw_inputs = [0, 1, -1, 3, -3, 5, -5, 14, -14, + 127, 128, 255, 256, 65535, 65536, + 2**31 - 1, 2**31, 2**32 - 1, 2**32, -2**32 + 1, -2**32, + -2**63 + 1, 2**63 - 1] + def count_bits(x): + return sum([bin(z).count("1") for z in six.iterbytes(x.tobytes())]) + for dtype in dtype_list: + with self.test_session(use_gpu=True) as sess: + print("PopulationCount test: ", dtype) + inputs = np.array(raw_inputs, dtype=dtype.as_numpy_dtype) + truth = [count_bits(x) for x in inputs] + input_tensor = constant_op.constant(inputs, dtype=dtype) + popcnt_result = 
sess.run(gen_bitwise_ops.population_count(input_tensor)) + self.assertAllEqual(truth, popcnt_result) + def testInvertOp(self): dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16] diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py index 44d6c7e2759..4ba812eaf5d 100644 --- a/tensorflow/python/ops/control_flow_ops.py +++ b/tensorflow/python/ops/control_flow_ops.py @@ -61,6 +61,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import gen_control_flow_ops @@ -983,9 +984,16 @@ class GradLoopState(object): # the right control flow context. real_value = self._grad_context.AddValue(cur_value) break + elif constant_op.is_constant(cur_value): + # If the value to be forwarded is a constant, clone the constant in + # the gradient loop rather than using a stack. + # TODO(phawkins): consider hoisting the constant out of the loop + # instead. + real_value = constant_op.constant( + tensor_util.constant_value(cur_value), dtype=cur_value.dtype) + break else: # Record the history of this value in forward_ctxt. - # TODO(yuanbyu): Avoid recording constants. self._grad_context.Exit() history_value = cur_grad_state.AddForwardAccumulator(cur_value) self._grad_context.Enter() diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py index 1e2f9999957..42b4f952bbc 100644 --- a/tensorflow/python/ops/init_ops.py +++ b/tensorflow/python/ops/init_ops.py @@ -41,7 +41,6 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops -from tensorflow.python.ops import math_ops class Initializer(object): diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py index 87f6f92a8a8..cc8c6239470 100644 --- a/tensorflow/python/ops/nn_test.py +++ b/tensorflow/python/ops/nn_test.py @@ -830,7 +830,8 @@ class ReluTest(test_lib.TestCase): class MomentsTest(test_lib.TestCase): - def doOutputTest(self, input_shape, moments_axes, tol=1e-4): + def doOutputTest(self, input_shape, moments_axes, tol=1e-4, + check_gradients=False): for mu in [0.0, 1.0, 1e3]: for sigma in [1.0, 0.1]: for keep_dims in [True, False]: @@ -846,6 +847,15 @@ class MomentsTest(test_lib.TestCase): mean, variance = nn_impl.moments( inputs, moments_axes, keep_dims=keep_dims) + if check_gradients: + err = gradient_checker.compute_gradient_error( + inputs, input_shape, mean, mean.shape.as_list()) + self.assertLess(err, 1e-3) + err = gradient_checker.compute_gradient_error( + inputs, input_shape, variance, variance.shape.as_list()) + self.assertLess(err, 1e-3) + + # Evaluate. 
[mean, variance] = sess.run([mean, variance]) # Make sure that there are no NaNs self.assertFalse(np.isnan(mean).any()) @@ -853,6 +863,12 @@ class MomentsTest(test_lib.TestCase): self.assertAllClose(mean, expected_mean, rtol=tol, atol=tol) self.assertAllClose(variance, expected_var, rtol=tol, atol=tol) + def testOutputAndGradient2DInput0(self): + self.doOutputTest((10, 10), (0,), check_gradients=True) + + def testOutputAndGradient2DInput01(self): + self.doOutputTest((10, 10), (0, 1), check_gradients=True) + def testOutput2DInput0(self): self.doOutputTest((10, 300), (0,)) diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py index f7854e86c0c..304b6ae665f 100644 --- a/tensorflow/python/ops/rnn_cell_impl.py +++ b/tensorflow/python/ops/rnn_cell_impl.py @@ -786,13 +786,18 @@ class DropoutWrapper(RNNCell): class ResidualWrapper(RNNCell): """RNNCell wrapper that ensures cell inputs are added to the outputs.""" - def __init__(self, cell): + def __init__(self, cell, residual_fn=None): """Constructs a `ResidualWrapper` for `cell`. Args: cell: An instance of `RNNCell`. + residual_fn: (Optional) The function to map raw cell inputs and raw cell + outputs to the actual cell outputs of the residual network. + Defaults to calling nest.map_structure on (lambda i, o: i + o), inputs + and outputs. """ self._cell = cell + self._residual_fn = residual_fn @property def state_size(self): @@ -807,7 +812,7 @@ class ResidualWrapper(RNNCell): return self._cell.zero_state(batch_size, dtype) def __call__(self, inputs, state, scope=None): - """Run the cell and add its inputs to its outputs. + """Run the cell and then apply the residual_fn on its inputs to its outputs. Args: inputs: cell inputs. @@ -822,13 +827,14 @@ class ResidualWrapper(RNNCell): ValueError: If cell inputs and outputs have different structure (value). 
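Since `residual_fn` is a new argument on `ResidualWrapper`, a short usage sketch may help. Only the `residual_fn=...` keyword itself comes from this change; the cell type, cell size, and the scaling inside the custom function below are arbitrary illustration values (TF 1.x style, using the public `tf.nn.rnn_cell` endpoints covered by the updated API golden file):

```python
import tensorflow as tf

cell = tf.nn.rnn_cell.GRUCell(64)  # any cell whose output shape matches its input

# Default behaviour: element-wise inputs + outputs (structures/shapes must match).
residual_default = tf.nn.rnn_cell.ResidualWrapper(cell)

# Custom residual_fn: e.g. scale the skip connection before adding it back in.
residual_scaled = tf.nn.rnn_cell.ResidualWrapper(
    cell, residual_fn=lambda inp, out: 0.5 * inp + out)

inputs = tf.placeholder(tf.float32, [None, 64])
state = cell.zero_state(tf.shape(inputs)[0], tf.float32)
outputs, new_state = residual_scaled(inputs, state)
```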
""" outputs, new_state = self._cell(inputs, state, scope=scope) - nest.assert_same_structure(inputs, outputs) # Ensure shapes match def assert_shape_match(inp, out): inp.get_shape().assert_is_compatible_with(out.get_shape()) - nest.map_structure(assert_shape_match, inputs, outputs) - res_outputs = nest.map_structure( - lambda inp, out: inp + out, inputs, outputs) + def default_residual_fn(inputs, outputs): + nest.assert_same_structure(inputs, outputs) + nest.map_structure(assert_shape_match, inputs, outputs) + return nest.map_structure(lambda inp, out: inp + out, inputs, outputs) + res_outputs = (self._residual_fn or default_residual_fn)(inputs, outputs) return (res_outputs, new_state) diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt index b21d9a8ee33..a75e9e80802 100644 --- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt @@ -54,7 +54,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'cell\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'cell\', \'residual_fn\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/ci_build/builds/builds_common.sh b/tensorflow/tools/ci_build/builds/builds_common.sh index 9323a96e74c..e3b58d038a7 100644 --- a/tensorflow/tools/ci_build/builds/builds_common.sh +++ b/tensorflow/tools/ci_build/builds/builds_common.sh @@ -17,6 +17,7 @@ # Common Bash functions used by build scripts COLOR_NC='\033[0m' +COLOR_LIGHT_GRAY='\033[0;37m' COLOR_GREEN='\033[0;32m' COLOR_RED='\033[0;31m' diff --git a/tensorflow/tools/ci_build/builds/configured b/tensorflow/tools/ci_build/builds/configured index 25cb51ea7cc..563e07e3afb 100755 --- a/tensorflow/tools/ci_build/builds/configured +++ b/tensorflow/tools/ci_build/builds/configured @@ -56,7 +56,7 @@ else fi pushd "${CI_TENSORFLOW_SUBMODULE_PATH:-.}" -yes "" | ./configure +$PYTHON_BIN_PATH configure.py popd # Gather and print build information diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh index db011a6badd..112dab3a733 100755 --- a/tensorflow/tools/ci_build/builds/pip.sh +++ b/tensorflow/tools/ci_build/builds/pip.sh @@ -73,6 +73,9 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/builds_common.sh" +SKIP_RETURN_CODE=112 + + # Get the command line arguments CONTAINER_TYPE=$( echo "$1" | tr '[:upper:]' '[:lower:]' ) shift @@ -310,6 +313,13 @@ create_activate_virtualenv_and_install_tensorflow() { # Smoke test of tensorflow install in clean virtualenv ################################################################################ do_clean_virtualenv_smoke_test() { + if [[ -n "${NO_TEST_ON_INSTALL}" ]] && + [[ "${NO_TEST_ON_INSTALL}" != "0" ]]; then + echo "NO_TEST_ON_INSTALL=${NO_TEST_ON_INSTALL}:" + echo " Skipping smoke test of tensorflow install in clean virtualenv" + return ${SKIP_RETURN_CODE} + fi + CLEAN_VENV_DIR="${PIP_TEST_ROOT}/venv_clean" create_activate_virtualenv_and_install_tensorflow --clean \ "${CLEAN_VENV_DIR}" "${WHL_PATH}" @@ -361,6 +371,7 @@ do_virtualenv_pip_test() { [[ "${NO_TEST_ON_INSTALL}" != "0" ]]; then echo "NO_TEST_ON_INSTALL=${NO_TEST_ON_INSTALL}:" echo " Skipping ALL Python unit tests on install" + return ${SKIP_RETURN_CODE} else # Call run_pip_tests.sh to perform test-on-install 
"${SCRIPT_DIR}/run_pip_tests.sh" --virtualenv ${GPU_FLAG} ${MAC_FLAG} @@ -379,6 +390,7 @@ do_virtualenv_oss_serial_pip_test() { [[ "${NO_TEST_ON_INSTALL}" != "0" ]]; then echo "NO_TEST_ON_INSTALL=${NO_TEST_ON_INSTALL}:" echo " Skipping Python unit tests on install tagged with oss_serial" + return ${SKIP_RETURN_CODE} else # Call run_pip_tests.sh to perform test-on-install "${SCRIPT_DIR}/run_pip_tests.sh" \ @@ -402,6 +414,7 @@ do_test_user_ops() { fi else echo "Skipping user-op test-on-install due to DO_TEST_USER_OPS = ${DO_TEST_USER_OPS}" + return ${SKIP_RETURN_CODE} fi } @@ -424,6 +437,7 @@ do_test_tfdbg_binaries() { popd else echo "Skipping test of tfdbg binaries due to DO_TEST_TFDBG_BINARIES = ${DO_TEST_TFDBG_BINARIES}" + return ${SKIP_RETURN_CODE} fi } @@ -439,6 +453,7 @@ do_test_tutorials() { fi else echo "Skipping tutorial tests-on-install due to DO_TEST_TUTORIALS = ${DO_TEST_TUTORIALS}" + return ${SKIP_RETURN_CODE} fi } @@ -455,6 +470,7 @@ do_ffmpeg_integration_test() { fi else echo "Skipping ffmpeg integration due to DO_INTEGRATION_TESTS = ${DO_INTEGRATION_TESTS}" + return ${SKIP_RETURN_CODE} fi } @@ -468,6 +484,7 @@ PIP_TASKS_DESC=("Smoke test of pip install in clean virtualenv" "PIP tests in vi COUNTER=0 FAIL_COUNTER=0 PASS_COUNTER=0 +SKIP_COUNTER=0 while [[ ${COUNTER} -lt "${#PIP_TASKS[@]}" ]]; do INDEX=COUNTER ((INDEX++)) @@ -480,7 +497,9 @@ while [[ ${COUNTER} -lt "${#PIP_TASKS[@]}" ]]; do ${PIP_TASKS[COUNTER]} RESULT=$? - if [[ ${RESULT} != "0" ]]; then + if [[ ${RESULT} == ${SKIP_RETURN_CODE} ]]; then + ((SKIP_COUNTER++)) + elif [[ ${RESULT} != "0" ]]; then ((FAIL_COUNTER++)) else ((PASS_COUNTER++)) @@ -503,7 +522,9 @@ while [[ ${COUNTER} -lt "${#PIP_TASKS[@]}" ]]; do ((INDEX++)) echo "${INDEX}. ${PIP_TASKS[COUNTER]}: ${PIP_TASKS_DESC[COUNTER]}" - if [[ ${STEP_EXIT_CODES[COUNTER]} == "0" ]]; then + if [[ ${STEP_EXIT_CODES[COUNTER]} == ${SKIP_RETURN_CODE} ]]; then + printf " ${COLOR_LIGHT_GRAY}SKIP${COLOR_NC}\n" + elif [[ ${STEP_EXIT_CODES[COUNTER]} == "0" ]]; then printf " ${COLOR_GREEN}PASS${COLOR_NC}\n" else printf " ${COLOR_RED}FAIL${COLOR_NC}\n" @@ -513,7 +534,7 @@ while [[ ${COUNTER} -lt "${#PIP_TASKS[@]}" ]]; do done echo -echo "${FAIL_COUNTER} failed; ${PASS_COUNTER} passed." +echo "${SKIP_COUNTER} skipped; ${FAIL_COUNTER} failed; ${PASS_COUNTER} passed." echo if [[ ${FAIL_COUNTER} == "0" ]]; then diff --git a/tensorflow/tools/ci_build/builds/run_pip_tests.sh b/tensorflow/tools/ci_build/builds/run_pip_tests.sh index f66846654d1..9a6890401b7 100755 --- a/tensorflow/tools/ci_build/builds/run_pip_tests.sh +++ b/tensorflow/tools/ci_build/builds/run_pip_tests.sh @@ -120,7 +120,7 @@ else fi export TF_NEED_CUDA=$IS_GPU -yes "" | ./configure +${PYTHON_BIN_PATH} configure.py # Figure out how many concurrent tests we can run and do run the tests. BAZEL_PARALLEL_TEST_FLAGS="" diff --git a/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh b/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh index 118e85fee0b..ca840796543 100755 --- a/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh +++ b/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh @@ -30,7 +30,7 @@ export TF_NEED_HDFS=0 export TF_NEED_CUDA=0 # Only running cc tests, python version does not matter. export PYTHON_BIN_PATH=`which python` -yes "" | ./configure +$PYTHON_BIN_PATH configure.py # Run bazel test command. Double test timeouts to avoid flakes. 
 bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test --test_lang_filters=cc -k \
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh b/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh
index fa3d27fa41e..5c82c9efafa 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh
@@ -29,7 +29,7 @@ export TF_NEED_GCP=0
 export TF_NEED_HDFS=0
 export TF_NEED_CUDA=0
 export PYTHON_BIN_PATH=`which python2`
-yes "" | ./configure
+$PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
 bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test --test_lang_filters=py -k \
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh b/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
index 258dec4fec8..7155636a53f 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
@@ -29,7 +29,7 @@ export TF_NEED_GCP=0
 export TF_NEED_HDFS=0
 export TF_NEED_CUDA=0
 export PYTHON_BIN_PATH=`which python3`
-yes "" | ./configure
+$PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
 bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test -k \
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh b/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh
index 9c450ab4dab..218d2a89913 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh
@@ -29,7 +29,7 @@ export TF_NEED_GCP=0
 export TF_NEED_HDFS=0
 export TF_NEED_CUDA=0
 export PYTHON_BIN_PATH=`which python3`
-yes "" | ./configure
+$PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
 bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test --test_lang_filters=py -k \
diff --git a/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh b/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh
index f2ea8d3c773..dff72c25bf7 100755
--- a/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh
+++ b/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh
@@ -32,7 +32,7 @@ export PYTHON_BIN_PATH=`which python3`
 export TF_NEED_CUDA=1
 export TF_CUDA_COMPUTE_CAPABILITIES=3.7
 
-yes "" | ./configure
+$PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
 bazel test --config=cuda --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-benchmark-test -k \
diff --git a/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh b/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh
index 4e0c3d1d333..a36a8445afd 100755
--- a/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh
+++ b/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh
@@ -32,7 +32,7 @@ export PYTHON_BIN_PATH=`which python3`
 export TF_NEED_CUDA=1
 export TF_CUDA_COMPUTE_CAPABILITIES=3.7
 
-yes "" | ./configure
+$PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
 bazel test --config=cuda --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-benchmark-test -k \
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
index 0b8c73993f8..0ee894e2c44 100755
--- a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
+++ b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
@@ -30,7 +30,7 @@ export TF_NEED_GCP=0
 export TF_NEED_HDFS=0
 export TF_NEED_CUDA=0
 export PYTHON_BIN_PATH=$(which python2)
-yes "" | ./configure
+$PYTHON_BIN_PATH configure.py
 which bazel
 bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac \
     --test_timeout 300,450,1200,3600 \
diff --git a/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh b/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
index 11064130713..f548adc5ca8 100755
--- a/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
+++ b/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
@@ -33,7 +33,7 @@ export TF_NEED_CUDA=1
 export TF_ENABLE_XLA=1
 export TF_CUDA_COMPUTE_CAPABILITIES=3.7
 
-yes "" | ./configure
+$PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
 bazel test --config=cuda --test_tag_filters=-no_gpu,-benchmark-test -k \
diff --git a/third_party/llvm/llvm.BUILD b/third_party/llvm/llvm.BUILD
index 12bacf3c27d..2d96406d270 100644
--- a/third_party/llvm/llvm.BUILD
+++ b/third_party/llvm/llvm.BUILD
@@ -899,6 +899,7 @@ cc_library(
         "include/llvm/Target/ARM/InstPrinter/*.h",
         "include/llvm/Target/ARM/InstPrinter/*.def",
         "include/llvm/Target/ARM/InstPrinter/*.inc",
+        "lib/Target/ARM/*.h",
         "lib/Target/ARM/InstPrinter/*.h",
     ]),
     copts = ["-Iexternal/llvm/lib/Target/ARM"],
@@ -1206,6 +1207,7 @@ cc_library(
         "lib/IR/*.h",
     ]),
     hdrs = glob([
+        "include/llvm/Analysis/*.def",
         "include/llvm/IR/*.h",
         "include/llvm/IR/*.def",
         "include/llvm/IR/*.inc",
@@ -2022,6 +2024,8 @@ cc_library(
         "lib/Target/*.h",
     ]),
     hdrs = glob([
+        "include/llvm/CodeGen/*.h",
+        "include/llvm/CodeGen/*.def",
         "include/llvm/Target/*.h",
         "include/llvm/Target/*.def",
         "include/llvm/Target/*.inc",