Merge changes from github.

PiperOrigin-RevId: 176695926
Yifei Feng 2017-11-22 13:42:21 -08:00 committed by TensorFlower Gardener
parent e219aeb542
commit b1d8c59e9b
217 changed files with 6304 additions and 1432 deletions

View File

@ -42,7 +42,7 @@ The Code of Conduct also applies within project spaces and in public spaces when
Conflicts in an open source project can take many forms, from someone having a bad day and using harsh and hurtful language in the issue queue, to more serious instances such as sexist/racist statements or threats of violence, and everything in between.
If the behaviour is threatening or harassing, or for other reasons requires immediate escalation, please see below.
If the behavior is threatening or harassing, or for other reasons requires immediate escalation, please see below.
However, for the vast majority of issues, we aim to empower individuals to first resolve conflicts themselves, asking for help when needed, and only after that fails to escalate further. This approach gives people more control over the outcome of their dispute.
@ -55,14 +55,14 @@ If you are experiencing or witnessing conflict, we ask you to use the following
## Reporting Violations
Violations of the Code of Conduct can be reported to TensorFlow's Project Steward at conduct@tensorflow.org. The Project Steward will determine whether the Code of Conduct was violated, and will issue an appropriate sanction, possibly including a written warning or expulsion from the project, project sponsored spaces, or project forums. We ask that you make a good-faith effort to resolve your conflict via the conflict resolution policy before submitting a report.
Violations of the Code of Conduct can be reported to TensorFlow's Project Stewards, Edd Wilder-James (ewj@google.com) and Sarah Novotny (sarahnovotny@google.com). The Project Steward will determine whether the Code of Conduct was violated, and will issue an appropriate sanction, possibly including a written warning or expulsion from the project, project sponsored spaces, or project forums. We ask that you make a good-faith effort to resolve your conflict via the conflict resolution policy before submitting a report.
Violations of the Code of Conduct can occur in any setting, even those unrelated to the project. We will only consider complaints about conduct that has occurred within one year of the report.
## Enforcement
If the Project Steward receives a report alleging a violation of the Code of Conduct, the Project Steward will notify the accused of the report, and provide them an opportunity to discuss the report before a sanction is issued. The Project Steward will do their utmost to keep the reporter anonymous. If the act is ongoing (such as someone engaging in harassment), or involves a threat to anyone's safety (e.g. threats of violence), the Project Steward may issue sanctions without notice.
If the Project Stewards receive a report alleging a violation of the Code of Conduct, the Project Stewards will notify the accused of the report, and provide them an opportunity to discuss the report before a sanction is issued. The Project Stewards will do their utmost to keep the reporter anonymous. If the act is ongoing (such as someone engaging in harassment), or involves a threat to anyone's safety (e.g. threats of violence), the Project Stewards may issue sanctions without notice.
## Attribution

View File

@ -73,11 +73,11 @@ $ python
## For more information
* [TensorFlow website](https://www.tensorflow.org)
* [TensorFlow Website](https://www.tensorflow.org)
* [TensorFlow White Papers](https://www.tensorflow.org/about/bib)
* [TensorFlow Model Zoo](https://github.com/tensorflow/models)
* [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730)
* [TensorFlow course at Stanford](https://web.stanford.edu/class/cs20si)
* [TensorFlow Course at Stanford](https://web.stanford.edu/class/cs20si)
Learn more about the TensorFlow community at the [community page of tensorflow.org](https://www.tensorflow.org/community) for a few ways to participate.

View File

@ -43,6 +43,7 @@ _DEFAULT_CUDA_PATH_WIN = ('C:/Program Files/NVIDIA GPU Computing '
'Toolkit/CUDA/v%s' % _DEFAULT_CUDA_VERSION)
_TF_OPENCL_VERSION = '1.2'
_DEFAULT_COMPUTECPP_TOOLKIT_PATH = '/usr/local/computecpp'
_DEFAULT_TRISYCL_INCLUDE_DIR = '/usr/local/triSYCL/include'
def is_windows():
@ -636,7 +637,7 @@ def set_tf_cuda_version(environ_cp):
write_action_env_to_bazelrc('TF_CUDA_VERSION', tf_cuda_version)
def set_tf_cunn_version(environ_cp):
def set_tf_cudnn_version(environ_cp):
"""Set CUDNN_INSTALL_PATH and TF_CUDNN_VERSION."""
ask_cudnn_version = (
'Please specify the cuDNN version you want to use. '
@ -883,6 +884,28 @@ def set_computecpp_toolkit_path(environ_cp):
computecpp_toolkit_path)
def set_trisycl_include_dir(environ_cp):
"""Set TRISYCL_INCLUDE_DIR."""
ask_trisycl_include_dir = ('Please specify the location of the triSYCL '
'include directory. (Use --config=sycl_trisycl '
'when building with Bazel) '
'[Default is %s]: ') % (
_DEFAULT_TRISYCL_INCLUDE_DIR)
while True:
trisycl_include_dir = get_from_env_or_user_or_default(
environ_cp, 'TRISYCL_INCLUDE_DIR', ask_trisycl_include_dir,
_DEFAULT_TRISYCL_INCLUDE_DIR)
if os.path.exists(trisycl_include_dir):
break
print('Invalid triSYCL include directory, %s cannot be found' %
(trisycl_include_dir))
# Set TRISYCL_INCLUDE_DIR
environ_cp['TRISYCL_INCLUDE_DIR'] = trisycl_include_dir
write_action_env_to_bazelrc('TRISYCL_INCLUDE_DIR', trisycl_include_dir)
def set_mpi_home(environ_cp):
"""Set MPI_HOME."""
default_mpi_home = which('mpirun') or which('mpiexec') or ''
@ -997,6 +1020,8 @@ def main():
environ_cp['TF_NEED_GCP'] = '0'
environ_cp['TF_NEED_HDFS'] = '0'
environ_cp['TF_NEED_JEMALLOC'] = '0'
environ_cp['TF_NEED_OPENCL_SYCL'] = '0'
environ_cp['TF_NEED_COMPUTECPP'] = '0'
environ_cp['TF_NEED_OPENCL'] = '0'
environ_cp['TF_CUDA_CLANG'] = '0'
@ -1018,17 +1043,21 @@ def main():
set_build_var(environ_cp, 'TF_NEED_VERBS', 'VERBS', 'with_verbs_support',
False, 'verbs')
set_action_env_var(environ_cp, 'TF_NEED_OPENCL', 'OpenCL', False)
if environ_cp.get('TF_NEED_OPENCL') == '1':
set_action_env_var(environ_cp, 'TF_NEED_OPENCL_SYCL', 'OpenCL SYCL', False)
if environ_cp.get('TF_NEED_OPENCL_SYCL') == '1':
set_host_cxx_compiler(environ_cp)
set_host_c_compiler(environ_cp)
set_computecpp_toolkit_path(environ_cp)
set_action_env_var(environ_cp, 'TF_NEED_COMPUTECPP', 'ComputeCPP', True)
if environ_cp.get('TF_NEED_COMPUTECPP') == '1':
set_computecpp_toolkit_path(environ_cp)
else:
set_trisycl_include_dir(environ_cp)
set_action_env_var(environ_cp, 'TF_NEED_CUDA', 'CUDA', False)
if (environ_cp.get('TF_NEED_CUDA') == '1' and
'TF_CUDA_CONFIG_REPO' not in environ_cp):
set_tf_cuda_version(environ_cp)
set_tf_cunn_version(environ_cp)
set_tf_cudnn_version(environ_cp)
set_tf_cuda_compute_capabilities(environ_cp)
set_tf_cuda_clang(environ_cp)
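In the new main() flow, TF_NEED_OPENCL_SYCL gates SYCL support as a whole, and TF_NEED_COMPUTECPP then selects between the ComputeCpp toolkit path and the triSYCL include directory (built with --config=sycl_trisycl). A minimal, self-contained sketch of that branch with the prompting and .bazelrc writing stripped out (illustrative only, not the actual configure.py code):

def sycl_backend(environ):
  """Which SYCL backend the new configure branch would set up, or None."""
  if environ.get('TF_NEED_OPENCL_SYCL') != '1':
    return None
  # In the real flow the host C/C++ compilers are configured first
  # (set_host_cxx_compiler, set_host_c_compiler); then the backend is chosen.
  if environ.get('TF_NEED_COMPUTECPP') == '1':
    return 'computecpp'  # set_computecpp_toolkit_path -> COMPUTECPP_TOOLKIT_PATH
  return 'trisycl'       # set_trisycl_include_dir -> TRISYCL_INCLUDE_DIR

assert sycl_backend({'TF_NEED_OPENCL_SYCL': '1', 'TF_NEED_COMPUTECPP': '1'}) == 'computecpp'
assert sycl_backend({'TF_NEED_OPENCL_SYCL': '1', 'TF_NEED_COMPUTECPP': '0'}) == 'trisycl'
assert sycl_backend({'TF_NEED_OPENCL_SYCL': '0'}) is None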

View File

@ -54,6 +54,15 @@ config_setting(
visibility = ["//visibility:public"],
)
config_setting(
name = "raspberry_pi_armeabi",
values = {
"crosstool_top": "@local_config_arm_compiler//:toolchain",
"cpu": "armeabi",
},
visibility = ["//visibility:public"],
)
config_setting(
name = "android_arm",
values = {
@ -760,6 +769,13 @@ tf_cc_shared_object(
],
)
exports_files(
[
"tf_version_script.lds",
"tf_exported_symbols.lds",
],
)
py_library(
name = "tensorflow_py",
srcs = ["__init__.py"],

View File

@ -119,7 +119,7 @@ def tf_library(name, graph, config,
out_nodes_file,
] + freeze_saver_srcs,
outs=[freeze_file],
cmd=("$(location //tensorflow/python/tools:freeze_graph)" +
cmd=("$(location @org_tensorflow//tensorflow/python/tools:freeze_graph)" +
freeze_args),
tools=["@org_tensorflow//tensorflow/python/tools:freeze_graph"],
tags=tags,
@ -130,6 +130,10 @@ def tf_library(name, graph, config,
header_file = name + ".h"
object_file = name + ".o"
ep = ("__" + PACKAGE_NAME + "__" + name).replace("/", "_")
if type(tfcompile_flags) == type(""):
flags = tfcompile_flags
else:
flags = " ".join(["'" + arg.replace("'", "'\\''") + "'" for arg in (tfcompile_flags or [])])
native.genrule(
name=("gen_" + name),
srcs=[
@ -148,7 +152,7 @@ def tf_library(name, graph, config,
" --target_triple=" + target_llvm_triple() +
" --out_header=$(@D)/" + header_file +
" --out_object=$(@D)/" + object_file +
" " + (tfcompile_flags or "")),
" " + flags),
tools=[tfcompile_tool],
visibility=visibility,
testonly=testonly,
@ -185,7 +189,7 @@ def tf_library(name, graph, config,
" --cpp_class=" + cpp_class +
" --target_triple=" + target_llvm_triple() +
" --out_session_module=$(@D)/" + session_module_pb +
" " + (tfcompile_flags or "")),
" " + flags),
tools=[tfcompile_tool],
visibility=visibility,
testonly=testonly,
@ -195,8 +199,7 @@ def tf_library(name, graph, config,
# The cc_library rule packaging up the header and object file, and needed
# kernel implementations.
need_xla_data_proto = (tfcompile_flags and
tfcompile_flags.find("--gen_program_shape") != -1)
need_xla_data_proto = (flags and flags.find("--gen_program_shape") != -1)
native.cc_library(
name=name,
srcs=[object_file],
@ -253,7 +256,7 @@ def tf_library(name, graph, config,
],
outs=[test_file],
cmd=("sed " + sed_replace +
" $(location //tensorflow/compiler/aot:test.cc) " +
" $(location @org_tensorflow//tensorflow/compiler/aot:test.cc) " +
"> $(OUTS)"),
tags=tags,
)
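With this change tf_library() accepts tfcompile_flags either as the old single string or as a list; list elements are single-quoted for the genrule's shell command, with embedded quotes escaped. A small plain-Python sketch of that normalization and the string it produces (the Starlark above uses type(tfcompile_flags) == type("") for the same test; the second flag below is made up just to exercise the escaping):

def quote_tfcompile_flags(tfcompile_flags):
  # A string passes through unchanged; a list is shell-quoted element by element.
  if isinstance(tfcompile_flags, str):
    return tfcompile_flags
  return " ".join("'" + arg.replace("'", "'\\''") + "'"
                  for arg in (tfcompile_flags or []))

print(quote_tfcompile_flags("--xla_cpu_multi_thread_eigen=false"))
# --xla_cpu_multi_thread_eigen=false
print(quote_tfcompile_flags(["--xla_cpu_multi_thread_eigen=false", "--prefix=it's"]))
# '--xla_cpu_multi_thread_eigen=false' '--prefix=it'\''s'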

View File

@ -672,7 +672,7 @@ tf_library(
cpp_class = "LSTMLayerInference",
graph = "lstm_layer_inference.pbtxt",
tags = ["manual"],
tfcompile_flags = "--xla_cpu_multi_thread_eigen=false",
tfcompile_flags = ["--xla_cpu_multi_thread_eigen=false"],
)
# -----------------------------------------------------------------------------

View File

@ -36,7 +36,7 @@ class FusedBatchNormTest(XLATestCase):
x_square = x * x
x_square_sum = np.sum(x_square, (0, 1, 2))
x_sum = np.sum(x, axis=(0, 1, 2))
element_count = np.size(x) / int(np.shape(x)[0])
element_count = np.size(x) / int(np.shape(x)[-1])
mean = x_sum / element_count
var = x_square_sum / element_count - mean * mean
normalized = (x - mean) / np.sqrt(var + epsilon)
@ -64,8 +64,9 @@ class FusedBatchNormTest(XLATestCase):
return grad_x, grad_scale, grad_offset
def testInference(self):
x_shape = [2, 2, 6, 2]
scale_shape = [2]
channel = 3
x_shape = [2, 2, 6, channel]
scale_shape = [channel]
x_val = np.random.random_sample(x_shape).astype(np.float32)
scale_val = np.random.random_sample(scale_shape).astype(np.float32)
@ -74,8 +75,9 @@ class FusedBatchNormTest(XLATestCase):
with self.test_session() as sess, self.test_scope():
# To avoid constant folding
t_val = array_ops.placeholder(np.float32, shape=x_shape, name="x")
scale = array_ops.placeholder(np.float32, shape=[2], name="scale")
offset = array_ops.placeholder(np.float32, shape=[2], name="offset")
scale = array_ops.placeholder(np.float32, shape=scale_shape, name="scale")
offset = array_ops.placeholder(
np.float32, shape=scale_shape, name="offset")
epsilon = 0.001
y_ref, mean_ref, var_ref = self._reference_training(
x_val, scale_val, offset_val, epsilon, data_format)
@ -97,8 +99,9 @@ class FusedBatchNormTest(XLATestCase):
self.assertAllClose(y_val, y_ref, atol=1e-3)
def _testLearning(self, use_gradient_checker):
x_shape = [2, 2, 6, 2]
scale_shape = [2]
channel = 3
x_shape = [2, 2, 6, channel]
scale_shape = [channel]
x_val = np.random.random_sample(x_shape).astype(np.float32)
scale_val = np.random.random_sample(scale_shape).astype(np.float32)
@ -109,8 +112,9 @@ class FusedBatchNormTest(XLATestCase):
with self.test_session() as sess, self.test_scope():
# To avoid constant folding
t_val = array_ops.placeholder(np.float32, shape=x_shape, name="x")
scale = array_ops.placeholder(np.float32, shape=[2], name="scale")
offset = array_ops.placeholder(np.float32, shape=[2], name="offset")
scale = array_ops.placeholder(np.float32, shape=scale_shape, name="scale")
offset = array_ops.placeholder(
np.float32, shape=scale_shape, name="offset")
epsilon = 0.001
y, mean, var = nn.fused_batch_norm(
t_val,
@ -154,8 +158,9 @@ class FusedBatchNormTest(XLATestCase):
def testGradient(self):
# TODO(b/64270657): Use gradient_checker here in addition to comparing with
# this reference implementation.
x_shape = [2, 2, 6, 2]
scale_shape = [2]
channel = 3
x_shape = [2, 2, 6, channel]
scale_shape = [channel]
grad_val = np.random.random_sample(x_shape).astype(np.float32)
x_val = np.random.random_sample(x_shape).astype(np.float32)
scale_val = np.random.random_sample(scale_shape).astype(np.float32)
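The reference statistics here are per channel of an NHWC tensor: sums are taken over the N, H and W axes, so the element count is np.size(x) divided by the channel dimension np.shape(x)[-1] rather than by the batch dimension the old code used, and the channel count is bumped from 2 to 3 so that a wrong axis no longer happens to yield the same count. A short standalone numpy sketch of the corrected per-channel mean/variance (helper name is illustrative, not from the test):

import numpy as np

def per_channel_stats(x, epsilon=0.001):
  # x is NHWC; each channel's statistics use size(x) / num_channels elements.
  x = np.asarray(x, dtype=np.float32)
  element_count = np.size(x) / int(np.shape(x)[-1])
  x_sum = np.sum(x, axis=(0, 1, 2))
  x_square_sum = np.sum(x * x, axis=(0, 1, 2))
  mean = x_sum / element_count
  var = x_square_sum / element_count - mean * mean
  normalized = (x - mean) / np.sqrt(var + epsilon)
  return mean, var, normalized

x = np.random.random_sample([2, 2, 6, 3]).astype(np.float32)  # channel = 3
mean, var, _ = per_channel_stats(x)
np.testing.assert_allclose(mean, x.reshape(-1, 3).mean(axis=0), rtol=1e-4)
np.testing.assert_allclose(var, x.reshape(-1, 3).var(axis=0), rtol=1e-3)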

View File

@ -222,7 +222,7 @@ class HloInstruction {
tensorflow::gtl::ArraySlice<int64> strides);
// Creates a slice instruction, where the first operand is sliced by
// start indices specified in the second operand, and by size specfied in
// start indices specified in the second operand, and by size specified in
// 'slice_sizes'.
static std::unique_ptr<HloInstruction> CreateDynamicSlice(
const Shape& shape, HloInstruction* operand,

View File

@ -792,8 +792,8 @@ TEST_F(HloInstructionTest, ComplexFusionOp) {
// sub = Sub(mul, clamp)
// tuple = Tuple({sub, sub, mul, C1})
//
// Notable complexities are repeated operands in a same instruction, different
// shapes, use of value in different expressions.
// Notable complexities are repeated operands in the same instruction,
// different shapes, use of value in different expressions.
auto c1 = builder.AddInstruction(
HloInstruction::CreateConstant(Literal::CreateR0<float>(1.1f)));
auto c2 = builder.AddInstruction(

View File

@ -37,7 +37,7 @@ set_target_properties(lib_tf PROPERTIES IMPORTED_LOCATION
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIS_SLIM_BUILD \
-std=c++11 -fno-rtti -fno-exceptions \
-O2 -Wno-narrowing -fomit-frame-pointer \
-mfpu=neon -mfloat-abi=softfp -fPIE \
-mfpu=neon -mfloat-abi=softfp -fPIE -fPIC \
-ftemplate-depth=900 \
-DGOOGLE_PROTOBUF_NO_RTTI \
-DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER")

View File

@ -82,6 +82,7 @@ cc_library(
tf_cc_test(
name = "adaptive_shared_batch_scheduler_test",
srcs = ["adaptive_shared_batch_scheduler_test.cc"],
tags = ["manual"], # b/69013768
deps = [
":adaptive_shared_batch_scheduler",
"//tensorflow/contrib/batching/test_util:fake_clock_env",

View File

@ -461,7 +461,7 @@ class BatchResource : public ResourceBase {
return Status::OK();
}
// Looks up the batcher queue for 'queue_name'. If it did't previously exist,
// Looks up the batcher queue for 'queue_name'. If it didn't previously exist,
// creates it.
Status LookupOrCreateBatcherQueue(const string& queue_name,
BatcherQueue** queue) {

View File

@ -759,7 +759,7 @@ class CsiszarVIMCOTest(test.TestCase):
def _csiszar_vimco_helper_grad(self, logu, delta):
"""Finite difference approximation of `grad(csiszar_vimco_helper, logu)`."""
# This code actually estimates the sum of the Jacobiab because thats what
# This code actually estimates the sum of the Jacobiab because that's what
# TF's `gradients` does.
np_log_avg_u1, np_log_sooavg_u1 = self._csiszar_vimco_helper(
logu[..., None] + np.diag([delta]*len(logu)))

View File

@ -34,13 +34,41 @@ option(tensorflow_BUILD_SHARED_LIB "Build TensorFlow as a shared library" OFF)
option(tensorflow_OPTIMIZE_FOR_NATIVE_ARCH "Enable compiler optimizations for the native processor architecture (if available)" ON)
option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions")
option(tensorflow_ENABLE_SNAPPY_SUPPORT "Enable SNAPPY compression support" ON)
if(HAIKU)
option(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE "Enable PIE support" OFF)
else()
option(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE "Enable PIE support" ON)
endif()
if (NOT WIN32)
# Threads: defines CMAKE_THREAD_LIBS_INIT and adds -pthread compile option
# for targets that link ${CMAKE_THREAD_LIBS_INIT}.
find_package (Threads)
option(tensorflow_PATH_STATIC_LIB "Additional library search path for libcudnn_static.a, libnccl_static.a, libculibos.a" /usr/local/cuda/lib64/)
option(tensorflow_CUDNN_INCLUDE "cudnn.h header install path" /usr/include/)
if (NOT tensorflow_CUDNN_INCLUDE)
# option's default value is OFF. Fill it with real default values
set(tensorflow_CUDNN_INCLUDE /usr/include)
endif (NOT tensorflow_CUDNN_INCLUDE)
option(tensorflow_PATH_CUDNN_STATIC_LIB "Override PATH_STATIC_LIB for libcudnn_static.a" ${tensorflow_PATH_STATIC_LIB})
option(tensorflow_PATH_NCCL_STATIC_LIB "Override PATH_STATIC_LIB for libnccl_static.a" ${tensorflow_PATH_STATIC_LIB})
option(tensorflow_CUDA_LIBRARY_PATH "Designate the default CUDA library paths" /usr/local/cuda/lib64)
if (NOT tensorflow_CUDA_LIBRARY_PATH)
# option's default value is OFF. Fill it with real default values
set(tensorflow_CUDA_LIBRARY_PATH /usr/local/cuda/lib64)
endif (NOT tensorflow_CUDA_LIBRARY_PATH)
endif()
if (WIN32)
set(BOOL_WIN32 ON)
else (WIN32)
set(BOOL_WIN32 OFF)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
endif (WIN32)
# [CLEANUP] Remove when done
# For debugging
function(SHOW_VARIABLES)
@ -58,7 +86,12 @@ set (DOWNLOAD_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/downloads"
CACHE PATH "Location where external projects will be downloaded.")
mark_as_advanced(DOWNLOAD_LOCATION)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
if (tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
else()
set(CMAKE_POSITION_INDEPENDENT_CODE OFF)
endif()
add_definitions(-DEIGEN_AVOID_STL_ARRAY)
if(WIN32)
add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC)
@ -217,20 +250,35 @@ endif()
if(UNIX)
list(APPEND tensorflow_EXTERNAL_LIBRARIES ${CMAKE_THREAD_LIBS_INIT} ${CMAKE_DL_LIBS})
endif()
if(HAIKU)
list(APPEND tensorflow_EXTERNAL_LIBRARIES network)
endif()
if (tensorflow_ENABLE_GPU)
if (NOT WIN32)
# Default install paths for cuda libraries in Linux
# In some Linux distros, find_package(CUDA) seems to require CMAKE_LIBRARY_PATH to include cuda-lib paths
list(APPEND CMAKE_LIBRARY_PATH "${tensorflow_CUDA_LIBRARY_PATH}")
list(APPEND CMAKE_LIBRARY_PATH "${tensorflow_CUDA_LIBRARY_PATH}/stubs")
endif (NOT WIN32)
find_package(CUDA 8.0 REQUIRED)
# by default we assume compute cabability 3.5 and 5.2. If you change this change it in
# CUDA_NVCC_FLAGS and cuda_config.h below
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_30,code=\"sm_30,compute_30\";-gencode arch=compute_35,code=\"sm_35,compute_35\";-gencode arch=compute_52,code=\"sm_52,compute_52\")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--include-path ${PROJECT_BINARY_DIR}/$\{build_configuration\};--expt-relaxed-constexpr)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-ftz=true) # Flush denormals to zero
set(CUDA_INCLUDE ${CUDA_TOOLKIT_TARGET_DIR} ${CUDA_TOOLKIT_TARGET_DIR}/extras/CUPTI/include)
include_directories(${CUDA_INCLUDE})
if (WIN32)
find_package(CUDA 8.0 REQUIRED)
# by default we assume compute cabability 3.5 and 5.2. If you change this change it in
# CUDA_NVCC_FLAGS and cuda_config.h below
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_30,code=\"sm_30,compute_30\";-gencode arch=compute_35,code=\"sm_35,compute_35\";-gencode arch=compute_52,code=\"sm_52,compute_52\")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--include-path ${PROJECT_BINARY_DIR}/$\{build_configuration\};--expt-relaxed-constexpr)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-ftz=true) # Flush denormals to zero
set(CUDA_INCLUDE ${CUDA_TOOLKIT_TARGET_DIR} ${CUDA_TOOLKIT_TARGET_DIR}/extras/CUPTI/include)
include_directories(${CUDA_INCLUDE})
add_definitions(-DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=3.0,3.5,5.2)
else (WIN32)
# Without these double quotes, cmake in Linux makes it "-DTF_EXTRA_CUDA_CAPABILITIES=3.0, -D3.5, -D5.2" for cc, which incurs build breaks
add_definitions(-DGOOGLE_CUDA=1 -D"TF_EXTRA_CUDA_CAPABILITIES=3.0,3.5,5.2")
endif (WIN32)
if (WIN32)
# add cudnn
if(NOT CUDNN_HOME)
set(CUDNN_HOME ${CUDA_TOOLKIT_TARGET_DIR})
@ -238,18 +286,48 @@ if (tensorflow_ENABLE_GPU)
include_directories(${CUDNN_HOME})
set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_CUFFT_LIBRARIES}
${CUDA_curand_LIBRARY} ${CUDA_cupti_LIBRARY} ${CUDA_cusolver_LIBRARY} ${CUDNN_HOME}/lib/x64/cudnn.lib)
else (WIN32)
set(CUDNN_INCLUDE "${tensorflow_CUDNN_INCLUDE}")
# create cuda_config.h
FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h
"#ifndef CUDA_CUDA_CONFIG_H_\n"
"#define CUDA_CUDA_CONFIG_H_\n"
"#define TF_CUDA_CAPABILITIES CudaVersion(\"3.0\"),CudaVersion(\"3.5\"),CudaVersion(\"5.2\")\n"
"#define TF_CUDA_VERSION \"64_80\"\n"
"#define TF_CUDNN_VERSION \"64_6\"\n"
"#define TF_CUDA_TOOLKIT_PATH \"${CUDA_TOOLKIT_ROOT_DIR}\"\n"
"#endif // CUDA_CUDA_CONFIG_H_\n"
)
find_library(nccl_STATIC_LIBRARY NAMES libnccl_static.a PATHS ${tensorflow_PATH_NCCL_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
if (NOT nccl_STATIC_LIBRARY)
message(FATAL_ERROR "NCCL is required for GPU-build")
else (NOT nccl_STATIC_LIBRARY)
message("nccl-static: ${nccl_STATIC_LIBRARY}")
# something like /usr/lib64/libnccl_static.a
endif (NOT nccl_STATIC_LIBRARY)
find_library(cudnn_STATIC_LIBRARY NAMES libcudnn_static.a PATHS ${tensorflow_PATH_CUDNN_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
if (NOT cudnn_STATIC_LIBRARY)
message(FATAL_ERROR "CUDNN is required for GPU-build")
else (NOT cudnn_STATIC_LIBRARY)
message("cudnn-static: ${cudnn_STATIC_LIBRARY}")
endif (NOT cudnn_STATIC_LIBRARY)
find_library(culibos_STATIC_LIBRARY NAMES libculibos.a PATHS ${tensorflow_PATH_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
if (NOT culibos_STATIC_LIBRARY)
message(FATAL_ERROR "CULIBOS is required for GPU-build")
else (NOT culibos_STATIC_LIBRARY)
message("culibos-static: ${culibos_STATIC_LIBRARY}")
endif (NOT culibos_STATIC_LIBRARY)
include_directories(${CUDNN_INCLUDE})
set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_CUFFT_LIBRARIES}
${CUDA_curand_LIBRARY} ${CUDA_cupti_LIBRARY} ${CUDA_cusolver_LIBRARY} ${cudnn_STATIC_LIBRARY} ${culibos_STATIC_LIBRARY} ${nccl_STATIC_LIBRARY})
endif (WIN32)
# create cuda_config.h
FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h
"#ifndef CUDA_CUDA_CONFIG_H_\n"
"#define CUDA_CUDA_CONFIG_H_\n"
"#define TF_CUDA_CAPABILITIES CudaVersion(\"3.0\"),CudaVersion(\"3.5\"),CudaVersion(\"5.2\")\n"
"#define TF_CUDA_VERSION \"64_80\"\n"
"#define TF_CUDNN_VERSION \"64_6\"\n"
"#define TF_CUDA_TOOLKIT_PATH \"${CUDA_TOOLKIT_ROOT_DIR}\"\n"
"#endif // CUDA_CUDA_CONFIG_H_\n"
)
if (WIN32)
# tf assumes in various places header files to be in cuda/include. On windows the cuda sdk
# installs them under cuda/version/include and to avoid that we need to change tf we copy a
# few files to cuda/include
@ -261,12 +339,25 @@ if (tensorflow_ENABLE_GPU)
${CUDA_TOOLKIT_TARGET_DIR}/include/cusolverDn.h
DESTINATION ${tensorflow_source_dir}/third_party/gpus/cuda/include
)
include_directories(${tensorflow_source_dir}/third_party/gpus)
# add cuda libraries to tensorflow_EXTERNAL_LIBRARIES
list(APPEND tensorflow_EXTERNAL_LIBRARIES ${CUDA_LIBRARIES})
else(WIN32)
# Linux has slightly differnt install paths than Windows
FILE(COPY
${CUDA_TOOLKIT_TARGET_DIR}/include/cuda.h ${CUDA_TOOLKIT_TARGET_DIR}/include/cuComplex.h
${CUDA_TOOLKIT_TARGET_DIR}/include/cublas_v2.h ${CUDNN_INCLUDE}/cudnn.h
${CUDA_TOOLKIT_TARGET_DIR}/include/cufft.h ${CUDA_TOOLKIT_TARGET_DIR}/include/curand.h
${CUDA_TOOLKIT_TARGET_DIR}/include/cuda_runtime_api.h
${CUDA_TOOLKIT_TARGET_DIR}/include/cusolverDn.h
DESTINATION ${tensorflow_source_dir}/third_party/gpus/cuda/include
)
endif(WIN32)
# NOTE(mrry): Update these flags when the version of CUDA or cuDNN used
# in the default build is upgraded.
include_directories(${tensorflow_source_dir}/third_party/gpus)
# add cuda libraries to tensorflow_EXTERNAL_LIBRARIES
list(APPEND tensorflow_EXTERNAL_LIBRARIES ${CUDA_LIBRARIES})
# NOTE(mrry): Update these flags when the version of CUDA or cuDNN used
# in the default build is upgraded.
if(WIN32)
set(tensorflow_BUILD_INFO_FLAGS --build_config cuda --key_value
msvcp_dll_name=msvcp140.dll
cudart_dll_name=cudart64_80.dll
@ -275,7 +366,9 @@ if (tensorflow_ENABLE_GPU)
cudnn_dll_name=cudnn64_6.dll
cudnn_version_number=6)
else(WIN32)
message(FATAL_ERROR "CMake GPU build is currently only supported on Windows.")
set(tensorflow_BUILD_INFO_FLAGS --build_config cuda --key_value
cuda_version_number=8.0
cudnn_version_number=6)
endif(WIN32)
else(tensorflow_ENABLE_GPU)
set(tensorflow_BUILD_INFO_FLAGS --build_config cpu --key_value
@ -293,9 +386,7 @@ include(tf_core_framework.cmake)
# NOTE: Disabled until issue #3996 is fixed.
# include(tf_stream_executor.cmake)
if (tensorflow_ENABLE_GPU)
if (WIN32)
include(tf_stream_executor.cmake)
endif()
endif()
include(tf_core_cpu.cmake)

View File

@ -39,8 +39,12 @@ ExternalProject_Add(boringssl
# BUILD_IN_SOURCE 1
INSTALL_COMMAND ""
CMAKE_CACHE_ARGS
if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
else()
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
)

View File

@ -42,8 +42,12 @@ ExternalProject_Add(jsoncpp
BUILD_IN_SOURCE 1
INSTALL_COMMAND ""
CMAKE_CACHE_ARGS
if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
else()
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
)

View File

@ -29,10 +29,14 @@ ExternalProject_Add(lmdb
INSTALL_DIR ${lmdb_INSTALL}
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
CMAKE_CACHE_ARGS
if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
else()
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_INSTALL_PREFIX:STRING=${lmdb_INSTALL}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
)
if(WIN32)

View File

@ -41,10 +41,14 @@ ExternalProject_Add(png
INSTALL_DIR ${png_INSTALL}
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
CMAKE_CACHE_ARGS
if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
else()
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_INSTALL_PREFIX:STRING=${png_INSTALL}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DZLIB_ROOT:STRING=${ZLIB_INSTALL}
)

View File

@ -44,8 +44,12 @@ ExternalProject_Add(protobuf
${PROTOBUF_ADDITIONAL_CMAKE_OPTIONS}
INSTALL_COMMAND ""
CMAKE_CACHE_ARGS
if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
else()
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DZLIB_ROOT:STRING=${ZLIB_INSTALL}
)

View File

@ -38,7 +38,11 @@ ExternalProject_Add(re2
BUILD_IN_SOURCE 1
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
CMAKE_CACHE_ARGS
if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
else()
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_INSTALL_PREFIX:STRING=${re2_INSTALL}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
)
)

View File

@ -40,11 +40,15 @@ ExternalProject_Add(snappy
LOG_CONFIGURE ON
LOG_BUILD ON
CMAKE_CACHE_ARGS
if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
else()
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DSNAPPY_BUILD_TESTS:BOOL=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
)
# actually enables snappy in the source code
add_definitions(-DTF_USE_SNAPPY)
add_definitions(-DTF_USE_SNAPPY)

View File

@ -53,9 +53,13 @@ else()
INSTALL_DIR ${sqlite_INSTALL}
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
CMAKE_CACHE_ARGS
if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
else()
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_INSTALL_PREFIX:STRING=${sqlite_INSTALL}
)

View File

@ -42,9 +42,13 @@ ExternalProject_Add(zlib
BUILD_IN_SOURCE 1
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
CMAKE_CACHE_ARGS
if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
else()
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
endif()
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_INSTALL_PREFIX:STRING=${ZLIB_INSTALL}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
)
# put zlib includes in the directory where they are expected

View File

@ -148,7 +148,11 @@ list(REMOVE_ITEM tf_cc_srcs ${tf_cc_test_srcs})
add_library(tf_cc OBJECT ${tf_cc_srcs})
add_dependencies(tf_cc tf_cc_framework tf_cc_ops)
set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib")
if (WIN32)
set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib")
else (WIN32)
set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so")
endif (WIN32)
add_custom_target(tf_extension_ops)
function(AddUserOps)
@ -164,15 +168,13 @@ function(AddUserOps)
# create shared library from source and cuda obj
add_library(${_AT_TARGET} SHARED ${_AT_SOURCES} ${gpu_lib})
target_link_libraries(${_AT_TARGET} ${pywrap_tensorflow_lib})
if(WIN32)
if (tensorflow_ENABLE_GPU AND _AT_GPUSOURCES)
# some ops call out to cuda directly; need to link libs for the cuda dlls
target_link_libraries(${_AT_TARGET} ${CUDA_LIBRARIES})
endif()
if (_AT_DISTCOPY)
add_custom_command(TARGET ${_AT_TARGET} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:${_AT_TARGET}> ${_AT_DISTCOPY}/)
endif()
if (tensorflow_ENABLE_GPU AND _AT_GPUSOURCES)
# some ops call out to cuda directly; need to link libs for the cuda dlls
target_link_libraries(${_AT_TARGET} ${CUDA_LIBRARIES})
endif()
if (_AT_DISTCOPY)
add_custom_command(TARGET ${_AT_TARGET} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:${_AT_TARGET}> ${_AT_DISTCOPY}/)
endif()
if (_AT_DEPENDS)
add_dependencies(${_AT_TARGET} ${_AT_DEPENDS})
@ -180,9 +182,19 @@ function(AddUserOps)
# make sure TF_COMPILE_LIBRARY is not defined for this target
get_target_property(target_compile_flags ${_AT_TARGET} COMPILE_FLAGS)
if(target_compile_flags STREQUAL "target_compile_flags-NOTFOUND")
set(target_compile_flags "/UTF_COMPILE_LIBRARY")
if (WIN32)
set(target_compile_flags "/UTF_COMPILE_LIBRARY")
else (WIN32)
# gcc uses UTF as default
set(target_compile_flags "-finput-charset=UTF-8")
endif (WIN32)
else()
set(target_compile_flags "${target_compile_flags} /UTF_COMPILE_LIBRARY")
if (WIN32)
set(target_compile_flags "${target_compile_flags} /UTF_COMPILE_LIBRARY")
else (WIN32)
# gcc uses UTF as default
set(target_compile_flags "${target_compile_flags} -finput-charset=UTF-8")
endif (WIN32)
endif()
set_target_properties(${_AT_TARGET} PROPERTIES COMPILE_FLAGS ${target_compile_flags})
add_dependencies(tf_extension_ops ${_AT_TARGET})

View File

@ -179,6 +179,7 @@ file(GLOB_RECURSE tf_core_gpu_kernels_srcs
"${tensorflow_source_dir}/tensorflow/contrib/image/kernels/*.cu.cc"
"${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/*.cu.cc"
"${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/*.cu.cc"
"${tensorflow_source_dir}/tensorflow/contrib/resampler/kernels/*.cu.cc"
)
if(WIN32 AND tensorflow_ENABLE_GPU)
@ -202,16 +203,16 @@ endif(WIN32 AND tensorflow_ENABLE_GPU)
add_library(tf_core_kernels OBJECT ${tf_core_kernels_srcs})
add_dependencies(tf_core_kernels tf_core_cpu)
if(WIN32)
if (WIN32)
target_compile_options(tf_core_kernels PRIVATE /MP)
if (tensorflow_ENABLE_GPU)
set_source_files_properties(${tf_core_gpu_kernels_srcs} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
set(tf_core_gpu_kernels_lib tf_core_gpu_kernels)
cuda_add_library(${tf_core_gpu_kernels_lib} ${tf_core_gpu_kernels_srcs})
set_target_properties(${tf_core_gpu_kernels_lib}
PROPERTIES DEBUG_POSTFIX ""
COMPILE_FLAGS "${TF_REGULAR_CXX_FLAGS}"
)
add_dependencies(${tf_core_gpu_kernels_lib} tf_core_cpu)
endif()
endif (WIN32)
if (tensorflow_ENABLE_GPU)
set_source_files_properties(${tf_core_gpu_kernels_srcs} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
set(tf_core_gpu_kernels_lib tf_core_gpu_kernels)
cuda_add_library(${tf_core_gpu_kernels_lib} ${tf_core_gpu_kernels_srcs})
set_target_properties(${tf_core_gpu_kernels_lib}
PROPERTIES DEBUG_POSTFIX ""
COMPILE_FLAGS "${TF_REGULAR_CXX_FLAGS}"
)
add_dependencies(${tf_core_gpu_kernels_lib} tf_core_cpu)
endif()

View File

@ -34,3 +34,8 @@ target_link_libraries(tf_label_image_example PUBLIC
${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
)
install(TARGETS tf_label_image_example
RUNTIME DESTINATION bin
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib)

View File

@ -715,6 +715,9 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name)
set(require_shape_fn 1)
endif()
get_filename_component(GENERATE_PYTHON_OP_LIB_MKDIRPATH ${GENERATE_PYTHON_OP_LIB_DESTINATION} PATH)
file(MAKE_DIRECTORY ${GENERATE_PYTHON_OP_LIB_MKDIRPATH})
# Create a C++ executable that links in the appropriate op
# registrations and generates Python wrapper code based on the
# registered ops.
@ -743,6 +746,7 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name)
${GENERATE_PYTHON_OP_LIB_DESTINATION} PARENT_SCOPE)
endfunction()
GENERATE_PYTHON_OP_LIB("audio_ops")
GENERATE_PYTHON_OP_LIB("array_ops")
GENERATE_PYTHON_OP_LIB("bitwise_ops")
GENERATE_PYTHON_OP_LIB("math_ops")
@ -987,7 +991,7 @@ add_library(pywrap_tensorflow_internal SHARED
$<TARGET_OBJECTS:tf_tools_transform_graph_lib>
$<$<BOOL:${tensorflow_ENABLE_GRPC_SUPPORT}>:$<TARGET_OBJECTS:tf_core_distributed_runtime>>
$<TARGET_OBJECTS:tf_core_kernels>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
${pywrap_tensorflow_deffile}
)
@ -1063,25 +1067,23 @@ if(WIN32)
DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rnn/python/ops/)
endif(WIN32)
if(WIN32)
# include contrib/seq2seq as .so
#
set(tf_beam_search_srcs
"${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc"
"${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h"
"${tensorflow_source_dir}/tensorflow/contrib/seq2seq/ops/beam_search_ops.cc"
)
# include contrib/seq2seq as .so
#
set(tf_beam_search_srcs
"${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc"
"${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h"
"${tensorflow_source_dir}/tensorflow/contrib/seq2seq/ops/beam_search_ops.cc"
)
set(tf_beam_search_gpu_srcs
"${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops_gpu.cu.cc"
)
set(tf_beam_search_gpu_srcs
"${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops_gpu.cu.cc"
)
AddUserOps(TARGET _beam_search_ops
SOURCES "${tf_beam_search_srcs}"
GPUSOURCES ${tf_beam_search_gpu_srcs}
DEPENDS pywrap_tensorflow_internal tf_python_ops
DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/seq2seq/python/ops/)
endif(WIN32)
AddUserOps(TARGET _beam_search_ops
SOURCES "${tf_beam_search_srcs}"
GPUSOURCES ${tf_beam_search_gpu_srcs}
DEPENDS pywrap_tensorflow_internal tf_python_ops
DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/seq2seq/python/ops/)
############################################################
# Build a PIP package containing the TensorFlow runtime.

View File

@ -73,7 +73,7 @@ add_library(tensorflow SHARED
$<TARGET_OBJECTS:tf_tools_transform_graph_lib>
$<$<BOOL:${tensorflow_ENABLE_GRPC_SUPPORT}>:$<TARGET_OBJECTS:tf_core_distributed_runtime>>
$<TARGET_OBJECTS:tf_core_kernels>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
${tensorflow_deffile}
)
@ -94,3 +94,46 @@ endif()
if(WIN32)
add_dependencies(tensorflow tensorflow_static)
endif(WIN32)
install(TARGETS tensorflow
RUNTIME DESTINATION bin
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib)
# install necessary headers
# tensorflow headers
install(DIRECTORY ${tensorflow_source_dir}/tensorflow/cc/
DESTINATION include/tensorflow/cc
FILES_MATCHING PATTERN "*.h")
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tensorflow/cc/
DESTINATION include/tensorflow/cc
FILES_MATCHING PATTERN "*.h")
install(DIRECTORY ${tensorflow_source_dir}/tensorflow/core/
DESTINATION include/tensorflow/core
FILES_MATCHING PATTERN "*.h")
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tensorflow/core/
DESTINATION include/tensorflow/core
FILES_MATCHING PATTERN "*.h")
install(DIRECTORY ${tensorflow_source_dir}/tensorflow/stream_executor/
DESTINATION include/tensorflow/stream_executor
FILES_MATCHING PATTERN "*.h")
# google protobuf headers
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/src/google/
DESTINATION include/google
FILES_MATCHING PATTERN "*.h")
# nsync headers
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/nsync/
DESTINATION include/external/nsync
FILES_MATCHING PATTERN "*.h")
# Eigen directory
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/Eigen/
DESTINATION include/Eigen)
# external directory
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/eigen_archive/
DESTINATION include/external/eigen_archive)
# third_party eigen directory
install(DIRECTORY ${tensorflow_source_dir}/third_party/eigen3/
DESTINATION include/third_party/eigen3)
# unsupported Eigen directory
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/unsupported/Eigen/
DESTINATION include/unsupported/Eigen)

View File

@ -74,6 +74,9 @@ endif()
#)
#list(REMOVE_ITEM tf_stream_executor_srcs ${tf_stream_executor_test_srcs})
if (NOT WIN32)
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lgomp")
endif (NOT WIN32)
add_library(tf_stream_executor OBJECT ${tf_stream_executor_srcs})
add_dependencies(tf_stream_executor

View File

@ -73,7 +73,7 @@ add_executable(${transform_graph}
$<TARGET_OBJECTS:tf_core_direct_session>
$<TARGET_OBJECTS:tf_tools_transform_graph_lib>
$<TARGET_OBJECTS:tf_core_kernels>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
)
@ -95,7 +95,7 @@ add_executable(${summarize_graph}
$<TARGET_OBJECTS:tf_core_direct_session>
$<TARGET_OBJECTS:tf_tools_transform_graph_lib>
$<TARGET_OBJECTS:tf_core_kernels>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
)
@ -117,7 +117,7 @@ add_executable(${compare_graphs}
$<TARGET_OBJECTS:tf_core_direct_session>
$<TARGET_OBJECTS:tf_tools_transform_graph_lib>
$<TARGET_OBJECTS:tf_core_kernels>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
)
@ -138,7 +138,7 @@ add_executable(${benchmark_model}
$<TARGET_OBJECTS:tf_core_ops>
$<TARGET_OBJECTS:tf_core_direct_session>
$<TARGET_OBJECTS:tf_core_kernels>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
)
@ -147,3 +147,8 @@ target_link_libraries(${benchmark_model} PUBLIC
${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
)
install(TARGETS ${transform_graph} ${summarize_graph} ${compare_graphs} ${benchmark_model}
RUNTIME DESTINATION bin
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib)

View File

@ -34,3 +34,8 @@ target_link_libraries(tf_tutorials_example_trainer PUBLIC
${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
)
install(TARGETS tf_tutorials_example_trainer
RUNTIME DESTINATION bin
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib)

View File

@ -395,8 +395,8 @@ class CrfDecodeForwardRnnCell(rnn_cell.RNNCell):
scope: Unused variable scope of this cell.
Returns:
backpointers: [batch_size, num_tags], containing backpointers.
new_state: [batch_size, num_tags], containing new score values.
backpointers: A [batch_size, num_tags] matrix of backpointers.
new_state: A [batch_size, num_tags] matrix of new score values.
"""
# For simplicity, in shape comments, denote:
# 'batch_size' by 'B', 'max_seq_len' by 'T' , 'num_tags' by 'O' (output).
@ -436,8 +436,9 @@ class CrfDecodeBackwardRnnCell(rnn_cell.RNNCell):
"""Build the CrfDecodeBackwardRnnCell.
Args:
inputs: [batch_size, num_tags], backpointer of next step (in time order).
state: [batch_size, 1], next position's tag index.
inputs: A [batch_size, num_tags] matrix of
backpointer of next step (in time order).
state: A [batch_size, 1] matrix of tag index of next step.
scope: Unused variable scope of this cell.
Returns:
@ -461,16 +462,16 @@ def crf_decode(potentials, transition_params, sequence_length):
This is a function for tensor.
Args:
potentials: A [batch_size, max_seq_len, num_tags] tensor, matrix of
potentials: A [batch_size, max_seq_len, num_tags] tensor of
unary potentials.
transition_params: A [num_tags, num_tags] tensor, matrix of
transition_params: A [num_tags, num_tags] matrix of
binary potentials.
sequence_length: A [batch_size] tensor, containing sequence lengths.
sequence_length: A [batch_size] vector of true sequence lengths.
Returns:
decode_tags: A [batch_size, max_seq_len] tensor, with dtype tf.int32.
decode_tags: A [batch_size, max_seq_len] matrix, with dtype `tf.int32`.
Contains the highest scoring tag indices.
best_score: A [batch_size] tensor, containing the score of decode_tags.
best_score: A [batch_size] vector, containing the score of `decode_tags`.
"""
# If max_seq_len is 1, we skip the algorithm and simply return the argmax tag
# and the max activation.
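The reworded docstrings pin the shapes down: potentials is a [batch_size, max_seq_len, num_tags] tensor of unary potentials, transition_params a [num_tags, num_tags] matrix of binary potentials, sequence_length a [batch_size] vector of true lengths, and crf_decode returns int32 decode_tags of shape [batch_size, max_seq_len] together with a [batch_size] best_score vector. A hedged usage sketch against those shapes (graph-mode TF 1.x of this era; the random potentials are purely illustrative):

import numpy as np
import tensorflow as tf

batch_size, max_seq_len, num_tags = 4, 7, 5
potentials = tf.constant(
    np.random.randn(batch_size, max_seq_len, num_tags).astype(np.float32))
transition_params = tf.constant(
    np.random.randn(num_tags, num_tags).astype(np.float32))
sequence_length = tf.constant([7, 5, 7, 3])  # true length of each sequence

# Viterbi decode: highest-scoring tag sequence per example plus its score.
decode_tags, best_score = tf.contrib.crf.crf_decode(
    potentials, transition_params, sequence_length)

with tf.Session() as sess:
  tags, scores = sess.run([decode_tags, best_score])
  print(tags.shape, tags.dtype)   # (4, 7) int32
  print(scores.shape)             # (4,)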

View File

@ -11,6 +11,7 @@ py_test(
size = "small",
srcs = ["batch_dataset_op_test.py"],
srcs_version = "PY2AND3",
tags = ["no_pip"],
deps = [
":dataset_serialization_test",
"//tensorflow/contrib/data/python/ops:dataset_ops",
@ -373,6 +374,7 @@ py_test(
size = "small",
srcs = ["sequence_dataset_op_test.py"],
srcs_version = "PY2AND3",
tags = ["no_pip"],
deps = [
":dataset_serialization_test",
"//tensorflow/contrib/data/python/ops:dataset_ops",
@ -450,6 +452,7 @@ py_test(
size = "small",
srcs = ["zip_dataset_op_test.py"],
srcs_version = "PY2AND3",
tags = ["no_pip"],
deps = [
":dataset_serialization_test",
"//tensorflow/contrib/data/python/ops:dataset_ops",
@ -466,7 +469,10 @@ py_test(
size = "small",
srcs = ["prefetching_ops_test.py"],
srcs_version = "PY2AND3",
tags = ["no_oss"], # b/68785503
tags = [
"manual",
"no_oss", # b/68785503
],
deps = [
"//tensorflow/contrib/data/python/ops:prefetching_py",
"//tensorflow/core:protos_all_py",

View File

@ -140,6 +140,23 @@ cuda_py_test(
],
)
cuda_py_test(
name = "cauchy_test",
size = "medium",
srcs = ["python/kernel_tests/cauchy_test.py"],
additional_deps = [
":distributions_py",
"//third_party/py/numpy",
"//tensorflow/python:array_ops",
"//tensorflow/python:client_testlib",
"//tensorflow/python:framework_for_generated_wrappers",
"//tensorflow/python:framework_test_lib",
"//tensorflow/python:gradients",
"//tensorflow/python:platform_test",
"//tensorflow/python:variables",
],
)
cuda_py_test(
name = "chi2_test",
srcs = ["python/kernel_tests/chi2_test.py"],

View File

@ -24,6 +24,7 @@ from __future__ import print_function
from tensorflow.contrib.distributions.python.ops import bijectors
from tensorflow.contrib.distributions.python.ops.binomial import *
from tensorflow.contrib.distributions.python.ops.cauchy import *
from tensorflow.contrib.distributions.python.ops.chi2 import *
from tensorflow.contrib.distributions.python.ops.conditional_distribution import *
from tensorflow.contrib.distributions.python.ops.conditional_transformed_distribution import *
@ -83,6 +84,7 @@ from tensorflow.python.util.all_util import remove_undocumented
_allowed_symbols = [
'bijectors',
'Cauchy',
'ConditionalDistribution',
'ConditionalTransformedDistribution',
'FULLY_REPARAMETERIZED',

View File

@ -0,0 +1,438 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Cauchy."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import importlib
import numpy as np
from tensorflow.contrib.distributions.python.ops import cauchy as cauchy_lib
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gradients_impl
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
from tensorflow.python.platform import tf_logging
def try_import(name): # pylint: disable=invalid-name
module = None
try:
module = importlib.import_module(name)
except ImportError as e:
tf_logging.warning("Could not import %s: %s" % (name, str(e)))
return module
stats = try_import("scipy.stats")
class CauchyTest(test.TestCase):
def setUp(self):
self._rng = np.random.RandomState(123)
def assertAllFinite(self, tensor):
is_finite = np.isfinite(tensor.eval())
all_true = np.ones_like(is_finite, dtype=np.bool)
self.assertAllEqual(all_true, is_finite)
def _testParamShapes(self, sample_shape, expected):
with self.test_session():
param_shapes = cauchy_lib.Cauchy.param_shapes(sample_shape)
loc_shape, scale_shape = param_shapes["loc"], param_shapes["scale"]
self.assertAllEqual(expected, loc_shape.eval())
self.assertAllEqual(expected, scale_shape.eval())
loc = array_ops.zeros(loc_shape)
scale = array_ops.ones(scale_shape)
self.assertAllEqual(expected,
array_ops.shape(
cauchy_lib.Cauchy(loc, scale).sample()).eval())
def _testParamStaticShapes(self, sample_shape, expected):
param_shapes = cauchy_lib.Cauchy.param_static_shapes(sample_shape)
loc_shape, scale_shape = param_shapes["loc"], param_shapes["scale"]
self.assertEqual(expected, loc_shape)
self.assertEqual(expected, scale_shape)
def testParamShapes(self):
sample_shape = [10, 3, 4]
self._testParamShapes(sample_shape, sample_shape)
self._testParamShapes(constant_op.constant(sample_shape), sample_shape)
def testParamStaticShapes(self):
sample_shape = [10, 3, 4]
self._testParamStaticShapes(sample_shape, sample_shape)
self._testParamStaticShapes(
tensor_shape.TensorShape(sample_shape), sample_shape)
def testCauchyLogPDF(self):
with self.test_session():
batch_size = 6
loc = constant_op.constant([3.0] * batch_size)
scale = constant_op.constant([np.sqrt(10.0)] * batch_size)
x = np.array([-2.5, 2.5, 4.0, 0.0, -1.0, 2.0], dtype=np.float32)
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
log_pdf = cauchy.log_prob(x)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), log_pdf.shape)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(),
log_pdf.eval().shape)
self.assertAllEqual(cauchy.batch_shape, log_pdf.shape)
self.assertAllEqual(cauchy.batch_shape, log_pdf.eval().shape)
pdf = cauchy.prob(x)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), pdf.shape)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), pdf.eval().shape)
self.assertAllEqual(cauchy.batch_shape, pdf.shape)
self.assertAllEqual(cauchy.batch_shape, pdf.eval().shape)
if not stats:
return
expected_log_pdf = stats.cauchy(loc.eval(), scale.eval()).logpdf(x)
self.assertAllClose(expected_log_pdf, log_pdf.eval())
self.assertAllClose(np.exp(expected_log_pdf), pdf.eval())
def testCauchyLogPDFMultidimensional(self):
with self.test_session():
batch_size = 6
loc = constant_op.constant([[3.0, -3.0]] * batch_size)
scale = constant_op.constant(
[[np.sqrt(10.0), np.sqrt(15.0)]] * batch_size)
x = np.array([[-2.5, 2.5, 4.0, 0.0, -1.0, 2.0]], dtype=np.float32).T
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
log_pdf = cauchy.log_prob(x)
log_pdf_values = log_pdf.eval()
self.assertEqual(log_pdf.shape, (6, 2))
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), log_pdf.shape)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(),
log_pdf.eval().shape)
self.assertAllEqual(cauchy.batch_shape, log_pdf.shape)
self.assertAllEqual(cauchy.batch_shape, log_pdf.eval().shape)
pdf = cauchy.prob(x)
pdf_values = pdf.eval()
self.assertEqual(pdf.shape, (6, 2))
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), pdf.shape)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), pdf_values.shape)
self.assertAllEqual(cauchy.batch_shape, pdf.shape)
self.assertAllEqual(cauchy.batch_shape, pdf_values.shape)
if not stats:
return
expected_log_pdf = stats.cauchy(loc.eval(), scale.eval()).logpdf(x)
self.assertAllClose(expected_log_pdf, log_pdf_values)
self.assertAllClose(np.exp(expected_log_pdf), pdf_values)
def testCauchyCDF(self):
with self.test_session():
batch_size = 50
loc = self._rng.randn(batch_size)
scale = self._rng.rand(batch_size) + 1.0
x = np.linspace(-8.0, 8.0, batch_size).astype(np.float64)
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
cdf = cauchy.cdf(x)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), cdf.shape)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), cdf.eval().shape)
self.assertAllEqual(cauchy.batch_shape, cdf.shape)
self.assertAllEqual(cauchy.batch_shape, cdf.eval().shape)
if not stats:
return
expected_cdf = stats.cauchy(loc, scale).cdf(x)
self.assertAllClose(expected_cdf, cdf.eval(), atol=0)
def testCauchySurvivalFunction(self):
with self.test_session():
batch_size = 50
loc = self._rng.randn(batch_size)
scale = self._rng.rand(batch_size) + 1.0
x = np.linspace(-8.0, 8.0, batch_size).astype(np.float64)
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
sf = cauchy.survival_function(x)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), sf.shape)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), sf.eval().shape)
self.assertAllEqual(cauchy.batch_shape, sf.shape)
self.assertAllEqual(cauchy.batch_shape, sf.eval().shape)
if not stats:
return
expected_sf = stats.cauchy(loc, scale).sf(x)
self.assertAllClose(expected_sf, sf.eval(), atol=0)
def testCauchyLogCDF(self):
with self.test_session():
batch_size = 50
loc = self._rng.randn(batch_size)
scale = self._rng.rand(batch_size) + 1.0
x = np.linspace(-100.0, 10.0, batch_size).astype(np.float64)
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
cdf = cauchy.log_cdf(x)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), cdf.shape)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), cdf.eval().shape)
self.assertAllEqual(cauchy.batch_shape, cdf.shape)
self.assertAllEqual(cauchy.batch_shape, cdf.eval().shape)
if not stats:
return
expected_cdf = stats.cauchy(loc, scale).logcdf(x)
self.assertAllClose(expected_cdf, cdf.eval(), atol=0, rtol=1e-5)
def testFiniteGradientAtDifficultPoints(self):
for dtype in [np.float32, np.float64]:
g = ops.Graph()
with g.as_default():
loc = variables.Variable(dtype(0.0))
scale = variables.Variable(dtype(1.0))
dist = cauchy_lib.Cauchy(loc=loc, scale=scale)
x = np.array([-100., -20., -5., 0., 5., 20., 100.]).astype(dtype)
for func in [
dist.cdf, dist.log_cdf, dist.survival_function,
dist.log_survival_function, dist.log_prob, dist.prob
]:
value = func(x)
grads = gradients_impl.gradients(value, [loc, scale])
with self.test_session(graph=g):
variables.global_variables_initializer().run()
self.assertAllFinite(value)
self.assertAllFinite(grads[0])
self.assertAllFinite(grads[1])
def testCauchyLogSurvivalFunction(self):
with self.test_session():
batch_size = 50
loc = self._rng.randn(batch_size)
scale = self._rng.rand(batch_size) + 1.0
x = np.linspace(-10.0, 100.0, batch_size).astype(np.float64)
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
sf = cauchy.log_survival_function(x)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), sf.shape)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), sf.eval().shape)
self.assertAllEqual(cauchy.batch_shape, sf.shape)
self.assertAllEqual(cauchy.batch_shape, sf.eval().shape)
if not stats:
return
expected_sf = stats.cauchy(loc, scale).logsf(x)
self.assertAllClose(expected_sf, sf.eval(), atol=0, rtol=1e-5)
def testCauchyEntropy(self):
with self.test_session():
loc = np.array([1.0, 1.0, 1.0])
scale = np.array([[1.0, 2.0, 3.0]])
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
entropy = cauchy.entropy()
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), entropy.shape)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(),
entropy.eval().shape)
self.assertAllEqual(cauchy.batch_shape, entropy.shape)
self.assertAllEqual(cauchy.batch_shape, entropy.eval().shape)
if not stats:
return
expected_entropy = stats.cauchy(loc, scale[0]).entropy().reshape((1, 3))
self.assertAllClose(expected_entropy, entropy.eval())
def testCauchyMode(self):
with self.test_session():
# Mu will be broadcast to [7, 7, 7].
loc = [7.]
scale = [11., 12., 13.]
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
self.assertAllEqual((3,), cauchy.mode().shape)
self.assertAllEqual([7., 7, 7], cauchy.mode().eval())
def testCauchyMean(self):
with self.test_session():
loc = [1., 2., 3.]
scale = [7.]
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
self.assertAllEqual((3,), cauchy.mean().shape)
self.assertAllEqual([np.nan] * 3, cauchy.mean().eval())
def testCauchyNanMean(self):
with self.test_session():
loc = [1., 2., 3.]
scale = [7.]
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale, allow_nan_stats=False)
with self.assertRaises(ValueError):
cauchy.mean().eval()
def testCauchyQuantile(self):
with self.test_session():
batch_size = 50
loc = self._rng.randn(batch_size)
scale = self._rng.rand(batch_size) + 1.0
p = np.linspace(0.000001, 0.999999, batch_size).astype(np.float64)
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
x = cauchy.quantile(p)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), x.shape)
self.assertAllEqual(cauchy.batch_shape_tensor().eval(), x.eval().shape)
self.assertAllEqual(cauchy.batch_shape, x.shape)
self.assertAllEqual(cauchy.batch_shape, x.eval().shape)
if not stats:
return
expected_x = stats.cauchy(loc, scale).ppf(p)
self.assertAllClose(expected_x, x.eval(), atol=0.)
def testCauchyVariance(self):
with self.test_session():
# scale will be broadcast to [7, 7, 7]
loc = [1., 2., 3.]
scale = [7.]
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
self.assertAllEqual((3,), cauchy.variance().shape)
self.assertAllEqual([np.nan] * 3, cauchy.variance().eval())
def testCauchyNanVariance(self):
with self.test_session():
# scale will be broadcast to [7, 7, 7]
loc = [1., 2., 3.]
scale = [7.]
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale, allow_nan_stats=False)
with self.assertRaises(ValueError):
cauchy.variance().eval()
def testCauchyStandardDeviation(self):
with self.test_session():
# scale will be broadcast to [7, 7, 7]
loc = [1., 2., 3.]
scale = [7.]
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
self.assertAllEqual((3,), cauchy.stddev().shape)
self.assertAllEqual([np.nan] * 3, cauchy.stddev().eval())
def testCauchyNanStandardDeviation(self):
with self.test_session():
# scale will be broadcast to [7, 7, 7]
loc = [1., 2., 3.]
scale = [7.]
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale, allow_nan_stats=False)
with self.assertRaises(ValueError):
cauchy.stddev().eval()
def testCauchySample(self):
with self.test_session():
loc = constant_op.constant(3.0)
scale = constant_op.constant(1.0)
loc_v = 3.0
n = constant_op.constant(100000)
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
samples = cauchy.sample(n)
sample_values = samples.eval()
self.assertEqual(sample_values.shape, (100000,))
self.assertAllClose(np.median(sample_values), loc_v, atol=1e-1)
expected_shape = tensor_shape.TensorShape([n.eval()]).concatenate(
tensor_shape.TensorShape(cauchy.batch_shape_tensor().eval()))
self.assertAllEqual(expected_shape, samples.shape)
self.assertAllEqual(expected_shape, sample_values.shape)
expected_shape = (
tensor_shape.TensorShape([n.eval()]).concatenate(cauchy.batch_shape))
self.assertAllEqual(expected_shape, samples.shape)
self.assertAllEqual(expected_shape, sample_values.shape)
def testCauchySampleMultiDimensional(self):
with self.test_session():
batch_size = 2
loc = constant_op.constant([[3.0, -3.0]] * batch_size)
scale = constant_op.constant([[0.5, 1.0]] * batch_size)
loc_v = [3.0, -3.0]
n = constant_op.constant(100000)
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
samples = cauchy.sample(n)
sample_values = samples.eval()
self.assertEqual(samples.shape, (100000, batch_size, 2))
self.assertAllClose(
np.median(sample_values[:, 0, 0]), loc_v[0], atol=1e-1)
self.assertAllClose(
np.median(sample_values[:, 0, 1]), loc_v[1], atol=1e-1)
expected_shape = tensor_shape.TensorShape([n.eval()]).concatenate(
tensor_shape.TensorShape(cauchy.batch_shape_tensor().eval()))
self.assertAllEqual(expected_shape, samples.shape)
self.assertAllEqual(expected_shape, sample_values.shape)
expected_shape = (
tensor_shape.TensorShape([n.eval()]).concatenate(cauchy.batch_shape))
self.assertAllEqual(expected_shape, samples.shape)
self.assertAllEqual(expected_shape, sample_values.shape)
def testCauchyNegativeScaleFails(self):
with self.test_session():
cauchy = cauchy_lib.Cauchy(loc=[1.], scale=[-5.], validate_args=True)
with self.assertRaisesOpError("Condition x > 0 did not hold"):
cauchy.mode().eval()
def testCauchyShape(self):
with self.test_session():
loc = constant_op.constant([-3.0] * 5)
scale = constant_op.constant(11.0)
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
self.assertEqual(cauchy.batch_shape_tensor().eval(), [5])
self.assertEqual(cauchy.batch_shape, tensor_shape.TensorShape([5]))
self.assertAllEqual(cauchy.event_shape_tensor().eval(), [])
self.assertEqual(cauchy.event_shape, tensor_shape.TensorShape([]))
def testCauchyShapeWithPlaceholders(self):
loc = array_ops.placeholder(dtype=dtypes.float32)
scale = array_ops.placeholder(dtype=dtypes.float32)
cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
with self.test_session() as sess:
# batch_shape should be an unknown ("<unknown>") TensorShape.
self.assertEqual(cauchy.batch_shape, tensor_shape.TensorShape(None))
self.assertEqual(cauchy.event_shape, ())
self.assertAllEqual(cauchy.event_shape_tensor().eval(), [])
self.assertAllEqual(
sess.run(
cauchy.batch_shape_tensor(),
feed_dict={
loc: 5.0,
scale: [1.0, 2.0]
}), [2])
if __name__ == "__main__":
test.main()

View File

@ -0,0 +1,219 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""The Cauchy distribution class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import random_ops
from tensorflow.python.ops.distributions import distribution
__all__ = [
"Cauchy",
]
class Cauchy(distribution.Distribution):
"""The Cauchy distribution with location `loc` and scale `scale`.
#### Mathematical details
The probability density function (pdf) is,
```none
pdf(x; loc, scale) = 1 / (pi * scale * (1 + ((x - loc) / scale)**2))
```
where `loc` is the location, and `scale` is the scale.
The Cauchy distribution is a member of the [location-scale family](
https://en.wikipedia.org/wiki/Location-scale_family), i.e.
```none
X ~ Cauchy(loc=0, scale=1)
Y ~ Cauchy(loc=loc, scale=scale)
Y = loc + scale * X
```
#### Examples
Examples of initialization of one or a batch of distributions.
```python
# Define a single scalar Cauchy distribution.
dist = Cauchy(loc=0., scale=3.)
# Evaluate the cdf at 1, returning a scalar.
dist.cdf(1.)
# Define a batch of two scalar valued Cauchy distributions.
dist = Cauchy(loc=[1, 2.], scale=[11, 22.])
# Evaluate the pdf of the first distribution on 0, and the second on 1.5,
# returning a length two tensor.
dist.prob([0, 1.5])
# Get 3 samples, returning a 3 x 2 tensor.
dist.sample([3])
```
Arguments are broadcast when possible.
```python
# Define a batch of two scalar valued Cauchy distributions.
# Both have median 1, but different scales.
dist = tf.contrib.distributions.Cauchy(loc=1., scale=[11, 22.])
# Evaluate the pdf of both distributions on the same point, 3.0,
# returning a length 2 tensor.
dist.prob(3.0)
```
"""
def __init__(self,
loc,
scale,
validate_args=False,
allow_nan_stats=True,
name="Cauchy"):
"""Construct Cauchy distributions.
The parameters `loc` and `scale` must be shaped in a way that supports
broadcasting (e.g. `loc + scale` is a valid operation).
Args:
loc: Floating point tensor; the modes of the distribution(s).
scale: Floating point tensor; the scales (half-widths) of the distribution(s).
Must contain only positive values.
validate_args: Python `bool`, default `False`. When `True` distribution
parameters are checked for validity despite possibly degrading runtime
performance. When `False` invalid inputs may silently render incorrect
outputs.
allow_nan_stats: Python `bool`, default `True`. When `True`,
statistics (e.g., mean, mode, variance) use the value "`NaN`" to
indicate the result is undefined. When `False`, an exception is raised
if one or more of the statistic's batch members are undefined.
name: Python `str` name prefixed to Ops created by this class.
Raises:
TypeError: if `loc` and `scale` have different `dtype`.
"""
parameters = locals()
with ops.name_scope(name, values=[loc, scale]):
with ops.control_dependencies([check_ops.assert_positive(scale)]
if validate_args else []):
self._loc = array_ops.identity(loc, name="loc")
self._scale = array_ops.identity(scale, name="scale")
check_ops.assert_same_float_dtype([self._loc, self._scale])
super(Cauchy, self).__init__(
dtype=self._scale.dtype,
reparameterization_type=distribution.FULLY_REPARAMETERIZED,
validate_args=validate_args,
allow_nan_stats=allow_nan_stats,
parameters=parameters,
graph_parents=[self._loc, self._scale],
name=name)
@staticmethod
def _param_shapes(sample_shape):
return dict(
zip(("loc", "scale"),
([ops.convert_to_tensor(sample_shape, dtype=dtypes.int32)] * 2)))
@property
def loc(self):
"""Distribution parameter for the mean."""
return self._loc
@property
def scale(self):
"""Distribution parameter for standard deviation."""
return self._scale
def _batch_shape_tensor(self):
return array_ops.broadcast_dynamic_shape(
array_ops.shape(self.loc), array_ops.shape(self.scale))
def _batch_shape(self):
return array_ops.broadcast_static_shape(self.loc.shape, self.scale.shape)
def _event_shape_tensor(self):
return constant_op.constant([], dtype=dtypes.int32)
def _event_shape(self):
return tensor_shape.scalar()
def _sample_n(self, n, seed=None):
shape = array_ops.concat([[n], self.batch_shape_tensor()], 0)
probs = random_ops.random_uniform(
shape=shape, minval=0., maxval=1., dtype=self.dtype, seed=seed)
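# Inverse-CDF sampling: Uniform(0, 1) draws pushed through the closed-form
# Cauchy quantile function yield exact Cauchy(loc, scale) samples.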
return self._quantile(probs)
def _log_prob(self, x):
return self._log_unnormalized_prob(x) - self._log_normalization()
def _cdf(self, x):
return math_ops.atan(self._z(x)) / np.pi + 0.5
def _log_cdf(self, x):
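# log(cdf) written via log1p for numerical stability:
# log(atan(z)/pi + 0.5) = log1p(2 * atan(z) / pi) - log(2).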
return math_ops.log1p(2 / np.pi * math_ops.atan(self._z(x))) - np.log(2)
def _log_unnormalized_prob(self, x):
return -math_ops.log1p(math_ops.square(self._z(x)))
def _log_normalization(self):
return np.log(np.pi) + math_ops.log(self.scale)
def _entropy(self):
h = np.log(4 * np.pi) + math_ops.log(self.scale)
return h * array_ops.ones_like(self.loc)
def _quantile(self, p):
return self.loc + self.scale * math_ops.tan(np.pi * (p - 0.5))
def _mode(self):
return self.loc * array_ops.ones_like(self.scale)
def _z(self, x):
"""Standardize input `x`."""
with ops.name_scope("standardize", values=[x]):
return (x - self.loc) / self.scale
def _inv_z(self, z):
"""Reconstruct input `x` from a its normalized version."""
with ops.name_scope("reconstruct", values=[z]):
return z * self.scale + self.loc
def _mean(self):
if self.allow_nan_stats:
return array_ops.fill(self.batch_shape_tensor(),
self.dtype.as_numpy_dtype(np.nan))
else:
raise ValueError("`mean` is undefined for Cauchy distribution.")
def _stddev(self):
if self.allow_nan_stats:
return array_ops.fill(self.batch_shape_tensor(),
self.dtype.as_numpy_dtype(np.nan))
else:
raise ValueError("`stddev` is undefined for Cauchy distribution.")

View File

@ -429,7 +429,9 @@
"cpu_tensor = tf.random_normal([SIZE, SIZE])\n",
"\n",
"if is_gpu_available:\n",
" gpu_tensor = cpu_tensor.gpu()"
" gpu_tensor = cpu_tensor.gpu()\n",
"else:\n",
" print(\"GPU not available.\")"
]
},
{

View File

@ -383,7 +383,7 @@
"\n",
"`implicit_value_and_gradients()` returns a function that accepts the same inputs as the function passed in, and returns a tuple consisting of:\n",
"\n",
"1. the value returned by the function passed in (in this case, the loss calculated by `calculate_linear_model_loss()`), and\n",
"1. the value returned by the function passed in (in this case, the loss calculated by `loss_fn()`), and\n",
"1. a list of tuples consisting of:\n",
" 1. The value of the gradient (a `tf.Tensor`) with respect to a given variable\n",
" 1. The corresponding variable (`tf.Variable`)\n",
@ -698,7 +698,7 @@
"source": [
"## Other Ways to Compute Gradients\n",
"\n",
"Using our loss function as an example (`calculate_linear_model_loss()`), there are several other ways we could compute gradients:\n",
"Using our loss function as an example (`loss_fn()`), there are several other ways we could compute gradients:\n",
"\n",
"1. `tfe.implicit_gradients()`\n",
"1. `tfe.gradients_function()`\n",
@ -841,7 +841,7 @@
"# tfe.implicit_value_and_gradients() demo\n",
"value_gradients_fn = tfe.implicit_value_and_gradients(loss_fn)\n",
"\n",
"# Returns only gradients:\n",
"# Returns the value returned by the function passed in, gradients, and variables:\n",
"value_gradients_fn(inputs, labels, wb)"
]
}

View File

@ -9,7 +9,7 @@
"source": [
"# Eager Execution Tutorial: Importing Data\n",
"\n",
"This notebook demonstrates the use of the [`tf.contrib.data.Dataset` API](https://www.tensorflow.org/programmers_guide/datasets) to build pipelines to feed data to your program. It covers:\n",
"This notebook demonstrates the use of the [`tf.data.Dataset` API](https://www.tensorflow.org/programmers_guide/datasets) to build pipelines to feed data to your program. It covers:\n",
"\n",
"* Creating a `Dataset`.\n",
"* Iteration over a `Dataset` with eager execution enabled.\n",
@ -64,7 +64,7 @@
"source": [
"# Step 1: Create a source `Dataset`\n",
"\n",
"Create a _source_ dataset using one of the factory functions like [`Dataset.from_tensors`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset#from_tensors), [`Dataset.from_tensor_slices`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset#from_tensor_slices) or using objects that read from files like [`TextLineDataset`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/TextLineDataset) or [`TFRecordDataset`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/TFRecordDataset). See the [Programmer's Guide](https://www.google.com/url?sa=D\u0026q=https%3A%2F%2Fwww.tensorflow.org%2Fprogrammers_guide%2Fdatasets%23reading_input_data) for more information."
"Create a _source_ dataset using one of the factory functions like [`Dataset.from_tensors`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_tensors), [`Dataset.from_tensor_slices`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_tensor_slices) or using objects that read from files like [`TextLineDataset`](https://www.tensorflow.org/api_docs/python/tf/data/TextLineDataset) or [`TFRecordDataset`](https://www.tensorflow.org/api_docs/python/tf/data/TFRecordDataset). See the [Programmer's Guide](https://www.google.com/url?sa=D\u0026q=https%3A%2F%2Fwww.tensorflow.org%2Fprogrammers_guide%2Fdatasets%23reading_input_data) for more information."
]
},
{
@ -83,7 +83,7 @@
},
"outputs": [],
"source": [
"ds_tensors = tf.contrib.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6])\n",
"ds_tensors = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6])\n",
"\n",
"# Create a CSV file\n",
"import tempfile\n",
@ -93,7 +93,7 @@
"Line 2\n",
"Line 3\n",
" \"\"\")\n",
"ds_file = tf.contrib.data.TextLineDataset(filename)\n"
"ds_file = tf.data.TextLineDataset(filename)\n"
]
},
{
@ -105,7 +105,7 @@
"source": [
"# Step 2: Apply transformations\n",
"\n",
"Use the transformations functions like [`map`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset#map), [`batch`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset#batch), [`shuffle`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset#shuffle) etc. to apply transformations to the records of the dataset. See the [API documentation for `tf.contrib.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset) for details."
"Use the transformations functions like [`map`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#map), [`batch`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#batch), [`shuffle`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#shuffle) etc. to apply transformations to the records of the dataset. See the [API documentation for `tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) for details."
]
},
{

View File

@ -286,7 +286,6 @@ def _fused_batch_norm(inputs,
ValueError: If the rank of `inputs` is neither 2 or 4.
ValueError: If rank or `C` dimension of `inputs` is undefined.
"""
# TODO(reedwm): Add support for fp16 inputs.
if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
raise ValueError('data_format has to be either NCHW or NHWC.')
with variable_scope.variable_scope(
@ -310,7 +309,6 @@ def _fused_batch_norm(inputs,
new_shape = [-1, channels, 1, 1]
inputs = array_ops.reshape(inputs, new_shape)
inputs_shape = inputs.get_shape()
dtype = inputs.dtype.base_dtype
if data_format == DATA_FORMAT_NHWC:
params_shape = inputs_shape[-1:]
else:
@ -320,9 +318,10 @@ def _fused_batch_norm(inputs,
(inputs.name, params_shape))
# Allocate parameters for the beta and gamma of the normalization.
trainable_beta = trainable and center
beta_collections = utils.get_variable_collections(variables_collections,
'beta')
# Float32 required to avoid precision-loss when using fp16 input/output
variable_dtype = dtypes.float32
if not param_initializers:
param_initializers = {}
if not param_regularizers:
@ -336,13 +335,13 @@ def _fused_batch_norm(inputs,
beta = variables.model_variable(
'beta',
shape=params_shape,
dtype=dtype,
dtype=variable_dtype,
initializer=beta_initializer,
regularizer=beta_regularizer,
collections=beta_collections,
trainable=trainable_beta)
trainable=trainable)
else:
beta = array_ops.constant(0.0, shape=params_shape)
beta = array_ops.constant(0.0, dtype=variable_dtype, shape=params_shape)
if scale:
gamma_collections = utils.get_variable_collections(
@ -352,13 +351,13 @@ def _fused_batch_norm(inputs,
gamma = variables.model_variable(
'gamma',
shape=params_shape,
dtype=dtype,
dtype=variable_dtype,
initializer=gamma_initializer,
regularizer=gamma_regularizer,
collections=gamma_collections,
trainable=trainable)
else:
gamma = array_ops.constant(1.0, shape=params_shape)
gamma = array_ops.constant(1.0, dtype=variable_dtype, shape=params_shape)
# Create moving_mean and moving_variance variables and add them to the
# appropriate collections. We disable variable partitioning while creating
@ -375,7 +374,7 @@ def _fused_batch_norm(inputs,
moving_mean = variables.model_variable(
'moving_mean',
shape=params_shape,
dtype=dtype,
dtype=variable_dtype,
initializer=moving_mean_initializer,
trainable=False,
collections=moving_mean_collections)
@ -386,7 +385,7 @@ def _fused_batch_norm(inputs,
moving_variance = variables.model_variable(
'moving_variance',
shape=params_shape,
dtype=dtype,
dtype=variable_dtype,
initializer=moving_variance_initializer,
trainable=False,
collections=moving_variance_collections)

View File

@ -1774,10 +1774,13 @@ class BatchNormTest(test.TestCase):
with self.assertRaisesRegexp(ValueError, 'undefined'):
_layers.batch_norm(inputs, data_format='NCHW')
def _testCreateOp(self, fused):
def _testCreateOp(self, fused, dtype=None):
if dtype is None:
dtype = dtypes.float32
height, width = 3, 3
with self.test_session():
images = np.random.uniform(size=(5, height, width, 3)).astype('f')
images = np.random.uniform(size=(5, height, width, 3)).astype(
dtype.as_numpy_dtype)
output = _layers.batch_norm(images, fused=fused)
expected_name = ('BatchNorm/FusedBatchNorm' if fused else
'BatchNorm/batchnorm')
@ -1792,6 +1795,9 @@ class BatchNormTest(test.TestCase):
def testCreateOpFused(self):
self._testCreateOp(True)
def testCreateOpFusedFloat16(self):
self._testCreateOp(True, dtypes.float16)
def _testCreateOpBetaRegularizer(self, fused=True):
height, width = 3, 3
with self.test_session():
@ -2659,11 +2665,64 @@ class BatchNormTest(test.TestCase):
def testBatchNormBeta(self):
# Test case for 11673
with self.test_session() as sess:
a = array_ops.placeholder(dtypes.float32, shape=(10, 10, 10, 10))
b = _layers.batch_norm(a, center=False, data_format='NCHW',
zero_debias_moving_mean=True)
a_32 = array_ops.placeholder(dtypes.float32, shape=(10, 10, 10, 10))
_layers.batch_norm(
a_32, center=False, data_format='NCHW', zero_debias_moving_mean=True)
a_16 = array_ops.placeholder(dtypes.float16, shape=(10, 10, 10, 10))
_layers.batch_norm(
a_16, center=False, data_format='NCHW', zero_debias_moving_mean=True)
sess.run(variables_lib.global_variables_initializer())
def testVariablesAreFloat32(self):
height, width = 3, 3
with self.test_session():
images = random_ops.random_uniform(
(5, height, width, 3), seed=1, dtype=dtypes.float16)
_layers.batch_norm(images, scale=True)
beta = variables.get_variables_by_name('beta')[0]
gamma = variables.get_variables_by_name('gamma')[0]
self.assertEqual(beta.dtype, dtypes.float32_ref)
self.assertEqual(gamma.dtype, dtypes.float32_ref)
moving_mean = variables.get_variables_by_name('moving_mean')[0]
moving_variance = variables.get_variables_by_name('moving_variance')[0]
self.assertEqual(moving_mean.dtype, dtypes.float32_ref)
self.assertEqual(moving_variance.dtype, dtypes.float32_ref)
def _runFusedBatchNorm(self, shape, dtype):
channels = shape[1]
images = np.arange(np.product(shape), dtype=dtype).reshape(shape)
beta = init_ops.constant_initializer(
np.arange(2, channels + 2, dtype=np.float32))
gamma = init_ops.constant_initializer(
np.arange(10, channels + 10, dtype=np.float32) * 2.0)
mean = init_ops.constant_initializer(
np.arange(3, channels + 3, dtype=np.float32) * 5.0)
variance = init_ops.constant_initializer(
np.arange(1, channels + 1, dtype=np.float32) * 4.0)
output = _layers.batch_norm(
images,
fused=True,
is_training=True,
scale=True,
epsilon=0.5,
param_initializers={
'beta': beta,
'gamma': gamma,
'moving_mean': mean,
'moving_variance': variance,
},
data_format='NCHW')
with self.test_session(use_gpu=True) as sess:
sess.run(variables_lib.global_variables_initializer())
return sess.run(output)
def testFusedBatchNormFloat16MatchesFloat32(self):
if test.is_gpu_available(cuda_only=True):
shape = [5, 4, 2, 3]
res_32 = self._runFusedBatchNorm(shape, np.float32)
res_16 = self._runFusedBatchNorm(shape, np.float16)
self.assertAllClose(res_32, res_16, rtol=1e-3)
def testAdjustmentCreated(self):
# Tests that the adjustment is appropriately passed to and used by the core
# BN layer.

View File

@ -119,7 +119,7 @@ class Head(object):
update_op = tf.contrib.layers.optimize_loss(optimizer=sync,
loss=model_fn_ops.loss, ...)
hooks = [sync.make_session_run_hook(is_chief)]
... upate train_op and hooks in ModelFnOps and return
... update train_op and hooks in ModelFnOps and return
```
"""
__metaclass__ = abc.ABCMeta

View File

@ -23,7 +23,6 @@ import collections
import six
from tensorflow.contrib import framework as contrib_framework
from tensorflow.contrib.framework import get_graph_from_inputs
from tensorflow.contrib.learn.python.learn.estimators import constants
from tensorflow.contrib.learn.python.learn.estimators import metric_key
@ -32,6 +31,7 @@ from tensorflow.python.estimator import model_fn as core_model_fn_lib
from tensorflow.python.estimator.export import export_output as core_export_lib
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor
from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops
from tensorflow.python.platform import tf_logging as logging
@ -156,11 +156,11 @@ class ModelFnOps(
else:
if isinstance(predictions, dict):
predictions = {
k: contrib_framework.convert_to_tensor_or_sparse_tensor(v)
k: sparse_tensor.convert_to_tensor_or_sparse_tensor(v)
for k, v in six.iteritems(predictions)
}
else:
predictions = contrib_framework.convert_to_tensor_or_sparse_tensor(
predictions = sparse_tensor.convert_to_tensor_or_sparse_tensor(
predictions)
# Validate eval_metric_ops

View File

@ -28,13 +28,13 @@ import six
from six.moves import xrange # pylint: disable=redefined-builtin
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import tensor_util
from tensorflow.python.ops import array_ops
from tensorflow.python.platform import tf_logging as logging
# pylint: disable=g-multiple-import,g-bad-import-order
from .pandas_io import HAS_PANDAS, extract_pandas_data, extract_pandas_matrix, extract_pandas_labels
from .dask_io import HAS_DASK, extract_dask_data, extract_dask_labels
# pylint: enable=g-multiple-import,g-bad-import-order
@ -365,8 +365,14 @@ class DataFeeder(object):
self.random_state = np.random.RandomState(
42) if random_state is None else random_state
num_samples = list(self._x.values())[0].shape[
0] if x_is_dict else self._x.shape[0]
if x_is_dict:
num_samples = list(self._x.values())[0].shape[0]
elif tensor_util.is_tensor(self._x):
num_samples = self._x.shape[
0].value # shape will be a Dimension, extract an int
else:
num_samples = self._x.shape[0]
if self._shuffle:
self.indices = self.random_state.permutation(num_samples)
else:

View File

@ -238,10 +238,10 @@ class SdcaModel(object):
with name_scope('sdca/prediction'):
sparse_variables = self._convert_n_to_tensor(self._variables[
'sparse_features_weights'])
result = 0.0
result_sparse = 0.0
for sfc, sv in zip(examples['sparse_features'], sparse_variables):
# TODO(sibyl-Aix6ihai): following does not take care of missing features.
result += math_ops.segment_sum(
result_sparse += math_ops.segment_sum(
math_ops.multiply(
array_ops.gather(sv, sfc.feature_indices), sfc.feature_values),
sfc.example_indices)
@ -249,12 +249,14 @@ class SdcaModel(object):
dense_variables = self._convert_n_to_tensor(self._variables[
'dense_features_weights'])
result_dense = 0.0
for i in range(len(dense_variables)):
result += math_ops.matmul(dense_features[i],
array_ops.expand_dims(dense_variables[i], -1))
result_dense += math_ops.matmul(dense_features[i],
array_ops.expand_dims(
dense_variables[i], -1))
# Reshaping to allow shape inference at graph construction time.
return array_ops.reshape(result, [-1])
return array_ops.reshape(result_dense, [-1]) + result_sparse
def predictions(self, examples):
"""Add operations to compute predictions by the model.

View File

@ -23,6 +23,7 @@ py_library(
py_test(
name = "lite_test",
srcs = ["lite_test.py"],
srcs_version = "PY2AND3",
deps = [
":lite",
"//tensorflow/python:array_ops",

View File

@ -36,6 +36,11 @@ import traceback
import zipfile
import numpy as np
from six import StringIO
# TODO(aselle): Disable GPU for now
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# pylint: disable=g-import-not-at-top
import tensorflow as tf
from google.protobuf import text_format
# TODO(aselle): switch to TensorFlow's resource_loader
@ -379,12 +384,13 @@ def make_zip_of_tests(zip_path,
report["toco_log"] = ""
tf.reset_default_graph()
try:
inputs, outputs = make_graph(param_dict_real)
except (tf.errors.UnimplementedError, tf.errors.InvalidArgumentError,
ValueError):
report["tf_log"] += traceback.format_exc()
return None, report
with tf.device("/cpu:0"):
try:
inputs, outputs = make_graph(param_dict_real)
except (tf.errors.UnimplementedError, tf.errors.InvalidArgumentError,
ValueError):
report["tf_log"] += traceback.format_exc()
return None, report
sess = tf.Session()
try:

View File

@ -61,6 +61,7 @@ tf_py_test(
data = [
":toco_from_protos",
],
tags = ["no_pip"],
)
filegroup(

View File

@ -314,7 +314,8 @@ ifeq ($(TARGET),ANDROID)
-Wno-narrowing \
-fomit-frame-pointer \
$(MARCH_OPTION) \
-fPIE
-fPIE \
-fPIC
INCLUDES = \
-I$(NDK_ROOT)/sources/android/support/include \
-I$(NDK_ROOT)/sources/cxx-stl/gnu-libstdc++/4.9/include \

View File

@ -174,10 +174,26 @@ tensorflow/contrib/makefile/build_all_ios.sh
This process will take around twenty minutes on a modern MacBook Pro.
When it completes, you will have a library for a single architecture and the
benchmark program. Although successfully compiling the benchmark program is a
When it completes, you will have a unified library for all architectures
(i386sim, x86_64sim, armv7, armv7s and arm64) and the benchmark program.
Although successfully compiling the benchmark program is a
sign of success, the program is not a complete iOS app.
If you would like to build only one architecture to save time
(iOS 11+ supports only 64-bit, so you can get away with arm64):
```bash
tensorflow/contrib/makefile/build_all_ios.sh -a arm64
```
After the first build, if you would like to build just the TensorFlow
library, you can pass the -T flag to avoid a clean and rebuild. If you have
modified only one file, regenerating the library should take just a few seconds.
```bash
tensorflow/contrib/makefile/build_all_ios.sh -a arm64 -T
```
To see TensorFlow running on iOS, the example Xcode project in
[tensorflow/examples/ios](../../examples/ios/) shows how to use the static
library in a simple app.
@ -193,19 +209,18 @@ If you have not already, you will need to download dependencies:
tensorflow/contrib/makefile/download_dependencies.sh
```
Next, you will need to compile protobufs for iOS:
Next, you will need to compile protobufs for iOS (optionally takes the -a $ARCH flag):
```bash
tensorflow/contrib/makefile/compile_ios_protobuf.sh
tensorflow/contrib/makefile/compile_ios_protobuf.sh
```
Then, you will need to compile the nsync library for iOS:
Then, you will need to compile the nsync library for iOS (this script also optionally takes the -a $ARCH flag):
```bash
export HOST_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh`
export TARGET_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh -t ios`
```
Then, you can run the makefile specifying iOS as the target, along with the
architecture you want to build for:
@ -219,10 +234,6 @@ This creates a library in
`tensorflow/contrib/makefile/gen/lib/libtensorflow-core.a` that you can link any
xcode project against.
At this point, you will have a library for a single architecture and the
benchmark program. Although successfully compiling the benchmark program is a
sign of success, the program is not a complete iOS app.
To see TensorFlow running on iOS, the example Xcode project in
[tensorflow/examples/ios](../../examples/ios/) shows how to use the static
library in a simple app.
@ -237,6 +248,14 @@ time follow it with:
compile_ios_tensorflow.sh
```
`compile_ios_tensorflow.sh` takes the -a flag to build only for one architecture.
If you run into issues with unresolved nsync symbols, you can also pass
-h ${HOST_NSYNC_LIB} and -n ${TARGET_NSYNC_LIB}, so the invocation would look like:
```bash
tensorflow/contrib/makefile/compile_ios_tensorflow.sh -f "-O3" -h tensorflow/contrib/makefile/downloads/nsync/builds/default.macos.c++11/nsync.a -n tensorflow/contrib/makefile/downloads/nsync/builds/lipo.ios.c++11/nsync.a -a arm64
```
In Xcode, you will need to use -force_load in the linker flags
section of the build settings to pull in the global constructors that are used
to register ops and kernels.
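For example, assuming the static library was produced at the default makefile output path, the "Other Linker Flags" entry would look something like this (the exact relative path depends on where your app project sits relative to the TensorFlow checkout):

```bash
-force_load $(SRCROOT)/../../tensorflow/contrib/makefile/gen/lib/libtensorflow-core.a
```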
@ -249,7 +268,7 @@ debug mode. If you are concerned about performance or are working on a release
build, you would likely want a higher optimization setting, like so:
```bash
compile_ios_tensorflow.sh "-Os"
compile_ios_tensorflow.sh -f "-Os"
```
For other variations of valid optimization flags, see [clang optimization levels](http://stackoverflow.com/questions/15548023/clang-optimization-levels).

View File

@ -23,14 +23,29 @@ if [[ $(uname) != "Darwin" ]]; then
exit 1
fi
usage() {
echo "Usage: $(basename "$0") [-a:T]"
echo "-a [build_arch] build only for specified arch x86_64 [default=all]"
echo "-T only build tensorflow (dont download other deps etc)"
exit 1
}
while getopts "a:T" opt_name; do
case "$opt_name" in
a) BUILD_ARCH="${OPTARG}";;
T) ONLY_MAKE_TENSORFLOW="true";;
*) usage;;
esac
done
shift $((OPTIND - 1))
# Make sure we're in the correct directory, at the root of the source tree.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd ${SCRIPT_DIR}/../../../
# Remove any old files first.
make -f tensorflow/contrib/makefile/Makefile clean
rm -rf tensorflow/contrib/makefile/downloads
source "${SCRIPT_DIR}/build_helper.subr"
JOB_COUNT="${JOB_COUNT:-$(get_job_count)}"
# Setting a deployment target is required for building with bitcode,
# otherwise linking will fail with:
@ -41,20 +56,37 @@ if [[ -n MACOSX_DEPLOYMENT_TARGET ]]; then
export MACOSX_DEPLOYMENT_TARGET=$(sw_vers -productVersion)
fi
# Pull down the required versions of the frameworks we need.
tensorflow/contrib/makefile/download_dependencies.sh
if [[ "${ONLY_MAKE_TENSORFLOW}" != "true" ]]; then
# Remove any old files first.
make -f tensorflow/contrib/makefile/Makefile clean
rm -rf tensorflow/contrib/makefile/downloads
# Compile protobuf for the target iOS device architectures.
tensorflow/contrib/makefile/compile_ios_protobuf.sh
# Pull down the required versions of the frameworks we need.
tensorflow/contrib/makefile/download_dependencies.sh
# Compile protobuf for the target iOS device architectures.
tensorflow/contrib/makefile/compile_ios_protobuf.sh
fi
# Compile nsync for the target iOS device architectures.
# Don't use export var=`something` syntax; it swallows the exit status.
HOST_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh`
TARGET_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh -t ios`
if [[ -z "${BUILD_ARCH}" ]]; then
# No arch specified so build all architectures
TARGET_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh -t ios`
else
# arch specified so build just that
TARGET_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh -t ios -a ${BUILD_ARCH}`
fi
export HOST_NSYNC_LIB TARGET_NSYNC_LIB
# Build the iOS TensorFlow libraries.
tensorflow/contrib/makefile/compile_ios_tensorflow.sh "-O3"
if [[ -z "${BUILD_ARCH}" ]]; then
# build the ios tensorflow libraries.
tensorflow/contrib/makefile/compile_ios_tensorflow.sh -f "-O3" -h $HOST_NSYNC_LIB -n $TARGET_NSYNC_LIB
else
# arch specified so build just that
tensorflow/contrib/makefile/compile_ios_tensorflow.sh -f "-O3" -a "${BUILD_ARCH}" -h $HOST_NSYNC_LIB -n $TARGET_NSYNC_LIB
fi
# Creates a static universal library in
# tensorflow/contrib/makefile/gen/lib/libtensorflow-core.a

View File

@ -21,10 +21,28 @@ if [[ -n MACOSX_DEPLOYMENT_TARGET ]]; then
export MACOSX_DEPLOYMENT_TARGET=$(sw_vers -productVersion)
fi
SCRIPT_DIR=$(dirname $0)
usage() {
echo "Usage: $(basename "$0") [-a]"
echo "-a [build_arch] build for specified arch comma separate for multiple archs (eg: x86_64,arm64)"
echo "default arch i386, x86_64, armv7, armv7s, arm64"
exit 1
}
BUILD_TARGET="i386 x86_64 armv7 armv7s arm64"
while getopts "a:" opt_name; do
case "$opt_name" in
a) BUILD_TARGET="${OPTARG}";;
*) usage;;
esac
done
shift $((OPTIND - 1))
IFS=' ' read -r -a build_targets <<< "${BUILD_TARGET}"
SCRIPT_DIR=$(cd `dirname $0` && pwd)
source "${SCRIPT_DIR}/build_helper.subr"
cd tensorflow/contrib/makefile
cd ${SCRIPT_DIR}
HOST_GENDIR="$(pwd)/gen/protobuf-host"
mkdir -p "${HOST_GENDIR}"
@ -64,6 +82,10 @@ else
echo "protoc found. Skip building host tools."
fi
# Remove old libs
rm -f ${LIBDIR}/libprotobuf.a
rm -f ${LIBDIR}/libprotobuf-lite.a
./autogen.sh
if [ $? -ne 0 ]
then
@ -71,157 +93,192 @@ then
exit 1
fi
make distclean
./configure \
--host=i386-apple-${OSX_VERSION} \
--disable-shared \
--enable-cross-compile \
--with-protoc="${PROTOC_PATH}" \
--prefix=${LIBDIR}/iossim_386 \
--exec-prefix=${LIBDIR}/iossim_386 \
"CFLAGS=${CFLAGS} \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
-arch i386 \
-fembed-bitcode \
-isysroot ${IPHONESIMULATOR_SYSROOT}" \
"CXX=${CXX}" \
"CXXFLAGS=${CXXFLAGS} \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
-arch i386 \
-fembed-bitcode \
-isysroot \
${IPHONESIMULATOR_SYSROOT}" \
LDFLAGS="-arch i386 \
-fembed-bitcode \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
${LDFLAGS} \
-L${IPHONESIMULATOR_SYSROOT}/usr/lib/ \
-L${IPHONESIMULATOR_SYSROOT}/usr/lib/system" \
"LIBS=${LIBS}"
make -j"${JOB_COUNT}"
make install
package_pb_library() {
pb_libs="${LIBDIR}/${1}/lib/libprotobuf.a"
if [ -f "${LIBDIR}/libprotobuf.a" ]; then
pb_libs="$pb_libs ${LIBDIR}/libprotobuf.a"
fi
lipo \
$pb_libs \
-create \
-output ${LIBDIR}/libprotobuf.a
make distclean
./configure \
--host=x86_64-apple-${OSX_VERSION} \
--disable-shared \
--enable-cross-compile \
--with-protoc="${PROTOC_PATH}" \
--prefix=${LIBDIR}/iossim_x86_64 \
--exec-prefix=${LIBDIR}/iossim_x86_64 \
"CFLAGS=${CFLAGS} \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
-arch x86_64 \
-fembed-bitcode \
-isysroot ${IPHONESIMULATOR_SYSROOT}" \
"CXX=${CXX}" \
"CXXFLAGS=${CXXFLAGS} \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
-arch x86_64 \
-fembed-bitcode \
-isysroot \
${IPHONESIMULATOR_SYSROOT}" \
LDFLAGS="-arch x86_64 \
-fembed-bitcode \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
${LDFLAGS} \
-L${IPHONESIMULATOR_SYSROOT}/usr/lib/ \
-L${IPHONESIMULATOR_SYSROOT}/usr/lib/system" \
"LIBS=${LIBS}"
make -j"${JOB_COUNT}"
make install
pblite_libs="${LIBDIR}/${1}/lib/libprotobuf-lite.a"
if [ -f "${LIBDIR}/libprotobuf-lite.a" ]; then
pblite_libs="$pblite_libs ${LIBDIR}/libprotobuf-lite.a"
fi
lipo \
$pblite_libs \
-create \
-output ${LIBDIR}/libprotobuf-lite.a
}
make distclean
./configure \
--host=armv7-apple-${OSX_VERSION} \
--with-protoc="${PROTOC_PATH}" \
--disable-shared \
--prefix=${LIBDIR}/ios_arm7 \
--exec-prefix=${LIBDIR}/ios_arm7 \
"CFLAGS=${CFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch armv7 \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
"CXX=${CXX}" \
"CXXFLAGS=${CXXFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch armv7 \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
LDFLAGS="-arch armv7 \
-fembed-bitcode \
-miphoneos-version-min=${MIN_SDK_VERSION} \
${LDFLAGS}" \
"LIBS=${LIBS}"
make -j"${JOB_COUNT}"
make install
build_target() {
case "$1" in
i386) make distclean
./configure \
--host=i386-apple-${OSX_VERSION} \
--disable-shared \
--enable-cross-compile \
--with-protoc="${PROTOC_PATH}" \
--prefix=${LIBDIR}/iossim_386 \
--exec-prefix=${LIBDIR}/iossim_386 \
"CFLAGS=${CFLAGS} \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
-arch i386 \
-fembed-bitcode \
-isysroot ${IPHONESIMULATOR_SYSROOT}" \
"CXX=${CXX}" \
"CXXFLAGS=${CXXFLAGS} \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
-arch i386 \
-fembed-bitcode \
-isysroot \
${IPHONESIMULATOR_SYSROOT}" \
LDFLAGS="-arch i386 \
-fembed-bitcode \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
${LDFLAGS} \
-L${IPHONESIMULATOR_SYSROOT}/usr/lib/ \
-L${IPHONESIMULATOR_SYSROOT}/usr/lib/system" \
"LIBS=${LIBS}"
make -j"${JOB_COUNT}"
make install
make distclean
./configure \
--host=armv7s-apple-${OSX_VERSION} \
--with-protoc="${PROTOC_PATH}" \
--disable-shared \
--prefix=${LIBDIR}/ios_arm7s \
--exec-prefix=${LIBDIR}/ios_arm7s \
"CFLAGS=${CFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch armv7s \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
"CXX=${CXX}" \
"CXXFLAGS=${CXXFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch armv7s \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
LDFLAGS="-arch armv7s \
-fembed-bitcode \
-miphoneos-version-min=${MIN_SDK_VERSION} \
${LDFLAGS}" \
"LIBS=${LIBS}"
make -j"${JOB_COUNT}"
make install
package_pb_library "iossim_386"
;;
make distclean
./configure \
--host=arm \
--with-protoc="${PROTOC_PATH}" \
--disable-shared \
--prefix=${LIBDIR}/ios_arm64 \
--exec-prefix=${LIBDIR}/ios_arm64 \
"CFLAGS=${CFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch arm64 \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
"CXXFLAGS=${CXXFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch arm64 \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
LDFLAGS="-arch arm64 \
-fembed-bitcode \
-miphoneos-version-min=${MIN_SDK_VERSION} \
${LDFLAGS}" \
"LIBS=${LIBS}"
make -j"${JOB_COUNT}"
make install
x86_64) make distclean
./configure \
--host=x86_64-apple-${OSX_VERSION} \
--disable-shared \
--enable-cross-compile \
--with-protoc="${PROTOC_PATH}" \
--prefix=${LIBDIR}/iossim_x86_64 \
--exec-prefix=${LIBDIR}/iossim_x86_64 \
"CFLAGS=${CFLAGS} \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
-arch x86_64 \
-fembed-bitcode \
-isysroot ${IPHONESIMULATOR_SYSROOT}" \
"CXX=${CXX}" \
"CXXFLAGS=${CXXFLAGS} \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
-arch x86_64 \
-fembed-bitcode \
-isysroot \
${IPHONESIMULATOR_SYSROOT}" \
LDFLAGS="-arch x86_64 \
-fembed-bitcode \
-mios-simulator-version-min=${MIN_SDK_VERSION} \
${LDFLAGS} \
-L${IPHONESIMULATOR_SYSROOT}/usr/lib/ \
-L${IPHONESIMULATOR_SYSROOT}/usr/lib/system" \
"LIBS=${LIBS}"
make -j"${JOB_COUNT}"
make install
lipo \
${LIBDIR}/iossim_386/lib/libprotobuf.a \
${LIBDIR}/iossim_x86_64/lib/libprotobuf.a \
${LIBDIR}/ios_arm7/lib/libprotobuf.a \
${LIBDIR}/ios_arm7s/lib/libprotobuf.a \
${LIBDIR}/ios_arm64/lib/libprotobuf.a \
-create \
-output ${LIBDIR}/libprotobuf.a
package_pb_library "iossim_x86_64"
;;
lipo \
${LIBDIR}/iossim_386/lib/libprotobuf-lite.a \
${LIBDIR}/iossim_x86_64/lib/libprotobuf-lite.a \
${LIBDIR}/ios_arm7/lib/libprotobuf-lite.a \
${LIBDIR}/ios_arm7s/lib/libprotobuf-lite.a \
${LIBDIR}/ios_arm64/lib/libprotobuf-lite.a \
-create \
-output ${LIBDIR}/libprotobuf-lite.a
armv7) make distclean
./configure \
--host=armv7-apple-${OSX_VERSION} \
--with-protoc="${PROTOC_PATH}" \
--disable-shared \
--prefix=${LIBDIR}/ios_arm7 \
--exec-prefix=${LIBDIR}/ios_arm7 \
"CFLAGS=${CFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch armv7 \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
"CXX=${CXX}" \
"CXXFLAGS=${CXXFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch armv7 \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
LDFLAGS="-arch armv7 \
-fembed-bitcode \
-miphoneos-version-min=${MIN_SDK_VERSION} \
${LDFLAGS}" \
"LIBS=${LIBS}"
make -j"${JOB_COUNT}"
make install
package_pb_library "ios_arm7"
;;
armv7s) make distclean
./configure \
--host=armv7s-apple-${OSX_VERSION} \
--with-protoc="${PROTOC_PATH}" \
--disable-shared \
--prefix=${LIBDIR}/ios_arm7s \
--exec-prefix=${LIBDIR}/ios_arm7s \
"CFLAGS=${CFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch armv7s \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
"CXX=${CXX}" \
"CXXFLAGS=${CXXFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch armv7s \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
LDFLAGS="-arch armv7s \
-fembed-bitcode \
-miphoneos-version-min=${MIN_SDK_VERSION} \
${LDFLAGS}" \
"LIBS=${LIBS}"
make -j"${JOB_COUNT}"
make install
package_pb_library "ios_arm7s"
;;
arm64) make distclean
./configure \
--host=arm \
--with-protoc="${PROTOC_PATH}" \
--disable-shared \
--prefix=${LIBDIR}/ios_arm64 \
--exec-prefix=${LIBDIR}/ios_arm64 \
"CFLAGS=${CFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch arm64 \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
"CXXFLAGS=${CXXFLAGS} \
-miphoneos-version-min=${MIN_SDK_VERSION} \
-arch arm64 \
-fembed-bitcode \
-isysroot ${IPHONEOS_SYSROOT}" \
LDFLAGS="-arch arm64 \
-fembed-bitcode \
-miphoneos-version-min=${MIN_SDK_VERSION} \
${LDFLAGS}" \
"LIBS=${LIBS}"
make -j"${JOB_COUNT}"
make install
package_pb_library "ios_arm64"
;;
*)
echo "Unknown ARCH"
exit 1
;;
esac
}
for build_element in "${build_targets[@]}"
do
echo "$build_element"
build_target "$build_element"
done
file ${LIBDIR}/libprotobuf.a
file ${LIBDIR}/libprotobuf-lite.a
echo "Done building and packaging the libraries"

View File

@ -43,55 +43,124 @@ then
exit 1
fi
usage() {
echo "Usage: $(basename "$0") [-a]"
echo "-a [build_arch] build for specified arch comma separate for multiple archs (eg: x86_64,arm64)"
echo "default is [i386, x86_64, armv7, armv7s, arm64]"
exit 1
}
BUILD_TARGET="i386 x86_64 armv7 armv7s arm64"
while getopts "a:f:h:n:" opt_name; do
case "$opt_name" in
a) BUILD_TARGET="${OPTARG}";;
f) BUILD_OPT="${OPTARG}";;
h) NSYNC_HOST="${OPTARG}";;
n) NSYNC_TARGET="${OPTARG}";;
*) usage;;
esac
done
shift $((OPTIND - 1))
IFS=' ' read -r -a build_targets <<< "${BUILD_TARGET}"
SCRIPT_DIR=$(cd `dirname $0` && pwd)
source "${SCRIPT_DIR}/build_helper.subr"
GENDIR=tensorflow/contrib/makefile/gen/
LIBDIR=${GENDIR}lib
LIB_PREFIX=libtensorflow-core
make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
TARGET=IOS IOS_ARCH=ARMV7 LIB_NAME=${LIB_PREFIX}-armv7.a OPTFLAGS="$1"
if [ $? -ne 0 ]
then
echo "armv7 compilation failed."
exit 1
fi
#remove any old artifacts
rm -rf ${LIBDIR}/${LIB_PREFIX}.a
make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
TARGET=IOS IOS_ARCH=ARMV7S LIB_NAME=${LIB_PREFIX}-armv7s.a OPTFLAGS="$1"
if [ $? -ne 0 ]
then
echo "arm7vs compilation failed."
exit 1
fi
package_tf_library() {
CAP_DIR=`echo $1 | tr 'a-z' 'A-Z'`
tf_libs="${LIBDIR}/ios_${CAP_DIR}/${LIB_PREFIX}-${1}.a"
if [ -f "${LIBDIR}/${LIB_PREFIX}.a" ]; then
tf_libs="$tf_libs ${LIBDIR}/${LIB_PREFIX}.a"
fi
lipo \
$tf_libs \
-create \
-output ${LIBDIR}/${LIB_PREFIX}.a
}
make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
TARGET=IOS IOS_ARCH=ARM64 LIB_NAME=${LIB_PREFIX}-arm64.a OPTFLAGS="$1"
if [ $? -ne 0 ]
then
echo "arm64 compilation failed."
exit 1
fi
build_tf_target() {
case "$1" in
armv7)
make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
TARGET=IOS IOS_ARCH=ARMV7 LIB_NAME=${LIB_PREFIX}-armv7.a \
OPTFLAGS="${BUILD_OPT}" HOST_NSYNC_LIB="${NSYNC_HOST}" \
TARGET_NSYNC_LIB="${NSYNC_TARGET}"
if [ $? -ne 0 ]
then
echo "armv7 compilation failed."
exit 1
fi
package_tf_library "armv7"
;;
armv7s)
make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
TARGET=IOS IOS_ARCH=ARMV7S LIB_NAME=${LIB_PREFIX}-armv7s.a \
OPTFLAGS="${BUILD_OPT}" HOST_NSYNC_LIB="${NSYNC_HOST}" \
TARGET_NSYNC_LIB="${NSYNC_TARGET}"
make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
TARGET=IOS IOS_ARCH=I386 LIB_NAME=${LIB_PREFIX}-i386.a OPTFLAGS="$1"
if [ $? -ne 0 ]
then
echo "i386 compilation failed."
exit 1
fi
if [ $? -ne 0 ]
then
echo "arm7vs compilation failed."
exit 1
fi
package_tf_library "armv7s"
;;
arm64)
make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
TARGET=IOS IOS_ARCH=ARM64 LIB_NAME=${LIB_PREFIX}-arm64.a \
OPTFLAGS="${BUILD_OPT}" HOST_NSYNC_LIB="${NSYNC_HOST}" \
TARGET_NSYNC_LIB="${NSYNC_TARGET}"
if [ $? -ne 0 ]
then
echo "arm64 compilation failed."
exit 1
fi
package_tf_library "arm64"
;;
i386)
make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
TARGET=IOS IOS_ARCH=I386 LIB_NAME=${LIB_PREFIX}-i386.a \
OPTFLAGS="${BUILD_OPT}" HOST_NSYNC_LIB="${NSYNC_HOST}" \
TARGET_NSYNC_LIB="${NSYNC_TARGET}"
if [ $? -ne 0 ]
then
echo "i386 compilation failed."
exit 1
fi
package_tf_library "i386"
;;
x86_64)
make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
TARGET=IOS IOS_ARCH=X86_64 LIB_NAME=${LIB_PREFIX}-x86_64.a \
OPTFLAGS="${BUILD_OPT}" HOST_NSYNC_LIB="${NSYNC_HOST}" \
TARGET_NSYNC_LIB="${NSYNC_TARGET}"
if [ $? -ne 0 ]
then
echo "x86_64 compilation failed."
exit 1
fi
package_tf_library "x86_64"
;;
*)
echo "Unknown ARCH"
exit 1
esac
}
make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
TARGET=IOS IOS_ARCH=X86_64 LIB_NAME=${LIB_PREFIX}-x86_64.a OPTFLAGS="$1"
if [ $? -ne 0 ]
then
echo "x86_64 compilation failed."
exit 1
fi
for build_tf_element in "${build_targets[@]}"
do
echo "$build_tf_element"
build_tf_target "$build_tf_element"
done
lipo \
${LIBDIR}/ios_ARMV7/${LIB_PREFIX}-armv7.a \
${LIBDIR}/ios_ARMV7S/${LIB_PREFIX}-armv7s.a \
${LIBDIR}/ios_ARM64/${LIB_PREFIX}-arm64.a \
${LIBDIR}/ios_I386/${LIB_PREFIX}-i386.a \
${LIBDIR}/ios_X86_64/${LIB_PREFIX}-x86_64.a \
-create \
-output ${LIBDIR}/${LIB_PREFIX}.a
echo "Done building and packaging TF"
file ${LIBDIR}/${LIB_PREFIX}.a

View File

@ -265,7 +265,7 @@ for arch in $archs; do
-I$(NDK_ROOT)/sources/cxx-stl/gnu-libstdc++/4.9/libs/'"$arch"'/include \
-I../../platform/c++11 -I../../platform/gcc \
-I../../platform/posix -pthread
PLATFORM_CFLAGS=-std=c++11 -Wno-narrowing '"$march_option"' -fPIE
PLATFORM_CFLAGS=-std=c++11 -Wno-narrowing '"$march_option"' -fPIE -fPIC
PLATFORM_LDFLAGS=-pthread
MKDEP=${CC} -M -std=c++11
PLATFORM_C=../../platform/c++11/src/nsync_semaphore_mutex.cc \
@ -301,6 +301,9 @@ done
case "$target_platform" in
ios) nsync_platform_dir="$nsync_builds_dir/lipo.$target_platform.c++11"
if [ -d "$nsync_platform_dir" ]; then
rm -rf "$nsync_platform_dir"
fi
mkdir "$nsync_platform_dir"
eval lipo $platform_libs -create -output '$nsync_platform_dir/nsync.a'
echo "$nsync_platform_dir/nsync.a"

View File

@ -15,6 +15,7 @@
"""Module for variants of ops in tf.nn.
@@alpha_dropout
@@conv1d_transpose
@@deprecated_flipped_softmax_cross_entropy_with_logits
@@deprecated_flipped_sparse_softmax_cross_entropy_with_logits
@@deprecated_flipped_sigmoid_cross_entropy_with_logits
@ -32,6 +33,7 @@ from tensorflow.contrib.nn.python.ops.alpha_dropout import *
from tensorflow.contrib.nn.python.ops.cross_entropy import *
from tensorflow.contrib.nn.python.ops.sampling_ops import *
from tensorflow.contrib.nn.python.ops.scaled_softplus import *
from tensorflow.python.ops.nn_ops import conv1d_transpose
from tensorflow.python.ops.nn_ops import nth_element
# pylint: enable=unused-import,wildcard-import

View File

@ -19,6 +19,7 @@ py_library(
"python/training/external_optimizer.py",
"python/training/lazy_adam_optimizer.py",
"python/training/moving_average_optimizer.py",
"python/training/multitask_optimizer_wrapper.py",
"python/training/nadam_optimizer.py",
"python/training/powersign.py",
"python/training/sign_decay.py",
@ -98,6 +99,23 @@ py_test(
],
)
py_test(
name = "multitask_optimizer_wrapper_test",
srcs = ["python/training/multitask_optimizer_wrapper_test.py"],
srcs_version = "PY2AND3",
deps = [
":opt_py",
"//tensorflow/python:client",
"//tensorflow/python:client_testlib",
"//tensorflow/python:constant_op",
"//tensorflow/python:dtypes",
"//tensorflow/python:training",
"//tensorflow/python:variables",
"//third_party/py/numpy",
"@six_archive//:six",
],
)
py_test(
name = "lazy_adam_optimizer_test",
srcs = ["python/training/lazy_adam_optimizer_test.py"],

View File

@ -24,7 +24,7 @@ from tensorflow.contrib.opt.python.training.drop_stale_gradient_optimizer import
from tensorflow.contrib.opt.python.training.external_optimizer import *
from tensorflow.contrib.opt.python.training.lazy_adam_optimizer import *
from tensorflow.contrib.opt.python.training.moving_average_optimizer import *
from tensorflow.contrib.opt.python.training.nadam_optimizer import *
from tensorflow.contrib.opt.python.training.multitask_optimizer_wrapper import *
from tensorflow.contrib.opt.python.training.nadam_optimizer import *
from tensorflow.contrib.opt.python.training.powersign import *
from tensorflow.contrib.opt.python.training.variable_clipping_optimizer import *
@ -34,11 +34,18 @@ from tensorflow.python.util.all_util import remove_undocumented
_allowed_symbols = [
'PowerSignOptimizer', 'AddSignOptimizer'
'PowerSignOptimizer',
'AddSignOptimizer',
'DelayCompensatedGradientDescentOptimizer',
'DropStaleGradientOptimizer', 'ExternalOptimizerInterface',
'LazyAdamOptimizer', 'NadamOptimizer', 'MovingAverageOptimizer',
'ScipyOptimizerInterface', 'VariableClippingOptimizer'
'DropStaleGradientOptimizer',
'ExternalOptimizerInterface',
'LazyAdamOptimizer',
'NadamOptimizer',
'MovingAverageOptimizer',
'ScipyOptimizerInterface',
'VariableClippingOptimizer',
'MultitaskOptimizerWrapper',
'clip_gradients_by_global_norm',
]
remove_undocumented(__name__, _allowed_symbols)

View File

@ -0,0 +1,140 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""An optimizer wrapper for stateful optimizers with multitask loss."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import types
import six
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import clip_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.training import optimizer
__all__ = ['MultitaskOptimizerWrapper', 'clip_gradients_by_global_norm']
def _is_all_zeros(grad):
all_zeros = math_ops.equal(math_ops.count_nonzero(grad), 0)
return all_zeros
def _get_wrapper(fn, opt):
def wrapper(self, grad, *args, **kwargs): # pylint: disable=unused-argument
all_zeros = _is_all_zeros(grad)
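# If the gradient is entirely zero, skip the underlying update so the
# optimizer's slot variables (e.g. momentum accumulators) stay untouched.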
return control_flow_ops.cond(all_zeros, control_flow_ops.no_op,
lambda: fn(grad, *args, **kwargs))
wrapper = types.MethodType(wrapper, opt)
return wrapper
class MultitaskOptimizerWrapper(object):
"""Optimizer wrapper making all-zero gradients harmless.
This might be useful when a multi-task loss is used,
and some components of the loss might not be
present (e.g. masked out) in some training batches.
Technically their gradient would be zero,
which would normally affect the optimizer state
(e.g. push the running average to zero).
However, this is not the desired behavior,
since the missing loss component
should be treated as unknown rather than zero.
This wrapper filters out all-zero gradient tensors,
therefore preserving the optimizer state.
If gradient clipping by global norm is used,
the provided function clip_gradients_by_global_norm
should be used (and specified explicitly by the user).
Otherwise the global norm would be underestimated
because of all-zero tensors that should be ignored.
The gradient calculation and application
are delegated to an underlying optimizer.
The gradient application is altered only for all-zero tensors.
Example:
```python
momentum_optimizer = tf.train.MomentumOptimizer(
learning_rate, momentum=0.9)
multitask_momentum_optimizer = tf.contrib.opt.MultitaskOptimizerWrapper(
momentum_optimizer)
gradvars = multitask_momentum_optimizer.compute_gradients(
loss)
gradvars_clipped, _ = tf.contrib.opt.clip_gradients_by_global_norm(
gradvars, 15.0)
train_op = multitask_momentum_optimizer.apply_gradients(
gradvars_clipped, global_step=batch)
```
"""
def __init__(self, opt):
"""Constructor.
Args:
opt: an instance of a class that implements tf.train.Optimizer.
"""
if not isinstance(opt, optimizer.Optimizer):
raise TypeError(
'Supplied optimizer must be an instance of tf.train.Optimizer')
self._opt = opt
overridden_methods = ('_apply_dense', '_resource_apply_dense',
'_apply_sparse', '_resource_apply_sparse')
for name in overridden_methods:
fn = getattr(self._opt, name)
wrapper = _get_wrapper(fn, self._opt)
setattr(self._opt, name, wrapper)
def __getattr__(self, name):
return getattr(self._opt, name)
def clip_gradients_by_global_norm(gradients_variables, clip_norm=20.):
"""Clips gradients of a multitask loss by their global norm.
Ignores all-zero tensors when computing the global norm.
Args:
gradients_variables: a list of pairs (gradient, variable).
clip_norm: a float Tensor, the global norm to clip on. Default is 20.0.
Returns:
list: A list of pairs of the same type as gradients_variables.
fixed_global_norm: A 0-D (scalar) Tensor representing the global norm.
"""
gradients, variables = six.moves.zip(*gradients_variables)
def _replace_nonexisting_grad(grad):
if grad is None:
return grad
all_zeros = _is_all_zeros(grad)
return control_flow_ops.cond(
all_zeros,
lambda: array_ops.zeros([], dtype=dtypes.as_dtype(grad.dtype)),
lambda: grad)
nonzero_gradients = [_replace_nonexisting_grad(g) for g in gradients]
fixed_global_norm = clip_ops.global_norm(nonzero_gradients)
gradients, _ = clip_ops.clip_by_global_norm(
gradients, clip_norm, use_norm=fixed_global_norm)
return list(six.moves.zip(gradients, variables)), fixed_global_norm

View File

@ -0,0 +1,119 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for MultitaskOptimizerWrapper."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import six
from tensorflow.contrib.opt.python.training import multitask_optimizer_wrapper
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
from tensorflow.python.training import momentum
class MultitaskOptimizerWrapperTest(test.TestCase):
"""Tests for the multitask optimizer wrapper.
"""
def testWrapper(self):
with self.test_session():
var0 = variables.Variable([1.0, 2.0], dtype=dtypes.float32)
var1 = variables.Variable([3.0, 4.0], dtype=dtypes.float32)
grads0 = constant_op.constant([0.1, 0.1], dtype=dtypes.float32)
grads1 = constant_op.constant([0.01, 0.01], dtype=dtypes.float32)
grads_allzero = constant_op.constant([0.0, 0.0], dtype=dtypes.float32)
mom_opt_impl = momentum.MomentumOptimizer(learning_rate=2.0, momentum=0.9)
mom_opt = multitask_optimizer_wrapper.MultitaskOptimizerWrapper(
mom_opt_impl)
mom_update = mom_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
mom_update_partial = mom_opt.apply_gradients(
zip([grads_allzero, grads1], [var0, var1]))
mom_update_no_action = mom_opt.apply_gradients(
zip([grads_allzero, grads_allzero], [var0, var1]))
self.evaluate(variables.global_variables_initializer())
# Fetch params to validate initial values
self.assertAllClose([1.0, 2.0], self.evaluate(var0))
self.assertAllClose([3.0, 4.0], self.evaluate(var1))
self.assertEqual(["momentum"], mom_opt.get_slot_names())
slot0 = mom_opt.get_slot(var0, "momentum")
self.assertEquals(slot0.get_shape(), var0.get_shape())
slot1 = mom_opt.get_slot(var1, "momentum")
self.assertEquals(slot1.get_shape(), var1.get_shape())
# Step 1: normal momentum update.
self.evaluate(mom_update)
# Check that the momentum accumulators have been updated.
self.assertAllCloseAccordingToType(
np.array([0.1, 0.1]), self.evaluate(slot0))
self.assertAllCloseAccordingToType(
np.array([0.01, 0.01]), self.evaluate(slot1))
# Check that the parameters have been updated.
self.assertAllCloseAccordingToType(
np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), self.evaluate(var0))
self.assertAllCloseAccordingToType(
np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
self.evaluate(var1))
# Step 2: momentum update that changes only slot1 but not slot0.
self.evaluate(mom_update_partial)
# Check that only the relevant momentum accumulator has been updated.
self.assertAllCloseAccordingToType(
np.array([0.1, 0.1]), self.evaluate(slot0))
self.assertAllCloseAccordingToType(
np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
self.evaluate(slot1))
# Step 3: momentum update that does not change anything.
self.evaluate(mom_update_no_action)
# Check that the momentum accumulators have *NOT* been updated.
self.assertAllCloseAccordingToType(
np.array([0.1, 0.1]), self.evaluate(slot0))
self.assertAllCloseAccordingToType(
np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
self.evaluate(slot1))
def testGradientClipping(self):
with self.test_session():
var0 = variables.Variable([1.0, 2.0], dtype=dtypes.float32)
var1 = variables.Variable([3.0, 4.0], dtype=dtypes.float32)
var2 = variables.Variable([3.0, 4.0], dtype=dtypes.float32)
var3 = variables.Variable([3.0, 4.0], dtype=dtypes.float32)
grads0 = constant_op.constant([10.0, 15.0], dtype=dtypes.float32)
grads1 = constant_op.constant([0.0, 5.0], dtype=dtypes.float32)
grads2 = constant_op.constant([0.0, 0.0], dtype=dtypes.float32)
grads3 = None
varlist = [var0, var1, var2, var3]
gradients = [grads0, grads1, grads2, grads3]
clipped_gradvars, global_norm = (
multitask_optimizer_wrapper.clip_gradients_by_global_norm(
six.moves.zip(gradients, varlist), clip_norm=1.0))
clipped_grads = list(six.moves.zip(*clipped_gradvars))[0]
reference_global_norm = np.sqrt(np.sum(np.square([10.0, 15.0, 0.0, 5.0])))
self.assertAllCloseAccordingToType(
self.evaluate(global_norm), reference_global_norm)
self.assertAllCloseAccordingToType(
self.evaluate(clipped_grads[2]), np.array([0., 0.]))
self.assertEqual(clipped_grads[3], None)
if __name__ == "__main__":
test.main()

View File

@ -24,6 +24,7 @@ import numpy as np
from tensorflow.contrib import rnn as contrib_rnn
from tensorflow.contrib.rnn.python.ops import core_rnn_cell
from tensorflow.contrib.rnn.python.ops import rnn_cell as contrib_rnn_cell
from tensorflow.core.protobuf import config_pb2
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
@ -358,6 +359,46 @@ class RNNCellTest(test.TestCase):
self.assertEquals(variables[2].op.name,
"root/lstm_cell/projection/kernel")
def testLSTMCellLayerNorm(self):
with self.test_session() as sess:
num_units = 2
num_proj = 3
batch_size = 1
input_size = 4
with variable_scope.variable_scope(
"root", initializer=init_ops.constant_initializer(0.5)):
x = array_ops.zeros([batch_size, input_size])
c = array_ops.zeros([batch_size, num_units])
h = array_ops.zeros([batch_size, num_proj])
state = rnn_cell_impl.LSTMStateTuple(c, h)
cell = contrib_rnn_cell.LayerNormLSTMCell(
num_units=num_units,
num_proj=num_proj,
forget_bias=1.0,
layer_norm=True,
norm_gain=1.0,
norm_shift=0.0)
g, out_m = cell(x, state)
sess.run([variables_lib.global_variables_initializer()])
res = sess.run(
[g, out_m], {
x.name: np.ones((batch_size, input_size)),
c.name: 0.1 * np.ones((batch_size, num_units)),
h.name: 0.1 * np.ones((batch_size, num_proj))
})
self.assertEqual(len(res), 2)
      # The numbers in results were not calculated by hand; this is mostly
      # just a smoke test.
self.assertEqual(res[0].shape, (batch_size, num_proj))
self.assertEqual(res[1][0].shape, (batch_size, num_units))
self.assertEqual(res[1][1].shape, (batch_size, num_proj))
# Different inputs so different outputs and states
for i in range(1, batch_size):
self.assertTrue(
float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) < 1e-6)
self.assertTrue(
float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) < 1e-6)
def testOutputProjectionWrapper(self):
with self.test_session() as sess:
with variable_scope.variable_scope(

View File

@ -37,6 +37,7 @@ from tensorflow.python.ops import math_ops
from tensorflow.python.ops import random_ops
from tensorflow.python.ops import rnn
from tensorflow.python.ops import rnn_cell
from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
@ -995,26 +996,19 @@ class RNNCellTest(test.TestCase):
output, state = cell(x, hidden)
sess.run([variables.global_variables_initializer()])
res = sess.run([output, state], {
hidden[0].name:
np.array([[[[[1.],[1.]],
[[1.],[1.]]],
[[[1.],[1.]],
[[1.],[1.]]]],
[[[[2.],[2.]],
[[2.],[2.]]],
[[[2.],[2.]],
[[2.],[2.]]]]]),
x.name:
np.array([[[[[1.],[1.]],
[[1.],[1.]]],
[[[1.],[1.]],
[[1.],[1.]]]],
[[[[2.],[2.]],
[[2.],[2.]]],
[[[2.],[2.]],
[[2.],[2.]]]]])
})
res = sess.run(
[output, state], {
hidden[0].name:
np.array([[[[[1.], [1.]], [[1.], [1.]]], [[[1.], [1.]], [[
1.
], [1.]]]], [[[[2.], [2.]], [[2.], [2.]]],
[[[2.], [2.]], [[2.], [2.]]]]]),
x.name:
np.array([[[[[1.], [1.]], [[1.], [1.]]], [[[1.], [1.]], [[
1.
], [1.]]]], [[[[2.], [2.]], [[2.], [2.]]], [[[2.], [2.]],
[[2.], [2.]]]]])
})
# This is a smoke test, making sure expected values are unchanged.
self.assertEqual(len(res), 2)
self.assertAllClose(res[0], res[1].h)
@ -1275,6 +1269,47 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
self.assertAllClose(res[2].c, expected_c1, 1e-5)
self.assertAllClose(res[2].h, expected_h1, 1e-5)
def testBasicLSTMCellWithStateTupleLayerNorm(self):
"""The results of LSTMCell and LayerNormBasicLSTMCell should be the same."""
with self.test_session() as sess:
with variable_scope.variable_scope(
"root", initializer=init_ops.constant_initializer(0.5)):
x = array_ops.zeros([1, 2])
c0 = array_ops.zeros([1, 2])
h0 = array_ops.zeros([1, 2])
state0 = rnn_cell_impl.LSTMStateTuple(c0, h0)
c1 = array_ops.zeros([1, 2])
h1 = array_ops.zeros([1, 2])
state1 = rnn_cell_impl.LSTMStateTuple(c1, h1)
cell = rnn_cell_impl.MultiRNNCell([
contrib_rnn_cell.LayerNormLSTMCell(
2, layer_norm=True, norm_gain=1.0, norm_shift=0.0)
for _ in range(2)
])
h, (s0, s1) = cell(x, (state0, state1))
sess.run([variables.global_variables_initializer()])
res = sess.run(
[h, s0, s1], {
x.name: np.array([[1., 1.]]),
c0.name: 0.1 * np.asarray([[0, 1]]),
h0.name: 0.1 * np.asarray([[2, 3]]),
c1.name: 0.1 * np.asarray([[4, 5]]),
h1.name: 0.1 * np.asarray([[6, 7]]),
})
expected_h = np.array([[-0.38079708, 0.38079708]])
expected_h0 = np.array([[-0.38079708, 0.38079708]])
expected_c0 = np.array([[-1.0, 1.0]])
expected_h1 = np.array([[-0.38079708, 0.38079708]])
expected_c1 = np.array([[-1.0, 1.0]])
self.assertEqual(len(res), 3)
self.assertAllClose(res[0], expected_h, 1e-5)
self.assertAllClose(res[1].c, expected_c0, 1e-5)
self.assertAllClose(res[1].h, expected_h0, 1e-5)
self.assertAllClose(res[2].c, expected_c1, 1e-5)
self.assertAllClose(res[2].h, expected_h1, 1e-5)
def testBasicLSTMCellWithDropout(self):
def _is_close(x, y, digits=4):

View File

@ -76,6 +76,18 @@ def _get_sharded_variable(name, shape, dtype, num_shards):
return shards
def _norm(g, b, inp, scope):
shape = inp.get_shape()[-1:]
gamma_init = init_ops.constant_initializer(g)
beta_init = init_ops.constant_initializer(b)
with vs.variable_scope(scope):
# Initialize beta and gamma for use by layer_norm.
vs.get_variable("gamma", shape=shape, initializer=gamma_init)
vs.get_variable("beta", shape=shape, initializer=beta_init)
normalized = layers.layer_norm(inp, reuse=True, scope=scope)
return normalized
class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
"""Long short-term memory unit (LSTM) recurrent network cell.
@ -102,13 +114,33 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
The class uses optional peep-hole connections, and an optional projection
layer.
Layer normalization implementation is based on:
https://arxiv.org/abs/1607.06450.
"Layer Normalization"
Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton
and is applied before the internal nonlinearities.
"""
def __init__(self, num_units, use_peepholes=False,
initializer=None, num_proj=None, proj_clip=None,
num_unit_shards=1, num_proj_shards=1,
forget_bias=1.0, state_is_tuple=True,
activation=math_ops.tanh, reuse=None):
def __init__(self,
num_units,
use_peepholes=False,
initializer=None,
num_proj=None,
proj_clip=None,
num_unit_shards=1,
num_proj_shards=1,
forget_bias=1.0,
state_is_tuple=True,
activation=math_ops.tanh,
reuse=None,
layer_norm=False,
norm_gain=1.0,
norm_shift=0.0):
"""Initialize the parameters for an LSTM cell.
Args:
@ -135,6 +167,11 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
reuse: (optional) Python boolean describing whether to reuse variables
in an existing scope. If not `True`, and the existing scope already has
the given variables, an error is raised.
layer_norm: If `True`, layer normalization will be applied.
norm_gain: float, The layer normalization gain initial value. If
`layer_norm` has been set to `False`, this argument will be ignored.
norm_shift: float, The layer normalization shift initial value. If
`layer_norm` has been set to `False`, this argument will be ignored.
"""
super(CoupledInputForgetGateLSTMCell, self).__init__(_reuse=reuse)
if not state_is_tuple:
@ -152,6 +189,9 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
self._state_is_tuple = state_is_tuple
self._activation = activation
self._reuse = reuse
self._layer_norm = layer_norm
self._norm_gain = norm_gain
self._norm_shift = norm_shift
if num_proj:
self._state_size = (rnn_cell_impl.LSTMStateTuple(num_units, num_proj)
@ -220,9 +260,20 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
# j = new_input, f = forget_gate, o = output_gate
cell_inputs = array_ops.concat([inputs, m_prev], 1)
lstm_matrix = nn_ops.bias_add(math_ops.matmul(cell_inputs, concat_w), b)
lstm_matrix = math_ops.matmul(cell_inputs, concat_w)
    # If layer normalization is applied, do not add bias
if not self._layer_norm:
lstm_matrix = nn_ops.bias_add(lstm_matrix, b)
j, f, o = array_ops.split(value=lstm_matrix, num_or_size_splits=3, axis=1)
# Apply layer normalization
if self._layer_norm:
j = _norm(self._norm_gain, self._norm_shift, j, "transform")
f = _norm(self._norm_gain, self._norm_shift, f, "forget")
o = _norm(self._norm_gain, self._norm_shift, o, "output")
# Diagonal connections
if self._use_peepholes:
w_f_diag = vs.get_variable(
@ -236,6 +287,10 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
f_act = sigmoid(f + self._forget_bias)
c = (f_act * c_prev + (1 - f_act) * self._activation(j))
# Apply layer normalization
if self._layer_norm:
c = _norm(self._norm_gain, self._norm_shift, c, "state")
if self._use_peepholes:
m = sigmoid(o + w_o_diag * c) * self._activation(c)
else:
@ -1301,8 +1356,8 @@ class LayerNormBasicLSTMCell(rnn_cell_impl.RNNCell):
self._keep_prob = dropout_keep_prob
self._seed = dropout_prob_seed
self._layer_norm = layer_norm
self._g = norm_gain
self._b = norm_shift
self._norm_gain = norm_gain
self._norm_shift = norm_shift
self._reuse = reuse
@property
@ -1313,24 +1368,25 @@ class LayerNormBasicLSTMCell(rnn_cell_impl.RNNCell):
def output_size(self):
return self._num_units
def _norm(self, inp, scope):
def _norm(self, inp, scope, dtype=dtypes.float32):
shape = inp.get_shape()[-1:]
gamma_init = init_ops.constant_initializer(self._g)
beta_init = init_ops.constant_initializer(self._b)
gamma_init = init_ops.constant_initializer(self._norm_gain)
beta_init = init_ops.constant_initializer(self._norm_shift)
with vs.variable_scope(scope):
# Initialize beta and gamma for use by layer_norm.
vs.get_variable("gamma", shape=shape, initializer=gamma_init)
vs.get_variable("beta", shape=shape, initializer=beta_init)
vs.get_variable("gamma", shape=shape, initializer=gamma_init, dtype=dtype)
vs.get_variable("beta", shape=shape, initializer=beta_init, dtype=dtype)
normalized = layers.layer_norm(inp, reuse=True, scope=scope)
return normalized
def _linear(self, args):
out_size = 4 * self._num_units
proj_size = args.get_shape()[-1]
weights = vs.get_variable("kernel", [proj_size, out_size])
dtype = args.dtype
weights = vs.get_variable("kernel", [proj_size, out_size], dtype=dtype)
out = math_ops.matmul(args, weights)
if not self._layer_norm:
bias = vs.get_variable("bias", [out_size])
bias = vs.get_variable("bias", [out_size], dtype=dtype)
out = nn_ops.bias_add(out, bias)
return out
@ -1339,13 +1395,14 @@ class LayerNormBasicLSTMCell(rnn_cell_impl.RNNCell):
c, h = state
args = array_ops.concat([inputs, h], 1)
concat = self._linear(args)
dtype = args.dtype
i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1)
if self._layer_norm:
i = self._norm(i, "input")
j = self._norm(j, "transform")
f = self._norm(f, "forget")
o = self._norm(o, "output")
i = self._norm(i, "input", dtype=dtype)
j = self._norm(j, "transform", dtype=dtype)
f = self._norm(f, "forget", dtype=dtype)
o = self._norm(o, "output", dtype=dtype)
g = self._activation(j)
if (not isinstance(self._keep_prob, float)) or self._keep_prob < 1:
@ -1354,7 +1411,7 @@ class LayerNormBasicLSTMCell(rnn_cell_impl.RNNCell):
new_c = (c * math_ops.sigmoid(f + self._forget_bias)
+ math_ops.sigmoid(i) * g)
if self._layer_norm:
new_c = self._norm(new_c, "state")
new_c = self._norm(new_c, "state", dtype=dtype)
new_h = self._activation(new_c) * math_ops.sigmoid(o)
new_state = rnn_cell_impl.LSTMStateTuple(new_c, new_h)
@ -1998,8 +2055,8 @@ class ConvLSTMCell(rnn_cell_impl.RNNCell):
if self._skip_connection:
self._total_output_channels += self._input_shape[-1]
state_size = tensor_shape.TensorShape(self._input_shape[:-1]
+ [self._output_channels])
state_size = tensor_shape.TensorShape(
self._input_shape[:-1] + [self._output_channels])
self._state_size = rnn_cell_impl.LSTMStateTuple(state_size, state_size)
self._output_size = tensor_shape.TensorShape(self._input_shape[:-1]
+ [self._total_output_channels])
@ -2059,11 +2116,8 @@ class Conv3DLSTMCell(ConvLSTMCell):
"""Construct Conv3DLSTM. See `ConvLSTMCell` for more details."""
super(Conv3DLSTMCell, self).__init__(conv_ndims=3, **kwargs)
def _conv(args,
filter_size,
num_features,
bias,
bias_start=0.0):
def _conv(args, filter_size, num_features, bias, bias_start=0.0):
"""convolution:
Args:
args: a Tensor or a list of Tensors of dimension 3D, 4D or 5D,
@ -2306,3 +2360,273 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
new_state = rnn_cell_impl.LSTMStateTuple(c, m)
return m, new_state
class LayerNormLSTMCell(rnn_cell_impl.RNNCell):
"""Long short-term memory unit (LSTM) recurrent network cell.
The default non-peephole implementation is based on:
http://www.bioinf.jku.at/publications/older/2604.pdf
S. Hochreiter and J. Schmidhuber.
"Long Short-Term Memory". Neural Computation, 9(8):1735-1780, 1997.
The peephole implementation is based on:
https://research.google.com/pubs/archive/43905.pdf
Hasim Sak, Andrew Senior, and Francoise Beaufays.
"Long short-term memory recurrent neural network architectures for
large scale acoustic modeling." INTERSPEECH, 2014.
The class uses optional peep-hole connections, optional cell clipping, and
an optional projection layer.
Layer normalization implementation is based on:
https://arxiv.org/abs/1607.06450.
"Layer Normalization"
Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton
and is applied before the internal nonlinearities.
"""
def __init__(self,
num_units,
use_peepholes=False,
cell_clip=None,
initializer=None,
num_proj=None,
proj_clip=None,
forget_bias=1.0,
activation=None,
layer_norm=False,
norm_gain=1.0,
norm_shift=0.0,
reuse=None):
"""Initialize the parameters for an LSTM cell.
Args:
num_units: int, The number of units in the LSTM cell
use_peepholes: bool, set True to enable diagonal/peephole connections.
cell_clip: (optional) A float value, if provided the cell state is clipped
by this value prior to the cell output activation.
initializer: (optional) The initializer to use for the weight and
projection matrices.
num_proj: (optional) int, The output dimensionality for the projection
matrices. If None, no projection is performed.
proj_clip: (optional) A float value. If `num_proj > 0` and `proj_clip` is
provided, then the projected values are clipped elementwise to within
`[-proj_clip, proj_clip]`.
forget_bias: Biases of the forget gate are initialized by default to 1
in order to reduce the scale of forgetting at the beginning of
the training. Must set it manually to `0.0` when restoring from
CudnnLSTM trained checkpoints.
activation: Activation function of the inner states. Default: `tanh`.
layer_norm: If `True`, layer normalization will be applied.
norm_gain: float, The layer normalization gain initial value. If
`layer_norm` has been set to `False`, this argument will be ignored.
norm_shift: float, The layer normalization shift initial value. If
`layer_norm` has been set to `False`, this argument will be ignored.
reuse: (optional) Python boolean describing whether to reuse variables
in an existing scope. If not `True`, and the existing scope already has
the given variables, an error is raised.
When restoring from CudnnLSTM-trained checkpoints, must use
CudnnCompatibleLSTMCell instead.
"""
super(LayerNormLSTMCell, self).__init__(_reuse=reuse)
self._num_units = num_units
self._use_peepholes = use_peepholes
self._cell_clip = cell_clip
self._initializer = initializer
self._num_proj = num_proj
self._proj_clip = proj_clip
self._forget_bias = forget_bias
self._activation = activation or math_ops.tanh
self._layer_norm = layer_norm
self._norm_gain = norm_gain
self._norm_shift = norm_shift
if num_proj:
self._state_size = (rnn_cell_impl.LSTMStateTuple(num_units, num_proj))
self._output_size = num_proj
else:
self._state_size = (rnn_cell_impl.LSTMStateTuple(num_units, num_units))
self._output_size = num_units
@property
def state_size(self):
return self._state_size
@property
def output_size(self):
return self._output_size
def _linear(self,
args,
output_size,
bias,
bias_initializer=None,
kernel_initializer=None,
layer_norm=False):
"""Linear map: sum_i(args[i] * W[i]), where W[i] is a Variable.
Args:
args: a 2D Tensor or a list of 2D, batch x n, Tensors.
output_size: int, second dimension of W[i].
bias: boolean, whether to add a bias term or not.
bias_initializer: starting value to initialize the bias
(default is all zeros).
kernel_initializer: starting value to initialize the weight.
layer_norm: boolean, whether to apply layer normalization.
Returns:
A 2D Tensor with shape [batch x output_size] taking value
sum_i(args[i] * W[i]), where each W[i] is a newly created Variable.
Raises:
      ValueError: if one of the arguments has an unspecified or wrong shape.
"""
if args is None or (nest.is_sequence(args) and not args):
raise ValueError("`args` must be specified")
if not nest.is_sequence(args):
args = [args]
# Calculate the total size of arguments on dimension 1.
total_arg_size = 0
shapes = [a.get_shape() for a in args]
for shape in shapes:
if shape.ndims != 2:
raise ValueError("linear is expecting 2D arguments: %s" % shapes)
if shape[1].value is None:
raise ValueError("linear expects shape[1] to be provided for shape %s, "
"but saw %s" % (shape, shape[1]))
else:
total_arg_size += shape[1].value
dtype = [a.dtype for a in args][0]
# Now the computation.
scope = vs.get_variable_scope()
with vs.variable_scope(scope) as outer_scope:
weights = vs.get_variable(
"kernel", [total_arg_size, output_size],
dtype=dtype,
initializer=kernel_initializer)
if len(args) == 1:
res = math_ops.matmul(args[0], weights)
else:
res = math_ops.matmul(array_ops.concat(args, 1), weights)
if not bias:
return res
with vs.variable_scope(outer_scope) as inner_scope:
inner_scope.set_partitioner(None)
if bias_initializer is None:
bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype)
biases = vs.get_variable(
"bias", [output_size], dtype=dtype, initializer=bias_initializer)
if not layer_norm:
res = nn_ops.bias_add(res, biases)
return res
def call(self, inputs, state):
"""Run one step of LSTM.
Args:
inputs: input Tensor, 2D, batch x num_units.
state: this must be a tuple of state Tensors,
both `2-D`, with column sizes `c_state` and
`m_state`.
Returns:
A tuple containing:
- A `2-D, [batch x output_dim]`, Tensor representing the output of the
LSTM after reading `inputs` when previous state was `state`.
Here output_dim is:
num_proj if num_proj was set,
num_units otherwise.
- Tensor(s) representing the new state of LSTM after reading `inputs` when
the previous state was `state`. Same type and shape(s) as `state`.
Raises:
ValueError: If input size cannot be inferred from inputs via
static shape inference.
"""
sigmoid = math_ops.sigmoid
(c_prev, m_prev) = state
dtype = inputs.dtype
input_size = inputs.get_shape().with_rank(2)[1]
if input_size.value is None:
raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
scope = vs.get_variable_scope()
with vs.variable_scope(scope, initializer=self._initializer) as unit_scope:
# i = input_gate, j = new_input, f = forget_gate, o = output_gate
lstm_matrix = self._linear(
[inputs, m_prev],
4 * self._num_units,
bias=True,
bias_initializer=None,
layer_norm=self._layer_norm)
i, j, f, o = array_ops.split(
value=lstm_matrix, num_or_size_splits=4, axis=1)
if self._layer_norm:
i = _norm(self._norm_gain, self._norm_shift, i, "input")
j = _norm(self._norm_gain, self._norm_shift, j, "transform")
f = _norm(self._norm_gain, self._norm_shift, f, "forget")
o = _norm(self._norm_gain, self._norm_shift, o, "output")
# Diagonal connections
if self._use_peepholes:
with vs.variable_scope(unit_scope):
w_f_diag = vs.get_variable(
"w_f_diag", shape=[self._num_units], dtype=dtype)
w_i_diag = vs.get_variable(
"w_i_diag", shape=[self._num_units], dtype=dtype)
w_o_diag = vs.get_variable(
"w_o_diag", shape=[self._num_units], dtype=dtype)
if self._use_peepholes:
c = (
sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
sigmoid(i + w_i_diag * c_prev) * self._activation(j))
else:
c = (
sigmoid(f + self._forget_bias) * c_prev +
sigmoid(i) * self._activation(j))
if self._layer_norm:
c = _norm(self._norm_gain, self._norm_shift, c, "state")
if self._cell_clip is not None:
# pylint: disable=invalid-unary-operand-type
c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
# pylint: enable=invalid-unary-operand-type
if self._use_peepholes:
m = sigmoid(o + w_o_diag * c) * self._activation(c)
else:
m = sigmoid(o) * self._activation(c)
if self._num_proj is not None:
with vs.variable_scope("projection"):
m = self._linear(m, self._num_proj, bias=False)
if self._proj_clip is not None:
# pylint: disable=invalid-unary-operand-type
m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
# pylint: enable=invalid-unary-operand-type
new_state = (rnn_cell_impl.LSTMStateTuple(c, m))
return m, new_state
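
# A minimal usage sketch for the LayerNormLSTMCell defined above (illustrative
# only; it assumes a surrounding graph and variable initialization, as in the
# unit tests earlier in this change):
#
#   cell = LayerNormLSTMCell(num_units=2, num_proj=3, layer_norm=True,
#                            norm_gain=1.0, norm_shift=0.0)
#   inputs = array_ops.zeros([1, 4])
#   state = cell.zero_state(batch_size=1, dtype=dtypes.float32)
#   outputs, new_state = cell(inputs, state)  # outputs shape: [1, num_proj]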

View File

@ -149,7 +149,7 @@ class _BaseAttentionMechanism(AttentionMechanism):
memory_sequence_length=None,
memory_layer=None,
check_inner_dims_defined=True,
score_mask_value=float("-inf"),
score_mask_value=None,
name=None):
"""Construct base AttentionMechanism class.
@ -187,9 +187,13 @@ class _BaseAttentionMechanism(AttentionMechanism):
"memory_layer is not a Layer: %s" % type(memory_layer).__name__)
self._query_layer = query_layer
self._memory_layer = memory_layer
self.dtype = memory_layer.dtype
if not callable(probability_fn):
raise TypeError("probability_fn must be callable, saw type: %s" %
type(probability_fn).__name__)
if score_mask_value is None:
score_mask_value = dtypes.as_dtype(
self._memory_layer.dtype).as_numpy_dtype(-np.inf)
self._probability_fn = lambda score, prev: ( # pylint:disable=g-long-lambda
probability_fn(
_maybe_mask_score(score, memory_sequence_length, score_mask_value),
@ -334,7 +338,8 @@ class LuongAttention(_BaseAttentionMechanism):
memory_sequence_length=None,
scale=False,
probability_fn=None,
score_mask_value=float("-inf"),
score_mask_value=None,
dtype=None,
name="LuongAttention"):
"""Construct the AttentionMechanism mechanism.
@ -353,17 +358,20 @@ class LuongAttention(_BaseAttentionMechanism):
score_mask_value: (optional) The mask value for score before passing into
`probability_fn`. The default is -inf. Only used if
`memory_sequence_length` is not None.
dtype: The data type for the memory layer of the attention mechanism.
name: Name to use when creating ops.
"""
# For LuongAttention, we only transform the memory layer; thus
# num_units **must** match expected the query depth.
if probability_fn is None:
probability_fn = nn_ops.softmax
if dtype is None:
dtype = dtypes.float32
wrapped_probability_fn = lambda score, _: probability_fn(score)
super(LuongAttention, self).__init__(
query_layer=None,
memory_layer=layers_core.Dense(
num_units, name="memory_layer", use_bias=False),
num_units, name="memory_layer", use_bias=False, dtype=dtype),
memory=memory,
probability_fn=wrapped_probability_fn,
memory_sequence_length=memory_sequence_length,
@ -475,7 +483,8 @@ class BahdanauAttention(_BaseAttentionMechanism):
memory_sequence_length=None,
normalize=False,
probability_fn=None,
score_mask_value=float("-inf"),
score_mask_value=None,
dtype=None,
name="BahdanauAttention"):
"""Construct the Attention mechanism.
@ -494,16 +503,20 @@ class BahdanauAttention(_BaseAttentionMechanism):
score_mask_value: (optional): The mask value for score before passing into
`probability_fn`. The default is -inf. Only used if
`memory_sequence_length` is not None.
dtype: The data type for the query and memory layers of the attention
mechanism.
name: Name to use when creating ops.
"""
if probability_fn is None:
probability_fn = nn_ops.softmax
if dtype is None:
dtype = dtypes.float32
wrapped_probability_fn = lambda score, _: probability_fn(score)
super(BahdanauAttention, self).__init__(
query_layer=layers_core.Dense(
num_units, name="query_layer", use_bias=False),
num_units, name="query_layer", use_bias=False, dtype=dtype),
memory_layer=layers_core.Dense(
num_units, name="memory_layer", use_bias=False),
num_units, name="memory_layer", use_bias=False, dtype=dtype),
memory=memory,
probability_fn=wrapped_probability_fn,
memory_sequence_length=memory_sequence_length,
@ -738,11 +751,12 @@ class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
memory,
memory_sequence_length=None,
normalize=False,
score_mask_value=float("-inf"),
score_mask_value=None,
sigmoid_noise=0.,
sigmoid_noise_seed=None,
score_bias_init=0.,
mode="parallel",
dtype=None,
name="BahdanauMonotonicAttention"):
"""Construct the Attention mechanism.
@ -766,17 +780,21 @@ class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
mode: How to compute the attention distribution. Must be one of
'recursive', 'parallel', or 'hard'. See the docstring for
`tf.contrib.seq2seq.monotonic_attention` for more information.
dtype: The data type for the query and memory layers of the attention
mechanism.
name: Name to use when creating ops.
"""
# Set up the monotonic probability fn with supplied parameters
if dtype is None:
dtype = dtypes.float32
wrapped_probability_fn = functools.partial(
_monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode,
seed=sigmoid_noise_seed)
super(BahdanauMonotonicAttention, self).__init__(
query_layer=layers_core.Dense(
num_units, name="query_layer", use_bias=False),
num_units, name="query_layer", use_bias=False, dtype=dtype),
memory_layer=layers_core.Dense(
num_units, name="memory_layer", use_bias=False),
num_units, name="memory_layer", use_bias=False, dtype=dtype),
memory=memory,
probability_fn=wrapped_probability_fn,
memory_sequence_length=memory_sequence_length,
@ -834,11 +852,12 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
memory,
memory_sequence_length=None,
scale=False,
score_mask_value=float("-inf"),
score_mask_value=None,
sigmoid_noise=0.,
sigmoid_noise_seed=None,
score_bias_init=0.,
mode="parallel",
dtype=None,
name="LuongMonotonicAttention"):
"""Construct the Attention mechanism.
@ -862,17 +881,21 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
mode: How to compute the attention distribution. Must be one of
'recursive', 'parallel', or 'hard'. See the docstring for
`tf.contrib.seq2seq.monotonic_attention` for more information.
dtype: The data type for the query and memory layers of the attention
mechanism.
name: Name to use when creating ops.
"""
# Set up the monotonic probability fn with supplied parameters
if dtype is None:
dtype = dtypes.float32
wrapped_probability_fn = functools.partial(
_monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode,
seed=sigmoid_noise_seed)
super(LuongMonotonicAttention, self).__init__(
query_layer=layers_core.Dense(
num_units, name="query_layer", use_bias=False),
num_units, name="query_layer", use_bias=False, dtype=dtype),
memory_layer=layers_core.Dense(
num_units, name="memory_layer", use_bias=False),
num_units, name="memory_layer", use_bias=False, dtype=dtype),
memory=memory,
probability_fn=wrapped_probability_fn,
memory_sequence_length=memory_sequence_length,
@ -1123,8 +1146,11 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
% (len(attention_layer_sizes), len(attention_mechanisms)))
self._attention_layers = tuple(
layers_core.Dense(
attention_layer_size, name="attention_layer", use_bias=False)
for attention_layer_size in attention_layer_sizes)
attention_layer_size,
name="attention_layer",
use_bias=False,
dtype=attention_mechanisms[i].dtype)
for i, attention_layer_size in enumerate(attention_layer_sizes))
self._attention_layer_size = sum(attention_layer_sizes)
else:
self._attention_layers = None

View File

@ -237,7 +237,7 @@ One way to reduce this code duplication would be via a `for` loop:
```python
net = ...
for i in range(3):
net = slim.conv2d(net, 256, [3, 3], scope='conv3_' % (i+1))
net = slim.conv2d(net, 256, [3, 3], scope='conv3_%d' % (i+1))
net = slim.max_pool2d(net, [2, 2], scope='pool2')
```

View File

@ -386,7 +386,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
inputs, None, is_training=False, global_pool=False)
sess.run(variables.global_variables_initializer())
self.assertAllClose(
output.eval(), expected.eval(), atol=1e-4, rtol=1e-4)
output.eval(), expected.eval(), atol=2e-4, rtol=1e-4)
def testUnknownBatchSize(self):
batch = 2

View File

@ -1,4 +1,4 @@
## How to compile and use RDMA-enabled TensorFlow
## How to compile, use and configure RDMA-enabled TensorFlow
1. Follow the regular TF compilation instructions. During the configure step, if you want ibverbs-based RDMA support, answer yes to this question:
```Do you wish to build TensorFlow with VERBS-RDMA support [y/N]```
@ -7,6 +7,18 @@
```server = tf.train.Server(cluster, job_name="local", task_index=0, protocol='grpc+verbs') # default protocol is 'grpc'```
3. RDMA configuration is done by setting the following environment variables (see the example after this list):
* **RDMA_DEVICE**: The RDMA device name to be used. If not defined by user, a default device with an active port will be set if exists.
* **RDMA_DEVICE_PORT**: The port within the selected device. Not relevant if RDMA_DEVICE is not defined. If not defined by user, a default active port will be set if exists.
   * **RDMA_GID_INDEX**: The GID index of the port. If not defined by user, a suitable default GID index will be set (RoCE v2 is favored as the default).
* **RDMA_QP_PKEY_INDEX**: The Pkey for the QP. If not defined by user, the default value is 0.
* **RDMA_QP_QUEUE_DEPTH**: TX/RX queue size for the QP. If not defined by user, the default value is 1024.
* **RDMA_QP_TIMEOUT**: The retransmission timeout for QPs. If not defined by user, the default value is 14.
* **RDMA_QP_RETRY_COUNT**: Number of retransmission for QPs. If not defined by user, the default value is 7.
* **RDMA_QP_SL**: Service level configuration for QOS and ECN, valid values are 0-7. If not defined by user, the default value is 0.
* **RDMA_QP_MTU**: MTU configuration for the QPs. If not defined by user, the default value is active MTU from query_port.
* **RDMA_TRAFFIC_CLASS**: Traffic class configuration for QP, in case of DSCP trust level QoS configuration. If not defined by user, the default value is 0. For more info see [HowTo Configure Trust state on Mellanox Adapters](https://community.mellanox.com/docs/DOC-2866).
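
For example, here is a minimal sketch of launching a verbs-enabled server with a few of these variables set (the device name, port, and GID index below are placeholders for your cluster):

```python
import os
import tensorflow as tf

# Placeholder values; substitute the RDMA device, port and GID index
# available on your machine.
os.environ["RDMA_DEVICE"] = "mlx5_0"
os.environ["RDMA_DEVICE_PORT"] = "1"
os.environ["RDMA_GID_INDEX"] = "3"

# The variables are read when the verbs server is created.
cluster = tf.train.ClusterSpec({"local": ["localhost:2222", "localhost:2223"]})
server = tf.train.Server(cluster, job_name="local", task_index=0,
                         protocol="grpc+verbs")
```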
## Overview
The design is based on TensorFlow r1.0. An RDMA path is added between servers for tensor transfer (weights, gradients, etc). The existing GRPC path remains and is responsible for "administrative" tasks, such as setting up the RDMA path, exchanging computation graphs, etc.

View File

@ -16,6 +16,7 @@ limitations under the License.
#ifdef TENSORFLOW_USE_VERBS
#include "tensorflow/contrib/verbs/rdma.h"
#include <fcntl.h>
#include <cstdlib>
#include "tensorflow/contrib/verbs/verbs_util.h"
#include "tensorflow/core/common_runtime/device_mgr.h"
@ -33,6 +34,8 @@ limitations under the License.
namespace tensorflow {
#define RoCE_V2 "RoCE v2"
namespace {
// hash name to 32-bit integer
uint32_t NameHash(const string& name) {
@ -66,18 +69,338 @@ string MessageTypeToString(RdmaMessageType rmt) {
}
} // namespace
ibv_context* open_default_device() {
ibv_device** dev_list;
ibv_device* ib_dev;
dev_list = ibv_get_device_list(NULL);
CHECK(dev_list) << "No InfiniBand device found";
ib_dev = dev_list[0];
CHECK(ib_dev) << "No InfiniBand device found";
ibv_context* context = ibv_open_device(ib_dev);
CHECK(context) << "Open context failed for " << ibv_get_device_name(ib_dev);
// Function to get environment variable
// Args:
// var_name - the name of the environment variable
// Returns:
// string with its value, or an empty string if not set
string get_env_var(char const* var_name) {
char const* var_temp = getenv(var_name);
return (var_temp == NULL) ? string() : string(var_temp);
}
// Function to open device
// Args:
// ibv_dev device to open
// Returns:
// context of the opened device
ibv_context* open_device(ibv_device* ibv_dev) {
ibv_context* context = ibv_open_device(ibv_dev);
CHECK(context) << "Open context failed for " << ibv_get_device_name(ibv_dev);
return context;
}
// Function to count the number of active ports for device
// Args:
// device - the device whose active ports to count
// Returns:
// number of active ports of the given device
int get_dev_active_port_count(ibv_device* device) {
ibv_device_attr device_att;
ibv_port_attr port_attr;
ibv_context* context = NULL;
int rc, port_index, active_ports = 0;
context = ibv_open_device(device);
CHECK(context) << "Open context failed for " << ibv_get_device_name(device);
rc = ibv_query_device(context, &device_att);
CHECK(!rc) << "Failed to query the device";
for (port_index = 1; port_index <= device_att.phys_port_cnt; port_index++) {
rc = ibv_query_port(context, port_index, &port_attr);
CHECK(!rc) << "Failed to query the port" << port_index;
if (port_attr.state == IBV_PORT_ACTIVE) {
active_ports++;
}
}
ibv_close_device(context);
return active_ports;
}
// Function to set the device. If RDMA_DEVICE is not set, searches for a
// device with an active port.
// Fails if more than one device with an active port is found.
// Returns:
// device to use
ibv_device* set_device() {
ibv_device** dev_list;
int dev_num, device_index, device_to_open = 0;
int num_devs_with_active_port = 0;
string env_p_rdma_device, str_port_num;
dev_list = ibv_get_device_list(&dev_num);
CHECK(dev_list) << "No InfiniBand device found";
env_p_rdma_device = get_env_var("RDMA_DEVICE");
if (!env_p_rdma_device.empty()) {
for (device_index = 0; device_index < dev_num; device_index++) {
if (!env_p_rdma_device.compare(
ibv_get_device_name(dev_list[device_index]))) {
CHECK(get_dev_active_port_count(dev_list[device_index]) != 0)
<< "Device " << ibv_get_device_name(dev_list[device_index])
<< " has no active ports";
return dev_list[device_index];
}
}
// check validity of input device
CHECK(false) << "The device " << env_p_rdma_device << " wasn't found";
} else {
// set default device
str_port_num = get_env_var("RDMA_DEVICE_PORT");
CHECK(str_port_num.empty())
<< "RDMA_DEVICE should be provided if RDMA_DEVICE_PORT is set by user";
for (device_index = 0; device_index < dev_num; device_index++) {
// get port_num
if (get_dev_active_port_count(dev_list[device_index]) > 0) {
num_devs_with_active_port++;
CHECK(num_devs_with_active_port <= 1) << ". More than one device with "
"active port in the system. "
"Please enter RDMA_DEVICE";
// found device with at least 1 active port
device_to_open = device_index;
}
}
CHECK(num_devs_with_active_port > 0)
<< "There is no active port in the system";
return dev_list[device_to_open];
}
CHECK(false) << "No device was set!";
return NULL; // never happens
}
// Function to set port for device.
// If RDMA_DEVICE_PORT not set, first active port of the device will be set.
// Args:
// context of the device
// Returns:
// port to use
uint8_t set_port(ibv_context* context) {
uint8_t port_num = 0; // 0 is illegal port number
string str_port_num;
ibv_device_attr device_att;
ibv_port_attr port_attr;
int rc, port_index;
rc = ibv_query_device(context, &device_att);
CHECK(!rc) << "Failed to query the device\n";
str_port_num = get_env_var("RDMA_DEVICE_PORT");
// user defined port
if (!str_port_num.empty()) {
port_num = stoi(str_port_num);
CHECK(port_num > 0) << "RDMA_DEVICE_PORT should be positive";
    CHECK(port_num <= device_att.phys_port_cnt)
        << "RDMA_DEVICE_PORT should be less than or equal to the number of "
           "available ports";
rc = ibv_query_port(context, port_num, &port_attr);
CHECK(!rc) << "Failed to query the port" << port_num;
    // check if the port is active
CHECK(port_attr.state == IBV_PORT_ACTIVE)
<< "Selected RDMA_DEVICE_PORT is not active";
} else { // set default port
for (port_index = 1; port_index <= device_att.phys_port_cnt; port_index++) {
rc = ibv_query_port(context, port_index, &port_attr);
CHECK(!rc) << "Failed to query the port" << port_index;
if (port_attr.state == IBV_PORT_ACTIVE) {
port_num = port_index;
break;
}
}
CHECK_GT(port_num, 0) << "No active ports";
}
return port_num;
}
// Function to read from a sysfs file
// Args:
// dir - directory
// file - file
// buff - buffer for the result
// size - buffer size
// Returns:
// number of bytes read, or -1 on failure
int read_sysfs_file(const char* dir, const char* file, char* buf, size_t size) {
char* path;
int fd;
int len;
if (asprintf(&path, "%s/%s", dir, file) < 0) return -1;
fd = open(path, O_RDONLY);
if (fd < 0) {
free(path);
return -1;
}
len = read(fd, buf, size);
close(fd);
free(path);
if (len > 0 && buf[len - 1] == '\n') buf[--len] = '\0';
return len;
}
// Function to check if a GID index supports RoCE V2
// Args:
// context - device context
// port_num - port number
// index - GID index
// Returns:
// true if the GID supports RoCE V2, false otherwise.
bool is_gid_type_roce_v2(ibv_context* context, uint8_t port_num,
uint8_t index) {
char name[32];
char buff[41];
snprintf(name, sizeof(name), "ports/%d/gid_attrs/types/%d", port_num, index);
if (read_sysfs_file(context->device->ibdev_path, name, buff, sizeof(buff)) <=
0) {
return false;
}
return !strcmp(buff, RoCE_V2);
}
// Function to set GID index.
// If the port link is IB, no GID index should be selected.
// If Ethernet and RDMA_GID_INDEX is not set, a GID index that supports
// RoCE V2 will be chosen (fails if more than one IP is configured).
// Args:
// context - device context
// port_num - port number
// Returns:
// GID index to use
uint8_t set_gid(uint8_t port_num, ibv_context* context) {
ibv_port_attr port_attr;
string gid_str;
int rc, i, gids_num = 0, v2_ip_num = 0;
union ibv_gid gid;
uint8_t gid_index = 0;
rc = ibv_query_port(context, port_num, &port_attr);
  CHECK(!rc) << "Failed to query the port " << port_num;
for (i = 0; i < port_attr.gid_tbl_len; i++) {
rc = ibv_query_gid(context, port_num, i, &gid);
CHECK(!rc) << "Failed to query gid to port " << (int)port_num << " index "
<< i;
if (gid.global.interface_id) {
gids_num++;
if (gid.global.subnet_prefix == 0 &&
is_gid_type_roce_v2(context, port_num, i)) {
if (v2_ip_num == 0) {
// can be overwritten by RDMA_GID_INDEX later
gid_index = i;
}
v2_ip_num++;
}
}
}
switch (port_attr.link_layer) {
case (IBV_LINK_LAYER_ETHERNET):
gid_str = get_env_var("RDMA_GID_INDEX");
if (!gid_str.empty()) {
gid_index = stoi(gid_str);
CHECK(gid_index < gids_num)
<< "RDMA_GID_INDEX should be less than GIDs amount" << gids_num;
} else {
CHECK(v2_ip_num <= 1)
<< "More than one IP is available, please specify GID_INDEX";
}
break;
case (IBV_LINK_LAYER_INFINIBAND): // no need in GID index
break;
default:
LOG(INFO) << "Unknown port link layer. Currently supporting Ethernet and "
"InfiniBand only. ";
}
if (!is_gid_type_roce_v2(context, port_num, gid_index)) {
LOG(INFO) << "RoCE v2 is not configured for GID_INDEX " << (int)gid_index;
}
return gid_index;
}
// Set the configuration parameter from the environment variable if it is
// defined, otherwise use the default value.
// Args:
// default_val - the default value for this parameter
// env_param - the name of the environment variable
// Returns:
// 32-bit value
uint32_t set_param(uint32_t default_val, const char* env_param) {
uint32_t val = default_val;
string val_s;
val_s = get_env_var(env_param);
if (!val_s.empty()) {
val = stoi(val_s);
}
return val;
}
enum ibv_mtu set_mtu(uint8_t port_num, ibv_context* context) {
ibv_port_attr port_attr;
enum ibv_mtu mtu;
string mtu_s;
int rc, mtu_i;
rc = ibv_query_port(context, port_num, &port_attr);
  CHECK(!rc) << "Failed to query the port " << port_num;
mtu_s = get_env_var("RDMA_MTU");
if (!mtu_s.empty()) {
mtu_i = stoi(mtu_s);
switch (mtu_i) {
case 256:
mtu = IBV_MTU_256;
break;
case 512:
mtu = IBV_MTU_512;
break;
case 1024:
mtu = IBV_MTU_1024;
break;
case 2048:
mtu = IBV_MTU_2048;
break;
case 4096:
mtu = IBV_MTU_4096;
break;
default:
CHECK(0) << "Error: MTU input value must be one of the following: 256, "
"512, 1024, 2048, 4096. MTU "
                 << mtu_i << " is invalid\n";
break;
}
CHECK(mtu < port_attr.active_mtu)
<< "MTU configuration for the QPs is larger than active MTU";
} else {
mtu = port_attr.active_mtu;
}
return mtu;
}
RdmaParams params_init(ibv_context* context) {
RdmaParams params;
params.port_num = set_port(context);
params.sgid_index = set_gid(params.port_num, context);
params.pkey_index = (uint8_t)set_param(PKEY_DEFAULT, "RDMA_PKEY");
params.queue_depth = set_param(QUEUE_DEPTH_DEFAULT, "RDMA_QUEUE_DEPTH");
params.timeout = (uint8_t)set_param(TIMEOUT_DEFAULT, "RDMA_TIMEOUT");
params.retry_cnt = (uint8_t)set_param(RETRY_CNT_DEFAULT, "RDMA_RETRY_CNT");
params.sl = (uint8_t)set_param(SL_DEFAULT, "RDMA_SL");
CHECK(params.sl <= 7) << "SL value is " << (int)params.sl
<< ". Valid values are 0-7.";
params.mtu = set_mtu(params.port_num, context);
params.traffic_class = set_param(TRAFFIC_CLASS, "RDMA_TRAFFIC_CLASS");
return params;
}
ibv_pd* alloc_protection_domain(ibv_context* context) {
ibv_pd* pd = ibv_alloc_pd(context);
CHECK(pd) << "Failed to allocate protection domain";
@ -85,7 +408,8 @@ ibv_pd* alloc_protection_domain(ibv_context* context) {
}
RdmaAdapter::RdmaAdapter(const WorkerEnv* worker_env)
: context_(open_default_device()),
: context_(open_device(set_device())),
params_(params_init(context_)),
pd_(alloc_protection_domain(context_)),
worker_env_(worker_env) {
event_channel_ = ibv_create_comp_channel(context_);
@ -242,8 +566,8 @@ RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
memset(&attr, 0, sizeof(ibv_qp_init_attr));
attr.send_cq = adapter_->cq_;
attr.recv_cq = adapter_->cq_;
attr.cap.max_send_wr = RdmaAdapter::MAX_CONCURRENT_WRITES;
attr.cap.max_recv_wr = RdmaAdapter::MAX_CONCURRENT_WRITES;
attr.cap.max_send_wr = adapter_->params_.queue_depth;
attr.cap.max_recv_wr = adapter_->params_.queue_depth;
attr.cap.max_send_sge = 1;
attr.cap.max_recv_sge = 1;
attr.qp_type = IBV_QPT_RC;
@ -257,8 +581,8 @@ RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
struct ibv_qp_attr attr;
memset(&attr, 0, sizeof(ibv_qp_attr));
attr.qp_state = IBV_QPS_INIT;
attr.pkey_index = 0;
attr.port_num = 1;
attr.pkey_index = adapter_->params_.pkey_index;
attr.port_num = adapter_->params_.port_num;
attr.qp_access_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
int mask =
@ -269,13 +593,15 @@ RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
// Local address
{
struct ibv_port_attr attr;
CHECK(!ibv_query_port(adapter_->context_, (uint8_t)1, &attr))
CHECK(
!ibv_query_port(adapter_->context_, adapter_->params_.port_num, &attr))
<< "Query port";
self_.lid = attr.lid;
self_.qpn = qp_->qp_num;
self_.psn = static_cast<uint32_t>(random::New64()) & 0xffffff;
union ibv_gid gid;
CHECK(!ibv_query_gid(adapter_->context_, (uint8_t)1, 0, &gid))
CHECK(!ibv_query_gid(adapter_->context_, adapter_->params_.port_num,
adapter_->params_.sgid_index, &gid))
<< "Query gid";
self_.snp = gid.global.subnet_prefix;
self_.iid = gid.global.interface_id;
@ -479,11 +805,9 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) {
struct ibv_qp_attr attr;
memset(&attr, 0, sizeof(ibv_qp_attr));
attr.qp_state = IBV_QPS_RTR;
struct ibv_port_attr port_attr;
CHECK(!ibv_query_port(adapter_->context_, (uint8_t)1, &port_attr))
<< "Query port failed";
// This assumes both QP's ports are configured with the same MTU
attr.path_mtu = port_attr.active_mtu;
attr.path_mtu = adapter_->params_.mtu;
attr.dest_qp_num = remoteAddr.qpn;
attr.rq_psn = remoteAddr.psn;
attr.max_dest_rd_atomic = 1;
@ -494,9 +818,11 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) {
attr.ah_attr.grh.flow_label = 0;
attr.ah_attr.grh.hop_limit = 255;
attr.ah_attr.dlid = remoteAddr.lid;
attr.ah_attr.sl = 0;
attr.ah_attr.sl = adapter_->params_.sl;
attr.ah_attr.src_path_bits = 0;
attr.ah_attr.port_num = 1;
attr.ah_attr.port_num = adapter_->params_.port_num;
attr.ah_attr.grh.sgid_index = adapter_->params_.sgid_index;
attr.ah_attr.grh.traffic_class = adapter_->params_.traffic_class;
int r;
CHECK(!(r = ibv_modify_qp(qp_, &attr,
@ -509,8 +835,8 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) {
memset(&attr, 0, sizeof(ibv_qp_attr));
attr.qp_state = IBV_QPS_RTS;
attr.sq_psn = self_.psn;
attr.timeout = 14;
attr.retry_cnt = 7;
attr.timeout = adapter_->params_.timeout;
attr.retry_cnt = adapter_->params_.retry_cnt;
attr.rnr_retry = 7; /* infinite */
attr.max_rd_atomic = 1;

View File

@ -36,7 +36,24 @@ limitations under the License.
#include "tensorflow/core/platform/mutex.h"
namespace tensorflow {
#define PKEY_DEFAULT 0
#define QUEUE_DEPTH_DEFAULT 1024
#define TIMEOUT_DEFAULT 14
#define RETRY_CNT_DEFAULT 7
#define SL_DEFAULT 0
#define TRAFFIC_CLASS 0
struct RdmaParams {
uint8_t port_num;
uint8_t sgid_index;
uint8_t pkey_index;
uint32_t queue_depth;
uint8_t timeout;
uint8_t retry_cnt;
uint8_t sl;
enum ibv_mtu mtu;
uint8_t traffic_class;
};
// structure to save the address of remote channels.
struct RdmaAddress {
uint32_t lid;
@ -84,6 +101,8 @@ class RdmaAdapter {
protected:
static const int MAX_CONCURRENT_WRITES = 1000;
ibv_context* context_;
// RDMA configuration parameters
RdmaParams params_;
// ibverbs protection domain
ibv_pd* pd_;
// Completion event channel, to wait for work completions

View File

@ -2710,6 +2710,7 @@ tf_cc_test_mkl(
srcs = [
"graph/mkl_layout_pass_test.cc",
"graph/mkl_tfconversion_pass_test.cc",
"util/mkl_util_test.cc",
],
linkstatic = 1,
deps = [

View File

@ -0,0 +1,47 @@
op {
graph_op_name: "UniqueV2"
in_arg {
name: "x"
description: <<END
A `Tensor`.
END
}
in_arg {
name: "axis"
description: <<END
A `Tensor` of type `int64` (default: 0). The axis of the Tensor along
which to find the unique elements.
END
}
out_arg {
name: "y"
description: <<END
A `Tensor`. Unique elements along the `axis` of `Tensor` x.
END
}
out_arg {
name: "idx"
description: <<END
A 1-D Tensor that contains the index of each value of x in the
output y.
END
}
summary: "Finds unique elements in a 1-D tensor."
description: <<END
This operation returns a tensor `y` containing all of the unique elements of `x`
sorted in the same order that they occur in `x`. This operation also returns a
tensor `idx` the same size as `x` that contains the index of each value of `x`
in the unique output `y`. In other words:
`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
For example:
```
# tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
y, idx = unique(x)
y ==> [1, 2, 4, 7, 8]
idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
```
END
}

View File

@ -26,6 +26,8 @@ need not be sorted and need not cover all values in the full
range of valid values.
If the sum is empty for a given segment ID `i`, `output[i] = 0`.
If the given segment ID `i` is negative, the value is dropped and will not be
added to the sum of the segment.
`num_segments` should equal the number of distinct segment IDs.
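
For example, an illustrative sketch (assuming the Python API name
`tf.unsorted_segment_sum`):

```
# 'data' is [1, 2, 3, 4]
# 'segment_ids' is [0, -1, 0, 1]; the value at the negative ID is dropped.
tf.unsorted_segment_sum(data, segment_ids, num_segments=2)
  ==> [4, 4]  # output[0] = 1 + 3, output[1] = 4
```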

View File

@ -81,7 +81,7 @@ class MklCPUAllocator : public Allocator {
}
#if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)
if (user_val > max_mem_bytes) {
LOG(WARNING) << "The user specifed a memory limit " << kMaxLimitStr
LOG(WARNING) << "The user specified a memory limit " << kMaxLimitStr
<< "=" << user_val
<< " greater than available physical memory: "
<< max_mem_bytes

View File

@ -46,8 +46,8 @@ class GSYCLInterface {
if (!found_device) {
// Currently Intel GPU is not supported
LOG(WARNING) << "No OpenCL GPU found that is supported by ComputeCpp, "
"trying OpenCL CPU";
LOG(WARNING) << "No OpenCL GPU found that is supported by "
<< "ComputeCpp/triSYCL, trying OpenCL CPU";
}
for (const auto& device : device_list) {
@ -58,10 +58,24 @@ class GSYCLInterface {
}
}
if (!found_device) {
LOG(WARNING) << "No OpenCL CPU found that is supported by "
<< "ComputeCpp/triSYCL, checking for host sycl device";
}
for (const auto& device : device_list) {
// triSYCL only supports the host device for now
if (device.is_host()) {
LOG(WARNING) << "Found SYCL host device";
AddDevice(device);
found_device = true;
}
}
if (!found_device) {
// Currently Intel GPU is not supported
LOG(FATAL)
<< "No OpenCL GPU nor CPU found that is supported by ComputeCpp";
LOG(FATAL) << "No SYCL host and no OpenCL GPU nor CPU"
<< " supported by ComputeCPP/triSYCL was found";
} else {
LOG(INFO) << "Found following OpenCL devices:";
for (int i = 0; i < device_list.size(); i++) {

View File

@ -453,6 +453,21 @@ const Edge* Graph::AddControlEdge(Node* source, Node* dest,
return AddEdge(source, kControlSlot, dest, kControlSlot);
}
void Graph::RemoveControlEdge(const Edge* e) {
if (!e->src_->IsSource() && !e->dst_->IsSink()) {
e->dst_->MaybeCopyOnWrite();
std::string e_src_name = strings::StrCat("^", e->src_->name());
auto* inputs = e->dst_->props_->node_def.mutable_input();
for (auto it = inputs->begin(); it != inputs->end(); ++it) {
if (*it == e_src_name) {
inputs->erase(it);
break;
}
}
}
RemoveEdge(e);
}
Status Graph::UpdateEdge(Node* new_src, int new_src_index, Node* dst,
int dst_index) {
TF_RETURN_IF_ERROR(IsValidOutputTensor(new_src, new_src_index));

View File

@ -451,6 +451,11 @@ class Graph {
// REQUIRES: The edge must exist.
void RemoveEdge(const Edge* edge);
// Removes control edge `edge` from the graph. Note that this also updates
// the corresponding NodeDef to reflect the change.
// REQUIRES: The control edge must exist.
void RemoveControlEdge(const Edge* e);
// Updates the input to a node. The existing edge to `dst` is removed and an
// edge from `new_src` to `dst` is created. The NodeDef associated with `dst`
// is also updated.

View File

@ -117,7 +117,7 @@ DataType EdgeType(const Edge* e) {
}
}
// Return true iff we need to add a same device send/recv for 'edge'.
// Return true iff we need to add the same device send/recv for 'edge'.
bool NeedSameDeviceSendRecv(const Edge* edge, const GraphInfo& info) {
if (edge->IsControlEdge()) {
return false;
@ -1116,7 +1116,7 @@ Status Partition(const PartitionOptions& opts, Graph* g,
// before the data is available.
AddInput(real_recv, send->name(), Graph::kControlSlot);
} else if (control_flow_edge != nullptr) {
// Redirect control edge to the real recv since this is not a same
// Redirect control edge to the real recv since this is not the same
// device send/recv.
--num_control_flow_edges;
AddInput(real_recv, control_flow_edge->src()->name(),

View File

@ -118,6 +118,23 @@ class GraphTest : public ::testing::Test {
LOG(FATAL) << name;
}
bool ControlEdgeExistsInGraphOrNodeDef(const Node* src, const Node* dst) {
for (const Edge* e : dst->in_edges()) {
if (e->IsControlEdge() && e->src() == src &&
e->src_output() == Graph::kControlSlot &&
e->dst_input() == Graph::kControlSlot) {
return true;
}
}
std::string control_edge_name = strings::StrCat("^", src->name());
for (int i = 0; i < dst->def().input_size(); ++i) {
if (dst->def().input(i) == control_edge_name) {
return true;
}
}
return false;
}
Graph graph_;
private:
@ -458,8 +475,8 @@ TEST_F(GraphTest, AddControlEdge) {
EXPECT_TRUE(edge == nullptr);
EXPECT_EQ(b->def().input_size(), 2);
// Can add redundant control edge with create_duplicate.
edge = graph_.AddControlEdge(a, b, /*create_duplicate=*/true);
// Can add redundant control edge with allow_duplicates.
edge = graph_.AddControlEdge(a, b, /*allow_duplicates=*/true);
EXPECT_TRUE(edge != nullptr);
// create_duplicate causes the NodeDef not to be updated.
ASSERT_EQ(b->def().input_size(), 2);
@ -477,6 +494,47 @@ TEST_F(GraphTest, AddControlEdge) {
EXPECT_EQ(b->def().input_size(), 2);
}
TEST_F(GraphTest, RemoveControlEdge) {
FromGraphDef(
"node { name: 'A' op: 'OneOutput' }"
"node { name: 'B' op: 'OneInputTwoOutputs' input: [ 'A:0' ] }"
"node { name: 'C' op: 'NoOp' } ");
Node* a = FindNode("A");
Node* b = FindNode("B");
Node* c = FindNode("C");
// Add a control edge.
const Edge* edge_1 = graph_.AddControlEdge(c, a);
const Edge* edge_2 = graph_.AddControlEdge(a, b);
ASSERT_TRUE(edge_1 != nullptr);
ASSERT_TRUE(edge_2 != nullptr);
ASSERT_TRUE(ControlEdgeExistsInGraphOrNodeDef(c, a));
ASSERT_TRUE(ControlEdgeExistsInGraphOrNodeDef(a, b));
graph_.RemoveControlEdge(edge_1);
ASSERT_TRUE(!ControlEdgeExistsInGraphOrNodeDef(c, a));
ASSERT_TRUE(ControlEdgeExistsInGraphOrNodeDef(a, b));
graph_.RemoveControlEdge(edge_2);
ASSERT_TRUE(!ControlEdgeExistsInGraphOrNodeDef(c, a));
ASSERT_TRUE(!ControlEdgeExistsInGraphOrNodeDef(a, b));
// Test removing a duplicate control edge.
// Note that unless allow_duplicates is true, the duplicate edge
// will not be added. That's why we expect edge_4 to be a null
// pointer. We are not testing with allow_duplicates set to true,
// as that is a highly unlikely use case that does not make much
// sense.
const Edge* edge_3 = graph_.AddControlEdge(c, a);
const Edge* edge_4 = graph_.AddControlEdge(c, a);
ASSERT_TRUE(edge_3 != nullptr);
ASSERT_TRUE(edge_4 == nullptr);
graph_.RemoveControlEdge(edge_3);
ASSERT_TRUE(!ControlEdgeExistsInGraphOrNodeDef(c, a));
}
TEST_F(GraphTest, UpdateEdge) {
// Build a little graph
Node* a = FromNodeDef("A", "OneOutput", 0);

View File

@ -68,7 +68,7 @@ namespace tensorflow {
// take place before we hit the op. For this, we add a new op before each
// element-wise MKL op to deal with the inputs, called _MklInputConversion.
// This pass has been enhanced to add this capability.
//
//
// The _MklInputConversion op will check the inputs to the elementwise op and
// make sure that either both are in MKL format or both are in TF format,
// depending on their initial state and whether broadcast is needed or not.

View File

@ -58,6 +58,12 @@ class GraphProperties {
const std::vector<OpInfo::TensorProperties>& GetOutputProperties(
const string& node_name) const;
static void FillTensorPropertiesFromContext(
const shape_inference::ShapeHandle&, const DataType&,
shape_inference::InferenceContext*,
std::unordered_map<const shape_inference::Dimension*, int>* dim_ids,
OpInfo::TensorProperties*);
private:
// Inputs
GrapplerItem item_;

View File

@ -62,7 +62,7 @@ const std::set<NodeDef*>& NodeMap::GetOutputs(const string& node_name) const {
void NodeMap::AddNode(const string& name, NodeDef* node) {
auto ret = nodes_.insert(std::make_pair(name, node));
CHECK(ret.second) << "Pair (" << name << "," << node
<< ") is not inserted because a same key already exists.";
<< ") is not inserted because the same key already exists.";
}
void NodeMap::AddOutput(const string& node_name, const string& output_name) {

View File

@ -929,6 +929,25 @@ tf_cc_test(
],
)
tf_cuda_cc_test(
name = "bincount_op_test",
size = "small",
srcs = ["bincount_op_test.cc"],
deps = [
":bincount_op",
":ops_testutil",
":ops_util",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:math_ops_op_lib",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"//tensorflow/core:testlib",
],
)
tf_cuda_cc_test(
name = "constant_op_test",
size = "small",
@ -1617,7 +1636,10 @@ DYNAMIC_DEPS = [
tf_kernel_library(
name = "dynamic_partition_op",
prefix = "dynamic_partition_op",
deps = DYNAMIC_DEPS,
deps = DYNAMIC_DEPS + [
":fill_functor",
":gather_functor",
] + if_cuda(["@cub_archive//:cub"]),
)
tf_kernel_library(
@ -1687,7 +1709,7 @@ tf_kernel_library(
],
)
tf_cc_tests(
tf_cuda_cc_tests(
name = "dynamic_op_test",
size = "small",
srcs = [
@ -1698,6 +1720,7 @@ tf_cc_tests(
":data_flow",
":ops_testutil",
":ops_util",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
@ -2572,8 +2595,9 @@ tf_kernel_library(
tf_kernel_library(
name = "bucketize_op",
gpu_srcs = ["cuda_device_array.h"],
prefix = "bucketize_op",
deps = MATH_DEPS,
deps = ARRAY_DEPS,
)
tf_kernel_library(
@ -3174,7 +3198,7 @@ tf_kernel_library(
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//third_party/eigen3",
],
] + if_cuda(["@cub_archive//:cub"]),
)
tf_kernel_library(

View File

@ -153,7 +153,8 @@ class AvgPoolingOp<GPUDevice, T> : public UnaryOp<T> {
if (data_format_ == FORMAT_NCHW) {
DnnPoolingOp<T>::Compute(
context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
stride_, padding_, data_format_, tensor_in, output_shape);
stride_, padding_, data_format_, tensor_in, output_shape,
/*propagate_nans=*/false);
} else {
Tensor* output = nullptr;
OP_REQUIRES_OK(context,
@ -408,7 +409,7 @@ class AvgPoolingGradOp<GPUDevice, T> : public OpKernel {
DnnPoolingGradOp<T>::Compute(
context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
stride_, padding_, data_format_, nullptr, nullptr, out_backprop,
output_shape);
output_shape, /*propagate_nans=*/false);
}
private:
@ -532,7 +533,7 @@ class AvgPoolingGradOpCustomGPUKernel : public OpKernel {
DnnPoolingGradOp<T>::Compute(
context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
stride_, padding_, data_format_, nullptr, nullptr, out_backprop,
output_shape);
output_shape, /*propagate_nans=*/false);
}
}


@ -17,6 +17,7 @@ limitations under the License.
#define EIGEN_USE_THREADS
#include "tensorflow/core/kernels/bincount_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/types.h"
@ -27,46 +28,37 @@ namespace tensorflow {
using thread::ThreadPool;
template <typename T>
class BincountOp : public OpKernel {
public:
explicit BincountOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
void Compute(OpKernelContext* ctx) override {
const Tensor& arr_t = ctx->input(0);
const Tensor& size_tensor = ctx->input(1);
const Tensor& weights_t = ctx->input(2);
int32 size = size_tensor.scalar<int32>()();
OP_REQUIRES(
ctx, size >= 0,
errors::InvalidArgument("size (", size, ") must be non-negative"));
const bool has_weights = weights_t.NumElements() > 0;
OP_REQUIRES(ctx, !(has_weights && arr_t.shape() != weights_t.shape()),
errors::InvalidArgument(
"If weights are passed, they must have the same shape (" +
weights_t.shape().DebugString() + ") as arr (" +
arr_t.shape().DebugString() + ")"));
const auto arr = arr_t.flat<int32>();
const auto weights = weights_t.flat<T>();
namespace functor {
template <typename T>
struct BincountFunctor<CPUDevice, T> {
static Status Compute(OpKernelContext* context,
const typename TTypes<int32, 1>::ConstTensor& arr,
const typename TTypes<T, 1>::ConstTensor& weights,
typename TTypes<T, 1>::Tensor& output) {
int size = output.size();
Tensor all_nonneg_t;
OP_REQUIRES_OK(ctx,
ctx->allocate_temp(DT_BOOL, TensorShape({}), &all_nonneg_t,
AllocatorAttributes()));
all_nonneg_t.scalar<bool>().device(ctx->eigen_cpu_device()) =
TF_RETURN_IF_ERROR(context->allocate_temp(
DT_BOOL, TensorShape({}), &all_nonneg_t, AllocatorAttributes()));
all_nonneg_t.scalar<bool>().device(context->eigen_cpu_device()) =
(arr >= 0).all();
OP_REQUIRES(ctx, all_nonneg_t.scalar<bool>()(),
errors::InvalidArgument("Input arr must be non-negative!"));
if (!all_nonneg_t.scalar<bool>()()) {
return errors::InvalidArgument("Input arr must be non-negative!");
}
// Allocate partial output bin sums for each worker thread. Worker ids in
// ParallelForWithWorkerId range from 0 to NumThreads() inclusive.
ThreadPool* thread_pool =
ctx->device()->tensorflow_cpu_worker_threads()->workers;
context->device()->tensorflow_cpu_worker_threads()->workers;
const int64 num_threads = thread_pool->NumThreads() + 1;
Tensor partial_bins_t;
OP_REQUIRES_OK(ctx, ctx->allocate_temp(weights_t.dtype(),
TensorShape({num_threads, size}),
&partial_bins_t));
TF_RETURN_IF_ERROR(context->allocate_temp(DataTypeToEnum<T>::value,
TensorShape({num_threads, size}),
&partial_bins_t));
auto partial_bins = partial_bins_t.matrix<T>();
partial_bins.setZero();
thread_pool->ParallelForWithWorkerId(
@ -75,7 +67,7 @@ class BincountOp : public OpKernel {
for (int64 i = start_ind; i < limit_ind; i++) {
int32 value = arr(i);
if (value < size) {
if (has_weights) {
if (weights.size()) {
partial_bins(worker_id, value) += weights(i);
} else {
// Complex numbers don't support "++".
@ -84,25 +76,63 @@ class BincountOp : public OpKernel {
}
}
});
TensorShape output_shape({size});
Tensor* output_t;
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_t));
// Sum the partial bins along the 0th axis.
Eigen::array<int, 1> reduce_dims({0});
output_t->flat<T>().device(ctx->eigen_cpu_device()) =
partial_bins.sum(reduce_dims);
output.device(context->eigen_cpu_device()) = partial_bins.sum(reduce_dims);
return Status::OK();
}
};
#define REGISTER(TYPE) \
} // namespace functor
template <typename Device, typename T>
class BincountOp : public OpKernel {
public:
explicit BincountOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
void Compute(OpKernelContext* ctx) override {
const Tensor& arr_t = ctx->input(0);
const Tensor& size_tensor = ctx->input(1);
const Tensor& weights_t = ctx->input(2);
int32 size = size_tensor.scalar<int32>()();
OP_REQUIRES(
ctx, size >= 0,
errors::InvalidArgument("size (", size, ") must be non-negative"));
const auto arr = arr_t.flat<int32>();
const auto weights = weights_t.flat<T>();
Tensor* output_t;
OP_REQUIRES_OK(ctx,
ctx->allocate_output(0, TensorShape({size}), &output_t));
auto output = output_t->flat<T>();
OP_REQUIRES_OK(ctx, functor::BincountFunctor<Device, T>::Compute(
ctx, arr, weights, output));
}
};
#define REGISTER_KERNELS(type) \
REGISTER_KERNEL_BUILDER( \
Name("Bincount").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
BincountOp<TYPE>)
Name("Bincount").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
BincountOp<CPUDevice, type>)
TF_CALL_NUMBER_TYPES(REGISTER);
TF_CALL_NUMBER_TYPES(REGISTER_KERNELS);
#undef REGISTER_KERNELS
// TODO(ringwalt): Add a GPU implementation. We probably want to take a
// different approach, e.g. threads in a warp each taking a pass over the same
// data, and each thread summing a single bin.
#if GOOGLE_CUDA
#define REGISTER_KERNELS(type) \
REGISTER_KERNEL_BUILDER(Name("Bincount") \
.Device(DEVICE_GPU) \
.HostMemory("size") \
.TypeConstraint<type>("T"), \
BincountOp<GPUDevice, type>)
TF_CALL_int32(REGISTER_KERNELS);
TF_CALL_float(REGISTER_KERNELS);
#undef REGISTER_KERNELS
#endif // GOOGLE_CUDA
} // end namespace tensorflow
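The CPU functor above is a small map-reduce: every worker thread fills its own row of a (num_threads, size) partial_bins matrix, and the rows are then summed along axis 0 into the output. A minimal standalone C++ sketch of the same idea, using plain std::thread instead of the TensorFlow thread pool (names are illustrative only; assumes num_workers >= 1):

#include <algorithm>
#include <cstdint>
#include <thread>
#include <vector>

// Unweighted bincount of `arr` into `nbins` bins: one partial histogram per
// worker, then a reduction of the partial histograms into the final counts.
std::vector<int64_t> Bincount(const std::vector<int32_t>& arr, int nbins,
                              int num_workers) {
  std::vector<std::vector<int64_t>> partial(
      num_workers, std::vector<int64_t>(nbins, 0));
  std::vector<std::thread> threads;
  const size_t chunk = (arr.size() + num_workers - 1) / num_workers;
  for (int w = 0; w < num_workers; ++w) {
    threads.emplace_back([&, w] {
      const size_t begin = w * chunk;
      const size_t end = std::min(arr.size(), begin + chunk);
      for (size_t i = begin; i < end; ++i) {
        const int32_t value = arr[i];
        if (value >= 0 && value < nbins) ++partial[w][value];
      }
    });
  }
  for (auto& t : threads) t.join();
  std::vector<int64_t> out(nbins, 0);
  for (int w = 0; w < num_workers; ++w)  // sum partial bins along axis 0
    for (int b = 0; b < nbins; ++b) out[b] += partial[w][b];
  return out;
}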


@ -0,0 +1,41 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_BINCOUNT_OP_H_
#define TENSORFLOW_BINCOUNT_OP_H_
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/errors.h"
namespace tensorflow {
namespace functor {
template <typename Device, typename T>
struct BincountFunctor {
static Status Compute(OpKernelContext* context,
const typename TTypes<int32, 1>::ConstTensor& arr,
const typename TTypes<T, 1>::ConstTensor& weights,
typename TTypes<T, 1>::Tensor& output);
};
} // end namespace functor
} // end namespace tensorflow
#endif // TENSORFLOW_BINCOUNT_OP_H_


@ -0,0 +1,114 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#include "external/cub_archive/cub/device/device_histogram.cuh"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/kernels/bincount_op.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"
namespace tensorflow {
typedef Eigen::GpuDevice GPUDevice;
namespace functor {
template <typename T>
struct BincountFunctor<GPUDevice, T> {
static Status Compute(OpKernelContext* context,
const typename TTypes<int32, 1>::ConstTensor& arr,
const typename TTypes<T, 1>::ConstTensor& weights,
typename TTypes<T, 1>::Tensor& output) {
if (weights.size() != 0) {
return errors::InvalidArgument(
"Weights should not be passed as it should be "
"handled by unsorted_segment_sum");
}
if (output.size() == 0) {
return Status::OK();
}
// In case weights.size() == 0, use CUB.
size_t temp_storage_bytes = 0;
const int32* d_samples = arr.data();
T* d_histogram = output.data();
int num_levels = output.size() + 1;
int32 lower_level = 0;
int32 upper_level = output.size();
int num_samples = arr.size();
const cudaStream_t& stream = GetCudaStream(context);
// The first HistogramEven call, with d_temp_storage = NULL, only obtains
// the temp storage size required.
auto err = cub::DeviceHistogram::HistogramEven(
/* d_temp_storage */ NULL,
/* temp_storage_bytes */ temp_storage_bytes,
/* d_samples */ d_samples,
/* d_histogram */ d_histogram,
/* num_levels */ num_levels,
/* lower_level */ lower_level,
/* upper_level */ upper_level,
/* num_samples */ num_samples,
/* stream */ stream);
if (err != cudaSuccess) {
return errors::Internal(
"Could not launch HistogramEven to get temp storage: ",
cudaGetErrorString(err), ".");
}
Tensor temp_storage;
TF_RETURN_IF_ERROR(context->allocate_temp(
DataTypeToEnum<int8>::value,
TensorShape({static_cast<int64>(temp_storage_bytes)}), &temp_storage));
void* d_temp_storage = temp_storage.flat<int8>().data();
// The second HistogramEven call does the actual run, with d_temp_storage
// allocated to temp_storage_bytes.
err = cub::DeviceHistogram::HistogramEven(
/* d_temp_storage */ d_temp_storage,
/* temp_storage_bytes */ temp_storage_bytes,
/* d_samples */ d_samples,
/* d_histogram */ d_histogram,
/* num_levels */ num_levels,
/* lower_level */ lower_level,
/* upper_level */ upper_level,
/* num_samples */ num_samples,
/* stream */ stream);
if (err != cudaSuccess) {
return errors::Internal(
"Could not launch HistogramEven: ", cudaGetErrorString(err), ".");
}
return Status::OK();
}
};
} // end namespace functor
#define REGISTER_GPU_SPEC(type) \
template struct functor::BincountFunctor<GPUDevice, type>;
TF_CALL_int32(REGISTER_GPU_SPEC);
TF_CALL_float(REGISTER_GPU_SPEC);
#undef REGISTER_GPU_SPEC
} // namespace tensorflow
#endif // GOOGLE_CUDA
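The two HistogramEven calls above follow CUB's standard device-wide calling convention: the first call, with a null temp-storage pointer, only reports how many scratch bytes the algorithm needs; the caller allocates that much device memory and then repeats the call with identical arguments to do the real work. A minimal sketch of the same pattern outside TensorFlow (raw cudaMalloc instead of allocate_temp, error handling elided, names illustrative):

#include <cuda_runtime.h>
#include <cub/device/device_histogram.cuh>

// d_samples: device array of num_samples int32 values in [0, nbins).
// d_histogram: device array of nbins int32 counters (the output).
void BincountWithCub(const int* d_samples, int num_samples, int* d_histogram,
                     int nbins, cudaStream_t stream) {
  size_t temp_bytes = 0;
  // First call: d_temp_storage == nullptr, so CUB only computes temp_bytes.
  cub::DeviceHistogram::HistogramEven(nullptr, temp_bytes, d_samples,
                                      d_histogram, /*num_levels=*/nbins + 1,
                                      /*lower_level=*/0, /*upper_level=*/nbins,
                                      num_samples, stream);
  void* d_temp = nullptr;
  cudaMalloc(&d_temp, temp_bytes);
  // Second call: same arguments plus scratch space; the histogram actually runs.
  cub::DeviceHistogram::HistogramEven(d_temp, temp_bytes, d_samples,
                                      d_histogram, nbins + 1, 0, nbins,
                                      num_samples, stream);
  cudaFree(d_temp);
}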


@ -0,0 +1,75 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
#include "tensorflow/core/framework/fake_input.h"
#include "tensorflow/core/framework/node_def_builder.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/graph/node_builder.h"
#include "tensorflow/core/kernels/ops_testutil.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"
namespace tensorflow {
static Graph* Bincount(int arr_size, int nbins) {
Graph* g = new Graph(OpRegistry::Global());
Tensor arr(DT_INT32, TensorShape({arr_size}));
arr.flat<int32>() = arr.flat<int32>().setRandom().abs();
Tensor size(DT_INT32, TensorShape({static_cast<int32>(1)}));
size.flat<int32>()(0) = static_cast<int32>(nbins);
Tensor weights(DT_INT32, TensorShape({0}));
Node* node;
TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Bincount")
.Input(test::graph::Constant(g, arr))
.Input(test::graph::Constant(g, size))
.Input(test::graph::Constant(g, weights))
.Attr("T", DT_INT32)
.Finalize(g, &node));
return g;
}
#define BM_BincountDev(K, NBINS, type) \
static void BM_Bincount##_##type##_##K##_##NBINS(int iters) { \
testing::ItemsProcessed(static_cast<int64>(iters) * K * 1024); \
test::Benchmark(#type, Bincount(K * 1024, NBINS)).Run(iters); \
} \
BENCHMARK(BM_Bincount##_##type##_##K##_##NBINS);
BM_BincountDev(32, 1000, cpu);
BM_BincountDev(32, 2000, cpu);
BM_BincountDev(32, 5000, cpu);
BM_BincountDev(64, 1000, cpu);
BM_BincountDev(64, 2000, cpu);
BM_BincountDev(64, 5000, cpu);
BM_BincountDev(128, 1000, cpu);
BM_BincountDev(128, 2000, cpu);
BM_BincountDev(128, 5000, cpu);
BM_BincountDev(32, 1000, gpu);
BM_BincountDev(32, 2000, gpu);
BM_BincountDev(32, 5000, gpu);
BM_BincountDev(64, 1000, gpu);
BM_BincountDev(64, 2000, gpu);
BM_BincountDev(64, 5000, gpu);
BM_BincountDev(128, 1000, gpu);
BM_BincountDev(128, 2000, gpu);
BM_BincountDev(128, 5000, gpu);
} // end namespace tensorflow


@ -15,15 +15,43 @@ limitations under the License.
// See docs in ../ops/math_ops.cc.
#include <algorithm>
#include <vector>
#include "tensorflow/core/kernels/bucketize_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
using thread::ThreadPool;
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
namespace functor {
template <typename T>
struct BucketizeFunctor<CPUDevice, T> {
// PRECONDITION: boundaries_vector must be sorted.
static Status Compute(OpKernelContext* context,
const typename TTypes<T, 1>::ConstTensor& input,
const std::vector<float>& boundaries_vector,
typename TTypes<int32, 1>::Tensor& output) {
const int N = input.size();
for (int i = 0; i < N; i++) {
auto first_bigger_it = std::upper_bound(
boundaries_vector.begin(), boundaries_vector.end(), input(i));
output(i) = first_bigger_it - boundaries_vector.begin();
}
return Status::OK();
}
};
} // namespace functor
template <typename Device, typename T>
class BucketizeOp : public OpKernel {
public:
explicit BucketizeOp(OpKernelConstruction* context) : OpKernel(context) {
@ -34,31 +62,24 @@ class BucketizeOp : public OpKernel {
void Compute(OpKernelContext* context) override {
const Tensor& input_tensor = context->input(0);
auto input = input_tensor.flat<T>();
const auto input = input_tensor.flat<T>();
Tensor* output_tensor = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
&output_tensor));
auto output = output_tensor->template flat<int32>();
const int N = input.size();
for (int i = 0; i < N; i++) {
output(i) = CalculateBucketIndex(input(i));
}
OP_REQUIRES_OK(context, functor::BucketizeFunctor<Device, T>::Compute(
context, input, boundaries_, output));
}
private:
int32 CalculateBucketIndex(const T value) {
auto first_bigger_it =
std::upper_bound(boundaries_.begin(), boundaries_.end(), value);
return first_bigger_it - boundaries_.begin();
}
std::vector<float> boundaries_;
};
#define REGISTER_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("Bucketize").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
BucketizeOp<T>);
BucketizeOp<CPUDevice, T>);
REGISTER_KERNEL(int32);
REGISTER_KERNEL(int64);
@ -66,4 +87,17 @@ REGISTER_KERNEL(float);
REGISTER_KERNEL(double);
#undef REGISTER_KERNEL
#if GOOGLE_CUDA
#define REGISTER_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("Bucketize").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
BucketizeOp<GPUDevice, T>);
REGISTER_KERNEL(int32);
REGISTER_KERNEL(int64);
REGISTER_KERNEL(float);
REGISTER_KERNEL(double);
#undef REGISTER_KERNEL
#endif // GOOGLE_CUDA
} // namespace tensorflow
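The CPU path above delegates the per-element work to std::upper_bound over the sorted boundaries, so an element's bucket index is simply the number of boundaries less than or equal to it. A self-contained illustration (names are mine, not the kernel's):

#include <algorithm>
#include <vector>

// Bucket index for `value` given sorted `boundaries`: std::upper_bound finds
// the first boundary strictly greater than value, and its offset from the
// start of the vector is the bucket index.
int BucketIndex(float value, const std::vector<float>& boundaries) {
  auto it = std::upper_bound(boundaries.begin(), boundaries.end(), value);
  return static_cast<int>(it - boundaries.begin());
}

// Example: with boundaries {0, 10, 100}, value 5 lands in bucket 1 and
// value 150 lands in bucket 3 (past the last boundary).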


@ -0,0 +1,41 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_BUCKETIZE_OP_H_
#define TENSORFLOW_BUCKETIZE_OP_H_
#include <vector>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/errors.h"
namespace tensorflow {
namespace functor {
template <typename Device, typename T>
struct BucketizeFunctor {
static Status Compute(OpKernelContext* context,
const typename TTypes<T, 1>::ConstTensor& input,
const std::vector<float>& boundaries_vector,
typename TTypes<int32, 1>::Tensor& output);
};
} // namespace functor
} // namespace tensorflow
#endif // TENSORFLOW_BUCKETIZE_OP_H_


@ -0,0 +1,101 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/kernels/bucketize_op.h"
#include "tensorflow/core/kernels/cuda_device_array.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"
namespace tensorflow {
typedef Eigen::GpuDevice GPUDevice;
template <typename T>
__global__ void BucketizeCustomKernel(
const int32 size_in, const T* in, const int32 size_boundaries,
CudaDeviceArrayStruct<float> boundaries_array, int32* out) {
const float* boundaries = GetCudaDeviceArrayOnDevice(&boundaries_array);
CUDA_1D_KERNEL_LOOP(i, size_in) {
T value = in[i];
int32 bucket = 0;
int32 count = size_boundaries;
while (count > 0) {
int32 l = bucket;
int32 step = count / 2;
l += step;
if (!(value < static_cast<T>(boundaries[l]))) {
bucket = ++l;
count -= step + 1;
} else {
count = step;
}
}
out[i] = bucket;
}
}
namespace functor {
template <typename T>
struct BucketizeFunctor<GPUDevice, T> {
// PRECONDITION: boundaries_vector must be sorted.
static Status Compute(OpKernelContext* context,
const typename TTypes<T, 1>::ConstTensor& input,
const std::vector<float>& boundaries_vector,
typename TTypes<int32, 1>::Tensor& output) {
const GPUDevice& d = context->eigen_device<GPUDevice>();
CudaDeviceArrayOnHost<float> boundaries_array(context,
boundaries_vector.size());
TF_RETURN_IF_ERROR(boundaries_array.Init());
for (int i = 0; i < boundaries_vector.size(); ++i) {
boundaries_array.Set(i, boundaries_vector[i]);
}
TF_RETURN_IF_ERROR(boundaries_array.Finalize());
CudaLaunchConfig config = GetCudaLaunchConfig(input.size(), d);
BucketizeCustomKernel<T>
<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
input.size(), input.data(), boundaries_vector.size(),
boundaries_array.data(), output.data());
return Status::OK();
}
};
} // namespace functor
#define REGISTER_GPU_SPEC(type) \
template struct functor::BucketizeFunctor<GPUDevice, type>;
REGISTER_GPU_SPEC(int32);
REGISTER_GPU_SPEC(int64);
REGISTER_GPU_SPEC(float);
REGISTER_GPU_SPEC(double);
#undef REGISTER_GPU_SPEC
} // namespace tensorflow
#endif // GOOGLE_CUDA
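The CUDA kernel above performs the same upper-bound search as the CPU functor, but as an explicit binary-search loop over the boundaries array that was copied to the device via CudaDeviceArrayOnHost. A host-side C++ rendering of that loop (handy for checking it against std::upper_bound) might look like:

// Equivalent of std::upper_bound(boundaries, boundaries + n, value) -
// boundaries: halve the remaining range each step, moving right whenever
// value >= boundaries[l].
int UpperBoundIndex(const float* boundaries, int n, float value) {
  int bucket = 0;
  int count = n;
  while (count > 0) {
    const int step = count / 2;
    const int l = bucket + step;
    if (!(value < boundaries[l])) {
      bucket = l + 1;
      count -= step + 1;
    } else {
      count = step;
    }
  }
  return bucket;
}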


@ -236,6 +236,7 @@ class Conv3DBackpropInputOp : public OpKernel {
REGISTER_KERNEL_BUILDER( \
Name("Conv3DBackpropInputV2").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
Conv3DBackpropInputOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL
@ -383,6 +384,7 @@ class Conv3DBackpropFilterOp : public OpKernel {
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T"), \
Conv3DBackpropFilterOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL
@ -409,6 +411,7 @@ namespace functor {
const std::array<int, 3>& padding_right, \
typename TTypes<T, 5, int>::Tensor out, TensorFormat format);
DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(float);
#undef DECLARE_GPU_SPEC
} // namespace functor
@ -1098,22 +1101,27 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
bool cudnn_use_autotune_;
};
REGISTER_KERNEL_BUILDER(
Name("Conv3DBackpropInput").Device(DEVICE_GPU).TypeConstraint<float>("T"),
Conv3DBackpropInputOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropInputV2")
.Device(DEVICE_GPU)
.TypeConstraint<float>("T")
.HostMemory("input_sizes"),
Conv3DBackpropInputOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("Conv3DBackpropFilter").Device(DEVICE_GPU).TypeConstraint<float>("T"),
Conv3DBackpropFilterOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropFilterV2")
.Device(DEVICE_GPU)
.TypeConstraint<float>("T")
.HostMemory("filter_sizes"),
Conv3DBackpropFilterOp<GPUDevice, float>);
#define REGISTER_GPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("Conv3DBackpropInput").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
Conv3DBackpropInputOp<GPUDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropInputV2") \
.Device(DEVICE_GPU) \
.TypeConstraint<T>("T") \
.HostMemory("input_sizes"), \
Conv3DBackpropInputOp<GPUDevice, T>); \
REGISTER_KERNEL_BUILDER( \
Name("Conv3DBackpropFilter").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
Conv3DBackpropFilterOp<GPUDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropFilterV2") \
.Device(DEVICE_GPU) \
.TypeConstraint<T>("T") \
.HostMemory("filter_sizes"), \
Conv3DBackpropFilterOp<GPUDevice, T>);
TF_CALL_half(REGISTER_GPU_KERNEL);
TF_CALL_float(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL
#endif // GOOGLE_CUDA
} // namespace tensorflow


@ -145,6 +145,7 @@ class Conv3DOp : public BinaryOp<T> {
REGISTER_KERNEL_BUILDER( \
Name("Conv3D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
Conv3DOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL
@ -482,12 +483,16 @@ namespace functor {
const std::array<int, 3>& padding_right, \
typename TTypes<T, 5, int>::Tensor out, TensorFormat format);
DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(float);
#undef DECLARE_GPU_SPEC
} // namespace functor
// Registration of the GPU implementations.
REGISTER_KERNEL_BUILDER(
Name("Conv3D").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
Conv3DOp<GPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(
Name("Conv3D").Device(DEVICE_GPU).TypeConstraint<float>("T"),
Conv3DOp<GPUDevice, float>);


@ -20,16 +20,8 @@ namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double,
complex64, complex128);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("Acosh") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::acosh<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Acosh", functor::acosh, float, double);
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA


@ -20,17 +20,9 @@ namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double,
complex64, complex128);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("Asinh") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::asinh<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYC
#ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Asinh", functor::asinh, float, double);
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER2(UnaryOp, GPU, "Asinh", functor::asinh, float, double);


@ -20,17 +20,9 @@ namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double,
complex64, complex128);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("Atanh") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::atanh<TYPE>>);
REGISTER_SYCL_KERNEL(float);
REGISTER_SYCL_KERNEL(double);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYC
#ifdef TENSORFLOW_USE_SYCL
REGISTER2(UnaryOp, SYCL, "Atanh", functor::atanh, float, double);
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER2(UnaryOp, GPU, "Atanh", functor::atanh, float, double);


@ -49,7 +49,11 @@ template <typename T>
struct scalar_asinh_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
#if EIGEN_HAS_CXX11_MATH
return numext::asinh(a);
#else
return std::asinh(a);
#endif // EIGEN_HAS_CXX11_MATH
}
};
template <typename T>
@ -61,7 +65,11 @@ template <typename T>
struct scalar_acosh_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_acosh_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
#if EIGEN_HAS_CXX11_MATH
return numext::acosh(a);
#else
return std::acosh(a);
#endif // EIGEN_HAS_CXX11_MATH
}
};
template <typename T>
@ -73,7 +81,11 @@ template <typename T>
struct scalar_atanh_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_atanh_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
#if EIGEN_HAS_CXX11_MATH
return numext::atanh(a);
#else
return std::atanh(a);
#endif // EIGEN_HAS_CXX11_MATH
}
};
template <typename T>


@ -231,7 +231,8 @@ static void CopyOutputBackpropRegion(const DepthwiseArgs& args,
}
// Pad to vector-register width (if needed).
for (int64 d = 0; d < pad_size; ++d) {
buffer[buf_base + vectorized_size + scalar_size + d] = 0;
buffer[buf_base + vectorized_size + scalar_size + d] =
static_cast<T>(0);
}
}
}
@ -297,7 +298,7 @@ static void ComputeBackpropInput(const DepthwiseArgs& args,
for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
// Reset accumulator.
auto vaccum = Eigen::internal::pset1<Packet>(0);
auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
for (int j = 0; j < filter_spatial_size; ++j) {
// Calculate index.
const int64 index = i + j * padded_filter_inner_dim_size;
@ -318,7 +319,7 @@ static void ComputeBackpropInput(const DepthwiseArgs& args,
}
if (output_scalar_size > 0) {
auto vaccum = Eigen::internal::pset1<Packet>(0);
auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
for (int j = 0; j < filter_spatial_size; ++j) {
const int64 index =
output_vectorized_size + j * padded_filter_inner_dim_size;
@ -346,7 +347,7 @@ static void ComputeBackpropInput(const DepthwiseArgs& args,
if (depth_multiplier > 1) {
for (int64 d = 0; d < in_depth; ++d) {
const int64 index = d * args.depth_multiplier;
T accum = 0;
T accum = static_cast<T>(0);
for (int64 dm = 0; dm < dm_vectorized_size; dm += kPacketSize) {
const auto v = Eigen::internal::ploadu<Packet>(out_buffer + index + dm);
accum += Eigen::internal::predux(v);
@ -510,6 +511,8 @@ static void DepthwiseConvBackpropInputReference(const DepthwiseArgs& args,
#if GOOGLE_CUDA
extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice,
Eigen::half>;
extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>;
extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>;
@ -884,6 +887,8 @@ static void DepthwiseConvBackpropFilterReference(const DepthwiseArgs& args,
#if GOOGLE_CUDA
extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice,
Eigen::half>;
extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>;
extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>;


@ -94,7 +94,7 @@ struct DepthwiseConv2DKernel {
for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
// Reset accumulator.
auto vaccum = Eigen::internal::pset1<Packet>(0);
auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
for (int j = 0; j < filter_spatial_size; ++j) {
// Calculate index.
const int64 index = i + j * padded_filter_inner_dim_size;
@ -115,7 +115,7 @@ struct DepthwiseConv2DKernel {
}
if (output_scalar_size > 0) {
auto vaccum = Eigen::internal::pset1<Packet>(0);
auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
for (int j = 0; j < filter_spatial_size; ++j) {
const int64 index =
output_vectorized_size + j * padded_filter_inner_dim_size;
@ -246,6 +246,7 @@ extern template class LaunchConv2DOp<CPUDevice, float>;
#if GOOGLE_CUDA
// Extern template instantiated in depthwise_conv_op_gpu.cc.
extern template struct LaunchDepthwiseConvOp<GPUDevice, Eigen::half>;
extern template struct LaunchDepthwiseConvOp<GPUDevice, float>;
extern template struct LaunchDepthwiseConvOp<GPUDevice, double>;
@ -419,12 +420,18 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
Name("DepthwiseConv2dNative").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
DepthwiseConv2dNativeOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
#if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG)
TF_CALL_double(REGISTER_CPU_KERNEL);
#endif
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative")
.Device(DEVICE_GPU)
.TypeConstraint<Eigen::half>("T"),
DepthwiseConv2dNativeOp<GPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(
Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint<float>("T"),
DepthwiseConv2dNativeOp<GPUDevice, float>);


@ -158,7 +158,8 @@ struct DepthwiseFilterPadOp {
}
// Pad the remainder of output to vector-register boundary.
for (int64 j = 0; j < pad_size; ++j) {
padded_filter[output_base + vectorized_size + scalar_size + j] = 0;
padded_filter[output_base + vectorized_size + scalar_size + j] =
static_cast<T>(0);
}
}
}
@ -266,7 +267,7 @@ struct DepthwiseInputCopyOp {
// Pad the remainder of the output to vector register boundary.
for (int64 d = 0; d < output_pad_size; ++d) {
in_buf[d] = 0;
in_buf[d] = static_cast<T>(0);
}
in_buf += output_pad_size;


@ -105,7 +105,7 @@ __global__ void __launch_bounds__(1024, 2)
const int input_row_end = input_row_start + filter_rows;
const int input_col_end = input_col_start + filter_cols;
T sum = 0;
T sum = static_cast<T>(0);
const int input_offset_temp = in_rows * OB;
if (input_row_start >= 0 && input_col_start >= 0 &&
@ -258,8 +258,8 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
__syncthreads();
if (depth_in_range) {
T sum1 = 0;
T sum2 = 0;
T sum1 = static_cast<T>(0);
T sum2 = static_cast<T>(0);
int shared_offset = data_idx;
const T* filter_ptr = filter_read_offset + shared_data;
UNROLL for (int r = 0; r < filter_rows; ++r) {
@ -369,7 +369,7 @@ __global__ void __launch_bounds__(1024, 2)
const int input_row_end = input_row_start + filter_rows;
const int input_col_end = input_col_start + filter_cols;
T sum = 0;
T sum = static_cast<T>(0);
if (input_row_start >= 0 && input_col_start >= 0 &&
input_row_end < in_rows && input_col_end < in_cols) {
// Loop that doesn't need to check for boundary conditions.
@ -529,8 +529,8 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
__syncthreads();
if (slice_in_range) {
T sum1 = 0;
T sum2 = 0;
T sum1 = static_cast<T>(0);
T sum2 = static_cast<T>(0);
int shared_offset = data_idx;
const T* filter_ptr = filter_read_offset + shared_data;
UNROLL for (int r = 0; r < filter_rows; ++r) {
@ -710,6 +710,7 @@ void LaunchDepthwiseConvOp<GPUDevice, T>::operator()(OpKernelContext* ctx,
"Launch of gpu kernel for DepthwiseConv2dGPULaunch failed"));
}
template struct LaunchDepthwiseConvOp<GPUDevice, Eigen::half>;
template struct LaunchDepthwiseConvOp<GPUDevice, float>;
template struct LaunchDepthwiseConvOp<GPUDevice, double>;
@ -744,7 +745,7 @@ __global__ void __launch_bounds__(640, 2)
const int in_r = (thread_id / in_depth / in_cols) % in_rows;
const int b = thread_id / in_depth / in_cols / in_rows;
T sum = 0;
T sum = static_cast<T>(0);
const int out_r_start =
tf_max<int>(0, (in_r - filter_rows + pad_rows + stride) / stride);
@ -810,7 +811,7 @@ __global__ void __launch_bounds__(640, 2)
const int in_d = (thread_id / in_cols / in_rows) % in_depth;
const int b = thread_id / in_depth / in_cols / in_rows;
T sum = 0;
T sum = static_cast<T>(0);
const int out_d_start = in_d * depth_multiplier;
const int out_d_end = out_d_start + depth_multiplier;
@ -919,6 +920,7 @@ void LaunchDepthwiseConvBackpropInputOp<GPUDevice, T>::operator()(
"utGPULaunch failed"));
}
template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, Eigen::half>;
template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>;
template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>;
@ -1631,6 +1633,7 @@ void LaunchDepthwiseConvBackpropFilterOp<GPUDevice, T>::operator()(
"terGPULaunch failed"));
}
template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, Eigen::half>;
template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>;
template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>;
} // namespace tensorflow
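The recurring change in the depthwise-convolution hunks above is replacing bare 0 initializers with static_cast<T>(0) (and pset1<Packet>(static_cast<T>(0))). This appears to accompany the new Eigen::half registrations: unlike float or double, Eigen::half cannot always be initialized from a plain integer literal inside these templates, so the explicit cast keeps the code compiling for every instantiated T. A hedged, self-contained illustration of the pattern (not TensorFlow code):

// In a template over the element type T, `T sum = 0;` relies on an implicit
// int -> T conversion that types such as Eigen::half may not provide in this
// context; `static_cast<T>(0)` works for any T constructible from an
// arithmetic type.
template <typename T>
T SumOfSquares(const T* data, int n) {
  T sum = static_cast<T>(0);  // works for float, double, Eigen::half, ...
  for (int i = 0; i < n; ++i) sum += data[i] * data[i];
  return sum;
}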

Some files were not shown because too many files have changed in this diff.