Merge changes from github.

PiperOrigin-RevId: 180746153
This commit is contained in:
Raghuraman Krishnamoorthi 2018-01-03 19:06:54 -08:00 committed by TensorFlower Gardener
parent 2eef71c3f9
commit 71896cc7e5
103 changed files with 2654 additions and 744 deletions

View File

@ -1,4 +1,4 @@
Copyright 2017 The TensorFlow Authors. All rights reserved.
Copyright 2018 The TensorFlow Authors. All rights reserved.
Apache License
Version 2.0, January 2004

View File

@ -46,8 +46,8 @@ packages on Linux, Mac, and Windows.
**Individual whl files**
* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/))
* Linux GPU: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/42/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/)) / [Python 3.6](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp36-cp36m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=cpu-slave/))
* Linux GPU: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/42/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/)) / [Python 3.6](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp36-cp36m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/))
* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/))
* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly_gpu-1.head-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly_gpu-1.head-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=36/))

View File

@ -763,24 +763,6 @@ Status LgammaGrad(const Scope& scope, const Operation& op,
}
REGISTER_GRADIENT_OP("Lgamma", LgammaGrad);
Status SelectGrad(const Scope& scope, const Operation& op,
const std::vector<Output>& grad_inputs,
std::vector<Output>* grad_outputs) {
auto comparator = op.input(0);
auto x = op.input(1);
auto zeros = ZerosLike(scope, x);
auto grad = grad_inputs[0];
auto gx_1 = Where3(scope, comparator, grad, zeros);
auto gx_2 = Where3(scope, comparator, zeros, grad);
grad_outputs->push_back(NoGradient());
grad_outputs->push_back(gx_1);
grad_outputs->push_back(gx_2);
return scope.status();
}
REGISTER_GRADIENT_OP("Select", SelectGrad);
Status MinOrMaxGrad(const Scope& scope, const Operation& op,
const std::vector<Output>& grad_inputs,
std::vector<Output>* grad_outputs) {

View File

@ -904,13 +904,5 @@ TEST_F(NaryGradTest, Prod) {
RunTest({x}, {x_shape}, {y}, {y_shape});
}
TEST_F(NaryGradTest, Select) {
TensorShape shape({3, 4});
auto x1 = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
auto x2 = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
auto y = Where3(scope_, Greater(scope_, x1, x2), x1, x2);
RunTest({x1, x2}, {shape, shape}, {y}, {shape});
}
} // namespace
} // namespace tensorflow

View File

@ -116,7 +116,7 @@ Status CreateCond(const Scope& scope, const CondGraphBuilderFn& cond,
return Status::OK();
}
// Create the bdoy subgraph defined by `body`. `outputs` must be non-null and
// Create the body subgraph defined by `body`. `outputs` must be non-null and
// empty.
Status CreateBody(const Scope& scope, const BodyGraphBuilderFn& body,
const std::vector<Output>& inputs,

View File

@ -4,6 +4,7 @@ package(
default_visibility = ["//tensorflow/compiler/tf2xla:internal"],
)
load("//tensorflow:tensorflow.bzl", "tf_copts")
load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
tf_kernel_library(
@ -167,6 +168,7 @@ tf_kernel_library(
cc_library(
name = "index_ops_kernel_argmax_float_1d",
srcs = ["index_ops_kernel_argmax_float_1d.cc"],
copts = tf_copts(),
visibility = ["//visibility:public"],
deps = [
"//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
@ -179,6 +181,7 @@ cc_library(
cc_library(
name = "index_ops_kernel_argmax_float_2d",
srcs = ["index_ops_kernel_argmax_float_2d.cc"],
copts = tf_copts(),
visibility = ["//visibility:public"],
deps = [
"//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",

View File

@ -39,7 +39,7 @@ Status MakeXlaCompilerArgumentsFromInputs(
*has_uninitialized_vars = false;
*has_tensor_arrays = false;
for (int i = 0; i < ctx->num_inputs(); ++i) {
VLOG(2) << " Input " << i
VLOG(2) << " Input " << i
<< " type: " << DataTypeString(ctx->input_type(i))
<< " shape: " << ctx->InputShape(i).DebugString();
XlaCompiler::Argument& arg = (*args)[i];

View File

@ -19,7 +19,7 @@ limitations under the License.
#include "third_party/eigen3/Eigen/Core"
#ifdef __AVX__
#ifdef TF_XLA_HAS_AVX
xla::cpu::runtime::V8F32AVX __xla_cpu_runtime_ExpV8F32AVX(
xla::cpu::runtime::V8F32AVX x) {
return Eigen::internal::pexp(x);
@ -29,7 +29,7 @@ xla::cpu::runtime::V8F32AVX __xla_cpu_runtime_LogV8F32AVX(
xla::cpu::runtime::V8F32AVX x) {
return Eigen::internal::plog(x);
}
#endif // __AVX__
#endif // TF_XLA_HAS_AVX
namespace xla {
namespace cpu {

View File

@ -24,6 +24,11 @@ limitations under the License.
#include "tensorflow/core/platform/macros.h"
#if defined(__AVX__)
#include <immintrin.h>
#define TF_XLA_HAS_AVX
#endif
namespace xla {
namespace cpu {
namespace runtime {
@ -31,14 +36,16 @@ namespace runtime {
extern const char *const kExpV8F32AVXSymbolName;
extern const char *const kLogV8F32AVXSymbolName;
typedef float V8F32AVX __attribute__((__vector_size__(32)));
#ifdef TF_XLA_HAS_AVX
typedef __m256 V8F32AVX;
#endif
} // namespace runtime
} // namespace cpu
} // namespace xla
extern "C" {
#ifdef __AVX__
#ifdef TF_XLA_HAS_AVX
// The following functions are vectorized versions of a selection of libm
// library functions.
// References to these functions are created by the LLVM vectorizer.

View File

@ -19,7 +19,7 @@ limitations under the License.
#include "third_party/eigen3/Eigen/Core"
#ifdef __ARM_NEON__
#ifdef TF_XLA_HAS_NEON
xla::cpu::runtime::V4F32NEON __xla_cpu_runtime_ExpV4F32NEON(
xla::cpu::runtime::V4F32NEON x) {
@ -32,7 +32,7 @@ xla::cpu::runtime::V4F32NEON __xla_cpu_runtime_LogV4F32NEON(
return Eigen::internal::plog(p);
}
#endif // __ARM_NEON__
#endif // TF_XLA_HAS_NEON
namespace xla {
namespace cpu {

View File

@ -27,6 +27,7 @@ limitations under the License.
// __attribute__((__vector_size__(*))). Unfortunately, the typedef for the ARM
// NEON SIMD types is not portable, so the type has to come from <arm_neon.h>
#include <arm_neon.h>
#define TF_XLA_HAS_NEON
#endif // __ARM_NEON__
namespace xla {
@ -36,12 +37,9 @@ namespace runtime {
extern const char *const kExpV4F32NEONSymbolName;
extern const char *const kLogV4F32NEONSymbolName;
#ifdef __ARM_NEON__
#ifdef TF_XLA_HAS_NEON
typedef float32x4_t V4F32NEON;
#else
// On non-ARM platforms ensure the declaration is present
struct V4F32NEON;
#endif // __ARM_NEON__
#endif // TF_XLA_HAS_NEON
} // namespace runtime
} // namespace cpu
@ -49,7 +47,7 @@ struct V4F32NEON;
extern "C" {
#ifdef __ARM_NEON__
#ifdef TF_XLA_HAS_NEON
// The following functions are vectorized versions of a selection of libm
// library functions.
// References to these functions are created by the LLVM vectorizer.
@ -58,7 +56,7 @@ xla::cpu::runtime::V4F32NEON __xla_cpu_runtime_ExpV4F32NEON(
xla::cpu::runtime::V4F32NEON __xla_cpu_runtime_LogV4F32NEON(
xla::cpu::runtime::V4F32NEON x);
#endif // __ARM_NEON__
#endif // TF_XLA_HAS_NEON
}
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_NEON_H_

View File

@ -19,7 +19,7 @@ limitations under the License.
#include "third_party/eigen3/Eigen/Core"
#ifdef __SSE4_1__
#ifdef TF_XLA_HAS_SSE4_1
xla::cpu::runtime::V4F32SSE __xla_cpu_runtime_ExpV4F32SSE(
xla::cpu::runtime::V4F32SSE x) {
@ -33,7 +33,7 @@ xla::cpu::runtime::V4F32SSE __xla_cpu_runtime_LogV4F32SSE(
return Eigen::internal::plog(p);
}
#endif // __SSE4_1__
#endif // TF_XLA_HAS_SSE4_1
namespace xla {
namespace cpu {

View File

@ -24,6 +24,13 @@ limitations under the License.
#include "tensorflow/core/platform/macros.h"
// MSVC does not have the __SSE4_1__ macro. Eigen enables EIGEN_VECTORIZE_SSE4_1
// when __AVX__ is defined, so we should do the same.
#if defined(__SSE4_1__) || (defined(_MSC_VER) && defined(__AVX__))
#include <smmintrin.h>
#define TF_XLA_HAS_SSE4_1
#endif
namespace xla {
namespace cpu {
namespace runtime {
@ -31,7 +38,9 @@ namespace runtime {
extern const char *const kExpV4F32SSESymbolName;
extern const char *const kLogV4F32SSESymbolName;
typedef float V4F32SSE __attribute__((__vector_size__(16)));
#ifdef TF_XLA_HAS_SSE4_1
typedef __m128 V4F32SSE;
#endif
} // namespace runtime
} // namespace cpu
@ -39,7 +48,7 @@ typedef float V4F32SSE __attribute__((__vector_size__(16)));
extern "C" {
#ifdef __SSE4_1__
#ifdef TF_XLA_HAS_SSE4_1
// The following functions are vectorized versions of a selection of libm
// library functions.
// References to these functions are created by the LLVM vectorizer.
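Taken together, the AVX, NEON, and SSE changes above follow one pattern: the compiler macro (`__AVX__`, `__ARM_NEON__`, `__SSE4_1__`) is tested in exactly one header, next to the intrinsic include, where it defines a `TF_XLA_HAS_*` feature macro; every other file keys off the feature macro and picks up the real intrinsic vector type instead of a GCC `__vector_size__` typedef. A condensed sketch of the idea, with hypothetical names (`my_feature.h`, `VectorExp`):

```cpp
// my_feature.h -- define the feature macro once, where the intrinsics live.
// MSVC has no __SSE4_1__, so also accept _MSC_VER with __AVX__, mirroring the
// check added in this commit.
#if defined(__SSE4_1__) || (defined(_MSC_VER) && defined(__AVX__))
#include <smmintrin.h>
#define TF_XLA_HAS_SSE4_1
#endif

#ifdef TF_XLA_HAS_SSE4_1
typedef __m128 V4F32SSE;  // the real intrinsic type, not a vector typedef
extern "C" V4F32SSE VectorExp(V4F32SSE x);  // hypothetical vectorized libm entry
#endif

// any_other_file.cc -- test only the feature macro, never the compiler macro.
#ifdef TF_XLA_HAS_SSE4_1
// ... register or call VectorExp ...
#endif
```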

View File

@ -64,14 +64,14 @@ llvm::Function* EmitVectorF32TanhIfNeeded(llvm::Module* module,
&ir_builder),
llvm::ConstantFP::get(vector_type, 9.0), &ir_builder);
std::array<float, 7> numerator_coeffs(
{-2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f,
4.89352455891786e-03f});
std::array<float, 7> numerator_coeffs{
-2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f,
4.89352455891786e-03f};
std::array<float, 4> denominator_coeffs(
{1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f,
4.89352518554385e-03f});
std::array<float, 4> denominator_coeffs{
1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f,
4.89352518554385e-03f};
llvm::Value* input_squared =
ir_builder.CreateFMul(input_clamped, input_clamped);
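For context on the arrays above: the emitted tanh clamps its input to [-9, 9] (visible earlier in this hunk), squares it, and then evaluates a rational approximation whose numerator is x times a degree-6 polynomial in x² and whose denominator is a degree-3 polynomial in x², both by Horner's rule. A scalar C++ sketch consistent with those coefficients (a reading aid for the IR-emission code, not the code itself):

```cpp
#include <algorithm>
#include <array>

// Scalar model of the vectorized tanh: tanh(x) ~= x * P(x^2) / Q(x^2)
// on the clamped range [-9, 9], with P and Q evaluated by Horner's rule.
float FastTanhRef(float x) {
  static const std::array<float, 7> p = {
      -2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
      5.12229709037114e-08f,  1.48572235717979e-05f, 6.37261928875436e-04f,
      4.89352455891786e-03f};
  static const std::array<float, 4> q = {
      1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f,
      4.89352518554385e-03f};
  x = std::min(9.0f, std::max(-9.0f, x));
  const float x2 = x * x;
  float num = p[0];
  for (int i = 1; i < 7; ++i) num = num * x2 + p[i];
  float den = q[0];
  for (int i = 1; i < 4; ++i) den = den * x2 + q[i];
  return x * num / den;
}
```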

View File

@ -103,17 +103,17 @@ llvm::StringRef GetHostCpuName() {
CompilerFunctor::VectorIntrinsics GetAvailableIntrinsics() {
CompilerFunctor::VectorIntrinsics intrinsics;
#ifdef __SSE4_1__
#ifdef TF_XLA_HAS_SSE4_1
intrinsics.sse_intrinsics = true;
#else
intrinsics.sse_intrinsics = false;
#endif
#ifdef __AVX__
#ifdef TF_XLA_HAS_AVX
intrinsics.avx_intrinsics = true;
#else
intrinsics.avx_intrinsics = false;
#endif
#ifdef __ARM_NEON__
#ifdef TF_XLA_HAS_NEON
intrinsics.neon_intrinsics = true;
#else
intrinsics.neon_intrinsics = false;
@ -215,15 +215,15 @@ bool RegisterKnownJITSymbols() {
REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF32);
REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF32);
REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF64);
#ifdef __ARM_NEON__
#ifdef TF_XLA_HAS_NEON
REGISTER_CPU_RUNTIME_SYMBOL(ExpV4F32NEON);
REGISTER_CPU_RUNTIME_SYMBOL(LogV4F32NEON);
#endif
#ifdef __SSE4_1__
#ifdef TF_XLA_HAS_SSE4_1
REGISTER_CPU_RUNTIME_SYMBOL(ExpV4F32SSE);
REGISTER_CPU_RUNTIME_SYMBOL(LogV4F32SSE);
#endif
#ifdef __AVX__
#ifdef TF_XLA_HAS_AVX
REGISTER_CPU_RUNTIME_SYMBOL(ExpV8F32AVX);
REGISTER_CPU_RUNTIME_SYMBOL(LogV8F32AVX);
#endif

View File

@ -407,7 +407,9 @@ endif()
# Let's get to work!
include(tf_core_framework.cmake)
include(tf_stream_executor.cmake)
if (tensorflow_ENABLE_GPU)
include(tf_stream_executor.cmake)
endif()
include(tf_core_cpu.cmake)
include(tf_core_ops.cmake)

View File

@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
(
cd "$(dirname "$0")"
mkdir -p _build
(
cd _build
rm -rf -- *
cmake ..
)
)

View File

@ -86,9 +86,7 @@ tensorflow/python/ops/distributions
tensorflow/python/ops/linalg
tensorflow/python/ops/losses
tensorflow/python/platform
tensorflow/python/platform/default
tensorflow/python/platform/summary
tensorflow/python/profiler/
tensorflow/python/profiler
tensorflow/python/profiler/internal
tensorflow/python/saved_model
tensorflow/python/summary
@ -115,8 +113,6 @@ tensorflow/contrib/batching/kernels
tensorflow/contrib/batching/python
tensorflow/contrib/batching/python/ops
tensorflow/contrib/bayesflow
tensorflow/contrib/bayesflow/examples
tensorflow/contrib/bayesflow/examples/reinforce_simple
tensorflow/contrib/bayesflow/python
tensorflow/contrib/bayesflow/python/ops
tensorflow/contrib/boosted_trees
@ -212,16 +208,6 @@ tensorflow/contrib/input_pipeline/python/ops
tensorflow/contrib/integrate
tensorflow/contrib/integrate/python
tensorflow/contrib/integrate/python/ops
tensorflow/contrib/ios_examples
tensorflow/contrib/ios_examples/benchmark
tensorflow/contrib/ios_examples/benchmark/benchmark.xcodeproj
tensorflow/contrib/ios_examples/benchmark/data
tensorflow/contrib/ios_examples/camera
tensorflow/contrib/ios_examples/camera/camera_example.xcodeproj
tensorflow/contrib/ios_examples/camera/en.lproj
tensorflow/contrib/ios_examples/simple
tensorflow/contrib/ios_examples/simple/data
tensorflow/contrib/ios_examples/simple/tf_ios_makefile_example.xcodeproj
tensorflow/contrib/keras
tensorflow/contrib/keras/api
tensorflow/contrib/keras/api/keras
@ -276,9 +262,6 @@ tensorflow/contrib/layers/python/ops
tensorflow/contrib/learn
tensorflow/contrib/learn/python
tensorflow/contrib/learn/python/learn
tensorflow/contrib/learn/python/learn/dataframe
tensorflow/contrib/learn/python/learn/dataframe/queues
tensorflow/contrib/learn/python/learn/dataframe/transforms
tensorflow/contrib/learn/python/learn/datasets
tensorflow/contrib/learn/python/learn/datasets/data
tensorflow/contrib/learn/python/learn/estimators
@ -301,6 +284,9 @@ tensorflow/contrib/linear_optimizer/kernels
tensorflow/contrib/linear_optimizer/kernels/g3doc
tensorflow/contrib/linear_optimizer/python
tensorflow/contrib/linear_optimizer/python/ops
# TODO(drpngx): Fix failing imports
# tensorflow/contrib/lite/python
# tensorflow/contrib/lite/toco/python
tensorflow/contrib/lookup
tensorflow/contrib/losses
tensorflow/contrib/losses/python
@ -314,7 +300,6 @@ tensorflow/contrib/memory_stats/python
tensorflow/contrib/memory_stats/python/ops
tensorflow/contrib/meta_graph_transform
tensorflow/contrib/metrics
tensorflow/contrib/metrics/ops
tensorflow/contrib/metrics/python
tensorflow/contrib/metrics/python/metrics
tensorflow/contrib/metrics/python/ops
@ -346,7 +331,6 @@ tensorflow/contrib/pi_examples/label_image
tensorflow/contrib/pi_examples/label_image/data
tensorflow/contrib/periodic_resample
tensorflow/contrib/periodic_resample/python
tensorflow/contrib/periodic_resample/python/kernels
tensorflow/contrib/periodic_resample/python/ops
tensorflow/contrib/predictor
tensorflow/contrib/quantization
@ -411,13 +395,9 @@ tensorflow/contrib/tensorboard/plugins
tensorflow/contrib/tensorboard/plugins/projector
tensorflow/contrib/tensor_forest
tensorflow/contrib/tensor_forest/client
tensorflow/contrib/tensor_forest/core
tensorflow/contrib/tensor_forest/core/ops
tensorflow/contrib/tensor_forest/data
tensorflow/contrib/tensor_forest/hybrid
tensorflow/contrib/tensor_forest/hybrid/core
tensorflow/contrib/tensor_forest/hybrid/core/ops
tensorflow/contrib/tensor_forest/hybrid/ops
tensorflow/contrib/tensor_forest/hybrid/python
tensorflow/contrib/tensor_forest/hybrid/python/layers
tensorflow/contrib/tensor_forest/hybrid/python/models

View File

@ -126,10 +126,15 @@ STRING(REGEX REPLACE ";" "\\\\;" python_protos "${python_protos}")
STRING(REGEX REPLACE "\n" ";" python_protos "${python_protos}")
foreach(python_proto ${python_protos})
file(GLOB_RECURSE tf_python_protos_src RELATIVE ${tensorflow_source_dir}
"${tensorflow_source_dir}/${python_proto}/*.proto"
)
list(APPEND tf_python_protos_srcs ${tf_python_protos_src})
if(NOT python_proto MATCHES "\#")
if(NOT EXISTS "${tensorflow_source_dir}/${python_proto}")
message(SEND_ERROR "Python proto directory not found: ${python_proto}")
endif()
file(GLOB_RECURSE tf_python_protos_src RELATIVE ${tensorflow_source_dir}
"${tensorflow_source_dir}/${python_proto}/*.proto"
)
list(APPEND tf_python_protos_srcs ${tf_python_protos_src})
endif()
endforeach(python_proto)
RELATIVE_PROTOBUF_GENERATE_PYTHON(
@ -142,10 +147,15 @@ STRING(REGEX REPLACE ";" "\\\\;" python_protos_cc "${python_protos_cc}")
STRING(REGEX REPLACE "\n" ";" python_protos_cc "${python_protos_cc}")
foreach(python_proto_cc ${python_protos_cc})
file(GLOB_RECURSE tf_python_protos_cc_src RELATIVE ${tensorflow_source_dir}
"${tensorflow_source_dir}/${python_proto_cc}/*.proto"
)
list(APPEND tf_python_protos_cc_srcs ${tf_python_protos_cc_src})
if(NOT python_proto_cc MATCHES "\#")
if(NOT EXISTS "${tensorflow_source_dir}/${python_proto_cc}")
message(SEND_ERROR "Python proto CC directory not found: ${python_proto_cc}")
endif()
file(GLOB_RECURSE tf_python_protos_cc_src RELATIVE ${tensorflow_source_dir}
"${tensorflow_source_dir}/${python_proto_cc}/*.proto"
)
list(APPEND tf_python_protos_cc_srcs ${tf_python_protos_cc_src})
endif()
endforeach(python_proto_cc)
RELATIVE_PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS
@ -199,7 +209,12 @@ STRING(REGEX REPLACE ";" "\\\\;" python_modules "${python_modules}")
STRING(REGEX REPLACE "\n" ";" python_modules "${python_modules}")
foreach(python_module ${python_modules})
add_python_module(${python_module})
if(NOT python_module MATCHES "\#")
if(NOT EXISTS "${tensorflow_source_dir}/${python_module}")
message(SEND_ERROR "Python module not found: ${python_module}")
endif()
add_python_module(${python_module})
endif()
endforeach(python_module)
add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD

View File

@ -96,7 +96,7 @@ class GANEstimator(estimator.Estimator):
# Generate samples from generator.
predictions = np.array([
x for x in gan_estimator.predict(predict_input_fn)])
```
```
"""
def __init__(self,

View File

@ -2674,16 +2674,25 @@ def spatial_softmax(features,
indexing='ij')
pos_x = array_ops.reshape(pos_x, [height * width])
pos_y = array_ops.reshape(pos_y, [height * width])
if temperature is None:
temperature_collections = utils.get_variable_collections(
variables_collections, 'temperature')
temperature = variables.model_variable(
'temperature',
shape=(),
dtype=dtypes.float32,
initializer=init_ops.ones_initializer(),
collections=temperature_collections,
trainable=trainable)
temp_initializer = init_ops.ones_initializer()
else:
temp_initializer = init_ops.constant_initializer(temperature)
if not trainable:
temp_collections = None
else:
temp_collections = utils.get_variable_collections(
variables_collections, 'temperature')
temperature = variables.model_variable(
'temperature',
shape=(),
dtype=dtypes.float32,
initializer=temp_initializer,
collections=temp_collections,
trainable=trainable)
if data_format == 'NCHW':
features = array_ops.reshape(features, [-1, height * width])
else:

View File

@ -22,10 +22,6 @@ limitations under the License.
#include "tensorflow/core/lib/strings/str_util.h"
namespace tensorflow {
namespace {
template <typename T>
bool ConvertHelper(const string& s, T* value);
}
template <typename T, typename Tlabel>
class DecodeLibsvmOp : public OpKernel {
@ -57,7 +53,7 @@ class DecodeLibsvmOp : public OpKernel {
"]: \"", input_flat(i), "\""));
Tlabel label_value;
OP_REQUIRES(
ctx, ConvertHelper<Tlabel>(entries[0], &label_value),
ctx, strings::SafeStringToNumeric<Tlabel>(entries[0], &label_value),
errors::InvalidArgument("Label format incorrect: ", entries[0]));
label(i) = label_value;
for (int j = 1; j < entries.size(); j++) {
@ -74,7 +70,7 @@ class DecodeLibsvmOp : public OpKernel {
"Feature index should be >= 0, got ", feature_index));
T feature_value;
OP_REQUIRES(
ctx, ConvertHelper<T>(pair[1], &feature_value),
ctx, strings::SafeStringToNumeric<T>(pair[1], &feature_value),
errors::InvalidArgument("Feature format incorrect: ", entries[j]));
out_values.emplace_back(feature_value);
out_indices.emplace_back(std::pair<int64, int64>(i, feature_index));
@ -128,25 +124,6 @@ class DecodeLibsvmOp : public OpKernel {
int64 num_features_;
};
namespace {
template <>
bool ConvertHelper<float>(const string& s, float* value) {
return strings::safe_strtof(s.c_str(), value);
}
template <>
bool ConvertHelper<double>(const string& s, double* value) {
return strings::safe_strtod(s.c_str(), value);
}
template <>
bool ConvertHelper<int32>(const string& s, int32* value) {
return strings::safe_strto32(s.c_str(), value);
}
template <>
bool ConvertHelper<int64>(const string& s, int64* value) {
return strings::safe_strto64(s.c_str(), value);
}
} // namespace
#define REGISTER_KERNEL(type) \
REGISTER_KERNEL_BUILDER(Name("DecodeLibsvm") \
.Device(DEVICE_CPU) \

View File

@ -80,8 +80,7 @@ FlatBufferModel::FlatBufferModel(const char* filename, bool mmap_file,
} else {
allocation_ = new FileCopyAllocation(filename, error_reporter);
}
if (!allocation_->valid()) return;
if (!CheckModelIdentifier()) return;
if (!allocation_->valid() || !CheckModelIdentifier()) return;
model_ = VerifyAndGetModel(allocation_->base(), allocation_->bytes());
}

View File

@ -137,8 +137,8 @@ Following are the ops supported for using On-Device Smart Reply model:
* **HASHTABLE_LOOKUP**
This is a custom op that uses label id from predict op and looks up the
response text from the given label id.
This is an op inside TensorFlow Lite that uses label id from predict op and
looks up the response text from the given label id.
## Further Information

View File

@ -144,6 +144,7 @@ py_test(
tags = ["no_pip"],
deps = [
":predictor_factories",
":testing_common",
],
)

View File

@ -68,10 +68,10 @@ class CoreEstimatorPredictor(predictor.Predictor):
serving_input_receiver = serving_input_receiver_fn()
signature_def = _get_signature_def(
serving_input_receiver, estimator, output_key)
checkpoint_path = estimator.model_dir
checkpoint_dir = estimator.model_dir
self._session = monitored_session.MonitoredSession(
session_creator=monitored_session.ChiefSessionCreator(
checkpoint_filename_with_path=checkpoint_path))
checkpoint_dir=checkpoint_dir))
feed_tensor_info = signature_def.inputs
self._feed_tensors = {k: self._graph.get_tensor_by_name(v.name)

View File

@ -21,6 +21,8 @@ from __future__ import print_function
from tensorflow.contrib.predictor import contrib_estimator_predictor
from tensorflow.contrib.predictor import core_estimator_predictor
from tensorflow.contrib.predictor import saved_model_predictor
from tensorflow.contrib.learn.python.learn.estimators import estimator as contrib_estimator
from tensorflow.python.estimator import estimator as core_estimator
@ -85,7 +87,7 @@ def from_estimator(estimator,
TypeError: if `estimator` is a contrib `Estimator` instead of a core
`Estimator`.
"""
if isinstance(estimator, estimator.Estimator):
if isinstance(estimator, contrib_estimator.Estimator):
raise TypeError('Espected estimator to be of type '
'tf.python.estimator.Estimator, but got type '
'tf.contrib.learn.Estimator. You likely want to call '

View File

@ -19,6 +19,7 @@ from __future__ import division
from __future__ import print_function
from tensorflow.contrib.predictor import predictor_factories
from tensorflow.contrib.predictor import testing_common
from tensorflow.python.platform import test
MODEL_DIR_NAME = 'contrib/predictor/test_export_dir'
@ -46,6 +47,29 @@ class PredictorFactoriesTest(test.TestCase):
with self.assertRaisesRegexp(RuntimeError, bad_tags_regex):
predictor_factories.from_saved_model(self._export_dir, tags='bad_tag')
def testFromContribEstimator(self):
estimator = testing_common.get_arithmetic_estimator(core=False)
input_fn = testing_common.get_arithmetic_input_fn(core=False)
predictor_factories.from_contrib_estimator(estimator, input_fn,
output_alternative_key='sum')
def testFromContribEstimatorWithCoreEstimatorRaises(self):
estimator = testing_common.get_arithmetic_estimator(core=True)
input_fn = testing_common.get_arithmetic_input_fn(core=True)
with self.assertRaises(TypeError):
predictor_factories.from_contrib_estimator(estimator, input_fn)
def testFromCoreEstimator(self):
estimator = testing_common.get_arithmetic_estimator(core=True)
input_fn = testing_common.get_arithmetic_input_fn(core=True)
predictor_factories.from_estimator(estimator, input_fn)
def testFromCoreEstimatorWithContribEstimatorRaises(self):
estimator = testing_common.get_arithmetic_estimator(core=False)
input_fn = testing_common.get_arithmetic_input_fn(core=False)
with self.assertRaises(TypeError):
predictor_factories.from_estimator(estimator, input_fn)
if __name__ == '__main__':
test.main()

View File

@ -50,7 +50,7 @@ def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None):
# A 1024-point STFT with frames of 64 ms and 75% overlap.
stfts = tf.contrib.signal.stft(pcm, frame_length=1024, frame_step=256,
fft_length=1024)
spectrograms = tf.abs(stft)
spectrograms = tf.abs(stfts)
# Warp the linear scale spectrograms into the mel-scale.
num_spectrogram_bins = stfts.shape[-1].value

View File

@ -676,7 +676,7 @@ file were implicitly obtained from each provided variable's `var.op.name`.
This works well when the variable names in the checkpoint file match those in
the graph. However, sometimes, we want to restore a model from a checkpoint
whose variables have different names those in the current graph. In this case,
whose variables have different names to those in the current graph. In this case,
we must provide the `Saver` a dictionary that maps from each checkpoint variable
name to each graph variable. Consider the following example where the checkpoint
variables names are obtained via a simple function:

View File

@ -753,9 +753,10 @@ def train(train_op,
if logdir:
sv.start_standard_services(sess)
elif startup_delay_steps > 0:
# (use sys.maxsize because sys.maxint doesn't exist in Python 3)
_wait_for_step(sess, global_step,
min(startup_delay_steps, number_of_steps or
sys.maxint))
sys.maxsize))
threads = sv.start_queue_runners(sess)
logging.info('Starting Queues.')
if is_chief and sync_optimizer is not None:

View File

@ -530,7 +530,6 @@ py_library(
srcs_version = "PY2AND3",
deps = [
":client_lib",
"//tensorflow/contrib/framework:framework_py",
"//tensorflow/contrib/layers:layers_py",
"//tensorflow/contrib/learn",
"//tensorflow/python:array_ops",

View File

@ -17,7 +17,6 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.contrib import framework as contrib_framework
from tensorflow.contrib import layers
from tensorflow.contrib.learn.python.learn.estimators import estimator
@ -190,7 +189,7 @@ def get_model_fn(params,
features, labels, input_weights=weights,
num_trainers=num_trainers,
trainer_id=trainer_id),
state_ops.assign_add(contrib_framework.get_global_step(), 1))
state_ops.assign_add(training_util.get_global_step(), 1))
# Put weights back in
if weights is not None:

View File

@ -26,7 +26,6 @@ py_library(
"python/training/resample.py",
"python/training/sampling_ops.py",
"python/training/sequence_queueing_state_saver.py",
"python/training/sgdr_learning_rate_decay.py",
"python/training/training.py",
"python/training/tuner.py",
],

View File

@ -79,6 +79,7 @@ load(
"if_linux_x86_64",
"if_mobile",
"if_not_mobile",
"if_windows",
"if_not_windows",
"tf_copts",
"tf_cc_test",
@ -562,7 +563,7 @@ cc_library(
"platform/prefetch.h",
"platform/thread_annotations.h",
"platform/types.h",
],
] + if_windows(["platform/windows/integral_types.h"]),
visibility = ["//visibility:public"],
deps =
[

View File

@ -761,6 +761,10 @@ int64 MinSystemMemory(int64 available_memory) {
// builds in windows); because in non-opt builds more system memory
// is necessary.
min_system_memory *= 2;
#endif
#if defined(NVIDIA_TEGRA)
// 1GB system mem for NVIDIA Tegra devices since they use the same mem for RAM and Video RAM
min_system_memory = 1<<30;
#endif
return min_system_memory;
}

View File

@ -194,10 +194,9 @@ cc_library(
],
)
cc_library(
tf_kernel_library(
name = "fill_functor",
srcs = ["fill_functor.cc"],
hdrs = ["fill_functor.h"],
prefix = "fill_functor",
deps = [
"//tensorflow/core:framework",
"//third_party/eigen3",
@ -3043,6 +3042,7 @@ tf_kernel_library(
"//conditions:default": [],
}),
hdrs = [
"fill_functor.h",
"conv_grad_ops.h",
"deep_conv2d.h",
"gemm_functors.h",
@ -3067,6 +3067,7 @@ tf_kernel_library(
":conv_2d",
":conv_3d",
":image_resizer_state",
":fill_functor",
":ops_util",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
@ -3173,7 +3174,9 @@ tf_kernel_library(
tf_kernel_library(
name = "fused_batch_norm_op",
prefix = "fused_batch_norm_op",
deps = NN_DEPS,
deps = NN_DEPS + [
":fill_functor",
],
)
tf_kernel_library(

View File

@ -151,18 +151,6 @@ typedef Eigen::GpuDevice GPUDevice;
typedef Eigen::SyclDevice SYCLDevice;
#endif // TENSORFLOW_USE_SYCL
namespace functor {
// Partial specialization of FillFunctor<Device=CPUDevice, T>.
template <typename T>
struct FillFunctor<CPUDevice, T> {
void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
typename TTypes<T>::ConstScalar in) {
out.device(d) = out.constant(in());
}
};
} // end namespace functor
template <typename Device, typename T, typename Index>
class FillOp : public OpKernel {
@ -191,28 +179,6 @@ class FillOp : public OpKernel {
}
};
#ifdef TENSORFLOW_USE_SYCL
namespace functor {
// Partial specialization of FillFunctor<Device=SYCLDevice, T>.
template <typename T>
struct FillFunctor<SYCLDevice, T> {
void operator()(const SYCLDevice& d, typename TTypes<T>::Flat out,
typename TTypes<T>::ConstScalar in) {
#if !defined(EIGEN_HAS_INDEX_LIST)
Eigen::array<int, 1> rank1{1};
#else
Eigen::IndexList<Eigen::type2index<1> > rank1;
#endif
const int size = out.dimension(0);
Eigen::array<int, 1> broadcast_dims{size};
To32Bit(out).device(d) = in.reshape(rank1).broadcast(broadcast_dims);
}
};
} // namespace functor
#endif // TENSORFLOW_USE_SYCL
#define REGISTER_KERNEL(D, TYPE) \
REGISTER_KERNEL_BUILDER(Name("Fill") \
.Device(DEVICE_##D) \

View File

@ -30,6 +30,7 @@ limitations under the License.
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/fill_functor.h"
#ifdef TENSORFLOW_USE_LIBXSMM
#include "tensorflow/core/kernels/xsmm_conv2d.h"
#endif
@ -595,6 +596,13 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
if (filter_shape.num_elements() == 0) {
return;
}
// If input is empty, set gradients to zero.
if (input.shape().num_elements() == 0) {
functor::SetZeroFunctor<Device, T> f;
f(context->eigen_device<Device>(), filter_backprop->flat<T>());
return;
}
// For now we take the stride from the second and third dimensions only (we
// do not support striding on the batch or depth dimension).

View File

@ -19,12 +19,15 @@ limitations under the License.
#include <algorithm>
#include <array>
#include <limits>
#include <utility>
#include "cuda/include/cuda.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/lib/math/math_util.h"
namespace tensorflow {
@ -223,185 +226,136 @@ __global__ void SwapDimension1And2InTensor3Simple(int nthreads, const T* input,
// Use shared memory tiles to swap dimension-1 and dimension-2 of a 3D tensor,
// where dimensions are zero-based: output[i][j][k] = input[i][k][j].
//
// Each thread block operates on a single tile, a square of dimensions TileSize
// x TileSize. We require that the thread block's X dimension equals TileSize,
// and its Y dimension equals NumSubTiles.
// Each thread block operates on a single tile, a rectangle of dimensions
// TileSizeI x TileSizeJ.
//
// For best performance, you should probably set TileSize equal to the number of
// threads in a warp (32 in nvidia GPUs). With a TileSize of 32, NumSubTiles ==
// 4 or 8 seems to get the best performance on K40 GPUs.
template <typename T, int TileSize, int NumSubTiles, bool conjugate = false>
__global__ void SwapDimension1And2InTensor3UsingTiles(const T* input,
Dimension<3> input_dims,
T* output) {
// One extra line in the inner dimension to avoid share memory bank conflict.
__shared__ T shared_memory_tile[TileSize][TileSize + 1];
static_assert(TileSize % NumSubTiles == 0,
"TileSize must be divisible by NumSubTiles");
eigen_assert(blockDim.x == TileSize);
eigen_assert(blockDim.y == NumSubTiles);
// In general, for best performance, you should probably set TileSizeI,
// TileSizeJ equal to the number of threads in a warp (32 in nvidia GPUs).
// With a TileSizeI, TileSizeJ of 32, NumThreads of 128 or 256 seems to get
// the best performance on K40 GPUs.
template <typename T, int NumThreads, int TileSizeI, int TileSizeJ,
bool conjugate = false>
__global__ void SwapDimension1And2InTensor3UsingTiles(
const T* __restrict__ input, Dimension<3> input_dims,
T* __restrict__ output) {
eigen_assert(blockDim.x == NumThreads);
eigen_assert(blockDim.y == 1);
eigen_assert(blockDim.z == 1);
eigen_assert(gridDim.y == 1);
eigen_assert(gridDim.z == 1);
// We break down the tile into NumSubTiles groups, so each thread processes
// kSubTileSize elements (except at the edges of the input).
const int kSubTileSize = TileSize / NumSubTiles;
constexpr int ReadRowPerPass = NumThreads / TileSizeJ;
constexpr int WriteRowPerPass = NumThreads / TileSizeI;
// One extra line in the inner dimension to avoid shared memory bank conflict.
__shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1];
int x = threadIdx.x;
Dimension<3> output_dims = {
input_dims[0],
input_dims[2],
input_dims[1],
input_dims[0], input_dims[2], input_dims[1],
};
Dimension<3> input_dims_in_tiles = {
input_dims[0],
(input_dims[1] + TileSize - 1) / TileSize,
(input_dims[2] + TileSize - 1) / TileSize,
input_dims[0], (input_dims[1] + TileSizeI - 1) / TileSizeI,
(input_dims[2] + TileSizeJ - 1) / TileSizeJ,
};
Index<3> input_tile_index =
FlatToTensorIndex(blockIdx.x, input_dims_in_tiles);
Index<3> input_tile_origin = {
input_tile_index[0],
input_tile_index[1] * TileSize,
input_tile_index[2] * TileSize,
input_tile_index[0], input_tile_index[1] * TileSizeI,
input_tile_index[2] * TileSizeJ,
};
int input_origin_flat_index =
TensorIndexToFlat(input_tile_origin, input_dims);
int tile_width = TileSize;
bool full_tile = true;
int tile_width = TileSizeJ;
// Only the last row or column may not have the full size.
if (input_tile_index[2] == input_dims_in_tiles[2] - 1) {
tile_width = input_dims[2] - (input_dims_in_tiles[2] - 1) * TileSize;
tile_width = input_dims[2] - (input_dims_in_tiles[2] - 1) * TileSizeJ;
full_tile &= false;
}
int tile_height = TileSize;
int tile_height = TileSizeI;
if (input_tile_index[1] == input_dims_in_tiles[1] - 1) {
tile_height = input_dims[1] - (input_dims_in_tiles[1] - 1) * TileSize;
tile_height = input_dims[1] - (input_dims_in_tiles[1] - 1) * TileSizeI;
full_tile &= false;
}
int input_flat_index = input_origin_flat_index + x;
int y_start = static_cast<int>(threadIdx.y) * kSubTileSize;
// Calculate effective thread number. This ensures that we use the largest
// number of threads available to form a regular thread block with no
// trailing incomplete lines.
constexpr int in_effective_thread_num = NumThreads / TileSizeJ * TileSizeJ;
// Load the data from input memory to the shared memory tile.
if (x < tile_width) {
int y_end = min(y_start + kSubTileSize, tile_height);
for (int y = y_start; y < y_end; y++) {
shared_memory_tile[y][x] = maybe_conj<T, conjugate>::run(
input[input_flat_index + y * input_dims[2]]);
if (x < in_effective_thread_num) {
// Orient the logical thread block with respect to the input array.
// i.e. align the contiguous dimension of thread blocks with the contiguous
// dimension of the input array.
int ti = x / TileSizeJ;
int tj = x % TileSizeJ;
int input_index = input_origin_flat_index + ti * input_dims[2] + tj;
int input_increment = ReadRowPerPass * input_dims[2];
if (full_tile) {
#pragma unroll
for (int i_loc = ti; i_loc < (TileSizeI); i_loc += ReadRowPerPass) {
shared_memory_tile[i_loc][tj] =
maybe_conj<T, conjugate>::run(input[input_index]);
input_index += input_increment;
}
} else {
if (tj < tile_width) {
for (int i_loc = ti; i_loc < (tile_height); i_loc += ReadRowPerPass) {
shared_memory_tile[i_loc][tj] =
maybe_conj<T, conjugate>::run(input[input_index]);
input_index += input_increment;
}
}
}
}
__syncthreads();
Index<3> output_tile_index = {
input_tile_index[0],
input_tile_index[2],
input_tile_index[1],
input_tile_index[0], input_tile_index[2], input_tile_index[1],
};
Index<3> output_tile_origin = {
output_tile_index[0],
output_tile_index[1] * TileSize,
output_tile_index[2] * TileSize,
output_tile_index[0], output_tile_index[1] * TileSizeJ,
output_tile_index[2] * TileSizeI,
};
int output_origin_flat_index =
TensorIndexToFlat(output_tile_origin, output_dims);
int output_flat_index = output_origin_flat_index + x;
constexpr int out_effective_thread_num = NumThreads / TileSizeI * TileSizeI;
// Load the data from the shared memory tile to the output memory.
if (x < tile_height) {
int y_end = min(y_start + kSubTileSize, tile_width);
for (int y = y_start; y < y_end; y++) {
output[output_flat_index + y * output_dims[2]] = shared_memory_tile[x][y];
}
}
}
if (x < out_effective_thread_num) {
// Re-orient the logical thread block with respect to the output array.
// i.e. align the contiguous dimension of thread blocks with contiguous
// dimension of the output array.
int ti = x / TileSizeI;
int tj = x % TileSizeI;
int output_index = output_origin_flat_index + ti * output_dims[2] + tj;
int output_increment = WriteRowPerPass * output_dims[2];
// Use shared memory tiles to swap dimension-1 and dimension-2 of a 3D tensor
// when only one of the dimension sizes is smaller than 16,
// where dimensions are zero-based: output[i][j][k] = input[i][k][j].
//
// small_dim = the_smaller_dimension_size
// large_dim = the_larger_dimension_size
// tile_num_per_block = blockDim.x
// kTileLength = small_dim
//
// Each thread block operates on a single rectangle tile, where its width is
// kTileLength (we currently set it to 64) and its height is small_dim,
// We set the thread block's X dimension to be tile_num_per_block, and its Y
// and Z to be one.
template <typename T, int ShmemSize, bool SmallDim2, bool conjugate = false>
__global__ void SwapDimension1And2InTensor3SmallDim(const T* input,
int batch_per_block,
Dimension<3> input_dims,
T* output) {
// TODO(yangzihao) avoid share memory bank conflict.
__shared__ T shared_memory_tile[ShmemSize];
eigen_assert(blockDim.y == 1);
eigen_assert(blockDim.z == 1);
eigen_assert(gridDim.z == 1);
int block_offset = blockIdx.x * blockDim.x;
int x = threadIdx.x;
int tile_height = blockDim.x;
// Get tile height, width, and thread/block origin indices.
int small_dim = SmallDim2 ? input_dims[2] : input_dims[1];
int large_dim = SmallDim2 ? input_dims[1] : input_dims[2];
int global_offset = small_dim * large_dim * (blockIdx.y * batch_per_block) +
(SmallDim2 ? block_offset * small_dim : block_offset);
if (global_offset >= (input_dims[0] * input_dims[1] * input_dims[2])) return;
for (int batch = 0; batch < batch_per_block; ++batch) {
int block_origin_idx =
small_dim * large_dim * (blockIdx.y * batch_per_block + batch);
int thread_origin_idx =
block_origin_idx +
(SmallDim2 ? block_offset * small_dim : block_offset) + x;
if (block_offset + blockDim.x > large_dim) {
tile_height = large_dim - block_offset;
}
__syncthreads();
// Load a continuous memory region to shared memory tile.
if (x < tile_height) {
for (int y = 0; y < small_dim; y++) {
int shmem_index =
SmallDim2 ? (x + y * tile_height) : (x * small_dim + y);
shared_memory_tile[shmem_index] = maybe_conj<T, conjugate>::run(
ldg(input + thread_origin_idx +
y * (SmallDim2 ? tile_height : large_dim)));
if (full_tile) {
#pragma unroll
for (int i_loc = ti; i_loc < (TileSizeJ); i_loc += WriteRowPerPass) {
output[output_index] = shared_memory_tile[tj][i_loc];
output_index += output_increment;
}
}
__syncthreads();
// Get block origin index for output array.
int output_block_offset = block_origin_idx;
int output_block_idx = SmallDim2 ? block_offset : block_offset * small_dim;
int output_block_origin_idx = output_block_offset + output_block_idx;
// Store the transposed memory region in shared memory to device.
if (x < tile_height) {
for (int y = 0; y < small_dim; y++) {
int output_idx = output_block_origin_idx + x +
y * (SmallDim2 ? large_dim : tile_height);
int shmem_index =
SmallDim2 ? (x * small_dim + y) : (x + y * tile_height);
output[output_idx] = shared_memory_tile[shmem_index];
} else {
if (tj < tile_height) {
for (int i_loc = ti; i_loc < (tile_width); i_loc += WriteRowPerPass) {
output[output_index] = shared_memory_tile[tj][i_loc];
output_index += output_increment;
}
}
}
}
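The rewritten kernel above still computes the same permutation as its predecessor, output[i][j][k] = input[i][k][j]; only the tiling and thread mapping changed. A plain C++ reference for that contract, handy as a correctness oracle for the CUDA path (a hypothetical helper, not part of the commit):

```cpp
#include <vector>

// CPU reference for SwapDimension1And2InTensor3 on a row-major [d0, d1, d2]
// tensor: the output has shape [d0, d2, d1] and out[b][j][i] = in[b][i][j].
template <typename T>
std::vector<T> SwapDim1And2Ref(const std::vector<T>& in,
                               int d0, int d1, int d2) {
  std::vector<T> out(in.size());
  for (int b = 0; b < d0; ++b)
    for (int i = 0; i < d1; ++i)
      for (int j = 0; j < d2; ++j)
        out[(b * d2 + j) * d1 + i] = in[(b * d1 + i) * d2 + j];
  return out;
}
```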
@ -548,6 +502,380 @@ struct PadInput<GPUDevice, T, int, NDIMS> {
}
};
// We want std::equal_to and std::greater, but they're not constexpr until
// C++14.
struct EqualTo {
constexpr bool operator()(int a, int b) const { return a == b; }
};
struct GreaterThan {
constexpr bool operator()(int a, int b) const { return a > b; }
};
// For each data type, the tile size possibility frontier denotes the tile size
// combinations that consume the most computational resources constrained by
// - number of threads per SM limit,
// - limit on size of the short dimension (<=15) due to the definition of
// narrow matrix,
// - shared memory limit and
// - some experimentally determined, type-specific constraint on the product of
// two side lengths to increase grid-level parallelism.
//
// A tile size combination lies on the frontier if and only if one or more
// constraint mentioned above is hit. Tile size combinations lying outside this
// frontier are either not possible, or are slower than the alternatives.
//
// It is instrumental to consider, for each data type, two subsets of the
// corresponding frontier:
// - long side frontier: the union of the biggest tile size combination for
// each legal long side len.
// - non long side frontier: the frontier set minus the long side frontier.
//
// TileSizePossibilityFrontierCheck defines the frontier using only the long
// side frontier tile size combinations (since one can easily extrapolate
// the entire frontier from this subset). It serves as a utility function
// to help us determine where a tile size combination of interest lies with
// respect to the frontier.
template <typename Op>
constexpr bool TileSizePossibilityFrontierCheck(int TileLongSide,
int TileShortSide,
int size_of_t, Op op) {
// clang-format off
return (size_of_t == 16 && ((TileLongSide == 32 && op(TileShortSide, 4)) ||
(TileLongSide == 64 && op(TileShortSide, 4)) ||
(TileLongSide == 128 && op(TileShortSide, 4)) ||
(TileLongSide == 256 && op(TileShortSide, 2)))) ||
(size_of_t == 8 && ((TileLongSide == 32 && op(TileShortSide, 15)) ||
(TileLongSide == 64 && op(TileShortSide, 15)) ||
(TileLongSide == 128 && op(TileShortSide, 8)) ||
(TileLongSide == 256 && op(TileShortSide, 4)) ||
(TileLongSide == 512 && op(TileShortSide, 2)))) ||
(size_of_t == 4 && ((TileLongSide == 32 && op(TileShortSide, 15)) ||
(TileLongSide == 64 && op(TileShortSide, 15)) ||
(TileLongSide == 128 && op(TileShortSide, 15)) ||
(TileLongSide == 256 && op(TileShortSide, 8)) ||
(TileLongSide == 512 && op(TileShortSide, 4)) ||
(TileLongSide == 1024 && op(TileShortSide, 2)))) ||
(size_of_t == 2 && ((TileLongSide == 32 && op(TileShortSide, 15)) ||
(TileLongSide == 64 && op(TileShortSide, 15)) ||
(TileLongSide == 128 && op(TileShortSide, 15)) ||
(TileLongSide == 256 && op(TileShortSide, 8)) ||
(TileLongSide == 512 && op(TileShortSide, 4)) ||
(TileLongSide == 1024 && op(TileShortSide, 2)))) ||
(size_of_t == 1 && ((TileLongSide == 32 && op(TileShortSide, 15)) ||
(TileLongSide == 64 && op(TileShortSide, 15)) ||
(TileLongSide == 128 && op(TileShortSide, 15)) ||
(TileLongSide == 256 && op(TileShortSide, 8)) ||
(TileLongSide == 512 && op(TileShortSide, 4)) ||
(TileLongSide == 1024 && op(TileShortSide, 2))));
// clang-format on
}
constexpr bool TileSizeOnLongSideFrontier(int TileLongSide, int TileShortSide,
int size_of_t) {
return TileSizePossibilityFrontierCheck(TileLongSide, TileShortSide,
size_of_t, EqualTo());
}
constexpr bool TileSizeOutsideFrontier(int TileLongSide, int TileShortSide,
int size_of_t) {
return TileSizePossibilityFrontierCheck(TileLongSide, TileShortSide,
size_of_t, GreaterThan());
}
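To make the frontier concrete for 4-byte elements (e.g. float): per the table above, a long side of 256 pairs with a maximum short side of 8, so (256, 8) lies on the long-side frontier while (256, 9) falls outside it. Because the predicates are constexpr, this reading can be checked at compile time (illustrative assertions, not part of the commit):

```cpp
static_assert(TileSizeOnLongSideFrontier(256, 8, /*size_of_t=*/4),
              "(256, 8) is the largest 4-byte tile with long side 256");
static_assert(TileSizeOutsideFrontier(256, 9, /*size_of_t=*/4),
              "(256, 9) exceeds the 4-byte frontier");
```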
constexpr bool TileSizeOnNonLongSideFrontier(int TileLongSide,
int TileShortSide, int size_of_t) {
// For a tile size combination (longside, shortside), lying on the frontier
// implies that (longside, shortside) is on or within the frontier but
// (longside*2, shortside) or (longside, shortside+1) is not. With the above
// criterion, we simply need to use !TileSizeOnLongSideFrontier to ensure that
// it is not on the long side frontier.
return !TileSizeOutsideFrontier(TileLongSide, TileShortSide, size_of_t) &&
(TileSizeOutsideFrontier(TileLongSide * 2, TileShortSide, size_of_t) ||
TileSizeOutsideFrontier(TileLongSide, TileShortSide + 1,
size_of_t)) &&
!TileSizeOnLongSideFrontier(TileLongSide, TileShortSide, size_of_t);
}
// Helper function to launch a batch narrow matrix transpose kernel.
template <typename T, int TileLongSide, int TileShortSide>
void LaunchBatchNarrowMatrixTransposeKernel(
const GPUDevice& d, int tile_size_i, int tile_size_j, int total_tiles_count,
const T* input, const Dimension<3>& input_dims, T* output) {
constexpr int NumThreads = TileLongSide;
if (tile_size_i <= TileLongSide && tile_size_j <= TileShortSide) {
SwapDimension1And2InTensor3UsingTiles<T, NumThreads, TileLongSide,
TileShortSide>
<<<total_tiles_count, NumThreads, 0, d.stream()>>>(input, input_dims,
output);
} else {
SwapDimension1And2InTensor3UsingTiles<T, NumThreads, TileShortSide,
TileLongSide>
<<<total_tiles_count, NumThreads, 0, d.stream()>>>(input, input_dims,
output);
}
}
// Recursive template function to search, in a trial-and-error manner, for the
// minimum tile size configuration satisfying the requested tile side lengths.
// An important invariant of this search procedure is that for an unsatisfied
// request, we always try doubling the long side len first, and only after
// the request is satisfied for the long side len do we begin incrementing
// the short side len.
//
// We have three specializations of this search function depending on where the
// current tile size combination lies with respect to the frontier.
// - It lies within the frontier. If request is not satisfied, for the next tile
// size combination, we first try doubling the long side len and if that does
// not work, we then increment the short side len.
// - It lies on the non long side frontier. If the request is not satisfied, we
// can only increment the short side len.
// - It lies on the long side frontier. We launch the kernel without checking if
// the request is satisfied or not.
template <typename T, int TileLongSide, int TileShortSide,
typename dummy = void>
struct BatchNarrowMatrixTransposeDispatcher {
static void DoIt(const GPUDevice& d, int tile_size_i, int tile_size_j,
int total_tiles_count, const T* input,
const Dimension<3>& input_dims, T* output) {
static_assert(
(TileLongSide & (TileLongSide - 1)) == 0,
"The length of the longer side of the tile is always a power of 2.");
bool request_satisfied = max(tile_size_i, tile_size_j) <= TileLongSide &&
min(tile_size_i, tile_size_j) <= TileShortSide;
if (request_satisfied) {
LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide>(
d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
output);
return;
}
// If the execution reaches here, then the kernel was not launched; we then
// determine whether it is the long side or the short side that falls short
// of the request and increase that parameter accordingly.
const bool long_side_request_not_satisfied =
max(tile_size_i, tile_size_j) > TileLongSide;
if (long_side_request_not_satisfied) {
BatchNarrowMatrixTransposeDispatcher<
T, TileLongSide * 2, TileShortSide>::DoIt(d, tile_size_i, tile_size_j,
total_tiles_count, input,
input_dims, output);
} else {
BatchNarrowMatrixTransposeDispatcher<
T, TileLongSide, TileShortSide + 1>::DoIt(d, tile_size_i, tile_size_j,
total_tiles_count, input,
input_dims, output);
}
}
};
template <typename T, int TileLongSide, int TileShortSide>
struct BatchNarrowMatrixTransposeDispatcher<
T, TileLongSide, TileShortSide,
typename std::enable_if<TileSizeOnNonLongSideFrontier(
TileLongSide, TileShortSide, sizeof(T)),
void>::type> {
static void DoIt(const GPUDevice& d, int tile_size_i, int tile_size_j,
int total_tiles_count, const T* input,
const Dimension<3>& input_dims, T* output) {
static_assert(
(TileLongSide & (TileLongSide - 1)) == 0,
"The length of the longer side of the tile is always a power of 2.");
bool request_satisfied = max(tile_size_i, tile_size_j) <= TileLongSide &&
min(tile_size_i, tile_size_j) <= TileShortSide;
if (request_satisfied) {
LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide>(
d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
output);
return;
}
// If the execution reaches here, then the kernel was not launched; since
// we are on the non long side frontier, we increment the short dimension
// and try again.
BatchNarrowMatrixTransposeDispatcher<
T, TileLongSide, TileShortSide + 1>::DoIt(d, tile_size_i, tile_size_j,
total_tiles_count, input,
input_dims, output);
}
};
template <typename T, int TileLongSide, int TileShortSide>
struct BatchNarrowMatrixTransposeDispatcher<
T, TileLongSide, TileShortSide,
typename std::enable_if<TileSizeOnLongSideFrontier(
TileLongSide, TileShortSide, sizeof(T)),
void>::type> {
static void DoIt(const GPUDevice& d, int tile_size_i, int tile_size_j,
int total_tiles_count, const T* input,
const Dimension<3>& input_dims, T* output) {
static_assert(
(TileLongSide & (TileLongSide - 1)) == 0,
"The length of the longer side of the tile is always a power of 2.");
LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide>(
d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
output);
}
};
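Taken together, the three specializations perform a compile-time search over
tile size combinations. Below is a minimal runtime analogue of that search
order, as a hedged sketch only: the real dispatch recurses over template
parameters, and the (1024, 15) bounds are assumptions mirroring the constants
in GetTileSizesFrontier below.

```
#include <algorithm>
#include <utility>

// Walks the search order described above: starting from tile (32, 2),
// double the long side until it covers the request, then grow the short
// side; at the frontier corner, stop and launch with what we have.
std::pair<int, int> SearchTileSizeSketch(int tile_size_i, int tile_size_j) {
  int long_side = 32;
  int short_side = 2;
  const int kMaxLongSide = 1024;  // assumed frontier bound
  const int kMaxShortSide = 15;   // assumed frontier bound
  while (!(std::max(tile_size_i, tile_size_j) <= long_side &&
           std::min(tile_size_i, tile_size_j) <= short_side)) {
    if (std::max(tile_size_i, tile_size_j) > long_side &&
        long_side < kMaxLongSide) {
      long_side *= 2;  // within the frontier: try the long side first
    } else if (short_side < kMaxShortSide) {
      ++short_side;    // non-long-side frontier: grow the short side
    } else {
      break;           // long-side frontier: launch unconditionally
    }
  }
  return {long_side, short_side};
}
```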
// This function tries to recover, in a brute force way, the frontier defined in
// TileSizePossibilityFrontierCheck as a vector of tile size combinations lying
// on the long side frontier. This vector is sufficient to determine the entire
// frontier.
//
// Note that if one changes the frontier definition in
// TileSizePossibilityFrontierCheck and forgets to ensure that the largest
// legal long side len has a corresponding short side len of 2, this function
// will fail and crash the program.
template <int SizeOfT>
const std::vector<std::pair<int, int>>& GetTileSizesFrontier() {
static_assert(
SizeOfT <= 16,
"Currently, only data types of sizes 16 bytes or less are supported.");
static_assert((SizeOfT & (SizeOfT - 1)) == 0,
"Data types must have sizes that are powers of 2.");
// Expensive work to populate sizes, lazily run in a thread-safe
// manner the first time GetTileSizesFrontier<N> is called.
static auto* frontier = [] {
auto* frontier = new std::vector<std::pair<int, int>>();
const int kMaxLongSideLen = 1024;
const int kMaxShortSideLen = 15;
for (int long_side = 32; long_side <= kMaxLongSideLen; long_side *= 2) {
for (int short_side = 2; short_side <= kMaxShortSideLen;
short_side += 1) {
if (TileSizeOnLongSideFrontier(long_side, short_side, SizeOfT)) {
// The current combination lies on the frontier, thus we
// add it to the frontier definition.
frontier->push_back(std::make_pair(long_side, short_side));
// The long side length is the largest one allowed iff its
// corresponding short side length is 2.
if (short_side == 2) return frontier;
// We have exhausted all the possibilities in the frontier
// with the given long side length.
break;
}
}
}
LOG(FATAL)
<< "The corresponding short side length of the largest long side "
"length has to be 2.";
}();
return *frontier;
}
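The `static auto* frontier = [] { ... }();` initializer above relies on the
C++11 guarantee that a function-local static is constructed exactly once, even
when several threads make the first call concurrently. A standalone sketch of
the same idiom:

```
#include <vector>

// The lambda runs once, on first call, under the compiler-provided
// initialization guard; every later call returns the cached vector.
const std::vector<int>& GetSquares() {
  static auto* squares = [] {
    auto* v = new std::vector<int>();
    for (int i = 1; i <= 10; ++i) v->push_back(i * i);
    return v;
  }();
  return *squares;
}
```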
// Helper structs that determine which data type to use, given the size of
// the matrix element type. A transpose of elements of size N will use a
// kernel which operates on an array of TransposeElemType<N>::type.
template <int ElemBytes>
struct TransposeElemType;
template <>
struct TransposeElemType<1> {
using type = uint8;
};
template <>
struct TransposeElemType<2> {
using type = uint16;
};
template <>
struct TransposeElemType<4> {
using type = uint32;
};
template <>
struct TransposeElemType<8> {
using type = uint64;
};
template <>
struct TransposeElemType<16> {
using type = float4;
};
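For example (a hedged sketch that assumes the specializations above are in
scope; `uint64` is TensorFlow's alias for a 64-bit unsigned integer), a
transpose of 8-byte elements such as `double` operates on `uint64` values of
identical size:

```
// Pick the bit-compatible unsigned type for an 8-byte element.
using Elem = typename TransposeElemType<sizeof(double)>::type;  // uint64
static_assert(sizeof(Elem) == sizeof(double),
              "transpose element type must match the data size");
```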
// A helper function to make RunSwapDimension1And2InTensor3 concise. This
// helper function looks at the data type and input matrix sizes and decides
// the thread numbers and tile sizes to use.
template <typename T, bool conjugate = false>
void SwapDimension1And2InTensor3WithNarrowMatrices(
const GPUDevice& d, const T* input, const Dimension<3>& input_dims,
T* output, const int kMinDimensionToUseTiles) {
// Get available tile sizes here for the data type requested:
const auto& tile_spec = GetTileSizesFrontier<sizeof(T)>();
int tile_long_side_len = 0;
int tile_short_side_len = 0;
float lowest_cost = std::numeric_limits<float>::max();
int data_long_side = max(input_dims[1], input_dims[2]);
for (auto tile_size_pair : tile_spec) {
int proposed_tile_long_side_len = tile_size_pair.first;
// Number of threads that will not be doing anything useful when reading
// the matrix because the thread block size is bigger than the data block
// size.
int num_wasted_threads =
data_long_side - MathUtil::FloorOfRatio<int>(
data_long_side, proposed_tile_long_side_len) *
proposed_tile_long_side_len;
int num_full_tiles = MathUtil::FloorOfRatio<int>(
data_long_side, proposed_tile_long_side_len);
float cost = 0;
    // However, if we can execute two or more full tiles, then we gladly
    // accept any number of wasted threads and ignore their cost.
if (num_full_tiles <= 1) cost = num_wasted_threads;
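    // Worked example (illustrative): with data_long_side = 100, a proposed
    // long side of 64 gives floor(100 / 64) = 1 full tile and a cost of
    // 100 - 1 * 64 = 36, while a proposed long side of 32 gives 3 full
    // tiles, so its wasted threads are ignored and the cost stays 0.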
// Using less than or equal to here because given the same cost, we
// would like to launch as many threads as possible.
if (cost <= lowest_cost) {
tile_long_side_len = proposed_tile_long_side_len;
tile_short_side_len = tile_size_pair.second;
lowest_cost = cost;
}
}
// Request tile sizes such that the longer side of threadblock aligns with
// the longer side of input data block to maximize read throughput.
// The ideal tile shape is one where the length of the shorter side of the
// tile is equal to the length of the shorter side of the input matrix.
int requested_tile_size_i = input_dims[1] >= kMinDimensionToUseTiles
? tile_long_side_len
: input_dims[1];
int requested_tile_size_j = input_dims[1] >= kMinDimensionToUseTiles
? input_dims[2]
: tile_long_side_len;
// Truncate the shorter size requested according to the manual limit set in
// tile_spec to make sure that we do not launch configurations violating
// hardware limits.
requested_tile_size_i = requested_tile_size_i == tile_long_side_len
? tile_long_side_len
: min(requested_tile_size_i, tile_short_side_len);
requested_tile_size_j = requested_tile_size_j == tile_long_side_len
? tile_long_side_len
: min(requested_tile_size_j, tile_short_side_len);
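  // Worked example (illustrative): for a 300 x 5 input with
  // tile_long_side_len = 128 and tile_short_side_len = 4, we request
  // (128, 5) above and then truncate the short side to min(5, 4) = 4,
  // yielding a (128, 4) tile request.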
Dimension<3> input_dims_in_tiles = {
input_dims[0],
MathUtil::CeilOfRatio<int>(input_dims[1], requested_tile_size_i),
MathUtil::CeilOfRatio<int>(input_dims[2], requested_tile_size_j),
};
int total_tiles_count =
input_dims_in_tiles[0] * input_dims_in_tiles[1] * input_dims_in_tiles[2];
using ElemType = typename TransposeElemType<sizeof(T)>::type;
static_assert(alignof(T) >= alignof(ElemType), "Unexpected data alignment.");
BatchNarrowMatrixTransposeDispatcher<ElemType, 32, 2>::DoIt(
d, requested_tile_size_i, requested_tile_size_j, total_tiles_count,
reinterpret_cast<const ElemType*>(input), input_dims,
reinterpret_cast<ElemType*>(output));
}
// Launch the GPU kernel that would swap dimension-1 and dimension-2 in a
// 3D tensor. It looks at the shape of the incoming data, and decides the best
// strategy to launch.
@ -558,60 +886,33 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input,
// If one dimension is trivial, use SmallDim kernel for swapping.
// Otherwise, the trivial swapping relying on the ldg cache is more efficient.
static const int kMinDimensionToUseTiles = 16;
bool use_tiles = (input_dims[1] >= kMinDimensionToUseTiles &&
input_dims[2] >= kMinDimensionToUseTiles);
bool use_small_dim = ((input_dims[1] >= kMinDimensionToUseTiles &&
input_dims[2] < kMinDimensionToUseTiles)) ||
((input_dims[1] < kMinDimensionToUseTiles &&
input_dims[2] >= kMinDimensionToUseTiles));
static const int NumSubTiles = 8;
static const int kMinDimensionToUseRectTiles = 96;
if (use_tiles) {
static const int TileSize = 32;
Dimension<3> input_dims_in_tiles = {
input_dims[0],
(input_dims[1] + TileSize - 1) / TileSize,
(input_dims[2] + TileSize - 1) / TileSize,
};
int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] *
input_dims_in_tiles[2];
// We get best performance when TileSize is the number of threads in a warp
bool large_matrix = input_dims[1] >= kMinDimensionToUseTiles &&
input_dims[2] >= kMinDimensionToUseTiles;
bool narrow_matrix = input_dims[1] >= kMinDimensionToUseRectTiles ||
input_dims[2] >= kMinDimensionToUseRectTiles;
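  // For example (illustrative): a 64 x 64 slice takes the large-matrix path,
  // a 256 x 4 slice takes the narrow-matrix path, and an 8 x 8 slice falls
  // through to the element-wise kernel below.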
if (large_matrix) {
// We get best performance when kTileSize is the number of threads in a warp
// (32 on our GPUs) and NumSubTiles is 8, so our block size is 8 * 32 = 256
// threads.
SwapDimension1And2InTensor3UsingTiles<T, TileSize, NumSubTiles, conjugate>
<<<total_tiles_count, dim3(TileSize, NumSubTiles), 0, d.stream()>>>(
input, input_dims, output);
} else if (use_small_dim) {
// When only one of the dimensions is smaller than kMinDimensionToUseTiles,
// we use one block to process a rectangle region with the size of
// kTileLength * small_dim. We found that when set kTileLength to 64 on
// TitanX Maxwell GPU, it achieves the best performance.
// large_dim
// +---------------...--------+
// | | | |
// small_dim | | ... | |
// | | | |
// +--------------...---------+
// \----- ------/ \- -/
// V V
// kTileLength(tile_height) tile_height
static const int kTileLength = 64;
static const int kGridDimY = 65535;
int large_dim = std::max(input_dims[2], input_dims[1]);
int tile_num_per_block = (large_dim + kTileLength - 1) / kTileLength;
int grid_dim_y = std::min(input_dims[0], kGridDimY);
int batch_per_block = (input_dims[0] + grid_dim_y - 1) / grid_dim_y;
if (input_dims[2] < input_dims[1]) {
SwapDimension1And2InTensor3SmallDim<
T, kTileLength * kMinDimensionToUseTiles, true, conjugate>
<<<dim3(tile_num_per_block, grid_dim_y), kTileLength, 0,
d.stream()>>>(input, batch_per_block, input_dims, output);
} else {
SwapDimension1And2InTensor3SmallDim<
T, kTileLength * kMinDimensionToUseTiles, false, conjugate>
<<<dim3(tile_num_per_block, grid_dim_y), kTileLength, 0,
d.stream()>>>(input, batch_per_block, input_dims, output);
}
constexpr int kTileSize = 32;
constexpr int kNumThreads = 256;
Dimension<3> input_dims_in_tiles = {
input_dims[0], MathUtil::CeilOfRatio<int>(input_dims[1], kTileSize),
MathUtil::CeilOfRatio<int>(input_dims[2], kTileSize),
};
int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] *
input_dims_in_tiles[2];
    SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize, kTileSize,
                                          conjugate>
        <<<total_tiles_count, kNumThreads, 0, d.stream()>>>(input, input_dims,
                                                            output);
} else if (narrow_matrix) {
    SwapDimension1And2InTensor3WithNarrowMatrices<T, conjugate>(
        d, input, input_dims, output, kMinDimensionToUseTiles);
} else {
int total_element_count = input_dims[0] * input_dims[1] * input_dims[2];
CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d);

View File

@ -112,7 +112,7 @@ class GraphDefBuilderWrapper {
Status AddTensor(const Tensor& val, Node** output) {
AddTensorInternal(val, output);
if (*output == nullptr) {
return errors::Internal("AddTesor: Failed to build Const op.");
return errors::Internal("AddTensor: Failed to build Const op.");
}
return Status::OK();
}

View File

@ -19,6 +19,7 @@ limitations under the License.
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/framework/variant_encode_decode.h"
@ -74,6 +75,7 @@ DEFINE_SETZERO_SYCL(int32);
DEFINE_SETZERO_SYCL(int64);
#undef DEFINE_SETZERO_SYCL
#endif // TENSORFLOW_USE_SYCL
template <typename T>
void SetOneFunctor<Eigen::ThreadPoolDevice, T>::operator()(
const Eigen::ThreadPoolDevice& d, typename TTypes<T>::Flat out) {
@ -112,5 +114,47 @@ DEFINE_SETONE_SYCL(double);
#undef DEFINE_SETONE_SYCL
#endif // TENSORFLOW_USE_SYCL
template <typename T>
struct FillFunctor<Eigen::ThreadPoolDevice, T> {
void operator()(const Eigen::ThreadPoolDevice& d, typename TTypes<T>::Flat out,
typename TTypes<T>::ConstScalar in) {
out.device(d) = out.constant(in());
}
};
// Explicit instantiations.
#define DEFINE_FILL_CPU(T) \
template struct FillFunctor<Eigen::ThreadPoolDevice, T>;
TF_CALL_ALL_TYPES(DEFINE_FILL_CPU);
DEFINE_FILL_CPU(quint8);
DEFINE_FILL_CPU(quint16);
#undef DEFINE_FILL_CPU
#ifdef TENSORFLOW_USE_SYCL
template <typename T>
struct FillFunctor<Eigen::SyclDevice, T> {
void operator()(const Eigen::SyclDevice& d, typename TTypes<T>::Flat out,
typename TTypes<T>::ConstScalar in) {
#if !defined(EIGEN_HAS_INDEX_LIST)
Eigen::array<int, 1> rank1{1};
#else
Eigen::IndexList<Eigen::type2index<1> > rank1;
#endif
const int size = out.dimension(0);
Eigen::array<int, 1> broadcast_dims{size};
To32Bit(out).device(d) = in.reshape(rank1).broadcast(broadcast_dims);
}
};
#define DEFINE_FILL_SYCL(T) \
template struct FillFunctor<Eigen::SyclDevice, T>;
DEFINE_FILL_SYCL(float);
DEFINE_FILL_SYCL(double);
TF_CALL_INTEGRAL_TYPES(DEFINE_FILL_SYCL)
#undef DEFINE_FILL_SYCL
#endif // TENSORFLOW_USE_SYCL
} // namespace functor
} // namespace tensorflow

View File

@ -0,0 +1,115 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/fill_functor.h"
#include "tensorflow/core/platform/types.h"
namespace Eigen {
namespace internal {
template <typename T>
struct scalar_const_op {
typedef typename packet_traits<T>::type Packet;
const T* val;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
scalar_const_op(const scalar_const_op& x)
: val(x.val) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_const_op(const T* v) : val(v) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()() const {
return *val;
}
template <typename PacketType = Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetOp() const {
return internal::pset1<PacketType>(*val);
}
};
template <typename T>
struct functor_traits<scalar_const_op<T> > {
enum {
Cost = 1,
PacketAccess = packet_traits<T>::Vectorizable,
IsRepeatable = true
};
};
} // end namespace internal
} // end namespace Eigen
namespace tensorflow {
namespace functor {
typedef Eigen::GpuDevice GPUDevice;
// Partial specialization FillFunctor<Device=GPUDevice, T>
template <typename T>
struct FillFunctor<GPUDevice, T> {
void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
typename TTypes<T>::ConstScalar in) {
Eigen::internal::scalar_const_op<T> f(in.data());
To32Bit(out).device(d) = To32Bit(out).nullaryExpr(f);
}
};
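A CPU-side sketch of the same fill-via-nullaryExpr pattern (hedged: plain
Eigen Tensor, no device involved; `ConstOf` is a hypothetical stand-in for
`scalar_const_op`, which captures the scalar by value instead of by pointer):

```
#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

// A nullary functor returning a captured scalar; the nullary expression
// invokes it once per output coefficient.
struct ConstOf {
  float val;
  float operator()() const { return val; }
};

int main() {
  Eigen::Tensor<float, 1> out(8);
  out = out.nullaryExpr(ConstOf{3.5f});
  std::cout << out << std::endl;  // eight coefficients, all 3.5
}
```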
#define DEFINE_FILL_GPU(T) template struct FillFunctor<GPUDevice, T>;
TF_CALL_REAL_NUMBER_TYPES(DEFINE_FILL_GPU);
TF_CALL_bfloat16(DEFINE_FILL_GPU);
TF_CALL_bool(DEFINE_FILL_GPU);
#undef DEFINE_FILL_GPU
// Partial specialization of FillFunctor<Device=GPUDevice, T>.
template <typename T>
struct SetZeroFunctor<GPUDevice, T> {
void operator()(const GPUDevice& d, typename TTypes<T>::Flat out) {
To32Bit(out).device(d) = To32Bit(out).constant(T(0));
}
};
#define DEFINE_SETZERO_GPU(T) template struct SetZeroFunctor<GPUDevice, T>;
TF_CALL_NUMBER_TYPES(DEFINE_SETZERO_GPU);
TF_CALL_bfloat16(DEFINE_SETZERO_GPU);
TF_CALL_bool(DEFINE_SETZERO_GPU);
#undef DEFINE_SETZERO_GPU
// Partial specialization of FillFunctor<Device=GPUDevice, T>.
template <typename T>
struct SetOneFunctor<GPUDevice, T> {
void operator()(const GPUDevice& d, typename TTypes<T>::Flat out) {
To32Bit(out).device(d) = To32Bit(out).constant(T(1));
}
};
#define DEFINE_SETONE_GPU(T) template struct SetOneFunctor<GPUDevice, T>;
TF_CALL_NUMBER_TYPES(DEFINE_SETONE_GPU);
TF_CALL_bfloat16(DEFINE_SETONE_GPU);
TF_CALL_bool(DEFINE_SETONE_GPU);
#undef DEFINE_SETONE_GPU
} // end namespace functor
} // end namespace tensorflow
#endif // GOOGLE_CUDA

View File

@ -27,6 +27,7 @@ limitations under the License.
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/fill_functor.h"
#include "tensorflow/core/kernels/fused_batch_norm_op.h"
#include "tensorflow/core/util/tensor_format.h"
@ -239,6 +240,14 @@ struct FusedBatchNorm<GPUDevice, T, U> {
<< " offset shape: " << offset.shape().DebugString()
<< " tensor format: " << tensor_format;
// If input is empty, return NaN mean/variance
if (x.shape().num_elements() == 0) {
functor::SetNanFunctor<U> f;
f(context->eigen_device<GPUDevice>(), batch_mean->flat<U>());
f(context->eigen_device<GPUDevice>(), batch_var->flat<U>());
return;
}
Tensor x_maybe_transformed = x;
Tensor x_transformed;
Tensor y_transformed;
@ -656,6 +665,14 @@ class FusedBatchNormGradOp : public OpKernel {
context, context->allocate_output(4, TensorShape({}), &placeholder_2));
FillZeros<Device>(placeholder_2);
// If input is empty, set gradients w.r.t scale/offset to zero.
if (x.shape().num_elements() == 0) {
functor::SetZeroFunctor<Device, U> f;
f(context->eigen_device<Device>(), scale_backprop->flat<U>());
f(context->eigen_device<Device>(), offset_backprop->flat<U>());
return;
}
if (is_training_) {
functor::FusedBatchNormGrad<Device, T, U>()(
context, y_backprop, x, scale, saved_mean_or_pop_mean,

View File

@ -65,8 +65,15 @@ void InvVarianceToVariance<T>::operator()(const Eigen::GpuDevice& d,
epsilon, sample_size, variance);
}
template <class T>
void SetNanFunctor<T>::operator()(const Eigen::GpuDevice& d,
typename TTypes<T>::Flat out) {
To32Bit(out).device(d) = To32Bit(out).constant(Eigen::NumTraits<T>::quiet_NaN());
}
template class VarianceToInvVariance<float>;
template class InvVarianceToVariance<float>;
template class SetNanFunctor<float>;
} // namespace functor
} // namespace tensorflow

View File

@ -49,6 +49,12 @@ struct InvVarianceToVariance {
int channels, T* variance);
};
// This function sets a GPU tensor to NaNs.
template <class T>
struct SetNanFunctor {
void operator()(const Eigen::GpuDevice& d, typename TTypes<T>::Flat out);
};
#endif // GOOGLE_CUDA
// Functor used by FusedBatchNormGradOp to do the computations when

View File

@ -210,7 +210,7 @@ class MatrixInverseOpGpu : public AsyncOpKernel {
done);
}
} else {
// For large matrices, we wompute the inverse of each matrix in the batch
// For large matrices, we compute the inverse of each matrix in the batch
// sequentially. Here we use the cuSolver methods GETRF/GETRS because they
// are MUCH faster than their batched cuBlas equivalents for large
// matrices.

View File

@ -147,6 +147,9 @@ void DnnPoolingOp<T>::Compute(
Tensor* tensor_out = nullptr;
OP_REQUIRES_OK(context,
context->allocate_output(0, tensor_out_shape, &tensor_out));
if (tensor_in.shape().num_elements() == 0) {
return;
}
PoolParameters params{context, size, stride,
padding, data_format, tensor_in.shape()};
@ -247,6 +250,9 @@ void DnnPoolingGradOp<T>::Compute(
Tensor* input_backprop = nullptr;
OP_REQUIRES_OK(context,
context->allocate_output(0, tensor_in_shape, &input_backprop));
if (tensor_in_shape.num_elements() == 0) {
return;
}
PoolParameters params{context, size, stride,
padding, data_format, tensor_in_shape};

View File

@ -38,6 +38,7 @@ class RecordInputOp : public OpKernel {
GETATTR(int64, batch_size);
GETATTR(string, compression_type);
#undef GETATTR
OP_REQUIRES_OK(ctx, ctx->GetAttr("compression_type", &compression_type));
RecordYielder::Options yopts;
yopts.file_pattern = file_pattern;

View File

@ -131,7 +131,7 @@ class WhereCPUOp : public OpKernel {
OP_REQUIRES(
context, input.dtype() != DT_HALF,
errors::Unimplemented("No WhereOp available for float16/half type on "
"GPU; dying in CPU WhereOp to avoid silently "
"CPU; dying in CPU WhereOp to avoid silently "
"creating costly copies from device."));
const int input_dims = input.dims();

View File

@ -38,7 +38,7 @@ computation is performed on the underlying representation of x.
.Output("z: T") \
.SetIsCommutative() \
.Attr("T: {int8, int16, int32, int64, uint8, uint16, uint32, uint64}") \
.SetShapeFn(shape_inference::UnchangedShape)
.SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
REGISTER_OP("PopulationCount")
.Input("x: T")

View File

@ -114,6 +114,8 @@ int64 LogLevelStrToInt(const char* tf_env_var_val) {
return level;
}
} // namespace
int64 MinLogLevelFromEnv() {
const char* tf_env_var_val = getenv("TF_CPP_MIN_LOG_LEVEL");
return LogLevelStrToInt(tf_env_var_val);
@ -124,8 +126,6 @@ int64 MinVLogLevelFromEnv() {
return LogLevelStrToInt(tf_env_var_val);
}
} // namespace
LogMessage::~LogMessage() {
// Read the min log level once during the first call to logging.
static int64 min_log_level = MinLogLevelFromEnv();

View File

@ -305,6 +305,10 @@ T&& CheckNotNull(const char* file, int line, const char* exprtext, T&& t) {
return std::forward<T>(t);
}
int64 MinLogLevelFromEnv();
int64 MinVLogLevelFromEnv();
} // namespace internal
} // namespace tensorflow

View File

@ -28,6 +28,8 @@ filegroup(
tf_cc_binary(
name = "s3_file_system.so",
srcs = [
"aws_logging.cc",
"aws_logging.h",
"s3_crypto.cc",
"s3_crypto.h",
"s3_file_system.cc",
@ -66,6 +68,22 @@ cc_library(
alwayslink = 1,
)
cc_library(
name = "aws_logging",
srcs = [
"aws_logging.cc",
],
hdrs = [
"aws_logging.h",
],
deps = [
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"@aws//:aws",
],
alwayslink = 1,
)
cc_library(
name = "s3_file_system",
srcs = [
@ -75,6 +93,7 @@ cc_library(
"s3_file_system.h",
],
deps = [
":aws_logging",
":s3_crypto",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",

View File

@ -0,0 +1,121 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/platform/s3/aws_logging.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mutex.h"
#include <aws/core/Aws.h>
#include <aws/core/utils/logging/AWSLogging.h>
#include <aws/core/utils/logging/LogSystemInterface.h>
#include <cstdarg>
namespace tensorflow {
AWSLogSystem::AWSLogSystem(Aws::Utils::Logging::LogLevel log_level)
: log_level_(log_level) {}
void AWSLogSystem::Log(Aws::Utils::Logging::LogLevel log_level, const char* tag,
const char* format, ...) {
std::va_list args;
va_start(args, format);
  // A va_list must go through the va_list-taking variant; the variadic
  // strings::Printf would format the pointer itself.
  string s;
  strings::Appendv(&s, format, args);
va_end(args);
LogMessage(log_level, s);
}
void AWSLogSystem::LogStream(Aws::Utils::Logging::LogLevel log_level,
const char* tag,
const Aws::OStringStream& message_stream) {
LogMessage(log_level, message_stream.rdbuf()->str().c_str());
}
void AWSLogSystem::LogMessage(Aws::Utils::Logging::LogLevel log_level,
const std::string& message) {
switch (log_level) {
case Aws::Utils::Logging::LogLevel::Info:
LOG(INFO) << message;
break;
case Aws::Utils::Logging::LogLevel::Warn:
LOG(WARNING) << message;
break;
case Aws::Utils::Logging::LogLevel::Error:
LOG(ERROR) << message;
break;
case Aws::Utils::Logging::LogLevel::Fatal:
LOG(FATAL) << message;
break;
default:
LOG(ERROR) << message;
break;
}
}
namespace {
static const char* kAWSLoggingTag = "AWSLogging";
Aws::Utils::Logging::LogLevel ParseLogLevelFromEnv() {
Aws::Utils::Logging::LogLevel log_level = Aws::Utils::Logging::LogLevel::Info;
const int64_t level = tensorflow::internal::MinLogLevelFromEnv();
switch (level) {
case INFO:
log_level = Aws::Utils::Logging::LogLevel::Info;
break;
case WARNING:
log_level = Aws::Utils::Logging::LogLevel::Warn;
break;
case ERROR:
log_level = Aws::Utils::Logging::LogLevel::Error;
break;
case FATAL:
log_level = Aws::Utils::Logging::LogLevel::Fatal;
break;
default:
log_level = Aws::Utils::Logging::LogLevel::Info;
break;
}
return log_level;
}
}  // namespace
static bool initialized = false;
static mutex s3_logging_mutex(LINKER_INITIALIZED);
void AWSLogSystem::InitializeAWSLogging() {
std::lock_guard<mutex> s3_logging_lock(s3_logging_mutex);
if (!initialized) {
Aws::Utils::Logging::InitializeAWSLogging(
Aws::MakeShared<AWSLogSystem>(kAWSLoggingTag, ParseLogLevelFromEnv()));
initialized = true;
return;
}
}
void AWSLogSystem::ShutdownAWSLogging() {
std::lock_guard<mutex> s3_logging_lock(s3_logging_mutex);
if (initialized) {
Aws::Utils::Logging::ShutdownAWSLogging();
initialized = false;
return;
}
}
} // namespace tensorflow

View File

@ -0,0 +1,68 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_S3_S3_LOGGING_H_
#define TENSORFLOW_CONTRIB_S3_S3_LOGGING_H_
#include <atomic>
#include <string>
#include <aws/core/utils/logging/LogLevel.h>
#include <aws/core/utils/logging/LogSystemInterface.h>
#include "tensorflow/core/platform/default/logging.h"
namespace tensorflow {
class AWSLogSystem : public Aws::Utils::Logging::LogSystemInterface {
public:
static void InitializeAWSLogging();
static void ShutdownAWSLogging();
explicit AWSLogSystem(Aws::Utils::Logging::LogLevel log_level);
virtual ~AWSLogSystem() = default;
// Gets the currently configured log level.
virtual Aws::Utils::Logging::LogLevel GetLogLevel(void) const override {
return log_level_;
}
// Set a new log level. This has the immediate effect of changing the log.
void SetLogLevel(Aws::Utils::Logging::LogLevel log_level) {
log_level_.store(log_level);
}
// Does a printf style output to ProcessFormattedStatement. Don't use this,
// it's unsafe. See LogStream.
// Since non-static C++ methods have an implicit this argument,
// TF_PRINTF_ATTRIBUTE should be counted from two (vs. one).
virtual void Log(Aws::Utils::Logging::LogLevel log_level, const char* tag,
const char* format, ...) override TF_PRINTF_ATTRIBUTE(4, 5);
// Writes the stream to ProcessFormattedStatement.
virtual void LogStream(Aws::Utils::Logging::LogLevel log_level,
const char* tag,
const Aws::OStringStream& messageStream) override;
private:
void LogMessage(Aws::Utils::Logging::LogLevel log_level,
const string& message);
std::atomic<Aws::Utils::Logging::LogLevel> log_level_;
TF_DISALLOW_COPY_AND_ASSIGN(AWSLogSystem);
};
} // namespace tensorflow
#endif // TENSORFLOW_CONTRIB_S3_S3_LOGGING_H_

View File

@ -15,10 +15,13 @@ limitations under the License.
#include "tensorflow/core/platform/s3/s3_file_system.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/s3/aws_logging.h"
#include "tensorflow/core/platform/s3/s3_crypto.h"
#include <aws/core/Aws.h>
#include <aws/core/utils/FileSystemUtils.h>
#include <aws/core/utils/logging/AWSLogging.h>
#include <aws/core/utils/logging/LogSystemInterface.h>
#include <aws/s3/S3Client.h>
#include <aws/s3/S3Errors.h>
#include <aws/s3/model/CopyObjectRequest.h>
@ -33,6 +36,7 @@ limitations under the License.
namespace tensorflow {
namespace {
static const char* kS3FileSystemAllocationTag = "S3FileSystemAllocation";
static const size_t kS3ReadAppendableFileBufferSize = 1024 * 1024;
static const int kS3GetChildrenMaxKeys = 100;
@ -226,7 +230,11 @@ class S3ReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
uint64 length_;
};
} // namespace
S3FileSystem::S3FileSystem() {
AWSLogSystem::InitializeAWSLogging();
Aws::SDKOptions options;
options.cryptoOptions.sha256Factory_create_fn = []() {
return Aws::MakeShared<S3SHA256Factory>(S3CryptoAllocationTag);
@ -240,6 +248,8 @@ S3FileSystem::S3FileSystem() {
S3FileSystem::~S3FileSystem() {
Aws::SDKOptions options;
Aws::ShutdownAPI(options);
AWSLogSystem::ShutdownAWSLogging();
}
Status S3FileSystem::NewRandomAccessFile(

View File

@ -232,10 +232,10 @@ data and target values for the training set, respectively, and `test_set.data`
and `test_set.target` contain feature data and target values for the test set.
Later on, in
["Fit the DNNClassifier to the Iris Training Data,"](#fit-dnnclassifier)
["Fit the DNNClassifier to the Iris Training Data,"](#fit_the_dnnclassifier_to_the_iris_training_data)
you'll use `training_set.data` and
`training_set.target` to train your model, and in
["Evaluate Model Accuracy,"](#evaluate-accuracy) you'll use `test_set.data` and
["Evaluate Model Accuracy,"](#evaluate_model_accuracy) you'll use `test_set.data` and
`test_set.target`. But first, you'll construct your model in the next section.
## Construct a Deep Neural Network Classifier

View File

@ -177,7 +177,7 @@ dataset = dataset.batch(batch_size=FLAGS.batch_size)
to:
```
dataset = dataset.apply(tf.data.contrib.map_and_batch(
dataset = dataset.apply(tf.contrib.data.map_and_batch(
map_func=parse_fn, batch_size=FLAGS.batch_size))
```

View File

@ -300,7 +300,7 @@ functions, methods, and properties. We also adhere to the
[Google Python style guide](https://google.github.io/styleguide/pyguide.html).
The TensorFlow C++ code base adheres to the
[Google C++ style guide](http://google.github.io/styleguide/cppguide.html).
[Google C++ style guide](https://google.github.io/styleguide/cppguide.html).
(<sup>*</sup> With one exception: we use 2-space indentation instead of 4-space
indentation.)

View File

@ -487,7 +487,7 @@ subgraph inside.
![](../images/mnist_deep.png)
For more information about visualizing your TensorFlow application with
TensorBoard, see the [TensorBoard tutorial](TODO).
TensorBoard, see the [TensorBoard tutorial](../get_started/summaries_and_tensorboard.md).
## Programming with multiple graphs

View File

@ -246,7 +246,7 @@ results as in your server testing.
The demo app updates its UI list of results automatically based on the labels
text file you copy into assets alongside your frozen graph, which means you can
easily try out different models without needing to make any code changes. You
will need to updaye `LABEL_FILENAME` and `MODEL_FILENAME` to point to the files
will need to update `LABEL_FILENAME` and `MODEL_FILENAME` to point to the files
you've added if you change the paths though.
## How does this Model Work?

View File

@ -51,7 +51,7 @@ def read_tensor_from_image_file(file_name, input_height=299, input_width=299,
image_reader = tf.image.decode_jpeg(file_reader, channels = 3,
name='jpeg_reader')
float_caster = tf.cast(image_reader, tf.float32)
dims_expander = tf.expand_dims(float_caster, 0);
dims_expander = tf.expand_dims(float_caster, 0)
resized = tf.image.resize_bilinear(dims_expander, [input_height, input_width])
normalized = tf.divide(tf.subtract(resized, [input_mean]), [input_std])
sess = tf.Session()
@ -118,8 +118,8 @@ if __name__ == "__main__":
input_name = "import/" + input_layer
output_name = "import/" + output_layer
input_operation = graph.get_operation_by_name(input_name);
output_operation = graph.get_operation_by_name(output_name);
input_operation = graph.get_operation_by_name(input_name)
output_operation = graph.get_operation_by_name(output_name)
with tf.Session(graph=graph) as sess:
results = sess.run(output_operation.outputs[0],

View File

@ -97,10 +97,27 @@ tf_java_op_gen_srcjar(
# file before making it an executable. See tf_java_op_gen_srcjar().
cc_library(
name = "java_op_gen_tool",
srcs = glob([
"src/gen/cc/*.h",
"src/gen/cc/*.cc",
]),
srcs = [
"src/gen/cc/op_gen_main.cc",
],
copts = tf_copts(),
deps = [
":java_op_gen_lib",
"//tensorflow/core:framework",
"//tensorflow/core:framework_internal",
"//tensorflow/core:lib",
],
)
cc_library(
name = "java_op_gen_lib",
srcs = [
"src/gen/cc/op_generator.cc",
],
hdrs = [
"src/gen/cc/java_defs.h",
"src/gen/cc/op_generator.h",
],
copts = tf_copts(),
deps = [
"//tensorflow/core:framework",
@ -280,21 +297,6 @@ tf_java_test(
],
)
#java_test(
# name = "OperatorProcessorTest",
# size = "small",
# srcs = ["src/test/java/org/tensorflow/processor/OperatorProcessorTest.java"],
# javacopts = JAVACOPTS,
# resources = [":processor_test_resources"],
# test_class = "org.tensorflow.processor.OperatorProcessorTest",
# deps = [
# ":processor_library",
# "//third_party/java/junit",
# "@com_google_testing_compile",
# "@com_google_truth",
# ],
#)
filegroup(
name = "processor_test_resources",
srcs = glob([

View File

@ -0,0 +1,273 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_JAVA_SRC_GEN_CC_JAVA_DEFS_H_
#define TENSORFLOW_JAVA_SRC_GEN_CC_JAVA_DEFS_H_
#include <string>
#include <vector>
#include <deque>
#include "tensorflow/core/platform/env.h"
namespace tensorflow {
namespace java {
// An enumeration of different modifiers commonly used in Java
enum Modifier {
PUBLIC = (1 << 0),
PROTECTED = (1 << 1),
PRIVATE = (1 << 2),
STATIC = (1 << 3),
FINAL = (1 << 4),
};
class Annotation;
// A definition of any kind of Java type (classes, interfaces...)
//
// Note that most of the data fields of this class are only useful in specific
// contexts and are not required in many cases. For example, annotations and
// supertypes are only useful when declaring a type.
class Type {
public:
enum Kind {
PRIMITIVE, CLASS, INTERFACE, ENUM, GENERIC, ANNOTATION
};
static const Type Byte() {
return Type(Type::PRIMITIVE, "byte");
}
static const Type Char() {
return Type(Type::PRIMITIVE, "char");
}
static const Type Short() {
return Type(Type::PRIMITIVE, "short");
}
static const Type Int() {
return Type(Type::PRIMITIVE, "int");
}
static const Type Long() {
return Type(Type::PRIMITIVE, "long");
}
static const Type Float() {
return Type(Type::PRIMITIVE, "float");
}
static const Type Double() {
return Type(Type::PRIMITIVE, "double");
}
static const Type Boolean() {
return Type(Type::PRIMITIVE, "boolean");
}
static const Type Void() {
// For simplicity, we consider 'void' as a primitive type, like the Java
// Reflection API does
return Type(Type::PRIMITIVE, "void");
}
static Type Class(const string& name, const string& package = "") {
return Type(Type::CLASS, name, package);
}
static Type Interface(const string& name, const string& package = "") {
return Type(Type::INTERFACE, name, package);
}
static Type Enum(const string& name, const string& package = "") {
return Type(Type::ENUM, name, package);
}
static Type Generic(const string& name = "") {
return Type(Type::GENERIC, name);
}
static Type ClassOf(const Type& type) {
return Class("Class").add_parameter(type);
}
static Type ListOf(const Type& type) {
return Interface("List", "java.util").add_parameter(type);
}
static Type IterableOf(const Type& type) {
return Interface("Iterable").add_parameter(type);
}
const Kind& kind() const { return kind_; }
const string& name() const { return name_; }
const string& package() const { return package_; }
const string& description() const { return description_; }
Type& description(const string& description) {
description_ = description;
return *this;
}
const std::vector<Type>& parameters() const { return parameters_; }
Type& add_parameter(const Type& parameter) {
parameters_.push_back(parameter);
return *this;
}
const std::vector<Annotation>& annotations() const { return annotations_; }
Type& add_annotation(const Annotation& annotation) {
annotations_.push_back(annotation);
return *this;
}
const std::deque<Type>& supertypes() const { return supertypes_; }
Type& add_supertype(const Type& type) {
if (type.kind_ == CLASS) {
supertypes_.push_front(type); // keep superclass at the front of the list
} else if (type.kind_ == INTERFACE) {
supertypes_.push_back(type);
}
return *this;
}
// Returns true if "type" is of a known collection type (only a few for now)
bool IsCollection() const {
return name_ == "List" || name_ == "Iterable";
}
// Returns true if this instance is a wildcard (<?>)
bool IsWildcard() const {
return kind_ == GENERIC && name_.empty();
}
protected:
Type(Kind kind, const string& name, const string& package = "")
: kind_(kind), name_(name), package_(package) {}
private:
Kind kind_;
string name_;
string package_;
string description_;
std::vector<Type> parameters_;
std::vector<Annotation> annotations_;
std::deque<Type> supertypes_;
};
// Definition of a Java annotation
//
// This class only defines the usage of an annotation in a specific context,
// giving optionally a set of attributes to initialize.
class Annotation : public Type {
public:
static Annotation Create(const string& type_name, const string& pkg = "") {
return Annotation(type_name, pkg);
}
const string& attributes() const { return attributes_; }
Annotation& attributes(const string& attributes) {
attributes_ = attributes;
return *this;
}
private:
string attributes_;
Annotation(const string& name, const string& package)
: Type(Kind::ANNOTATION, name, package) {}
};
// A definition of a Java variable
//
// This class declares an instance of a type, such as a class field or a
// method argument, which can be documented.
class Variable {
public:
static Variable Create(const string& name, const Type& type) {
return Variable(name, type, false);
}
static Variable Varargs(const string& name, const Type& type) {
return Variable(name, type, true);
}
const string& name() const { return name_; }
const Type& type() const { return type_; }
bool variadic() const { return variadic_; }
const string& description() const { return description_; }
Variable& description(const string& description) {
description_ = description;
return *this;
}
private:
string name_;
Type type_;
bool variadic_;
string description_;
Variable(const string& name, const Type& type, bool variadic)
: name_(name), type_(type), variadic_(variadic) {}
};
// A definition of a Java class method
//
// This class defines the signature of a method, including its name, return
// type and arguments.
class Method {
public:
static Method Create(const string& name, const Type& return_type) {
return Method(name, return_type, false);
}
static Method ConstructorFor(const Type& clazz) {
return Method(clazz.name(), clazz, true);
}
bool constructor() const { return constructor_; }
const string& name() const { return name_; }
const Type& return_type() const { return return_type_; }
const string& description() const { return description_; }
Method& description(const string& description) {
description_ = description;
return *this;
}
const string& return_description() const { return return_description_; }
Method& return_description(const string& description) {
return_description_ = description;
return *this;
}
const std::vector<Variable>& arguments() const { return arguments_; }
Method& add_arguments(const std::vector<Variable>& args) {
arguments_.insert(arguments_.cend(), args.cbegin(), args.cend());
return *this;
}
Method& add_argument(const Variable& var) {
arguments_.push_back(var);
return *this;
}
const std::vector<Annotation>& annotations() const { return annotations_; }
Method& add_annotation(const Annotation& annotation) {
annotations_.push_back(annotation);
return *this;
}
private:
string name_;
Type return_type_;
bool constructor_;
string description_;
string return_description_;
std::vector<Variable> arguments_;
std::vector<Annotation> annotations_;
Method(const string& name, const Type& return_type, bool constructor)
: name_(name), return_type_(return_type), constructor_(constructor) {}
};
// A piece of code to read from a file.
class Snippet {
public:
static Snippet Create(const string& fname, Env* env = Env::Default()) {
return Snippet(fname, env);
}
const string& data() const { return data_; }
private:
string data_;
Snippet(const string& fname, Env* env) {
TF_CHECK_OK(ReadFileToString(env, fname, &data_));
}
};
} // namespace java
} // namespace tensorflow
#endif // TENSORFLOW_JAVA_SRC_GEN_CC_JAVA_DEFS_H_
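A hedged usage sketch of the value types above (hypothetical names, not taken
from the generator itself; assumes the header is included with TensorFlow's
`string` alias available):

```
#include "tensorflow/java/src/gen/cc/java_defs.h"

namespace tensorflow {
namespace java {

// Build a Java method signature 'List<Tensor> outputs(int index)' using
// the fluent setters defined above.
Method SketchOutputsAccessor() {
  Type tensor = Type::Class("Tensor", "org.tensorflow");
  Type list_of_tensor = Type::ListOf(tensor);  // List<Tensor>
  return Method::Create("outputs", list_of_tensor)
      .description("Returns the output tensors of this operation")
      .add_argument(Variable::Create("index", Type::Int()));
}

}  // namespace java
}  // namespace tensorflow
```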

View File

@ -25,7 +25,7 @@
#include "tensorflow/java/src/gen/cc/op_generator.h"
namespace tensorflow {
namespace op_gen {
namespace java {
const char kUsageHeader[] =
"\n\nGenerator of operation wrappers in Java.\n\n"
@ -51,7 +51,7 @@ const char kUsageHeader[] =
"Finally, the '--base_package' overrides the default parent package "
"under which the generated subpackage and classes are to be located.\n\n";
} // namespace op_gen
} // namespace java
} // namespace tensorflow
int main(int argc, char* argv[]) {
@ -67,13 +67,13 @@ int main(int argc, char* argv[]) {
tensorflow::Flag(
"base_package", &base_package,
"Package parent to the generated subpackage and classes")};
tensorflow::string usage = tensorflow::op_gen::kUsageHeader;
tensorflow::string usage = tensorflow::java::kUsageHeader;
usage += tensorflow::Flags::Usage(argv[0], flag_list);
bool parsed_flags_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
tensorflow::port::InitMain(usage.c_str(), &argc, &argv);
QCHECK(parsed_flags_ok && !lib_name.empty() && !output_dir.empty()) << usage;
tensorflow::OpGenerator generator;
tensorflow::java::OpGenerator generator;
tensorflow::OpList ops;
tensorflow::OpRegistry::Global()->Export(true, &ops);
tensorflow::Status status =

View File

@ -20,6 +20,7 @@ limitations under the License.
#include "tensorflow/java/src/gen/cc/op_generator.h"
namespace tensorflow {
namespace java {
namespace {
string CamelCase(const string& str, char delimiter, bool upper) {
@ -63,4 +64,5 @@ Status OpGenerator::Run(const OpList& ops, const string& lib_name,
return Status::OK();
}
} // namespace java
} // namespace tensorflow

View File

@ -23,6 +23,7 @@ limitations under the License.
#include "tensorflow/core/platform/env.h"
namespace tensorflow {
namespace java {
/// \brief A generator of Java operation wrappers.
///
@ -46,6 +47,7 @@ class OpGenerator {
Env* env;
};
} // namespace java
} // namespace tensorflow
#endif // TENSORFLOW_JAVA_SRC_GEN_CC_OP_GENERATOR_H_

View File

@ -52,7 +52,7 @@ def tf_java_op_gen_srcjar(name,
# Generate a source archive containing generated code for these ops.
gen_srcjar = out_dir + name + ".srcjar"
gen_cmds += ["$(location @local_jdk//:jar) cMf $(location :" + gen_srcjar + ") -C $(@D) ."]
gen_cmds += ["$(location @local_jdk//:jar) cMf $(location :" + gen_srcjar + ") -C $(@D) src"]
gen_tools += ["@local_jdk//:jar"] + ["@local_jdk//:jdk"]
gen_tools += tf_binary_additional_srcs()
native.genrule(

View File

@ -803,7 +803,7 @@ class Dataset(object):
```python
# Preprocess 4 files concurrently, and interleave blocks of 16 records from
# each file.
filenames = ["/var/data/file1.txt", "/var/data/file2.txt", ..."]
filenames = ["/var/data/file1.txt", "/var/data/file2.txt", ...]
dataset = (Dataset.from_tensor_slices(filenames)
.interleave(lambda x:
TextLineDataset(x).map(parse_fn, num_parallel_calls=1),

View File

@ -23,6 +23,9 @@
set -e
# Filter out LOG(INFO)
export TF_CPP_MIN_LOG_LEVEL=1
IS_VIRTUALENV=0
PYTHON_BIN_PATH=""
while true; do

View File

@ -688,9 +688,10 @@ class Estimator(object):
Raises:
ValueError: if input_fn takes invalid arguments.
"""
del mode # unused
input_fn_args = util.fn_args(input_fn)
kwargs = {}
if 'mode' in input_fn_args:
kwargs['mode'] = mode
if 'params' in input_fn_args:
kwargs['params'] = self.params
if 'config' in input_fn_args:

View File

@ -420,6 +420,7 @@ class EstimatorTrainTest(test.TestCase):
self.assertEqual(1, model_fn_call_count[0])
def test_callable_input_fn(self):
expected_mode = model_fn_lib.ModeKeys.TRAIN
expected_params = {'batch_size': 10}
expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
input_fn_call_count = [0]
@ -432,8 +433,9 @@ class EstimatorTrainTest(test.TestCase):
class InputFn(object):
def __call__(self, params, config):
def __call__(self, mode, params, config):
input_fn_call_count[0] += 1
test_self.assertEqual(expected_mode, mode)
test_self.assertEqual(expected_params, params)
test_self.assertEqual(4321, config.tf_random_seed)
return dummy_input_fn()
@ -446,6 +448,7 @@ class EstimatorTrainTest(test.TestCase):
self.assertEqual(1, input_fn_call_count[0])
def test_input_fn_args(self):
expected_mode = model_fn_lib.ModeKeys.TRAIN
expected_params = {'batch_size': 10}
expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
input_fn_call_count = [0]
@ -454,8 +457,9 @@ class EstimatorTrainTest(test.TestCase):
del params, config
return model_fn_global_step_incrementer(features, labels, mode)
def _input_fn(params, config):
def _input_fn(mode, params, config):
input_fn_call_count[0] += 1
self.assertEqual(expected_mode, mode)
self.assertEqual(expected_params, params)
self.assertEqual(4321, config.tf_random_seed)
return dummy_input_fn()
@ -992,6 +996,7 @@ class EstimatorDatasetIntegrationTest(test.TestCase):
class EstimatorEvaluateTest(test.TestCase):
def test_input_fn_args(self):
expected_mode = model_fn_lib.ModeKeys.EVAL
expected_params = {'batch_size': 10}
expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
input_fn_call_count = [0]
@ -1000,8 +1005,9 @@ class EstimatorEvaluateTest(test.TestCase):
del params, config
return model_fn_global_step_incrementer(features, labels, mode)
def _input_fn(params, config):
def _input_fn(mode, params, config):
input_fn_call_count[0] += 1
self.assertEqual(expected_mode, mode)
self.assertEqual(expected_params, params)
self.assertEqual(4321, config.tf_random_seed)
return dummy_input_fn()
@ -1265,6 +1271,7 @@ class EstimatorEvaluateTest(test.TestCase):
class EstimatorPredictTest(test.TestCase):
def test_input_fn_args(self):
expected_mode = model_fn_lib.ModeKeys.PREDICT
expected_params = {'batch_size': 10}
expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
input_fn_call_count = [0]
@ -1277,8 +1284,9 @@ class EstimatorPredictTest(test.TestCase):
train_op=state_ops.assign_add(training.get_global_step(), 1),
predictions=constant_op.constant([[10.]]))
def _input_fn(params, config):
def _input_fn(mode, params, config):
input_fn_call_count[0] += 1
self.assertEqual(expected_mode, mode)
self.assertEqual(expected_params, params)
self.assertEqual(4321, config.tf_random_seed)
return dummy_input_fn()

View File

@ -677,11 +677,19 @@ class _TrainingExecutor(object):
'RunConfig or set the TF_CONFIG environment variable.')
logging.info('Start Tensorflow server.')
    if config.session_config is None:
      session_config = config_pb2.ConfigProto(log_device_placement=False)
    else:
      session_config = config_pb2.ConfigProto(
          log_device_placement=False,
          gpu_options=config.session_config.gpu_options)
server = server_lib.Server(
config.cluster_spec,
job_name=config.task_type,
task_index=config.task_id,
config=config_pb2.ConfigProto(log_device_placement=False),
config=session_config,
start=False)
server.start()
return server

View File

@ -796,6 +796,20 @@ class Conv2DTest(test.TestCase):
data_format=data_format,
use_gpu=use_gpu)
@test_util.run_in_graph_and_eager_modes()
def testConv2DBackpropFilterWithEmptyInput(self):
expected = [0, 0, 0, 0]
for (data_format, use_gpu) in GetTestConfigs():
self._RunAndVerifyBackpropFilter(
input_sizes=[0, 2, 3, 1],
filter_sizes=[2, 2, 1, 1],
output_sizes=[0, 1, 2, 1],
strides=[1, 1],
padding="VALID",
expected=expected,
data_format=data_format,
use_gpu=use_gpu)
@test_util.run_in_graph_and_eager_modes()
def testConv2D2x2Depth3ValidBackpropFilter(self):
expected = [

View File

@ -361,6 +361,16 @@ class PoolingTest(test.TestCase):
expected=expected_output,
use_gpu=use_gpu)
def _testAvgPoolEmptyInput(self, use_gpu):
self._VerifyValues(
nn_ops.avg_pool,
input_sizes=[0, 8, 8, 8],
ksize=[1, 3, 3, 1],
strides=[1, 2, 2, 1],
padding="SAME",
expected=[],
use_gpu=use_gpu)
def testAvgPooling(self):
for use_gpu in True, False:
self._testAvgPoolValidPadding(use_gpu)
@ -371,6 +381,7 @@ class PoolingTest(test.TestCase):
self._testAvgPoolSamePadding4(use_gpu)
self._testAvgPoolSamePaddingPacket4(use_gpu)
self._testAvgPoolSamePaddingPacket8(use_gpu)
self._testAvgPoolEmptyInput(use_gpu)
def _testMaxPoolValidPadding(self, use_gpu):
expected_output = [13.0, 14.0, 15.0]
@ -543,6 +554,16 @@ class PoolingTest(test.TestCase):
use_gpu=use_gpu,
v2=v2)
def _testMaxPoolEmptyInput(self, use_gpu):
self._VerifyValues(
gen_nn_ops._max_pool_v2,
input_sizes=[0, 8, 8, 8],
ksize=[1, 3, 3, 1],
strides=[1, 2, 2, 1],
padding="SAME",
expected=[],
use_gpu=use_gpu)
def testMaxPooling(self):
for use_gpu in True, False:
self._testMaxPoolValidPadding(use_gpu)
@ -551,6 +572,7 @@ class PoolingTest(test.TestCase):
self._testMaxPoolValidPaddingUnevenStride(use_gpu)
self._testMaxPoolSamePaddingPacket4(use_gpu)
self._testMaxPoolSamePaddingPacket8(use_gpu)
self._testMaxPoolEmptyInput(use_gpu)
# Tests for DepthwiseMaxPooling on CPU only.
def testDepthwiseMaxPool1x1DepthWindow1(self):

View File

@ -135,5 +135,36 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
bitwise_ops.right_shift(lhs, rhs)])
def testShapeInference(self):
dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
dtypes.uint8, dtypes.uint16]
with self.test_session(use_gpu=True) as sess:
for dtype in dtype_list:
lhs = constant_op.constant([[0], [3], [5]], dtype=dtype)
rhs = constant_op.constant([[1, 2, 4]], dtype=dtype)
and_tensor = bitwise_ops.bitwise_and(lhs, rhs)
or_tensor = bitwise_ops.bitwise_or(lhs, rhs)
xor_tensor = bitwise_ops.bitwise_xor(lhs, rhs)
ls_tensor = bitwise_ops.left_shift(lhs, rhs)
rs_tensor = bitwise_ops.right_shift(lhs, rhs)
and_result, or_result, xor_result, ls_result, rs_result = sess.run(
[and_tensor, or_tensor, xor_tensor, ls_tensor, rs_tensor])
# Compare shape inference with result
self.assertAllEqual(and_tensor.get_shape().as_list(), and_result.shape)
self.assertAllEqual(and_tensor.get_shape().as_list(), [3, 3])
self.assertAllEqual(or_tensor.get_shape().as_list(), or_result.shape)
self.assertAllEqual(or_tensor.get_shape().as_list(), [3, 3])
self.assertAllEqual(xor_tensor.get_shape().as_list(), xor_result.shape)
self.assertAllEqual(xor_tensor.get_shape().as_list(), [3, 3])
self.assertAllEqual(ls_tensor.get_shape().as_list(), ls_result.shape)
self.assertAllEqual(ls_tensor.get_shape().as_list(), [3, 3])
self.assertAllEqual(rs_tensor.get_shape().as_list(), rs_result.shape)
self.assertAllEqual(rs_tensor.get_shape().as_list(), [3, 3])
if __name__ == "__main__":
googletest.main()

View File

@ -977,9 +977,7 @@ def hessians(ys, xs, name="hessians", colocate_gradients_with_ops=False,
`hessians()` adds ops to the graph to output the Hessian matrix of `ys`
with respect to `xs`. It returns a list of `Tensor` of length `len(xs)`
where each tensor is the Hessian of `sum(ys)`. This function currently
only supports evaluating the Hessian with respect to (a list of) one-
dimensional tensors.
where each tensor is the Hessian of `sum(ys)`.
The Hessian is a matrix of second-order partial derivatives of a scalar
tensor (see https://en.wikipedia.org/wiki/Hessian_matrix for more details).
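In symbols, consistent with the docstring above, entry (i, j) of the Hessian
returned for each `x` in `xs` is

```
H_{ij} = \frac{\partial^2\,\mathrm{sum}(ys)}{\partial x_i\,\partial x_j}
```

and, with the reshape added below, the returned tensor has shape
`shape(x) + shape(x)` rather than a flattened matrix.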
@ -1005,31 +1003,32 @@ def hessians(ys, xs, name="hessians", colocate_gradients_with_ops=False,
'colocate_gradients_with_ops': colocate_gradients_with_ops,
'gate_gradients': gate_gradients,
'aggregation_method': aggregation_method
}
}
# Compute first-order derivatives and iterate for each x in xs.
hessians = []
_gradients = gradients(ys, xs, **kwargs)
for i, _gradient, x in zip(range(len(xs)), _gradients, xs):
# Ensure that x is a vector.
check_rank = check_ops.assert_rank(
x, 1, message='Cannot compute Hessian because element %d of `xs` does '
'not have rank one.' % i
)
with ops.control_dependencies([check_rank]):
# Declare an iterator and tensor array loop variables for the gradients.
n = array_ops.size(x)
loop_vars = [
for gradient, x in zip(_gradients, xs):
# Change the shape to one dimension without graph branching.
gradient = array_ops.reshape(gradient, [-1])
# Declare an iterator and tensor array loop variables for the gradients.
n = array_ops.size(x)
loop_vars = [
array_ops.constant(0, dtypes.int32),
tensor_array_ops.TensorArray(x.dtype, n)
]
# Iterate over all elements of the gradient and compute second order
# derivatives.
_, hessian = control_flow_ops.while_loop(
lambda j, _: j < n,
lambda j, result: (j + 1,
result.write(j, gradients(_gradient[j], x)[0])),
loop_vars
)
]
# Iterate over all elements of the gradient and compute second order
# derivatives.
_, hessian = control_flow_ops.while_loop(
lambda j, _: j < n,
lambda j, result: (j + 1,
result.write(j, gradients(gradient[j], x)[0])),
loop_vars
)
hessians.append(hessian.stack())
_shape = array_ops.shape(x)
_reshaped_hessian = array_ops.reshape(
hessian.stack(), array_ops.concat((_shape, _shape), 0)
)
hessians.append(_reshaped_hessian)
return hessians

View File

@ -621,6 +621,45 @@ class HessianTest(test_util.TensorFlowTestCase):
with self.assertRaises(ValueError):
gradients.hessians(x, x)
def testHessian2D_square_matrix(self):
# Manually compute the Hessian explicitly for a low-dimensional problem
# and check that `hessian` matches. Specifically, the Hessian of
# f(x) = 1/2 * x^T * x is H = constant (block identity matrix)
m = 3
rng = np.random.RandomState([1, 2, 3])
x_value = rng.randn(m, m).astype("float32")
with self.test_session(use_gpu=True):
x = constant_op.constant(x_value)
x_square = math_ops.reduce_sum(
math_ops.matmul(array_ops.transpose(x), x) * 0.5
)
hess = gradients.hessians(x_square, x)[0]
hess_actual = hess.eval()
hess_value = np.bmat([
[elem*np.ones((m, m)) for elem in vec]
for vec in np.eye(m)
]).astype("float32")
self.assertAllEqual((m, m, m, m), hess_actual.shape)
self.assertAllClose(hess_value, hess_actual.reshape((m * m, m * m)))
def testHessian2D_non_square_matrix(self):
m = 3
n = 4
rng = np.random.RandomState([1, 2, 3])
x_value = rng.randn(m, n).astype("float32")
with self.test_session(use_gpu=True):
x = constant_op.constant(x_value)
x_square = math_ops.reduce_sum(
math_ops.matmul(array_ops.transpose(x), x) * 0.5
)
hess = gradients.hessians(x_square, x)[0]
hess_actual = hess.eval()
hess_value = np.bmat([
[elem*np.ones((n, n)) for elem in vec]
for vec in np.eye(m)
]).astype("float32")
self.assertAllEqual((m, n, m, n), hess_actual.shape)
self.assertAllClose(hess_value, hess_actual.reshape((m * n, m * n)))
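The expected values in these tests follow from a short derivation (a hedged
sketch; X is m x n and 1_{n x n} denotes the all-ones matrix):

```
f(X) = \tfrac{1}{2}\sum_{j,k}\,(X^{T}X)_{jk}
     = \tfrac{1}{2}\sum_{i}\Big(\sum_{j} X_{ij}\Big)^{2},
\qquad
\frac{\partial^{2} f}{\partial X_{ab}\,\partial X_{cd}} = \delta_{ac},
```

so the Hessian, reshaped to (m·n) x (m·n), is the Kronecker product
I_m ⊗ 1_{n x n}, which is exactly the block matrix that `np.bmat` constructs
in the tests above.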
@test_util.with_c_api
class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):

View File

@ -219,15 +219,17 @@ def random_flip_up_down(image, seed=None):
Raises:
ValueError: if the shape of `image` not supported.
"""
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
mirror_cond = math_ops.less(uniform_random, .5)
result = control_flow_ops.cond(mirror_cond,
lambda: array_ops.reverse(image, [0]),
lambda: image)
return fix_image_flip_shape(image, result)
with ops.name_scope(None, 'random_flip_up_down', [image]) as scope:
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
mirror_cond = math_ops.less(uniform_random, .5)
result = control_flow_ops.cond(mirror_cond,
lambda: array_ops.reverse(image, [0]),
lambda: image,
name=scope)
return fix_image_flip_shape(image, result)
def random_flip_left_right(image, seed=None):
@ -248,15 +250,19 @@ def random_flip_left_right(image, seed=None):
Raises:
ValueError: if the shape of `image` not supported.
"""
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
mirror_cond = math_ops.less(uniform_random, .5)
result = control_flow_ops.cond(mirror_cond,
lambda: array_ops.reverse(image, [1]),
lambda: image)
return fix_image_flip_shape(image, result)
with ops.name_scope(None, 'random_flip_left_right', [image]) as scope:
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
mirror_cond = math_ops.less(uniform_random, .5)
result = control_flow_ops.cond(mirror_cond,
lambda: array_ops.reverse(image, [1]),
lambda: image,
name=scope)
return fix_image_flip_shape(image, result)
def flip_left_right(image):
@ -276,10 +282,12 @@ def flip_left_right(image):
Raises:
ValueError: if the shape of `image` is not supported.
"""
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
return fix_image_flip_shape(image, array_ops.reverse(image, [1]))
with ops.name_scope(None, 'flip_left_right', [image]) as scope:
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
return fix_image_flip_shape(image,
array_ops.reverse(image, [1], name=scope))
def flip_up_down(image):
@ -299,10 +307,12 @@ def flip_up_down(image):
Raises:
ValueError: if the shape of `image` is not supported.
"""
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
return fix_image_flip_shape(image, array_ops.reverse(image, [0]))
with ops.name_scope(None, 'flip_up_down', [image]) as scope:
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
return fix_image_flip_shape(image,
array_ops.reverse(image, [0], name=scope))
def rot90(image, k=1, name=None):
@ -356,10 +366,11 @@ def transpose_image(image):
Raises:
ValueError: if the shape of `image` is not supported.
"""
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
return array_ops.transpose(image, [1, 0, 2], name='transpose_image')
with ops.name_scope(None, 'transpose_image', [image]) as scope:
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
return array_ops.transpose(image, [1, 0, 2], name=scope)
def central_crop(image, central_fraction):
@ -386,33 +397,34 @@ def central_crop(image, central_fraction):
Returns:
3-D float Tensor
"""
image = ops.convert_to_tensor(image, name='image')
if central_fraction <= 0.0 or central_fraction > 1.0:
raise ValueError('central_fraction must be within (0, 1]')
if central_fraction == 1.0:
with ops.name_scope(None, 'central_crop', [image]):
image = ops.convert_to_tensor(image, name='image')
if central_fraction <= 0.0 or central_fraction > 1.0:
raise ValueError('central_fraction must be within (0, 1]')
if central_fraction == 1.0:
return image
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
img_shape = array_ops.shape(image)
depth = image.get_shape()[2]
img_h = math_ops.to_double(img_shape[0])
img_w = math_ops.to_double(img_shape[1])
bbox_h_start = math_ops.to_int32((img_h - img_h * central_fraction) / 2)
bbox_w_start = math_ops.to_int32((img_w - img_w * central_fraction) / 2)
bbox_h_size = img_shape[0] - bbox_h_start * 2
bbox_w_size = img_shape[1] - bbox_w_start * 2
bbox_begin = array_ops.stack([bbox_h_start, bbox_w_start, 0])
bbox_size = array_ops.stack([bbox_h_size, bbox_w_size, -1])
image = array_ops.slice(image, bbox_begin, bbox_size)
# The first two dimensions are dynamic and unknown.
image.set_shape([None, None, depth])
return image
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
img_shape = array_ops.shape(image)
depth = image.get_shape()[2]
img_h = math_ops.to_double(img_shape[0])
img_w = math_ops.to_double(img_shape[1])
bbox_h_start = math_ops.to_int32((img_h - img_h * central_fraction) / 2)
bbox_w_start = math_ops.to_int32((img_w - img_w * central_fraction) / 2)
bbox_h_size = img_shape[0] - bbox_h_start * 2
bbox_w_size = img_shape[1] - bbox_w_start * 2
bbox_begin = array_ops.stack([bbox_h_start, bbox_w_start, 0])
bbox_size = array_ops.stack([bbox_h_size, bbox_w_size, -1])
image = array_ops.slice(image, bbox_begin, bbox_size)
# The first two dimensions are dynamic and unknown.
image.set_shape([None, None, depth])
return image
def pad_to_bounding_box(image, offset_height, offset_width, target_height,
target_width):
@ -444,53 +456,54 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height,
`target_*` arguments, or either `offset_height` or `offset_width` is
negative.
"""
image = ops.convert_to_tensor(image, name='image')
with ops.name_scope(None, 'pad_to_bounding_box', [image]):
image = ops.convert_to_tensor(image, name='image')
is_batch = True
image_shape = image.get_shape()
if image_shape.ndims == 3:
is_batch = False
image = array_ops.expand_dims(image, 0)
elif image_shape.ndims is None:
is_batch = False
image = array_ops.expand_dims(image, 0)
image.set_shape([None] * 4)
elif image_shape.ndims != 4:
raise ValueError('\'image\' must have either 3 or 4 dimensions.')
is_batch = True
image_shape = image.get_shape()
if image_shape.ndims == 3:
is_batch = False
image = array_ops.expand_dims(image, 0)
elif image_shape.ndims is None:
is_batch = False
image = array_ops.expand_dims(image, 0)
image.set_shape([None] * 4)
elif image_shape.ndims != 4:
raise ValueError('\'image\' must have either 3 or 4 dimensions.')
assert_ops = _CheckAtLeast3DImage(image, require_static=False)
assert_ops = _CheckAtLeast3DImage(image, require_static=False)
batch, height, width, depth = _ImageDimensions(image, rank=4)
batch, height, width, depth = _ImageDimensions(image, rank=4)
after_padding_width = target_width - offset_width - width
after_padding_width = target_width - offset_width - width
after_padding_height = target_height - offset_height - height
after_padding_height = target_height - offset_height - height
assert_ops += _assert(offset_height >= 0, ValueError,
'offset_height must be >= 0')
assert_ops += _assert(offset_width >= 0, ValueError,
'offset_width must be >= 0')
assert_ops += _assert(after_padding_width >= 0, ValueError,
'width must be <= target - offset')
assert_ops += _assert(after_padding_height >= 0, ValueError,
'height must be <= target - offset')
image = control_flow_ops.with_dependencies(assert_ops, image)
assert_ops += _assert(offset_height >= 0, ValueError,
'offset_height must be >= 0')
assert_ops += _assert(offset_width >= 0, ValueError,
'offset_width must be >= 0')
assert_ops += _assert(after_padding_width >= 0, ValueError,
'width must be <= target - offset')
assert_ops += _assert(after_padding_height >= 0, ValueError,
'height must be <= target - offset')
image = control_flow_ops.with_dependencies(assert_ops, image)
# Do not pad on the depth dimensions.
paddings = array_ops.reshape(
array_ops.stack([
0, 0, offset_height, after_padding_height, offset_width,
after_padding_width, 0, 0
]), [4, 2])
padded = array_ops.pad(image, paddings)
# Do not pad on the depth dimensions.
paddings = array_ops.reshape(
array_ops.stack([
0, 0, offset_height, after_padding_height, offset_width,
after_padding_width, 0, 0
]), [4, 2])
padded = array_ops.pad(image, paddings)
padded_shape = [None if _is_tensor(i) else i
for i in [batch, target_height, target_width, depth]]
padded.set_shape(padded_shape)
padded_shape = [None if _is_tensor(i) else i
for i in [batch, target_height, target_width, depth]]
padded.set_shape(padded_shape)
if not is_batch:
padded = array_ops.squeeze(padded, squeeze_dims=[0])
if not is_batch:
padded = array_ops.squeeze(padded, squeeze_dims=[0])
return padded
return padded
def crop_to_bounding_box(image, offset_height, offset_width, target_height,
@ -523,51 +536,52 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height,
`target_*` arguments, or either `offset_height` or `offset_width` is
negative, or either `target_height` or `target_width` is not positive.
"""
image = ops.convert_to_tensor(image, name='image')
with ops.name_scope(None, 'crop_to_bounding_box', [image]):
image = ops.convert_to_tensor(image, name='image')
is_batch = True
image_shape = image.get_shape()
if image_shape.ndims == 3:
is_batch = False
image = array_ops.expand_dims(image, 0)
elif image_shape.ndims is None:
is_batch = False
image = array_ops.expand_dims(image, 0)
image.set_shape([None] * 4)
elif image_shape.ndims != 4:
raise ValueError('\'image\' must have either 3 or 4 dimensions.')
is_batch = True
image_shape = image.get_shape()
if image_shape.ndims == 3:
is_batch = False
image = array_ops.expand_dims(image, 0)
elif image_shape.ndims is None:
is_batch = False
image = array_ops.expand_dims(image, 0)
image.set_shape([None] * 4)
elif image_shape.ndims != 4:
raise ValueError('\'image\' must have either 3 or 4 dimensions.')
assert_ops = _CheckAtLeast3DImage(image, require_static=False)
assert_ops = _CheckAtLeast3DImage(image, require_static=False)
batch, height, width, depth = _ImageDimensions(image, rank=4)
batch, height, width, depth = _ImageDimensions(image, rank=4)
assert_ops += _assert(offset_width >= 0, ValueError,
'offset_width must be >= 0.')
assert_ops += _assert(offset_height >= 0, ValueError,
'offset_height must be >= 0.')
assert_ops += _assert(target_width > 0, ValueError,
'target_width must be > 0.')
assert_ops += _assert(target_height > 0, ValueError,
'target_height must be > 0.')
assert_ops += _assert(width >= (target_width + offset_width), ValueError,
'width must be >= target + offset.')
assert_ops += _assert(height >= (target_height + offset_height), ValueError,
'height must be >= target + offset.')
image = control_flow_ops.with_dependencies(assert_ops, image)
assert_ops += _assert(offset_width >= 0, ValueError,
'offset_width must be >= 0.')
assert_ops += _assert(offset_height >= 0, ValueError,
'offset_height must be >= 0.')
assert_ops += _assert(target_width > 0, ValueError,
'target_width must be > 0.')
assert_ops += _assert(target_height > 0, ValueError,
'target_height must be > 0.')
assert_ops += _assert(width >= (target_width + offset_width), ValueError,
'width must be >= target + offset.')
assert_ops += _assert(height >= (target_height + offset_height), ValueError,
'height must be >= target + offset.')
image = control_flow_ops.with_dependencies(assert_ops, image)
cropped = array_ops.slice(
image,
array_ops.stack([0, offset_height, offset_width, 0]),
array_ops.stack([-1, target_height, target_width, -1]))
cropped = array_ops.slice(
image,
array_ops.stack([0, offset_height, offset_width, 0]),
array_ops.stack([-1, target_height, target_width, -1]))
cropped_shape = [None if _is_tensor(i) else i
for i in [batch, target_height, target_width, depth]]
cropped.set_shape(cropped_shape)
cropped_shape = [None if _is_tensor(i) else i
for i in [batch, target_height, target_width, depth]]
cropped.set_shape(cropped_shape)
if not is_batch:
cropped = array_ops.squeeze(cropped, squeeze_dims=[0])
if not is_batch:
cropped = array_ops.squeeze(cropped, squeeze_dims=[0])
return cropped
return cropped
def resize_image_with_crop_or_pad(image, target_height, target_width):
@ -598,88 +612,90 @@ def resize_image_with_crop_or_pad(image, target_height, target_width):
If `images` was 3-D, a 3-D float Tensor of shape
`[new_height, new_width, channels]`.
"""
image = ops.convert_to_tensor(image, name='image')
image_shape = image.get_shape()
is_batch = True
if image_shape.ndims == 3:
is_batch = False
image = array_ops.expand_dims(image, 0)
elif image_shape.ndims is None:
is_batch = False
image = array_ops.expand_dims(image, 0)
image.set_shape([None] * 4)
elif image_shape.ndims != 4:
raise ValueError('\'image\' must have either 3 or 4 dimensions.')
with ops.name_scope(None, 'resize_image_with_crop_or_pad', [image]):
image = ops.convert_to_tensor(image, name='image')
image_shape = image.get_shape()
is_batch = True
if image_shape.ndims == 3:
is_batch = False
image = array_ops.expand_dims(image, 0)
elif image_shape.ndims is None:
is_batch = False
image = array_ops.expand_dims(image, 0)
image.set_shape([None] * 4)
elif image_shape.ndims != 4:
raise ValueError('\'image\' must have either 3 or 4 dimensions.')
assert_ops = _CheckAtLeast3DImage(image, require_static=False)
assert_ops += _assert(target_width > 0, ValueError,
'target_width must be > 0.')
assert_ops += _assert(target_height > 0, ValueError,
'target_height must be > 0.')
assert_ops = _CheckAtLeast3DImage(image, require_static=False)
assert_ops += _assert(target_width > 0, ValueError,
'target_width must be > 0.')
assert_ops += _assert(target_height > 0, ValueError,
'target_height must be > 0.')
image = control_flow_ops.with_dependencies(assert_ops, image)
# `crop_to_bounding_box` and `pad_to_bounding_box` have their own checks.
# Make sure our checks come first, so that error messages are clearer.
if _is_tensor(target_height):
target_height = control_flow_ops.with_dependencies(
assert_ops, target_height)
if _is_tensor(target_width):
target_width = control_flow_ops.with_dependencies(assert_ops, target_width)
image = control_flow_ops.with_dependencies(assert_ops, image)
# `crop_to_bounding_box` and `pad_to_bounding_box` have their own checks.
# Make sure our checks come first, so that error messages are clearer.
if _is_tensor(target_height):
target_height = control_flow_ops.with_dependencies(
assert_ops, target_height)
if _is_tensor(target_width):
target_width = control_flow_ops.with_dependencies(
assert_ops, target_width)
def max_(x, y):
if _is_tensor(x) or _is_tensor(y):
return math_ops.maximum(x, y)
else:
return max(x, y)
def max_(x, y):
if _is_tensor(x) or _is_tensor(y):
return math_ops.maximum(x, y)
else:
return max(x, y)
def min_(x, y):
if _is_tensor(x) or _is_tensor(y):
return math_ops.minimum(x, y)
else:
return min(x, y)
def min_(x, y):
if _is_tensor(x) or _is_tensor(y):
return math_ops.minimum(x, y)
else:
return min(x, y)
def equal_(x, y):
if _is_tensor(x) or _is_tensor(y):
return math_ops.equal(x, y)
else:
return x == y
def equal_(x, y):
if _is_tensor(x) or _is_tensor(y):
return math_ops.equal(x, y)
else:
return x == y
_, height, width, _ = _ImageDimensions(image, rank=4)
width_diff = target_width - width
offset_crop_width = max_(-width_diff // 2, 0)
offset_pad_width = max_(width_diff // 2, 0)
_, height, width, _ = _ImageDimensions(image, rank=4)
width_diff = target_width - width
offset_crop_width = max_(-width_diff // 2, 0)
offset_pad_width = max_(width_diff // 2, 0)
height_diff = target_height - height
offset_crop_height = max_(-height_diff // 2, 0)
offset_pad_height = max_(height_diff // 2, 0)
height_diff = target_height - height
offset_crop_height = max_(-height_diff // 2, 0)
offset_pad_height = max_(height_diff // 2, 0)
# Maybe crop if needed.
cropped = crop_to_bounding_box(image, offset_crop_height, offset_crop_width,
min_(target_height, height),
min_(target_width, width))
# Maybe crop if needed.
cropped = crop_to_bounding_box(image, offset_crop_height, offset_crop_width,
min_(target_height, height),
min_(target_width, width))
# Maybe pad if needed.
resized = pad_to_bounding_box(cropped, offset_pad_height, offset_pad_width,
target_height, target_width)
# Maybe pad if needed.
resized = pad_to_bounding_box(cropped, offset_pad_height, offset_pad_width,
target_height, target_width)
# In theory all the checks below are redundant.
if resized.get_shape().ndims is None:
raise ValueError('resized contains no shape.')
# In theory all the checks below are redundant.
if resized.get_shape().ndims is None:
raise ValueError('resized contains no shape.')
_, resized_height, resized_width, _ = _ImageDimensions(resized, rank=4)
_, resized_height, resized_width, _ = _ImageDimensions(resized, rank=4)
assert_ops = []
assert_ops += _assert(equal_(resized_height, target_height), ValueError,
'resized height is not correct.')
assert_ops += _assert(equal_(resized_width, target_width), ValueError,
'resized width is not correct.')
assert_ops = []
assert_ops += _assert(equal_(resized_height, target_height), ValueError,
'resized height is not correct.')
assert_ops += _assert(equal_(resized_width, target_width), ValueError,
'resized width is not correct.')
resized = control_flow_ops.with_dependencies(assert_ops, resized)
resized = control_flow_ops.with_dependencies(assert_ops, resized)
if not is_batch:
resized = array_ops.squeeze(resized, squeeze_dims=[0])
if not is_batch:
resized = array_ops.squeeze(resized, squeeze_dims=[0])
return resized
return resized
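# A plain-Python sketch of the per-dimension offset arithmetic above
# (crop_or_pad_offsets is a hypothetical helper, for illustration only):
def crop_or_pad_offsets(size, target):
  diff = target - size
  offset_crop = max(-diff // 2, 0)  # crop this much when size > target
  offset_pad = max(diff // 2, 0)    # pad this much when size < target
  return offset_crop, offset_pad

assert crop_or_pad_offsets(10, 6) == (2, 0)
assert crop_or_pad_offsets(6, 10) == (0, 2)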
class ResizeMethod(object):
@ -736,66 +752,68 @@ def resize_images(images,
If `images` was 3-D, a 3-D float Tensor of shape
`[new_height, new_width, channels]`.
"""
images = ops.convert_to_tensor(images, name='images')
if images.get_shape().ndims is None:
raise ValueError('\'images\' contains no shape.')
# TODO(shlens): Migrate this functionality to the underlying Ops.
is_batch = True
if images.get_shape().ndims == 3:
is_batch = False
images = array_ops.expand_dims(images, 0)
elif images.get_shape().ndims != 4:
raise ValueError('\'images\' must have either 3 or 4 dimensions.')
with ops.name_scope(None, 'resize_images', [images, size]):
images = ops.convert_to_tensor(images, name='images')
if images.get_shape().ndims is None:
raise ValueError('\'images\' contains no shape.')
# TODO(shlens): Migrate this functionality to the underlying Ops.
is_batch = True
if images.get_shape().ndims == 3:
is_batch = False
images = array_ops.expand_dims(images, 0)
elif images.get_shape().ndims != 4:
raise ValueError('\'images\' must have either 3 or 4 dimensions.')
_, height, width, _ = images.get_shape().as_list()
_, height, width, _ = images.get_shape().as_list()
try:
size = ops.convert_to_tensor(size, dtypes.int32, name='size')
except (TypeError, ValueError):
raise ValueError('\'size\' must be a 1-D int32 Tensor')
if not size.get_shape().is_compatible_with([2]):
raise ValueError('\'size\' must be a 1-D Tensor of 2 elements: '
'new_height, new_width')
size_const_as_shape = tensor_util.constant_value_as_shape(size)
new_height_const = size_const_as_shape[0].value
new_width_const = size_const_as_shape[1].value
try:
size = ops.convert_to_tensor(size, dtypes.int32, name='size')
except (TypeError, ValueError):
raise ValueError('\'size\' must be a 1-D int32 Tensor')
if not size.get_shape().is_compatible_with([2]):
raise ValueError('\'size\' must be a 1-D Tensor of 2 elements: '
'new_height, new_width')
size_const_as_shape = tensor_util.constant_value_as_shape(size)
new_height_const = size_const_as_shape[0].value
new_width_const = size_const_as_shape[1].value
# If we can determine that the height and width will be unmodified by this
# transformation, we avoid performing the resize.
if all(x is not None
for x in [new_width_const, width, new_height_const, height]) and (
width == new_width_const and height == new_height_const):
if not is_batch:
images = array_ops.squeeze(images, squeeze_dims=[0])
return images
if method == ResizeMethod.BILINEAR:
images = gen_image_ops.resize_bilinear(images,
size,
align_corners=align_corners)
elif method == ResizeMethod.NEAREST_NEIGHBOR:
images = gen_image_ops.resize_nearest_neighbor(images,
size,
align_corners=
align_corners)
elif method == ResizeMethod.BICUBIC:
images = gen_image_ops.resize_bicubic(images,
size,
align_corners=align_corners)
elif method == ResizeMethod.AREA:
images = gen_image_ops.resize_area(images,
size,
align_corners=align_corners)
else:
raise ValueError('Resize method is not implemented.')
# NOTE(mrry): The shape functions for the resize ops cannot unpack
# the packed values in `new_size`, so set the shape here.
images.set_shape([None, new_height_const, new_width_const, None])
# If we can determine that the height and width will be unmodified by this
# transformation, we avoid performing the resize.
if all(x is not None
for x in [new_width_const, width, new_height_const, height]) and (
width == new_width_const and height == new_height_const):
if not is_batch:
images = array_ops.squeeze(images, squeeze_dims=[0])
return images
if method == ResizeMethod.BILINEAR:
images = gen_image_ops.resize_bilinear(images,
size,
align_corners=align_corners)
elif method == ResizeMethod.NEAREST_NEIGHBOR:
images = gen_image_ops.resize_nearest_neighbor(images,
size,
align_corners=align_corners)
elif method == ResizeMethod.BICUBIC:
images = gen_image_ops.resize_bicubic(images,
size,
align_corners=align_corners)
elif method == ResizeMethod.AREA:
images = gen_image_ops.resize_area(images,
size,
align_corners=align_corners)
else:
raise ValueError('Resize method is not implemented.')
# NOTE(mrry): The shape functions for the resize ops cannot unpack
# the packed values in `new_size`, so set the shape here.
images.set_shape([None, new_height_const, new_width_const, None])
if not is_batch:
images = array_ops.squeeze(images, squeeze_dims=[0])
return images
def per_image_standardization(image):
"""Linearly scales `image` to have zero mean and unit norm.
@ -816,27 +834,28 @@ def per_image_standardization(image):
Raises:
ValueError: if the shape of 'image' is incompatible with this function.
"""
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
num_pixels = math_ops.reduce_prod(array_ops.shape(image))
with ops.name_scope(None, 'per_image_standardization', [image]) as scope:
image = ops.convert_to_tensor(image, name='image')
image = control_flow_ops.with_dependencies(
_Check3DImage(image, require_static=False), image)
num_pixels = math_ops.reduce_prod(array_ops.shape(image))
image = math_ops.cast(image, dtype=dtypes.float32)
image_mean = math_ops.reduce_mean(image)
image = math_ops.cast(image, dtype=dtypes.float32)
image_mean = math_ops.reduce_mean(image)
variance = (math_ops.reduce_mean(math_ops.square(image)) -
math_ops.square(image_mean))
variance = gen_nn_ops.relu(variance)
stddev = math_ops.sqrt(variance)
variance = (math_ops.reduce_mean(math_ops.square(image)) -
math_ops.square(image_mean))
variance = gen_nn_ops.relu(variance)
stddev = math_ops.sqrt(variance)
# Apply a minimum normalization that protects us against uniform images.
min_stddev = math_ops.rsqrt(math_ops.cast(num_pixels, dtypes.float32))
pixel_value_scale = math_ops.maximum(stddev, min_stddev)
pixel_value_offset = image_mean
# Apply a minimum normalization that protects us against uniform images.
min_stddev = math_ops.rsqrt(math_ops.cast(num_pixels, dtypes.float32))
pixel_value_scale = math_ops.maximum(stddev, min_stddev)
pixel_value_offset = image_mean
image = math_ops.subtract(image, pixel_value_offset)
image = math_ops.div(image, pixel_value_scale)
return image
image = math_ops.subtract(image, pixel_value_offset)
image = math_ops.div(image, pixel_value_scale, name=scope)
return image
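# A NumPy reference for the formula above (a sketch, not the TF kernel):
# subtract the mean, then divide by max(stddev, 1/sqrt(num_pixels)); the
# floor keeps uniform images from dividing by (near) zero.
import numpy as np
def per_image_standardization_np(image):
  image = image.astype(np.float32)
  mean = image.mean()
  stddev = max(image.std(), 1.0 / np.sqrt(image.size))
  return (image - mean) / stddev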
def random_brightness(image, max_delta, seed=None):


@ -857,6 +857,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
with self.test_session(use_gpu=True):
x_tf = constant_op.constant(x_np, shape=x_np.shape)
y = image_ops.flip_left_right(x_tf)
self.assertTrue(y.op.name.startswith('flip_left_right'))
y_tf = y.eval()
self.assertAllEqual(y_tf, y_np)
@ -867,6 +868,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
with self.test_session(use_gpu=True):
x_tf = constant_op.constant(x_np, shape=x_np.shape)
y = image_ops.random_flip_left_right(x_tf)
self.assertTrue(y.op.name.startswith('random_flip_left_right'))
count_flipped = 0
count_unflipped = 0
@ -897,6 +899,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
with self.test_session(use_gpu=True):
x_tf = constant_op.constant(x_np, shape=x_np.shape)
y = image_ops.flip_up_down(x_tf)
self.assertTrue(y.op.name.startswith('flip_up_down'))
y_tf = y.eval()
self.assertAllEqual(y_tf, y_np)
@ -907,6 +910,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
with self.test_session(use_gpu=True):
x_tf = constant_op.constant(x_np, shape=x_np.shape)
y = image_ops.random_flip_up_down(x_tf)
self.assertTrue(y.op.name.startswith('random_flip_up_down'))
count_flipped = 0
count_unflipped = 0
for _ in range(50):
@ -936,6 +940,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
with self.test_session(use_gpu=True):
x_tf = constant_op.constant(x_np, shape=x_np.shape)
y = image_ops.transpose_image(x_tf)
self.assertTrue(y.op.name.startswith('transpose_image'))
y_tf = y.eval()
self.assertAllEqual(y_tf, y_np)
@ -1160,6 +1165,7 @@ class PerImageWhiteningTest(test_util.TensorFlowTestCase):
with self.test_session(use_gpu=True):
x = constant_op.constant(x_np, shape=x_shape)
y = image_ops.per_image_standardization(x)
self.assertTrue(y.op.name.startswith('per_image_standardization'))
y_tf = y.eval()
self.assertAllClose(y_tf, y_np, atol=1e-4)
@ -1341,6 +1347,11 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
for params, err_msg in test_config:
self._assertRaises(x, x_shape, *params, err_msg=err_msg)
def testNameScope(self):
image = array_ops.placeholder(dtypes.float32, shape=[55, 66, 3])
y = image_ops.crop_to_bounding_box(image, 0, 0, 55, 66)
self.assertTrue(y.name.startswith('crop_to_bounding_box'))
class CentralCropTest(test_util.TensorFlowTestCase):
@ -1417,6 +1428,13 @@ class CentralCropTest(test_util.TensorFlowTestCase):
with self.assertRaises(ValueError):
_ = image_ops.central_crop(x, 1.01)
def testNameScope(self):
x_shape = [13, 9, 3]
x_np = np.ones(x_shape, dtype=np.float32)
with self.test_session(use_gpu=True):
y = image_ops.central_crop(x_np, 1.0)
self.assertTrue(y.op.name.startswith('central_crop'))
class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
@ -1620,6 +1638,11 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
for config_item in test_config:
self._assertRaises(x, x_shape, *config_item)
def testNameScope(self):
image = array_ops.placeholder(dtypes.float32, shape=[55, 66, 3])
y = image_ops.pad_to_bounding_box(image, 0, 0, 55, 66)
self.assertTrue(y.op.name.startswith('pad_to_bounding_box'))
class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
@ -2224,6 +2247,13 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
self._assertShapeInference([59, 60, None], [55, 66], [55, 66, None])
self._assertShapeInference([None, None, None], [55, 66], [55, 66, None])
def testNameScope(self):
img_shape = [1, 3, 2, 1]
with self.test_session(use_gpu=True):
single_image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3])
y = image_ops.resize_images(single_image, [55, 66])
self.assertTrue(y.op.name.startswith('resize_images'))
class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
@ -2499,6 +2529,11 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
self._assertRaises(x, x_shape, target_height, target_width,
"target_width must be > 0")
def testNameScope(self):
image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3])
y = image_ops.resize_image_with_crop_or_pad(image, 55, 66)
self.assertTrue(y.op.name.startswith('resize_image_with_crop_or_pad'))
def _SimpleColorRamp():
"""Build a simple color ramp RGB image."""


@ -171,6 +171,10 @@ class BatchNormalizationTest(test.TestCase):
x, x_shape, y, y_shape, delta=1e-3, x_init_value=x_init_val)
_, numerical_grad = gradient_checker.compute_gradient(
x32, x_shape, y32, y_shape, delta=1e-3, x_init_value=x32_init_val)
# If grad is empty, no error.
if theoretical_grad.size == 0 and numerical_grad.size == 0:
return 0
return np.fabs(theoretical_grad - numerical_grad).max()
def _test_gradient(self,
@ -371,6 +375,17 @@ class BatchNormalizationTest(test.TestCase):
self._test_inference(
x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC')
def testInferenceShape5(self):
x_shape = [0, 131, 127, 6]
for dtype in [np.float16, np.float32]:
if test.is_gpu_available(cuda_only=True):
self._test_inference(
x_shape, dtype, [131], np.float32, use_gpu=True, data_format='NCHW')
self._test_inference(
x_shape, dtype, [6], np.float32, use_gpu=True, data_format='NHWC')
self._test_inference(
x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC')
def testTrainingShape1(self):
x_shape = [1, 1, 6, 1]
for dtype in [np.float16, np.float32]:
@ -409,6 +424,17 @@ class BatchNormalizationTest(test.TestCase):
self._test_training(
x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC')
def testTrainingShape5(self):
x_shape = [0, 131, 127, 6]
for dtype in [np.float16, np.float32]:
if test.is_gpu_available(cuda_only=True):
self._test_training(
x_shape, dtype, [131], np.float32, use_gpu=True, data_format='NCHW')
self._test_training(
x_shape, dtype, [6], np.float32, use_gpu=True, data_format='NHWC')
self._test_training(
x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC')
def testBatchNormGradShape1(self):
for is_training in [True, False]:
x_shape = [1, 1, 6, 1]
@ -496,6 +522,33 @@ class BatchNormalizationTest(test.TestCase):
data_format='NHWC',
is_training=is_training)
def testBatchNormGradShape5(self):
for is_training in [True, False]:
x_shape = [0, 7, 11, 4]
for dtype in [np.float16, np.float32]:
if test.is_gpu_available(cuda_only=True):
self._test_gradient(
x_shape,
dtype, [7],
np.float32,
use_gpu=True,
data_format='NCHW',
is_training=is_training)
self._test_gradient(
x_shape,
dtype, [4],
np.float32,
use_gpu=True,
data_format='NHWC',
is_training=is_training)
self._test_gradient(
x_shape,
dtype, [4],
np.float32,
use_gpu=False,
data_format='NHWC',
is_training=is_training)
def _testBatchNormGradGrad(self, config):
shape = config['shape']
err_tolerance = config['err_tolerance']


@ -120,7 +120,7 @@ def piecewise_constant(x, boundaries, values, name=None):
`float64`, `uint8`, `int8`, `int16`, `int32`, `int64`.
boundaries: A list of `Tensor`s or `int`s or `float`s with strictly
increasing entries, and with all elements having the same type as `x`.
values: A list of `Tensor`s or float`s or `int`s that specifies the values
values: A list of `Tensor`s or `float`s or `int`s that specifies the values
for the intervals defined by `boundaries`. It should have one more element
than `boundaries`, and all elements should have the same type.
name: A string. Optional name of the operation. Defaults to
@ -424,11 +424,12 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
return math_ops.div(learning_rate, denom, name=name)
def cosine_decay(learning_rate, global_step, decay_steps, name=None):
def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0,
name=None):
"""Applies cosine decay to the learning rate.
See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
with Warm Restarts.
with Warm Restarts. https://arxiv.org/abs/1608.03983
When training a model, it is often recommended to lower the learning rate as
the training progresses. This function applies a cosine decay function
@ -439,7 +440,8 @@ def cosine_decay(learning_rate, global_step, decay_steps, name=None):
The function returns the decayed learning rate. It is computed as:
```python
global_step = min(global_step, decay_steps)
decayed = 0.5 * (1 + cos(pi * global_step / decay_steps))
cosine_decay = 0.5 * (1 + cos(pi * global_step / decay_steps))
decayed = (1 - alpha) * cosine_decay + alpha
decayed_learning_rate = learning_rate * decayed
```
@ -456,6 +458,8 @@ def cosine_decay(learning_rate, global_step, decay_steps, name=None):
Global step to use for the decay computation.
decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
Number of steps to decay over.
alpha: A scalar `float32` or `float64` Tensor or a Python number.
Minimum learning rate value as a fraction of learning_rate.
name: String. Optional name of the operation. Defaults to 'CosineDecay'.
Returns:
A scalar `Tensor` of the same type as `learning_rate`. The decayed
@ -476,7 +480,96 @@ def cosine_decay(learning_rate, global_step, decay_steps, name=None):
cosine_decayed = 0.5 * (
1.0 + math_ops.cos(constant_op.constant(math.pi) * completed_fraction))
return math_ops.multiply(learning_rate, cosine_decayed)
decayed = (1 - alpha) * cosine_decayed + alpha
return math_ops.multiply(learning_rate, decayed)
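# A plain-Python reference for the decayed rate (mirrors the docstring
# formula; alpha floors the decay multiplier at alpha instead of 0):
import math
def cosine_decay_np(learning_rate, global_step, decay_steps, alpha=0.0):
  step = min(global_step, decay_steps)
  cosine = 0.5 * (1.0 + math.cos(math.pi * step / decay_steps))
  return learning_rate * ((1.0 - alpha) * cosine + alpha)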
def cosine_decay_restarts(learning_rate, global_step, first_decay_steps,
t_mul=2.0, m_mul=1.0, alpha=0.0, name=None):
"""Applies cosine decay with restarts to the learning rate.
See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
with Warm Restarts. https://arxiv.org/abs/1608.03983
When training a model, it is often recommended to lower the learning rate as
the training progresses. This function applies a cosine decay function with
restarts to a provided initial learning rate. It requires a `global_step`
value to compute the decayed learning rate. You can just pass a TensorFlow
variable that you increment at each training step.
The function returns the decayed learning rate while taking into account
possible warm restarts. The learning rate multiplier first decays
from 1 to `alpha` for `first_decay_steps` steps. Then, a warm
restart is performed. Each new warm restart runs for `t_mul` times more steps
and with `m_mul` times smaller initial learning rate.
Example usage:
```python
first_decay_steps = 1000
lr_decayed = cosine_decay_restarts(learning_rate, global_step,
first_decay_steps)
```
Args:
learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
The initial learning rate.
global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
Global step to use for the decay computation.
first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
Number of steps to decay over.
t_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
Used to derive the number of iterations in the i-th period.
m_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
Used to derive the initial learning rate of the i-th period.
alpha: A scalar `float32` or `float64` Tensor or a Python number.
Minimum learning rate value as a fraction of the learning_rate.
name: String. Optional name of the operation. Defaults to 'SGDRDecay'.
Returns:
A scalar `Tensor` of the same type as `learning_rate`. The decayed
learning rate.
Raises:
ValueError: if `global_step` is not supplied.
"""
if global_step is None:
raise ValueError("cosine decay restarts requires global_step")
with ops.name_scope(name, "SGDRDecay",
[learning_rate, global_step]) as name:
learning_rate = ops.convert_to_tensor(learning_rate,
name="initial_learning_rate")
dtype = learning_rate.dtype
global_step = math_ops.cast(global_step, dtype)
first_decay_steps = math_ops.cast(first_decay_steps, dtype)
alpha = math_ops.cast(alpha, dtype)
t_mul = math_ops.cast(t_mul, dtype)
m_mul = math_ops.cast(m_mul, dtype)
completed_fraction = global_step / first_decay_steps
def compute_step(completed_fraction, geometric=False):
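      # With t_mul != 1, the fraction completed after i full periods is the
      # geometric sum (1 - t_mul**i) / (1 - t_mul); inverting that sum for i
      # gives the floor/log expression below. With t_mul == 1 every period
      # has the same length, so the restart index is just
      # floor(completed_fraction).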
if geometric:
i_restart = math_ops.floor(math_ops.log(1.0 - completed_fraction * (
1.0 - t_mul)) / math_ops.log(t_mul))
sum_r = (1.0 - t_mul ** i_restart) / (1.0 - t_mul)
completed_fraction = (completed_fraction - sum_r) / t_mul ** i_restart
else:
i_restart = math_ops.floor(completed_fraction)
completed_fraction = completed_fraction - i_restart
return i_restart, completed_fraction
i_restart, completed_fraction = control_flow_ops.cond(
math_ops.equal(t_mul, 1.0),
lambda: compute_step(completed_fraction, geometric=False),
lambda: compute_step(completed_fraction, geometric=True))
m_fac = m_mul ** i_restart
cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos(
constant_op.constant(math.pi) * completed_fraction))
decayed = (1 - alpha) * cosine_decayed + alpha
return math_ops.multiply(learning_rate, decayed, name=name)
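# A loop-based reference for the closed-form restart logic above (the same
# recurrence the unit tests use): each period is t_mul times longer and
# starts with an m_mul-times-smaller multiplier.
import math
def cosine_decay_restarts_np(learning_rate, step, first_decay_steps,
                             t_mul=2.0, m_mul=1.0, alpha=0.0):
  fac, decay_steps = 1.0, first_decay_steps
  while step >= decay_steps:
    step -= decay_steps
    decay_steps *= t_mul
    fac *= m_mul
  cosine = fac * 0.5 * (1.0 + math.cos(math.pi * step / decay_steps))
  return learning_rate * ((1.0 - alpha) * cosine + alpha)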
def linear_cosine_decay(learning_rate, global_step, decay_steps,
@ -487,6 +580,10 @@ def linear_cosine_decay(learning_rate, global_step, decay_steps,
See [Bello et al., ICML2017] Neural Optimizer Search with RL.
https://arxiv.org/abs/1709.07417
For the idea of warm starts, controlled here by `num_periods`,
see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
with Warm Restarts. https://arxiv.org/abs/1608.03983
Note that linear cosine decay is more aggressive than cosine decay and
larger initial learning rates can typically be used.
@ -563,6 +660,10 @@ def noisy_linear_cosine_decay(learning_rate, global_step, decay_steps,
See [Bello et al., ICML2017] Neural Optimizer Search with RL.
https://arxiv.org/abs/1709.07417
For the idea of warm starts, controlled here by `num_periods`,
see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
with Warm Restarts. https://arxiv.org/abs/1608.03983
Note that linear cosine decay is more aggressive than cosine decay and
larger initial learning rates can typically be used.


@ -342,10 +342,11 @@ class InverseDecayTest(test_util.TensorFlowTestCase):
class CosineDecayTest(test_util.TensorFlowTestCase):
def np_cosine_decay(self, step, decay_steps):
def np_cosine_decay(self, step, decay_steps, alpha=0.0):
step = min(step, decay_steps)
completed_fraction = step / decay_steps
return 0.5 * (1.0 + math.cos(math.pi * completed_fraction))
decay = 0.5 * (1.0 + math.cos(math.pi * completed_fraction))
return (1.0 - alpha) * decay + alpha
def testDecay(self):
num_training_steps = 1000
@ -357,6 +358,77 @@ class CosineDecayTest(test_util.TensorFlowTestCase):
expected = self.np_cosine_decay(step, num_training_steps)
self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
def testAlpha(self):
num_training_steps = 1000
initial_lr = 1.0
alpha = 0.1
for step in range(0, 1500, 250):
with self.test_session():
decayed_lr = learning_rate_decay.cosine_decay(
initial_lr, step, num_training_steps, alpha)
expected = self.np_cosine_decay(step, num_training_steps, alpha)
self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
class CosineDecayRestartsTest(test_util.TensorFlowTestCase):
def np_cosine_decay_restarts(self, step, decay_steps, t_mul=2.0, m_mul=1.0,
alpha=0.0):
fac = 1.0
while step >= decay_steps:
step = step - decay_steps
decay_steps *= t_mul
fac *= m_mul
completed_fraction = step / decay_steps
decay = fac * 0.5 * (1.0 + math.cos(math.pi * completed_fraction))
return (1.0 - alpha) * decay + alpha
def testDecay(self):
num_training_steps = 1000
initial_lr = 1.0
for step in range(0, 1500, 250):
with self.test_session():
decayed_lr = learning_rate_decay.cosine_decay_restarts(
initial_lr, step, num_training_steps)
expected = self.np_cosine_decay_restarts(step, num_training_steps)
self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
def testAlpha(self):
num_training_steps = 1000
initial_lr = 1.0
alpha = 0.1
for step in range(0, 1500, 250):
with self.test_session():
decayed_lr = learning_rate_decay.cosine_decay_restarts(
initial_lr, step, num_training_steps, alpha=alpha)
expected = self.np_cosine_decay_restarts(step, num_training_steps,
alpha=alpha)
self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
def testMMul(self):
num_training_steps = 1000
initial_lr = 1.0
m_mul = 0.9
for step in range(0, 1500, 250):
with self.test_session():
decayed_lr = learning_rate_decay.cosine_decay_restarts(
initial_lr, step, num_training_steps, m_mul=m_mul)
expected = self.np_cosine_decay_restarts(step, num_training_steps,
m_mul=m_mul)
self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
def testTMul(self):
num_training_steps = 1000
initial_lr = 1.0
t_mul = 1.0
for step in range(0, 1500, 250):
with self.test_session():
decayed_lr = learning_rate_decay.cosine_decay_restarts(
initial_lr, step, num_training_steps, t_mul=t_mul)
expected = self.np_cosine_decay_restarts(step, num_training_steps,
t_mul=t_mul)
self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
class LinearCosineDecayTest(test_util.TensorFlowTestCase):


@ -38,6 +38,7 @@ See the @{$python/train} guide.
@@clip_by_global_norm
@@global_norm
@@cosine_decay
@@cosine_decay_restarts
@@linear_cosine_decay
@@noisy_linear_cosine_decay
@@exponential_decay


@ -266,7 +266,11 @@ tf_module {
}
member_method {
name: "cosine_decay"
argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], "
}
member_method {
name: "cosine_decay_restarts"
argspec: "args=[\'learning_rate\', \'global_step\', \'first_decay_steps\', \'t_mul\', \'m_mul\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'2.0\', \'1.0\', \'0.0\', \'None\'], "
}
member_method {
name: "create_global_step"


@ -1,4 +1,4 @@
FROM ubuntu:14.04
FROM ubuntu:16.04
LABEL maintainer="Jan Prach <jendap@google.com>"


@ -1,4 +1,4 @@
FROM ubuntu:14.04
FROM ubuntu:16.04
LABEL maintainer="Jan Prach <jendap@google.com>"


@ -1,4 +1,4 @@
FROM ubuntu:14.04
FROM ubuntu:16.04
LABEL authors="Andrew Gibiansky <andrew.gibiansky@gmail.com>, Joel Hestness <jthestness@gmail.com>"


@ -1,4 +1,4 @@
FROM ubuntu:14.04
FROM ubuntu:16.04
LABEL maintainer="Jonathan Hseu <jhseu@google.com>"


@ -1,4 +1,4 @@
FROM ubuntu:14.04
FROM ubuntu:16.04
LABEL maintainer="Jan Prach <jendap@google.com>"


@ -1,4 +1,4 @@
FROM ubuntu:14.04
FROM ubuntu:16.04
LABEL maintainer="Jan Prach <jendap@google.com>"


@ -111,9 +111,9 @@ do_pylint() {
fi
if [[ $1 == "PYTHON2" ]]; then
PYLINT_BIN="python /usr/local/lib/python2.7/dist-packages/pylint/lint.py"
PYLINT_BIN="python -m pylint"
elif [[ $1 == "PYTHON3" ]]; then
PYLINT_BIN="python3 /usr/local/lib/python3.4/dist-packages/pylint/lint.py"
PYLINT_BIN="python3 -m pylint"
else
echo "Unrecognized python version (PYTHON2 | PYTHON3): $1"
return 1


@ -15,7 +15,7 @@
# ==============================================================================
# Select bazel version.
BAZEL_VERSION="0.5.4"
BAZEL_VERSION="0.8.0"
set +e
local_bazel_ver=$(bazel version 2>&1 | grep -i label | awk '{print $3}')


@ -22,9 +22,24 @@
# fkrull/deadsnakes is for Python3.6
add-apt-repository -y ppa:fkrull/deadsnakes
apt-get update
apt-get upgrade
# Install python dep
apt-get install python-dev
# Install bz2 dep
apt-get install libbz2-dev
# Install curses dep
apt-get install libncurses5 libncurses5-dev
apt-get install libncursesw5 libncursesw5-dev
# Install readline dep
apt-get install libreadline6 libreadline6-dev
# Install sqlite3 dependencies
apt-get install libsqlite3-dev
set -e
# Install Python 3.6 and dev library
wget https://www.python.org/ftp/python/3.6.1/Python-3.6.1.tar.xz
tar xvf Python-3.6.1.tar.xz
@ -63,6 +78,10 @@ pip3 install scikit-learn==0.18.1
# pandas required by `inflow`
pip3 install pandas==0.19.2
pip3 install gnureadline
pip3 install bz2file
# Install recent-enough version of wheel for Python 3.6 wheel builds
pip3 install wheel==0.29.0


@ -10,10 +10,7 @@ load(
py_binary(
name = "tf_upgrade",
srcs = [
"ast_edits.py",
"tf_upgrade.py",
],
srcs = ["tf_upgrade.py"],
srcs_version = "PY2AND3",
)
@ -22,7 +19,7 @@ py_test(
srcs = ["tf_upgrade_test.py"],
srcs_version = "PY2AND3",
deps = [
"tf_upgrade",
":tf_upgrade",
"//tensorflow/python:client_testlib",
"//tensorflow/python:framework_test_lib",
"@six_archive//:six",
@ -48,11 +45,11 @@ genrule(
"test_file_v1_0.py",
"report.txt",
],
cmd = ("$(location tf_upgrade)" +
cmd = ("$(location :tf_upgrade)" +
" --infile $(location testdata/test_file_v0_11.py)" +
" --outfile $(location test_file_v1_0.py)" +
" --reportfile $(location report.txt)"),
tools = ["tf_upgrade"],
tools = [":tf_upgrade"],
)
py_test(


@ -19,11 +19,486 @@ from __future__ import division
from __future__ import print_function
import argparse
from tensorflow.tools.compatibility import ast_edits
import ast
import collections
import os
import shutil
import sys
import tempfile
import traceback
class TFAPIChangeSpec(ast_edits.APIChangeSpec):
class APIChangeSpec(object):
"""This class defines the transformations that need to happen.
This class must provide the following fields:
* `function_keyword_renames`: maps function names to a map of old -> new
argument names
* `function_renames`: maps function names to new function names
* `change_to_function`: a set of function names that have changed (for
notifications)
* `function_reorders`: maps functions whose argument order has changed to the
list of arguments in the new order
* `function_handle`: maps function names to custom handlers for the function
For an example, see `TFAPIChangeSpec`.
"""
class _FileEditTuple(collections.namedtuple(
"_FileEditTuple", ["comment", "line", "start", "old", "new"])):
"""Each edit that is recorded by a _FileEditRecorder.
Fields:
comment: A description of the edit and why it was made.
line: The line number in the file where the edit occurs (1-indexed).
start: The column offset in the line where the edit occurs (0-indexed).
old: text string to remove (this must match what was in file).
new: text string to add in place of `old`.
"""
__slots__ = ()
class _FileEditRecorder(object):
"""Record changes that need to be done to the file."""
def __init__(self, filename):
# all edits are lists of chars
self._filename = filename
self._line_to_edit = collections.defaultdict(list)
self._errors = []
def process(self, text):
"""Process a list of strings, each corresponding to the recorded changes.
Args:
text: A list of lines of text (assumed to contain newlines)
Returns:
A tuple of the modified text and a textual description of what is done.
Raises:
ValueError: if substitution source location does not have expected text.
"""
change_report = ""
# Iterate over each line
for line, edits in self._line_to_edit.items():
offset = 0
# Sort by column so that edits are processed in order, making indexing
# adjustments cumulative for changes that alter the string length.
edits.sort(key=lambda x: x.start)
# Extract each line to a list of characters, because mutable lists
# are editable, unlike immutable strings.
char_array = list(text[line - 1])
# Record a description of the change
change_report += "%r Line %d\n" % (self._filename, line)
change_report += "-" * 80 + "\n\n"
for e in edits:
change_report += "%s\n" % e.comment
change_report += "\n Old: %s" % (text[line - 1])
# Make underscore buffers for underlining where in the line the edits were made
change_list = [" "] * len(text[line - 1])
change_list_new = [" "] * len(text[line - 1])
# Iterate for each edit
for e in edits:
# Create effective start, end by accounting for change in length due
# to previous edits
start_eff = e.start + offset
end_eff = start_eff + len(e.old)
# Make sure the edit is changing what it should be changing
old_actual = "".join(char_array[start_eff:end_eff])
if old_actual != e.old:
raise ValueError("Expected text %r but got %r" %
("".join(e.old), "".join(old_actual)))
# Make the edit
char_array[start_eff:end_eff] = list(e.new)
# Create the underline highlighting of the before and after
change_list[e.start:e.start + len(e.old)] = "~" * len(e.old)
change_list_new[start_eff:end_eff] = "~" * len(e.new)
# Keep track of how to generate effective ranges
offset += len(e.new) - len(e.old)
# Finish the report comment
change_report += " %s\n" % "".join(change_list)
text[line - 1] = "".join(char_array)
change_report += " New: %s" % (text[line - 1])
change_report += " %s\n\n" % "".join(change_list_new)
return "".join(text), change_report, self._errors
def add(self, comment, line, start, old, new, error=None):
"""Add a new change that is needed.
Args:
comment: A description of what was changed
line: Line number (1 indexed)
start: Column offset (0 indexed)
old: old text
new: new text
error: this "edit" is something that cannot be fixed automatically
Returns:
None
"""
self._line_to_edit[line].append(
_FileEditTuple(comment, line, start, old, new))
if error:
self._errors.append("%s:%d: %s" % (self._filename, line, error))
class _ASTCallVisitor(ast.NodeVisitor):
"""AST Visitor that processes function calls.
Updates function calls from old API version to new API version using a given
change spec.
"""
def __init__(self, filename, lines, api_change_spec):
self._filename = filename
self._file_edit = _FileEditRecorder(filename)
self._lines = lines
self._api_change_spec = api_change_spec
def process(self, lines):
return self._file_edit.process(lines)
def generic_visit(self, node):
ast.NodeVisitor.generic_visit(self, node)
def _rename_functions(self, node, full_name):
function_renames = self._api_change_spec.function_renames
try:
new_name = function_renames[full_name]
self._file_edit.add("Renamed function %r to %r" % (full_name,
new_name),
node.lineno, node.col_offset, full_name, new_name)
except KeyError:
pass
def _get_attribute_full_path(self, node):
"""Traverse an attribute to generate a full name e.g. tf.foo.bar.
Args:
node: A Node of type Attribute.
Returns:
a '.'-delimited full-name or None if the tree was not a simple form.
i.e. `foo()+b).bar` returns None, while `a.b.c` would return "a.b.c".
"""
curr = node
items = []
while not isinstance(curr, ast.Name):
if not isinstance(curr, ast.Attribute):
return None
items.append(curr.attr)
curr = curr.value
items.append(curr.id)
return ".".join(reversed(items))
def _find_true_position(self, node):
"""Return correct line number and column offset for a given node.
This is necessary mainly because ListComp's location reporting gives
the next token after the list comprehension's opening bracket.
Args:
node: Node for which we wish to know the lineno and col_offset
"""
import re
find_open = re.compile("^\s*(\\[).*$")
find_string_chars = re.compile("['\"]")
if isinstance(node, ast.ListComp):
# Strangely, ast.ListComp returns the col_offset of the first token
# after the '[' token which appears to be a bug. Workaround by
# explicitly finding the real start of the list comprehension.
line = node.lineno
col = node.col_offset
# loop over lines
while 1:
# Reverse the text and use a regular expression to search for whitespace
text = self._lines[line-1]
reversed_preceding_text = text[:col][::-1]
# First find if a [ can be found with only whitespace between it and
# col.
m = find_open.match(reversed_preceding_text)
if m:
new_col_offset = col - m.start(1) - 1
return line, new_col_offset
else:
if (reversed_preceding_text=="" or
reversed_preceding_text.isspace()):
line = line - 1
prev_line = self._lines[line - 1]
# TODO(aselle):
# this is poor comment detection, but it is good enough for
# cases where the comment does not contain string literal starting/
# ending characters. If ast gave us start and end locations of the
# ast nodes rather than just start, we could use string literal
# node ranges to filter out spurious #'s that appear in string
# literals.
comment_start = prev_line.find("#")
if comment_start == -1:
col = len(prev_line) - 1
elif find_string_chars.search(prev_line[comment_start:]) is None:
col = comment_start
else:
return None, None
else:
return None, None
# Most other nodes return proper locations (the `with` statement notably
# does not), but it is not possible to use that in an argument.
return node.lineno, node.col_offset
def visit_Call(self, node): # pylint: disable=invalid-name
"""Handle visiting a call node in the AST.
Args:
node: Current Node
"""
# Find a simple attribute name path e.g. "tf.foo.bar"
full_name = self._get_attribute_full_path(node.func)
# Make sure the func is marked as being part of a call
node.func.is_function_for_call = True
if full_name:
# Call special handlers
function_handles = self._api_change_spec.function_handle
if full_name in function_handles:
function_handles[full_name](self._file_edit, node)
# Examine any non-keyword argument and make it into a keyword argument
# if reordering required.
function_reorders = self._api_change_spec.function_reorders
function_keyword_renames = (
self._api_change_spec.function_keyword_renames)
if full_name in function_reorders:
reordered = function_reorders[full_name]
for idx, arg in enumerate(node.args):
lineno, col_offset = self._find_true_position(arg)
if lineno is None or col_offset is None:
self._file_edit.add(
"Failed to add keyword %r to reordered function %r"
% (reordered[idx], full_name), arg.lineno, arg.col_offset,
"", "",
error="A necessary keyword argument failed to be inserted.")
else:
keyword_arg = reordered[idx]
if (full_name in function_keyword_renames and
keyword_arg in function_keyword_renames[full_name]):
keyword_arg = function_keyword_renames[full_name][keyword_arg]
self._file_edit.add("Added keyword %r to reordered function %r"
% (reordered[idx], full_name), lineno,
col_offset, "", keyword_arg + "=")
# Examine each keyword argument and convert it to the final renamed form
renamed_keywords = ({} if full_name not in function_keyword_renames else
function_keyword_renames[full_name])
for keyword in node.keywords:
argkey = keyword.arg
argval = keyword.value
if argkey in renamed_keywords:
argval_lineno, argval_col_offset = self._find_true_position(argval)
if argval_lineno is not None and argval_col_offset is not None:
# TODO(aselle): We should scan backward to find the start of the
# keyword key. Unfortunately ast does not give you the location of
# keyword keys, so we are forced to infer it from the keyword arg
# value.
key_start = argval_col_offset - len(argkey) - 1
key_end = key_start + len(argkey) + 1
if (self._lines[argval_lineno - 1][key_start:key_end] ==
argkey + "="):
self._file_edit.add("Renamed keyword argument from %r to %r" %
(argkey, renamed_keywords[argkey]),
argval_lineno,
argval_col_offset - len(argkey) - 1,
argkey + "=", renamed_keywords[argkey] + "=")
continue
self._file_edit.add(
"Failed to rename keyword argument from %r to %r" %
(argkey, renamed_keywords[argkey]),
argval.lineno,
argval.col_offset - len(argkey) - 1,
"", "",
error="Failed to find keyword lexographically. Fix manually.")
ast.NodeVisitor.generic_visit(self, node)
def visit_Attribute(self, node): # pylint: disable=invalid-name
"""Handle bare Attributes i.e. [tf.foo, tf.bar].
Args:
node: Node that is of type ast.Attribute
"""
full_name = self._get_attribute_full_path(node)
if full_name:
self._rename_functions(node, full_name)
if full_name in self._api_change_spec.change_to_function:
if not hasattr(node, "is_function_for_call"):
new_text = full_name + "()"
self._file_edit.add("Changed %r to %r"%(full_name, new_text),
node.lineno, node.col_offset, full_name, new_text)
ast.NodeVisitor.generic_visit(self, node)
class ASTCodeUpgrader(object):
"""Handles upgrading a set of Python files using a given API change spec."""
def __init__(self, api_change_spec):
if not isinstance(api_change_spec, APIChangeSpec):
raise TypeError("Must pass APIChangeSpec to ASTCodeUpgrader, got %s" %
type(api_change_spec))
self._api_change_spec = api_change_spec
def process_file(self, in_filename, out_filename):
"""Process the given python file for incompatible changes.
Args:
in_filename: filename to parse
out_filename: output file to write to
Returns:
A tuple representing number of files processed, log of actions, errors
"""
# Write to a temporary file, just in case we are doing an in-place modify.
with open(in_filename, "r") as in_file, \
tempfile.NamedTemporaryFile("w", delete=False) as temp_file:
ret = self.process_opened_file(
in_filename, in_file, out_filename, temp_file)
shutil.move(temp_file.name, out_filename)
return ret
# Broad exceptions are required here because ast throws whatever it wants.
# pylint: disable=broad-except
def process_opened_file(self, in_filename, in_file, out_filename, out_file):
"""Process the given python file for incompatible changes.
This function is split out to facilitate StringIO testing from
tf_upgrade_test.py.
Args:
in_filename: filename to parse
in_file: opened file (or StringIO)
out_filename: output file to write to
out_file: opened file (or StringIO)
Returns:
A tuple representing number of files processed, log of actions, errors
"""
process_errors = []
text = "-" * 80 + "\n"
text += "Processing file %r\n outputting to %r\n" % (in_filename,
out_filename)
text += "-" * 80 + "\n\n"
parsed_ast = None
lines = in_file.readlines()
try:
parsed_ast = ast.parse("".join(lines))
except Exception:
text += "Failed to parse %r\n\n" % in_filename
text += traceback.format_exc()
if parsed_ast:
visitor = _ASTCallVisitor(in_filename, lines, self._api_change_spec)
visitor.visit(parsed_ast)
out_text, new_text, process_errors = visitor.process(lines)
text += new_text
if out_file:
out_file.write(out_text)
text += "\n"
return 1, text, process_errors
# pylint: enable=broad-except
def process_tree(self, root_directory, output_root_directory,
copy_other_files):
"""Processes upgrades on an entire tree of python files in place.
Note that only Python files are processed. If you have custom code in
other languages, you will need to manually upgrade it.
Args:
root_directory: Directory to walk and process.
output_root_directory: Directory to use as base.
copy_other_files: Copy files that are not touched by this converter.
Returns:
A tuple of files processed, the report string for all files, and errors
"""
# make sure output directory doesn't exist
if output_root_directory and os.path.exists(output_root_directory):
print("Output directory %r must not already exist." % (
output_root_directory))
sys.exit(1)
# make sure output directory does not overlap with root_directory
norm_root = os.path.split(os.path.normpath(root_directory))
norm_output = os.path.split(os.path.normpath(output_root_directory))
if norm_root == norm_output:
print("Output directory %r same as input directory %r" % (
root_directory, output_root_directory))
sys.exit(1)
    # Collect the list of files to process up front; this correctly handles
    # the case where the user puts the output directory inside the input
    # directory.
files_to_process = []
files_to_copy = []
for dir_name, _, file_list in os.walk(root_directory):
py_files = [f for f in file_list if f.endswith(".py")]
copy_files = [f for f in file_list if not f.endswith(".py")]
for filename in py_files:
fullpath = os.path.join(dir_name, filename)
fullpath_output = os.path.join(
output_root_directory, os.path.relpath(fullpath, root_directory))
files_to_process.append((fullpath, fullpath_output))
if copy_other_files:
for filename in copy_files:
fullpath = os.path.join(dir_name, filename)
fullpath_output = os.path.join(
output_root_directory, os.path.relpath(fullpath, root_directory))
files_to_copy.append((fullpath, fullpath_output))
file_count = 0
tree_errors = []
report = ""
report += ("=" * 80) + "\n"
report += "Input tree: %r\n" % root_directory
report += ("=" * 80) + "\n"
for input_path, output_path in files_to_process:
output_directory = os.path.dirname(output_path)
if not os.path.isdir(output_directory):
os.makedirs(output_directory)
file_count += 1
_, l_report, l_errors = self.process_file(input_path, output_path)
tree_errors += l_errors
report += l_report
for input_path, output_path in files_to_copy:
output_directory = os.path.dirname(output_path)
if not os.path.isdir(output_directory):
os.makedirs(output_directory)
shutil.copy(input_path, output_path)
return file_count, report, tree_errors
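# A minimal tree-upgrade sketch (hypothetical directories): the output tree
# must not already exist and must differ from the input tree.
#
#   upgrader = ASTCodeUpgrader(TFAPIChangeSpec())
#   count, report, errors = upgrader.process_tree(
#       "my_project/", "my_project_v1/", copy_other_files=True)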
class TFAPIChangeSpec(APIChangeSpec):
"""List of maps that describe what changed in the API."""
def __init__(self):
@@ -238,7 +713,7 @@ Simple usage:
default="report.txt")
args = parser.parse_args()
upgrade = ast_edits.ASTCodeUpgrader(TFAPIChangeSpec())
upgrade = ASTCodeUpgrader(TFAPIChangeSpec())
report_text = None
report_filename = args.report_filename
files_processed = 0
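# Typical command-line invocations (flag names as documented for
# tf_upgrade.py; treat them as assumptions if your copy differs):
#
#   python tf_upgrade.py --infile foo.py --outfile foo-upgraded.py
#   python tf_upgrade.py --intree coolcode --outtree coolcode-upgraded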

@@ -22,7 +22,6 @@ import tempfile
import six
from tensorflow.python.framework import test_util
from tensorflow.python.platform import test as test_lib
from tensorflow.tools.compatibility import ast_edits
from tensorflow.tools.compatibility import tf_upgrade
@@ -37,7 +36,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
def _upgrade(self, old_file_text):
in_file = six.StringIO(old_file_text)
out_file = six.StringIO()
upgrader = ast_edits.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec())
upgrader = tf_upgrade.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec())
count, report, errors = (
upgrader.process_opened_file("test.py", in_file,
"test_out.py", out_file))
@@ -140,7 +139,7 @@ class TestUpgradeFiles(test_util.TensorFlowTestCase):
upgraded = "tf.multiply(a, b)\n"
temp_file.write(original)
temp_file.close()
upgrader = ast_edits.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec())
upgrader = tf_upgrade.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec())
upgrader.process_file(temp_file.name, temp_file.name)
self.assertAllEqual(open(temp_file.name).read(), upgraded)
os.unlink(temp_file.name)

@@ -57,7 +57,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc
RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
>>/etc/bazel.bazelrc
# Install the most recent bazel release.
ENV BAZEL_VERSION 0.5.4
ENV BAZEL_VERSION 0.8.0
WORKDIR /
RUN mkdir /bazel && \
cd /bazel && \

@@ -66,7 +66,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc
RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
>>/etc/bazel.bazelrc
# Install the most recent bazel release.
ENV BAZEL_VERSION 0.5.4
ENV BAZEL_VERSION 0.8.0
WORKDIR /
RUN mkdir /bazel && \
cd /bazel && \
